mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
chg: [crawler manager] get all splash dockers, proxies and launch all crawlers
This commit is contained in:
parent
9828ffdbc8
commit
41cacf7129
6 changed files with 158 additions and 29 deletions
27
OVERVIEW.md
27
OVERVIEW.md
|
@ -420,6 +420,33 @@ Supported cryptocurrency:
|
|||
}
|
||||
```
|
||||
|
||||
### Splash containers and proxies:
|
||||
| SET - Key | Value |
|
||||
| ------ | ------ |
|
||||
| all_proxy | **proxy name** |
|
||||
| all_splash | **splash name** |
|
||||
|
||||
| HSET - Key | Field | Value |
|
||||
| ------ | ------ | ------ |
|
||||
| proxy:metadata:**proxy name** | host | **host** |
|
||||
| proxy:metadata:**proxy name** | port | **port** |
|
||||
| proxy:metadata:**proxy name** | type | **type** |
|
||||
| proxy:metadata:**proxy name** | crawler_type | **crawler_type** |
|
||||
| proxy:metadata:**proxy name** | description | **proxy description** |
|
||||
| | | |
|
||||
| splash:metadata:**splash name** | description | **splash description** |
|
||||
| splash:metadata:**splash name** | crawler_type | **crawler_type** |
|
||||
| splash:metadata:**splash name** | proxy | **splash proxy (None if null)** |
|
||||
|
||||
| SET - Key | Value |
|
||||
| ------ | ------ |
|
||||
| splash:url:**container name** | **splash url** |
|
||||
| proxy:splash:**proxy name** | **container name** |
|
||||
|
||||
| Key | Value |
|
||||
| ------ | ------ |
|
||||
| splash:map:url:name:**splash url** | **container name** |
|
||||
|
||||
##### CRAWLER QUEUES:
|
||||
| SET - Key | Value |
|
||||
| ------ | ------ |
|
||||
|
|
|
@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
|||
crawler_config['port'] = port
|
||||
print('Launching Crawler: {}'.format(url))
|
||||
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
retry = True
|
||||
nb_retry = 0
|
||||
|
@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
|||
print('--------------------------------------')
|
||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||
print(' {} DOWN'.format(splash_url))
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN')
|
||||
nb_retry == 0
|
||||
|
||||
print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
|
||||
|
@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
|||
time.sleep(10)
|
||||
|
||||
if r.status_code == 200:
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
|
||||
# save config in cache
|
||||
UUID = str(uuid.uuid4())
|
||||
r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
|
||||
|
@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
|||
print('')
|
||||
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
|
||||
print('------------------------------------------------------------------------')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error')
|
||||
exit(-2)
|
||||
else:
|
||||
print(process.stdout.read())
|
||||
|
@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
|||
print('--------------------------------------')
|
||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||
print(' {} DOWN'.format(splash_url))
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
|
||||
exit(1)
|
||||
|
||||
# check external links (full_crawl)
|
||||
|
@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain):
|
|||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
if len(sys.argv) != 2 and len(sys.argv) != 3:
|
||||
print('usage:', 'Crawler.py', 'splash_port')
|
||||
print('usage:', 'Crawler.py', 'splash_name', 'splash_url')
|
||||
exit(1)
|
||||
##################################################
|
||||
#mode = sys.argv[1]
|
||||
splash_port = sys.argv[1]
|
||||
|
||||
rotation_mode = deque(['onion', 'regular'])
|
||||
default_proto_map = {'http': 80, 'https': 443}
|
||||
######################################################## add ftp ???
|
||||
|
||||
publisher.port = 6380
|
||||
publisher.channel = "Script"
|
||||
|
@ -323,9 +318,20 @@ if __name__ == '__main__':
|
|||
# Setup the I/O queues
|
||||
p = Process(config_section)
|
||||
|
||||
if len(sys.argv) == 2:
|
||||
splash_port = sys.argv[1]
|
||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
|
||||
else:
|
||||
splash_name = sys.argv[1]
|
||||
splash_url = sys.argv[2]
|
||||
print(splash_name)
|
||||
|
||||
print('splash url: {}'.format(splash_url))
|
||||
|
||||
rotation_mode = deque(['onion', 'regular'])
|
||||
default_proto_map = {'http': 80, 'https': 443}
|
||||
######################################################## add ftp ???
|
||||
|
||||
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
|
||||
|
||||
r_serv_metadata = redis.StrictRedis(
|
||||
|
@ -372,9 +378,9 @@ if __name__ == '__main__':
|
|||
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
|
||||
|
||||
# Track launched crawler
|
||||
r_cache.sadd('all_crawler', splash_port)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
r_cache.sadd('all_crawler', splash_url)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
# update hardcoded blacklist
|
||||
load_blacklist('onion')
|
||||
|
@ -408,7 +414,7 @@ if __name__ == '__main__':
|
|||
'epoch': int(time.time())}
|
||||
|
||||
# Update crawler status type
|
||||
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
||||
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
|
||||
|
||||
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
|
||||
# check if default crawler
|
||||
|
@ -456,11 +462,11 @@ if __name__ == '__main__':
|
|||
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
|
||||
|
||||
#update crawler status
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
|
||||
r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
|
||||
|
||||
# Update crawler status type
|
||||
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
||||
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
|
||||
|
||||
# add next auto Crawling in queue:
|
||||
if to_crawl['paste'] == 'auto':
|
||||
|
|
47
bin/core/Crawler_manager.py
Executable file
47
bin/core/Crawler_manager.py
Executable file
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
Crawler manager.

Reload the list of Splash dockers and proxies from the Splash manager,
then launch the number of AIL splash crawlers requested by the
``Splash_Crawlers`` section of ``crawlers.cfg``.
"""

import json
import os
import sys

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import ConfigLoader
import crawlers
import screen  # presumably used by crawlers.launch_ail_splash_crawler — TODO confirm

config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None

# crawler-specific configuration: Splash manager endpoint + crawlers to launch
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
config_loader = None

if __name__ == '__main__':

    if not crawlers.ping_splash_manager():
        # NOTE(review): execution deliberately continues after a failed ping;
        # reload_splash_and_proxies_list() re-checks connectivity itself.
        print('Error, Can\'t connect to Splash manager')

    # refresh the splash container and proxy lists stored in the DB
    crawlers.reload_splash_and_proxies_list()

    # # TODO: handle multiple splash_manager

    for splash_name, nb_to_launch in crawlers_to_launch:
        nb_crawlers = int(nb_to_launch)

        all_crawler_urls = crawlers.get_splash_all_url(splash_name, r_list=True)
        if nb_crawlers > len(all_crawler_urls):
            print('Error, can\'t launch all Splash Dockers')
            print('Please launch {} additional {} Dockers'.format(nb_crawlers - len(all_crawler_urls), splash_name))
            nb_crawlers = len(all_crawler_urls)

        # launch one AIL crawler per available splash docker, up to nb_crawlers
        for splash_url in all_crawler_urls[:nb_crawlers]:
            print(splash_url)
            # NOTE(review): first argument is a hard-coded local AIL URL —
            # confirm whether it should come from configuration instead.
            crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url))
|
|
@ -4,6 +4,7 @@
|
|||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import re
|
||||
|
||||
all_screen_name = set()
|
||||
|
||||
|
@ -16,7 +17,10 @@ def is_screen_install():
|
|||
print(p.stderr)
|
||||
return False
|
||||
|
||||
def exist_screen(screen_name):
|
||||
def exist_screen(screen_name, with_sudoer=False):
|
||||
if with_sudoer:
|
||||
cmd_1 = ['sudo', 'screen', '-ls']
|
||||
else:
|
||||
cmd_1 = ['screen', '-ls']
|
||||
cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
|
||||
p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
|
||||
|
@ -27,6 +31,28 @@ def exist_screen(screen_name):
|
|||
return True
|
||||
return False
|
||||
|
||||
def get_screen_pid(screen_name, with_sudoer=False):
    """Return the pids of all screen sessions named *screen_name*.

    :param screen_name: screen session name to look up
    :param with_sudoer: run ``screen -ls`` through sudo (for root-owned screens)
    :return: list of pid strings; empty list when no matching session exists
    """
    if with_sudoer:
        cmd_1 = ['sudo', 'screen', '-ls']
    else:
        cmd_1 = ['screen', '-ls']
    # screen lists sessions as "<pid>.<name>"; egrep pre-filters the output
    cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
    p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
    p1.stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
    output = p2.communicate()[0]
    if not output:
        return []
    # Match "<pid>.<name>" literally: escape the dot and any regex
    # metacharacters in the name (the previous pattern treated '.' as
    # "any character" and could over-match).
    regex_pid_screen_name = b'[0-9]+\\.' + re.escape(screen_name.encode())
    pids = re.findall(regex_pid_screen_name, output)
    # keep only the pid part of each "<pid>.<name>" match
    return [pid_name.split(b'.')[0].decode() for pid_name in pids]
|
||||
|
||||
def create_screen(screen_name):
|
||||
if not exist_screen(screen_name):
|
||||
cmd = ['screen', '-dmS', screen_name]
|
||||
|
@ -38,6 +64,18 @@ def create_screen(screen_name):
|
|||
print(p.stderr)
|
||||
return False
|
||||
|
||||
def kill_screen(screen_name, with_sudoer=False):
    """Kill every screen session named *screen_name*.

    :param screen_name: screen session name to kill
    :param with_sudoer: look up the sessions through sudo (root-owned screens)
    :return: True if at least one matching session was found, False otherwise
    """
    # fetch the pid list once instead of running the `screen -ls` pipeline twice
    pids = get_screen_pid(screen_name, with_sudoer=with_sudoer)
    if not pids:
        return False
    for pid in pids:
        cmd = ['kill', pid]
        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p.stderr:
            print(p.stderr)
        else:
            print('{} killed'.format(pid))
    return True
|
||||
|
||||
# # TODO: add check if len(window_name) == 20
|
||||
# use: screen -S 'pid.screen_name' -p %window_id% -Q title
|
||||
# if len(windows_name) > 20 (truncated by default)
|
||||
|
@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False):
|
|||
print(p.stderr)
|
||||
|
||||
if __name__ == '__main__':
|
||||
res = get_screen_windows_list('Script_AIL')
|
||||
res = kill_screen('Docker_Splash', with_sudoer=True)
|
||||
print(res)
|
||||
|
|
|
@ -55,3 +55,12 @@ class ConfigLoader(object):
|
|||
|
||||
def has_section(self, section):
|
||||
return self.cfg.has_section(section)
|
||||
|
||||
def get_all_keys_values_from_section(self, section):
    """Return every (key, value) pair of *section* as a list of tuples.

    An unknown section yields an empty list instead of raising.
    """
    if section not in self.cfg:
        return []
    return [(key_name, self.cfg.get(section, key_name)) for key_name in self.cfg[section]]
|
||||
|
|
|
@ -38,8 +38,8 @@ config_loader = None
|
|||
|
||||
# load crawler config
|
||||
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
|
||||
#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
|
||||
#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
|
||||
splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
|
||||
splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
|
||||
config_loader = None
|
||||
|
||||
faup = Faup()
|
||||
|
@ -691,11 +691,13 @@ def load_all_proxy():
|
|||
if description:
|
||||
r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)
|
||||
|
||||
def init_splash_list_db():
|
||||
delete_all_splash_containers()
|
||||
delete_all_proxies()
|
||||
def reload_splash_and_proxies_list():
    """Reload the splash container and proxy lists from the Splash manager.

    Flushes the currently stored lists, then reloads both from the manager.
    When the manager cannot be reached, the DB is left untouched.

    :return: True if the lists were reloaded, False if the manager is unreachable
    """
    if not ping_splash_manager():
        return False
    # LOAD SPLASH containers
    delete_all_splash_containers()
    load_all_splash_containers()
    # LOAD PROXIES containers
    delete_all_proxies()
    load_all_proxy()
    # # TODO: kill crawler screen ?
    return True
## -- ##
|
||||
|
|
Loading…
Reference in a new issue