From 41cacf712947ec01ee7fb18d91571d4d7925b7c2 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 9 Jun 2020 18:33:41 +0200
Subject: [PATCH] chg: [crawler manager] get all splash dockers, proxies and
 launch all crawlers

---
 OVERVIEW.md                 | 27 +++++++++++++++++++++
 bin/Crawler.py              | 48 +++++++++++++++++++++----------------
 bin/core/Crawler_manager.py | 47 ++++++++++++++++++++++++++++++++++++
 bin/core/screen.py          | 44 +++++++++++++++++++++++++++++++---
 bin/lib/ConfigLoader.py     |  9 +++++++
 bin/lib/crawlers.py         | 12 ++++++----
 6 files changed, 158 insertions(+), 29 deletions(-)
 create mode 100755 bin/core/Crawler_manager.py

diff --git a/OVERVIEW.md b/OVERVIEW.md
index 3ff870b4..c3ab3ce4 100644
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@@ -420,6 +420,33 @@ Supported cryptocurrency:
 }
 ```
 
+### Splash containers and proxies:
+| SET - Key | Value |
+| ------ | ------ |
+| all_proxy | **proxy name** |
+| all_splash | **splash name** |
+
+| HSET - Key | Field | Value |
+| ------ | ------ | ------ |
+| proxy:metadata:**proxy name** | host | **host** |
+| proxy:metadata:**proxy name** | port | **port** |
+| proxy:metadata:**proxy name** | type | **type** |
+| proxy:metadata:**proxy name** | crawler_type | **crawler_type** |
+| proxy:metadata:**proxy name** | description | **proxy description** |
+| | | |
+| splash:metadata:**splash name** | description | **splash description** |
+| splash:metadata:**splash name** | crawler_type | **crawler_type** |
+| splash:metadata:**splash name** | proxy | **splash proxy (None if null)** |
+
+| SET - Key | Value |
+| ------ | ------ |
+| splash:url:**container name** | **splash url** |
+| proxy:splash:**proxy name** | **container name** |
+
+| Key | Value |
+| ------ | ------ |
+| splash:map:url:name:**splash url** | **container name** |
+
 ##### CRAWLER QUEUES:
 | SET - Key | Value |
 | ------ | ------ |
diff --git a/bin/Crawler.py b/bin/Crawler.py
index 4d745aad..a06b4698 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
     crawler_config['port'] = port
     print('Launching Crawler: {}'.format(url))
 
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain)
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
     retry = True
     nb_retry = 0
@@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
                 print('--------------------------------------')
                 print('       \033[91m DOCKER SPLASH DOWN\033[0m')
                 print('          {} DOWN'.format(splash_url))
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN')
                 nb_retry == 0
 
             print('       \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
@@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
             time.sleep(10)
 
         if r.status_code == 200:
-            r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
+            r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
             # save config in cash
             UUID = str(uuid.uuid4())
             r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
@@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
                 print('')
                 print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
                 print('------------------------------------------------------------------------')
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error')
                 exit(-2)
             else:
                 print(process.stdout.read())
@@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
         print('--------------------------------------')
         print('       \033[91m DOCKER SPLASH DOWN\033[0m')
         print('          {} DOWN'.format(splash_url))
-        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
         exit(1)
 
     # check external links (full_crawl)
@@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain):
 
 if __name__ == '__main__':
 
-    if len(sys.argv) != 2:
+    if len(sys.argv) != 2 and len(sys.argv) != 3:
         print('usage:', 'Crawler.py', 'splash_port')
+        print('usage:', 'Crawler.py', 'splash_name', 'splash_url')
         exit(1)
 ##################################################
-    #mode = sys.argv[1]
-    splash_port = sys.argv[1]
-
-    rotation_mode = deque(['onion', 'regular'])
-    default_proto_map = {'http': 80, 'https': 443}
-######################################################## add ftp ???
 
     publisher.port = 6380
     publisher.channel = "Script"
@@ -323,9 +318,20 @@ if __name__ == '__main__':
     # Setup the I/O queues
     p = Process(config_section)
 
-    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
+    if len(sys.argv) == 2:
+        splash_port = sys.argv[1]
+        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
+    else:
+        splash_name = sys.argv[1]
+        splash_url = sys.argv[2]
+        print(splash_name)
+    print('splash url: {}'.format(splash_url))
 
+    rotation_mode = deque(['onion', 'regular'])
+    default_proto_map = {'http': 80, 'https': 443}
+######################################################## add ftp ???
+
 
     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
 
     r_serv_metadata = redis.StrictRedis(
@@ -372,9 +378,9 @@ if __name__ == '__main__':
                                                 'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
 
     # Track launched crawler
-    r_cache.sadd('all_crawler', splash_port)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+    r_cache.sadd('all_crawler', splash_url)
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
     # update hardcoded blacklist
     load_blacklist('onion')
@@ -408,7 +414,7 @@ if __name__ == '__main__':
                             'epoch': int(time.time())}
 
             # Update crawler status type
-            r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+            r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
 
             crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
             # check if default crawler
@@ -456,11 +462,11 @@ if __name__ == '__main__':
                 redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
 
             #update crawler status
-            r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
-            r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+            r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
+            r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
 
             # Update crawler status type
-            r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+            r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
 
             # add next auto Crawling in queue:
             if to_crawl['paste'] == 'auto':
diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py
new file mode 100755
index 00000000..3b64ae97
--- /dev/null
+++ b/bin/core/Crawler_manager.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import json
+import os
+import sys
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import ConfigLoader
+import crawlers
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
+config_loader = None
+
+config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
+SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
+api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
+crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
+config_loader = None
+
+import screen
+
+if __name__ == '__main__':
+
+    if not crawlers.ping_splash_manager():
+        print('Error, Can\'t connect to Splash manager')
+
+    crawlers.reload_splash_and_proxies_list()
+
+    # # TODO: handle multiple splash_manager
+
+    for crawler_splash in crawlers_to_launch:
+        splash_name = crawler_splash[0]
+        nb_crawlers = int(crawler_splash[1])
+
+        all_crawler_urls = crawlers.get_splash_all_url(crawler_splash[0], r_list=True)
+        if nb_crawlers > len(all_crawler_urls):
+            print('Error, can\'t launch all Splash Dockers')
+            print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
+            nb_crawlers = len(all_crawler_urls)
+
+        for i in range(0, int(nb_crawlers)):
+            splash_url = all_crawler_urls[i]
+            print(all_crawler_urls[i])
+
+            crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url))
diff --git a/bin/core/screen.py b/bin/core/screen.py
index bc6ebdb2..1be37e68 100755
--- a/bin/core/screen.py
+++ b/bin/core/screen.py
@@ -4,6 +4,7 @@
 import os
 import subprocess
 import sys
+import re
 
 all_screen_name = set()
 
@@ -16,8 +17,11 @@ def is_screen_install():
         print(p.stderr)
     return False
 
-def exist_screen(screen_name):
-    cmd_1 = ['screen', '-ls']
+def exist_screen(screen_name, with_sudoer=False):
+    if with_sudoer:
+        cmd_1 = ['sudo', 'screen', '-ls']
+    else:
+        cmd_1 = ['screen', '-ls']
     cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
     p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
     p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
@@ -27,6 +31,28 @@ def exist_screen(screen_name):
         return True
     return False
 
+def get_screen_pid(screen_name, with_sudoer=False):
+    if with_sudoer:
+        cmd_1 = ['sudo', 'screen', '-ls']
+    else:
+        cmd_1 = ['screen', '-ls']
+    cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
+    p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
+    p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
+    p1.stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits.
+    output = p2.communicate()[0]
+    if output:
+        # extract pids with screen name
+        regex_pid_screen_name = b'[0-9]+.' + screen_name.encode()
+        pids = re.findall(regex_pid_screen_name, output)
+        # extract pids
+        all_pids = []
+        for pid_name in pids:
+            pid = pid_name.split(b'.')[0].decode()
+            all_pids.append(pid)
+        return all_pids
+    return []
+
 def create_screen(screen_name):
     if not exist_screen(screen_name):
         cmd = ['screen', '-dmS', screen_name]
@@ -38,6 +64,18 @@ def create_screen(screen_name):
         print(p.stderr)
     return False
 
+def kill_screen(screen_name, with_sudoer=False):
+    if get_screen_pid(screen_name, with_sudoer=with_sudoer):
+        for pid in get_screen_pid(screen_name, with_sudoer=with_sudoer):
+            cmd = ['kill', pid]
+            p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            if p.stderr:
+                print(p.stderr)
+            else:
+                print('{} killed'.format(pid))
+        return True
+    return False
+
 # # TODO: add check if len(window_name) == 20
 # use: screen -S 'pid.screen_name' -p %window_id% -Q title
 # if len(windows_name) > 20 (truncated by default)
@@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False):
         print(p.stderr)
 
 if __name__ == '__main__':
-    res = get_screen_windows_list('Script_AIL')
+    res = kill_screen('Docker_Splash', with_sudoer=True)
     print(res)
diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py
index c244b2e5..6eedadd0 100755
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@@ -55,3 +55,12 @@ class ConfigLoader(object):
 
     def has_section(self, section):
         return self.cfg.has_section(section)
+
+    def get_all_keys_values_from_section(self, section):
+        if section in self.cfg:
+            all_keys_values = []
+            for key_name in self.cfg[section]:
+                all_keys_values.append((key_name, self.cfg.get(section, key_name)))
+            return all_keys_values
+        else:
+            return []
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index f30aac07..bb072068 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -38,8 +38,8 @@ config_loader = None
 
 # load crawler config
 config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
-#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
-#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
+splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
+splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
 config_loader = None
 
 faup = Faup()
@@ -691,11 +691,13 @@ def load_all_proxy():
         if description:
             r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)
 
-def init_splash_list_db():
-    delete_all_splash_containers()
-    delete_all_proxies()
+def reload_splash_and_proxies_list():
     if ping_splash_manager():
+        # LOAD SPLASH containers
+        delete_all_splash_containers()
         load_all_splash_containers()
+        # LOAD PROXIES containers
+        delete_all_proxies()
         load_all_proxy()
     # # TODO: kill crawler screen ?
 ## -- ##
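
For reference, a minimal sketch of how the pieces introduced above fit together: the `[Splash_Manager]` / `[Splash_Crawlers]` sections of `crawlers.cfg` that `Crawler_manager.py` reads through the new `ConfigLoader.get_all_keys_values_from_section()`, and the splash/proxy key layout documented in the new OVERVIEW.md tables. The Redis connection parameters and the config path/values below are illustrative assumptions, not values taken from AIL.

```python
#!/usr/bin/env python3
# Illustrative sketch only; connection parameters and config values are assumed.
import configparser

import redis

# [Splash_Crawlers] maps a splash name to the number of crawlers to launch;
# ConfigLoader.get_all_keys_values_from_section('Splash_Crawlers') returns the
# same data as a list of (key, value) tuples for Crawler_manager.py.
cfg = configparser.ConfigParser()
cfg.read('crawlers.cfg')  # hypothetical local copy
if 'Splash_Crawlers' in cfg:
    for splash_name, nb_crawlers in cfg['Splash_Crawlers'].items():
        print('{}: launch {} crawler(s)'.format(splash_name, int(nb_crawlers)))

# Splash and proxy metadata as documented in the new OVERVIEW.md tables
# (host/port/db below are placeholders for the "onion" ARDB database).
r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

for splash_name in r_serv_onion.smembers('all_splash'):
    metadata = r_serv_onion.hgetall('splash:metadata:{}'.format(splash_name))
    # assuming the splash name is also used as the container name in splash:url:*
    urls = r_serv_onion.smembers('splash:url:{}'.format(splash_name))
    print(splash_name, metadata.get('crawler_type'), metadata.get('proxy'), sorted(urls))

for proxy_name in r_serv_onion.smembers('all_proxy'):
    print(proxy_name, r_serv_onion.hgetall('proxy:metadata:{}'.format(proxy_name)))
```

Run against a populated "onion" database, this should list each splash container with its crawler_type, proxy and URLs, mirroring what `reload_splash_and_proxies_list()` loads before the crawlers are launched.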