chg: [crawler manager] get all splash dockers, proxies and launch all crawlers

2024-11-26 15:57:16 +00:00 · 2020-06-09 18:33:41 +02:00 · 2020-06-09 18:33:41 +02:00 · 41cacf7129
commit 41cacf7129
parent 9828ffdbc8
6 changed files with 158 additions and 29 deletions
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@ -420,6 +420,33 @@ Supported cryptocurrency:
 }
 ```
 ### Splash containers and proxies:
 | SET - Key | Value |
 | ------ | ------ |
 | all_proxy  | **proxy name**  |
 | all_splash | **splash name** |
 | HSET - Key | Field | Value |
 | ------ | ------ | ------ |
 | proxy:metadata:**proxy name** | host         | **host**              |
 | proxy:metadata:**proxy name** | port         | **port**              |
 | proxy:metadata:**proxy name** | type         | **type**              |
 | proxy:metadata:**proxy name** | crawler_type | **crawler_type**      |
 | proxy:metadata:**proxy name** | description  | **proxy description** |
 |  |  |  |
 | splash:metadata:**splash name** | description  | **splash description**          |
 | splash:metadata:**splash name** | crawler_type | **crawler_type**                |
 | splash:metadata:**splash name** | proxy        | **splash proxy (None if null)** |
 | SET - Key | Value |
 | ------ | ------ |
 | splash:url:**container name** | **splash url**     |
 | proxy:splash:**proxy name**   | **container name** |
 |  Key | Value |
 | ------ | ------ |
 | splash:map:url:name:**splash url** | **container name** |
 ##### CRAWLER QUEUES:
 | SET - Key | Value |
 | ------ | ------ |
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
    crawler_config['port'] = port
    print('Launching Crawler: {}'.format(url))
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
    retry = True
    nb_retry = 0
@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
                print('--------------------------------------')
                print('         \033[91m DOCKER SPLASH DOWN\033[0m')
                print('          {} DOWN'.format(splash_url))
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN')
                nb_retry == 0
            print('         \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
            time.sleep(10)
    if r.status_code == 200:
-        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
        # save config in cache
        UUID = str(uuid.uuid4())
        r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
                print('')
                print('            PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
                print('------------------------------------------------------------------------')
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error')
                exit(-2)
        else:
            print(process.stdout.read())
@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
        print('--------------------------------------')
        print('         \033[91m DOCKER SPLASH DOWN\033[0m')
        print('          {} DOWN'.format(splash_url))
-        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
        exit(1)
 # check external links (full_crawl)
@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain):
 if __name__ == '__main__':
-    if len(sys.argv) != 2:
+    if len(sys.argv) != 2 and len(sys.argv) != 3:
        print('usage:', 'Crawler.py', 'splash_port')
        print('usage:', 'Crawler.py', 'splash_name', 'splash_url')
        exit(1)
 ##################################################
    #mode = sys.argv[1]
    splash_port = sys.argv[1]
    rotation_mode = deque(['onion', 'regular'])
    default_proto_map = {'http': 80, 'https': 443}
 ######################################################## add ftp ???
    publisher.port = 6380
    publisher.channel = "Script"
@ -323,9 +318,20 @@ if __name__ == '__main__':
    # Setup the I/O queues
    p = Process(config_section)
-    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"),  splash_port)
+    if len(sys.argv) == 2:
        splash_port = sys.argv[1]
        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"),  splash_port)
    else:
        splash_name = sys.argv[1]
        splash_url =  sys.argv[2]
        print(splash_name)
    print('splash url: {}'.format(splash_url))
    rotation_mode = deque(['onion', 'regular'])
    default_proto_map = {'http': 80, 'https': 443}
 ######################################################## add ftp ???
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
    r_serv_metadata = redis.StrictRedis(
@ -372,9 +378,9 @@ if __name__ == '__main__':
                              'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
    # Track launched crawler
-    r_cache.sadd('all_crawler', splash_port)
+    r_cache.sadd('all_crawler', splash_url)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
    # update hardcoded blacklist
    load_blacklist('onion')
@ -408,7 +414,7 @@ if __name__ == '__main__':
                        'epoch': int(time.time())}
                # Update crawler status type
-                r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+                r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
                crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'],  to_crawl['url'], date)
                # check if default crawler
@ -456,11 +462,11 @@ if __name__ == '__main__':
                redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
                #update crawler status
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
-                r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+                r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
                # Update crawler status type
-                r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+                r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
                # add next auto Crawling in queue:
                if to_crawl['paste'] == 'auto':
--- a/bin/core/Crawler_manager.py
+++ b/bin/core/Crawler_manager.py
@ -0,0 +1,47 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 import json
 import os
 import sys
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
 import ConfigLoader
 import crawlers
 # Default configuration: only used here to obtain the "ARDB_Metadata" Redis connection.
 config_loader = ConfigLoader.ConfigLoader()
 r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
 # Drop the loader before re-binding the name to the crawler-specific configuration.
 config_loader = None
 # Crawler configuration, read from 'crawlers.cfg'.
 config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
 # Splash manager endpoint and API key ('Splash_Manager' section).
 # NOTE(review): neither value is referenced in the visible part of this script - confirm they are still needed here.
 SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
 api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
 # (key, value) pairs of the 'Splash_Crawlers' section; the main block reads each pair
 # as (splash name, number of crawlers to launch).
 crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
 config_loader = None
 import screen
 if __name__ == '__main__':
    # Entry point: refresh the Splash/proxy lists, then start the configured
    # number of crawlers for every entry of the 'Splash_Crawlers' section.
    if not crawlers.ping_splash_manager():
        # Only reported: the script keeps going with whatever lists are available.
        print('Error, Can\'t cnnect to Splash manager')
    crawlers.reload_splash_and_proxies_list()
    # # TODO: handle multiple splash_manager
    for splash_name, nb_requested in crawlers_to_launch:
        nb_crawlers = int(nb_requested)
        all_crawler_urls = crawlers.get_splash_all_url(splash_name, r_list=True)
        # Never try to launch more crawlers than there are Splash URLs available.
        if nb_crawlers > len(all_crawler_urls):
            print('Error, can\'t launch all Splash Dockers')
            print('Please launch {} additional {} Dockers'.format(nb_crawlers - len(all_crawler_urls), splash_name))
            nb_crawlers = len(all_crawler_urls)
        for i in range(nb_crawlers):
            splash_url = all_crawler_urls[i]
            print(splash_url)
            # Launch against this container's own URL instead of a fixed address, so each
            # crawler gets a distinct target.
            # NOTE(review): the first argument presumably identifies the Splash instance /
            # screen window - confirm against crawlers.launch_ail_splash_crawler.
            crawlers.launch_ail_splash_crawler(splash_url, script_options='{} {}'.format(splash_name, splash_url))
--- a/bin/core/screen.py
+++ b/bin/core/screen.py
@ -4,6 +4,7 @@
 import os
 import subprocess
 import sys
 import re
 all_screen_name = set()
@ -16,8 +17,11 @@ def is_screen_install():
    print(p.stderr)
    return False
-def exist_screen(screen_name):
+def exist_screen(screen_name, with_sudoer=False):
-    cmd_1 = ['screen', '-ls']
+    if with_sudoer:
        cmd_1 = ['sudo', 'screen', '-ls']
    else:
        cmd_1 = ['screen', '-ls']
    cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
    p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
@ -27,6 +31,28 @@ def exist_screen(screen_name):
        return True
    return False
 def get_screen_pid(screen_name,  with_sudoer=False):
    """Return the PIDs of the screen sessions whose name starts with screen_name.

    The output of ``screen -ls`` is scanned for ``<pid>.<screen_name>`` entries.

    :param screen_name: session name to look for (matched literally, as a prefix)
    :param with_sudoer: when True, list the sessions through ``sudo``
    :return: list of PIDs as strings, empty when no session matches
    """
    if with_sudoer:
        cmd = ['sudo', 'screen', '-ls']
    else:
        cmd = ['screen', '-ls']
    # The exit status is deliberately ignored: only the listing itself is parsed.
    listing = subprocess.run(cmd, stdout=subprocess.PIPE).stdout
    # Escape the separator dot and the name so they are matched literally;
    # the capture group keeps only the numeric PID part.
    regex_pid_screen_name = rb'([0-9]+)\.' + re.escape(screen_name.encode())
    return [pid.decode() for pid in re.findall(regex_pid_screen_name, listing)]
 def create_screen(screen_name):
    if not exist_screen(screen_name):
        cmd = ['screen', '-dmS', screen_name]
@ -38,6 +64,18 @@ def create_screen(screen_name):
            print(p.stderr)
    return False
 def kill_screen(screen_name, with_sudoer=False):
    """Send ``kill`` to every screen session matching screen_name.

    :param screen_name: session name passed to get_screen_pid
    :param with_sudoer: forwarded to get_screen_pid for the session lookup
    :return: True when at least one matching session was found (even if a
             kill command reported an error), False otherwise
    """
    # Look the sessions up once: the same list drives both the loop and the result.
    all_pids = get_screen_pid(screen_name, with_sudoer=with_sudoer)
    for pid in all_pids:
        # NOTE(review): kill runs without sudo even when with_sudoer is set -
        # confirm sessions listed through sudo can be signalled by this user.
        cmd = ['kill', pid]
        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p.stderr:
            print(p.stderr)
        else:
            print('{} killed'.format(pid))
    return bool(all_pids)
 # # TODO: add check if len(window_name) == 20
 # use: screen -S 'pid.screen_name' -p %window_id% -Q title
 # if len(windows_name) > 20 (truncated by default)
@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False):
    print(p.stderr)
 if __name__ == '__main__':
-    res = get_screen_windows_list('Script_AIL')
+    res = kill_screen('Docker_Splash', with_sudoer=True)
    print(res)
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@ -55,3 +55,12 @@ class ConfigLoader(object):
    def has_section(self, section):
        return self.cfg.has_section(section)
    def get_all_keys_values_from_section(self, section):
        """Return the options of a config section as a list of (key, value) tuples.

        An unknown section yields an empty list.
        """
        if section not in self.cfg:
            return []
        return [(option, self.cfg.get(section, option)) for option in self.cfg[section]]
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@ -38,8 +38,8 @@ config_loader = None
 # load crawler config
 config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
-#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
+splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
-#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
+splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
 config_loader = None
 faup = Faup()
@ -691,11 +691,13 @@ def load_all_proxy():
        if description:
            r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)
-def init_splash_list_db():
+def reload_splash_and_proxies_list():
    delete_all_splash_containers()
    delete_all_proxies()
    if ping_splash_manager():
        # LOAD SPLASH containers
        delete_all_splash_containers()
        load_all_splash_containers()
        # LOAD PROXIES containers
        delete_all_proxies()
        load_all_proxy()
    # # TODO: kill crawler screen ?
    ## -- ##