chg: [crawler manager] get all splash dockers, proxies and launch all crawlers

2024-11-26 15:57:16 +00:00 · 2020-06-09 18:33:41 +02:00 · 2020-06-09 18:33:41 +02:00 · 41cacf7129
commit 41cacf7129
parent 9828ffdbc8
6 changed files with 158 additions and 29 deletions
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@ -420,6 +420,33 @@ Supported cryptocurrency:
 }
 ```

+### Splash containers and proxies:
+| SET - Key | Value |
+| ------ | ------ |
+| all_proxy  | **proxy name**  |
+| all_splash | **splash name** |
+
+| HSET - Key | Field | Value |
+| ------ | ------ | ------ |
+| proxy:metadata:**proxy name** | host         | **host**              |
+| proxy:metadata:**proxy name** | port         | **port**              |
+| proxy:metadata:**proxy name** | type         | **type**              |
+| proxy:metadata:**proxy name** | crawler_type | **crawler_type**      |
+| proxy:metadata:**proxy name** | description  | **proxy description** |
+|  |  |  |
+| splash:metadata:**splash name** | description  | **splash description**          |
+| splash:metadata:**splash name** | crawler_type | **crawler_type**                |
+| splash:metadata:**splash name** | proxy        | **splash proxy (None if null)** |
+
+| SET - Key | Value |
+| ------ | ------ |
+| splash:url:**container name** | **splash url**     |
+| proxy:splash:**proxy name**   | **container name** |
+
+|  Key | Value |
+| ------ | ------ |
+| splash:map:url:name:**splash url** | **container name** |
+
 ##### CRAWLER QUEUES:
 | SET - Key | Value |
 | ------ | ------ |
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
    crawler_config['port'] = port
    print('Launching Crawler: {}'.format(url))

-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain)
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))

    retry = True
    nb_retry = 0
@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
                print('--------------------------------------')
                print('         \033[91m DOCKER SPLASH DOWN\033[0m')
                print('          {} DOWN'.format(splash_url))
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN')
                nb_retry == 0

            print('         \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
            time.sleep(10)

    if r.status_code == 200:
-        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
        # save config in cash
        UUID = str(uuid.uuid4())
        r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
                print('')
                print('            PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
                print('------------------------------------------------------------------------')
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error')
                exit(-2)
        else:
            print(process.stdout.read())
@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
        print('--------------------------------------')
        print('         \033[91m DOCKER SPLASH DOWN\033[0m')
        print('          {} DOWN'.format(splash_url))
-        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
        exit(1)

 # check external links (full_crawl)
@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain):

 if __name__ == '__main__':

-    if len(sys.argv) != 2:
+    if len(sys.argv) != 2 and len(sys.argv) != 3:
        print('usage:', 'Crawler.py', 'splash_port')
+        print('usage:', 'Crawler.py', 'splash_name', 'splash_url')
        exit(1)
 ##################################################
-    #mode = sys.argv[1]
-    splash_port = sys.argv[1]
-
-    rotation_mode = deque(['onion', 'regular'])
-    default_proto_map = {'http': 80, 'https': 443}
-######################################################## add ftp ???

    publisher.port = 6380
    publisher.channel = "Script"
@ -323,9 +318,20 @@ if __name__ == '__main__':
    # Setup the I/O queues
    p = Process(config_section)

-    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"),  splash_port)
+    if len(sys.argv) == 2:
+        splash_port = sys.argv[1]
+        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"),  splash_port)
+    else:
+        splash_name = sys.argv[1]
+        splash_url =  sys.argv[2]
+        print(splash_name)
+
    print('splash url: {}'.format(splash_url))

+    rotation_mode = deque(['onion', 'regular'])
+    default_proto_map = {'http': 80, 'https': 443}
+######################################################## add ftp ???
+
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))

    r_serv_metadata = redis.StrictRedis(
@ -372,9 +378,9 @@ if __name__ == '__main__':
                              'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}

    # Track launched crawler
-    r_cache.sadd('all_crawler', splash_port)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
+    r_cache.sadd('all_crawler', splash_url)
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))

    # update hardcoded blacklist
    load_blacklist('onion')
@ -408,7 +414,7 @@ if __name__ == '__main__':
                        'epoch': int(time.time())}

                # Update crawler status type
-                r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+                r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)

                crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'],  to_crawl['url'], date)
                # check if default crawler
@ -456,11 +462,11 @@ if __name__ == '__main__':
                redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)

                #update crawler status
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
-                r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+                r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
+                r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')

                # Update crawler status type
-                r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+                r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)

                # add next auto Crawling in queue:
                if to_crawl['paste'] == 'auto':
--- a/bin/core/Crawler_manager.py
+++ b/bin/core/Crawler_manager.py
@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import json
+import os
+import sys
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import ConfigLoader
+import crawlers
+
+# Connection to the "ARDB_Metadata" Redis database.
+# NOTE(review): r_serv_metadata is not referenced anywhere else in this script.
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
+config_loader = None  # drop the loader once its values have been read
+
+# Splash manager settings and the crawlers to launch, read from crawlers.cfg.
+config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
+SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
+# NOTE(review): api_key is not referenced anywhere else in this script.
+api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
+# Entries of the [Splash_Crawlers] section; the launch loop below reads item [0]
+# as the splash name and item [1] as the number of crawlers to start.
+crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
+config_loader = None
+
+import screen
+
+if __name__ == '__main__':
+
+    # NOTE(review): a failed ping is only reported; the script then carries on
+    # and launches crawlers for whatever get_splash_all_url() returns.
+    # Confirm whether it should abort or retry instead.
+    if not crawlers.ping_splash_manager():
+        print('Error, Can\'t connect to Splash manager')
+
+    crawlers.reload_splash_and_proxies_list()
+
+    # # TODO: handle multiple splash_manager
+
+    # Each entry is a (splash name, number of crawlers to launch) pair.
+    for splash_name, nb_configured in crawlers_to_launch:
+        nb_crawlers = int(nb_configured)
+
+        all_crawler_urls = crawlers.get_splash_all_url(splash_name, r_list=True)
+        nb_missing = nb_crawlers - len(all_crawler_urls)
+        if nb_missing > 0:
+            # Fewer Splash URLs are available than requested: report the gap
+            # and launch one crawler per available URL.
+            print('Error, can\'t launch all Splash Dockers')
+            print('Please launch {} additional {} Dockers'.format(nb_missing, splash_name))
+            nb_crawlers = len(all_crawler_urls)
+
+        # max() keeps a negative configured count from turning into a
+        # "all but the last N" slice; such a value launches nothing.
+        for splash_url in all_crawler_urls[:max(nb_crawlers, 0)]:
+            print(splash_url)
+
+            # NOTE(review): the first argument is a hard-coded URL although both
+            # SPLASH_MANAGER_URL and splash_url are available here; confirm
+            # which value launch_ail_splash_crawler() expects.
+            crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url))
--- a/bin/core/screen.py
+++ b/bin/core/screen.py
@ -4,6 +4,7 @@
 import os
 import subprocess
 import sys
+import re

 all_screen_name = set()

@ -16,8 +17,11 @@ def is_screen_install():
    print(p.stderr)
    return False

-def exist_screen(screen_name):
-    cmd_1 = ['screen', '-ls']
+def exist_screen(screen_name, with_sudoer=False):
+    if with_sudoer:
+        cmd_1 = ['sudo', 'screen', '-ls']
+    else:
+        cmd_1 = ['screen', '-ls']
    cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
    p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
@ -27,6 +31,28 @@ def exist_screen(screen_name):
        return True
    return False

+def get_screen_pid(screen_name,  with_sudoer=False):
+    """Return the pids of the screen sessions whose name starts with *screen_name*.
+
+    Runs ``screen -ls`` (through ``sudo`` when *with_sudoer* is true) and
+    extracts every ``<pid>.<screen_name>`` entry from its output.
+
+    :param screen_name: name of the screen session(s) to look up
+    :param with_sudoer: list the sessions through ``sudo`` instead of as the current user
+    :return: list of pids as strings, empty when no session matches
+    """
+    if with_sudoer:
+        cmd = ['sudo', 'screen', '-ls']
+    else:
+        cmd = ['screen', '-ls']
+    # A single process is enough: the regex below performs the same filtering
+    # the former egrep stage did, and run() waits for the child to finish.
+    # The exit status is deliberately not checked.
+    p = subprocess.run(cmd, stdout=subprocess.PIPE)
+    # The name is escaped so it is matched literally, and the separator is a
+    # real dot. As before, the name is matched as a prefix, so 'foo' also
+    # selects a session called 'foo_bar'.
+    regex_pid_screen_name = rb'([0-9]+)\.' + re.escape(screen_name.encode())
+    return [pid.decode() for pid in re.findall(regex_pid_screen_name, p.stdout)]
+
 def create_screen(screen_name):
    if not exist_screen(screen_name):
        cmd = ['screen', '-dmS', screen_name]
@ -38,6 +64,18 @@ def create_screen(screen_name):
            print(p.stderr)
    return False

+def kill_screen(screen_name, with_sudoer=False):
+    """Send ``kill`` to every screen session matching *screen_name*.
+
+    :param screen_name: name of the screen session(s) to kill
+    :param with_sudoer: look the sessions up through ``sudo``
+    :return: True when at least one matching session was found (whether or
+             not every ``kill`` succeeded), False otherwise
+    """
+    # Query the sessions once, so the pids that are tested are the pids that get killed.
+    all_pids = get_screen_pid(screen_name, with_sudoer=with_sudoer)
+    if not all_pids:
+        return False
+    for pid in all_pids:
+        # NOTE(review): sessions listed through sudo are still killed with a
+        # plain `kill`; confirm the caller has the privileges this needs.
+        cmd = ['kill', pid]
+        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if p.stderr:
+            print(p.stderr)
+        else:
+            print('{} killed'.format(pid))
+    return True
+
 # # TODO: add check if len(window_name) == 20
 # use: screen -S 'pid.screen_name' -p %window_id% -Q title
 # if len(windows_name) > 20 (truncated by default)
@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False):
    print(p.stderr)

 if __name__ == '__main__':
-    res = get_screen_windows_list('Script_AIL')
+    res = kill_screen('Docker_Splash', with_sudoer=True)
    print(res)
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@ -55,3 +55,12 @@ class ConfigLoader(object):

    def has_section(self, section):
        return self.cfg.has_section(section)
+
+    def get_all_keys_values_from_section(self, section):
+        """Return every option of *section* as a list of (key, value) tuples.
+
+        An unknown section yields an empty list.
+        """
+        if section not in self.cfg:
+            return []
+        return [(option, self.cfg.get(section, option)) for option in self.cfg[section]]
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@ -38,8 +38,8 @@ config_loader = None

 # load crawler config
 config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
-#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
-#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
+splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
+splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
 config_loader = None

 faup = Faup()
@ -691,11 +691,13 @@ def load_all_proxy():
        if description:
            r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)

-def init_splash_list_db():
-    delete_all_splash_containers()
-    delete_all_proxies()
+def reload_splash_and_proxies_list():
    if ping_splash_manager():
+        # LOAD SPLASH containers
+        delete_all_splash_containers()
        load_all_splash_containers()
+        # LOAD PROXIES containers
+        delete_all_proxies()
        load_all_proxy()
    # # TODO: kill crawler screen ?
    ## -- ##