From 41cacf712947ec01ee7fb18d91571d4d7925b7c2 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 9 Jun 2020 18:33:41 +0200 Subject: [PATCH 01/20] chg: [crawler manager] get all splash dockers, proxies and launch all crawlers --- OVERVIEW.md | 27 +++++++++++++++++++++ bin/Crawler.py | 48 +++++++++++++++++++++---------------- bin/core/Crawler_manager.py | 47 ++++++++++++++++++++++++++++++++++++ bin/core/screen.py | 44 +++++++++++++++++++++++++++++++--- bin/lib/ConfigLoader.py | 9 +++++++ bin/lib/crawlers.py | 12 ++++++---- 6 files changed, 158 insertions(+), 29 deletions(-) create mode 100755 bin/core/Crawler_manager.py diff --git a/OVERVIEW.md b/OVERVIEW.md index 3ff870b4..c3ab3ce4 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -420,6 +420,33 @@ Supported cryptocurrency: } ``` +### Splash containers and proxies: +| SET - Key | Value | +| ------ | ------ | +| all_proxy | **proxy name** | +| all_splash | **splash name** | + +| HSET - Key | Field | Value | +| ------ | ------ | ------ | +| proxy:metadata:**proxy name** | host | **host** | +| proxy:metadata:**proxy name** | port | **port** | +| proxy:metadata:**proxy name** | type | **type** | +| proxy:metadata:**proxy name** | crawler_type | **crawler_type** | +| proxy:metadata:**proxy name** | description | **proxy description** | +| | | | +| splash:metadata:**splash name** | description | **splash description** | +| splash:metadata:**splash name** | crawler_type | **crawler_type** | +| splash:metadata:**splash name** | proxy | **splash proxy (None if null)** | + +| SET - Key | Value | +| ------ | ------ | +| splash:url:**container name** | **splash url** | +| proxy:splash:**proxy name** | **container name** | + +| Key | Value | +| ------ | ------ | +| splash:map:url:name:**splash url** | **container name** | + ##### CRAWLER QUEUES: | SET - Key | Value | | ------ | ------ | diff --git a/bin/Crawler.py b/bin/Crawler.py index 4d745aad..a06b4698 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): crawler_config['port'] = port print('Launching Crawler: {}'.format(url)) - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain) - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain) + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) retry = True nb_retry = 0 @@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): print('--------------------------------------') print(' \033[91m DOCKER SPLASH DOWN\033[0m') print(' {} DOWN'.format(splash_url)) - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN') + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN') nb_retry == 0 print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m') @@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): time.sleep(10) if r.status_code == 200: - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling') + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling') # save config in cash UUID = str(uuid.uuid4()) r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config)) @@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, 
crawler_config): print('') print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url)) print('------------------------------------------------------------------------') - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error') + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error') exit(-2) else: print(process.stdout.read()) @@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): print('--------------------------------------') print(' \033[91m DOCKER SPLASH DOWN\033[0m') print(' {} DOWN'.format(splash_url)) - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling') + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling') exit(1) # check external links (full_crawl) @@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain): if __name__ == '__main__': - if len(sys.argv) != 2: + if len(sys.argv) != 2 and len(sys.argv) != 3: print('usage:', 'Crawler.py', 'splash_port') + print('usage:', 'Crawler.py', 'splash_name', 'splash_url') exit(1) ################################################## - #mode = sys.argv[1] - splash_port = sys.argv[1] - - rotation_mode = deque(['onion', 'regular']) - default_proto_map = {'http': 80, 'https': 443} -######################################################## add ftp ??? publisher.port = 6380 publisher.channel = "Script" @@ -323,9 +318,20 @@ if __name__ == '__main__': # Setup the I/O queues p = Process(config_section) - splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port) + if len(sys.argv) == 2: + splash_port = sys.argv[1] + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port) + else: + splash_name = sys.argv[1] + splash_url = sys.argv[2] + print(splash_name) + print('splash url: {}'.format(splash_url)) + rotation_mode = deque(['onion', 'regular']) + default_proto_map = {'http': 80, 'https': 443} +######################################################## add ftp ??? 
+ PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) r_serv_metadata = redis.StrictRedis( @@ -372,9 +378,9 @@ if __name__ == '__main__': 'user_agent': p.config.get("Crawler", "default_crawler_user_agent")} # Track launched crawler - r_cache.sadd('all_crawler', splash_port) - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) + r_cache.sadd('all_crawler', splash_url) + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting') + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) # update hardcoded blacklist load_blacklist('onion') @@ -408,7 +414,7 @@ if __name__ == '__main__': 'epoch': int(time.time())} # Update crawler status type - r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port) + r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url) crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date) # check if default crawler @@ -456,11 +462,11 @@ if __name__ == '__main__': redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15) #update crawler status - r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') - r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain') + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting') + r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain') # Update crawler status type - r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port) + r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url) # add next auto Crawling in queue: if to_crawl['paste'] == 'auto': diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py new file mode 100755 index 00000000..3b64ae97 --- /dev/null +++ b/bin/core/Crawler_manager.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import json +import os +import sys + +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) +import ConfigLoader +import crawlers + +config_loader = ConfigLoader.ConfigLoader() +r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") +config_loader = None + +config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg') +SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url') +api_key = config_loader.get_config_str('Splash_Manager', 'api_key') +crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers') +config_loader = None + +import screen + +if __name__ == '__main__': + + if not crawlers.ping_splash_manager(): + print('Error, Can\'t cnnect to Splash manager') + + crawlers.reload_splash_and_proxies_list() + + # # TODO: handle mutltiple splash_manager + + for crawler_splash in crawlers_to_launch: + splash_name = crawler_splash[0] + nb_crawlers = int(crawler_splash[1]) + + all_crawler_urls = crawlers.get_splash_all_url(crawler_splash[0], r_list=True) + if nb_crawlers > len(all_crawler_urls): + print('Error, can\'t launch all Splash Dockers') + print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name)) + nb_crawlers = len(all_crawler_urls) + + for i in range(0, int(nb_crawlers)): + splash_url = all_crawler_urls[i] + print(all_crawler_urls[i]) + + 
crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url)) diff --git a/bin/core/screen.py b/bin/core/screen.py index bc6ebdb2..1be37e68 100755 --- a/bin/core/screen.py +++ b/bin/core/screen.py @@ -4,6 +4,7 @@ import os import subprocess import sys +import re all_screen_name = set() @@ -16,8 +17,11 @@ def is_screen_install(): print(p.stderr) return False -def exist_screen(screen_name): - cmd_1 = ['screen', '-ls'] +def exist_screen(screen_name, with_sudoer=False): + if with_sudoer: + cmd_1 = ['sudo', 'screen', '-ls'] + else: + cmd_1 = ['screen', '-ls'] cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)] p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE) @@ -27,6 +31,28 @@ def exist_screen(screen_name): return True return False +def get_screen_pid(screen_name, with_sudoer=False): + if with_sudoer: + cmd_1 = ['sudo', 'screen', '-ls'] + else: + cmd_1 = ['screen', '-ls'] + cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)] + p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE) + p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE) + p1.stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits. + output = p2.communicate()[0] + if output: + # extract pids with screen name + regex_pid_screen_name = b'[0-9]+.' + screen_name.encode() + pids = re.findall(regex_pid_screen_name, output) + # extract pids + all_pids = [] + for pid_name in pids: + pid = pid_name.split(b'.')[0].decode() + all_pids.append(pid) + return all_pids + return [] + def create_screen(screen_name): if not exist_screen(screen_name): cmd = ['screen', '-dmS', screen_name] @@ -38,6 +64,18 @@ def create_screen(screen_name): print(p.stderr) return False +def kill_screen(screen_name, with_sudoer=False): + if get_screen_pid(screen_name, with_sudoer=with_sudoer): + for pid in get_screen_pid(screen_name, with_sudoer=with_sudoer): + cmd = ['kill', pid] + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if p.stderr: + print(p.stderr) + else: + print('{} killed'.format(pid)) + return True + return False + # # TODO: add check if len(window_name) == 20 # use: screen -S 'pid.screen_name' -p %window_id% -Q title # if len(windows_name) > 20 (truncated by default) @@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False): print(p.stderr) if __name__ == '__main__': - res = get_screen_windows_list('Script_AIL') + res = kill_screen('Docker_Splash', with_sudoer=True) print(res) diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py index c244b2e5..6eedadd0 100755 --- a/bin/lib/ConfigLoader.py +++ b/bin/lib/ConfigLoader.py @@ -55,3 +55,12 @@ class ConfigLoader(object): def has_section(self, section): return self.cfg.has_section(section) + + def get_all_keys_values_from_section(self, section): + if section in self.cfg: + all_keys_values = [] + for key_name in self.cfg[section]: + all_keys_values.append((key_name, self.cfg.get(section, key_name))) + return all_keys_values + else: + return [] diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index f30aac07..bb072068 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -38,8 +38,8 @@ config_loader = None # load crawler config config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg') -#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url') -#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key') +splash_manager_url = 
config_loader.get_config_str('Splash_Manager', 'splash_url') +splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key') config_loader = None faup = Faup() @@ -691,11 +691,13 @@ def load_all_proxy(): if description: r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description) -def init_splash_list_db(): - delete_all_splash_containers() - delete_all_proxies() +def reload_splash_and_proxies_list(): if ping_splash_manager(): + # LOAD SPLASH containers + delete_all_splash_containers() load_all_splash_containers() + # LOAD PROXIES containers + delete_all_proxies() load_all_proxy() # # TODO: kill crawler screen ? ## -- ## From f15e24049992557fa69af8efc8e3437a862bef92 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 17 Jun 2020 15:39:24 +0200 Subject: [PATCH 02/20] fix: [install] force virtual environmemt activation --- install_virtualenv.sh | 4 +++- installing_deps.sh | 12 +++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/install_virtualenv.sh b/install_virtualenv.sh index 9c80b08c..f61eccce 100755 --- a/install_virtualenv.sh +++ b/install_virtualenv.sh @@ -16,10 +16,12 @@ if [ -z "$VIRTUAL_ENV" ]; then echo export AIL_REDIS=$(pwd)/redis/src/ >> ./AILENV/bin/activate echo export AIL_ARDB=$(pwd)/ardb/src/ >> ./AILENV/bin/activate - . ./AILENV/bin/activate fi +# activate virtual environment +. ./AILENV/bin/activate + pip3 install -U pip pip3 install 'git+https://github.com/D4-project/BGP-Ranking.git/@7e698f87366e6f99b4d0d11852737db28e3ddc62#egg=pybgpranking&subdirectory=client' pip3 install -U -r requirements.txt diff --git a/installing_deps.sh b/installing_deps.sh index 49e7cb2e..a95e365e 100755 --- a/installing_deps.sh +++ b/installing_deps.sh @@ -88,16 +88,18 @@ fi # create AILENV + intall python packages ./install_virtualenv.sh +# force virtual environment +. 
./AILENV/bin/activate -pushd ${AIL_BIN}helper/gen_cert +pushd ${AIL_BIN}/helper/gen_cert ./gen_root.sh wait ./gen_cert.sh wait popd -cp ${AIL_BIN}helper/gen_cert/server.crt ${AIL_FLASK}server.crt -cp ${AIL_BIN}helper/gen_cert/server.key ${AIL_FLASK}server.key +cp ${AIL_BIN}/helper/gen_cert/server.crt ${AIL_FLASK}server.crt +cp ${AIL_BIN}/helper/gen_cert/server.key ${AIL_FLASK}server.key mkdir -p $AIL_HOME/PASTES @@ -116,7 +118,7 @@ git describe --abbrev=0 --tags popd # LAUNCH ARDB -bash ${AIL_BIN}LAUNCH.sh -lav & +bash ${AIL_BIN}/LAUNCH.sh -lav & wait echo "" @@ -125,6 +127,6 @@ pushd ${AIL_FLASK} python3 create_default_user.py popd -bash ${AIL_BIN}LAUNCH.sh -k & +bash ${AIL_BIN}/LAUNCH.sh -k & wait echo "" From 7e9115d4d5f355b1430f519af92c8407184f552c Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 17 Jul 2020 15:53:00 +0200 Subject: [PATCH 03/20] chg: [core module] disable phone module by default --- bin/LAUNCH.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 9c855668..f484044a 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -198,8 +198,8 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "Tools" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tools.py; read x" sleep 0.1 - screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Phone.py; read x" - sleep 0.1 + #screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Phone.py; read x" + #sleep 0.1 #screen -S "Script_AIL" -X screen -t "Release" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Release.py; read x" #sleep 0.1 screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x" From c31aae4efc6c041d0a83e58486a4bc271206bb97 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 24 Jul 2020 08:54:54 +0200 Subject: [PATCH 04/20] chg: [crawler] crawler queue + restart docker on error --- bin/Crawler.py | 75 ++++++++------------------- bin/core/Crawler_manager.py | 36 +++++++++---- bin/lib/crawlers.py | 65 +++++++++++++++++++++++ etc/splash/proxy-profiles/default.ini | 4 -- 4 files changed, 112 insertions(+), 68 deletions(-) delete mode 100644 etc/splash/proxy-profiles/default.ini diff --git a/bin/Crawler.py b/bin/Crawler.py index a06b4698..c34f6f80 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -19,6 +19,9 @@ sys.path.append(os.environ['AIL_BIN']) from Helper import Process from pubsublogger import publisher +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) +import crawlers + # ======== FUNCTIONS ======== def load_blacklist(service_type): @@ -117,43 +120,6 @@ def unpack_url(url): return to_crawl -# get url, paste and service_type to crawl -def get_elem_to_crawl(rotation_mode): - message = None - domain_service_type = None - - #load_priority_queue - for service_type in rotation_mode: - message = redis_crawler.spop('{}_crawler_priority_queue'.format(service_type)) - if message is not None: - domain_service_type = service_type - break - #load_discovery_queue - if message is None: - for service_type in rotation_mode: - message = redis_crawler.spop('{}_crawler_discovery_queue'.format(service_type)) - if message is not None: - domain_service_type = service_type - break - #load_normal_queue - if message is None: - for service_type in rotation_mode: - message = redis_crawler.spop('{}_crawler_queue'.format(service_type)) - if message is not None: - domain_service_type = service_type - break - - if message: - splitted = message.rsplit(';', 1) - if len(splitted) == 2: - url, paste = splitted - 
if paste: - paste = paste.replace(PASTES_FOLDER+'/', '') - - message = {'url': url, 'paste': paste, 'type_service': domain_service_type, 'original_message': message} - - return message - def get_crawler_config(redis_server, mode, service_type, domain, url=None): crawler_options = {} if mode=='auto': @@ -237,6 +203,9 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): # TODO: relaunch docker or send error message nb_retry += 1 + if nb_retry == 2: + crawlers.restart_splash_docker(splash_url) + if nb_retry == 6: on_error_send_message_back_in_queue(type_service, domain, message) publisher.error('{} SPASH DOWN'.format(splash_url)) @@ -304,11 +273,23 @@ def search_potential_source_domain(type_service, domain): if __name__ == '__main__': - if len(sys.argv) != 2 and len(sys.argv) != 3: - print('usage:', 'Crawler.py', 'splash_port') - print('usage:', 'Crawler.py', 'splash_name', 'splash_url') + if len(sys.argv) != 2: + print('usage:', 'Crawler.py', 'splash_url') exit(1) ################################################## + splash_url = sys.argv[1] + + splash_name = crawlers.get_splash_name_by_url(splash_url) + crawler_type = crawlers.get_splash_crawler_type(splash_name) + + print(splash_name) + print(crawler_type) + + #rotation_mode = deque(['onion', 'regular']) + rotation_mode = deque(crawlers.get_crawler_queue_type_by_proxy(splash_name, crawler_type)) + + default_proto_map = {'http': 80, 'https': 443} +######################################################## add ftp ??? publisher.port = 6380 publisher.channel = "Script" @@ -318,20 +299,8 @@ if __name__ == '__main__': # Setup the I/O queues p = Process(config_section) - if len(sys.argv) == 2: - splash_port = sys.argv[1] - splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port) - else: - splash_name = sys.argv[1] - splash_url = sys.argv[2] - print(splash_name) - print('splash url: {}'.format(splash_url)) - rotation_mode = deque(['onion', 'regular']) - default_proto_map = {'http': 80, 'https': 443} -######################################################## add ftp ??? 
- PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) r_serv_metadata = redis.StrictRedis( @@ -391,7 +360,7 @@ if __name__ == '__main__': update_auto_crawler() rotation_mode.rotate() - to_crawl = get_elem_to_crawl(rotation_mode) + to_crawl = crawlers.get_elem_to_crawl_by_queue_type(rotation_mode) if to_crawl: url_data = unpack_url(to_crawl['url']) # remove domain from queue diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py index 3b64ae97..a5ac7dd6 100755 --- a/bin/core/Crawler_manager.py +++ b/bin/core/Crawler_manager.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 # -*-coding:UTF-8 -* -import json import os import sys +import time sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) import ConfigLoader @@ -21,15 +21,7 @@ config_loader = None import screen -if __name__ == '__main__': - - if not crawlers.ping_splash_manager(): - print('Error, Can\'t cnnect to Splash manager') - - crawlers.reload_splash_and_proxies_list() - - # # TODO: handle mutltiple splash_manager - +def launch_crawlers(): for crawler_splash in crawlers_to_launch: splash_name = crawler_splash[0] nb_crawlers = int(crawler_splash[1]) @@ -44,4 +36,26 @@ if __name__ == '__main__': splash_url = all_crawler_urls[i] print(all_crawler_urls[i]) - crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url)) + crawlers.launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url)) + +# # TODO: handle mutltiple splash_manager +if __name__ == '__main__': + + if not crawlers.ping_splash_manager(): + print('Error, Can\'t cnnect to Splash manager') + + crawlers.reload_splash_and_proxies_list() + launch_crawlers() + last_refresh = time.time() + + while True: + + + # refresh splash and proxy list + if False: + crawlers.reload_splash_and_proxies_list() + print('list of splash and proxies refreshed') + else: + time.sleep(10) + + # # TODO: handle mutltiple splash_manager diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index bb072068..06399658 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -34,6 +34,7 @@ config_loader = ConfigLoader.ConfigLoader() r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") r_serv_onion = config_loader.get_redis_conn("ARDB_Onion") r_cache = config_loader.get_redis_conn("Redis_Cache") +PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) config_loader = None # load crawler config @@ -545,6 +546,48 @@ def save_har(har_dir, item_id, har_content): with open(filename, 'w') as f: f.write(json.dumps(har_content)) +#### CRAWLER QUEUES #### +def get_crawler_queue_type_by_proxy(splash_name, proxy_type): + all_domain_type = [] + if splash_name != 'default_splash' and splash_name != 'default_splash_tor': + all_domain_type.append(splash_name) + # check if can be used for discovery + if not is_splash_used_in_discovery(splash_name): + return all_domain_type + if proxy_type == 'tor': + all_domain_type.append('onion') + all_domain_type.append('regular') + # proxy_type = web + else: + all_domain_type.append('regular') + return all_domain_type + +def get_elem_to_crawl_by_queue_type(l_queue_type): + ## queues priority: + # 1 - priority queue + # 2 - discovery queue + # 3 - normal queue + ## + all_queue_key = ['{}_crawler_priority_queue', '{}_crawler_discovery_queue', '{}_crawler_queue'] + + for queue_key in all_queue_key: + for queue_type in l_queue_type: + message = r_serv_onion.spop(queue_key.format(queue_type)) + if 
message: + dict_to_crawl = {} + splitted = message.rsplit(';', 1) + if len(splitted) == 2: + url, item_id = splitted + item_id = item_id.replace(PASTES_FOLDER+'/', '') + else: + # # TODO: to check/refractor + item_id = None + url = message + return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message} + return None + +#### ---- #### + #### SPLASH MANAGER #### def get_splash_manager_url(reload=False): # TODO: add config reload @@ -558,6 +601,17 @@ def get_splash_url_from_manager_url(splash_manager_url, splash_port): host = url.netloc.split(':', 1)[0] return 'http://{}:{}'.format(host, splash_port) +def is_splash_used_in_discovery(splash_name): + res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue') + if res == 'True': + return True + else: + return False + +def restart_splash_docker(splash_url): + splash_port = splash_url.split(':')[-1] + return _restart_splash_docker(splash_port) + ## API ## def ping_splash_manager(): req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) @@ -580,6 +634,14 @@ def get_all_splash_manager_proxies(): return req.json() else: print(req.json()) + +def _restart_splash_docker(splash_port): + dict_to_send = {'docker_port': splash_port} + req = requests.post('{}/api/v1/splash/restart'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False, json=dict_to_send) + if req.status_code == 200: + return req.json() + else: + print(req.json()) ## -- ## ## SPLASH ## @@ -648,6 +710,9 @@ def delete_all_proxies(): for proxy_name in get_all_proxies(): delete_proxy(proxy_name) +def set_proxy_used_in_discovery(proxy_name, value): + r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value) + def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy) proxy_splash = get_all_splash_by_proxy(proxy_name) if proxy_splash: diff --git a/etc/splash/proxy-profiles/default.ini b/etc/splash/proxy-profiles/default.ini deleted file mode 100644 index 91208135..00000000 --- a/etc/splash/proxy-profiles/default.ini +++ /dev/null @@ -1,4 +0,0 @@ -[proxy] -host=localhost -port=9050 -type=SOCKS5 From 39c3918d09dc0ea0b30d6e3eba50c28c3680010d Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 27 Jul 2020 15:46:09 +0200 Subject: [PATCH 05/20] chg: [crawler] manage crawlers --- bin/Crawler.py | 6 ++-- bin/core/Crawler_manager.py | 44 ++++++++++++++++++++++------ bin/core/screen.py | 57 +++++++++++++++++++++++++++++++++---- bin/lib/crawlers.py | 57 ++++++++++++++++++++++++++++++------- 4 files changed, 139 insertions(+), 25 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index c34f6f80..34406574 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -143,7 +143,7 @@ def get_crawler_config(redis_server, mode, service_type, domain, url=None): def load_crawler_config(service_type, domain, paste, url, date): crawler_config = {} - crawler_config['splash_url'] = splash_url + crawler_config['splash_url'] = f'http://{splash_url}' crawler_config['item'] = paste crawler_config['service_type'] = service_type crawler_config['domain'] = domain @@ -197,7 +197,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): nb_retry = 0 while retry: try: - r = requests.get(splash_url , timeout=30.0) + r = requests.get(f'http://{splash_url}' , timeout=30.0) retry = False except Exception: # TODO: relaunch docker or send error message @@ -244,6 +244,8 @@ def crawl_onion(url, domain, 
port, type_service, message, crawler_config): print('------------------------------------------------------------------------') r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error') exit(-2) + else: + crawlers.update_splash_manager_connection_status(True) else: print(process.stdout.read()) exit(-1) diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py index a5ac7dd6..6f1e3cf7 100755 --- a/bin/core/Crawler_manager.py +++ b/bin/core/Crawler_manager.py @@ -21,6 +21,9 @@ config_loader = None import screen +# # TODO: lauch me in core screen +# # TODO: check if already launched in tor screen + def launch_crawlers(): for crawler_splash in crawlers_to_launch: splash_name = crawler_splash[0] @@ -41,21 +44,46 @@ def launch_crawlers(): # # TODO: handle mutltiple splash_manager if __name__ == '__main__': - if not crawlers.ping_splash_manager(): - print('Error, Can\'t cnnect to Splash manager') - - crawlers.reload_splash_and_proxies_list() - launch_crawlers() - last_refresh = time.time() + is_manager_connected = crawlers.ping_splash_manager() + if not is_manager_connected: + print('Error, Can\'t connect to Splash manager') + session_uuid = None + else: + print('Splash manager connected') + session_uuid = crawlers.get_splash_manager_session_uuid() + is_manager_connected = crawlers.reload_splash_and_proxies_list() + print(is_manager_connected) + if is_manager_connected: + launch_crawlers() + last_check = int(time.time()) while True: + # check if manager is connected + if int(time.time()) - last_check > 60: + is_manager_connected = crawlers.is_splash_manager_connected() + current_session_uuid = crawlers.get_splash_manager_session_uuid() + # reload proxy and splash list + if current_session_uuid and current_session_uuid != session_uuid: + is_manager_connected = crawlers.reload_splash_and_proxies_list() + if is_manager_connected: + print('reload proxies and splash list') + launch_crawlers() + session_uuid = current_session_uuid + if not is_manager_connected: + print('Error, Can\'t connect to Splash manager') + last_check = int(time.time()) + # # TODO: lauch crawlers if was never connected # refresh splash and proxy list - if False: + elif False: crawlers.reload_splash_and_proxies_list() print('list of splash and proxies refreshed') else: - time.sleep(10) + time.sleep(5) + + # kill/launch new crawler / crawler manager check if already launched + # # TODO: handle mutltiple splash_manager + # catch reload request diff --git a/bin/core/screen.py b/bin/core/screen.py index 1be37e68..8b65daa4 100755 --- a/bin/core/screen.py +++ b/bin/core/screen.py @@ -53,6 +53,14 @@ def get_screen_pid(screen_name, with_sudoer=False): return all_pids return [] +def detach_screen(screen_name): + cmd = ['screen', '-d', screen_name] + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + #if p.stdout: + # print(p.stdout) + if p.stderr: + print(p.stderr) + def create_screen(screen_name): if not exist_screen(screen_name): cmd = ['screen', '-dmS', screen_name] @@ -79,15 +87,44 @@ def kill_screen(screen_name, with_sudoer=False): # # TODO: add check if len(window_name) == 20 # use: screen -S 'pid.screen_name' -p %window_id% -Q title # if len(windows_name) > 20 (truncated by default) -def get_screen_windows_list(screen_name): +def get_screen_windows_list(screen_name, r_set=True): + # detach screen to avoid incomplete result + detach_screen(screen_name) + if r_set: + all_windows_name = set() + else: + all_windows_name = [] cmd = ['screen', '-S', screen_name, '-Q', 'windows'] p = 
subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if p.stdout: for window_row in p.stdout.split(b' '): window_id, window_name = window_row.decode().split() - print(window_id) - print(window_name) - print('---') + #print(window_id) + #print(window_name) + #print('---') + if r_set: + all_windows_name.add(window_name) + else: + all_windows_name.append(window_name) + if p.stderr: + print(p.stderr) + return all_windows_name + +def get_screen_windows_id(screen_name): + # detach screen to avoid incomplete result + detach_screen(screen_name) + all_windows_id = {} + cmd = ['screen', '-S', screen_name, '-Q', 'windows'] + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if p.stdout: + for window_row in p.stdout.split(b' '): + window_id, window_name = window_row.decode().split() + if window_name not in all_windows_id: + all_windows_id[window_name] = [] + all_windows_id[window_name].append(window_id) + if p.stderr: + print(p.stderr) + return all_windows_id # script_location ${AIL_BIN} def launch_windows_script(screen_name, window_name, dir_project, script_location, script_name, script_options=''): @@ -98,6 +135,16 @@ def launch_windows_script(screen_name, window_name, dir_project, script_location print(p.stdout) print(p.stderr) +def launch_uniq_windows_script(screen_name, window_name, dir_project, script_location, script_name, script_options='', kill_previous_windows=False): + all_screen_name = get_screen_windows_id(screen_name) + if window_name in all_screen_name: + if kill_previous_windows: + kill_screen_window(screen_name, all_screen_name[window_name][0], force=True) + else: + print('Error: screen {} already contain a windows with this name {}'.format(screen_name, window_name)) + return None + launch_windows_script(screen_name, window_name, dir_project, script_location, script_name, script_options=script_options) + def kill_screen_window(screen_name, window_id, force=False): if force:# kill cmd = ['screen', '-S', screen_name, '-p', window_id, '-X', 'kill'] @@ -108,5 +155,5 @@ def kill_screen_window(screen_name, window_id, force=False): print(p.stderr) if __name__ == '__main__': - res = kill_screen('Docker_Splash', with_sudoer=True) + res = get_screen_windows_list('Script_AIL') print(res) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 06399658..3fcf82bf 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -13,6 +13,7 @@ import os import re import redis import sys +import time import uuid from datetime import datetime, timedelta @@ -590,16 +591,16 @@ def get_elem_to_crawl_by_queue_type(l_queue_type): #### SPLASH MANAGER #### -def get_splash_manager_url(reload=False): # TODO: add config reload +def get_splash_manager_url(reload=False): # TODO: add in db config return splash_manager_url -def get_splash_api_key(reload=False): # TODO: add config reload +def get_splash_api_key(reload=False): # TODO: add in db config return splash_api_key def get_splash_url_from_manager_url(splash_manager_url, splash_port): url = urlparse(splash_manager_url) host = url.netloc.split(':', 1)[0] - return 'http://{}:{}'.format(host, splash_port) + return '{}:{}'.format(host, splash_port) def is_splash_used_in_discovery(splash_name): res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue') @@ -612,14 +613,47 @@ def restart_splash_docker(splash_url): splash_port = splash_url.split(':')[-1] return _restart_splash_docker(splash_port) +def is_splash_manager_connected(delta_check=30): + last_check = r_cache.hget('crawler:splash:manager', 
'last_check') + if last_check: + if int(time.time()) - int(last_check) > delta_check: + ping_splash_manager() + else: + ping_splash_manager() + res = r_cache.hget('crawler:splash:manager', 'connected') + return res == 'True' + +def update_splash_manager_connection_status(is_connected): + r_cache.hset('crawler:splash:manager', 'connected', is_connected) + r_cache.hset('crawler:splash:manager', 'last_check', int(time.time())) + ## API ## def ping_splash_manager(): - req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) - if req.status_code == 200: - return True - else: - print(req.json()) - return False + try: + req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) + if req.status_code == 200: + update_splash_manager_connection_status(True) + return True + else: + print(req.json()) + except requests.exceptions.ConnectionError: + pass + # splash manager unreachable + update_splash_manager_connection_status(False) + return False + +def get_splash_manager_session_uuid(): + try: + req = requests.get('{}/api/v1/get/session_uuid'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) + if req.status_code == 200: + res = req.json() + if res: + return res['session_uuid'] + else: + print(req.json()) + except requests.exceptions.ConnectionError: + # splash manager unreachable + update_splash_manager_connection_status(False) def get_all_splash_manager_containers_name(): req = requests.get('{}/api/v1/get/splash/name/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) @@ -764,6 +798,9 @@ def reload_splash_and_proxies_list(): # LOAD PROXIES containers delete_all_proxies() load_all_proxy() + return True + else: + return False # # TODO: kill crawler screen ? 
## -- ## @@ -774,7 +811,7 @@ def launch_ail_splash_crawler(splash_url, script_options=''): script_location = os.path.join(os.environ['AIL_BIN']) script_name = 'Crawler.py' screen.create_screen(screen_name) - screen.launch_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options) + screen.launch_uniq_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options, kill_previous_windows=True) ## -- ## From 3ea14b29b8d63e745e5cb43bf8c0766b6ddaa153 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 17 Aug 2020 21:52:57 +0200 Subject: [PATCH 06/20] chg: [crawler] show all crawlers type on dashboard --- bin/Crawler.py | 6 +- bin/core/Crawler_manager.py | 4 + bin/lib/crawlers.py | 74 ++++- var/www/blueprints/crawler_splash.py | 32 +++ .../hiddenServices/Flask_hiddenServices.py | 49 ++-- .../templates/Crawler_dashboard.html | 267 ------------------ .../dashboard_splash_crawler.html | 235 +++++++++++++++ var/www/templates/crawler/menu_sidebar.html | 8 +- var/www/templates/nav_bar.html | 2 +- 9 files changed, 371 insertions(+), 306 deletions(-) delete mode 100644 var/www/modules/hiddenServices/templates/Crawler_dashboard.html create mode 100644 var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html diff --git a/bin/Crawler.py b/bin/Crawler.py index 34406574..6a61a0ba 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -349,7 +349,7 @@ if __name__ == '__main__': 'user_agent': p.config.get("Crawler", "default_crawler_user_agent")} # Track launched crawler - r_cache.sadd('all_crawler', splash_url) + r_cache.sadd('all_splash_crawlers', splash_url) r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting') r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) @@ -385,7 +385,7 @@ if __name__ == '__main__': 'epoch': int(time.time())} # Update crawler status type - r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url) + r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service']) crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date) # check if default crawler @@ -437,7 +437,7 @@ if __name__ == '__main__': r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain') # Update crawler status type - r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url) + r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service']) # add next auto Crawling in queue: if to_crawl['paste'] == 'auto': diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py index 6f1e3cf7..8b43be99 100755 --- a/bin/core/Crawler_manager.py +++ b/bin/core/Crawler_manager.py @@ -35,6 +35,8 @@ def launch_crawlers(): print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name)) nb_crawlers = len(all_crawler_urls) + crawlers.reset_all_spash_crawler_status() + for i in range(0, int(nb_crawlers)): splash_url = all_crawler_urls[i] print(all_crawler_urls[i]) @@ -59,6 +61,8 @@ if __name__ == '__main__': while True: + # # TODO: avoid multiple ping + # check if manager is connected if int(time.time()) - last_check > 60: is_manager_connected = crawlers.is_splash_manager_connected() diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index e4643601..6448843d 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ 
-46,9 +46,20 @@ config_loader = None faup = Faup() +# # # # # # # # +# # +# COMMON # +# # +# # # # # # # # + def generate_uuid(): return str(uuid.uuid4()).replace('-', '') +def get_current_date(): + return datetime.now().strftime("%Y%m%d") + +##-- COMMON --# + ################################################################################ # # TODO: handle prefix cookies @@ -377,6 +388,55 @@ def api_create_cookie(user_id, cookiejar_uuid, cookie_dict): #### #### +# # # # # # # # +# # +# CRAWLER # +# # +# # # # # # # # + +#### CRAWLER GLOBAL #### + +def get_all_spash_crawler_status(): + crawler_metadata = [] + all_crawlers = r_cache.smembers('all_splash_crawlers') + for crawler in all_crawlers: + crawler_metadata.append(get_splash_crawler_status(crawler)) + return crawler_metadata + +def reset_all_spash_crawler_status(): + r_cache.delete('all_splash_crawlers') + +def get_splash_crawler_status(spash_url): + crawler_type = r_cache.hget('metadata_crawler:{}'.format(spash_url), 'type') + crawling_domain = r_cache.hget('metadata_crawler:{}'.format(spash_url), 'crawling_domain') + started_time = r_cache.hget('metadata_crawler:{}'.format(spash_url), 'started_time') + status_info = r_cache.hget('metadata_crawler:{}'.format(spash_url), 'status') + crawler_info = '{} - {}'.format(spash_url, started_time) + if status_info=='Waiting' or status_info=='Crawling': + status=True + else: + status=False + return {'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status, 'type': crawler_type} + +def get_stats_last_crawled_domains(crawler_types, date): + statDomains = {} + for crawler_type in crawler_types: + stat_type = {} + stat_type['domains_up'] = r_serv_onion.scard('{}_up:{}'.format(crawler_type, date)) + stat_type['domains_down'] = r_serv_onion.scard('{}_down:{}'.format(crawler_type, date)) + stat_type['total'] = stat_type['domains_up'] + stat_type['domains_down'] + stat_type['domains_queue'] = get_nb_elem_to_crawl_by_type(crawler_type) + statDomains[crawler_type] = stat_type + return statDomains + +# # TODO: handle custom proxy +def get_splash_crawler_latest_stats(): + now = datetime.now() + date = now.strftime("%Y%m%d") + return get_stats_last_crawled_domains(['onion', 'regular'], date) + +##-- CRAWLER GLOBAL --## + #### CRAWLER TASK #### def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None): @@ -587,10 +647,20 @@ def get_elem_to_crawl_by_queue_type(l_queue_type): return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message} return None +def get_nb_elem_to_crawl_by_type(queue_type): + nb = r_serv_onion.scard('{}_crawler_priority_queue'.format(queue_type)) + nb += r_serv_onion.scard('{}_crawler_discovery_queue'.format(queue_type)) + nb += r_serv_onion.scard('{}_crawler_queue'.format(queue_type)) + return nb + #### ---- #### +# # # # # # # # # # # # +# # +# SPLASH MANAGER # +# # +# # # # # # # # # # # # -#### SPLASH MANAGER #### def get_splash_manager_url(reload=False): # TODO: add in db config return splash_manager_url @@ -636,6 +706,8 @@ def ping_splash_manager(): return True else: print(req.json()) + update_splash_manager_connection_status(False) + return False except requests.exceptions.ConnectionError: pass # splash manager unreachable diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index 7d006c3d..5d9324ed 100644 --- a/var/www/blueprints/crawler_splash.py +++ 
b/var/www/blueprints/crawler_splash.py @@ -48,6 +48,31 @@ def create_json_response(data, status_code): return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code # ============= ROUTES ============== +@crawler_splash.route("/crawlers/dashboard", methods=['GET']) +@login_required +@login_read_only +def crawlers_dashboard(): + # # TODO: get splash manager status + crawler_enabled = crawlers.ping_splash_manager() + all_splash_crawler_status = crawlers.get_all_spash_crawler_status() + splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats() + date = crawlers.get_current_date() + + return render_template("dashboard_splash_crawler.html", all_splash_crawler_status = all_splash_crawler_status, + crawler_enabled=crawler_enabled, date=date, + splash_crawlers_latest_stats=splash_crawlers_latest_stats) + +@crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET']) +@login_required +@login_read_only +def crawler_dashboard_json(): + + all_splash_crawler_status = crawlers.get_all_spash_crawler_status() + splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats() + + return jsonify({'all_splash_crawler_status': all_splash_crawler_status, + 'splash_crawlers_latest_stats':splash_crawlers_latest_stats}) + @crawler_splash.route("/crawlers/manual", methods=['GET']) @login_required @login_read_only @@ -403,4 +428,11 @@ def crawler_cookiejar_cookie_json_add_post(): return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', cookiejar_uuid=cookiejar_uuid)) +@crawler_splash.route('/crawler/cookiejar/cookie/json_add_post', methods=['GET']) +@login_required +@login_analyst +def crawler_splash_setings(): + + return render_template("settings_splash_crawler.html", cookiejar_uuid=True, cookie_uuid=False) + ## - - ## diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index bab5553a..bf9a0ec8 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -231,22 +231,22 @@ def delete_auto_crawler(url): # ============= ROUTES ============== -@hiddenServices.route("/crawlers/", methods=['GET']) -@login_required -@login_read_only -def dashboard(): - crawler_metadata_onion = get_crawler_splash_status('onion') - crawler_metadata_regular = get_crawler_splash_status('regular') - - now = datetime.datetime.now() - date = now.strftime("%Y%m%d") - statDomains_onion = get_stats_last_crawled_domains('onion', date) - statDomains_regular = get_stats_last_crawled_domains('regular', date) - - return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion, - crawler_enabled=crawler_enabled, date=date, - crawler_metadata_regular=crawler_metadata_regular, - statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) +# @hiddenServices.route("/crawlers/", methods=['GET']) +# @login_required +# @login_read_only +# def dashboard(): +# crawler_metadata_onion = get_crawler_splash_status('onion') +# crawler_metadata_regular = get_crawler_splash_status('regular') +# +# now = datetime.datetime.now() +# date = now.strftime("%Y%m%d") +# statDomains_onion = get_stats_last_crawled_domains('onion', date) +# statDomains_regular = get_stats_last_crawled_domains('regular', date) +# +# return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion, +# crawler_enabled=crawler_enabled, date=date, +# 
crawler_metadata_regular=crawler_metadata_regular, +# statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) @hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET']) @login_required @@ -439,23 +439,6 @@ def remove_auto_crawler(): delete_auto_crawler(url) return redirect(url_for('hiddenServices.auto_crawler', page=page)) -@hiddenServices.route("/crawlers/crawler_dashboard_json", methods=['GET']) -@login_required -@login_read_only -def crawler_dashboard_json(): - - crawler_metadata_onion = get_crawler_splash_status('onion') - crawler_metadata_regular = get_crawler_splash_status('regular') - - now = datetime.datetime.now() - date = now.strftime("%Y%m%d") - - statDomains_onion = get_stats_last_crawled_domains('onion', date) - statDomains_regular = get_stats_last_crawled_domains('regular', date) - - return jsonify({'statDomains_onion': statDomains_onion, 'statDomains_regular': statDomains_regular, - 'crawler_metadata_onion':crawler_metadata_onion, 'crawler_metadata_regular':crawler_metadata_regular}) - # # TODO: refractor @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET']) @login_required diff --git a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html deleted file mode 100644 index 9c0e1933..00000000 --- a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html +++ /dev/null @@ -1,267 +0,0 @@ - - - - - AIL-Framework - - - - - - - - - - - - - - - {% include 'nav_bar.html' %} - -
[267 deleted template lines truncated by extraction (markup stripped). Recoverable content: the page pulled in the nav_bar, crawler menu_sidebar and crawler_disabled includes, then rendered two cards, "Onions Crawlers" and "Regular Crawlers", each showing "{{ statDomains_onion['total'] }} Crawled" / "{{ statDomains_onion['domains_queue'] }} Queue" counters (and the statDomains_regular equivalents) above a table of crawler_info, crawling_domain and status_info rows for crawler_metadata_onion / crawler_metadata_regular, followed by a "Show Domain:" lookup form and the domain tag search block ({% with object_type='domain' %}{% include 'tags/block_obj_tags_search.html' %}{% endwith %}).]
diff --git a/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html b/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html
new file mode 100644
index 00000000..5b059e23
--- /dev/null
+++ b/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html
@@ -0,0 +1,235 @@
[235 added template lines truncated by extraction (markup stripped). Recoverable content: the new page keeps the nav_bar, crawler menu_sidebar and crawler_disabled includes and renders one table over all_splash_crawler_status showing, per splash crawler, crawler_info, an onion/regular icon chosen from splash_crawler['type'], crawling_domain and status_info, followed by the same "Show Domain:" lookup form and domain tag search block ({% with object_type='domain' %}{% include 'tags/block_obj_tags_search.html' %}{% endwith %}).]
+
+ + + + + diff --git a/var/www/templates/crawler/menu_sidebar.html b/var/www/templates/crawler/menu_sidebar.html index c14abbbe..66a5f4f5 100644 --- a/var/www/templates/crawler/menu_sidebar.html +++ b/var/www/templates/crawler/menu_sidebar.html @@ -14,7 +14,7 @@