From a382b572c6c614a53928953b5dcc03e161442cf9 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Thu, 7 Dec 2023 11:28:35 +0100
Subject: [PATCH] chg: [crawler] push onion discovery capture_uuid to another
 AIL

---
 bin/crawlers/Crawler.py                  | 18 ++++++
 bin/lib/crawlers.py                      | 80 +++++++++++++++++++-----
 configs/core.cfg.sample                  |  2 +
 var/www/modules/restApi/Flask_restApi.py | 14 +++++
 4 files changed, 97 insertions(+), 17 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index eb492207..06ebe982 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -6,6 +6,7 @@ import logging.config
 import sys
 import time
 
+from pyail import PyAIL
 from requests.exceptions import ConnectionError
 
 sys.path.append(os.environ['AIL_BIN'])
@@ -44,6 +45,15 @@ class Crawler(AbstractModule):
         self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
         self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')
 
+        self.ail_to_push_discovery = None
+        ail_url_to_push_discovery = config_loader.get_config_str('Crawler', 'ail_url_to_push_onion_discovery')
+        ail_key_to_push_discovery = config_loader.get_config_str('Crawler', 'ail_key_to_push_onion_discovery')
+        if ail_url_to_push_discovery and ail_key_to_push_discovery:
+            ail = PyAIL(ail_url_to_push_discovery, ail_key_to_push_discovery, ssl=False)
+            # only keep the remote AIL if it is reachable
+            if ail.ping_ail():
+                self.ail_to_push_discovery = ail
+
         # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES
 
         # update hardcoded blacklist
@@ -183,6 +193,14 @@ class Crawler(AbstractModule):
         crawlers.create_capture(capture_uuid, task_uuid)
         print(task.uuid, capture_uuid, 'launched')
+
+        if self.ail_to_push_discovery:
+            if task.get_depth() == 1 and priority < 10 and task.get_domain().endswith('.onion'):
+                har = task.get_har()
+                screenshot = task.get_screenshot()
+                self.ail_to_push_discovery.add_crawler_capture(task_uuid, capture_uuid, url, har=har,
+                                                               screenshot=screenshot, depth_limit=1, proxy='force_tor')
+                print(task.uuid, capture_uuid, 'Added to ail_to_push_discovery')
         return capture_uuid
 
     # CRAWL DOMAIN
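
Note on the Crawler.py hunks above: the push side uses only the PyAIL calls that appear in this patch
(PyAIL(), ping_ail(), add_crawler_capture()). A minimal standalone sketch of the same push; the remote
URL, API key and UUIDs are placeholder values, not part of this commit:

    import uuid

    from pyail import PyAIL

    # Remote AIL instance that should receive onion discoveries (placeholder URL and key).
    ail = PyAIL('https://remote-ail.example:7000', 'remote-api-key', ssl=False)

    if ail.ping_ail():
        task_uuid = str(uuid.uuid4())
        capture_uuid = str(uuid.uuid4())
        # Mirror a discovery capture on the remote AIL, forcing Tor as Crawler.py does.
        ail.add_crawler_capture(task_uuid, capture_uuid, 'http://example.onion',
                                har=True, screenshot=True, depth_limit=1, proxy='force_tor')
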
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 3484afa0..d256daee 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -309,6 +309,16 @@ def get_all_har_ids():
             har_ids.append(har_id)
     return har_ids
 
+def get_month_har_ids(year, month):
+    har_ids = []
+    month_path = os.path.join(HAR_DIR, year, month)
+    for root, dirs, files in os.walk(month_path):
+        for file in files:
+            har_id = os.path.relpath(os.path.join(root, file), HAR_DIR)
+            har_ids.append(har_id)
+    return har_ids
+
+
 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
     try:
@@ -1519,7 +1529,7 @@ class CrawlerTask:
     # TODO SANITIZE PRIORITY
     # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
     def create(self, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
-               user_agent=None, tags=[], parent='manual', priority=0):
+               user_agent=None, tags=[], parent='manual', priority=0, external=False):
         if self.exists():
             raise Exception('Error: Task already exists')
 
@@ -1576,8 +1586,8 @@ class CrawlerTask:
             r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
             self._set_field('hash', hash_query)
-        r_crawler.zadd('crawler:queue', {self.uuid: priority})
-        self.add_to_db_crawler_queue(priority)
+        if not external:
+            self.add_to_db_crawler_queue(priority)
         # UI
         domain_type = dom.get_domain_type()
         r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
 
@@ -1637,7 +1647,7 @@ def add_task_to_lacus_queue():
 
 # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
 def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
-                user_agent=None, tags=[], parent='manual', priority=0, task_uuid=None):
+                user_agent=None, tags=[], parent='manual', priority=0, task_uuid=None, external=False):
     if task_uuid:
         if CrawlerTask(task_uuid).exists():
             task_uuid = gen_uuid()
@@ -1645,7 +1655,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
         task_uuid = gen_uuid()
     task = CrawlerTask(task_uuid)
     task_uuid = task.create(url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar,
-                            proxy=proxy, user_agent=user_agent, tags=tags, parent=parent, priority=priority)
+                            proxy=proxy, user_agent=user_agent, tags=tags, parent=parent, priority=priority,
+                            external=external)
     return task_uuid
 
 
@@ -1655,7 +1666,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
 
 # # TODO: ADD user agent
 # # TODO: sanitize URL
-def api_add_crawler_task(data, user_id=None):
+
+def api_parse_task_dict_basic(data, user_id):
     url = data.get('url', None)
     if not url or url == '\n':
         return {'status': 'error', 'reason': 'No url supplied'}, 400
@@ -1681,6 +1693,31 @@ def api_add_crawler_task(data, user_id=None):
     else:
         depth_limit = 0
 
+    # PROXY
+    proxy = data.get('proxy', None)
+    if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
+        proxy = 'force_tor'
+    elif proxy:
+        verify = api_verify_proxy(proxy)
+        if verify[1] != 200:
+            return verify
+
+    tags = data.get('tags', [])
+
+    return {'url': url, 'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}, 200
+
+def api_add_crawler_task(data, user_id=None):
+    task, resp = api_parse_task_dict_basic(data, user_id)
+    if resp != 200:
+        return task, resp
+
+    url = task['url']
+    screenshot = task['screenshot']
+    har = task['har']
+    depth_limit = task['depth_limit']
+    proxy = task['proxy']
+    tags = task['tags']
+
     cookiejar_uuid = data.get('cookiejar', None)
     if cookiejar_uuid:
         cookiejar = Cookiejar(cookiejar_uuid)
@@ -1725,17 +1762,6 @@ def api_add_crawler_task(data, user_id=None):
             return {'error': 'Invalid frequency'}, 400
         frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
 
-    # PROXY
-    proxy = data.get('proxy', None)
-    if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
-        proxy = 'force_tor'
-    elif proxy:
-        verify = api_verify_proxy(proxy)
-        if verify[1] != 200:
-            return verify
-
-    tags = data.get('tags', [])
-
     if frequency:
         # TODO verify user
         task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
@@ -1752,6 +1778,26 @@ def api_add_crawler_task(data, user_id=None):
 
 #### ####
 
+# TODO cookiejar - cookies - frequency
+def api_add_crawler_capture(data, user_id):
+    task, resp = api_parse_task_dict_basic(data, user_id)
+    if resp != 200:
+        return task, resp
+
+    task_uuid = data.get('task_uuid')
+    if not task_uuid:
+        return {'error': 'Invalid task_uuid', 'task_uuid': task_uuid}, 400
+    capture_uuid = data.get('capture_uuid')
+    if not capture_uuid:
+        return {'error': 'Invalid capture_uuid', 'capture_uuid': capture_uuid}, 400
+
+    # TODO parent
+    create_task(task['url'], depth=task['depth_limit'], har=task['har'], screenshot=task['screenshot'],
+                proxy=task['proxy'], tags=task['tags'],
+                parent='AIL_capture', task_uuid=task_uuid, external=True)
+
+    create_capture(capture_uuid, task_uuid)
+    return capture_uuid, 200
 
 ###################################################################################
 ###################################################################################
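
Note on the bin/lib/crawlers.py hunks above: the shared payload validation moves into
api_parse_task_dict_basic(), and external=True registers a pushed task without adding it to the local
crawler queue before create_capture() maps the remote capture UUID to it. A minimal sketch of the
receiving side under those assumptions; the payload keys are the ones visible in this patch, while the
values, the user id and the import boilerplate are placeholders:

    import os
    import sys

    sys.path.append(os.environ['AIL_BIN'])
    from lib import crawlers  # assumes an AIL environment, as in bin/crawlers/Crawler.py

    data = {
        'url': 'http://example.onion',   # validated by api_parse_task_dict_basic()
        'proxy': 'force_tor',
        'tags': [],
        'task_uuid': '<task uuid from the pushing AIL>',
        'capture_uuid': '<capture uuid from the pushing AIL>',
    }
    # Registers the task with external=True (skips add_to_db_crawler_queue)
    # and maps the already-running capture to it with create_capture().
    res, status_code = crawlers.api_add_crawler_capture(data, user_id='<user id>')
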
diff --git a/configs/core.cfg.sample b/configs/core.cfg.sample
index d152578b..852590c5 100644
--- a/configs/core.cfg.sample
+++ b/configs/core.cfg.sample
@@ -261,6 +261,8 @@ default_depth_limit = 1
 default_har = True
 default_screenshot = True
 onion_proxy = onion.foundation
+ail_url_to_push_onion_discovery =
+ail_key_to_push_onion_discovery =
 
 [Translation]
 libretranslate =
diff --git a/var/www/modules/restApi/Flask_restApi.py b/var/www/modules/restApi/Flask_restApi.py
index 0c95d0a0..90dba6d6 100644
--- a/var/www/modules/restApi/Flask_restApi.py
+++ b/var/www/modules/restApi/Flask_restApi.py
@@ -523,6 +523,20 @@ def add_crawler_task():
     return create_json_response(dict_res, 200)
 
 
+@restApi.route("api/v1/add/crawler/capture", methods=['POST'])
+@token_required('analyst')
+def add_crawler_capture():
+    data = request.get_json()
+    user_token = get_auth_from_header()
+    user_id = Users.get_token_user(user_token)
+    res = crawlers.api_add_crawler_capture(data, user_id)
+    if res:
+        return create_json_response(res[0], res[1])
+
+    dict_res = {'url': data['url']}
+    return create_json_response(dict_res, 200)
+
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
 # # # # # # # # # # # # # # # # # # # # # # # # # DOMAIN # # # # # # # # # # # # # # # # # # # # #
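
Note on the REST route above: a hedged example of posting a pushed capture to the receiving AIL. The
path and JSON fields come from this patch; the host, port, Authorization header and self-signed-TLS
handling are assumptions about a default AIL install, not part of this commit:

    import requests

    r = requests.post('https://127.0.0.1:7000/api/v1/add/crawler/capture',
                      headers={'Authorization': '<receiving instance API key>'},
                      json={'url': 'http://example.onion',
                            'proxy': 'force_tor',
                            'tags': [],
                            'task_uuid': '<task uuid>',
                            'capture_uuid': '<capture uuid>'},
                      verify=False)  # self-signed certificate on a default install (assumption)
    print(r.status_code, r.json())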