From 1254c1c9c01170b8d65a1bcde2d1ded6cbecace5 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 14 Sep 2022 10:02:38 +0200
Subject: [PATCH] chg: [api] send url to crawler

---
 bin/lib/crawlers.py                      | 47 ++++++++++++++++++++++--
 var/www/modules/restApi/Flask_restApi.py | 13 +++++++
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index a2201c74..932938b2 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -640,7 +640,6 @@ def update_auto_crawler_queue():
         print(mess)
         r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
-
 
 ##-- AUTOMATIC CRAWLER --##
 
 #### CRAWLER TASK ####
@@ -707,14 +706,54 @@ def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=
     r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_mode, crawler_type, domain, url), json.dumps(crawler_config))
 
 def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url):
-    print('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
-    r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
+    print(f'{crawler_type}_crawler_priority_queue', f'{url};{crawler_mode}')
+    r_serv_onion.sadd(f'{crawler_type}_crawler_priority_queue', f'{url};{crawler_mode}')
     # add auto crawled url for user UI
     if crawler_mode == 'auto':
-        r_serv_onion.sadd('auto_crawler_url:{}'.format(crawler_type), url)
+        r_serv_onion.sadd(f'auto_crawler_url:{crawler_type}', url)
+
+def add_url_to_crawl_in_queue(url, crawler_type, crawler_mode='manual'):
+    print(f'{crawler_type}_crawler_priority_queue', f'{url};{crawler_mode}')
+    r_serv_onion.sadd(f'{crawler_type}_crawler_priority_queue', f'{url};{crawler_mode}')
+    # CURRENTLY DISABLED
+    # # add auto crawled url for user UI
+    # if crawler_mode == 'auto':
+    #     r_serv_onion.sadd(f'auto_crawler_url:{crawler_type}', url)
 
 #### ####
 #### CRAWLER TASK API ####
+
+# # TODO: ADD RESULT JSON Response
+
+# # TODO: ADD user agent
+# # TODO: sanitize URL
+def api_add_crawler_task(data, user_id=None):
+    url = data.get('url', None)
+    if not url or url=='\n':
+        return ({'status': 'error', 'reason': 'No url supplied'}, 400)
+
+    screenshot = data.get('screenshot', False)
+    if screenshot:
+        screenshot = True
+    else:
+        screenshot = False
+    har = data.get('har', False)
+    if har:
+        har = True
+    else:
+        har = False
+    depth_limit = data.get('depth_limit', 1)
+    if depth_limit:
+        try:
+            depth_limit = int(depth_limit)
+            if depth_limit < 0:
+                depth_limit = 0
+        except ValueError:
+            return ({'status': 'error', 'reason': 'invalid depth limit'}, 400)
+    print(url, screenshot, har, depth_limit)
+    return create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, crawler_type='onion')
+
+
 def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
     # validate url
     if url is None or url=='' or url=='\n':
diff --git a/var/www/modules/restApi/Flask_restApi.py b/var/www/modules/restApi/Flask_restApi.py
index cd74d963..fb9dda5b 100644
--- a/var/www/modules/restApi/Flask_restApi.py
+++ b/var/www/modules/restApi/Flask_restApi.py
@@ -556,6 +556,19 @@ def get_crawled_domain_list():
     dict_res['domain_type'] = domain_type
     return create_json_response(dict_res, res[1])
 
+# # TODO: ADD RESULT JSON Response
+@restApi.route("api/v1/add/crawler/task", methods=['POST'])
+@token_required('analyst')
+def add_crawler_task():
+    data = request.get_json()
+    user_id = get_user_from_token(get_auth_from_header())
+    res = crawlers.api_add_crawler_task(data, user_id=user_id)
+    if res:
+        return create_json_response(res[0], res[1])
+
+    dict_res = {'url': data['url']}
+    return create_json_response(dict_res, 200)
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # # # # # # # # # # # # # # # # # # #    IMPORT    # # # # # # # # # # # # # # # # # # # # # # #
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
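
Note: a minimal client-side sketch of how the new api/v1/add/crawler/task endpoint could be
called once this patch is applied. The host, port and token below are placeholders (a default
AIL install typically serves the Flask API over HTTPS on port 7000 with a self-signed
certificate), and the JSON fields mirror the ones read by api_add_crawler_task() above.

    import requests

    # Placeholder values: point AIL_HOST at your instance and use a real API token
    # generated for an 'analyst' (or higher) user in the AIL web interface.
    AIL_HOST = 'https://127.0.0.1:7000'
    API_TOKEN = '<your-api-token>'

    payload = {
        'url': 'http://example.onion',  # placeholder address to crawl
        'screenshot': True,             # optional, defaults to False server-side
        'har': True,                    # optional, defaults to False server-side
        'depth_limit': 1,               # optional, defaults to 1 server-side
    }

    r = requests.post(f'{AIL_HOST}/api/v1/add/crawler/task',
                      headers={'Authorization': API_TOKEN},
                      json=payload,
                      verify=False)  # self-signed certificate on a default install
    print(r.status_code, r.json())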