diff --git a/bin/Crawler.py b/bin/Crawler.py
index 793b0c93..4d745aad 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -368,6 +368,7 @@ if __name__ == '__main__':
                               'png': default_crawler_png,
                               'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
                               'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"),
+                              'cookiejar_uuid': None,
                               'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}

    # Track launched crawler
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
new file mode 100755
index 00000000..c31d40af
--- /dev/null
+++ b/bin/lib/crawlers.py
@@ -0,0 +1,466 @@
+#!/usr/bin/python3
+
+"""
+API Helper
+===================
+
+
+"""
+import base64
+import gzip
+import json
+import os
+import re
+import redis
+import sys
+import uuid
+
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+
+from pyfaup.faup import Faup
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
+r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
+r_cache = config_loader.get_redis_conn("Redis_Cache")
+config_loader = None
+
+faup = Faup()
+
+def generate_uuid():
+    return str(uuid.uuid4()).replace('-', '')
+
+################################################################################
+
+# # TODO: handle prefix cookies
+# # TODO: fill empty fields
+def create_cookie_crawler(cookie_dict, crawler_type='regular'):
+    # tor browser: disable secure cookie
+    if crawler_type=='onion':
+        cookie_dict['secure'] = False
+
+    # force cookie domain
+    # url = urlparse(browser_cookie['Host raw'])
+    # domain = url.netloc.split(':', 1)[0]
+    # cookie_dict['domain'] = '.{}'.format(domain)
+
+    # change expire date
+    cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
+    return cookie_dict
+
+def load_crawler_cookies(cookiejar_uuid, crawler_type='regular'):
+    cookies = get_cookiejar_cookies_list(cookiejar_uuid)
+    all_cookies = []
+    for cookie_dict in cookies:
+        all_cookies.append(create_cookie_crawler(cookie_dict, crawler_type=crawler_type))
+    return all_cookies
+
+################################################################################
+
+def get_all_cookiejar():
+    return r_serv_onion.smembers('cookiejar:all')
+
+def get_global_cookiejar():
+    res = r_serv_onion.smembers('cookiejar:global')
+    if not res:
+        res = []
+    return res
+
+def get_user_cookiejar(user_id):
+    res = r_serv_onion.smembers('cookiejar:user:{}'.format(user_id))
+    if not res:
+        res = []
+    return res
+
+def exist_cookiejar(cookiejar_uuid):
+    return r_serv_onion.exists('cookiejar_metadata:{}'.format(cookiejar_uuid))
+
+def create_cookiejar(user_id, level=1, description=None):
+    cookiejar_uuid = str(uuid.uuid4())
+
+    r_serv_onion.sadd('cookiejar:all', cookiejar_uuid)
+    if level==0:
+        r_serv_onion.sadd('cookiejar:user:{}'.format(user_id), cookiejar_uuid)
+    else:
+        r_serv_onion.sadd('cookiejar:global', cookiejar_uuid)
+    # metadata
+    r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'user_id', user_id)
+    r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'level', level)
+    r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'description', description)
+    r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'date', datetime.now().strftime("%Y%m%d"))
+
+    # if json_cookies:
+    #     json_cookies = json.loads(json_cookies) # # TODO: catch Exception
+    #     r_serv_onion.set('cookies:json_cookies:{}'.format(cookies_uuid), json.dumps(json_cookies))
+    #
+    # for cookie_dict in l_cookies:
+    #     r_serv_onion.hset('cookies:manual_cookies:{}'.format(cookies_uuid), cookie_dict['name'], cookie_dict['value'])
+    return cookiejar_uuid
+
+def delete_cookie_jar(cookiejar_uuid):
+    level = get_cookiejar_level(cookiejar_uuid)
+    if level == 0:
+        user_id = get_cookiejar_owner(cookiejar_uuid)
+        r_serv_onion.srem('cookiejar:user:{}'.format(user_id), cookiejar_uuid)
+    else:
+        r_serv_onion.srem('cookiejar:global', cookiejar_uuid)
+
+    r_serv_onion.delete('cookiejar_metadata:{}'.format(cookiejar_uuid))
+
+def get_cookiejar_cookies_uuid(cookiejar_uuid):
+    res = r_serv_onion.smembers('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid))
+    if not res:
+        res = []
+    return res
+
+def get_cookiejar_cookies_list(cookiejar_uuid):
+    l_cookiejar = []
+    for cookie_uuid in get_cookiejar_cookies_uuid(cookiejar_uuid):
+        l_cookiejar.append(get_cookie_dict(cookie_uuid))
+    return l_cookiejar
+
+## Cookiejar metadata ##
+def get_cookiejar_description(cookiejar_uuid):
+    return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'description')
+
+def get_cookiejar_date(cookiejar_uuid):
+    return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'date')
+
+def get_cookiejar_owner(cookiejar_uuid):
+    return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'user_id')
+
+def get_cookiejar_level(cookiejar_uuid):
+    res = r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'level')
+    if not res:
+        res = 1
+    return int(res)
+
+def get_cookiejar_metadata(cookiejar_uuid, level=False):
+    dict_cookiejar = {}
+    if exist_cookiejar(cookiejar_uuid):
+        dict_cookiejar['cookiejar_uuid'] = cookiejar_uuid
+        dict_cookiejar['description'] = get_cookiejar_description(cookiejar_uuid)
+        dict_cookiejar['date'] = get_cookiejar_date(cookiejar_uuid)
+        dict_cookiejar['user_id'] = get_cookiejar_owner(cookiejar_uuid)
+        if level:
+            dict_cookiejar['level'] = get_cookiejar_level(cookiejar_uuid)
+    return dict_cookiejar
+
+def get_cookiejar_metadata_by_iterator(iter_cookiejar_uuid):
+    l_cookiejar_metadata = []
+    for cookiejar_uuid in iter_cookiejar_uuid:
+        l_cookiejar_metadata.append(get_cookiejar_metadata(cookiejar_uuid))
+    return l_cookiejar_metadata
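For reviewers, a minimal sketch of how the cookiejar helpers in this new module are meant to compose, assuming an AIL environment (AIL_BIN exported, the ARDB/Redis back ends running) so that `bin/lib/crawlers.py` imports cleanly. The user id and cookie values are placeholders; the function names come from this file.

```python
# Sketch only: exercises the cookiejar helpers added in bin/lib/crawlers.py.
# Assumes AIL_BIN is set and the ARDB/Redis backends are reachable.
import crawlers

# Create a user-level (level=0) cookiejar and attach one cookie to it.
cookiejar_uuid = crawlers.create_cookiejar('admin@admin.test', level=0,
                                           description='session cookies for an onion service')
cookie_uuid = crawlers.add_cookie_to_cookiejar(cookiejar_uuid,
                                               {'name': 'sessionid', 'value': 'deadbeef', 'secure': True})

# Metadata and cookie listing, as consumed by the Flask views.
print(crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True))
print(crawlers.get_cookiejar_cookies_list(cookiejar_uuid))

# Cleanup.
crawlers.delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid)
crawlers.delete_cookie_jar(cookiejar_uuid)
```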
+# # # # # # # #
+#             #
+#   COOKIES   #
+#             #
+# # # # # # # #
+
+# # # #
+# Cookies Fields:
+#   - name
+#   - value
+#   - path (optional)
+#   - domain (optional)
+#   - secure (optional)
+#   - httpOnly (optional)
+#   - text (optional)
+# # # #
+
+def exists_cookie(cookie_uuid):
+    if int(r_serv_onion.scard('cookies:map:cookiejar:{}'.format(cookie_uuid))) > 0:
+        return True
+    return False
+
+def get_cookie_value(cookie_uuid, name):
+    return r_serv_onion.hget('cookiejar:cookie:{}'.format(cookie_uuid), name)
+
+def set_cookie_value(cookie_uuid, name, value):
+    r_serv_onion.hset('cookiejar:cookie:{}'.format(cookie_uuid), name, value)
+
+def get_cookie_dict(cookie_uuid):
+    cookie_dict = {}
+    for key_name in r_serv_onion.hkeys('cookiejar:cookie:{}'.format(cookie_uuid)):
+        cookie_dict[key_name] = get_cookie_value(cookie_uuid, key_name)
+    return cookie_dict
+
+# name, value, path=None, httpOnly=None, secure=None, domain=None, text=None
+def add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict):
+    cookie_uuid = generate_uuid()
+    r_serv_onion.sadd('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid), cookie_uuid)
+    r_serv_onion.sadd('cookies:map:cookiejar:{}'.format(cookie_uuid), cookiejar_uuid)
+
+    set_cookie_value(cookie_uuid, 'name', cookie_dict['name'])
+    set_cookie_value(cookie_uuid, 'value', cookie_dict['value'])
+    if 'path' in cookie_dict:
+        set_cookie_value(cookie_uuid, 'path', cookie_dict['path'])
+    if 'httpOnly' in cookie_dict:
+        set_cookie_value(cookie_uuid, 'httpOnly', cookie_dict['httpOnly'])
+    if 'secure' in cookie_dict:
+        set_cookie_value(cookie_uuid, 'secure', cookie_dict['secure'])
+    if 'domain' in cookie_dict:
+        set_cookie_value(cookie_uuid, 'domain', cookie_dict['domain'])
+    if 'text' in cookie_dict:
+        set_cookie_value(cookie_uuid, 'text', cookie_dict['text'])
+    return cookie_uuid
+
+def add_cookies_to_cookiejar(cookiejar_uuid, l_cookies):
+    for cookie_dict in l_cookies:
+        add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict)
+
+def delete_all_cookies_from_cookiejar(cookiejar_uuid):
+    for cookie_uuid in get_cookiejar_cookies_uuid(cookiejar_uuid):
+        delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid)
+
+def delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid):
+    r_serv_onion.srem('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid), cookie_uuid)
+    r_serv_onion.srem('cookies:map:cookiejar:{}'.format(cookie_uuid), cookiejar_uuid)
+    if not exists_cookie(cookie_uuid):
+        r_serv_onion.delete('cookiejar:cookie:{}'.format(cookie_uuid))
+
+## - - ##
+## Cookies import ##       # TODO: add browser type ?
+def import_cookies_from_json(json_cookies, cookiejar_uuid):
+    for cookie in json_cookies:
+        try:
+            cookie_dict = unpack_imported_json_cookie(cookie)
+            add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict)
+        except KeyError:
+            return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}
+
+# # TODO: add text field
+def unpack_imported_json_cookie(json_cookie):
+    cookie_dict = {'name': json_cookie['Name raw'], 'value': json_cookie['Content raw']}
+    if 'Path raw' in json_cookie:
+        cookie_dict['path'] = json_cookie['Path raw']
+    if 'HTTP only raw' in json_cookie:
+        cookie_dict['httpOnly'] = json_cookie['HTTP only raw'] == 'true'
+    if 'Send for' in json_cookie:
+        cookie_dict['secure'] = json_cookie['Send for'] == 'Encrypted connections only'
+    if 'Host raw' in json_cookie:
+        url = urlparse(json_cookie['Host raw'])
+        cookie_dict['domain'] = url.netloc.split(':', 1)[0]
+    return cookie_dict
+
+def misp_cookie_import(misp_object, cookiejar_uuid):
+    pass
+## - - ##
+
+#### COOKIEJAR API ####
+def api_import_cookies_from_json(json_cookies_str, cookiejar_uuid): # # TODO: add catch
+    json_cookies = json.loads(json_cookies_str)
+    res = import_cookies_from_json(json_cookies, cookiejar_uuid)
+    if res:
+        return (res, 400)
+#### ####
+
+#### COOKIES API ####
+
+def api_get_cookiejar_cookies_uuid(cookiejar_uuid, user_id):
+    if not exist_cookiejar(cookiejar_uuid):
+        return ({'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404)
+    level = get_cookiejar_level(cookiejar_uuid)
+    if level == 0: # # TODO: check if user is admin
+        cookie_owner = get_cookiejar_owner(cookiejar_uuid)
+        if cookie_owner != user_id:
+            return ({'error': 'The access to this cookiejar is restricted'}, 403)
+    res = list(get_cookiejar_cookies_uuid(cookiejar_uuid))
+    return (res, 200)
+
+def api_get_cookiejar_cookies(cookiejar_uuid, user_id):
+    if not exist_cookiejar(cookiejar_uuid):
+        return ({'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404)
+    level = get_cookiejar_level(cookiejar_uuid)
+    if level == 0: # # TODO: check if user is admin
+        cookie_owner = get_cookiejar_owner(cookiejar_uuid)
+        if cookie_owner != user_id:
+            return ({'error': 'The access to this cookiejar is restricted'}, 403)
+    res = get_cookiejar_cookies_list(cookiejar_uuid)
+    return (res, 200)
+
+def api_get_cookies_list_select(user_id):
+    l_cookiejar = []
+    for cookies_uuid in get_global_cookiejar():
+        l_cookiejar.append('{} : {}'.format(get_cookiejar_description(cookies_uuid), cookies_uuid))
+    for cookies_uuid in get_user_cookiejar(user_id):
+        l_cookiejar.append('{} : {}'.format(get_cookiejar_description(cookies_uuid), cookies_uuid))
+    return sorted(l_cookiejar)
+#### ####
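The import path above expects the JSON produced by a browser cookie-export add-on, keyed by 'Name raw', 'Content raw', 'Host raw' and friends, exactly as read by `unpack_imported_json_cookie()`. A hedged sketch with an invented export record:

```python
# Sketch: import a browser cookie export into a cookiejar.
# The export record below is an assumption based on the keys read in
# unpack_imported_json_cookie(); missing optional keys are simply skipped.
import json
import crawlers

exported = [
    {
        'Name raw': 'sessionid',
        'Content raw': 'deadbeef',
        'Path raw': '/',
        'HTTP only raw': 'true',
        'Send for': 'Encrypted connections only',
        'Host raw': 'https://example.onion/',
    }
]

cookiejar_uuid = crawlers.create_cookiejar('admin@admin.test', level=1,
                                           description='imported from browser export')
res = crawlers.api_import_cookies_from_json(json.dumps(exported), cookiejar_uuid)
if res:                       # None on success, (error_dict, 400) on bad input
    print(res)
else:
    print(crawlers.get_cookiejar_cookies_list(cookiejar_uuid))
```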
+#### CRAWLER TASK ####
+def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
+    crawler_config = {}
+    crawler_config['depth_limit'] = depth_limit
+    crawler_config['closespider_pagecount'] = max_pages
+
+    if screenshot:
+        crawler_config['screenshot'] = True
+    else:
+        crawler_config['screenshot'] = False
+    if har:
+        crawler_config['har'] = True
+    else:
+        crawler_config['har'] = False
+
+    if user_agent:
+        crawler_config['user_agent'] = user_agent
+    if cookiejar_uuid:
+        crawler_config['cookiejar_uuid'] = cookiejar_uuid
+
+    if auto_crawler:
+        crawler_mode = 'auto'
+        crawler_config['time'] = crawler_delta
+    else:
+        crawler_mode = 'manual'
+
+    # get crawler_type
+    faup.decode(url)
+    unpack_url = faup.get()
+    ## TODO: # FIXME: remove me
+    try:
+        domain = unpack_url['domain'].decode()
+    except:
+        domain = unpack_url['domain']
+
+    ## TODO: # FIXME: remove me
+    try:
+        tld = unpack_url['tld'].decode()
+    except:
+        tld = unpack_url['tld']
+    if tld == 'onion':
+        crawler_type = 'onion'
+    else:
+        crawler_type = 'regular'
+
+    save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=url)
+    send_url_to_crawl_in_queue(crawler_mode, crawler_type, url)
+
+def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=None):
+    if crawler_mode == 'manual':
+        r_cache.set('crawler_config:{}:{}:{}'.format(crawler_mode, crawler_type, domain), json.dumps(crawler_config))
+    elif crawler_mode == 'auto':
+        r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_mode, crawler_type, domain, url), json.dumps(crawler_config))
+
+def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url):
+    r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
+    # add auto crawled url for user UI
+    if crawler_mode == 'auto':
+        r_serv_onion.sadd('auto_crawler_url:{}'.format(crawler_type), url)
+#### ####
+
+#### CRAWLER TASK API ####
+def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
+    # validate url
+    if url is None or url=='' or url=='\n':
+        return ({'error':'invalid url'}, 400)
+
+    if depth_limit:
+        try:
+            depth_limit = int(depth_limit)
+            if depth_limit < 0:
+                depth_limit = 0
+        except ValueError:
+            return ({'error':'invalid depth limit'}, 400)
+    if max_pages:
+        try:
+            max_pages = int(max_pages)
+            if max_pages < 1:
+                max_pages = 1
+        except ValueError:
+            return ({'error':'invalid max_pages limit'}, 400)
+
+    if auto_crawler:
+        try:
+            crawler_delta = int(crawler_delta)
+            if crawler_delta < 0:
+                return ({'error':'invalid delta between two passes of the crawler'}, 400)
+        except ValueError:
+            return ({'error':'invalid delta between two passes of the crawler'}, 400)
+
+    if cookiejar_uuid:
+        if not exist_cookiejar(cookiejar_uuid):
+            return ({'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404)
+        level = get_cookiejar_level(cookiejar_uuid)
+        if level == 0: # # TODO: check if user is admin
+            cookie_owner = get_cookiejar_owner(cookiejar_uuid)
+            if cookie_owner != user_id:
+                return ({'error': 'The access to this cookiejar is restricted'}, 403)
+
+    create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
+                        auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
+    return None
+#### ####
+
+def is_redirection(domain, last_url):
+    url = urlparse(last_url)
+    last_domain = url.netloc
+    last_domain = last_domain.split('.')
+    last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1])
+    return domain != last_domain
+
+# domain up
+def create_domain_metadata(domain_type, domain, current_port, date, date_month):
+    # Add to global set
+    r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain)
+    r_serv_onion.sadd('full_{}_up'.format(domain_type), domain)
+    r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain)
+
+    # create onion metadata
+    if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)):
+        r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date)
+    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date)
+
+    # Update domain port number
+    all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports')
+    if all_domain_ports:
+        all_domain_ports = all_domain_ports.split(';')
+    else:
+        all_domain_ports = []
+    if current_port not in all_domain_ports:
+        all_domain_ports.append(current_port)
+    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports))
+
+# add root_item to history
+def add_domain_root_item(root_item, domain_type, domain, epoch_date, port):
+    # Create/Update crawler history
+    r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(domain_type, domain, port), epoch_date, root_item)
+
+def create_item_metadata(item_id, domain, url, port, item_father):
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father)
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port))
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url)
+    # add this item_id to his father
+    r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id)
+
+def create_item_id(item_dir, domain):
+    if len(domain) > 215:
+        UUID = domain[-215:]+str(uuid.uuid4())
+    else:
+        UUID = domain+str(uuid.uuid4())
+    return os.path.join(item_dir, UUID)
+
+def save_crawled_item(item_id, item_content):
+    try:
+        gzipencoded = gzip.compress(item_content.encode())
+        gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
+        return gzip64encoded
+    except:
+        print("file error: {}".format(item_id))
+        return False
+
+def save_har(har_dir, item_id, har_content):
+    if not os.path.exists(har_dir):
+        os.makedirs(har_dir)
+    item_id = item_id.split('/')[-1]
+    filename = os.path.join(har_dir, item_id + '.json')
+    with open(filename, 'w') as f:
+        f.write(json.dumps(har_content))
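To make the queueing contract concrete, here is a hedged sketch of a manual crawl task going through `api_create_crawler_task()`. The Redis key layout in the comments mirrors `save_crawler_config()` and `send_url_to_crawl_in_queue()` above; the user id and URL are placeholders.

```python
# Sketch: queue a one-shot (manual) crawl of an onion URL, optionally with a cookiejar.
import crawlers

res = crawlers.api_create_crawler_task(
    'admin@admin.test',              # placeholder user id
    'http://example.onion',
    screenshot=True, har=True,
    depth_limit=1, max_pages=100,
    cookiejar_uuid=None)             # or an existing cookiejar uuid
if res:
    print(res)                       # (error_dict, http_status)
else:
    # Internally this produced, roughly:
    #   r_cache key : crawler_config:manual:onion:example.onion   -> JSON crawler config
    #   ARDB set    : onion_crawler_priority_queue                -> "http://example.onion;manual"
    pass
```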
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index cf408d83..9ff94883 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -63,6 +63,10 @@ function main(splash, args)
            last_url = splash:url()
        }
    end
+    if reason == "http504" then
+        splash:set_result_status_code(504)
+        return ''
+    end

    splash:wait{args.wait}
    -- Page instrumentation
@@ -95,10 +99,10 @@ class TorSplashCrawler():
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
-            'RETRY_TIMES': 0,
+            'RETRY_TIMES': 2,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
            'DEPTH_LIMIT': crawler_options['depth_limit'],
-            'SPLASH_COOKIES_DEBUG': True
+            'SPLASH_COOKIES_DEBUG': False
            })

    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
@@ -139,7 +143,7 @@ class TorSplashCrawler():

        def build_request_arg(self, cookies):
            return {'wait': 10,
-                    'resource_timeout': 10,
+                    'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
                    'timeout': 30,
                    'cookies': cookies,
                    'lua_source': script_cookie
@@ -161,8 +165,9 @@ class TorSplashCrawler():
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
-                # down ?
-                print('504 detected')
+                # no response
+                #print('504 detected')
+                pass

            # LUA ERROR # # TODO: print/display errors
            elif 'error' in response.data:
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index b74bfcc6..db263fad 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -37,7 +37,11 @@ if __name__ == '__main__':
        crawler_options = crawler_json['crawler_options']
        date = crawler_json['date']
        requested_mode = crawler_json['requested']
-        cookies = crawlers.load_cookies('ccad0090-bdcb-4ba5-875b-3dae8f936216', domain, crawler_type=service_type)
+
+        if crawler_options.get('cookiejar_uuid'):
+            cookies = crawlers.load_crawler_cookies(crawler_options['cookiejar_uuid'], crawler_type=service_type)
+        else:
+            cookies = []

        redis_cache.delete('crawler_request:{}'.format(uuid))

diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 3a0fbfdb..510e93f9 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -49,13 +49,41 @@ def create_json_response(data, status_code):

# ============= ROUTES ==============
@crawler_splash.route("/crawlers/manual", methods=['GET'])
-#@login_required
-#@login_read_only
+@login_required
+@login_read_only
def manual():
    user_id = current_user.get_id()
-    l_cookies = crawlers.api_get_cookies_list(user_id)
-    return render_template("crawler_manual.html", crawler_enabled=True, l_cookies=l_cookies)
+    l_cookiejar = crawlers.api_get_cookies_list_select(user_id)
+    return render_template("crawler_manual.html", crawler_enabled=True, l_cookiejar=l_cookiejar)

+@crawler_splash.route("/crawlers/send_to_spider", methods=['POST'])
+@login_required
+@login_analyst
+def send_to_spider():
+    user_id = current_user.get_id()
+
+    # POST val
+    url = request.form.get('url_to_crawl')
+    auto_crawler = request.form.get('crawler_type')
+    crawler_delta = request.form.get('crawler_epoch')
+    screenshot = request.form.get('screenshot')
+    har = request.form.get('har')
+    depth_limit = request.form.get('depth_limit')
+    max_pages = request.form.get('max_pages')
+    cookiejar_uuid = request.form.get('cookiejar')
+
+    if cookiejar_uuid:
+        if cookiejar_uuid == 'None':
+            cookiejar_uuid = None
+        else:
+            cookiejar_uuid = cookiejar_uuid.rsplit(':')
+            cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
+
+    res = crawlers.api_create_crawler_task(user_id, url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
+                                           auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid)
+    if res:
+        return create_json_response(res[0], res[1])
+    return redirect(url_for('crawler_splash.manual'))

# add route : /crawlers/show_domain
@crawler_splash.route('/crawlers/showDomain', methods=['GET', 'POST'])
@@ -172,14 +200,14 @@ def domains_explorer_web():

## Cookiejar ##
@crawler_splash.route('/crawler/cookiejar/add', methods=['GET'])
-#@login_required
-#@login_analyst
+@login_required
+@login_analyst
def crawler_cookiejar_add():
    return render_template("add_cookiejar.html")

@crawler_splash.route('/crawler/cookiejar/add_post', methods=['POST'])
-#@login_required
-#@login_analyst
+@login_required
+@login_analyst
def crawler_cookiejar_add_post():
    user_id = current_user.get_id()

@@ -235,13 +263,13 @@ def crawler_cookiejar_show():
    user_id = current_user.get_id()
    cookiejar_uuid = request.args.get('cookiejar_uuid')

-    res = crawlers.api_get_cookiejar_cookies(cookiejar_uuid, user_id))
+    res = crawlers.api_get_cookiejar_cookies(cookiejar_uuid, user_id)
    if res[1] !=200:
        return create_json_response(res[0], res[1])
    cookiejar_metadata = crawlers.get_cookiejar_metadata(cookiejar_uuid, level=False)
-    cookies = json.dumps(res[0]['json_cookies'], indent=4, sort_keys=True)
-    return render_template("show_cookiejar.html", cookiejar_metadata=cookiejar_metadata, l_cookies=res[0])
+    cookies = json.dumps(res[0], indent=4, sort_keys=True)
+    return render_template("show_cookiejar.html", cookiejar_metadata=cookiejar_metadata, l_cookies=cookies)

## - - ##
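The new `/crawlers/send_to_spider` view recovers the cookiejar UUID from the select value that `api_get_cookies_list_select()` renders as `description : uuid`. A small sketch of that round trip; it assumes, as the view does, that the description itself contains no colon.

```python
# Sketch: how the cookiejar <select> value maps back to a UUID in send_to_spider().
def parse_cookiejar_select(value):
    # 'None' is the "no cookiejar" option of the form.
    if not value or value == 'None':
        return None
    # api_get_cookies_list_select() builds "description : uuid"; keep the last segment.
    return value.rsplit(':')[-1].replace(' ', '')

assert parse_cookiejar_select('None') is None
assert parse_cookiejar_select('my jar : 61e0b1f0-0000-0000-0000-000000000000') == \
    '61e0b1f0-0000-0000-0000-000000000000'
```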
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 0b4bb2be..bab5553a 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -217,18 +217,6 @@ def get_crawler_splash_status(type):

    return crawler_metadata

-def create_crawler_config(mode, service_type, crawler_config, domain, url=None):
-    if mode == 'manual':
-        r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
-    elif mode == 'auto':
-        r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url), json.dumps(crawler_config))
-
-def send_url_to_crawl_in_queue(mode, service_type, url):
-    r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))
-    # add auto crawled url for user UI
-    if mode == 'auto':
-        r_serv_onion.sadd('auto_crawler_url:{}'.format(service_type), url)
-
def delete_auto_crawler(url):
    domain = get_domain_from_url(url)
    type = get_type_domain(domain)
@@ -386,94 +374,6 @@ def unblacklist_domain():
    else:
        return 'Incorrect type'

-@hiddenServices.route("/crawlers/create_spider_splash", methods=['POST'])
-@login_required
-@login_analyst
-def create_spider_splash():
-    url = request.form.get('url_to_crawl')
-    automatic = request.form.get('crawler_type')
-    crawler_time = request.form.get('crawler_epoch')
-    #html = request.form.get('html_content_id')
-    screenshot = request.form.get('screenshot')
-    har = request.form.get('har')
-    depth_limit = request.form.get('depth_limit')
-    max_pages = request.form.get('max_pages')
-
-    # validate url
-    if url is None or url=='' or url=='\n':
-        return 'incorrect url'
-
-    crawler_config = {}
-
-    # verify user input
-    if automatic:
-        automatic = True
-    else:
-        automatic = False
-    if not screenshot:
-        crawler_config['png'] = 0
-    if not har:
-        crawler_config['har'] = 0
-
-    # verify user input
-    if depth_limit:
-        try:
-            depth_limit = int(depth_limit)
-            if depth_limit < 0:
-                return 'incorrect depth_limit'
-            else:
-                crawler_config['depth_limit'] = depth_limit
-        except:
-            return 'incorrect depth_limit'
-    if max_pages:
-        try:
-            max_pages = int(max_pages)
-            if max_pages < 1:
-                return 'incorrect max_pages'
-            else:
-                crawler_config['closespider_pagecount'] = max_pages
-        except:
-            return 'incorrect max_pages'
-
-    # get service_type
-    faup.decode(url)
-    unpack_url = faup.get()
-    ## TODO: # FIXME: remove me
-    try:
-        domain = unpack_url['domain'].decode()
-    except:
-        domain = unpack_url['domain']
-
-    ## TODO: # FIXME: remove me
-    try:
-        tld = unpack_url['tld'].decode()
-    except:
-        tld = unpack_url['tld']
-
-    if tld == 'onion':
-        service_type = 'onion'
-    else:
-        service_type = 'regular'
-
-    if automatic:
-        mode = 'auto'
-        try:
-            crawler_time = int(crawler_time)
-            if crawler_time < 0:
-                return 'incorrect epoch'
-            else:
-                crawler_config['time'] = crawler_time
-        except:
-            return 'incorrect epoch'
-    else:
-        mode = 'manual'
-        epoch = None
-
-    create_crawler_config(mode, service_type, crawler_config, domain, url=url)
-    send_url_to_crawl_in_queue(mode, service_type, url)
-
-    return redirect(url_for('crawler_splash.manual'))
-
@hiddenServices.route("/crawlers/auto_crawler", methods=['GET'])
@login_required
@login_read_only
diff --git a/var/www/templates/crawler/crawler_splash/crawler_manual.html b/var/www/templates/crawler/crawler_splash/crawler_manual.html
index 5a76d9b6..1072920b 100644
--- a/var/www/templates/crawler/crawler_splash/crawler_manual.html
+++ b/var/www/templates/crawler/crawler_splash/crawler_manual.html
@@ -38,7 +38,7 @@
[markup lost in extraction; this hunk replaces one line near the text "Enter a domain and choose what kind of data you want."]
@@ -109,12 +109,17 @@
[markup lost in extraction; this hunk adds a "Cookiejar:" label and selector to the manual crawler form]
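Finally, a hedged end-to-end sketch of how a cookiejar reaches Splash with this patch: `tor_crawler.py` resolves the task's `cookiejar_uuid` into cookie dicts via `load_crawler_cookies()`, and the spider forwards them in the request arguments built by `build_request_arg()`. The dict below mirrors those arguments with the new defaults; the UUID is a placeholder.

```python
# Sketch of the per-request Splash arguments assembled after this patch.
import crawlers  # bin/lib/crawlers.py added by this patch

cookiejar_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder
cookies = crawlers.load_crawler_cookies(cookiejar_uuid, crawler_type='onion')

splash_args = {
    'wait': 10,
    'resource_timeout': 30,   # new default; weird behaviour if timeout < resource_timeout
    'timeout': 30,
    'cookies': cookies,       # [] when no cookiejar is attached to the task
    'lua_source': '...',      # in the spider: the embedded Lua script, now with the http504 early return
}
```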