diff --git a/bin/Crawler.py b/bin/Crawler.py index c7051b75..4d745aad 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -351,23 +351,24 @@ if __name__ == '__main__': # get HAR files default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har") if default_crawler_har: - default_crawler_har = 1 + default_crawler_har = True else: - default_crawler_har = 0 + default_crawler_har = False # get PNG files default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png") if default_crawler_png: - default_crawler_png = 1 + default_crawler_png = True else: - default_crawler_png = 0 + default_crawler_png = False # Default crawler options - default_crawler_config = {'html': 1, + default_crawler_config = {'html': True, 'har': default_crawler_har, 'png': default_crawler_png, 'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"), 'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"), + 'cookiejar_uuid': None, 'user_agent': p.config.get("Crawler", "default_crawler_user_agent")} # Track launched crawler diff --git a/bin/lib/Screenshot.py b/bin/lib/Screenshot.py index 83d2552a..46141e30 100755 --- a/bin/lib/Screenshot.py +++ b/bin/lib/Screenshot.py @@ -1,10 +1,12 @@ #!/usr/bin/env python3 # -*-coding:UTF-8 -* +import base64 import os import sys import redis +from hashlib import sha256 from io import BytesIO sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) @@ -164,6 +166,25 @@ def get_screenshot_file_content(sha256_string): file_content = BytesIO(f.read()) return file_content +# if force save, ignore max_size +def save_crawled_screeshot(b64_screenshot, max_size, f_save=False): + screenshot_size = (len(b64_screenshot)*3) /4 + if screenshot_size < max_size or f_save: + image_content = base64.standard_b64decode(b64_screenshot.encode()) + sha256_string = sha256(image_content).hexdigest() + filepath = get_screenshot_filepath(sha256_string) + if os.path.isfile(filepath): + #print('File already exist') + return sha256_string + # create dir + dirname = os.path.dirname(filepath) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(filepath, 'wb') as f: + f.write(image_content) + return sha256_string + return False + def save_screenshot_file(sha256_string, io_content): filepath = get_screenshot_filepath(sha256_string) if os.path.isfile(filepath): diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py new file mode 100755 index 00000000..6e876349 --- /dev/null +++ b/bin/lib/crawlers.py @@ -0,0 +1,532 @@ +#!/usr/bin/python3 + +""" +API Helper +=================== + + +""" +import base64 +import gzip +import json +import os +import re +import redis +import sys +import uuid + +from datetime import datetime, timedelta +from urllib.parse import urlparse + +from pyfaup.faup import Faup + +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) +import ConfigLoader + + +config_loader = ConfigLoader.ConfigLoader() +r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") +r_serv_onion = config_loader.get_redis_conn("ARDB_Onion") +r_cache = config_loader.get_redis_conn("Redis_Cache") +config_loader = None + +faup = Faup() + +def generate_uuid(): + return str(uuid.uuid4()).replace('-', '') + +################################################################################ + +# # TODO: handle prefix cookies +# # TODO: fill empty fields +def create_cookie_crawler(cookie_dict, domain, crawler_type='regular'): + # check cookie domain filed + if not 'domain' in cookie_dict: + cookie_dict['domain'] = 
'.{}'.format(domain) + + # tor browser: disable secure cookie + if crawler_type=='onion': + cookie_dict['secure'] = False + + # force cookie domain + # url = urlparse(browser_cookie['Host raw']) + # domain = url.netloc.split(':', 1)[0] + # cookie_dict['domain'] = '.{}'.format(domain) + + # change expire date + cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z' + return cookie_dict + +def load_crawler_cookies(cookiejar_uuid, domain, crawler_type='regular'): + cookies = get_cookiejar_cookies_list(cookiejar_uuid) + all_cookies = [] + for cookie_dict in cookies: + all_cookies.append(create_cookie_crawler(cookie_dict, domain, crawler_type=crawler_type)) + return all_cookies + +################################################################################ + +def get_all_cookiejar(): + r_serv_onion.smembers('cookiejar:all') + +def get_global_cookiejar(): + res = r_serv_onion.smembers('cookiejar:global') + if not res: + res = [] + return res + +def get_user_cookiejar(user_id): + res = r_serv_onion.smembers('cookiejar:user:{}'.format(user_id)) + if not res: + res = [] + return res + +def exist_cookiejar(cookiejar_uuid): + return r_serv_onion.exists('cookiejar_metadata:{}'.format(cookiejar_uuid)) + +def create_cookiejar(user_id, level=1, description=None): + cookiejar_uuid = str(uuid.uuid4()) + + r_serv_onion.sadd('cookiejar:all', cookiejar_uuid) + if level==0: + r_serv_onion.sadd('cookiejar:user:{}'.format(user_id), cookiejar_uuid) + else: + r_serv_onion.sadd('cookiejar:global', cookiejar_uuid) + # metadata + r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'user_id', user_id) + r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'level', level) + r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'description', description) + r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'date', datetime.now().strftime("%Y%m%d")) + + # if json_cookies: + # json_cookies = json.loads(json_cookies) # # TODO: catch Exception + # r_serv_onion.set('cookies:json_cookies:{}'.format(cookies_uuid), json.dumps(json_cookies)) + # + # for cookie_dict in l_cookies: + # r_serv_onion.hset('cookies:manual_cookies:{}'.format(cookies_uuid), cookie_dict['name'], cookie_dict['value']) + return cookiejar_uuid + +def delete_cookie_jar(cookiejar_uuid): + level = get_cookiejar_level(cookiejar_uuid) + if level == 0: + user_id = get_cookiejar_owner(cookiejar_uuid) + r_serv_onion.srem('cookiejar:user:{}'.format(user_id), cookiejar_uuid) + else: + r_serv_onion.srem('cookiejar:global', cookiejar_uuid) + + r_serv_onion.delete('cookiejar_metadata:{}'.format(cookiejar_uuid)) + +def get_cookiejar_cookies_uuid(cookiejar_uuid): + res = r_serv_onion.smembers('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid)) + if not res: + res = [] + return res + +def get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=False): + l_cookiejar = [] + for cookie_uuid in get_cookiejar_cookies_uuid(cookiejar_uuid): + if add_cookie_uuid: + l_cookiejar.append((get_cookie_dict(cookie_uuid), cookie_uuid)) + else: + l_cookiejar.append(get_cookie_dict(cookie_uuid)) + return l_cookiejar + +## Cookiejar metadata ## +def get_cookiejar_description(cookiejar_uuid): + return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'description') + +def get_cookiejar_date(cookiejar_uuid): + return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'date') + +def get_cookiejar_owner(cookiejar_uuid): + return 
r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'user_id') + +def get_cookiejar_date(cookiejar_uuid): + return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'date') + +def get_cookiejar_level(cookiejar_uuid): + res = r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'level') + if not res: + res = 1 + return int(res) + +def get_cookiejar_metadata(cookiejar_uuid, level=False): + dict_cookiejar = {} + if exist_cookiejar(cookiejar_uuid): + dict_cookiejar['cookiejar_uuid'] = cookiejar_uuid + dict_cookiejar['description'] = get_cookiejar_description(cookiejar_uuid) + dict_cookiejar['date'] = get_cookiejar_date(cookiejar_uuid) + dict_cookiejar['user_id'] = get_cookiejar_owner(cookiejar_uuid) + if level: + dict_cookiejar['level'] = get_cookies_level(cookiejar_uuid) + return dict_cookiejar + +def get_cookiejar_metadata_by_iterator(iter_cookiejar_uuid): + l_cookiejar_metadata = [] + for cookiejar_uuid in iter_cookiejar_uuid: + l_cookiejar_metadata.append(get_cookiejar_metadata(cookiejar_uuid)) + return l_cookiejar_metadata + +def edit_cookiejar_description(cookiejar_uuid, description): + r_serv_onion.hset('cookiejar_metadata:{}'.format(cookiejar_uuid), 'description', description) + +# # # # # # # # +# # +# COOKIES # +# # +# # # # # # # # + +# # # # +# Cookies Fields: +# - name +# - value +# - path (optional) +# - domain (optional) +# - secure (optional) +# - httpOnly (optional) +# - text (optional) +# # # # +def get_cookie_all_keys_name(): + return ['name', 'value', 'domain', 'path', 'httpOnly', 'secure'] + +def exists_cookie(cookie_uuid): + if int(r_serv_onion.scard('cookies:map:cookiejar:{}'.format(cookie_uuid))) > 0: + return True + return False + +def get_cookie_value(cookie_uuid, name): + return r_serv_onion.hget('cookiejar:cookie:{}'.format(cookie_uuid), name) + +def set_cookie_value(cookie_uuid, name, value): + r_serv_onion.hset('cookiejar:cookie:{}'.format(cookie_uuid), name, value) + +def delete_cookie_value(cookie_uuid, name): + r_serv_onion.hdel('cookiejar:cookie:{}'.format(cookie_uuid), name) + +def get_cookie_dict(cookie_uuid): + cookie_dict = {} + for key_name in r_serv_onion.hkeys('cookiejar:cookie:{}'.format(cookie_uuid)): + cookie_dict[key_name] = get_cookie_value(cookie_uuid, key_name) + return cookie_dict + +# name, value, path=None, httpOnly=None, secure=None, domain=None, text=None +def add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict): + cookie_uuid = generate_uuid() + r_serv_onion.sadd('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid), cookie_uuid) + r_serv_onion.sadd('cookies:map:cookiejar:{}'.format(cookie_uuid), cookiejar_uuid) + + set_cookie_value(cookie_uuid, 'name', cookie_dict['name']) + set_cookie_value(cookie_uuid, 'value', cookie_dict['value']) + if 'path' in cookie_dict: + set_cookie_value(cookie_uuid, 'path', cookie_dict['path']) + if 'httpOnly' in cookie_dict: + set_cookie_value(cookie_uuid, 'httpOnly', cookie_dict['httpOnly']) + if 'secure' in cookie_dict: + set_cookie_value(cookie_uuid, 'secure', cookie_dict['secure']) + if 'domain' in cookie_dict: + set_cookie_value(cookie_uuid, 'domain', cookie_dict['domain']) + if 'text' in cookie_dict: + set_cookie_value(cookie_uuid, 'text', cookie_dict['text']) + return cookie_uuid + +def add_cookies_to_cookiejar(cookiejar_uuid, l_cookies): + for cookie_dict in l_cookies: + add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict) + +def delete_all_cookies_from_cookiejar(cookiejar_uuid): + for cookie_uuid in get_cookiejar_cookies_uuid(cookiejar_uuid): + 
delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid) + +def delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid): + r_serv_onion.srem('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid), cookie_uuid) + r_serv_onion.srem('cookies:map:cookiejar:{}'.format(cookie_uuid), cookiejar_uuid) + if not exists_cookie(cookie_uuid): + r_serv_onion.delete('cookiejar:cookie:{}'.format(cookie_uuid)) + +def edit_cookie(cookiejar_uuid, cookie_uuid, cookie_dict): + # delete old keys + for key_name in r_serv_onion.hkeys('cookiejar:cookie:{}'.format(cookie_uuid)): + if key_name not in cookie_dict: + delete_cookie_value(cookie_uuid, key_name) + # add new keys + cookie_all_keys_name = get_cookie_all_keys_name() + for key_name in cookie_dict: + if key_name in cookie_all_keys_name: + set_cookie_value(cookie_uuid, key_name, cookie_dict[key_name]) + +## - - ## +## Cookies import ## # TODO: add browser type ? +def import_cookies_from_json(json_cookies, cookiejar_uuid): + for cookie in json_cookies: + try: + cookie_dict = unpack_imported_json_cookie(cookie) + add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict) + except KeyError: + return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid} + +# # TODO: add text field +def unpack_imported_json_cookie(json_cookie): + cookie_dict = {'name': json_cookie['Name raw'], 'value': json_cookie['Content raw']} + if 'Path raw' in json_cookie: + cookie_dict['path'] = json_cookie['Path raw'] + if 'httpOnly' in json_cookie: + cookie_dict['httpOnly'] = json_cookie['HTTP only raw'] == 'true' + if 'secure' in json_cookie: + cookie_dict['secure'] = json_cookie['Send for'] == 'Encrypted connections only' + if 'Host raw' in json_cookie: + url = urlparse(json_cookie['Host raw']) + cookie_dict['domain'] = url.netloc.split(':', 1)[0] + return cookie_dict + +def misp_cookie_import(misp_object, cookiejar_uuid): + pass +## - - ## +#### COOKIEJAR API #### +def api_import_cookies_from_json(json_cookies_str, cookiejar_uuid): # # TODO: add catch + json_cookies = json.loads(json_cookies_str) + res = import_cookies_from_json(json_cookies, cookiejar_uuid) + if res: + return (res, 400) +#### #### + +#### COOKIES API #### + +def api_verify_basic_cookiejar(cookiejar_uuid, user_id): + if not exist_cookiejar(cookiejar_uuid): + return ({'error': 'unknow cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404) + level = get_cookiejar_level(cookiejar_uuid) + if level == 0: # # TODO: check if user is admin + cookie_owner = get_cookiejar_owner(cookiejar_uuid) + if cookie_owner != user_id: + return ({'error': 'The access to this cookiejar is restricted'}, 403) + +def api_get_cookiejar_cookies(cookiejar_uuid, user_id): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + res = get_cookiejar_cookies_list(cookiejar_uuid) + return (res, 200) + +def api_edit_cookiejar_description(user_id, cookiejar_uuid, description): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + edit_cookiejar_description(cookiejar_uuid, description) + return ({'cookiejar_uuid': cookiejar_uuid}, 200) + +def api_get_cookiejar_cookies_with_uuid(cookiejar_uuid, user_id): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + res = get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True) + return (res, 200) + +def api_get_cookies_list_select(user_id): + l_cookiejar = [] + for cookies_uuid in get_global_cookiejar(): + l_cookiejar.append('{} : {}'.format(get_cookiejar_description(cookies_uuid), 
cookies_uuid)) + for cookies_uuid in get_user_cookiejar(user_id): + l_cookiejar.append('{} : {}'.format(get_cookiejar_description(cookies_uuid), cookies_uuid)) + return sorted(l_cookiejar) + +def api_delete_cookie_from_cookiejar(user_id, cookiejar_uuid, cookie_uuid): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid) + return ({'cookiejar_uuid': cookiejar_uuid, 'cookie_uuid': cookie_uuid}, 200) + +def api_delete_cookie_jar(user_id, cookiejar_uuid): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + delete_cookie_jar(cookiejar_uuid) + return ({'cookiejar_uuid': cookiejar_uuid}, 200) + +def api_edit_cookie(user_id, cookiejar_uuid, cookie_uuid, cookie_dict): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + if 'name' not in cookie_dict or 'value' not in cookie_dict or cookie_dict['name'] == '': + ({'error': 'cookie name or value not provided'}, 400) + edit_cookie(cookiejar_uuid, cookie_uuid, cookie_dict) + return (get_cookie_dict(cookie_uuid), 200) + +def api_create_cookie(user_id, cookiejar_uuid, cookie_dict): + res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if res: + return res + if 'name' not in cookie_dict or 'value' not in cookie_dict or cookie_dict['name'] == '': + ({'error': 'cookie name or value not provided'}, 400) + res = add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict) + return (res, 200) + +#### #### + +#### CRAWLER TASK #### +def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None): + crawler_config = {} + crawler_config['depth_limit'] = depth_limit + crawler_config['closespider_pagecount'] = max_pages + + if screenshot: + crawler_config['screenshot'] = True + else: + crawler_config['screenshot'] = False + if har: + crawler_config['har'] = True + else: + crawler_config['har'] = False + + if user_agent: + crawler_config['user_agent'] = user_agent + if cookiejar_uuid: + crawler_config['cookiejar_uuid'] = cookiejar_uuid + + if auto_crawler: + crawler_mode = 'auto' + else: + crawler_mode = 'manual' + + # get crawler_mode + faup.decode(url) + unpack_url = faup.get() + ## TODO: # FIXME: remove me + try: + domain = unpack_url['domain'].decode() + except: + domain = unpack_url['domain'] + + ## TODO: # FIXME: remove me + try: + tld = unpack_url['tld'].decode() + except: + tld = unpack_url['tld'] + if tld == 'onion': + crawler_type = 'onion' + else: + crawler_type = 'regular' + + save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=url) + send_url_to_crawl_in_queue(crawler_mode, crawler_type, url) + +def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=None): + if crawler_mode == 'manual': + r_cache.set('crawler_config:{}:{}:{}'.format(crawler_mode, crawler_type, domain), json.dumps(crawler_config)) + elif crawler_mode == 'auto': + r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_type, crawler_type, domain, url), json.dumps(crawler_config)) + +def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url): + r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode)) + # add auto crawled url for user UI + if crawler_mode == 'auto': + r_serv_onion.sadd('auto_crawler_url:{}'.format(crawler_type), url) + +#### #### +#### CRAWLER TASK API #### +def api_create_crawler_task(user_id, url, screenshot=True, 
har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None): + # validate url + if url is None or url=='' or url=='\n': + return ({'error':'invalid depth limit'}, 400) + + if depth_limit: + try: + depth_limit = int(depth_limit) + if depth_limit < 0: + depth_limit = 0 + except ValueError: + return ({'error':'invalid depth limit'}, 400) + if max_pages: + try: + max_pages = int(max_pages) + if max_pages < 1: + max_pages = 1 + except ValueError: + return ({'error':'invalid max_pages limit'}, 400) + + if auto_crawler: + try: + crawler_time = int(crawler_time) + if crawler_time < 0: + return ({'error':'invalid delta bettween two pass of the crawler'}, 400) + except ValueError: + return ({'error':'invalid delta bettween two pass of the crawler'}, 400) + + if cookiejar_uuid: + if not exist_cookiejar(cookiejar_uuid): + return ({'error': 'unknow cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404) + level = get_cookiejar_level(cookiejar_uuid) + if level == 0: # # TODO: check if user is admin + cookie_owner = get_cookiejar_owner(cookiejar_uuid) + if cookie_owner != user_id: + return ({'error': 'The access to this cookiejar is restricted'}, 403) + + create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, + auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent) + return None +#### #### + +def is_redirection(domain, last_url): + url = urlparse(last_url) + last_domain = url.netloc + last_domain = last_domain.split('.') + last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1]) + return domain != last_domain + +# domain up +def create_domain_metadata(domain_type, domain, current_port, date, date_month): + # Add to global set + r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain) + r_serv_onion.sadd('full_{}_up'.format(domain_type), domain) + r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain) + + # create onion metadata + if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)): + r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date) + r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date) + + # Update domain port number + all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports') + if all_domain_ports: + all_domain_ports = all_domain_ports.split(';') + else: + all_domain_ports = [] + if current_port not in all_domain_ports: + all_domain_ports.append(current_port) + r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports)) + +# add root_item to history +def add_domain_root_item(root_item, domain_type, domain, epoch_date, port): + # Create/Update crawler history + r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(domain_type, domain, port), epoch_date, root_item) + +def create_item_metadata(item_id, domain, url, port, item_father): + r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father) + r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port)) + r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url) + # add this item_id to his father + r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id) + +def create_item_id(item_dir, domain): + if len(domain) > 215: + UUID = domain[-215:]+str(uuid.uuid4()) + else: + UUID = domain+str(uuid.uuid4()) + return 
os.path.join(item_dir, UUID) + +def save_crawled_item(item_id, item_content): + try: + gzipencoded = gzip.compress(item_content.encode()) + gzip64encoded = base64.standard_b64encode(gzipencoded).decode() + return gzip64encoded + except: + print("file error: {}".format(item_id)) + return False + +def save_har(har_dir, item_id, har_content): + if not os.path.exists(har_dir): + os.makedirs(har_dir) + item_id = item_id.split('/')[-1] + filename = os.path.join(har_dir, item_id + '.json') + with open(filename, 'w') as f: + f.write(json.dumps(har_content)) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index e505ab63..9ff94883 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -3,11 +3,8 @@ import os import sys -import gzip -import base64 import uuid import datetime -import base64 import redis import json import time @@ -23,15 +20,73 @@ from scrapy import Spider from scrapy.linkextractors import LinkExtractor from scrapy.crawler import CrawlerProcess, Crawler -from scrapy_splash import SplashRequest +from scrapy_splash import SplashRequest, SplashJsonResponse sys.path.append(os.environ['AIL_BIN']) from Helper import Process +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) +#import ConfigLoader +import Screenshot +import crawlers + +script_cookie = """ +function main(splash, args) + -- Default values + splash.js_enabled = true + splash.private_mode_enabled = true + splash.images_enabled = true + splash.webgl_enabled = true + splash.media_source_enabled = true + + -- Force enable things + splash.plugins_enabled = true + splash.request_body_enabled = true + splash.response_body_enabled = true + + splash.indexeddb_enabled = true + splash.html5_media_enabled = true + splash.http2_enabled = true + + -- User defined + splash.resource_timeout = args.resource_timeout + splash.timeout = args.timeout + + -- Allow to pass cookies + splash:init_cookies(args.cookies) + + -- Run + ok, reason = splash:go{args.url} + if not ok and not reason:find("http") then + return { + error = reason, + last_url = splash:url() + } + end + if reason == "http504" then + splash:set_result_status_code(504) + return '' + end + + splash:wait{args.wait} + -- Page instrumentation + -- splash.scroll_position = {y=1000} + splash:wait{args.wait} + -- Response + return { + har = splash:har(), + html = splash:html(), + png = splash:png{render_all=true}, + cookies = splash:get_cookies(), + last_url = splash:url() + } +end +""" + class TorSplashCrawler(): def __init__(self, splash_url, crawler_options): - self.process = CrawlerProcess({'LOG_ENABLED': False}) + self.process = CrawlerProcess({'LOG_ENABLED': True}) self.crawler = Crawler(self.TorSplashSpider, { 'USER_AGENT': crawler_options['user_agent'], 'SPLASH_URL': splash_url, @@ -39,24 +94,26 @@ class TorSplashCrawler(): 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, 'scrapy_splash.SplashMiddleware': 725, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, + 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, }, 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'HTTPERROR_ALLOW_ALL': True, 'RETRY_TIMES': 2, 'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'], - 'DEPTH_LIMIT': crawler_options['depth_limit'] + 'DEPTH_LIMIT': crawler_options['depth_limit'], + 'SPLASH_COOKIES_DEBUG': False }) - def crawl(self, type, crawler_options, date, 
requested_mode, url, domain, port, original_item): - self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, original_item=original_item) + def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item): + self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item) self.process.start() class TorSplashSpider(Spider): name = 'TorSplashSpider' - def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): - self.type = type + def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs): + self.domain_type = type self.requested_mode = requested_mode self.original_item = original_item self.root_key = None @@ -68,166 +125,101 @@ class TorSplashCrawler(): self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) - # # TODO: timeout in config - self.arg_crawler = { 'html': crawler_options['html'], - 'wait': 10, - 'render_all': 1, - 'timeout': 30, - 'har': crawler_options['har'], - 'png': crawler_options['png']} + self.png = crawler_options['png'] + self.har = crawler_options['har'] + self.cookies = cookies config_section = 'Crawler' self.p = Process(config_section) - - self.r_cache = redis.StrictRedis( - host=self.p.config.get("Redis_Cache", "host"), - port=self.p.config.getint("Redis_Cache", "port"), - db=self.p.config.getint("Redis_Cache", "db"), - decode_responses=True) - + self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) + self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) - self.r_serv_metadata = redis.StrictRedis( - host=self.p.config.get("ARDB_Metadata", "host"), - port=self.p.config.getint("ARDB_Metadata", "port"), - db=self.p.config.getint("ARDB_Metadata", "db"), - decode_responses=True) + self.root_key = None - self.r_serv_onion = redis.StrictRedis( - host=self.p.config.get("ARDB_Onion", "host"), - port=self.p.config.getint("ARDB_Onion", "port"), - db=self.p.config.getint("ARDB_Onion", "db"), - decode_responses=True) - - self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) - - self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), - self.p.config.get("Directories", "crawled"), date_str ) - - self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) - self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") ) + def build_request_arg(self, cookies): + return {'wait': 10, + 'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\ + 'timeout': 30, + 'cookies': cookies, + 'lua_source': script_cookie + } def start_requests(self): + l_cookies = self.build_request_arg(self.cookies) yield SplashRequest( self.start_urls, self.parse, errback=self.errback_catcher, - endpoint='render.json', - meta={'father': self.original_item, 
'root_key': None}, - args=self.arg_crawler + endpoint='execute', + meta={'father': self.original_item}, + args=l_cookies ) + # # TODO: remove duplicate and anchor def parse(self,response): #print(response.headers) #print(response.status) if response.status == 504: - # down ? - print('504 detected') + # no response + #print('504 detected') + pass + + # LUA ERROR # # TODO: print/display errors + elif 'error' in response.data: + if(response.data['error'] == 'network99'): + print('Connection to proxy refused') + else: + print(response.data['error']) + elif response.status != 200: print('other response: {}'.format(response.status)) - #print(error_log) - #detect connection to proxy refused + # detect connection to proxy refused error_log = (json.loads(response.body.decode())) - if(error_log['info']['text'] == 'Connection to proxy refused'): - print('Connection to proxy refused') + print(error_log) + #elif crawlers.is_redirection(self.domains[0], response.data['last_url']): + # pass # ignore response else: - #avoid filename too big - if len(self.domains[0]) > 215: - UUID = self.domains[0][-215:]+str(uuid.uuid4()) + item_id = crawlers.create_item_id(self.item_dir, self.domains[0]) + self.save_crawled_item(item_id, response.data['html']) + crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father']) + + if self.root_key is None: + self.root_key = item_id + crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port) + crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month) + + if 'cookies' in response.data: + all_cookies = response.data['cookies'] else: - UUID = self.domains[0]+str(uuid.uuid4()) - filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID) - relative_filename_paste = os.path.join(self.crawler_path, UUID) - filename_har = os.path.join(self.crawled_har, UUID) + all_cookies = [] - # # TODO: modify me - # save new paste on disk - if self.save_crawled_paste(relative_filename_paste, response.data['html']): + # SCREENSHOT + if 'png' in response.data: + sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode) + if sha256_string: + Screenshot.save_item_relationship(sha256_string, item_id) + Screenshot.save_domain_relationship(sha256_string, self.domains[0]) + # HAR + if 'har' in response.data: + crawlers.save_har(self.har_dir, item_id, response.data['har']) - # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
- #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) - - self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) - self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) - self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) - - # create onion metadata - if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) - - # create root_key - if self.root_key is None: - self.root_key = relative_filename_paste - # Create/Update crawler history - self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key) - # Update domain port number - all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports') - if all_domain_ports: - all_domain_ports = all_domain_ports.split(';') - else: - all_domain_ports = [] - if self.port not in all_domain_ports: - all_domain_ports.append(self.port) - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports)) - - #create paste metadata - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key) - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father']) - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port)) - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url) - - self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste) - - if 'png' in response.data: - size_screenshot = (len(response.data['png'])*3) /4 - - if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto - image_content = base64.standard_b64decode(response.data['png'].encode()) - hash = sha256(image_content).hexdigest() - img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) - filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png') - dirname = os.path.dirname(filename_img) - if not os.path.exists(dirname): - os.makedirs(dirname) - if not os.path.exists(filename_img): - with open(filename_img, 'wb') as f: - f.write(image_content) - # add item metadata - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash) - # add sha256 metadata - self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste) - # domain map - self.r_serv_onion.sadd('domain_screenshot:{}'.format(self.domains[0]), hash) - self.r_serv_onion.sadd('screenshot_domain:{}'.format(hash), self.domains[0]) - - if 'har' in response.data: - dirname = os.path.dirname(filename_har) - if not os.path.exists(dirname): - os.makedirs(dirname) - with open(filename_har+'.json', 'wb') as f: - f.write(json.dumps(response.data['har']).encode()) - - # save external links in set - #lext = LinkExtractor(deny_domains=self.domains, unique=True) - #for link in lext.extract_links(response): - # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) - # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) - - le = 
LinkExtractor(allow_domains=self.domains, unique=True) - for link in le.extract_links(response): - yield SplashRequest( - link.url, - self.parse, - errback=self.errback_catcher, - endpoint='render.json', - meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']}, - args=self.arg_crawler - ) + le = LinkExtractor(allow_domains=self.domains, unique=True) + for link in le.extract_links(response): + l_cookies = self.build_request_arg(all_cookies) + yield SplashRequest( + link.url, + self.parse, + errback=self.errback_catcher, + endpoint='execute', + meta={'father': item_id}, + args=l_cookies + ) def errback_catcher(self, failure): # catch all errback failures, @@ -235,7 +227,7 @@ class TorSplashCrawler(): if failure.check(ResponseNeverReceived): request = failure.request - url = request.meta['splash']['args']['url'] + url= response.data['last_url'] father = request.meta['father'] self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url) @@ -248,62 +240,28 @@ class TorSplashCrawler(): url, self.parse, errback=self.errback_catcher, - endpoint='render.json', - meta={'father': father, 'root_key': response.meta['root_key']}, - args=self.arg_crawler + endpoint='execute', + cache_args=['lua_source'], + meta={'father': father}, + args=self.build_request_arg(response.cookiejar) ) else: print('failure') #print(failure) print(failure.type) - #print(failure.request.meta['item']) - ''' - #if isinstance(failure.value, HttpError): - elif failure.check(HttpError): - # you can get the response - response = failure.value.response - print('HttpError') - self.logger.error('HttpError on %s', response.url) - - #elif isinstance(failure.value, DNSLookupError): - elif failure.check(DNSLookupError): - # this is the original request - request = failure.request - print(DNSLookupError) - print('DNSLookupError') - self.logger.error('DNSLookupError on %s', request.url) - - #elif isinstance(failure.value, TimeoutError): - elif failure.check(TimeoutError): - request = failure.request - print('TimeoutError') - print(TimeoutError) - self.logger.error('TimeoutError on %s', request.url) - ''' - - def save_crawled_paste(self, filename, content): - - if os.path.isfile(filename): - print('File: {} already exist in submitted pastes'.format(filename)) - return False - - try: - gzipencoded = gzip.compress(content.encode()) - gzip64encoded = base64.standard_b64encode(gzipencoded).decode() - except: - print("file error: {}".format(filename)) - return False + def save_crawled_item(self, item_id, item_content): + gzip64encoded = crawlers.save_crawled_item(item_id, item_content) + # Send item to queue # send paste to Global - relay_message = "{0} {1}".format(filename, gzip64encoded) + relay_message = "{0} {1}".format(item_id, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste - msg = 'infoleak:submission="crawler";{}'.format(filename) + msg = 'infoleak:submission="crawler";{}'.format(item_id) self.p.populate_set_out(msg, 'Tags') - return True diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py index ccb645a0..f060482b 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -9,6 +9,7 @@ from TorSplashCrawler import TorSplashCrawler sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) import ConfigLoader +import crawlers if __name__ == '__main__': @@ -37,7 +38,12 @@ if __name__ == '__main__': date = 
crawler_json['date'] requested_mode = crawler_json['requested'] + if crawler_options['cookiejar_uuid']: + cookies = crawlers.load_crawler_cookies(crawler_options['cookiejar_uuid'], domain, crawler_type=service_type) + else: + cookies = [] + redis_cache.delete('crawler_request:{}'.format(uuid)) crawler = TorSplashCrawler(splash_url, crawler_options) - crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, original_item) + crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item) diff --git a/doc/screenshots/crawler-cookie-edit.png b/doc/screenshots/crawler-cookie-edit.png new file mode 100644 index 00000000..a8cc1cb7 Binary files /dev/null and b/doc/screenshots/crawler-cookie-edit.png differ diff --git a/doc/screenshots/crawler-cookiejar-all.png b/doc/screenshots/crawler-cookiejar-all.png new file mode 100644 index 00000000..291d70dd Binary files /dev/null and b/doc/screenshots/crawler-cookiejar-all.png differ diff --git a/doc/screenshots/crawler-cookiejar-create.png b/doc/screenshots/crawler-cookiejar-create.png new file mode 100644 index 00000000..22281d5f Binary files /dev/null and b/doc/screenshots/crawler-cookiejar-create.png differ diff --git a/doc/screenshots/crawler-cookiejar-domain-crawled.png b/doc/screenshots/crawler-cookiejar-domain-crawled.png new file mode 100644 index 00000000..e49dbfbf Binary files /dev/null and b/doc/screenshots/crawler-cookiejar-domain-crawled.png differ diff --git a/doc/screenshots/crawler-cookiejar-edit.png b/doc/screenshots/crawler-cookiejar-edit.png new file mode 100644 index 00000000..a5c736b6 Binary files /dev/null and b/doc/screenshots/crawler-cookiejar-edit.png differ diff --git a/doc/screenshots/crawler-manual-crawler.png b/doc/screenshots/crawler-manual-crawler.png new file mode 100644 index 00000000..ad206bf3 Binary files /dev/null and b/doc/screenshots/crawler-manual-crawler.png differ diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index 7acecfbe..d2d3c65a 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -10,7 +10,7 @@ import sys import json import random -from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response +from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, make_response from flask_login import login_required, current_user, login_user, logout_user sys.path.append('modules') @@ -25,6 +25,7 @@ import Tag sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) import Domain +import crawlers r_cache = Flask_config.r_cache r_serv_db = Flask_config.r_serv_db @@ -43,7 +44,47 @@ def api_validator(api_response): if api_response: return Response(json.dumps(api_response[0], indent=2, sort_keys=True), mimetype='application/json'), api_response[1] +def create_json_response(data, status_code): + return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code + # ============= ROUTES ============== +@crawler_splash.route("/crawlers/manual", methods=['GET']) +@login_required +@login_read_only +def manual(): + user_id = current_user.get_id() + l_cookiejar = crawlers.api_get_cookies_list_select(user_id) + return render_template("crawler_manual.html", crawler_enabled=True, l_cookiejar=l_cookiejar) + +@crawler_splash.route("/crawlers/send_to_spider", methods=['POST']) +@login_required +@login_analyst +def send_to_spider(): + user_id = current_user.get_id() + 
+ # POST val + url = request.form.get('url_to_crawl') + auto_crawler = request.form.get('crawler_type') + crawler_delta = request.form.get('crawler_epoch') + screenshot = request.form.get('screenshot') + har = request.form.get('har') + depth_limit = request.form.get('depth_limit') + max_pages = request.form.get('max_pages') + cookiejar_uuid = request.form.get('cookiejar') + + if cookiejar_uuid: + if cookiejar_uuid == 'None': + cookiejar_uuid = None + else: + cookiejar_uuid = cookiejar_uuid.rsplit(':') + cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '') + + res = crawlers.api_create_crawler_task(user_id, url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, + auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid) + if res: + return create_json_response(res[0], res[1]) + return redirect(url_for('crawler_splash.manual')) + # add route : /crawlers/show_domain @crawler_splash.route('/crawlers/showDomain', methods=['GET', 'POST']) @login_required @@ -156,3 +197,210 @@ def domains_explorer_web(): dict_data = Domain.get_domains_up_by_filers('regular', page=page, date_from=date_from, date_to=date_to) return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular') + +## Cookiejar ## +@crawler_splash.route('/crawler/cookiejar/add', methods=['GET']) +@login_required +@login_analyst +def crawler_cookiejar_add(): + return render_template("add_cookiejar.html") + +@crawler_splash.route('/crawler/cookiejar/add_post', methods=['POST']) +@login_required +@login_analyst +def crawler_cookiejar_add_post(): + user_id = current_user.get_id() + + description = request.form.get('description') + level = request.form.get('level') + if level: + level = 1 + else: + level = 0 + + if 'file' in request.files: + file = request.files['file'] + json_cookies = file.read().decode() + else: + json_cookies = None + + # Get cookies to add + l_manual_cookie = [] + l_invalid_cookie = [] + for obj_tuple in list(request.form): + l_input = request.form.getlist(obj_tuple) + if len(l_input) == 2: + if l_input[0]: # cookie_name + cookie_dict = {'name': l_input[0], 'value': l_input[1]} + l_manual_cookie.append(cookie_dict) + elif l_input[1]: # cookie_value + l_invalid_cookie.append({'name': '', 'value': l_input[1]}) + if l_invalid_cookie: + return create_json_response({'error': 'invalid cookie', 'invalid fileds': l_invalid_cookie}, 400) + + cookiejar_uuid = crawlers.create_cookiejar(user_id, level=level, description=description) + if json_cookies: + res = crawlers.api_import_cookies_from_json(json_cookies, cookiejar_uuid) + if res: + return create_json_response(res[0], res[1]) + if l_manual_cookie: + crawlers.add_cookies_to_cookiejar(cookiejar_uuid, l_manual_cookie) + + return redirect(url_for('crawler_splash.crawler_cookiejar_show', cookiejar_uuid=cookiejar_uuid)) + +@crawler_splash.route('/crawler/cookiejar/all', methods=['GET']) +#@login_required +#@login_read_only +def crawler_cookiejar_all(): + user_id = current_user.get_id() + user_cookiejar = crawlers.get_cookiejar_metadata_by_iterator(crawlers.get_user_cookiejar(user_id)) + global_cookiejar = crawlers.get_cookiejar_metadata_by_iterator(crawlers.get_global_cookiejar()) + return render_template("all_cookiejar.html", user_cookiejar=user_cookiejar, global_cookiejar=global_cookiejar) + +@crawler_splash.route('/crawler/cookiejar/show', methods=['GET']) +#@login_required +#@login_read_only +def crawler_cookiejar_show(): + user_id = current_user.get_id() + 
cookiejar_uuid = request.args.get('cookiejar_uuid') + + res = crawlers.api_get_cookiejar_cookies_with_uuid(cookiejar_uuid, user_id) + if res[1] !=200: + return create_json_response(res[0], res[1]) + + cookiejar_metadata = crawlers.get_cookiejar_metadata(cookiejar_uuid, level=False) + + l_cookies = [] + l_cookie_uuid = [] + for cookie in res[0]: + l_cookies.append(json.dumps(cookie[0], indent=4, sort_keys=True)) + l_cookie_uuid.append(cookie[1]) + return render_template("show_cookiejar.html", cookiejar_uuid=cookiejar_uuid, cookiejar_metadata=cookiejar_metadata, + l_cookies=l_cookies, l_cookie_uuid=l_cookie_uuid) + +@crawler_splash.route('/crawler/cookiejar/cookie/delete', methods=['GET']) +#@login_required +#@login_read_only +def crawler_cookiejar_cookie_delete(): + user_id = current_user.get_id() + cookiejar_uuid = request.args.get('cookiejar_uuid') + cookie_uuid = request.args.get('cookie_uuid') + + res = crawlers.api_delete_cookie_from_cookiejar(user_id, cookiejar_uuid, cookie_uuid) + if res[1] !=200: + return create_json_response(res[0], res[1]) + return redirect(url_for('crawler_splash.crawler_cookiejar_show', cookiejar_uuid=cookiejar_uuid)) + +@crawler_splash.route('/crawler/cookiejar/delete', methods=['GET']) +#@login_required +#@login_read_only +def crawler_cookiejar_delete(): + user_id = current_user.get_id() + cookiejar_uuid = request.args.get('cookiejar_uuid') + + res = crawlers.api_delete_cookie_jar(user_id, cookiejar_uuid) + if res[1] !=200: + return create_json_response(res[0], res[1]) + return redirect(url_for('crawler_splash.crawler_cookiejar_all')) + +@crawler_splash.route('/crawler/cookiejar/edit', methods=['GET']) +@login_required +@login_read_only +def crawler_cookiejar_edit(): + user_id = current_user.get_id() + cookiejar_uuid = request.args.get('cookiejar_uuid') + description = request.args.get('description') + + res = crawlers.api_edit_cookiejar_description(user_id, cookiejar_uuid, description) + return create_json_response(res[0], res[1]) + +@crawler_splash.route('/crawler/cookiejar/cookie/edit', methods=['GET']) +@login_required +@login_read_only +def crawler_cookiejar_cookie_edit(): + user_id = current_user.get_id() + cookiejar_uuid = request.args.get('cookiejar_uuid') + cookie_uuid = request.args.get('cookie_uuid') + + cookie_dict = crawlers.get_cookie_dict(cookie_uuid) + return render_template("edit_cookie.html", cookiejar_uuid=cookiejar_uuid, cookie_uuid=cookie_uuid, cookie_dict=cookie_dict) + +@crawler_splash.route('/crawler/cookiejar/cookie/edit_post', methods=['POST']) +@login_required +@login_read_only +def crawler_cookiejar_cookie_edit_post(): + user_id = current_user.get_id() + cookiejar_uuid = request.form.get('cookiejar_uuid') + cookie_uuid = request.form.get('cookie_uuid') + name = request.form.get('name') + value = request.form.get('value') + domain = request.form.get('domain') + path = request.form.get('path') + httpOnly = request.form.get('httpOnly') + secure = request.form.get('secure') + + cookie_dict = {'name': name, 'value': value} + if domain: + cookie_dict['domain'] = domain + if path: + cookie_dict['path'] = path + if httpOnly: + cookie_dict['httpOnly'] = True + if secure: + cookie_dict['secure'] = True + + res = crawlers.api_edit_cookie(user_id, cookiejar_uuid, cookie_uuid, cookie_dict) + if res[1] != 200: + return create_json_response(res[0], res[1]) + return redirect(url_for('crawler_splash.crawler_cookiejar_show', cookiejar_uuid=cookiejar_uuid)) + +@crawler_splash.route('/crawler/cookiejar/cookie/add', methods=['GET']) +@login_required 
+@login_read_only +def crawler_cookiejar_cookie_add(): + user_id = current_user.get_id() + cookiejar_uuid = request.args.get('cookiejar_uuid') + return render_template("add_cookie.html", cookiejar_uuid=cookiejar_uuid) + +@crawler_splash.route('/crawler/cookiejar/cookie/manual_add_post', methods=['POST']) +@login_required +@login_read_only +def crawler_cookiejar_cookie_manual_add_post(): + user_id = current_user.get_id() + cookiejar_uuid = request.form.get('cookiejar_uuid') + name = request.form.get('name') + value = request.form.get('value') + domain = request.form.get('domain') + path = request.form.get('path') + httpOnly = request.form.get('httpOnly') + secure = request.form.get('secure') + + cookie_dict = {'name': name, 'value': value} + if domain: + cookie_dict['domain'] = domain + if path: + cookie_dict['path'] = path + if httpOnly: + cookie_dict['httpOnly'] = True + if secure: + cookie_dict['secure'] = True + + return redirect(url_for('crawler_splash.crawler_cookiejar_show', cookiejar_uuid=cookiejar_uuid)) + +@crawler_splash.route('/crawler/cookiejar/cookie/json_add_post', methods=['POST']) +@login_required +@login_read_only +def crawler_cookiejar_cookie_json_add_post(): + user_id = current_user.get_id() + cookiejar_uuid = request.form.get('cookiejar_uuid') + + if 'file' in request.files: + file = request.files['file'] + json_cookies = file.read().decode() + if json_cookies: + res = crawlers.api_import_cookies_from_json(json_cookies, cookiejar_uuid) + return redirect(url_for('crawler_splash.crawler_cookiejar_show', cookiejar_uuid=cookiejar_uuid)) + + return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', cookiejar_uuid=cookiejar_uuid)) + +## - - ## diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 0db6bbe6..bab5553a 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -30,6 +30,9 @@ r_serv_metadata = Flask_config.r_serv_metadata crawler_enabled = Flask_config.crawler_enabled bootstrap_label = Flask_config.bootstrap_label +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) +import crawlers + hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') faup = Faup() @@ -214,18 +217,6 @@ def get_crawler_splash_status(type): return crawler_metadata -def create_crawler_config(mode, service_type, crawler_config, domain, url=None): - if mode == 'manual': - r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config)) - elif mode == 'auto': - r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url), json.dumps(crawler_config)) - -def send_url_to_crawl_in_queue(mode, service_type, url): - r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode)) - # add auto crawled url for user UI - if mode == 'auto': - r_serv_onion.sadd('auto_crawler_url:{}'.format(service_type), url) - def delete_auto_crawler(url): domain = get_domain_from_url(url) type = get_type_domain(domain) @@ -257,12 +248,6 @@ def dashboard(): crawler_metadata_regular=crawler_metadata_regular, statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) -@hiddenServices.route("/crawlers/manual", methods=['GET']) -@login_required -@login_read_only -def manual(): - return render_template("Crawler_Splash_manual.html", crawler_enabled=crawler_enabled) - @hiddenServices.route("/crawlers/crawler_splash_onion", 
methods=['GET']) @login_required @login_read_only @@ -389,94 +374,6 @@ def unblacklist_domain(): else: return 'Incorrect type' -@hiddenServices.route("/crawlers/create_spider_splash", methods=['POST']) -@login_required -@login_analyst -def create_spider_splash(): - url = request.form.get('url_to_crawl') - automatic = request.form.get('crawler_type') - crawler_time = request.form.get('crawler_epoch') - #html = request.form.get('html_content_id') - screenshot = request.form.get('screenshot') - har = request.form.get('har') - depth_limit = request.form.get('depth_limit') - max_pages = request.form.get('max_pages') - - # validate url - if url is None or url=='' or url=='\n': - return 'incorrect url' - - crawler_config = {} - - # verify user input - if automatic: - automatic = True - else: - automatic = False - if not screenshot: - crawler_config['png'] = 0 - if not har: - crawler_config['har'] = 0 - - # verify user input - if depth_limit: - try: - depth_limit = int(depth_limit) - if depth_limit < 0: - return 'incorrect depth_limit' - else: - crawler_config['depth_limit'] = depth_limit - except: - return 'incorrect depth_limit' - if max_pages: - try: - max_pages = int(max_pages) - if max_pages < 1: - return 'incorrect max_pages' - else: - crawler_config['closespider_pagecount'] = max_pages - except: - return 'incorrect max_pages' - - # get service_type - faup.decode(url) - unpack_url = faup.get() - ## TODO: # FIXME: remove me - try: - domain = unpack_url['domain'].decode() - except: - domain = unpack_url['domain'] - - ## TODO: # FIXME: remove me - try: - tld = unpack_url['tld'].decode() - except: - tld = unpack_url['tld'] - - if tld == 'onion': - service_type = 'onion' - else: - service_type = 'regular' - - if automatic: - mode = 'auto' - try: - crawler_time = int(crawler_time) - if crawler_time < 0: - return 'incorrect epoch' - else: - crawler_config['time'] = crawler_time - except: - return 'incorrect epoch' - else: - mode = 'manual' - epoch = None - - create_crawler_config(mode, service_type, crawler_config, domain, url=url) - send_url_to_crawl_in_queue(mode, service_type, url) - - return redirect(url_for('hiddenServices.manual')) - @hiddenServices.route("/crawlers/auto_crawler", methods=['GET']) @login_required @login_read_only diff --git a/var/www/templates/crawler/crawler_splash/add_cookie.html b/var/www/templates/crawler/crawler_splash/add_cookie.html new file mode 100644 index 00000000..28893e05 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/add_cookie.html @@ -0,0 +1,116 @@ + + + + + AIL - Add Cookies + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ +
+
+
+
+
Add Cookie to cookiejar: {{cookiejar_uuid}}
+
+
+ + + +
+
+
+
+ +
+ + {% include 'crawler/crawler_splash/cookie_edit_block.html' %} +
+ +
+ +
+ +
+ +
+ +
Import cookies from file:
+
+ + +
+ +
+ +
+
+ +
+
+ +
+
+
+ + + + diff --git a/var/www/templates/crawler/crawler_splash/add_cookiejar.html b/var/www/templates/crawler/crawler_splash/add_cookiejar.html new file mode 100644 index 00000000..2a60b016 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/add_cookiejar.html @@ -0,0 +1,99 @@ + + + + + AIL - Add Cookies + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ +
+
+
Create Cookiejar
+
+
+ +
+ +
+
+
+
+
+
+ +
+
+
+
+ + +
+
+
+ +
+ + {% include 'crawler/crawler_splash/add_cookies_block.html' %} + +
+ +
+ +
+ +
+
+ + + +
+
+
+ + + + diff --git a/var/www/templates/crawler/crawler_splash/add_cookies_block.html b/var/www/templates/crawler/crawler_splash/add_cookies_block.html new file mode 100644 index 00000000..45a57b88 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/add_cookies_block.html @@ -0,0 +1,58 @@ +
Import cookies:
+
+ + +
+ +
+ +
+
Create cookies:
+ +
+
Cookie Name
+
Cookie Value
+
+ +
+
+
+
+
+ + + +
+ +
+ + +
+
+
+
+
+ + diff --git a/var/www/templates/crawler/crawler_splash/all_cookiejar.html b/var/www/templates/crawler/crawler_splash/all_cookiejar.html new file mode 100644 index 00000000..ccd8ba82 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/all_cookiejar.html @@ -0,0 +1,99 @@ + + + + + + + + AIL - Cookies + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ +
+
+
Your Cookiejar
+
+
+ {% with all_cookiejar=user_cookiejar, table_id='table_user'%} + {% include 'crawler/crawler_splash/table_cookiejar.html' %} + {% endwith %} +
+
+ +
+
+
Global Cookiejar
+
+
+ {% with all_cookiejar=global_cookiejar, table_id='table_global'%} + {% include 'crawler/crawler_splash/table_cookiejar.html' %} + {% endwith %} +
+
+ + + + Create Cookiejar + + +
+
+
+ + + + + diff --git a/var/www/templates/crawler/crawler_splash/cookie_edit_block.html b/var/www/templates/crawler/crawler_splash/cookie_edit_block.html new file mode 100644 index 00000000..7ef5fcdc --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/cookie_edit_block.html @@ -0,0 +1,37 @@ +
+ +
+ +
+
+
+ +
+ +
+
+
+ +
+ +
+
+
+ +
+ +
+
+ +
+ + +
+
+ + +
diff --git a/var/www/templates/crawler/crawler_splash/cookies_card_block.html b/var/www/templates/crawler/crawler_splash/cookies_card_block.html new file mode 100644 index 00000000..6ae194ea --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/cookies_card_block.html @@ -0,0 +1,36 @@ +{% for dict_cookie in l_elem %} + + {% if loop.index0 % 4 == 0 %} +
+ {% endif %} + +
+
+
+
+ + + +
+
+ + + +
+
+ +
+
+
{{dict_cookie}}
+
+
+ + {% if loop.index0 % 4 == 3 %} +
+ {% endif %} + +{% endfor %} + +{% if l_elem|length % 4 != 0 %} + +{% endif %} diff --git a/var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html b/var/www/templates/crawler/crawler_splash/crawler_manual.html similarity index 91% rename from var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html rename to var/www/templates/crawler/crawler_splash/crawler_manual.html index e2ea8ad1..1072920b 100644 --- a/var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html +++ b/var/www/templates/crawler/crawler_splash/crawler_manual.html @@ -38,7 +38,7 @@

Enter a domain and choose what kind of data you want.

-
+
@@ -108,8 +108,18 @@
-
+ +
+ Cookiejar: + +
+ + + + + + + + + + + + + + + diff --git a/var/www/templates/crawler/crawler_splash/showDomain.html b/var/www/templates/crawler/crawler_splash/showDomain.html index ecf53121..d83aee39 100644 --- a/var/www/templates/crawler/crawler_splash/showDomain.html +++ b/var/www/templates/crawler/crawler_splash/showDomain.html @@ -445,7 +445,7 @@
- +
@@ -519,11 +519,11 @@ var draw_img = false; $("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['crawler_history']['random_item']['id']}}"); $("#screenshot_link").text("{{dict_domain['crawler_history']['random_item']['link']}}"); {%else%} - var screenshot = ""; + var screenshot = ""; {%endif%} {%endif%} {%else%} -var screenshot = ""; + var screenshot = ""; {%endif%} img.src = base_url + screenshot; @@ -561,7 +561,9 @@ function img_error() { } function reload_image(new_screenshot, link, item_id) { - $("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img"); + if (screenshot) { + $("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img"); + } screenshot = new_screenshot; img.src=base_url + screenshot; diff --git a/var/www/templates/crawler/crawler_splash/show_cookiejar.html b/var/www/templates/crawler/crawler_splash/show_cookiejar.html new file mode 100644 index 00000000..8de57214 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/show_cookiejar.html @@ -0,0 +1,122 @@ + + + + + AIL - Add Cookies + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ +
+
+
+
+
Edit Cookiejar
+
+
+ + + +
+
+
+
+ {% with all_cookiejar=[cookiejar_metadata], table_id='table_cookiejar'%} + {% include 'crawler/crawler_splash/table_cookiejar.html' %} + {% endwith %} + + + + + + + + +
+ +
+ +
+
+
+ +
+
+ + {% with l_elem=l_cookies, l_cookie_uuid=l_cookie_uuid, cookiejar_uuid=cookiejar_uuid %} + {% include 'crawler/crawler_splash/cookies_card_block.html' %} + {% endwith %} + +
+
+
+ + + + diff --git a/var/www/templates/crawler/crawler_splash/table_cookiejar.html b/var/www/templates/crawler/crawler_splash/table_cookiejar.html new file mode 100644 index 00000000..0f7eb5ae --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/table_cookiejar.html @@ -0,0 +1,28 @@ + + + + + + + + + + + {% for dict_cookiejar in all_cookiejar %} + + + + + + + {% endfor %} + +
DescriptionDateUUIDUser
{{dict_cookiejar['description']}} + {%if dict_cookiejar['date']%} + {{dict_cookiejar['date'][0:4]}}/{{dict_cookiejar['date'][4:6]}}/{{dict_cookiejar['date'][6:8]}} + {%endif%} + + + {{ dict_cookiejar['cookiejar_uuid']}} + + {{dict_cookiejar['user_id']}}
diff --git a/var/www/templates/crawler/menu_sidebar.html b/var/www/templates/crawler/menu_sidebar.html index a0c37603..c14abbbe 100644 --- a/var/www/templates/crawler/menu_sidebar.html +++ b/var/www/templates/crawler/menu_sidebar.html @@ -8,7 +8,7 @@ + + + + + diff --git a/var/www/templates/import_export/export_object.html b/var/www/templates/import_export/export_object.html index ed65850c..f97a64d9 100644 --- a/var/www/templates/import_export/export_object.html +++ b/var/www/templates/import_export/export_object.html @@ -165,7 +165,6 @@ $('.add-field').click(function() { }); $('.fields').on('click', '.delete-field', function(){ - console.log($(this).parent()); $(this).parent().remove(); //$.get( "#") });
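
For context, a minimal usage sketch of the new helpers introduced in bin/lib/crawlers.py, assuming AIL_BIN is exported and the ARDB/Redis back-ends are running. The user id, domain and cookie values are placeholders; the cookie field names are the ones handled by unpack_imported_json_cookie(), and the calls match the signatures added in this diff.

#!/usr/bin/env python3
# Illustrative sketch only: drive the new cookiejar + crawler-task helpers
# from bin/lib/crawlers.py (user id, URL and cookie values are placeholders).
import json
import os
import sys

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import crawlers

user_id = 'admin@admin.test'  # placeholder user

# Browser-export style cookies, using the field names handled by
# unpack_imported_json_cookie()
json_cookies = json.dumps([{
    'Name raw': 'session',
    'Content raw': 'deadbeef',
    'Host raw': 'http://example.onion',
    'Path raw': '/',
}])

# Create a global cookiejar (level=1) and import the cookies into it
cookiejar_uuid = crawlers.create_cookiejar(user_id, level=1, description='demo cookiejar')
res = crawlers.api_import_cookies_from_json(json_cookies, cookiejar_uuid)
if res:  # (error_dict, 400) on invalid input, None on success
    print(res)

# Queue a manual crawl of the domain, attaching the cookiejar
res = crawlers.api_create_crawler_task(user_id, 'http://example.onion',
                                       screenshot=True, har=True,
                                       depth_limit=1, max_pages=10,
                                       cookiejar_uuid=cookiejar_uuid)
if res:  # (error_dict, status) on validation error, None when the task is queued
    print(res)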