diff --git a/bin/lib/Screenshot.py b/bin/lib/Screenshot.py
index 83d2552a..0bed9c9b 100755
--- a/bin/lib/Screenshot.py
+++ b/bin/lib/Screenshot.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 
+import base64
 import os
 import sys
 import redis
 
+from hashlib import sha256
 from io import BytesIO
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
@@ -164,6 +166,25 @@ def get_screenshot_file_content(sha256_string):
         file_content = BytesIO(f.read())
     return file_content
 
+# if force save, ignore max_size
+def save_crawled_screeshot(b64_screenshot, max_size, f_save=False):
+    screenshot_size = (len(b64_screenshot)*3) /4
+    if screenshot_size < max_size or f_save:
+        image_content = base64.standard_b64decode(b64_screenshot.encode())
+        sha256_string = sha256(image_content).hexdigest()
+        filepath = get_screenshot_filepath(sha256_string)
+        if os.path.isfile(filepath):
+            #print('File already exist')
+            return False
+        # create dir
+        dirname = os.path.dirname(filepath)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+        with open(filepath, 'wb') as f:
+            f.write(image_content)
+        return sha256_string
+    return False
+
 def save_screenshot_file(sha256_string, io_content):
     filepath = get_screenshot_filepath(sha256_string)
     if os.path.isfile(filepath):
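For reference, a minimal usage sketch of the new `save_crawled_screeshot()` helper (illustrative only, not part of the patch; the local PNG file is hypothetical and the 5 MB limit mirrors the crawler call site):

```python
import base64
import Screenshot  # assumes AIL_BIN/lib is on sys.path

# Splash hands the crawler a base64-encoded PNG in response.data['png'];
# here the same kind of string is built from a local file for illustration.
with open('capture.png', 'rb') as f:
    b64_png = base64.standard_b64encode(f.read()).decode()

# Saves the image under its sha256 name unless it exceeds max_size (bytes).
# Returns the sha256 string on success, False if too big or already stored.
sha256_string = Screenshot.save_crawled_screeshot(b64_png, 5000000)
if sha256_string:
    print('stored at', Screenshot.get_screenshot_filepath(sha256_string))
```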
diff --git a/bin/lib/crawler_splash.py b/bin/lib/crawler_splash.py
index 6f519738..38ea0606 100755
--- a/bin/lib/crawler_splash.py
+++ b/bin/lib/crawler_splash.py
@@ -6,54 +6,139 @@
 API Helper
 """
-
+import base64
+import gzip
 import json
 import os
 import re
 import redis
 import sys
+import uuid
 
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
+r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
+config_loader = None
 
 # # # #
 # Cookies Fields:
 #   - name
 #   - value
-#   - path
-#   - domain
+#   - path (optional)
+#   - domain (optional)
+#   - secure (optional)
+#   - httpOnly (optional)
 # # # #
-def create_cookie_dict(cookie):
-    url = urlparse(cookie['Host raw'])
-    #scheme = url.scheme
-    is_secure = cookie['Send for'] == 'Encrypted connections only'
-    if 'HTTP only raw' in cookie:
-        if cookie['HTTP only raw'] == "true":
-            is_secure = False
+def create_cookie_dict(browser_cookie=[], cookie_name=None, cookie_value=None, domain=None, crawler_type='regular'):
+    # UI created
+    if cookie_name and cookie_value and domain:
+        dict_cookie = create_cookie_dict_from_input(cookie_name, cookie_value, domain)
+    # Cookies imported from the browser
+    else:
+        dict_cookie = create_cookie_dict_from_browser(browser_cookie)
+
+    # tor browser: disable secure cookie
+    if crawler_type=='onion':
+        dict_cookie['secure'] = False
+
+    dict_cookie['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
+    return dict_cookie
+
+def create_cookie_dict_from_input(cookie_name, cookie_value, cookie_domain):
+    # WebKit use domain for cookie validation
+    return {'name': cookie_name, 'value': cookie_value, 'domain': '.{}'.format(cookie_domain)}
+
+# # TODO: handle prefix cookies
+# # TODO: fill empty fields
+def create_cookie_dict_from_browser(browser_cookie):
+    url = urlparse(browser_cookie['Host raw'])
     domain = url.netloc.split(':', 1)[0]
-    dict_cookie = {'path': cookie['Path raw'],
-                   'name': cookie['Name raw'],
-                   'httpOnly': cookie['HTTP only raw'] == 'true',
-                   'secure': is_secure,
-                   'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
+    dict_cookie = {'path': browser_cookie['Path raw'],
+                   'name': browser_cookie['Name raw'],
+                   'httpOnly': browser_cookie['HTTP only raw'] == 'true',
+                   'secure': browser_cookie['Send for'] == 'Encrypted connections only',
                    'domain': domain,
-                   'value': cookie['Content raw']
+                   'value': browser_cookie['Content raw']
                    }
     return dict_cookie
 
-def load_cookies(l_cookies):
+def load_cookies(l_cookies, domain=None, crawler_type='regular'):
     all_cookies = []
     for cookie_dict in l_cookies:
-        all_cookies.append(create_cookie_dict(cookie_dict))
+        all_cookies.append(create_cookie_dict(browser_cookie=cookie_dict, crawler_type=crawler_type))
+
     return all_cookies
 
 def get_cookies():
     l_cookies = []
     return l_cookies
 
+# domain up
+def create_domain_metadata(domain_type, domain, current_port, date, date_month):
+    # Add to global set
+    r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain)
+    r_serv_onion.sadd('full_{}_up'.format(domain_type), domain)
+    r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain)
+
+    # create onion metadata
+    if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)):
+        r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date)
+    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date)
+
+    # Update domain port number
+    all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports')
+    if all_domain_ports:
+        all_domain_ports = all_domain_ports.split(';')
+    else:
+        all_domain_ports = []
+    if current_port not in all_domain_ports:
+        all_domain_ports.append(current_port)
+    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports))
+
+# add root_item to history
+def add_domain_root_item(root_item, domain_type, domain, epoch_date, port):
+    # Create/Update crawler history
+    r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(domain_type, domain, port), epoch_date, root_item)
+
+def create_item_metadata(item_id, domain, url, port, item_father):
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father)
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port))
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url)
+    # add this item_id to his father
+    r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id)
+
+def create_item_id(item_dir, domain):
+    if len(domain) > 215:
+        UUID = domain[-215:]+str(uuid.uuid4())
+    else:
+        UUID = domain+str(uuid.uuid4())
+    return os.path.join(item_dir, UUID)
+
+def save_crawled_item(item_id, item_content):
+    try:
+        gzipencoded = gzip.compress(item_content.encode())
+        gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
+        return gzip64encoded
+    except:
+        print("file error: {}".format(item_id))
+        return False
+
+def save_har(har_dir, item_id, har_content):
+    if not os.path.exists(har_dir):
+        os.makedirs(har_dir)
+    item_id = item_id.split('/')[-1]
+    filename = os.path.join(har_dir, item_id + '.json')
+    with open(filename, 'w') as f:
+        f.write(json.dumps(har_content))
+
 if __name__ == "__main__":
-    all_cookies = load_cookies(get_cookies())
+    all_cookies = load_cookies(get_cookies(), '3thxemke2x7hcibu.onion', crawler_type='onion')
    print(json.dumps(all_cookies))
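A quick sketch of what the new cookie helpers produce for a UI-created cookie on an onion crawl (illustrative only; the cookie name, value and domain are made up, and the `expires` timestamp depends on the current date):

```python
import json
import crawler_splash  # assumes AIL_BIN/lib is on sys.path

# UI-created cookie: 'secure' is forced to False for onion crawls and
# 'expires' is set ~10 days in the future by create_cookie_dict()
cookie = crawler_splash.create_cookie_dict(cookie_name='session',
                                           cookie_value='deadbeef',
                                           domain='example.onion',
                                           crawler_type='onion')
print(json.dumps(cookie, indent=2))
# Expected shape (values are examples):
# {
#   "name": "session",
#   "value": "deadbeef",
#   "domain": ".example.onion",
#   "secure": false,
#   "expires": "2020-03-08T10:00:00Z"
# }
```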
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 14cdaa40..e86b22ce 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -3,11 +3,8 @@
 
 import os
 import sys
-import gzip
-import base64
 import uuid
 import datetime
-import base64
 import redis
 import json
 import time
@@ -29,43 +26,9 @@ sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
-import ConfigLoader
-
-# script_lua_cookie = """
-# function main(splash, args)
-#
-#     -- default config
-#     -- load flash plugin
-#     splash.plugins_enabled = true
-#     splash.html5_media_enabled = true
-#
-#     -- to check
-#     splash.request_body_enabled = true
-#     splash.response_body_enabled = true
-#
-#     -- handle cookies
-#     splash:init_cookies(args.cookies)
-#
-#     assert(splash:go{
-#         args.url,
-#         headers=args.headers,
-#         http_method=args.http_method,
-#         body=args.body
-#     })
-#
-#     splash:wait(10)
-#
-#     -- Response
-#     return {
-#         url = splash:url(),
-#         html = splash:html(),
-#         har = splash:har(),
-#         cookies = splash:get_cookies(),
-#         png = splash:png(render_all=true)
-#     }
-# end
-# """
-
+#import ConfigLoader
+import Screenshot
+import crawler_splash
 
 script_cookie = """
 function main(splash, args)
@@ -75,25 +38,32 @@ function main(splash, args)
     splash.images_enabled = true
     splash.webgl_enabled = true
     splash.media_source_enabled = true
 
+    -- Force enable things
     splash.plugins_enabled = true
     splash.request_body_enabled = true
    splash.response_body_enabled = true
-    -- Would be nice
+    splash.indexeddb_enabled = true
     splash.html5_media_enabled = true
     splash.http2_enabled = true
 
+    -- User defined
     splash.resource_timeout = args.resource_timeout
     splash.timeout = args.timeout
-    -- Allow to pass cookies
 
+    -- Allow to pass cookies
     splash:init_cookies(args.cookies)
 
+    -- Run
     ok, reason = splash:go{args.url}
-    if not ok then
-        return {error = reason}
+    if not ok and not reason:find("http") then
+        return {
+            error = reason,
+            last_url = splash:url()
+        }
     end
+
     splash:wait{args.wait}
 
     -- Page instrumentation
     -- splash.scroll_position = {y=1000}
@@ -103,7 +73,8 @@
         har = splash:har(),
         html = splash:html(),
         png = splash:png{render_all=true},
-        cookies = splash:get_cookies()
+        cookies = splash:get_cookies(),
+        last_url = splash:url()
     }
 end
 """
@@ -138,7 +109,7 @@ class TorSplashCrawler():
         name = 'TorSplashSpider'
 
         def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
-            self.type = type
+            self.domain_type = type
             self.requested_mode = requested_mode
             self.original_item = original_item
             self.root_key = None
@@ -149,13 +120,23 @@ class TorSplashCrawler():
             self.full_date = date['date_day']
             self.date_month = date['date_month']
             self.date_epoch = int(date['epoch'])
-
-            print(requested_mode)
+
             self.png = True
             self.har = True
-            self.cookies = cookies
 
+            config_section = 'Crawler'
+            self.p = Process(config_section)
+            self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
+            self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.r_serv_log_submit = redis.StrictRedis(
+                host=self.p.config.get("Redis_Log_submit", "host"),
+                port=self.p.config.getint("Redis_Log_submit", "port"),
+                db=self.p.config.getint("Redis_Log_submit", "db"),
+                decode_responses=True)
+
+            self.root_key = None
+
         def build_request_arg(self, cookies):
             return {'wait': 10,
                     'resource_timeout': 10,
@@ -171,54 +152,64 @@
                 self.parse,
                 errback=self.errback_catcher,
                 endpoint='execute',
-                #meta={'father': self.original_item, 'root_key': None},
+                meta={'father': self.original_item},
                 args=l_cookies
-                #session_id="foo"
             )
 
+        # # TODO: remove duplicate and anchor
         def parse(self,response):
             #print(response.headers)
             #print(response.status)
             if response.status == 504:
                 # down ?
                 print('504 detected')
+
+            # LUA ERROR # # TODO: print/display errors
+            elif 'error' in response.data:
+                if(response.data['error'] == 'network99'):
+                    print('Connection to proxy refused')
+                else:
+                    print(response.data['error'])
+
             elif response.status != 200:
                 print('other response: {}'.format(response.status))
-                #print(error_log)
-                #detect connection to proxy refused
+                # detect connection to proxy refused
                 error_log = (json.loads(response.body.decode()))
-                if(error_log['info']['text'] == 'Connection to proxy refused'):
-                    print('Connection to proxy refused')
+                print(error_log)
             else:
                 # DEBUG:
-                print('----')
-                print(response.data.keys())
+                # print('----')
+                # print(response.data.keys())
 
-                # LUA Script Errors
-                if 'error' in response.data:
-                    print(response.data['error'])
-                else:
-                    print(response.data['html'])
-                pass
+                item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0])
+                self.save_crawled_item(item_id, response.data['html'])
+                crawler_splash.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
+
+                if self.root_key is None:
+                    self.root_key = item_id
+                    crawler_splash.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
+                    crawler_splash.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
 
                 #print(response.data['cookies'])
                 if 'cookies' in response.data:
                     all_cookies = response.data['cookies']
-                    for cookie in all_cookies:
-                        print('------------------------')
-                        print(cookie['name'])
-                        print(cookie['value'])
-                        print(cookie)
                     # for cookie in all_cookies:
-                    #     print(cookie.name)
+                    #     print('------------------------')
+                    #     print(cookie['name'])
+                    #     print(cookie['value'])
+                    #     print(cookie)
                 else:
                     all_cookies = []
-
-                # if 'png' in response.data:
-
-
-                #if 'har' in response.data:
+
+                # SCREENSHOT
+                if 'png' in response.data:
+                    sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
+                    if sha256_string:
+                        Screenshot.save_item_relationship(sha256_string, item_id)
+                        Screenshot.save_domain_relationship(sha256_string, self.domains[0])
+
+                # HAR
+                if 'har' in response.data:
+                    crawler_splash.save_har(self.har_dir, item_id, response.data['har'])
 
                 le = LinkExtractor(allow_domains=self.domains, unique=True)
                 for link in le.extract_links(response):
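For reviewers, a sketch of the Redis keys that `parse()` now writes through the crawler_splash helpers: every page becomes an item linked to its father, and the first page of a crawl additionally updates the domain history and metadata. Key names are taken from the helpers above; the concrete domain, port, dates and item ids are hypothetical:

```python
# Hypothetical values for one onion crawl
domain, port, date_day, date_month = 'example.onion', 80, '20200308', '202003'

# ARDB_Metadata -- written for every crawled item by create_item_metadata()
#   HSET paste_metadata:<item_id> father    <father_item_id>
#   HSET paste_metadata:<item_id> domain    example.onion:80
#   HSET paste_metadata:<item_id> real_link <last_url returned by Splash>
#   SADD paste_children:<father_item_id> <item_id>

# ARDB_Onion -- written once per crawl (root item) by add_domain_root_item()
# and create_domain_metadata()
#   ZADD crawler_history_onion:example.onion:80 <epoch> <root_item_id>
#   HSET onion_metadata:example.onion first_seen 20200308
#   HSET onion_metadata:example.onion last_check 20200308
#   HSET onion_metadata:example.onion ports      80
#   SADD onion_up:20200308 / full_onion_up / month_onion_up:202003  example.onion
```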
@@ -228,10 +219,8 @@ class TorSplashCrawler():
                         self.parse,
                         errback=self.errback_catcher,
                         endpoint='execute',
-                        #meta={'father': 'inter', 'root_key': response.meta['root_key'], 'session_id': '092384901834adef'},
-                        #meta={'father': 'inter', 'root_key': 'ido', 'session_id': '092384901834adef'},
+                        meta={'father': item_id},
                         args=l_cookies
-                        #session_id="foo"
                     )
 
         def errback_catcher(self, failure):
@@ -240,10 +229,8 @@ class TorSplashCrawler():
 
             if failure.check(ResponseNeverReceived):
                 request = failure.request
-                #url = request.meta['splash']['args']['url']
-                url= 'ido'
-                #father = request.meta['father']
-                father = 'ido'
+                url = request.meta['splash']['args']['url']
+                father = request.meta['father']
 
                 self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                 time.sleep(10)
@@ -257,14 +244,26 @@ class TorSplashCrawler():
                     errback=self.errback_catcher,
                     endpoint='execute',
                     cache_args=['lua_source'],
-                    #meta={'father': father, 'root_key': response.meta['root_key']},
-                    #meta={'father': father, 'root_key': 'ido'},
+                    meta={'father': father},
                    args=self.build_request_arg(response.cookiejar)
-                    #session_id="foo"
                 )
 
             else:
                 print('failure')
                 #print(failure)
                 print(failure.type)
-                #print(failure.request.meta['item'])
+
+        def save_crawled_item(self, item_id, item_content):
+            gzip64encoded = crawler_splash.save_crawled_item(item_id, item_content)
+
+            # Send item to queue
+            # send paste to Global
+            relay_message = "{0} {1}".format(item_id, gzip64encoded)
+            self.p.populate_set_out(relay_message, 'Mixer')
+
+            # increase nb of paste by feeder name
+            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
+
+            # tag crawled paste
+            msg = 'infoleak:submission="crawler";{}'.format(item_id)
+            self.p.populate_set_out(msg, 'Tags')
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 1ba98c34..bda7ecc8 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -37,7 +37,7 @@ if __name__ == '__main__':
         crawler_options = crawler_json['crawler_options']
         date = crawler_json['date']
         requested_mode = crawler_json['requested']
-        cookies = crawler_splash.load_cookies(crawler_splash.get_cookies())
+        cookies = crawler_splash.load_cookies(crawler_splash.get_cookies(), domain, crawler_type='onion')
         print(cookies)
 
         redis_cache.delete('crawler_request:{}'.format(uuid))
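The relay message pushed to the Mixer queue above has the form `<item_id> <base64(gzip(html))>`. A minimal sketch of reversing that encoding (this is not the Mixer/Global module code, just the inverse of `save_crawled_item()` for reference):

```python
import base64
import gzip

def decode_relay_message(relay_message):
    # '<item_id> <base64(gzip(content))>' -> (item_id, original text)
    item_id, gzip64encoded = relay_message.split(' ', 1)
    content = gzip.decompress(base64.standard_b64decode(gzip64encoded)).decode()
    return item_id, content
```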
diff --git a/var/www/templates/crawler/crawler_splash/showDomain.html b/var/www/templates/crawler/crawler_splash/showDomain.html
index ecf53121..d83aee39 100644
--- a/var/www/templates/crawler/crawler_splash/showDomain.html
+++ b/var/www/templates/crawler/crawler_splash/showDomain.html
@@ -445,7 +445,7 @@
-
+
@@ -519,11 +519,11 @@
     var draw_img = false;
     $("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['crawler_history']['random_item']['id']}}");
     $("#screenshot_link").text("{{dict_domain['crawler_history']['random_item']['link']}}");
   {%else%}
-    var screenshot = "";
+      var screenshot = "";
   {%endif%}
 {%endif%}
 {%else%}
-var screenshot = "";
+  var screenshot = "";
 {%endif%}
 
 img.src = base_url + screenshot;
@@ -561,7 +561,9 @@ function img_error() {
 }
 
 function reload_image(new_screenshot, link, item_id) {
-  $("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img");
+  if (screenshot) {
+    $("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img");
+  }
   screenshot = new_screenshot;
 
   img.src=base_url + screenshot;