diff --git a/bin/lib/crawler_splash.py b/bin/lib/crawler_splash.py new file mode 100755 index 00000000..6f519738 --- /dev/null +++ b/bin/lib/crawler_splash.py @@ -0,0 +1,59 @@ +#!/usr/bin/python3 + +""" +API Helper +=================== + + +""" + +import json +import os +import re +import redis +import sys + +from datetime import datetime, timedelta +from urllib.parse import urlparse + +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) + +# # # # +# Cookies Fields: +# - name +# - value +# - path +# - domain +# # # # +def create_cookie_dict(cookie): + url = urlparse(cookie['Host raw']) + #scheme = url.scheme + is_secure = cookie['Send for'] == 'Encrypted connections only' + if 'HTTP only raw' in cookie: + if cookie['HTTP only raw'] == "true": + is_secure = False + domain = url.netloc.split(':', 1)[0] + dict_cookie = {'path': cookie['Path raw'], + 'name': cookie['Name raw'], + 'httpOnly': cookie['HTTP only raw'] == 'true', + 'secure': is_secure, + 'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z', + 'domain': domain, + 'value': cookie['Content raw'] + } + return dict_cookie + +def load_cookies(l_cookies): + all_cookies = [] + + for cookie_dict in l_cookies: + all_cookies.append(create_cookie_dict(cookie_dict)) + return all_cookies + +def get_cookies(): + l_cookies = [] + return l_cookies + +if __name__ == "__main__": + all_cookies = load_cookies(get_cookies()) + print(json.dumps(all_cookies)) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index e505ab63..14cdaa40 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -23,15 +23,95 @@ from scrapy import Spider from scrapy.linkextractors import LinkExtractor from scrapy.crawler import CrawlerProcess, Crawler -from scrapy_splash import SplashRequest +from scrapy_splash import SplashRequest, SplashJsonResponse sys.path.append(os.environ['AIL_BIN']) from Helper import Process +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) +import ConfigLoader + +# script_lua_cookie = """ +# function main(splash, args) +# +# -- default config +# -- load flash plugin +# splash.plugins_enabled = true +# splash.html5_media_enabled = true +# +# -- to check +# splash.request_body_enabled = true +# splash.response_body_enabled = true +# +# -- handle cookies +# splash:init_cookies(args.cookies) +# +# assert(splash:go{ +# args.url, +# headers=args.headers, +# http_method=args.http_method, +# body=args.body +# }) +# +# splash:wait(10) +# +# -- Response +# return { +# url = splash:url(), +# html = splash:html(), +# har = splash:har(), +# cookies = splash:get_cookies(), +# png = splash:png(render_all=true) +# } +# end +# """ + + +script_cookie = """ +function main(splash, args) + -- Default values + splash.js_enabled = true + splash.private_mode_enabled = true + splash.images_enabled = true + splash.webgl_enabled = true + splash.media_source_enabled = true + -- Force enable things + splash.plugins_enabled = true + splash.request_body_enabled = true + splash.response_body_enabled = true + -- Would be nice + splash.indexeddb_enabled = true + splash.html5_media_enabled = true + splash.http2_enabled = true + -- User defined + splash.resource_timeout = args.resource_timeout + splash.timeout = args.timeout + + -- Allow to pass cookies + splash:init_cookies(args.cookies) + -- Run + ok, reason = splash:go{args.url} + if not ok then + return {error = reason} + end + splash:wait{args.wait} + -- Page instrumentation + -- splash.scroll_position = 
{y=1000} + splash:wait{args.wait} + -- Response + return { + har = splash:har(), + html = splash:html(), + png = splash:png{render_all=true}, + cookies = splash:get_cookies() + } +end +""" + class TorSplashCrawler(): def __init__(self, splash_url, crawler_options): - self.process = CrawlerProcess({'LOG_ENABLED': False}) + self.process = CrawlerProcess({'LOG_ENABLED': True}) self.crawler = Crawler(self.TorSplashSpider, { 'USER_AGENT': crawler_options['user_agent'], 'SPLASH_URL': splash_url, @@ -39,23 +119,25 @@ class TorSplashCrawler(): 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, 'scrapy_splash.SplashMiddleware': 725, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, + 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, }, 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'HTTPERROR_ALLOW_ALL': True, - 'RETRY_TIMES': 2, + 'RETRY_TIMES': 0, 'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'], - 'DEPTH_LIMIT': crawler_options['depth_limit'] + 'DEPTH_LIMIT': crawler_options['depth_limit'], + 'SPLASH_COOKIES_DEBUG': True }) - def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, original_item): - self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, original_item=original_item) + def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item): + self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item) self.process.start() class TorSplashSpider(Spider): name = 'TorSplashSpider' - def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): + def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs): self.type = type self.requested_mode = requested_mode self.original_item = original_item @@ -68,57 +150,30 @@ class TorSplashCrawler(): self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) - # # TODO: timeout in config - self.arg_crawler = { 'html': crawler_options['html'], - 'wait': 10, - 'render_all': 1, - 'timeout': 30, - 'har': crawler_options['har'], - 'png': crawler_options['png']} + print(requested_mode) + self.png = True + self.har = True - config_section = 'Crawler' - self.p = Process(config_section) + self.cookies = cookies - self.r_cache = redis.StrictRedis( - host=self.p.config.get("Redis_Cache", "host"), - port=self.p.config.getint("Redis_Cache", "port"), - db=self.p.config.getint("Redis_Cache", "db"), - decode_responses=True) - - self.r_serv_log_submit = redis.StrictRedis( - host=self.p.config.get("Redis_Log_submit", "host"), - port=self.p.config.getint("Redis_Log_submit", "port"), - db=self.p.config.getint("Redis_Log_submit", "db"), - decode_responses=True) - - self.r_serv_metadata = redis.StrictRedis( - host=self.p.config.get("ARDB_Metadata", "host"), - port=self.p.config.getint("ARDB_Metadata", "port"), - db=self.p.config.getint("ARDB_Metadata", "db"), - decode_responses=True) - - self.r_serv_onion = redis.StrictRedis( - host=self.p.config.get("ARDB_Onion", "host"), - port=self.p.config.getint("ARDB_Onion", "port"), - db=self.p.config.getint("ARDB_Onion", "db"), - 
decode_responses=True) - - self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) - - self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), - self.p.config.get("Directories", "crawled"), date_str ) - - self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) - self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") ) + def build_request_arg(self, cookies): + return {'wait': 10, + 'resource_timeout': 10, + 'timeout': 30, + 'cookies': cookies, + 'lua_source': script_cookie + } def start_requests(self): + l_cookies = self.build_request_arg(self.cookies) yield SplashRequest( self.start_urls, self.parse, errback=self.errback_catcher, - endpoint='render.json', - meta={'father': self.original_item, 'root_key': None}, - args=self.arg_crawler + endpoint='execute', + #meta={'father': self.original_item, 'root_key': None}, + args=l_cookies + #session_id="foo" ) def parse(self,response): @@ -135,99 +190,49 @@ class TorSplashCrawler(): if(error_log['info']['text'] == 'Connection to proxy refused'): print('Connection to proxy refused') else: + # DEBUG: + print('----') + print(response.data.keys()) - #avoid filename too big - if len(self.domains[0]) > 215: - UUID = self.domains[0][-215:]+str(uuid.uuid4()) + # LUA Script Errors + if 'error' in response.data: + print(response.data['error']) else: - UUID = self.domains[0]+str(uuid.uuid4()) - filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID) - relative_filename_paste = os.path.join(self.crawler_path, UUID) - filename_har = os.path.join(self.crawled_har, UUID) + print(response.data['html']) + pass - # # TODO: modify me - # save new paste on disk - if self.save_crawled_paste(relative_filename_paste, response.data['html']): + #print(response.data['cookies']) + if 'cookies' in response.data: + all_cookies = response.data['cookies'] + for cookie in all_cookies: + print('------------------------') + print(cookie['name']) + print(cookie['value']) + print(cookie) + # for cookie in all_cookies: + # print(cookie.name) + else: + all_cookies = [] - # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
- #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) - self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) - self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) - self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) + # if 'png' in response.data: - # create onion metadata - if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) - # create root_key - if self.root_key is None: - self.root_key = relative_filename_paste - # Create/Update crawler history - self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key) - # Update domain port number - all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports') - if all_domain_ports: - all_domain_ports = all_domain_ports.split(';') - else: - all_domain_ports = [] - if self.port not in all_domain_ports: - all_domain_ports.append(self.port) - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports)) + #if 'har' in response.data: - #create paste metadata - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key) - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father']) - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port)) - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url) - - self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste) - - if 'png' in response.data: - size_screenshot = (len(response.data['png'])*3) /4 - - if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto - image_content = base64.standard_b64decode(response.data['png'].encode()) - hash = sha256(image_content).hexdigest() - img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) - filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png') - dirname = os.path.dirname(filename_img) - if not os.path.exists(dirname): - os.makedirs(dirname) - if not os.path.exists(filename_img): - with open(filename_img, 'wb') as f: - f.write(image_content) - # add item metadata - self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash) - # add sha256 metadata - self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste) - # domain map - self.r_serv_onion.sadd('domain_screenshot:{}'.format(self.domains[0]), hash) - self.r_serv_onion.sadd('screenshot_domain:{}'.format(hash), self.domains[0]) - - if 'har' in response.data: - dirname = os.path.dirname(filename_har) - if not os.path.exists(dirname): - os.makedirs(dirname) - with open(filename_har+'.json', 'wb') as f: - f.write(json.dumps(response.data['har']).encode()) - - # save external links in set - #lext = LinkExtractor(deny_domains=self.domains, unique=True) - #for link in lext.extract_links(response): - # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) - # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, 
filename_paste), link.url) - - le = LinkExtractor(allow_domains=self.domains, unique=True) - for link in le.extract_links(response): - yield SplashRequest( - link.url, - self.parse, - errback=self.errback_catcher, - endpoint='render.json', - meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']}, - args=self.arg_crawler - ) + le = LinkExtractor(allow_domains=self.domains, unique=True) + for link in le.extract_links(response): + l_cookies = self.build_request_arg(all_cookies) + yield SplashRequest( + link.url, + self.parse, + errback=self.errback_catcher, + endpoint='execute', + #meta={'father': 'inter', 'root_key': response.meta['root_key'], 'session_id': '092384901834adef'}, + #meta={'father': 'inter', 'root_key': 'ido', 'session_id': '092384901834adef'}, + args=l_cookies + #session_id="foo" + ) def errback_catcher(self, failure): # catch all errback failures, @@ -235,8 +240,10 @@ class TorSplashCrawler(): if failure.check(ResponseNeverReceived): request = failure.request - url = request.meta['splash']['args']['url'] - father = request.meta['father'] + #url = request.meta['splash']['args']['url'] + url= 'ido' + #father = request.meta['father'] + father = 'ido' self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url) time.sleep(10) @@ -248,9 +255,12 @@ class TorSplashCrawler(): url, self.parse, errback=self.errback_catcher, - endpoint='render.json', - meta={'father': father, 'root_key': response.meta['root_key']}, - args=self.arg_crawler + endpoint='execute', + cache_args=['lua_source'], + #meta={'father': father, 'root_key': response.meta['root_key']}, + #meta={'father': father, 'root_key': 'ido'}, + args=self.build_request_arg(response.cookiejar) + #session_id="foo" ) else: @@ -258,52 +268,3 @@ class TorSplashCrawler(): #print(failure) print(failure.type) #print(failure.request.meta['item']) - - ''' - #if isinstance(failure.value, HttpError): - elif failure.check(HttpError): - # you can get the response - response = failure.value.response - print('HttpError') - self.logger.error('HttpError on %s', response.url) - - #elif isinstance(failure.value, DNSLookupError): - elif failure.check(DNSLookupError): - # this is the original request - request = failure.request - print(DNSLookupError) - print('DNSLookupError') - self.logger.error('DNSLookupError on %s', request.url) - - #elif isinstance(failure.value, TimeoutError): - elif failure.check(TimeoutError): - request = failure.request - print('TimeoutError') - print(TimeoutError) - self.logger.error('TimeoutError on %s', request.url) - ''' - - def save_crawled_paste(self, filename, content): - - if os.path.isfile(filename): - print('File: {} already exist in submitted pastes'.format(filename)) - return False - - try: - gzipencoded = gzip.compress(content.encode()) - gzip64encoded = base64.standard_b64encode(gzipencoded).decode() - except: - print("file error: {}".format(filename)) - return False - - # send paste to Global - relay_message = "{0} {1}".format(filename, gzip64encoded) - self.p.populate_set_out(relay_message, 'Mixer') - - # increase nb of paste by feeder name - self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) - - # tag crawled paste - msg = 'infoleak:submission="crawler";{}'.format(filename) - self.p.populate_set_out(msg, 'Tags') - return True diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py index ccb645a0..1ba98c34 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -9,6 +9,7 @@ from TorSplashCrawler 
import TorSplashCrawler sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) import ConfigLoader +import crawler_splash if __name__ == '__main__': @@ -36,8 +37,10 @@ if __name__ == '__main__': crawler_options = crawler_json['crawler_options'] date = crawler_json['date'] requested_mode = crawler_json['requested'] + cookies = crawler_splash.load_cookies(crawler_splash.get_cookies()) + print(cookies) redis_cache.delete('crawler_request:{}'.format(uuid)) crawler = TorSplashCrawler(splash_url, crawler_options) - crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, original_item) + crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
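
For reference (not part of the patch): a minimal, standalone sketch of how the new cookie plumbing fits together. An exported cookie record is mapped into the dict shape that splash:init_cookies() accepts, then bundled into the same argument block that TorSplashSpider.build_request_arg() hands to the Splash execute endpoint. The field names ('Host raw', 'Name raw', ...) come from bin/lib/crawler_splash.py; the sample cookie values and the example.onion host below are hypothetical.

    #!/usr/bin/python3
    # Sketch only: mirrors the conversion in bin/lib/crawler_splash.py and the
    # argument block built by TorSplashSpider.build_request_arg(). Sample data
    # is hypothetical.
    import json
    from datetime import datetime, timedelta
    from urllib.parse import urlparse

    def create_cookie_dict(cookie):
        # Map an exported cookie record onto the structure splash:init_cookies() expects.
        url = urlparse(cookie['Host raw'])
        is_secure = cookie['Send for'] == 'Encrypted connections only'
        if cookie.get('HTTP only raw') == 'true':
            # Same behaviour as the patch: an HttpOnly cookie is not marked secure.
            is_secure = False
        return {
            'path': cookie['Path raw'],
            'name': cookie['Name raw'],
            'httpOnly': cookie.get('HTTP only raw') == 'true',
            'secure': is_secure,
            # The patch hard-codes a 10-day expiry from "now".
            'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
            'domain': url.netloc.split(':', 1)[0],
            'value': cookie['Content raw'],
        }

    if __name__ == '__main__':
        exported_cookie = {                      # hypothetical exported record
            'Host raw': 'https://example.onion/',
            'Path raw': '/',
            'Name raw': 'session_id',
            'Content raw': 'deadbeef',
            'Send for': 'Encrypted connections only',
            'HTTP only raw': 'true',
        }
        splash_cookies = [create_cookie_dict(exported_cookie)]
        # Same shape as build_request_arg(); the real spider also adds
        # 'lua_source': script_cookie, omitted here.
        splash_args = {'wait': 10, 'resource_timeout': 10, 'timeout': 30,
                       'cookies': splash_cookies}
        print(json.dumps(splash_args, indent=2))

The switch from the render.json endpoint to execute is what makes this flow possible: the Lua script can call splash:init_cookies(args.cookies) before navigation and return splash:get_cookies() alongside the HTML, HAR and PNG, so cookies observed on one response can be replayed on the follow-up SplashRequests built in parse().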