#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import gzip
import base64
import uuid
import datetime
import redis
import json
import time

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from twisted.web._newclient import ResponseNeverReceived

from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler

from scrapy_splash import SplashRequest

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

class TorSplashCrawler():

    def __init__(self, splash_url, crawler_options):
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
            'DEPTH_LIMIT': crawler_options['depth_limit']
            })

    def crawl(self, type, crawler_options, date, url, domain, port, original_item):
        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date,
                           url=url, domain=domain, port=port, original_item=original_item)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            # arguments forwarded to the Splash render.json endpoint
            self.arg_crawler = {'html': crawler_options['html'],
                                'wait': 10,
                                'render_all': 1,
                                'har': crawler_options['har'],
                                'png': crawler_options['png']}

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str)

            self.crawled_paste_filename = os.path.join(os.environ['AIL_HOME'],
                                                       self.p.config.get("Directories", "pastes"),
                                                       self.p.config.get("Directories", "crawled"), date_str)

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'],
                                                   self.p.config.get("Directories", "crawled_screenshot"), date_str)
"crawled_screenshot"), date_str ) def start_requests(self): yield SplashRequest( self.start_urls, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_item, 'root_key': None}, args=self.arg_crawler ) def parse(self,response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if(error_log['info']['text'] == 'Connection to proxy refused'): print('Connection to proxy refused') else: UUID = self.domains[0]+str(uuid.uuid4()) filename_paste = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') # # TODO: modify me # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) # create root_key if self.root_key is None: self.root_key = relative_filename_paste # Create/Update crawler history self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key) # Update domain port number all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports') if all_domain_ports: all_domain_ports = all_domain_ports.split(';') else: all_domain_ports = [] if self.port not in all_domain_ports: all_domain_ports.append(self.port) self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports)) #create paste metadata self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key) self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father']) self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port)) self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url) self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) if 'png' in response.data: size_screenshot = (len(response.data['png'])*3) /4 if size_screenshot < 5000000: #bytes with open(filename_screenshot, 'wb') as f: f.write(base64.standard_b64decode(response.data['png'].encode())) if 'har' in response.data: with open(filename_screenshot+'har.txt', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # 
                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
                            args=self.arg_crawler
                        )

        def errback_catcher(self, failure):
            # catch all errback failures
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                father = request.meta['father']

                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                # no Response object exists in an errback, so recover root_key
                # from the failed request's meta instead
                yield SplashRequest(
                    url,
                    self.parse,
                    errback=self.errback_catcher,
                    endpoint='render.json',
                    meta={'father': father, 'root_key': request.meta.get('root_key')},
                    args=self.arg_crawler
                )
            else:
                print('failure')
                #print(failure)
                print(failure.type)
                #print(failure.request.meta['item'])

            '''
            #if isinstance(failure.value, HttpError):
            elif failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
            '''

        def save_crawled_paste(self, filename, content):
            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except Exception:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
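
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of driving TorSplashCrawler, assuming a local Splash
# instance and the AIL environment variables (AIL_BIN, AIL_HOME) are set.
# The option values, Splash URL, onion domain, dates, and original_item
# below are hypothetical placeholders, not values shipped with AIL.
if __name__ == '__main__':
    example_options = {
        'user_agent': 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0',
        'closespider_pagecount': 50,
        'depth_limit': 1,
        'html': 1,
        'har': 1,
        'png': 1,
    }
    example_date = {'date_day': '20190101', 'date_month': '201901', 'epoch': int(time.time())}
    crawler = TorSplashCrawler('http://127.0.0.1:8050', example_options)
    crawler.crawl('onion', example_options, example_date,
                  'http://example.onion', 'example.onion', 80,
                  'crawled/2019/01/01/example.onion')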