ail-framework/bin/torcrawler/TorSplashCrawler.py

272 lines
12 KiB
Python
Raw Normal View History

2018-08-09 15:42:21 +00:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import gzip
import base64
import uuid
import datetime
import base64
import redis
import json
import time
2018-08-09 15:42:21 +00:00
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from twisted.web._newclient import ResponseNeverReceived
2018-08-09 15:42:21 +00:00
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy_splash import SplashRequest
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
class TorSplashCrawler():
2019-02-21 08:54:43 +00:00
def __init__(self, splash_url, crawler_options):
    """Create the Scrapy process and the crawler bound to TorSplashSpider.

    splash_url: value stored under the ``SPLASH_URL`` setting.
    crawler_options: mapping read for 'user_agent',
        'closespider_pagecount' and 'depth_limit'.
    """
    self.process = CrawlerProcess({'LOG_ENABLED': False})

    # Middleware priorities are kept exactly as configured originally.
    downloader_middlewares = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
    spider_middlewares = {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    }

    spider_settings = {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': downloader_middlewares,
        'SPIDER_MIDDLEWARES': spider_middlewares,
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        # Non-2xx responses are delivered to the spider callback, which
        # inspects response.status itself.
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
    }
    self.crawler = Crawler(self.TorSplashSpider, spider_settings)
2019-02-25 15:38:50 +00:00
def crawl(self, type, crawler_options, date, url, domain, original_item):
    """Schedule one crawl on the prepared crawler, then start the process.

    All arguments are forwarded by keyword, under their own names, to
    ``self.process.crawl`` together with ``self.crawler``; afterwards
    ``self.process.start()`` is called.
    """
    spider_kwargs = {
        'type': type,
        'crawler_options': crawler_options,
        'date': date,
        'url': url,
        'domain': domain,
        'original_item': original_item,
    }
    self.process.crawl(self.crawler, **spider_kwargs)
    self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
2019-02-25 15:38:50 +00:00
def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
    """Store the crawl parameters and open the Redis/ARDB connections.

    type: label interpolated into the Redis key names written by the spider.
    crawler_options: mapping read for 'html', 'har' and 'png'; these become
        Splash request arguments.
    date: mapping read for 'date_day' (sliced as YYYYMMDD), 'date_month'
        and 'epoch'.
    url: stored unchanged as ``self.start_urls``.
    domain: stored as the single entry of ``self.domains``.
    original_item: stored as ``self.original_item``.
    *args, **kwargs: accepted but not used here.
    """
    self.type = type
    self.original_item = original_item
    self.root_key = None
    self.start_urls = url
    self.domains = [domain]

    day = date['date_day']
    # YYYYMMDD -> YYYY/MM/DD, used as a sub-directory in the paths below.
    date_str = '{}/{}/{}'.format(day[0:4], day[4:6], day[6:8])
    self.full_date = day
    self.date_month = date['date_month']
    self.date_epoch = int(date['epoch'])

    # Arguments sent with every Splash request issued by this spider.
    self.arg_crawler = {
        'html': crawler_options['html'],
        'wait': 10,
        'render_all': 1,
        'har': crawler_options['har'],
        'png': crawler_options['png'],
    }

    config_section = 'Crawler'
    self.p = Process(config_section)
    config = self.p.config

    def open_connection(section):
        # One connection per config section; host/port/db come from config.
        return redis.StrictRedis(
            host=config.get(section, "host"),
            port=config.getint(section, "port"),
            db=config.getint(section, "db"),
            decode_responses=True)

    self.r_cache = open_connection("Redis_Cache")
    self.r_serv_log_submit = open_connection("Redis_Log_submit")
    self.r_serv_metadata = open_connection("ARDB_Metadata")
    self.r_serv_onion = open_connection("ARDB_Onion")

    ail_home = os.environ['AIL_HOME']

    # Path of crawled items relative to the pastes directory.
    self.crawler_path = os.path.join(config.get("Directories", "crawled"), date_str)

    # Absolute directory for crawled items (attribute name kept as-is,
    # other methods read it under this spelling).
    self.crawled_paste_filemame = os.path.join(
        ail_home,
        config.get("Directories", "pastes"),
        config.get("Directories", "crawled"),
        date_str)

    # Absolute directory for screenshots.
    self.crawled_screenshot = os.path.join(
        ail_home,
        config.get("Directories", "crawled_screenshot"),
        date_str)
2018-08-09 15:42:21 +00:00
def start_requests(self):
    """Yield the single initial Splash request for ``self.start_urls``.

    The request targets the ``render.json`` endpoint, is handled by
    ``self.parse`` (failures by ``self.errback_catcher``) and carries
    ``self.original_item`` as its 'father' in the request meta.
    """
    request_meta = {'father': self.original_item, 'root_key': None}
    first_request = SplashRequest(
        self.start_urls,
        self.parse,
        errback=self.errback_catcher,
        endpoint='render.json',
        meta=request_meta,
        args=self.arg_crawler,
    )
    yield first_request
def parse(self, response):
    """Handle one Splash ``render.json`` response and follow in-domain links.

    Non-200 responses are only reported on stdout and yield nothing. For a
    200 response the rendered HTML is submitted via ``save_crawled_paste``;
    when that succeeds the domain is recorded as up, paste metadata is
    written, the optional screenshot and HAR are saved to disk, and one new
    ``SplashRequest`` is yielded per link extracted for ``self.domains``.

    response: Splash JSON response; ``response.data`` is read for 'html',
        'png' and 'har', ``response.meta`` for 'father' and 'root_key'.

    NOTE(review): the statement nesting below was reconstructed from a copy
    of this file whose indentation had been lost -- confirm it against the
    repository version.
    """
    if response.status == 504:
        # down ?
        print('504 detected')
        return

    if response.status != 200:
        print('other response: {}'.format(response.status))
        # Detect "connection to proxy refused". The error body is not
        # guaranteed to be JSON, nor to carry an info/text entry, so parse
        # it defensively instead of letting the callback raise.
        try:
            error_log = json.loads(response.body.decode())
        except ValueError:
            # Covers both undecodable bytes and invalid JSON.
            error_log = None
        info = error_log.get('info') if isinstance(error_log, dict) else None
        if isinstance(info, dict) and info.get('text') == 'Connection to proxy refused':
            print('Connection to proxy refused')
        return

    domain = self.domains[0]
    UUID = domain + str(uuid.uuid4())
    filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
    relative_filename_paste = os.path.join(self.crawler_path, UUID)
    filename_screenshot = os.path.join(self.crawled_screenshot, UUID + '.png')

    # TODO: modify me
    # save new paste on disk
    if not self.save_crawled_paste(filename_paste, response.data['html']):
        return

    # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), domain)
    self.r_serv_onion.sadd('full_{}_up'.format(self.type), domain)
    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), domain)

    # create onion metadata
    metadata_key = '{}_metadata:{}'.format(self.type, domain)
    if not self.r_serv_onion.exists(metadata_key):
        self.r_serv_onion.hset(metadata_key, 'first_seen', self.full_date)

    # create root_key: the first page saved in this crawl becomes the root
    if self.root_key is None:
        self.root_key = relative_filename_paste
        # Create/Update crawler history
        self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, domain), self.date_epoch, self.root_key)

    # create paste metadata
    paste_key = 'paste_metadata:' + filename_paste
    father = response.meta['father']
    self.r_serv_metadata.hset(paste_key, 'super_father', self.root_key)
    self.r_serv_metadata.hset(paste_key, 'father', father)
    self.r_serv_metadata.hset(paste_key, 'domain', domain)
    self.r_serv_metadata.hset(paste_key, 'real_link', response.url)

    self.r_serv_metadata.sadd('paste_children:' + father, filename_paste)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(filename_screenshot), exist_ok=True)

    if 'png' in response.data:
        # Base64 text is 4 characters per 3 bytes: estimate the decoded size.
        size_screenshot = (len(response.data['png']) * 3) / 4

        if size_screenshot < 5000000:  # bytes
            with open(filename_screenshot, 'wb') as f:
                f.write(base64.standard_b64decode(response.data['png'].encode()))

    if 'har' in response.data:
        with open(filename_screenshot + 'har.txt', 'wb') as f:
            f.write(json.dumps(response.data['har']).encode())

    # save external links in set (currently disabled)
    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
    #for link in lext.extract_links(response):
    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

    le = LinkExtractor(allow_domains=self.domains, unique=True)
    for link in le.extract_links(response):
        yield SplashRequest(
            link.url,
            self.parse,
            errback=self.errback_catcher,
            endpoint='render.json',
            meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
            args=self.arg_crawler
        )
def errback_catcher(self, failure):
    """Log a failed request and retry it when Splash never answered.

    failure: the failure object passed to the errback; ``failure.check``,
        ``failure.request`` and ``failure.type`` are used.

    For ``ResponseNeverReceived`` the original URL is read from the Splash
    arguments stored in the request meta, the method sleeps 10 seconds and
    then yields a fresh ``SplashRequest`` carrying the same 'father' and
    'root_key' meta. Any other failure is only printed; HttpError,
    DNSLookupError and TimeoutError get no dedicated handling.
    """
    # catch all errback failures
    self.logger.error(repr(failure))

    if not failure.check(ResponseNeverReceived):
        print('failure')
        #print(failure)
        print(failure.type)
        return

    request = failure.request
    url = request.meta['splash']['args']['url']
    father = request.meta['father']
    # An errback has no response object: the meta must be taken from the
    # failed request (reading it from `response` raised NameError).
    root_key = request.meta['root_key']

    self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
    # NOTE(review): time.sleep blocks the calling thread for the whole
    # delay -- confirm this is acceptable where errbacks are executed.
    time.sleep(10)
    yield SplashRequest(
        url,
        self.parse,
        errback=self.errback_catcher,
        endpoint='render.json',
        meta={'father': father, 'root_key': root_key},
        args=self.arg_crawler
    )
2018-08-09 15:42:21 +00:00
def save_crawled_paste(self, filename, content):
    """Submit crawled page content to the 'Mixer' and 'Tags' queues.

    filename: path identifying the paste; if a file already exists there
        nothing is sent.
    content: text of the page; it is UTF-8 encoded, gzip-compressed and
        base64-encoded before being sent.

    Returns True when the messages were sent, False when the file already
    exists or the content could not be encoded.
    """
    if os.path.isfile(filename):
        print('File: {} already exist in submitted pastes'.format(filename))
        return False

    try:
        gzipencoded = gzip.compress(content.encode())
        gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
    except Exception:
        # Best effort: unusable content (e.g. not a str) is reported and
        # skipped. Unlike a bare ``except:``, this does not swallow
        # KeyboardInterrupt or SystemExit.
        print("file error: {}".format(filename))
        return False

    # send paste to Global
    relay_message = "{0} {1}".format(filename, gzip64encoded)
    self.p.populate_set_out(relay_message, 'Mixer')

    # increase nb of paste by feeder name
    self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

    # tag crawled paste
    msg = 'infoleak:submission="crawler";{}'.format(filename)
    self.p.populate_set_out(msg, 'Tags')
    return True