chg: [Splash Crawler] use cookies to bypass login

This commit is contained in:
Terrtia 2020-03-09 17:02:18 +01:00
parent 4300fd7803
commit 42ea678b7a
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 213 additions and 190 deletions

59
bin/lib/crawler_splash.py Executable file
View file

@ -0,0 +1,59 @@
#!/usr/bin/python3
"""
Splash crawler cookie helper
============================
"""
import json
import os
import re
import redis
import sys
from datetime import datetime, timedelta
from urllib.parse import urlparse
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
# # # #
# Cookies Fields:
# - name
# - value
# - path
# - domain
# # # #
def create_cookie_dict(cookie):
    """Convert one exported cookie record into the dict format used by the crawler.

    Args:
        cookie: mapping with the keys 'Host raw', 'Send for', 'Path raw',
            'Name raw' and 'Content raw'. 'HTTP only raw' is optional and is
            treated as not set when absent.

    Returns:
        dict with the keys 'path', 'name', 'httpOnly', 'secure', 'expires',
        'domain' and 'value'. 'expires' is a UTC timestamp ten days from now,
        formatted as 'YYYY-MM-DDTHH:MM:SSZ'.

    Raises:
        KeyError: if one of the mandatory keys is missing from ``cookie``.
    """
    # Local import: the module header only brings in datetime and timedelta.
    from datetime import timezone

    url = urlparse(cookie['Host raw'])
    secure_requested = cookie['Send for'] == 'Encrypted connections only'
    # The key is optional, so read it once with .get() instead of indexing it
    # later (indexing raised KeyError for records without the field).
    http_only = cookie.get('HTTP only raw') == 'true'
    # NOTE(review): an HTTP-only cookie is never marked secure. This mirrors
    # the existing behaviour; confirm that the coupling is intended.
    is_secure = secure_requested and not http_only
    # Keep only the host part of "host:port".
    domain = url.netloc.split(':', 1)[0]
    # The trailing 'Z' designates UTC, so the timestamp must be taken in UTC
    # rather than in local time.
    expires = datetime.now(timezone.utc) + timedelta(days=10)
    return {
        'path': cookie['Path raw'],
        'name': cookie['Name raw'],
        'httpOnly': http_only,
        'secure': is_secure,
        'expires': expires.strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
        'domain': domain,
        'value': cookie['Content raw'],
    }
def load_cookies(l_cookies):
    """Build the list of crawler cookie dicts from exported cookie records.

    Args:
        l_cookies: iterable of cookie records, each accepted by
            ``create_cookie_dict``.

    Returns:
        list holding one converted dict per input record, in input order.
    """
    return [create_cookie_dict(raw_cookie) for raw_cookie in l_cookies]
def get_cookies():
    """Return the cookie records to load.

    Currently a placeholder: no records are defined, so a new empty list is
    returned on every call.
    """
    return []
if __name__ == "__main__":
    # Script entry point: convert the configured cookie records and print
    # the result as JSON on stdout.
    print(json.dumps(load_cookies(get_cookies())))

View file

@ -23,15 +23,95 @@ from scrapy import Spider
from scrapy.linkextractors import LinkExtractor from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler from scrapy.crawler import CrawlerProcess, Crawler
from scrapy_splash import SplashRequest from scrapy_splash import SplashRequest, SplashJsonResponse
sys.path.append(os.environ['AIL_BIN']) sys.path.append(os.environ['AIL_BIN'])
from Helper import Process from Helper import Process
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import ConfigLoader
# script_lua_cookie = """
# function main(splash, args)
#
# -- default config
# -- load flash plugin
# splash.plugins_enabled = true
# splash.html5_media_enabled = true
#
# -- to check
# splash.request_body_enabled = true
# splash.response_body_enabled = true
#
# -- handle cookies
# splash:init_cookies(args.cookies)
#
# assert(splash:go{
# args.url,
# headers=args.headers,
# http_method=args.http_method,
# body=args.body
# })
#
# splash:wait(10)
#
# -- Response
# return {
# url = splash:url(),
# html = splash:html(),
# har = splash:har(),
# cookies = splash:get_cookies(),
# png = splash:png(render_all=true)
# }
# end
# """
# Lua script run by Splash through the 'execute' endpoint.
# Arguments read from `args`: url, cookies, wait, timeout, resource_timeout.
# Steps: set the Splash feature flags, load the supplied cookies with
# splash:init_cookies, open args.url, then wait args.wait twice.
# Returns a table {har, html, png, cookies}, or {error = reason} when
# splash:go fails.
script_cookie = """
function main(splash, args)
-- Default values
splash.js_enabled = true
splash.private_mode_enabled = true
splash.images_enabled = true
splash.webgl_enabled = true
splash.media_source_enabled = true
-- Force enable things
splash.plugins_enabled = true
splash.request_body_enabled = true
splash.response_body_enabled = true
-- Would be nice
splash.indexeddb_enabled = true
splash.html5_media_enabled = true
splash.http2_enabled = true
-- User defined
splash.resource_timeout = args.resource_timeout
splash.timeout = args.timeout
-- Allow to pass cookies
splash:init_cookies(args.cookies)
-- Run
ok, reason = splash:go{args.url}
if not ok then
return {error = reason}
end
splash:wait{args.wait}
-- Page instrumentation
-- splash.scroll_position = {y=1000}
splash:wait{args.wait}
-- Response
return {
har = splash:har(),
html = splash:html(),
png = splash:png{render_all=true},
cookies = splash:get_cookies()
}
end
"""
class TorSplashCrawler(): class TorSplashCrawler():
def __init__(self, splash_url, crawler_options): def __init__(self, splash_url, crawler_options):
self.process = CrawlerProcess({'LOG_ENABLED': False}) self.process = CrawlerProcess({'LOG_ENABLED': True})
self.crawler = Crawler(self.TorSplashSpider, { self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': crawler_options['user_agent'], 'USER_AGENT': crawler_options['user_agent'],
'SPLASH_URL': splash_url, 'SPLASH_URL': splash_url,
@ -39,23 +119,25 @@ class TorSplashCrawler():
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725, 'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}, },
'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
'HTTPERROR_ALLOW_ALL': True, 'HTTPERROR_ALLOW_ALL': True,
'RETRY_TIMES': 2, 'RETRY_TIMES': 0,
'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'], 'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
'DEPTH_LIMIT': crawler_options['depth_limit'] 'DEPTH_LIMIT': crawler_options['depth_limit'],
'SPLASH_COOKIES_DEBUG': True
}) })
def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, original_item): def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, original_item=original_item) self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
self.process.start() self.process.start()
class TorSplashSpider(Spider): class TorSplashSpider(Spider):
name = 'TorSplashSpider' name = 'TorSplashSpider'
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
self.type = type self.type = type
self.requested_mode = requested_mode self.requested_mode = requested_mode
self.original_item = original_item self.original_item = original_item
@ -68,57 +150,30 @@ class TorSplashCrawler():
self.date_month = date['date_month'] self.date_month = date['date_month']
self.date_epoch = int(date['epoch']) self.date_epoch = int(date['epoch'])
# # TODO: timeout in config print(requested_mode)
self.arg_crawler = { 'html': crawler_options['html'], self.png = True
'wait': 10, self.har = True
'render_all': 1,
'timeout': 30,
'har': crawler_options['har'],
'png': crawler_options['png']}
config_section = 'Crawler' self.cookies = cookies
self.p = Process(config_section)
self.r_cache = redis.StrictRedis( def build_request_arg(self, cookies):
host=self.p.config.get("Redis_Cache", "host"), return {'wait': 10,
port=self.p.config.getint("Redis_Cache", "port"), 'resource_timeout': 10,
db=self.p.config.getint("Redis_Cache", "db"), 'timeout': 30,
decode_responses=True) 'cookies': cookies,
'lua_source': script_cookie
self.r_serv_log_submit = redis.StrictRedis( }
host=self.p.config.get("Redis_Log_submit", "host"),
port=self.p.config.getint("Redis_Log_submit", "port"),
db=self.p.config.getint("Redis_Log_submit", "db"),
decode_responses=True)
self.r_serv_metadata = redis.StrictRedis(
host=self.p.config.get("ARDB_Metadata", "host"),
port=self.p.config.getint("ARDB_Metadata", "port"),
db=self.p.config.getint("ARDB_Metadata", "db"),
decode_responses=True)
self.r_serv_onion = redis.StrictRedis(
host=self.p.config.get("ARDB_Onion", "host"),
port=self.p.config.getint("ARDB_Onion", "port"),
db=self.p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
self.p.config.get("Directories", "crawled"), date_str )
self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
def start_requests(self): def start_requests(self):
l_cookies = self.build_request_arg(self.cookies)
yield SplashRequest( yield SplashRequest(
self.start_urls, self.start_urls,
self.parse, self.parse,
errback=self.errback_catcher, errback=self.errback_catcher,
endpoint='render.json', endpoint='execute',
meta={'father': self.original_item, 'root_key': None}, #meta={'father': self.original_item, 'root_key': None},
args=self.arg_crawler args=l_cookies
#session_id="foo"
) )
def parse(self,response): def parse(self,response):
@ -135,99 +190,49 @@ class TorSplashCrawler():
if(error_log['info']['text'] == 'Connection to proxy refused'): if(error_log['info']['text'] == 'Connection to proxy refused'):
print('Connection to proxy refused') print('Connection to proxy refused')
else: else:
# DEBUG:
print('----')
print(response.data.keys())
#avoid filename too big # LUA Script Errors
if len(self.domains[0]) > 215: if 'error' in response.data:
UUID = self.domains[0][-215:]+str(uuid.uuid4()) print(response.data['error'])
else: else:
UUID = self.domains[0]+str(uuid.uuid4()) print(response.data['html'])
filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID) pass
relative_filename_paste = os.path.join(self.crawler_path, UUID)
filename_har = os.path.join(self.crawled_har, UUID)
# # TODO: modify me #print(response.data['cookies'])
# save new paste on disk if 'cookies' in response.data:
if self.save_crawled_paste(relative_filename_paste, response.data['html']): all_cookies = response.data['cookies']
for cookie in all_cookies:
print('------------------------')
print(cookie['name'])
print(cookie['value'])
print(cookie)
# for cookie in all_cookies:
# print(cookie.name)
else:
all_cookies = []
# add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) # if 'png' in response.data:
self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
# create onion metadata
if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
# create root_key #if 'har' in response.data:
if self.root_key is None:
self.root_key = relative_filename_paste
# Create/Update crawler history
self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
# Update domain port number
all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
if all_domain_ports:
all_domain_ports = all_domain_ports.split(';')
else:
all_domain_ports = []
if self.port not in all_domain_ports:
all_domain_ports.append(self.port)
self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))
#create paste metadata le = LinkExtractor(allow_domains=self.domains, unique=True)
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key) for link in le.extract_links(response):
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father']) l_cookies = self.build_request_arg(all_cookies)
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port)) yield SplashRequest(
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url) link.url,
self.parse,
self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste) errback=self.errback_catcher,
endpoint='execute',
if 'png' in response.data: #meta={'father': 'inter', 'root_key': response.meta['root_key'], 'session_id': '092384901834adef'},
size_screenshot = (len(response.data['png'])*3) /4 #meta={'father': 'inter', 'root_key': 'ido', 'session_id': '092384901834adef'},
args=l_cookies
if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto #session_id="foo"
image_content = base64.standard_b64decode(response.data['png'].encode()) )
hash = sha256(image_content).hexdigest()
img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
dirname = os.path.dirname(filename_img)
if not os.path.exists(dirname):
os.makedirs(dirname)
if not os.path.exists(filename_img):
with open(filename_img, 'wb') as f:
f.write(image_content)
# add item metadata
self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
# add sha256 metadata
self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste)
# domain map
self.r_serv_onion.sadd('domain_screenshot:{}'.format(self.domains[0]), hash)
self.r_serv_onion.sadd('screenshot_domain:{}'.format(hash), self.domains[0])
if 'har' in response.data:
dirname = os.path.dirname(filename_har)
if not os.path.exists(dirname):
os.makedirs(dirname)
with open(filename_har+'.json', 'wb') as f:
f.write(json.dumps(response.data['har']).encode())
# save external links in set
#lext = LinkExtractor(deny_domains=self.domains, unique=True)
#for link in lext.extract_links(response):
# self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
# self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
le = LinkExtractor(allow_domains=self.domains, unique=True)
for link in le.extract_links(response):
yield SplashRequest(
link.url,
self.parse,
errback=self.errback_catcher,
endpoint='render.json',
meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
args=self.arg_crawler
)
def errback_catcher(self, failure): def errback_catcher(self, failure):
# catch all errback failures, # catch all errback failures,
@ -235,8 +240,10 @@ class TorSplashCrawler():
if failure.check(ResponseNeverReceived): if failure.check(ResponseNeverReceived):
request = failure.request request = failure.request
url = request.meta['splash']['args']['url'] #url = request.meta['splash']['args']['url']
father = request.meta['father'] url= 'ido'
#father = request.meta['father']
father = 'ido'
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url) self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10) time.sleep(10)
@ -248,9 +255,12 @@ class TorSplashCrawler():
url, url,
self.parse, self.parse,
errback=self.errback_catcher, errback=self.errback_catcher,
endpoint='render.json', endpoint='execute',
meta={'father': father, 'root_key': response.meta['root_key']}, cache_args=['lua_source'],
args=self.arg_crawler #meta={'father': father, 'root_key': response.meta['root_key']},
#meta={'father': father, 'root_key': 'ido'},
args=self.build_request_arg(response.cookiejar)
#session_id="foo"
) )
else: else:
@ -258,52 +268,3 @@ class TorSplashCrawler():
#print(failure) #print(failure)
print(failure.type) print(failure.type)
#print(failure.request.meta['item']) #print(failure.request.meta['item'])
'''
#if isinstance(failure.value, HttpError):
elif failure.check(HttpError):
# you can get the response
response = failure.value.response
print('HttpError')
self.logger.error('HttpError on %s', response.url)
#elif isinstance(failure.value, DNSLookupError):
elif failure.check(DNSLookupError):
# this is the original request
request = failure.request
print(DNSLookupError)
print('DNSLookupError')
self.logger.error('DNSLookupError on %s', request.url)
#elif isinstance(failure.value, TimeoutError):
elif failure.check(TimeoutError):
request = failure.request
print('TimeoutError')
print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url)
'''
def save_crawled_paste(self, filename, content):
if os.path.isfile(filename):
print('File: {} already exist in submitted pastes'.format(filename))
return False
try:
gzipencoded = gzip.compress(content.encode())
gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
except:
print("file error: {}".format(filename))
return False
# send paste to Global
relay_message = "{0} {1}".format(filename, gzip64encoded)
self.p.populate_set_out(relay_message, 'Mixer')
# increase nb of paste by feeder name
self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
# tag crawled paste
msg = 'infoleak:submission="crawler";{}'.format(filename)
self.p.populate_set_out(msg, 'Tags')
return True

View file

@ -9,6 +9,7 @@ from TorSplashCrawler import TorSplashCrawler
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader import ConfigLoader
import crawler_splash
if __name__ == '__main__': if __name__ == '__main__':
@ -36,8 +37,10 @@ if __name__ == '__main__':
crawler_options = crawler_json['crawler_options'] crawler_options = crawler_json['crawler_options']
date = crawler_json['date'] date = crawler_json['date']
requested_mode = crawler_json['requested'] requested_mode = crawler_json['requested']
cookies = crawler_splash.load_cookies(crawler_splash.get_cookies())
print(cookies)
redis_cache.delete('crawler_request:{}'.format(uuid)) redis_cache.delete('crawler_request:{}'.format(uuid))
crawler = TorSplashCrawler(splash_url, crawler_options) crawler = TorSplashCrawler(splash_url, crawler_options)
crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, original_item) crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)