chg: [crawler] bypass login: use cookies provided by the user and accept cookies from the server + refactor

Terrtia 2020-03-20 16:15:25 +01:00
parent 42ea678b7a
commit 6cfd3fe36d
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
5 changed files with 217 additions and 110 deletions

View file

@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*

+import base64
 import os
 import sys
 import redis

+from hashlib import sha256
 from io import BytesIO

 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
@@ -164,6 +166,25 @@ def get_screenshot_file_content(sha256_string):
         file_content = BytesIO(f.read())
     return file_content

+# if force save, ignore max_size
+def save_crawled_screeshot(b64_screenshot, max_size, f_save=False):
+    screenshot_size = (len(b64_screenshot)*3) /4
+    if screenshot_size < max_size or f_save:
+        image_content = base64.standard_b64decode(b64_screenshot.encode())
+        sha256_string = sha256(image_content).hexdigest()
+        filepath = get_screenshot_filepath(sha256_string)
+        if os.path.isfile(filepath):
+            #print('File already exist')
+            return False
+        # create dir
+        dirname = os.path.dirname(filepath)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+        with open(filepath, 'wb') as f:
+            f.write(image_content)
+        return sha256_string
+    return False
+
 def save_screenshot_file(sha256_string, io_content):
     filepath = get_screenshot_filepath(sha256_string)
     if os.path.isfile(filepath):
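Note on the new helper: save_crawled_screeshot() estimates the decoded image size from the base64 length ((len*3)/4), deduplicates by SHA-256 and writes the PNG under get_screenshot_filepath(). A minimal usage sketch, assuming the AIL environment and Redis back ends are configured; the input file name below is made up:

    import base64
    import Screenshot   # the module patched above; assumes AIL_BIN is set and on sys.path

    # Hypothetical input: a local PNG standing in for the base64 screenshot Splash returns.
    with open('example.png', 'rb') as f:
        b64_png = base64.standard_b64encode(f.read()).decode()

    # 5000000 mirrors the size limit the spider passes; f_save=True would bypass it.
    sha256_string = Screenshot.save_crawled_screeshot(b64_png, 5000000)
    if sha256_string:
        print('stored as', sha256_string)
    else:
        print('skipped: larger than max_size or already on disk')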

View file

@@ -6,54 +6,139 @@ API Helper
 """

+import base64
+import gzip
 import json
 import os
 import re
 import redis
 import sys
+import uuid

 from datetime import datetime, timedelta
 from urllib.parse import urlparse

-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
+r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
+config_loader = None

 # # # # # # # #
 # Cookies Fields:
 #   - name
 #   - value
-#   - path
-#   - domain
+#   - path (optional)
+#   - domain (optional)
+#   - secure (optional)
+#   - httpOnly (optional)
 # # # # # # # #
-def create_cookie_dict(cookie):
-    url = urlparse(cookie['Host raw'])
-    #scheme = url.scheme
-    is_secure = cookie['Send for'] == 'Encrypted connections only'
-    if 'HTTP only raw' in cookie:
-        if cookie['HTTP only raw'] == "true":
-            is_secure = False
+def create_cookie_dict(browser_cookie=[], cookie_name=None, cookie_value=None, domain=None, crawler_type='regular'):
+    # UI created
+    if cookie_name and cookie_value and domain:
+        dict_cookie = create_cookie_dict_from_input(cookie_name, cookie_value, domain)
+    # Cookies imported from the browser
+    else:
+        dict_cookie = create_cookie_dict_from_browser(browser_cookie)
+
+    # tor browser: disable secure cookie
+    if crawler_type=='onion':
+        dict_cookie['secure'] = False
+
+    dict_cookie['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
+    return dict_cookie
+
+def create_cookie_dict_from_input(cookie_name, cookie_value, cookie_domain):
+    # WebKit use domain for cookie validation
+    return {'name': cookie_name, 'value': cookie_value, 'domain': '.{}'.format(cookie_domain)}
+
+# # TODO: handle prefix cookies
+# # TODO: fill empty fields
+def create_cookie_dict_from_browser(browser_cookie):
+    url = urlparse(browser_cookie['Host raw'])
     domain = url.netloc.split(':', 1)[0]
-    dict_cookie = {'path': cookie['Path raw'],
-                   'name': cookie['Name raw'],
-                   'httpOnly': cookie['HTTP only raw'] == 'true',
-                   'secure': is_secure,
-                   'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
-                   'domain': domain,
-                   'value': cookie['Content raw']
+    dict_cookie = {'path': browser_cookie['Path raw'],
+                   'name': browser_cookie['Name raw'],
+                   'httpOnly': browser_cookie['HTTP only raw'] == 'true',
+                   'secure': browser_cookie['Send for'] == 'Encrypted connections only',
+                   'domain': domain,
+                   'value': browser_cookie['Content raw']
                    }
     return dict_cookie

-def load_cookies(l_cookies):
+def load_cookies(l_cookies, domain=None, crawler_type='regular'):
     all_cookies = []
     for cookie_dict in l_cookies:
-        all_cookies.append(create_cookie_dict(cookie_dict))
+        all_cookies.append(create_cookie_dict(browser_cookie=cookie_dict, crawler_type=crawler_type))
     return all_cookies

 def get_cookies():
     l_cookies = []
     return l_cookies

+# domain up
+def create_domain_metadata(domain_type, domain, current_port, date, date_month):
+    # Add to global set
+    r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain)
+    r_serv_onion.sadd('full_{}_up'.format(domain_type), domain)
+    r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain)
+
+    # create onion metadata
+    if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)):
+        r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date)
+    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date)
+
+    # Update domain port number
+    all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports')
+    if all_domain_ports:
+        all_domain_ports = all_domain_ports.split(';')
+    else:
+        all_domain_ports = []
+    if current_port not in all_domain_ports:
+        all_domain_ports.append(current_port)
+    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports))
+
+# add root_item to history
+def add_domain_root_item(root_item, domain_type, domain, epoch_date, port):
+    # Create/Update crawler history
+    r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(domain_type, domain, port), epoch_date, root_item)
+
+def create_item_metadata(item_id, domain, url, port, item_father):
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father)
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port))
+    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url)
+    # add this item_id to his father
+    r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id)
+
+def create_item_id(item_dir, domain):
+    if len(domain) > 215:
+        UUID = domain[-215:]+str(uuid.uuid4())
+    else:
+        UUID = domain+str(uuid.uuid4())
+    return os.path.join(item_dir, UUID)
+
+def save_crawled_item(item_id, item_content):
+    try:
+        gzipencoded = gzip.compress(item_content.encode())
+        gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
+        return gzip64encoded
+    except:
+        print("file error: {}".format(item_id))
+        return False
+
+def save_har(har_dir, item_id, har_content):
+    if not os.path.exists(har_dir):
+        os.makedirs(har_dir)
+    item_id = item_id.split('/')[-1]
+    filename = os.path.join(har_dir, item_id + '.json')
+    with open(filename, 'w') as f:
+        f.write(json.dumps(har_content))
+
 if __name__ == "__main__":
-    all_cookies = load_cookies(get_cookies())
+    all_cookies = load_cookies(get_cookies(), '3thxemke2x7hcibu.onion', crawler_type='onion')
     print(json.dumps(all_cookies))
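Note on the reworked cookie helpers: they accept either a cookie created from the UI (name/value/domain) or one exported from a browser (fields such as 'Host raw', 'Name raw', 'Send for'); for onion crawls the secure flag is dropped and a 10-day expiry is added. A hedged sketch of both paths, with made-up cookie values (the .onion address is the one used in the module's __main__ test):

    import json
    import crawler_splash   # the module patched above; importing it opens the ARDB connections

    # UI-created cookie: only name/value/domain are given; the domain gets a leading dot.
    ui_cookie = crawler_splash.create_cookie_dict(cookie_name='session_id',        # made-up name
                                                  cookie_value='deadbeef',         # made-up value
                                                  domain='3thxemke2x7hcibu.onion',
                                                  crawler_type='onion')            # forces secure=False

    # Browser-exported cookie: field names follow the export format parsed above.
    browser_cookie = {'Host raw': 'http://3thxemke2x7hcibu.onion/',
                      'Path raw': '/',
                      'Name raw': 'session_id',
                      'Content raw': 'deadbeef',
                      'HTTP only raw': 'true',
                      'Send for': 'Encrypted connections only'}

    print(json.dumps([ui_cookie] + crawler_splash.load_cookies([browser_cookie], crawler_type='onion'), indent=2))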

View file

@@ -3,11 +3,8 @@
 import os
 import sys
-import gzip
-import base64
 import uuid
 import datetime
+import base64
 import redis
 import json
 import time
@@ -29,43 +26,9 @@ sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process

 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
-import ConfigLoader
-
-# script_lua_cookie = """
-# function main(splash, args)
-#
-#     -- default config
-#     -- load flash plugin
-#     splash.plugins_enabled = true
-#     splash.html5_media_enabled = true
-#
-#     -- to check
-#     splash.request_body_enabled = true
-#     splash.response_body_enabled = true
-#
-#     -- handle cookies
-#     splash:init_cookies(args.cookies)
-#
-#     assert(splash:go{
-#         args.url,
-#         headers=args.headers,
-#         http_method=args.http_method,
-#         body=args.body
-#     })
-#
-#     splash:wait(10)
-#
-#     -- Response
-#     return {
-#         url = splash:url(),
-#         html = splash:html(),
-#         har = splash:har(),
-#         cookies = splash:get_cookies(),
-#         png = splash:png(render_all=true)
-#     }
-# end
-# """
+#import ConfigLoader
+import Screenshot
+import crawler_splash

 script_cookie = """
 function main(splash, args)
@@ -75,25 +38,32 @@ function main(splash, args)
     splash.images_enabled = true
     splash.webgl_enabled = true
     splash.media_source_enabled = true

     -- Force enable things
     splash.plugins_enabled = true
     splash.request_body_enabled = true
     splash.response_body_enabled = true

+    -- Would be nice
     splash.indexeddb_enabled = true
     splash.html5_media_enabled = true
     splash.http2_enabled = true

     -- User defined
     splash.resource_timeout = args.resource_timeout
     splash.timeout = args.timeout

     -- Allow to pass cookies
     splash:init_cookies(args.cookies)

     -- Run
     ok, reason = splash:go{args.url}
-    if not ok then
-        return {error = reason}
+    if not ok and not reason:find("http") then
+        return {
+            error = reason,
+            last_url = splash:url()
+        }
     end

     splash:wait{args.wait}

     -- Page instrumentation
     -- splash.scroll_position = {y=1000}
@@ -103,7 +73,8 @@ function main(splash, args)
         har = splash:har(),
         html = splash:html(),
         png = splash:png{render_all=true},
-        cookies = splash:get_cookies()
+        cookies = splash:get_cookies(),
+        last_url = splash:url()
     }
 end
 """
@@ -138,7 +109,7 @@ class TorSplashCrawler():
         name = 'TorSplashSpider'

         def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
-            self.type = type
+            self.domain_type = type
             self.requested_mode = requested_mode
             self.original_item = original_item
             self.root_key = None
@@ -149,13 +120,23 @@ class TorSplashCrawler():
             self.full_date = date['date_day']
             self.date_month = date['date_month']
             self.date_epoch = int(date['epoch'])

+            print(requested_mode)
             self.png = True
             self.har = True
             self.cookies = cookies

+            config_section = 'Crawler'
+            self.p = Process(config_section)
+            self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
+            self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.r_serv_log_submit = redis.StrictRedis(
+                host=self.p.config.get("Redis_Log_submit", "host"),
+                port=self.p.config.getint("Redis_Log_submit", "port"),
+                db=self.p.config.getint("Redis_Log_submit", "db"),
+                decode_responses=True)
+
+            self.root_key = None

         def build_request_arg(self, cookies):
             return {'wait': 10,
                     'resource_timeout': 10,
@@ -171,54 +152,64 @@ class TorSplashCrawler():
                 self.parse,
                 errback=self.errback_catcher,
                 endpoint='execute',
-                #meta={'father': self.original_item, 'root_key': None},
+                meta={'father': self.original_item},
                 args=l_cookies
-                #session_id="foo"
             )

+        # # TODO: remove duplicate and anchor
         def parse(self,response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
-           # LUA ERROR
+           # # TODO: print/display errors
+           elif 'error' in response.data:
+               if(response.data['error'] == 'network99'):
+                   print('Connection to proxy refused')
+               else:
+                   print(response.data['error'])
            elif response.status != 200:
                print('other response: {}'.format(response.status))
-               #print(error_log)
-               #detect connection to proxy refused
+               # detect connection to proxy refused
               error_log = (json.loads(response.body.decode()))
-               if(error_log['info']['text'] == 'Connection to proxy refused'):
-                   print('Connection to proxy refused')
+               print(error_log)
            else:
                # DEBUG:
-               print('----')
-               print(response.data.keys())
-               # LUA Script Errors
-               if 'error' in response.data:
-                   print(response.data['error'])
-               else:
-                   print(response.data['html'])
-                   pass
+               # print('----')
+               # print(response.data.keys())
+
+               item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0])
+               self.save_crawled_item(item_id, response.data['html'])
+               crawler_splash.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
+
+               if self.root_key is None:
+                   self.root_key = item_id
+                   crawler_splash.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
+                   crawler_splash.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)

                #print(response.data['cookies'])
                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
-                   for cookie in all_cookies:
-                       print('------------------------')
-                       print(cookie['name'])
-                       print(cookie['value'])
-                       print(cookie)
                    # for cookie in all_cookies:
-                   #     print(cookie.name)
+                   #     print('------------------------')
+                   #     print(cookie['name'])
+                   #     print(cookie['value'])
+                   #     print(cookie)
                else:
                    all_cookies = []

-               # if 'png' in response.data:
-               #if 'har' in response.data:
+               # SCREENSHOT
+               if 'png' in response.data:
+                   sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
+                   if sha256_string:
+                       Screenshot.save_item_relationship(sha256_string, item_id)
+                       Screenshot.save_domain_relationship(sha256_string, self.domains[0])
+
+               # HAR
+               if 'har' in response.data:
+                   crawler_splash.save_har(self.har_dir, item_id, response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
@@ -228,10 +219,8 @@ class TorSplashCrawler():
                        self.parse,
                        errback=self.errback_catcher,
                        endpoint='execute',
-                       #meta={'father': 'inter', 'root_key': response.meta['root_key'], 'session_id': '092384901834adef'},
-                       #meta={'father': 'inter', 'root_key': 'ido', 'session_id': '092384901834adef'},
+                       meta={'father': item_id},
                        args=l_cookies
-                       #session_id="foo"
                    )

        def errback_catcher(self, failure):
@@ -240,10 +229,8 @@ class TorSplashCrawler():
            if failure.check(ResponseNeverReceived):
                request = failure.request
-               #url = request.meta['splash']['args']['url']
-               url= 'ido'
-               #father = request.meta['father']
-               father = 'ido'
+               url= response.data['last_url']
+               father = request.meta['father']

                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
@@ -257,14 +244,26 @@ class TorSplashCrawler():
                    errback=self.errback_catcher,
                    endpoint='execute',
                    cache_args=['lua_source'],
-                   #meta={'father': father, 'root_key': response.meta['root_key']},
-                   #meta={'father': father, 'root_key': 'ido'},
+                   meta={'father': father},
                    args=self.build_request_arg(response.cookiejar)
-                   #session_id="foo"
                )
            else:
                print('failure')
                #print(failure)
                print(failure.type)
-               #print(failure.request.meta['item'])
+
+       def save_crawled_item(self, item_id, item_content):
+           gzip64encoded = crawler_splash.save_crawled_item(item_id, item_content)
+
+           # Send item to queue
+           # send paste to Global
+           relay_message = "{0} {1}".format(item_id, gzip64encoded)
+           self.p.populate_set_out(relay_message, 'Mixer')
+
+           # increase nb of paste by feeder name
+           self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
+
+           # tag crawled paste
+           msg = 'infoleak:submission="crawler";{}'.format(item_id)
+           self.p.populate_set_out(msg, 'Tags')
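To make the new bookkeeping concrete, here is a rough sketch of the calls a single crawled page triggers, using made-up values (the functions and key layouts are the ones added to crawler_splash.py above; item_dir and the father id are illustrative):

    import time
    import crawler_splash   # module added/extended earlier in this commit

    domain = '3thxemke2x7hcibu.onion'                                       # example domain
    item_id = crawler_splash.create_item_id('crawled/2020/03/20', domain)   # domain + random UUID

    # Mark the domain as up and track first_seen / last_check / ports in ARDB_Onion.
    crawler_splash.create_domain_metadata('onion', domain, '80', '20200320', '202003')
    # Root the crawl history for this epoch, then attach the item to its father.
    crawler_splash.add_domain_root_item(item_id, 'onion', domain, int(time.time()), '80')
    crawler_splash.create_item_metadata(item_id, domain, 'http://{}/'.format(domain), '80', 'father_item_id')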

View file

@@ -37,7 +37,7 @@ if __name__ == '__main__':
     crawler_options = crawler_json['crawler_options']
     date = crawler_json['date']
     requested_mode = crawler_json['requested']
-    cookies = crawler_splash.load_cookies(crawler_splash.get_cookies())
+    cookies = crawler_splash.load_cookies(crawler_splash.get_cookies(), domain, crawler_type='onion')
     print(cookies)

     redis_cache.delete('crawler_request:{}'.format(uuid))

View file

@@ -445,7 +445,7 @@
       <div class="text-center">
         <small class="text-info" style="line-height:0.9;">
           <a target="_blank" href="" id="screenshot_link"></a>
-        <small>
+        </small>
       </div>
     </div>

@@ -519,11 +519,11 @@ var draw_img = false;
     $("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['crawler_history']['random_item']['id']}}");
     $("#screenshot_link").text("{{dict_domain['crawler_history']['random_item']['link']}}");
   {%else%}
     var screenshot = "";
   {%endif%}
 {%endif%}
 {%else%}
   var screenshot = "";
 {%endif%}

 img.src = base_url + screenshot;

@@ -561,7 +561,9 @@ function img_error() {
 }

 function reload_image(new_screenshot, link, item_id) {
-  $("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img");
+  if (screenshot) {
+    $("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img");
+  }

   screenshot = new_screenshot;
   img.src=base_url + screenshot;