mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 00:28:22 +00:00
chg: [crawler] bypass login: use cookie provided by user and accept cookie from server + refactor
This commit is contained in:
parent
42ea678b7a
commit
6cfd3fe36d
5 changed files with 217 additions and 110 deletions
|
@ -1,10 +1,12 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*-coding:UTF-8 -*
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
import base64
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import redis
|
import redis
|
||||||
|
|
||||||
|
from hashlib import sha256
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
|
||||||
|
@ -164,6 +166,25 @@ def get_screenshot_file_content(sha256_string):
|
||||||
file_content = BytesIO(f.read())
|
file_content = BytesIO(f.read())
|
||||||
return file_content
|
return file_content
|
||||||
|
|
||||||
|
# if force save, ignore max_size
|
||||||
|
def save_crawled_screeshot(b64_screenshot, max_size, f_save=False):
    """Decode a base64 screenshot and write it to disk under its SHA-256 digest.

    :param b64_screenshot: screenshot encoded as a base64 string
    :param max_size: size limit compared against the estimated decoded size
    :param f_save: force save; when truthy, ``max_size`` is ignored
    :return: the SHA-256 hex digest of the decoded image when a new file was
        written, ``False`` when the screenshot is too large or a file with the
        same digest already exists
    """
    # Estimate the decoded size from the base64 length (4 chars encode 3 bytes).
    screenshot_size = (len(b64_screenshot) * 3) / 4
    if not f_save and screenshot_size >= max_size:
        return False

    image_content = base64.standard_b64decode(b64_screenshot.encode())
    sha256_string = sha256(image_content).hexdigest()
    filepath = get_screenshot_filepath(sha256_string)
    if os.path.isfile(filepath):
        # NOTE(review): an already-stored screenshot is reported as False, so
        # callers testing the return value skip it -- confirm this is intended.
        return False

    # exist_ok avoids the check-then-create race when several crawlers
    # create the same directory concurrently.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'wb') as f:
        f.write(image_content)
    return sha256_string
|
||||||
|
|
||||||
def save_screenshot_file(sha256_string, io_content):
|
def save_screenshot_file(sha256_string, io_content):
|
||||||
filepath = get_screenshot_filepath(sha256_string)
|
filepath = get_screenshot_filepath(sha256_string)
|
||||||
if os.path.isfile(filepath):
|
if os.path.isfile(filepath):
|
||||||
|
|
|
@ -6,54 +6,139 @@ API Helper
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
import base64
|
||||||
|
import gzip
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import redis
|
import redis
|
||||||
import sys
|
import sys
|
||||||
|
import uuid
|
||||||
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||||
|
import ConfigLoader
|
||||||
|
|
||||||
|
|
||||||
|
config_loader = ConfigLoader.ConfigLoader()
|
||||||
|
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
||||||
|
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
|
||||||
|
config_loader = None
|
||||||
|
|
||||||
# # # #
|
# # # #
|
||||||
# Cookies Fields:
|
# Cookies Fields:
|
||||||
# - name
|
# - name
|
||||||
# - value
|
# - value
|
||||||
# - path
|
# - path (optional)
|
||||||
# - domain
|
# - domain (optional)
|
||||||
|
# - secure (optional)
|
||||||
|
# - httpOnly (optional)
|
||||||
# # # #
|
# # # #
|
||||||
def create_cookie_dict(cookie):
|
def create_cookie_dict(browser_cookie=[], cookie_name=None, cookie_value=None, domain=None, crawler_type='regular'):
|
||||||
url = urlparse(cookie['Host raw'])
|
# UI created
|
||||||
#scheme = url.scheme
|
if cookie_name and cookie_value and domain:
|
||||||
is_secure = cookie['Send for'] == 'Encrypted connections only'
|
dict_cookie = create_cookie_dict_from_input(cookie_name, cookie_value, domain)
|
||||||
if 'HTTP only raw' in cookie:
|
# Cookies imported from the browser
|
||||||
if cookie['HTTP only raw'] == "true":
|
else:
|
||||||
is_secure = False
|
dict_cookie = create_cookie_dict_from_browser(browser_cookie)
|
||||||
|
|
||||||
|
# tor browser: disable secure cookie
|
||||||
|
if crawler_type=='onion':
|
||||||
|
dict_cookie['secure'] = False
|
||||||
|
|
||||||
|
dict_cookie['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
|
||||||
|
return dict_cookie
|
||||||
|
|
||||||
|
def create_cookie_dict_from_input(cookie_name, cookie_value, cookie_domain):
    """Build a cookie dict from a name, a value and a domain.

    :param cookie_name: cookie name
    :param cookie_value: cookie value
    :param cookie_domain: domain the cookie applies to, with or without a
        leading dot
    :return: dict with the keys ``name``, ``value`` and ``domain``; the domain
        always carries exactly one leading dot
    """
    # WebKit uses the domain for cookie validation. Strip any leading dots
    # first so an input such as '.example.com' does not become '..example.com'.
    domain = '.{}'.format(str(cookie_domain).lstrip('.'))
    return {'name': cookie_name, 'value': cookie_value, 'domain': domain}
|
||||||
|
|
||||||
|
# # TODO: handle prefix cookies
|
||||||
|
# # TODO: fill empty fields
|
||||||
|
def create_cookie_dict_from_browser(browser_cookie):
|
||||||
|
url = urlparse(browser_cookie['Host raw'])
|
||||||
domain = url.netloc.split(':', 1)[0]
|
domain = url.netloc.split(':', 1)[0]
|
||||||
dict_cookie = {'path': cookie['Path raw'],
|
dict_cookie = {'path': browser_cookie['Path raw'],
|
||||||
'name': cookie['Name raw'],
|
'name': browser_cookie['Name raw'],
|
||||||
'httpOnly': cookie['HTTP only raw'] == 'true',
|
'httpOnly': browser_cookie['HTTP only raw'] == 'true',
|
||||||
'secure': is_secure,
|
'secure': browser_cookie['Send for'] == 'Encrypted connections only',
|
||||||
'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
|
|
||||||
'domain': domain,
|
'domain': domain,
|
||||||
'value': cookie['Content raw']
|
'value': browser_cookie['Content raw']
|
||||||
}
|
}
|
||||||
return dict_cookie
|
return dict_cookie
|
||||||
|
|
||||||
def load_cookies(l_cookies):
|
def load_cookies(l_cookies, domain=None, crawler_type='regular'):
|
||||||
all_cookies = []
|
all_cookies = []
|
||||||
|
|
||||||
for cookie_dict in l_cookies:
|
for cookie_dict in l_cookies:
|
||||||
all_cookies.append(create_cookie_dict(cookie_dict))
|
all_cookies.append(create_cookie_dict(browser_cookie=cookie_dict, crawler_type=crawler_type))
|
||||||
|
|
||||||
return all_cookies
|
return all_cookies
|
||||||
|
|
||||||
def get_cookies():
|
def get_cookies():
|
||||||
l_cookies = []
|
l_cookies = []
|
||||||
return l_cookies
|
return l_cookies
|
||||||
|
|
||||||
|
# domain up
|
||||||
|
def create_domain_metadata(domain_type, domain, current_port, date, date_month):
    """Record a domain as up and update its metadata hash in ``r_serv_onion``.

    :param domain_type: prefix used to build the Redis keys
    :param domain: domain name
    :param current_port: port the domain was reached on
    :param date: value used in the daily key and stored as first_seen/last_check
    :param date_month: value used in the monthly key
    """
    metadata_key = '{}_metadata:{}'.format(domain_type, domain)

    # Add to global sets
    r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain)
    r_serv_onion.sadd('full_{}_up'.format(domain_type), domain)
    r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain)

    # Create the metadata hash on first sight.
    # NOTE(review): nesting was lost in the diff view; 'first_seen' is assumed
    # to be written only for a new domain and 'last_check' on every call.
    if not r_serv_onion.exists(metadata_key):
        r_serv_onion.hset(metadata_key, 'first_seen', date)
    r_serv_onion.hset(metadata_key, 'last_check', date)

    # Update domain port list (stored as a ';'-separated string)
    stored_ports = r_serv_onion.hget(metadata_key, 'ports')
    all_domain_ports = stored_ports.split(';') if stored_ports else []
    # Stored ports are strings: compare and join as str so an int port
    # neither duplicates an existing entry nor breaks ';'.join().
    port = str(current_port)
    if port not in all_domain_ports:
        all_domain_ports.append(port)
        r_serv_onion.hset(metadata_key, 'ports', ';'.join(all_domain_ports))
|
||||||
|
|
||||||
|
# add root_item to history
|
||||||
|
def add_domain_root_item(root_item, domain_type, domain, epoch_date, port):
    """Add ``root_item`` to the crawler history sorted set of a domain/port.

    The entry is stored in ``r_serv_onion`` under
    ``crawler_history_<domain_type>:<domain>:<port>`` together with
    ``epoch_date``.
    """
    history_key = 'crawler_history_{}:{}:{}'.format(domain_type, domain, port)
    # NOTE(review): positional zadd(key, epoch_date, root_item) -- presumably
    # the pre-3.0 redis-py signature; confirm against the installed client.
    r_serv_onion.zadd(history_key, epoch_date, root_item)
|
||||||
|
|
||||||
|
def create_item_metadata(item_id, domain, url, port, item_father):
    """Store the crawl metadata of an item and link it to its father item.

    Writes the ``father``, ``domain`` (as ``<domain>:<port>``) and
    ``real_link`` fields of ``paste_metadata:<item_id>`` and adds ``item_id``
    to the ``paste_children:<item_father>`` set in ``r_serv_metadata``.
    """
    item_key = 'paste_metadata:{}'.format(item_id)
    item_fields = (
        ('father', item_father),
        ('domain', '{}:{}'.format(domain, port)),
        ('real_link', url),
    )
    for field_name, field_value in item_fields:
        r_serv_metadata.hset(item_key, field_name, field_value)

    # Register this item as a child of its father
    r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id)
|
||||||
|
|
||||||
|
def create_item_id(item_dir, domain, max_domain_len=215):
    """Build a unique item id: ``item_dir`` joined with domain + random UUID4.

    :param item_dir: directory part of the item id
    :param domain: domain name used as prefix of the item file name
    :param max_domain_len: maximum number of trailing characters of ``domain``
        kept in the file name (default 215, the previously hard-coded value;
        presumably chosen to keep the file name within filesystem limits)
    :return: ``os.path.join(item_dir, <domain tail><uuid4>)``
    """
    if len(domain) > max_domain_len:
        # Keep only the tail of an over-long domain. Slicing from an explicit
        # start index also behaves correctly for max_domain_len == 0.
        domain = domain[len(domain) - max_domain_len:]
    return os.path.join(item_dir, domain + str(uuid.uuid4()))
|
||||||
|
|
||||||
|
def save_crawled_item(item_id, item_content):
    """Gzip-compress ``item_content`` and return it base64 encoded.

    Despite its name, this function writes nothing: it only builds the
    encoded payload.

    :param item_id: item identifier, used only in the error message
    :param item_content: item content as a ``str``
    :return: base64 string of the gzip-compressed content, or ``False`` when
        the content could not be encoded
    """
    try:
        gzipencoded = gzip.compress(item_content.encode())
    except Exception:
        # Best effort: report and signal failure to the caller. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        print("file error: {}".format(item_id))
        return False
    return base64.standard_b64encode(gzipencoded).decode()
|
||||||
|
|
||||||
|
def save_har(har_dir, item_id, har_content):
    """Write ``har_content`` as JSON to ``<har_dir>/<item basename>.json``.

    :param har_dir: destination directory, created if missing
    :param item_id: item id; only the part after the last ``/`` is used as
        file name
    :param har_content: JSON-serialisable HAR data
    """
    # Serialise first so a serialisation error does not leave an empty file.
    har_json = json.dumps(har_content)
    # exist_ok avoids the check-then-create race between concurrent crawlers.
    os.makedirs(har_dir, exist_ok=True)
    item_id = item_id.split('/')[-1]
    filename = os.path.join(har_dir, item_id + '.json')
    with open(filename, 'w') as f:
        f.write(har_json)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
all_cookies = load_cookies(get_cookies())
|
all_cookies = load_cookies(get_cookies(), '3thxemke2x7hcibu.onion', crawler_type='onion')
|
||||||
print(json.dumps(all_cookies))
|
print(json.dumps(all_cookies))
|
||||||
|
|
|
@ -3,11 +3,8 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import gzip
|
|
||||||
import base64
|
|
||||||
import uuid
|
import uuid
|
||||||
import datetime
|
import datetime
|
||||||
import base64
|
|
||||||
import redis
|
import redis
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
@ -29,43 +26,9 @@ sys.path.append(os.environ['AIL_BIN'])
|
||||||
from Helper import Process
|
from Helper import Process
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
|
||||||
import ConfigLoader
|
#import ConfigLoader
|
||||||
|
import Screenshot
|
||||||
# script_lua_cookie = """
|
import crawler_splash
|
||||||
# function main(splash, args)
|
|
||||||
#
|
|
||||||
# -- default config
|
|
||||||
# -- load flash plugin
|
|
||||||
# splash.plugins_enabled = true
|
|
||||||
# splash.html5_media_enabled = true
|
|
||||||
#
|
|
||||||
# -- to check
|
|
||||||
# splash.request_body_enabled = true
|
|
||||||
# splash.response_body_enabled = true
|
|
||||||
#
|
|
||||||
# -- handle cookies
|
|
||||||
# splash:init_cookies(args.cookies)
|
|
||||||
#
|
|
||||||
# assert(splash:go{
|
|
||||||
# args.url,
|
|
||||||
# headers=args.headers,
|
|
||||||
# http_method=args.http_method,
|
|
||||||
# body=args.body
|
|
||||||
# })
|
|
||||||
#
|
|
||||||
# splash:wait(10)
|
|
||||||
#
|
|
||||||
# -- Response
|
|
||||||
# return {
|
|
||||||
# url = splash:url(),
|
|
||||||
# html = splash:html(),
|
|
||||||
# har = splash:har(),
|
|
||||||
# cookies = splash:get_cookies(),
|
|
||||||
# png = splash:png(render_all=true)
|
|
||||||
# }
|
|
||||||
# end
|
|
||||||
# """
|
|
||||||
|
|
||||||
|
|
||||||
script_cookie = """
|
script_cookie = """
|
||||||
function main(splash, args)
|
function main(splash, args)
|
||||||
|
@ -75,25 +38,32 @@ function main(splash, args)
|
||||||
splash.images_enabled = true
|
splash.images_enabled = true
|
||||||
splash.webgl_enabled = true
|
splash.webgl_enabled = true
|
||||||
splash.media_source_enabled = true
|
splash.media_source_enabled = true
|
||||||
|
|
||||||
-- Force enable things
|
-- Force enable things
|
||||||
splash.plugins_enabled = true
|
splash.plugins_enabled = true
|
||||||
splash.request_body_enabled = true
|
splash.request_body_enabled = true
|
||||||
splash.response_body_enabled = true
|
splash.response_body_enabled = true
|
||||||
-- Would be nice
|
|
||||||
splash.indexeddb_enabled = true
|
splash.indexeddb_enabled = true
|
||||||
splash.html5_media_enabled = true
|
splash.html5_media_enabled = true
|
||||||
splash.http2_enabled = true
|
splash.http2_enabled = true
|
||||||
|
|
||||||
-- User defined
|
-- User defined
|
||||||
splash.resource_timeout = args.resource_timeout
|
splash.resource_timeout = args.resource_timeout
|
||||||
splash.timeout = args.timeout
|
splash.timeout = args.timeout
|
||||||
|
|
||||||
-- Allow to pass cookies
|
-- Allow to pass cookies
|
||||||
splash:init_cookies(args.cookies)
|
splash:init_cookies(args.cookies)
|
||||||
|
|
||||||
-- Run
|
-- Run
|
||||||
ok, reason = splash:go{args.url}
|
ok, reason = splash:go{args.url}
|
||||||
if not ok then
|
if not ok and not reason:find("http") then
|
||||||
return {error = reason}
|
return {
|
||||||
|
error = reason,
|
||||||
|
last_url = splash:url()
|
||||||
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
splash:wait{args.wait}
|
splash:wait{args.wait}
|
||||||
-- Page instrumentation
|
-- Page instrumentation
|
||||||
-- splash.scroll_position = {y=1000}
|
-- splash.scroll_position = {y=1000}
|
||||||
|
@ -103,7 +73,8 @@ function main(splash, args)
|
||||||
har = splash:har(),
|
har = splash:har(),
|
||||||
html = splash:html(),
|
html = splash:html(),
|
||||||
png = splash:png{render_all=true},
|
png = splash:png{render_all=true},
|
||||||
cookies = splash:get_cookies()
|
cookies = splash:get_cookies(),
|
||||||
|
last_url = splash:url()
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
"""
|
"""
|
||||||
|
@ -138,7 +109,7 @@ class TorSplashCrawler():
|
||||||
name = 'TorSplashSpider'
|
name = 'TorSplashSpider'
|
||||||
|
|
||||||
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
|
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
|
||||||
self.type = type
|
self.domain_type = type
|
||||||
self.requested_mode = requested_mode
|
self.requested_mode = requested_mode
|
||||||
self.original_item = original_item
|
self.original_item = original_item
|
||||||
self.root_key = None
|
self.root_key = None
|
||||||
|
@ -149,13 +120,23 @@ class TorSplashCrawler():
|
||||||
self.full_date = date['date_day']
|
self.full_date = date['date_day']
|
||||||
self.date_month = date['date_month']
|
self.date_month = date['date_month']
|
||||||
self.date_epoch = int(date['epoch'])
|
self.date_epoch = int(date['epoch'])
|
||||||
|
|
||||||
print(requested_mode)
|
|
||||||
self.png = True
|
self.png = True
|
||||||
self.har = True
|
self.har = True
|
||||||
|
|
||||||
self.cookies = cookies
|
self.cookies = cookies
|
||||||
|
|
||||||
|
config_section = 'Crawler'
|
||||||
|
self.p = Process(config_section)
|
||||||
|
self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
|
||||||
|
self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
|
||||||
|
self.r_serv_log_submit = redis.StrictRedis(
|
||||||
|
host=self.p.config.get("Redis_Log_submit", "host"),
|
||||||
|
port=self.p.config.getint("Redis_Log_submit", "port"),
|
||||||
|
db=self.p.config.getint("Redis_Log_submit", "db"),
|
||||||
|
decode_responses=True)
|
||||||
|
|
||||||
|
self.root_key = None
|
||||||
|
|
||||||
def build_request_arg(self, cookies):
|
def build_request_arg(self, cookies):
|
||||||
return {'wait': 10,
|
return {'wait': 10,
|
||||||
'resource_timeout': 10,
|
'resource_timeout': 10,
|
||||||
|
@ -171,54 +152,64 @@ class TorSplashCrawler():
|
||||||
self.parse,
|
self.parse,
|
||||||
errback=self.errback_catcher,
|
errback=self.errback_catcher,
|
||||||
endpoint='execute',
|
endpoint='execute',
|
||||||
#meta={'father': self.original_item, 'root_key': None},
|
meta={'father': self.original_item},
|
||||||
args=l_cookies
|
args=l_cookies
|
||||||
#session_id="foo"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# # TODO: remove duplicate and anchor
|
||||||
def parse(self,response):
|
def parse(self,response):
|
||||||
#print(response.headers)
|
#print(response.headers)
|
||||||
#print(response.status)
|
#print(response.status)
|
||||||
if response.status == 504:
|
if response.status == 504:
|
||||||
# down ?
|
# down ?
|
||||||
print('504 detected')
|
print('504 detected')
|
||||||
|
|
||||||
|
# LUA ERROR # # TODO: print/display errors
|
||||||
|
elif 'error' in response.data:
|
||||||
|
if(response.data['error'] == 'network99'):
|
||||||
|
print('Connection to proxy refused')
|
||||||
|
else:
|
||||||
|
print(response.data['error'])
|
||||||
|
|
||||||
elif response.status != 200:
|
elif response.status != 200:
|
||||||
print('other response: {}'.format(response.status))
|
print('other response: {}'.format(response.status))
|
||||||
#print(error_log)
|
# detect connection to proxy refused
|
||||||
#detect connection to proxy refused
|
|
||||||
error_log = (json.loads(response.body.decode()))
|
error_log = (json.loads(response.body.decode()))
|
||||||
if(error_log['info']['text'] == 'Connection to proxy refused'):
|
print(error_log)
|
||||||
print('Connection to proxy refused')
|
|
||||||
else:
|
else:
|
||||||
# DEBUG:
|
# DEBUG:
|
||||||
print('----')
|
# print('----')
|
||||||
print(response.data.keys())
|
# print(response.data.keys())
|
||||||
|
|
||||||
# LUA Script Errors
|
item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0])
|
||||||
if 'error' in response.data:
|
self.save_crawled_item(item_id, response.data['html'])
|
||||||
print(response.data['error'])
|
crawler_splash.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
|
||||||
else:
|
|
||||||
print(response.data['html'])
|
if self.root_key is None:
|
||||||
pass
|
self.root_key = item_id
|
||||||
|
crawler_splash.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
|
||||||
|
crawler_splash.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
|
||||||
|
|
||||||
#print(response.data['cookies'])
|
#print(response.data['cookies'])
|
||||||
if 'cookies' in response.data:
|
if 'cookies' in response.data:
|
||||||
all_cookies = response.data['cookies']
|
all_cookies = response.data['cookies']
|
||||||
for cookie in all_cookies:
|
|
||||||
print('------------------------')
|
|
||||||
print(cookie['name'])
|
|
||||||
print(cookie['value'])
|
|
||||||
print(cookie)
|
|
||||||
# for cookie in all_cookies:
|
# for cookie in all_cookies:
|
||||||
# print(cookie.name)
|
# print('------------------------')
|
||||||
|
# print(cookie['name'])
|
||||||
|
# print(cookie['value'])
|
||||||
|
# print(cookie)
|
||||||
else:
|
else:
|
||||||
all_cookies = []
|
all_cookies = []
|
||||||
|
|
||||||
|
# SCREENSHOT
|
||||||
# if 'png' in response.data:
|
if 'png' in response.data:
|
||||||
|
sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
|
||||||
|
if sha256_string:
|
||||||
#if 'har' in response.data:
|
Screenshot.save_item_relationship(sha256_string, item_id)
|
||||||
|
Screenshot.save_domain_relationship(sha256_string, self.domains[0])
|
||||||
|
# HAR
|
||||||
|
if 'har' in response.data:
|
||||||
|
crawler_splash.save_har(self.har_dir, item_id, response.data['har'])
|
||||||
|
|
||||||
le = LinkExtractor(allow_domains=self.domains, unique=True)
|
le = LinkExtractor(allow_domains=self.domains, unique=True)
|
||||||
for link in le.extract_links(response):
|
for link in le.extract_links(response):
|
||||||
|
@ -228,10 +219,8 @@ class TorSplashCrawler():
|
||||||
self.parse,
|
self.parse,
|
||||||
errback=self.errback_catcher,
|
errback=self.errback_catcher,
|
||||||
endpoint='execute',
|
endpoint='execute',
|
||||||
#meta={'father': 'inter', 'root_key': response.meta['root_key'], 'session_id': '092384901834adef'},
|
meta={'father': item_id},
|
||||||
#meta={'father': 'inter', 'root_key': 'ido', 'session_id': '092384901834adef'},
|
|
||||||
args=l_cookies
|
args=l_cookies
|
||||||
#session_id="foo"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def errback_catcher(self, failure):
|
def errback_catcher(self, failure):
|
||||||
|
@ -240,10 +229,8 @@ class TorSplashCrawler():
|
||||||
|
|
||||||
if failure.check(ResponseNeverReceived):
|
if failure.check(ResponseNeverReceived):
|
||||||
request = failure.request
|
request = failure.request
|
||||||
#url = request.meta['splash']['args']['url']
|
url= response.data['last_url']
|
||||||
url= 'ido'
|
father = request.meta['father']
|
||||||
#father = request.meta['father']
|
|
||||||
father = 'ido'
|
|
||||||
|
|
||||||
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
|
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
@ -257,14 +244,26 @@ class TorSplashCrawler():
|
||||||
errback=self.errback_catcher,
|
errback=self.errback_catcher,
|
||||||
endpoint='execute',
|
endpoint='execute',
|
||||||
cache_args=['lua_source'],
|
cache_args=['lua_source'],
|
||||||
#meta={'father': father, 'root_key': response.meta['root_key']},
|
meta={'father': father},
|
||||||
#meta={'father': father, 'root_key': 'ido'},
|
|
||||||
args=self.build_request_arg(response.cookiejar)
|
args=self.build_request_arg(response.cookiejar)
|
||||||
#session_id="foo"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print('failure')
|
print('failure')
|
||||||
#print(failure)
|
#print(failure)
|
||||||
print(failure.type)
|
print(failure.type)
|
||||||
#print(failure.request.meta['item'])
|
|
||||||
|
def save_crawled_item(self, item_id, item_content):
    """Encode a crawled item and push it to the 'Mixer' and 'Tags' queues.

    :param item_id: id of the crawled item
    :param item_content: item content passed to
        ``crawler_splash.save_crawled_item`` for gzip + base64 encoding
    """
    gzip64encoded = crawler_splash.save_crawled_item(item_id, item_content)
    if not gzip64encoded:
        # The helper returns False when encoding fails; do not relay a
        # "<item_id> False" message, count or tag an item that has no content.
        return

    # Send item to queue: relay the encoded item to Global through the Mixer
    relay_message = "{0} {1}".format(item_id, gzip64encoded)
    self.p.populate_set_out(relay_message, 'Mixer')

    # Increase the number of items by feeder name
    self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

    # Tag the crawled item
    msg = 'infoleak:submission="crawler";{}'.format(item_id)
    self.p.populate_set_out(msg, 'Tags')
|
||||||
|
|
|
@ -37,7 +37,7 @@ if __name__ == '__main__':
|
||||||
crawler_options = crawler_json['crawler_options']
|
crawler_options = crawler_json['crawler_options']
|
||||||
date = crawler_json['date']
|
date = crawler_json['date']
|
||||||
requested_mode = crawler_json['requested']
|
requested_mode = crawler_json['requested']
|
||||||
cookies = crawler_splash.load_cookies(crawler_splash.get_cookies())
|
cookies = crawler_splash.load_cookies(crawler_splash.get_cookies(), domain, crawler_type='onion')
|
||||||
print(cookies)
|
print(cookies)
|
||||||
|
|
||||||
redis_cache.delete('crawler_request:{}'.format(uuid))
|
redis_cache.delete('crawler_request:{}'.format(uuid))
|
||||||
|
|
|
@ -445,7 +445,7 @@
|
||||||
<div class="text-center">
|
<div class="text-center">
|
||||||
<small class="text-info" style="line-height:0.9;">
|
<small class="text-info" style="line-height:0.9;">
|
||||||
<a target="_blank" href="" id="screenshot_link"></a>
|
<a target="_blank" href="" id="screenshot_link"></a>
|
||||||
<small>
|
</small>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -519,11 +519,11 @@ var draw_img = false;
|
||||||
$("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['crawler_history']['random_item']['id']}}");
|
$("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['crawler_history']['random_item']['id']}}");
|
||||||
$("#screenshot_link").text("{{dict_domain['crawler_history']['random_item']['link']}}");
|
$("#screenshot_link").text("{{dict_domain['crawler_history']['random_item']['link']}}");
|
||||||
{%else%}
|
{%else%}
|
||||||
var screenshot = "";
|
var screenshot = "";
|
||||||
{%endif%}
|
{%endif%}
|
||||||
{%endif%}
|
{%endif%}
|
||||||
{%else%}
|
{%else%}
|
||||||
var screenshot = "";
|
var screenshot = "";
|
||||||
{%endif%}
|
{%endif%}
|
||||||
|
|
||||||
img.src = base_url + screenshot;
|
img.src = base_url + screenshot;
|
||||||
|
@ -561,7 +561,9 @@ function img_error() {
|
||||||
}
|
}
|
||||||
|
|
||||||
function reload_image(new_screenshot, link, item_id) {
|
function reload_image(new_screenshot, link, item_id) {
|
||||||
$("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img");
|
if (screenshot) {
|
||||||
|
$("#"+screenshot.replace(/\//g, "")).removeClass("icon_selected").addClass("icon_img");
|
||||||
|
}
|
||||||
screenshot = new_screenshot;
|
screenshot = new_screenshot;
|
||||||
|
|
||||||
img.src=base_url + screenshot;
|
img.src=base_url + screenshot;
|
||||||
|
|
Loading…
Reference in a new issue