Mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-23 14:37:17 +00:00
chg: [Splash Crawler] use cookies to bypass login
This commit is contained in:
parent 4300fd7803
commit 42ea678b7a
3 changed files with 213 additions and 190 deletions
bin/lib/crawler_splash.py (new executable file, 59 additions)
@@ -0,0 +1,59 @@
#!/usr/bin/python3

"""
API Helper
===================


"""

import json
import os
import re
import redis
import sys

from datetime import datetime, timedelta
from urllib.parse import urlparse

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))

# # # #
# Cookies Fields:
#   - name
#   - value
#   - path
#   - domain
# # # #
def create_cookie_dict(cookie):
    url = urlparse(cookie['Host raw'])
    #scheme = url.scheme
    is_secure = cookie['Send for'] == 'Encrypted connections only'
    if 'HTTP only raw' in cookie:
        if cookie['HTTP only raw'] == "true":
            is_secure = False
    domain = url.netloc.split(':', 1)[0]
    dict_cookie = {'path': cookie['Path raw'],
                   'name': cookie['Name raw'],
                   'httpOnly': cookie['HTTP only raw'] == 'true',
                   'secure': is_secure,
                   'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                   'domain': domain,
                   'value': cookie['Content raw']
                   }
    return dict_cookie

def load_cookies(l_cookies):
    all_cookies = []
    for cookie_dict in l_cookies:
        all_cookies.append(create_cookie_dict(cookie_dict))
    return all_cookies

def get_cookies():
    l_cookies = []
    return l_cookies

if __name__ == "__main__":
    all_cookies = load_cookies(get_cookies())
    print(json.dumps(all_cookies))
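For reference (not part of the commit): a minimal sketch of how the new helper above is meant to be fed. The input dicts use the 'Host raw' / 'Name raw' / 'Content raw' / 'Path raw' / 'Send for' / 'HTTP only raw' fields read by create_cookie_dict(); the sample cookie values and the example.onion domain below are illustrative only.

import json
import os
import sys

# assumes AIL_BIN is set and bin/lib is on the path, as tor_crawler.py does
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
from crawler_splash import load_cookies

# illustrative cookie-export entry; field names match what create_cookie_dict() reads
exported_cookies = [
    {
        'Host raw': 'http://example.onion/',
        'Name raw': 'session_id',
        'Content raw': 'deadbeef1234',
        'Path raw': '/',
        'Send for': 'Any type of connection',
        'HTTP only raw': 'true',
    },
]

splash_cookies = load_cookies(exported_cookies)
print(json.dumps(splash_cookies, indent=2))
# -> one dict per cookie with the name/value/path/domain/httpOnly/secure/expires
#    keys expected by splash:init_cookies()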
@@ -23,15 +23,95 @@ from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler

-from scrapy_splash import SplashRequest
+from scrapy_splash import SplashRequest, SplashJsonResponse

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import ConfigLoader
+
+# script_lua_cookie = """
+# function main(splash, args)
+#
+#     -- default config
+#     -- load flash plugin
+#     splash.plugins_enabled = true
+#     splash.html5_media_enabled = true
+#
+#     -- to check
+#     splash.request_body_enabled = true
+#     splash.response_body_enabled = true
+#
+#     -- handle cookies
+#     splash:init_cookies(args.cookies)
+#
+#     assert(splash:go{
+#         args.url,
+#         headers=args.headers,
+#         http_method=args.http_method,
+#         body=args.body
+#     })
+#
+#     splash:wait(10)
+#
+#     -- Response
+#     return {
+#         url = splash:url(),
+#         html = splash:html(),
+#         har = splash:har(),
+#         cookies = splash:get_cookies(),
+#         png = splash:png(render_all=true)
+#     }
+# end
+# """
+
+script_cookie = """
+function main(splash, args)
+    -- Default values
+    splash.js_enabled = true
+    splash.private_mode_enabled = true
+    splash.images_enabled = true
+    splash.webgl_enabled = true
+    splash.media_source_enabled = true
+    -- Force enable things
+    splash.plugins_enabled = true
+    splash.request_body_enabled = true
+    splash.response_body_enabled = true
+    -- Would be nice
+    splash.indexeddb_enabled = true
+    splash.html5_media_enabled = true
+    splash.http2_enabled = true
+    -- User defined
+    splash.resource_timeout = args.resource_timeout
+    splash.timeout = args.timeout
+
+    -- Allow to pass cookies
+    splash:init_cookies(args.cookies)
+    -- Run
+    ok, reason = splash:go{args.url}
+    if not ok then
+        return {error = reason}
+    end
+    splash:wait{args.wait}
+    -- Page instrumentation
+    -- splash.scroll_position = {y=1000}
+    splash:wait{args.wait}
+    -- Response
+    return {
+        har = splash:har(),
+        html = splash:html(),
+        png = splash:png{render_all=true},
+        cookies = splash:get_cookies()
+    }
+end
+"""

class TorSplashCrawler():

    def __init__(self, splash_url, crawler_options):
-        self.process = CrawlerProcess({'LOG_ENABLED': False})
+        self.process = CrawlerProcess({'LOG_ENABLED': True})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
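For context (not part of the commit), a minimal sketch of how a Lua script such as script_cookie above is handed to Splash through scrapy-splash: the request's args dict is exposed to the script as the args table, so args.cookies, args.wait, args.timeout and args.resource_timeout are filled from the crawler side. The helper name below is illustrative.

from scrapy_splash import SplashRequest

def make_execute_request(url, cookies, callback, errback=None):
    # every key placed in args is visible to the Lua script as args.<key>
    splash_args = {
        'lua_source': script_cookie,   # Lua entry point defined above
        'cookies': cookies,            # list of cookie dicts for splash:init_cookies()
        'wait': 10,
        'resource_timeout': 10,
        'timeout': 30,
    }
    return SplashRequest(url, callback, errback=errback,
                         endpoint='execute', args=splash_args)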
@@ -39,23 +119,25 @@ class TorSplashCrawler():
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+                                      'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
-            'RETRY_TIMES': 2,
+            'RETRY_TIMES': 0,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
-            'DEPTH_LIMIT': crawler_options['depth_limit']
+            'DEPTH_LIMIT': crawler_options['depth_limit'],
+            'SPLASH_COOKIES_DEBUG': True
            })

-    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, original_item):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, original_item=original_item)
+    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

-        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs):
+        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
@@ -68,57 +150,30 @@ class TorSplashCrawler():
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

-            # # TODO: timeout in config
-            self.arg_crawler = { 'html': crawler_options['html'],
-                                 'wait': 10,
-                                 'render_all': 1,
-                                 'timeout': 30,
-                                 'har': crawler_options['har'],
-                                 'png': crawler_options['png']}
-
-            config_section = 'Crawler'
-            self.p = Process(config_section)
-
-            self.r_cache = redis.StrictRedis(
-                host=self.p.config.get("Redis_Cache", "host"),
-                port=self.p.config.getint("Redis_Cache", "port"),
-                db=self.p.config.getint("Redis_Cache", "db"),
-                decode_responses=True)
-
-            self.r_serv_log_submit = redis.StrictRedis(
-                host=self.p.config.get("Redis_Log_submit", "host"),
-                port=self.p.config.getint("Redis_Log_submit", "port"),
-                db=self.p.config.getint("Redis_Log_submit", "db"),
-                decode_responses=True)
-
-            self.r_serv_metadata = redis.StrictRedis(
-                host=self.p.config.get("ARDB_Metadata", "host"),
-                port=self.p.config.getint("ARDB_Metadata", "port"),
-                db=self.p.config.getint("ARDB_Metadata", "db"),
-                decode_responses=True)
-
-            self.r_serv_onion = redis.StrictRedis(
-                host=self.p.config.get("ARDB_Onion", "host"),
-                port=self.p.config.getint("ARDB_Onion", "port"),
-                db=self.p.config.getint("ARDB_Onion", "db"),
-                decode_responses=True)
-
-            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
-
-            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
-                                self.p.config.get("Directories", "crawled"), date_str )
-
-            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
-            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
+            print(requested_mode)
+            self.png = True
+            self.har = True
+            self.cookies = cookies
+
+        def build_request_arg(self, cookies):
+            return {'wait': 10,
+                    'resource_timeout': 10,
+                    'timeout': 30,
+                    'cookies': cookies,
+                    'lua_source': script_cookie
+                    }

        def start_requests(self):
+            l_cookies = self.build_request_arg(self.cookies)
            yield SplashRequest(
                self.start_urls,
                self.parse,
                errback=self.errback_catcher,
-                endpoint='render.json',
-                meta={'father': self.original_item, 'root_key': None},
-                args=self.arg_crawler
+                endpoint='execute',
+                #meta={'father': self.original_item, 'root_key': None},
+                args=l_cookies
+                #session_id="foo"
            )

        def parse(self,response):
@@ -135,99 +190,49 @@ class TorSplashCrawler():
                if(error_log['info']['text'] == 'Connection to proxy refused'):
                    print('Connection to proxy refused')
            else:
-                #avoid filename too big
-                if len(self.domains[0]) > 215:
-                    UUID = self.domains[0][-215:]+str(uuid.uuid4())
-                else:
-                    UUID = self.domains[0]+str(uuid.uuid4())
-                filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
-                relative_filename_paste = os.path.join(self.crawler_path, UUID)
-                filename_har = os.path.join(self.crawled_har, UUID)
-
-                # # TODO: modify me
-                # save new paste on disk
-                if self.save_crawled_paste(relative_filename_paste, response.data['html']):
-
-                    # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
-                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
-
-                    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
-                    self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
-                    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
-
-                    # create onion metadata
-                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
-                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
-
-                    # create root_key
-                    if self.root_key is None:
-                        self.root_key = relative_filename_paste
-                        # Create/Update crawler history
-                        self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
-                        # Update domain port number
-                        all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
-                        if all_domain_ports:
-                            all_domain_ports = all_domain_ports.split(';')
-                        else:
-                            all_domain_ports = []
-                        if self.port not in all_domain_ports:
-                            all_domain_ports.append(self.port)
-                            self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))
-
-                    #create paste metadata
-                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
-                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
-                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
-                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)
-
-                    self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)
-
-                    if 'png' in response.data:
-                        size_screenshot = (len(response.data['png'])*3) /4
-                        if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto
-                            image_content = base64.standard_b64decode(response.data['png'].encode())
-                            hash = sha256(image_content).hexdigest()
-                            img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
-                            filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
-                            dirname = os.path.dirname(filename_img)
-                            if not os.path.exists(dirname):
-                                os.makedirs(dirname)
-                            if not os.path.exists(filename_img):
-                                with open(filename_img, 'wb') as f:
-                                    f.write(image_content)
-                            # add item metadata
-                            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
-                            # add sha256 metadata
-                            self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste)
-                            # domain map
-                            self.r_serv_onion.sadd('domain_screenshot:{}'.format(self.domains[0]), hash)
-                            self.r_serv_onion.sadd('screenshot_domain:{}'.format(hash), self.domains[0])
-
-                    if 'har' in response.data:
-                        dirname = os.path.dirname(filename_har)
-                        if not os.path.exists(dirname):
-                            os.makedirs(dirname)
-                        with open(filename_har+'.json', 'wb') as f:
-                            f.write(json.dumps(response.data['har']).encode())
-
-                    # save external links in set
-                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
-                    #for link in lext.extract_links(response):
-                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
-                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
-
-                    le = LinkExtractor(allow_domains=self.domains, unique=True)
-                    for link in le.extract_links(response):
-                        yield SplashRequest(
-                            link.url,
-                            self.parse,
-                            errback=self.errback_catcher,
-                            endpoint='render.json',
-                            meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
-                            args=self.arg_crawler
-                        )
+                # DEBUG:
+                print('----')
+                print(response.data.keys())
+
+                # LUA Script Errors
+                if 'error' in response.data:
+                    print(response.data['error'])
+                else:
+                    print(response.data['html'])
+                    pass
+
+                #print(response.data['cookies'])
+                if 'cookies' in response.data:
+                    all_cookies = response.data['cookies']
+                    for cookie in all_cookies:
+                        print('------------------------')
+                        print(cookie['name'])
+                        print(cookie['value'])
+                        print(cookie)
+                    # for cookie in all_cookies:
+                    #     print(cookie.name)
+                else:
+                    all_cookies = []
+
+                # if 'png' in response.data:
+
+                #if 'har' in response.data:
+
+                le = LinkExtractor(allow_domains=self.domains, unique=True)
+                for link in le.extract_links(response):
+                    l_cookies = self.build_request_arg(all_cookies)
+                    yield SplashRequest(
+                        link.url,
+                        self.parse,
+                        errback=self.errback_catcher,
+                        endpoint='execute',
+                        #meta={'father': 'inter', 'root_key': response.meta['root_key'], 'session_id': '092384901834adef'},
+                        #meta={'father': 'inter', 'root_key': 'ido', 'session_id': '092384901834adef'},
+                        args=l_cookies
+                        #session_id="foo"
+                    )

        def errback_catcher(self, failure):
            # catch all errback failures,
@@ -235,8 +240,10 @@ class TorSplashCrawler():

            if failure.check(ResponseNeverReceived):
                request = failure.request
-                url = request.meta['splash']['args']['url']
-                father = request.meta['father']
+                #url = request.meta['splash']['args']['url']
+                url= 'ido'
+                #father = request.meta['father']
+                father = 'ido'

                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
@@ -248,9 +255,12 @@ class TorSplashCrawler():
                    url,
                    self.parse,
                    errback=self.errback_catcher,
-                    endpoint='render.json',
-                    meta={'father': father, 'root_key': response.meta['root_key']},
-                    args=self.arg_crawler
+                    endpoint='execute',
+                    cache_args=['lua_source'],
+                    #meta={'father': father, 'root_key': response.meta['root_key']},
+                    #meta={'father': father, 'root_key': 'ido'},
+                    args=self.build_request_arg(response.cookiejar)
+                    #session_id="foo"
                )

            else:
@@ -258,52 +268,3 @@ class TorSplashCrawler():
            #print(failure)
            print(failure.type)
            #print(failure.request.meta['item'])

-            '''
-            #if isinstance(failure.value, HttpError):
-            elif failure.check(HttpError):
-                # you can get the response
-                response = failure.value.response
-                print('HttpError')
-                self.logger.error('HttpError on %s', response.url)
-
-            #elif isinstance(failure.value, DNSLookupError):
-            elif failure.check(DNSLookupError):
-                # this is the original request
-                request = failure.request
-                print(DNSLookupError)
-                print('DNSLookupError')
-                self.logger.error('DNSLookupError on %s', request.url)
-
-            #elif isinstance(failure.value, TimeoutError):
-            elif failure.check(TimeoutError):
-                request = failure.request
-                print('TimeoutError')
-                print(TimeoutError)
-                self.logger.error('TimeoutError on %s', request.url)
-            '''
-
-        def save_crawled_paste(self, filename, content):
-
-            if os.path.isfile(filename):
-                print('File: {} already exist in submitted pastes'.format(filename))
-                return False
-
-            try:
-                gzipencoded = gzip.compress(content.encode())
-                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
-            except:
-                print("file error: {}".format(filename))
-                return False
-
-            # send paste to Global
-            relay_message = "{0} {1}".format(filename, gzip64encoded)
-            self.p.populate_set_out(relay_message, 'Mixer')
-
-            # increase nb of paste by feeder name
-            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
-
-            # tag crawled paste
-            msg = 'infoleak:submission="crawler";{}'.format(filename)
-            self.p.populate_set_out(msg, 'Tags')
-            return True
@@ -9,6 +9,7 @@ from TorSplashCrawler import TorSplashCrawler

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
+import crawler_splash

if __name__ == '__main__':
@@ -36,8 +37,10 @@ if __name__ == '__main__':
    crawler_options = crawler_json['crawler_options']
    date = crawler_json['date']
    requested_mode = crawler_json['requested']
+    cookies = crawler_splash.load_cookies(crawler_splash.get_cookies())
+    print(cookies)

    redis_cache.delete('crawler_request:{}'.format(uuid))

    crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, original_item)
+    crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)