ail-framework/bin/torcrawler/TorSplashCrawler.py


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import gzip
import base64
import uuid
import datetime
import redis
import json
import time
from hashlib import sha256
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from twisted.web._newclient import ResponseNeverReceived
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy_splash import SplashRequest, SplashJsonResponse
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import ConfigLoader
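
# Legacy Lua script, kept commented out for reference; superseded by the
# active script_cookie version below.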
# script_lua_cookie = """
# function main(splash, args)
#
#     -- default config
#     -- load flash plugin
#     splash.plugins_enabled = true
#     splash.html5_media_enabled = true
#
#     -- to check
#     splash.request_body_enabled = true
#     splash.response_body_enabled = true
#
#     -- handle cookies
#     splash:init_cookies(args.cookies)
#
#     assert(splash:go{
#         args.url,
#         headers=args.headers,
#         http_method=args.http_method,
#         body=args.body
#     })
#
#     splash:wait(10)
#
#     -- Response
#     return {
#         url = splash:url(),
#         html = splash:html(),
#         har = splash:har(),
#         cookies = splash:get_cookies(),
#         png = splash:png(render_all=true)
#     }
# end
# """
script_cookie = """
function main(splash, args)
    -- Default values
    splash.js_enabled = true
    splash.private_mode_enabled = true
    splash.images_enabled = true
    splash.webgl_enabled = true
    splash.media_source_enabled = true

    -- Force enable things
    splash.plugins_enabled = true
    splash.request_body_enabled = true
    splash.response_body_enabled = true

    -- Would be nice
    splash.indexeddb_enabled = true
    splash.html5_media_enabled = true
    splash.http2_enabled = true

    -- User defined
    splash.resource_timeout = args.resource_timeout
    splash.timeout = args.timeout

    -- Allow to pass cookies
    splash:init_cookies(args.cookies)

    -- Run
    ok, reason = splash:go{args.url}
    if not ok then
        return {error = reason}
    end
    splash:wait{args.wait}

    -- Page instrumentation
    -- splash.scroll_position = {y=1000}
    splash:wait{args.wait}

    -- Response
    return {
        har = splash:har(),
        html = splash:html(),
        png = splash:png{render_all=true},
        cookies = splash:get_cookies()
    }
end
"""
class TorSplashCrawler():

    def __init__(self, splash_url, crawler_options):
        self.process = CrawlerProcess({'LOG_ENABLED': True})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 0,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
            'DEPTH_LIMIT': crawler_options['depth_limit'],
            'SPLASH_COOKIES_DEBUG': True
        })

    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
        self.process.start()
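
    # Nested spider run by the CrawlerProcess above: it renders each page
    # through the Splash 'execute' endpoint and recursively follows the links
    # found inside the requested domain.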
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            self.date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])
            self.png = True
            self.har = True
            self.cookies = cookies

        def build_request_arg(self, cookies):
            return {'wait': 10,
                    'resource_timeout': 10,
                    'timeout': 30,
                    'cookies': cookies,
                    'lua_source': script_cookie
                    }

        def start_requests(self):
            l_cookies = self.build_request_arg(self.cookies)
            yield SplashRequest(
                self.start_urls,
                self.parse,
                errback=self.errback_catcher,
                endpoint='execute',
                args=l_cookies
            )
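
        # Each rendered page arrives as a SplashJsonResponse: response.data is
        # the table returned by the Lua script (har, html, png, cookies).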
        def parse(self, response):
            if response.status == 504:
                # Splash (or the Tor proxy behind it) timed out
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                error_log = json.loads(response.body.decode())
                if error_log['info']['text'] == 'Connection to proxy refused':
                    print('Connection to proxy refused')
            else:
                # Lua script errors are reported in the returned table
                if 'error' in response.data:
                    print(response.data['error'])

                # reuse the cookies set by the rendered page in child requests
                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        errback=self.errback_catcher,
                        endpoint='execute',
                        args=l_cookies
                    )
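
        # Errors raised before any response is received (e.g. Splash dropped
        # the connection) are routed here instead of parse().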
        def errback_catcher(self, failure):
            # catch all errback failures
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                yield SplashRequest(
                    url,
                    self.parse,
                    errback=self.errback_catcher,
                    endpoint='execute',
                    cache_args=['lua_source'],
                    # no response ever arrived, so retry with the spider's
                    # initial cookies rather than a (non-existent) response jar
                    args=self.build_request_arg(self.cookies)
                )
            else:
                print('failure')
                print(failure.type)
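
# Minimal usage sketch (illustrative values only; the real entry point builds
# these options and dates from the AIL configuration):
#
#   crawler_options = {'user_agent': 'Mozilla/5.0 (Windows NT 10.0; rv:68.0)',
#                      'closespider_pagecount': 50,
#                      'depth_limit': 1}
#   date = {'date_day': '20200101', 'date_month': '202001', 'epoch': 1577836800}
#   crawler = TorSplashCrawler('http://127.0.0.1:8050', crawler_options)
#   crawler.crawl('onion', crawler_options, date, 'test',
#                 'http://example.onion', 'example.onion', 80,
#                 [], 'item_father_id')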