fix: [crawler] user agent + splash restart

This commit is contained in:
Terrtia 2021-03-26 11:30:06 +01:00
parent 5a93b86524
commit 8754350d39
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 9 additions and 3 deletions

View file

@ -208,6 +208,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
if nb_retry == 2: if nb_retry == 2:
crawlers.restart_splash_docker(splash_url, splash_name) crawlers.restart_splash_docker(splash_url, splash_name)
time.sleep(20)
if nb_retry == 6: if nb_retry == 6:
on_error_send_message_back_in_queue(type_service, domain, message) on_error_send_message_back_in_queue(type_service, domain, message)

View file

@ -48,6 +48,9 @@ function main(splash, args)
splash.html5_media_enabled = true splash.html5_media_enabled = true
splash.http2_enabled = true splash.http2_enabled = true
-- User Agent
splash:set_user_agent(args.user_agent)
-- User defined -- User defined
splash.resource_timeout = args.resource_timeout splash.resource_timeout = args.resource_timeout
splash.timeout = args.timeout splash.timeout = args.timeout
@ -71,7 +74,7 @@ function main(splash, args)
splash:wait{args.wait} splash:wait{args.wait}
-- Page instrumentation -- Page instrumentation
-- splash.scroll_position = {y=1000} -- splash.scroll_position = {y=1000}
splash:wait{args.wait} -- splash:wait{args.wait}
-- Response -- Response
return { return {
har = splash:har(), har = splash:har(),
@ -88,7 +91,7 @@ class TorSplashCrawler():
def __init__(self, splash_url, crawler_options): def __init__(self, splash_url, crawler_options):
self.process = CrawlerProcess({'LOG_ENABLED': True}) self.process = CrawlerProcess({'LOG_ENABLED': True})
self.crawler = Crawler(self.TorSplashSpider, { self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': crawler_options['user_agent'], 'USER_AGENT': crawler_options['user_agent'], # /!\ overwritten by lua script
'SPLASH_URL': splash_url, 'SPLASH_URL': splash_url,
'ROBOTSTXT_OBEY': False, 'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@ -126,6 +129,7 @@ class TorSplashCrawler():
self.date_month = date['date_month'] self.date_month = date['date_month']
self.date_epoch = int(date['epoch']) self.date_epoch = int(date['epoch'])
self.user_agent = crawler_options['user_agent']
self.png = crawler_options['png'] self.png = crawler_options['png']
self.har = crawler_options['har'] self.har = crawler_options['har']
self.cookies = cookies self.cookies = cookies
@ -150,6 +154,7 @@ class TorSplashCrawler():
return {'wait': 10, return {'wait': 10,
'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\ 'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
'timeout': 30, 'timeout': 30,
'user_agent': self.user_agent,
'cookies': cookies, 'cookies': cookies,
'lua_source': script_cookie 'lua_source': script_cookie
} }

View file

@ -271,7 +271,7 @@ crawler_depth_limit = 1
default_crawler_har = True default_crawler_har = True
default_crawler_png = True default_crawler_png = True
default_crawler_closespider_pagecount = 50 default_crawler_closespider_pagecount = 50
default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0 default_crawler_user_agent = Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0
splash_url = http://127.0.0.1 splash_url = http://127.0.0.1
splash_port = 8050-8052 splash_port = 8050-8052
domain_proxy = onion.foundation domain_proxy = onion.foundation