fix: [crawler] user agent + splash restart

This commit is contained in:
Terrtia 2021-03-26 11:30:06 +01:00
parent 5a93b86524
commit 8754350d39
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 9 additions and 3 deletions

View file

@ -208,6 +208,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
if nb_retry == 2:
    # When the retry counter reaches 2, ask for the Splash docker container
    # to be restarted, then pause 20 seconds before carrying on
    # (presumably to give the container time to come back up).
    crawlers.restart_splash_docker(splash_url, splash_name)
    time.sleep(20)
if nb_retry == 6:
on_error_send_message_back_in_queue(type_service, domain, message)

View file

@ -48,6 +48,9 @@ function main(splash, args)
splash.html5_media_enabled = true
splash.http2_enabled = true
-- User Agent
splash:set_user_agent(args.user_agent)
-- User defined
splash.resource_timeout = args.resource_timeout
splash.timeout = args.timeout
@ -71,7 +74,7 @@ function main(splash, args)
splash:wait{args.wait}
-- Page instrumentation
-- splash.scroll_position = {y=1000}
splash:wait{args.wait}
-- splash:wait{args.wait}
-- Response
return {
har = splash:har(),
@ -88,7 +91,7 @@ class TorSplashCrawler():
def __init__(self, splash_url, crawler_options):
self.process = CrawlerProcess({'LOG_ENABLED': True})
self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': crawler_options['user_agent'],
'USER_AGENT': crawler_options['user_agent'], # /!\ overwritten by lua script
'SPLASH_URL': splash_url,
'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@ -126,6 +129,7 @@ class TorSplashCrawler():
self.date_month = date['date_month']
self.date_epoch = int(date['epoch'])
self.user_agent = crawler_options['user_agent']
self.png = crawler_options['png']
self.har = crawler_options['har']
self.cookies = cookies
@ -150,6 +154,7 @@ class TorSplashCrawler():
return {'wait': 10,
'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
'timeout': 30,
'user_agent': self.user_agent,
'cookies': cookies,
'lua_source': script_cookie
}

View file

@ -271,7 +271,7 @@ crawler_depth_limit = 1
default_crawler_har = True
default_crawler_png = True
default_crawler_closespider_pagecount = 50
default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0
default_crawler_user_agent = Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0
splash_url = http://127.0.0.1
splash_port = 8050-8052
domain_proxy = onion.foundation