fix: [crawler] fix ResponseNeverReceived handler, check if splash restarted

This commit is contained in:
Terrtia 2020-09-14 17:03:36 +02:00
parent b72cd978fc
commit abfad61581
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 44 additions and 18 deletions

View file

@ -478,6 +478,19 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit
create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
return None
#### ####
#### SPLASH API ####
def is_splash_reachable(splash_url, timeout=1.0):
    """Probe a Splash instance and report whether it answered with HTTP 200.

    :param splash_url: URL that is fetched with a GET request.
    :param timeout: timeout handed to ``requests.get`` (default 1.0).
    :return: ``True`` only when the GET completed and the status code is
        200; ``False`` for any other status or when sending the request
        raised an exception.
    """
    try:
        reply = requests.get(splash_url, timeout=timeout)
    except Exception:
        # Any failure while sending the probe is treated as "not reachable".
        return False
    return reply.status_code == 200
#### ####
def is_redirection(domain, last_url):
@ -545,6 +558,14 @@ def save_har(har_dir, item_id, har_content):
with open(filename, 'w') as f:
f.write(json.dumps(har_content))
# NOTE(review): unfinished stub. `item_id`, `response` and `port` are never
# assigned inside this function and the `dict_crawled` argument is never read;
# unless those names exist at module level, calling this raises NameError.
# Presumably the values are meant to be extracted from `dict_crawled` -- TODO
# confirm and complete the item_id construction before exposing this API.
def api_add_crawled_item(dict_crawled):
domain = None
# create item_id item_id =
save_crawled_item(item_id, response.data['html'])
create_item_metadata(item_id, domain, 'last_url', port, 'father')
#### SPLASH MANAGER ####
def get_splash_manager_url(reload=False): # TODO: add config reload

View file

@ -105,14 +105,15 @@ class TorSplashCrawler():
'SPLASH_COOKIES_DEBUG': False
})
def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
self.process.crawl(self.crawler, splash_url=splash_url, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
self.splash_url = splash_url
self.domain_type = type
self.requested_mode = requested_mode
self.original_item = original_item
@ -245,30 +246,34 @@ class TorSplashCrawler():
self.logger.error(repr(failure))
if failure.check(ResponseNeverReceived):
request = failure.request
url= request.meta['current_url']
father = request.meta['father']
## DEBUG ##
self.logger.error(failure.request)
if failure.value.response:
self.logger.error(failure.value.response)
## ----- ##
# Extract request metadata
url = failure.request.meta['current_url']
father = failure.request.meta['father']
l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies'])
# Check if Splash restarted
if not crawlers.is_splash_reachable(self.splash_url):
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 20s ...', url)
time.sleep(10)
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10)
if response:
response_root_key = response.meta['root_key']
else:
response_root_key = None
yield SplashRequest(
url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
cache_args=['lua_source'],
meta={'father': father, 'current_url': url},
args=self.build_request_arg(response.cookiejar)
args=l_cookies
)
else:
print('failure')
#print(failure)
print(failure.type)
self.logger.error(failure.type)
self.logger.error(failure.getErrorMessage())
def save_crawled_item(self, item_id, item_content):
gzip64encoded = crawlers.save_crawled_item(item_id, item_content)

View file

@ -46,4 +46,4 @@ if __name__ == '__main__':
redis_cache.delete('crawler_request:{}'.format(uuid))
crawler = TorSplashCrawler(splash_url, crawler_options)
crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)