From abfad615817c821ab7098d8f689937c56dd269f8 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 14 Sep 2020 17:03:36 +0200
Subject: [PATCH] fix: [crawler] fix ResponseNeverReceived handler, check if splash restarted

---
 bin/lib/crawlers.py                | 21 ++++++++++++++++
 bin/torcrawler/TorSplashCrawler.py | 39 +++++++++++++++++-------------
 bin/torcrawler/tor_crawler.py      |  2 +-
 3 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 284fd637..8a6817f5 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -478,6 +478,19 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit
     create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
     return None
+
+#### ####
+
+#### SPLASH API ####
+def is_splash_reachable(splash_url, timeout=1.0):
+    try:
+        r = requests.get(splash_url, timeout=timeout)
+    except Exception:
+        return False
+    if r.status_code == 200:
+        return True
+    else:
+        return False
 
 #### ####
 
 def is_redirection(domain, last_url):
@@ -545,6 +558,14 @@ def save_har(har_dir, item_id, har_content):
     with open(filename, 'w') as f:
         f.write(json.dumps(har_content))
 
+def api_add_crawled_item(dict_crawled):
+
+    domain = None
+    # create item_id        item_id =
+
+    save_crawled_item(item_id, response.data['html'])
+    create_item_metadata(item_id, domain, 'last_url', port, 'father')
+
 
 #### SPLASH MANAGER ####
 def get_splash_manager_url(reload=False): # TODO: add config reload
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 41f45acb..0ac198cf 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -105,14 +105,15 @@ class TorSplashCrawler():
             'SPLASH_COOKIES_DEBUG': False
             })
 
-    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
+    def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
+        self.process.crawl(self.crawler, splash_url=splash_url, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
         self.process.start()
 
     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'
 
-        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+        def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+            self.splash_url = splash_url
             self.domain_type = type
             self.requested_mode = requested_mode
             self.original_item = original_item
@@ -245,30 +246,34 @@ class TorSplashCrawler():
             self.logger.error(repr(failure))
 
             if failure.check(ResponseNeverReceived):
-                request = failure.request
-                url= request.meta['current_url']
-                father = request.meta['father']
+                ## DEBUG ##
+                self.logger.error(failure.request)
+                if failure.value.response:
+                    self.logger.error(failure.value.response)
+                ## ----- ##
+
+                # Extract request metadata
+                url = failure.request.meta['current_url']
+                father = failure.request.meta['father']
+                l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies'])
+
+                # Check if Splash restarted
+                if not crawlers.is_splash_reachable(self.splash_url):
+                    self.logger.error('Splash, ResponseNeverReceived for %s, retry in 20s ...', url)
+                    time.sleep(20)
 
-                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
-                time.sleep(10)
-                if response:
-                    response_root_key = response.meta['root_key']
-                else:
-                    response_root_key = None
                 yield SplashRequest(
                     url,
                     self.parse,
                     errback=self.errback_catcher,
                     endpoint='execute',
-                    cache_args=['lua_source'],
                     meta={'father': father, 'current_url': url},
-                    args=self.build_request_arg(response.cookiejar)
+                    args=l_cookies
                 )
 
             else:
-                print('failure')
-                #print(failure)
-                print(failure.type)
+                self.logger.error(failure.type)
+                self.logger.error(failure.getErrorMessage())
 
         def save_crawled_item(self, item_id, item_content):
             gzip64encoded = crawlers.save_crawled_item(item_id, item_content)
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index f060482b..954eae0f 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -46,4 +46,4 @@ if __name__ == '__main__':
     redis_cache.delete('crawler_request:{}'.format(uuid))
 
     crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+    crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
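
Note (editorial, not part of the patch): the retry path above assumes is_splash_reachable() gives a fast yes/no answer while the Splash container restarts. The helper can be exercised on its own with the standalone sketch below; wait_for_splash(), SPLASH_URL, retry_delay and max_attempts are illustrative names introduced here, not part of the AIL codebase.

#!/usr/bin/env python3
# Standalone sketch: polls a Splash instance with the same check the patch
# adds as crawlers.is_splash_reachable(), then reports when it comes back.
import time
import requests

def is_splash_reachable(splash_url, timeout=1.0):
    # Mirror of the helper added to bin/lib/crawlers.py: a plain GET against
    # the Splash root, where anything but HTTP 200 counts as unreachable.
    try:
        r = requests.get(splash_url, timeout=timeout)
    except Exception:
        return False
    return r.status_code == 200

def wait_for_splash(splash_url, retry_delay=20, max_attempts=5):
    # Hypothetical wrapper: the errback in the patch sleeps once and re-yields
    # the request; this sketch loops with a bounded number of attempts instead.
    for attempt in range(1, max_attempts + 1):
        if is_splash_reachable(splash_url):
            return True
        print('Splash unreachable (attempt %s/%s), retry in %ss ...' % (attempt, max_attempts, retry_delay))
        time.sleep(retry_delay)
    return False

if __name__ == '__main__':
    SPLASH_URL = 'http://127.0.0.1:8050'  # assumed local Splash instance
    print('Splash reachable:', wait_for_splash(SPLASH_URL, retry_delay=2, max_attempts=3))

Checking reachability before re-yielding the SplashRequest gives a restarting container a grace period instead of an immediate retry that would only raise another ResponseNeverReceived.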