Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-26 15:57:16 +00:00)
fix: [crawler] fix ResponseNeverReceived handler, check if splash restarted
This commit is contained in:
parent b72cd978fc
commit abfad61581

3 changed files with 44 additions and 18 deletions
@@ -478,6 +478,19 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit
     create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
                         auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
     return None
 
 #### ####
 
+#### SPLASH API ####
+def is_splash_reachable(splash_url, timeout=1.0):
+    try:
+        r = requests.get(splash_url, timeout=timeout)
+    except Exception:
+        return False
+    if r.status_code == 200:
+        return True
+    else:
+        return False
+#### ####
 
 def is_redirection(domain, last_url):
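The new health check is a plain HTTP GET against the Splash root endpoint: any connection error or non-200 status counts as unreachable. A minimal standalone sketch of the same logic (the local URL below is an assumed default, not taken from the commit):

import requests

def is_splash_reachable(splash_url, timeout=1.0):
    # Splash answers 200 on its root URL when it is up; a refused
    # connection, a timeout, or any other status counts as unreachable.
    try:
        r = requests.get(splash_url, timeout=timeout)
    except requests.RequestException:
        return False
    return r.status_code == 200

if __name__ == '__main__':
    # Assumed local Splash instance on its default port 8050.
    print(is_splash_reachable('http://127.0.0.1:8050'))

Catching requests.RequestException rather than a bare Exception narrows the handler to network-level failures, which is all this check cares about.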
@@ -545,6 +558,14 @@ def save_har(har_dir, item_id, har_content):
     with open(filename, 'w') as f:
         f.write(json.dumps(har_content))
 
+def api_add_crawled_item(dict_crawled):
+
+    domain = None
+    # create item_id item_id =
+
+    save_crawled_item(item_id, response.data['html'])
+    create_item_metadata(item_id, domain, 'last_url', port, 'father')
+
 
 #### SPLASH MANAGER ####
 def get_splash_manager_url(reload=False): # TODO: add config reload
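Note that api_add_crawled_item is only a stub in this commit: item_id, response, and port are never defined, so calling it as-is would raise NameError. A hypothetical completion, assuming the relevant fields arrive in dict_crawled (the key names below are guesses, not part of the commit):

def api_add_crawled_item(dict_crawled):
    # Hypothetical: derive everything from the request dictionary
    # instead of the undefined response/port/item_id names in the stub.
    domain = dict_crawled.get('domain')
    item_id = dict_crawled.get('item_id')
    save_crawled_item(item_id, dict_crawled.get('html'))
    create_item_metadata(item_id, domain, dict_crawled.get('last_url'),
                         dict_crawled.get('port'), dict_crawled.get('father'))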
@@ -105,14 +105,15 @@ class TorSplashCrawler():
             'SPLASH_COOKIES_DEBUG': False
         })
 
-    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
+    def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
+        self.process.crawl(self.crawler, splash_url=splash_url, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
         self.process.start()
 
     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'
 
-        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+        def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+            self.splash_url = splash_url
             self.domain_type = type
             self.requested_mode = requested_mode
             self.original_item = original_item
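The signature change works because Scrapy forwards keyword arguments given to CrawlerProcess.crawl() into the spider's constructor; that is what makes splash_url available as self.splash_url inside the errback. A self-contained illustration of that forwarding (the spider, URL, and splash_url value are placeholders, not from the commit):

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class DemoSpider(Spider):
    name = 'demo'
    start_urls = ['https://example.com']

    def __init__(self, splash_url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Keyword arguments passed to process.crawl() land here.
        self.splash_url = splash_url

    def parse(self, response):
        self.logger.info('got %s (splash at %s)', response.url, self.splash_url)

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(DemoSpider, splash_url='http://127.0.0.1:8050')
    process.start()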
@@ -245,30 +246,34 @@ class TorSplashCrawler():
             self.logger.error(repr(failure))
 
             if failure.check(ResponseNeverReceived):
-                request = failure.request
-                url= request.meta['current_url']
-                father = request.meta['father']
                 ## DEBUG ##
                 self.logger.error(failure.request)
                 if failure.value.response:
                     self.logger.error(failure.value.response)
                 ## ----- ##
 
+                # Extract request metadata
+                url = failure.request.meta['current_url']
+                father = failure.request.meta['father']
+                l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies'])
+
+                # Check if Splash restarted
+                if not crawlers.is_splash_reachable(self.splash_url):
+                    self.logger.error('Splash, ResponseNeverReceived for %s, retry in 20s ...', url)
+                    time.sleep(10)
+
-                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
-                time.sleep(10)
-                if response:
-                    response_root_key = response.meta['root_key']
-                else:
-                    response_root_key = None
                 yield SplashRequest(
                     url,
                     self.parse,
                     errback=self.errback_catcher,
                     endpoint='execute',
                     cache_args=['lua_source'],
                     meta={'father': father, 'current_url': url},
-                    args=self.build_request_arg(response.cookiejar)
+                    args=l_cookies
                 )
 
             else:
                 print('failure')
                 #print(failure)
                 print(failure.type)
                 self.logger.error(failure.type)
                 self.logger.error(failure.getErrorMessage())
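Two details in this hunk are worth flagging. First, the added branch logs "retry in 20s" but sleeps only 10 seconds, so the message and the delay disagree. Second, the retry is a single fixed sleep: if Splash takes longer than that to restart, the re-yielded request fails again. A more patient variant would poll the health check before retrying; a sketch along those lines (the helper name and limits are illustrative, not from the commit):

import time

def wait_until_reachable(is_reachable, delay=10, max_attempts=6):
    # Poll a health-check callable (e.g. lambda: is_splash_reachable(url))
    # instead of sleeping once; gives up after delay * max_attempts seconds.
    for _ in range(max_attempts):
        if is_reachable():
            return True
        time.sleep(delay)
    return False

In errback_catcher this could replace the single time.sleep(10) with wait_until_reachable(lambda: crawlers.is_splash_reachable(self.splash_url)), only re-yielding the SplashRequest once it returns True.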
@@ -46,4 +46,4 @@ if __name__ == '__main__':
     redis_cache.delete('crawler_request:{}'.format(uuid))
 
     crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+    crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)