Mirror of https://github.com/ail-project/ail-framework.git
fix: [crawler] fix ResponseNeverReceived handler, check if splash restarted
This commit is contained in:
parent b72cd978fc
commit abfad61581

3 changed files with 44 additions and 18 deletions
bin/lib/crawlers.py

@@ -478,6 +478,19 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit
     create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
                         auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
     return None
+
+#### ####
+
+#### SPLASH API ####
+def is_splash_reachable(splash_url, timeout=1.0):
+    try:
+        r = requests.get(splash_url , timeout=timeout)
+    except Exception:
+        return False
+    if r.status_code == 200:
+        return True
+    else:
+        return False
 #### ####
 
 def is_redirection(domain, last_url):
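The new is_splash_reachable() helper is a plain HTTP liveness probe: a Splash container answers a GET on its root URL with a 200 status page, so any connection error, timeout, or non-200 reply is read as "down or still restarting". A minimal sketch of how a caller can use such a probe to wait out a Splash restart; the wait_for_splash() helper and the URL are illustrative, not part of this commit:

import time
import requests

def is_splash_reachable(splash_url, timeout=1.0):
    # Same probe as the committed helper: any transport error or
    # non-200 reply means the container is down or restarting.
    try:
        r = requests.get(splash_url, timeout=timeout)
    except requests.exceptions.RequestException:
        return False
    return r.status_code == 200

def wait_for_splash(splash_url, delay=10):
    # Illustrative helper (not in the commit): block until Splash answers.
    while not is_splash_reachable(splash_url):
        print('Splash unreachable, retrying in {}s ...'.format(delay))
        time.sleep(delay)

wait_for_splash('http://127.0.0.1:8050')  # default Splash port, illustrative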
@@ -545,6 +558,14 @@ def save_har(har_dir, item_id, har_content):
     with open(filename, 'w') as f:
         f.write(json.dumps(har_content))
 
+def api_add_crawled_item(dict_crawled):
+
+    domain = None
+    # create item_id item_id =
+
+    save_crawled_item(item_id, response.data['html'])
+    create_item_metadata(item_id, domain, 'last_url', port, 'father')
+
 
 #### SPLASH MANAGER ####
 def get_splash_manager_url(reload=False): # TODO: add config reload
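Note that api_add_crawled_item() lands here as a stub: item_id, response, and port are never assigned in its scope, so calling it as committed would raise NameError. The sketch below shows one way the dict-based signature could be unpacked to feed the two existing helpers; it is a hypothetical completion, and every key name in it is invented for illustration, not taken from this commit:

def api_add_crawled_item(dict_crawled):
    # Hypothetical completion of the stub; all dict keys are assumptions.
    domain = dict_crawled.get('domain')
    item_id = dict_crawled.get('item_id')
    html = dict_crawled.get('html')
    if not (domain and item_id and html):
        return {'error': 'domain, item_id and html are required'}, 400
    save_crawled_item(item_id, html)
    create_item_metadata(item_id, domain, dict_crawled.get('last_url'),
                         dict_crawled.get('port'), dict_crawled.get('father'))
    return None, 200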
bin/torcrawler/TorSplashCrawler.py

@@ -105,14 +105,15 @@ class TorSplashCrawler():
             'SPLASH_COOKIES_DEBUG': False
         })
 
-    def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
+    def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
+        self.process.crawl(self.crawler, splash_url=splash_url, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
         self.process.start()
 
     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'
 
-        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+        def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+            self.splash_url = splash_url
             self.domain_type = type
             self.requested_mode = requested_mode
             self.original_item = original_item
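Threading splash_url through crawl() works because Scrapy forwards the extra keyword arguments of CrawlerProcess.crawl() to the spider's constructor; storing the value on self in __init__ then makes it visible to every callback and errback, which is what the reachability check below relies on. A toy spider demonstrating just that forwarding (spider name and URL are illustrative):

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class ProbeSpider(Spider):
    name = 'probe'

    def __init__(self, splash_url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Stored here, readable later from parse() or an errback,
        # exactly how TorSplashSpider keeps self.splash_url.
        self.splash_url = splash_url

process = CrawlerProcess(settings={'LOG_ENABLED': False})
process.crawl(ProbeSpider, splash_url='http://127.0.0.1:8050')  # illustrative
process.start()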
@@ -245,30 +246,34 @@ class TorSplashCrawler():
             self.logger.error(repr(failure))
 
             if failure.check(ResponseNeverReceived):
-                request = failure.request
-                url= request.meta['current_url']
-                father = request.meta['father']
+                ## DEBUG ##
+                self.logger.error(failure.request)
+                if failure.value.response:
+                    self.logger.error(failure.value.response)
+                ## ----- ##
 
-                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
-                time.sleep(10)
-                if response:
-                    response_root_key = response.meta['root_key']
-                else:
-                    response_root_key = None
+                # Extract request metadata
+                url = failure.request.meta['current_url']
+                father = failure.request.meta['father']
+                l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies'])
+
+                # Check if Splash restarted
+                if not crawlers.is_splash_reachable(self.splash_url):
+                    self.logger.error('Splash, ResponseNeverReceived for %s, retry in 20s ...', url)
+                    time.sleep(10)
+
                 yield SplashRequest(
                     url,
                     self.parse,
                     errback=self.errback_catcher,
                     endpoint='execute',
-                    cache_args=['lua_source'],
                     meta={'father': father, 'current_url': url},
-                    args=self.build_request_arg(response.cookiejar)
+                    args=l_cookies
                 )
 
             else:
-                print('failure')
-                #print(failure)
-                print(failure.type)
+                self.logger.error(failure.type)
+                self.logger.error(failure.getErrorMessage())
 
         def save_crawled_item(self, item_id, item_content):
             gzip64encoded = crawlers.save_crawled_item(item_id, item_content)
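The rewritten errback leans on the Twisted Failure object that Scrapy passes to errbacks: failure.check(ExceptionType) returns the matching exception type (or None), failure.request is the Request that errored (a Scrapy addition to the object), and failure.type plus failure.getErrorMessage() replace the old bare print() calls with proper logging. A self-contained sketch of that Failure API, outside Scrapy:

from twisted.python.failure import Failure

try:
    raise ValueError('splash gone')
except ValueError:
    f = Failure()  # captures the currently-handled exception

assert f.check(ValueError) is ValueError  # matching type, else None
print(f.getErrorMessage())  # -> 'splash gone'
print(f.type)               # -> <class 'ValueError'>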
bin/torcrawler/tor_crawler.py

@@ -46,4 +46,4 @@ if __name__ == '__main__':
     redis_cache.delete('crawler_request:{}'.format(uuid))
 
     crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+    crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)