fix: [crawler] add timeout to Unknown captures

This commit is contained in:
Terrtia 2023-07-10 11:23:44 +02:00
parent 8f0e7f1434
commit c719990125
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 34 additions and 25 deletions

View file

@ -122,11 +122,19 @@ class Crawler(AbstractModule):
if capture:
try:
status = self.lacus.get_capture_status(capture.uuid)
if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time ### print start time
if status == crawlers.CaptureStatus.DONE:
return capture
elif status == crawlers.CaptureStatus.UNKNOWN:
capture_start = capture.get_start_time(r_str=False)
if int(time.time()) - capture_start > 600: # TODO ADD in new crawler config
task = capture.get_task()
task.reset()
capture.delete()
else:
capture.update(status)
else:
capture.update(status)
print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
else:
return capture
except ConnectionError:
print(capture.uuid)

View file

@ -134,7 +134,7 @@ def unpack_url(url):
# # # # # # # # TODO CREATE NEW OBJECT
def get_favicon_from_html(html, domain, url):
favicon_urls = extract_favicon_from_html(html, url)
favicon_urls, favicons = extract_favicon_from_html(html, url)
# add root favicon
if not favicon_urls:
favicon_urls.add(f'{urlparse(url).scheme}://{domain}/favicon.ico')
@ -162,7 +162,6 @@ def extract_favicon_from_html(html, url):
# - <meta name="msapplication-TileColor" content="#aaaaaa"> <meta name="theme-color" content="#ffffff">
# - <meta name="msapplication-config" content="/icons/browserconfig.xml">
# Root Favicon
f = get_faup()
f.decode(url)
@ -244,13 +243,6 @@ def extract_description_from_html(html):
return description['content']
return ''
def extract_description_from_html(html):
soup = BeautifulSoup(html, 'html.parser')
description = soup.find('meta', attrs={'name': 'description'})
if description:
return description['content']
return ''
def extract_keywords_from_html(html):
soup = BeautifulSoup(html, 'html.parser')
keywords = soup.find('meta', attrs={'name': 'keywords'})
@ -686,8 +678,7 @@ class Cookie:
meta[field] = value
if r_json:
data = json.dumps(meta, indent=4, sort_keys=True)
meta = {'data': data}
meta['uuid'] = self.uuid
meta = {'data': data, 'uuid': self.uuid}
return meta
def edit(self, cookie_dict):
@ -1249,8 +1240,13 @@ class CrawlerCapture:
if task_uuid:
return CrawlerTask(task_uuid)
def get_start_time(self):
return self.get_task().get_start_time()
def get_start_time(self, r_str=True):
    """Return the start time reported by this capture's task.

    :param r_str: when truthy (default), return the value exactly as the
        task provides it; otherwise parse it with the
        "%Y/%m/%d - %H:%M.%S" format and return an integer timestamp.
    :return: the raw task value, or an ``int`` timestamp when ``r_str`` is falsy.
    """
    raw_start = self.get_task().get_start_time()
    if not r_str:
        # Same format string CrawlerTask.start() uses when writing the field.
        parsed = datetime.strptime(raw_start, "%Y/%m/%d - %H:%M.%S")
        return int(parsed.timestamp())
    return raw_start
def get_status(self):
status = r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
@ -1517,6 +1513,11 @@ class CrawlerTask:
def start(self):
    """Store the current local time in this task's ``start_time`` field."""
    # Formatted as "%Y/%m/%d - %H:%M.%S" (note the '.' between minutes and seconds).
    started_at = datetime.now().strftime("%Y/%m/%d - %H:%M.%S")
    self._set_field('start_time', started_at)
def reset(self):
    """Delete the task's ``start_time`` field and pass the task to
    ``add_to_db_crawler_queue`` with priority 49."""
    task_key = f'crawler:task:{self.uuid}'
    r_crawler.hdel(task_key, 'start_time')
    # Fixed priority used for every reset task.
    self.add_to_db_crawler_queue(49)
# Crawler
def remove(self): # zrem cache + DB
capture_uuid = self.get_capture()
@ -1727,13 +1728,13 @@ class CrawlerProxy:
self.uuid = proxy_uuid
def get_description(self):
return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'description')
return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'description')
# Host
# Port
# Type -> need test
def get_url(self):
return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'url')
return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
#### CRAWLER LACUS ####