fix: [crawler] fix crawler queue stats

This commit is contained in:
terrtia 2024-09-17 16:52:36 +02:00
parent a20b6054e8
commit 759d241b75
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0

View file

@@ -1020,13 +1020,11 @@ def get_crawlers_stats(domain_type=None):
def reload_crawlers_stats():
    """Delete stale tasks referenced by the per-type crawler queue sets.

    For every domain type returned by ``get_crawler_all_types()``, read the
    members of ``crawler:queue:type:<domain_type>`` and call ``delete()`` on
    each task that is not in the crawler queue and has no status.
    """
    for domain_type in get_crawler_all_types():
        tasks = r_crawler.smembers(f'crawler:queue:type:{domain_type}')
        for task_uuid in tasks:
            task = CrawlerTask(task_uuid)
            # A task that is neither queued nor carrying a status is treated
            # as a leftover entry.
            # NOTE(review): delete() is presumably what removes the UUID from
            # the per-type set -- confirm against CrawlerTask.delete().
            if not task.is_in_queue() and task.get_status() is None:
                task.delete()
#### Blocklist #### #### Blocklist ####
@@ -1533,6 +1531,12 @@ class CrawlerTask:
def exists(self):
    """Return the result of the Redis EXISTS check on ``crawler:task:<uuid>``."""
    return r_crawler.exists(f'crawler:task:{self.uuid}')
def is_in_queue(self):
    """Return True if this task's UUID has a score in the ``crawler:queue`` sorted set."""
    # zscore() yields None for a member that is absent from the sorted set.
    return r_crawler.zscore('crawler:queue', self.uuid) is not None
def get_url(self):
    """Return the ``url`` field stored in this task's ``crawler:task:<uuid>`` hash."""
    return r_crawler.hget(f'crawler:task:{self.uuid}', 'url')