From a20b6054e8f74ccb8fd11f631c27f6e464eeba60 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Tue, 17 Sep 2024 15:36:15 +0200
Subject: [PATCH] fix: [crawler] fix crawler queue stats

---
 bin/crawlers/Crawler.py |  2 ++
 bin/lib/crawlers.py     | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 3535ed29..66a55ad9 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -61,6 +61,8 @@ class Crawler(AbstractModule):
         crawlers.load_blacklist()
         # update captures cache
         crawlers.reload_crawler_captures()
+        # update crawler queue stats
+        crawlers.reload_crawlers_stats()
 
         self.crawler_scheduler = crawlers.CrawlerScheduler()
 
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 8925a3b9..3a3b8bba 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -1018,6 +1018,16 @@ def get_crawlers_stats(domain_type=None):
         stats[domain_type] = {'queue': queue, 'up': up, 'down': down, 'crawled': crawled}
     return stats
 
+def reload_crawlers_stats():
+    for domain_type in get_crawler_all_types():
+        to_remove = []
+        for task_uuid in r_crawler.smembers(f'crawler:queue:type:{domain_type}'):
+            task = CrawlerTask(task_uuid)
+            if not task.exists():
+                to_remove.append(task_uuid)
+        for task_uuid in to_remove:
+            r_crawler.srem(f'crawler:queue:type:{domain_type}', task_uuid)
+
 #### Blocklist ####
 
 def get_blacklist():