From fe2769308b41600bceb5fd17e861aa0b98dfc57d Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 18 Jul 2023 14:30:00 +0200 Subject: [PATCH] fix: [crawler] fix down domain + domain redirection history --- bin/crawlers/Crawler.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py index ab972c68..7f2c3df9 100755 --- a/bin/crawlers/Crawler.py +++ b/bin/crawlers/Crawler.py @@ -60,6 +60,7 @@ class Crawler(AbstractModule): self.root_item = None self.date = None self.items_dir = None + self.original_domain = None self.domain = None # TODO Replace with warning list ??? @@ -190,6 +191,7 @@ class Crawler(AbstractModule): print(domain) self.domain = Domain(domain) + self.original_domain = Domain(domain) epoch = int(time.time()) parent_id = task.get_parent() @@ -212,12 +214,20 @@ class Crawler(AbstractModule): # Origin + History + tags if self.root_item: self.domain.set_last_origin(parent_id) - self.domain.add_history(epoch, root_item=self.root_item) # Tags for tag in task.get_tags(): self.domain.add_tag(tag) - elif self.domain.was_up(): - self.domain.add_history(epoch, root_item=epoch) + self.domain.add_history(epoch, root_item=self.root_item) + + if self.domain != self.original_domain: + self.original_domain.update_daterange(self.date.replace('/', '')) + if self.root_item: + self.original_domain.set_last_origin(parent_id) + # Tags + for tag in task.get_tags(): + self.domain.add_tag(tag) + self.original_domain.add_history(epoch, root_item=self.root_item) + crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch) crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch) print('capture:', capture.uuid, 'completed')