From 83e11082b580e5409f97d42fed4297fa349a0fd1 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Mon, 7 Oct 2024 11:03:56 +0200
Subject: [PATCH] fix: [crawler] filter lookup parent + domain daterange

---
 bin/crawlers/Crawler.py            |  7 ++++++-
 bin/lib/ail_core.py                |  2 +-
 bin/lib/ail_queues.py              |  3 +++
 bin/lib/crawlers.py                |  7 ++++++-
 bin/lib/objects/abstract_object.py | 10 ++++++++++
 5 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 48da4a08..fb626c2e 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -78,6 +78,7 @@ class Crawler(AbstractModule):
         self.items_dir = None
         self.original_domain = None
         self.domain = None
+        self.parent = None
 
         # TODO Replace with warning list ???
         self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a'  # blank
@@ -243,6 +244,7 @@ class Crawler(AbstractModule):
             return None
 
         self.domain = Domain(domain)
+        self.parent = self.domain.get_parent()
         self.original_domain = Domain(domain)
 
         epoch = int(time.time())
@@ -263,7 +265,9 @@ class Crawler(AbstractModule):
            # Save Capture
            self.save_capture_response(parent_id, entries)
 
-           self.domain.update_daterange(self.date.replace('/', ''))
+           if self.parent != 'lookup':
+               # Update domain first/last seen
+               self.domain.update_daterange(self.date.replace('/', ''))
            # Origin + History + tags
            if self.root_item:
                self.domain.set_last_origin(parent_id)
@@ -271,6 +275,7 @@ class Crawler(AbstractModule):
                # Tags
                for tag in task.get_tags():
                    self.domain.add_tag(tag)
+           # Crawler stats
            self.domain.add_history(epoch, root_item=self.root_item)
 
            if self.domain != self.original_domain:
diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py
index 328c484f..532a89f1 100755
--- a/bin/lib/ail_core.py
+++ b/bin/lib/ail_core.py
@@ -86,7 +86,7 @@ def get_default_correlation_objects():
     return AIL_OBJECTS_CORRELATIONS_DEFAULT
 
 def get_obj_queued():
-    return ['item', 'image', 'message', 'ocr', 'qrcode']
+    return ['item', 'image', 'message', 'ocr', 'qrcode']  # screenshot ???
 
 def get_objects_tracked():
     return ['decoded', 'item', 'pgp', 'message', 'ocr', 'qrcode', 'title']
diff --git a/bin/lib/ail_queues.py b/bin/lib/ail_queues.py
index e91f42b9..8c706445 100755
--- a/bin/lib/ail_queues.py
+++ b/bin/lib/ail_queues.py
@@ -195,6 +195,9 @@ def get_processed_end_objs():
 def get_processed_end_obj():
     return r_obj_process.spop(f'objs:processed')
 
+def is_obj_in_process(obj_gid):
+    return r_obj_process.sismember(f'objs:process', obj_gid)
+
 def get_processed_objs_by_type(obj_type):
     return r_obj_process.zrange(f'objs:process:{obj_type}', 0, -1)
 
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index f65309b8..357b6423 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -67,7 +67,7 @@ faup = Faup()
 
 # is safe ???
 # TODO FILTER URL ???
-def api_get_onion_lookup(domain):
+def api_get_onion_lookup(domain):  # TODO check if object process done ???
     domain = domain.lower()
     url_unpack = unpack_url(domain)
     domain = url_unpack['domain']
@@ -78,6 +78,11 @@ def api_get_onion_lookup(domain):
         if is_crawler_activated():
             create_task(domain, parent='lookup', priority=0, har=D_HAR, screenshot=D_SCREENSHOT)
         return {'error': 'domain not found', 'domain': domain}, 404
+    if not dom.was_up():
+        return {'error': 'domain not found', 'domain': domain}, 404
+    # else
+    ## TODO check if object process done -> return result if more than one history
+    # #-> check item history
     meta = dom.get_meta(options={'languages'})
     meta['first_seen'] = meta['first_seen'].replace('/', '-')
     meta['last_seen'] = meta['last_check'].replace('/', '-')
diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py
index 7156464a..4b230f11 100755
--- a/bin/lib/objects/abstract_object.py
+++ b/bin/lib/objects/abstract_object.py
@@ -19,6 +19,7 @@ sys.path.append(os.environ['AIL_BIN'])
 # Import Project packages
 ##################################
 from lib import ail_logger
+from lib.ail_queues import is_obj_in_process
 from lib import Tag
 from lib.ConfigLoader import ConfigLoader
 from lib import Duplicate
@@ -92,6 +93,15 @@ class AbstractObject(ABC):
         else:
             return r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field, value)
 
+    ## Queues ##
+
+    # is_in_queue , is_in_module
+
+    def is_being_processed(self):
+        return is_obj_in_process(self.get_global_id())
+
+    # -Queues- #
+
     ## Tags ##
     def get_tags(self, r_list=False):
         tags = Tag.get_object_tags(self.type, self.id, self.get_subtype(r_str=True))
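
A usage sketch of the new queue check, assuming Domain is importable as in
Crawler.py and that is_being_processed() is the intended answer to the
"check if object process done" TODO in api_get_onion_lookup(); the wrapper
name and the 202 response are illustrative assumptions, not part of the patch:

    from lib.objects.Domains import Domain

    def onion_lookup_meta(domain):
        # Hypothetical wrapper: refuse to serve metadata while the crawler
        # still holds the object in the objs:process set.
        dom = Domain(domain)
        if dom.is_being_processed():
            # is_being_processed() wraps ail_queues.is_obj_in_process(),
            # i.e. a SISMEMBER on objs:process with the object's global id.
            return {'error': 'domain is being processed', 'domain': domain}, 202
        # Processing finished: metadata (first/last seen, history) is stable.
        meta = dom.get_meta(options={'languages'})
        return meta, 200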