fix: [crawler] filter lookup parent + domain daterange

This commit is contained in:
terrtia 2024-10-07 11:03:56 +02:00
parent c8b1c67a08
commit 83e11082b5
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
5 changed files with 26 additions and 3 deletions

View file

@ -78,6 +78,7 @@ class Crawler(AbstractModule):
self.items_dir = None self.items_dir = None
self.original_domain = None self.original_domain = None
self.domain = None self.domain = None
self.parent = None
# TODO Replace with warning list ??? # TODO Replace with warning list ???
self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a' # blank self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a' # blank
@ -243,6 +244,7 @@ class Crawler(AbstractModule):
return None return None
self.domain = Domain(domain) self.domain = Domain(domain)
self.parent = self.domain.get_parent()
self.original_domain = Domain(domain) self.original_domain = Domain(domain)
epoch = int(time.time()) epoch = int(time.time())
@ -263,7 +265,9 @@ class Crawler(AbstractModule):
# Save Capture # Save Capture
self.save_capture_response(parent_id, entries) self.save_capture_response(parent_id, entries)
self.domain.update_daterange(self.date.replace('/', '')) if self.parent != 'lookup':
# Update domain first/last seen
self.domain.update_daterange(self.date.replace('/', ''))
# Origin + History + tags # Origin + History + tags
if self.root_item: if self.root_item:
self.domain.set_last_origin(parent_id) self.domain.set_last_origin(parent_id)
@ -271,6 +275,7 @@ class Crawler(AbstractModule):
# Tags # Tags
for tag in task.get_tags(): for tag in task.get_tags():
self.domain.add_tag(tag) self.domain.add_tag(tag)
# Crawler stats
self.domain.add_history(epoch, root_item=self.root_item) self.domain.add_history(epoch, root_item=self.root_item)
if self.domain != self.original_domain: if self.domain != self.original_domain:

View file

@ -86,7 +86,7 @@ def get_default_correlation_objects():
return AIL_OBJECTS_CORRELATIONS_DEFAULT return AIL_OBJECTS_CORRELATIONS_DEFAULT
def get_obj_queued(): def get_obj_queued():
return ['item', 'image', 'message', 'ocr', 'qrcode'] return ['item', 'image', 'message', 'ocr', 'qrcode'] # screenshot ???
def get_objects_tracked(): def get_objects_tracked():
return ['decoded', 'item', 'pgp', 'message', 'ocr', 'qrcode', 'title'] return ['decoded', 'item', 'pgp', 'message', 'ocr', 'qrcode', 'title']

View file

@ -195,6 +195,9 @@ def get_processed_end_objs():
def get_processed_end_obj(): def get_processed_end_obj():
return r_obj_process.spop(f'objs:processed') return r_obj_process.spop(f'objs:processed')
def is_obj_in_process(obj_gid):
    """Check whether an object global id is a member of ``objs:process``.

    :param obj_gid: global id of the object to look up
    :return: the membership result exactly as returned by
             ``r_obj_process.sismember`` (not coerced here)
    """
    # The key has no placeholders, so a plain literal is used instead of an f-string.
    return r_obj_process.sismember('objs:process', obj_gid)
def get_processed_objs_by_type(obj_type): def get_processed_objs_by_type(obj_type):
return r_obj_process.zrange(f'objs:process:{obj_type}', 0, -1) return r_obj_process.zrange(f'objs:process:{obj_type}', 0, -1)

View file

@ -67,7 +67,7 @@ faup = Faup()
# is safe ??? # is safe ???
# TODO FILTER URL ??? # TODO FILTER URL ???
def api_get_onion_lookup(domain): def api_get_onion_lookup(domain): # TODO check if object process done ???
domain = domain.lower() domain = domain.lower()
url_unpack = unpack_url(domain) url_unpack = unpack_url(domain)
domain = url_unpack['domain'] domain = url_unpack['domain']
@ -78,6 +78,11 @@ def api_get_onion_lookup(domain):
if is_crawler_activated(): if is_crawler_activated():
create_task(domain, parent='lookup', priority=0, har=D_HAR, screenshot=D_SCREENSHOT) create_task(domain, parent='lookup', priority=0, har=D_HAR, screenshot=D_SCREENSHOT)
return {'error': 'domain not found', 'domain': domain}, 404 return {'error': 'domain not found', 'domain': domain}, 404
if not dom.was_up():
return {'error': 'domain not found', 'domain': domain}, 404
# else
# TODO: check if object processing is done -> return the result if there is more than one history entry
#   -> check item history
meta = dom.get_meta(options={'languages'}) meta = dom.get_meta(options={'languages'})
meta['first_seen'] = meta['first_seen'].replace('/', '-') meta['first_seen'] = meta['first_seen'].replace('/', '-')
meta['last_seen'] = meta['last_check'].replace('/', '-') meta['last_seen'] = meta['last_check'].replace('/', '-')

View file

@ -19,6 +19,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages # Import Project packages
################################## ##################################
from lib import ail_logger from lib import ail_logger
from lib.ail_queues import is_obj_in_process
from lib import Tag from lib import Tag
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from lib import Duplicate from lib import Duplicate
@ -92,6 +93,15 @@ class AbstractObject(ABC):
else: else:
return r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field, value) return r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field, value)
## Queues ##
# is_in_queue , is_in_module
def is_being_processed(self):
    """Return the ``is_obj_in_process`` result for this object's global id."""
    obj_gid = self.get_global_id()
    return is_obj_in_process(obj_gid)
# -Queues- #
## Tags ## ## Tags ##
def get_tags(self, r_list=False): def get_tags(self, r_list=False):
tags = Tag.get_object_tags(self.type, self.id, self.get_subtype(r_str=True)) tags = Tag.get_object_tags(self.type, self.id, self.get_subtype(r_str=True))