Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-25)
fix: [crawler] filter lookup parent + domain daterange

commit 83e11082b5 (parent c8b1c67a08)
5 changed files with 26 additions and 3 deletions
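Summary: crawl tasks queued by the onion lookup API are created with parent='lookup'. The crawler now records each domain's parent and, for such lookup-triggered captures, skips the first/last-seen (daterange) update, so a lookup probe no longer counts as a genuine sighting of the domain. Supporting changes: a new is_obj_in_process() queue helper, exposed on AbstractObject as is_being_processed(), and the lookup API now returns 404 for domains that were never up.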
bin/crawlers/Crawler.py

```diff
@@ -78,6 +78,7 @@ class Crawler(AbstractModule):
         self.items_dir = None
         self.original_domain = None
         self.domain = None
+        self.parent = None

         # TODO Replace with warning list ???
         self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a'  # blank
```
```diff
@@ -243,6 +244,7 @@ class Crawler(AbstractModule):
            return None

        self.domain = Domain(domain)
+       self.parent = self.domain.get_parent()
        self.original_domain = Domain(domain)

        epoch = int(time.time())
```
```diff
@@ -263,7 +265,9 @@ class Crawler(AbstractModule):
        # Save Capture
        self.save_capture_response(parent_id, entries)

-       self.domain.update_daterange(self.date.replace('/', ''))
+       if self.parent != 'lookup':
+           # Update domain first/last seen
+           self.domain.update_daterange(self.date.replace('/', ''))
        # Origin + History + tags
        if self.root_item:
            self.domain.set_last_origin(parent_id)
```
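This hunk is the core of the fix: a capture whose parent is 'lookup' must not move the domain's first/last-seen range. A minimal, self-contained sketch of the behavior — FakeDomain, save_capture, and the hard-coded dates are illustrative stand-ins, not AIL's real Domain class:

```python
# Sketch: why lookup-triggered captures must not touch the daterange.
# FakeDomain is a stand-in for AIL's Domain object (assumption).

class FakeDomain:
    def __init__(self):
        self.first_seen = None
        self.last_seen = None

    def update_daterange(self, date):
        # YYYYMMDD strings compare correctly as text
        if self.first_seen is None or date < self.first_seen:
            self.first_seen = date
        if self.last_seen is None or date > self.last_seen:
            self.last_seen = date

def save_capture(domain, parent, date):
    # Mirrors the patched Crawler logic: only genuine crawls
    # (parent != 'lookup') update first/last seen.
    if parent != 'lookup':
        domain.update_daterange(date)

dom = FakeDomain()
save_capture(dom, parent='manual', date='20241120')  # real crawl: recorded
save_capture(dom, parent='lookup', date='20241125')  # lookup probe: ignored
print(dom.first_seen, dom.last_seen)                 # -> 20241120 20241120
```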
```diff
@@ -271,6 +275,7 @@ class Crawler(AbstractModule):
            # Tags
            for tag in task.get_tags():
                self.domain.add_tag(tag)
+       # Crawler stats
        self.domain.add_history(epoch, root_item=self.root_item)

        if self.domain != self.original_domain:
```
bin/lib/ail_core.py

```diff
@@ -86,7 +86,7 @@ def get_default_correlation_objects():
     return AIL_OBJECTS_CORRELATIONS_DEFAULT

 def get_obj_queued():
-    return ['item', 'image', 'message', 'ocr', 'qrcode']
+    return ['item', 'image', 'message', 'ocr', 'qrcode']  # screenshot ???

 def get_objects_tracked():
     return ['decoded', 'item', 'pgp', 'message', 'ocr', 'qrcode', 'title']
```
bin/lib/ail_queues.py

```diff
@@ -195,6 +195,9 @@ def get_processed_end_objs():
 def get_processed_end_obj():
     return r_obj_process.spop(f'objs:processed')

+def is_obj_in_process(obj_gid):
+    return r_obj_process.sismember(f'objs:process', obj_gid)
+
 def get_processed_objs_by_type(obj_type):
     return r_obj_process.zrange(f'objs:process:{obj_type}', 0, -1)
```
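The new helper is a plain Redis set-membership check: objects currently being processed sit in the objs:process set, keyed by their global id. A hedged sketch of the underlying pattern with redis-py — the set name comes from the diff, while the sadd/srem lifecycle calls and the example global id are assumptions about the surrounding code:

```python
import redis

r = redis.Redis(decode_responses=True)

obj_gid = 'domain::example.onion'  # illustrative global id (assumption)

# A module picks the object up: mark it in-process (assumed lifecycle step).
r.sadd('objs:process', obj_gid)

# The check the new helper performs: O(1) set membership.
print(r.sismember('objs:process', obj_gid))  # True

# When processing finishes, the marker is removed (assumed lifecycle step).
r.srem('objs:process', obj_gid)
print(r.sismember('objs:process', obj_gid))  # False
```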
bin/lib/crawlers.py

```diff
@@ -67,7 +67,7 @@ faup = Faup()
 # is safe ???
 # TODO FILTER URL ???

-def api_get_onion_lookup(domain):
+def api_get_onion_lookup(domain):  # TODO check if object process done ???
     domain = domain.lower()
     url_unpack = unpack_url(domain)
     domain = url_unpack['domain']
```
```diff
@@ -78,6 +78,11 @@ def api_get_onion_lookup(domain):
        if is_crawler_activated():
            create_task(domain, parent='lookup', priority=0, har=D_HAR, screenshot=D_SCREENSHOT)
        return {'error': 'domain not found', 'domain': domain}, 404
+    if not dom.was_up():
+        return {'error': 'domain not found', 'domain': domain}, 404
+    # else
+    ## TODO check if object process done -> return result if more than one history
+    # #-> check item history
    meta = dom.get_meta(options={'languages'})
    meta['first_seen'] = meta['first_seen'].replace('/', '-')
    meta['last_seen'] = meta['last_check'].replace('/', '-')
```
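Net effect on the lookup API: an unknown domain gets queued for a crawl with parent='lookup' (so it will not pollute the daterange) and the caller receives a 404; a known domain that was never successfully reached now also returns 404 instead of empty metadata. A hedged client-side sketch — the endpoint path and port are hypothetical, AIL's actual REST routes may differ:

```python
import requests

# Hypothetical endpoint path, for illustration only.
AIL_URL = 'https://127.0.0.1:7000/api/v1/lookup/onion'

def lookup_onion(domain: str):
    resp = requests.get(f'{AIL_URL}/{domain}', verify=False)
    if resp.status_code == 404:
        # Unknown or never-up domain: a lookup crawl may have been queued,
        # so retry later to see whether the crawler found anything.
        return None
    resp.raise_for_status()
    return resp.json()  # domain metadata incl. first_seen / last_seen

meta = lookup_onion('example.onion')
print(meta or 'not found yet, try again later')
```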
bin/lib/objects/abstract_object.py

```diff
@@ -19,6 +19,7 @@ sys.path.append(os.environ['AIL_BIN'])
 # Import Project packages
 ##################################
 from lib import ail_logger
+from lib.ail_queues import is_obj_in_process
 from lib import Tag
 from lib.ConfigLoader import ConfigLoader
 from lib import Duplicate
```
```diff
@@ -92,6 +93,15 @@ class AbstractObject(ABC):
         else:
             return r_object.hset(f'meta:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', field, value)

+    ## Queues ##
+
+    # is_in_queue , is_in_module
+
+    def is_being_processed(self):
+        return is_obj_in_process(self.get_global_id())
+
+    # -Queues- #
+
     ## Tags ##
     def get_tags(self, r_list=False):
         tags = Tag.get_object_tags(self.type, self.id, self.get_subtype(r_str=True))
```
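With this method, every AIL object exposes the queue check directly, so callers never build the global id by hand. A sketch of the intended usage — the Domains import path is an assumption based on the AIL object model:

```python
# Hedged usage sketch: avoid re-queueing an object a module is still
# working on. Import path is an assumption about the AIL tree.
from lib.objects.Domains import Domain

dom = Domain('example.onion')
if dom.is_being_processed():
    # is_being_processed() -> is_obj_in_process(dom.get_global_id()),
    # i.e. a SISMEMBER check against the objs:process set.
    print('still in a module queue, skipping')
else:
    print('safe to process')
```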