From c25ccb8618e9a97da0e8f51c71a2daf8a7452489 Mon Sep 17 00:00:00 2001 From: terrtia Date: Wed, 24 Apr 2024 14:43:11 +0200 Subject: [PATCH] chg: [ocr] add cache + correlation ocr-chats-messages + launch ocr extractor by default --- bin/LAUNCH.sh | 3 +++ bin/lib/correlations_engine.py | 22 ++++++++-------- bin/lib/objects/Domains.py | 4 +-- bin/lib/objects/Messages.py | 2 +- bin/lib/objects/Ocrs.py | 13 +++++++++- bin/lib/objects/abstract_object.py | 4 +-- bin/modules/OcrExtractor.py | 34 ++++++++++++++++++------- update/v5.5/Update.py | 26 +++++++++++++++++++ update/v5.5/Update.sh | 40 ++++++++++++++++++++++++++++++ 9 files changed, 122 insertions(+), 26 deletions(-) create mode 100755 update/v5.5/Update.py create mode 100755 update/v5.5/Update.sh diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index a9941a41..c53b3c3f 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -275,8 +275,11 @@ function launching_scripts { screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x" sleep 0.1 + # IMAGES screen -S "Script_AIL" -X screen -t "Exif" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Exif.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "OcrExtractor" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./OcrExtractor.py; read x" + sleep 0.1 ################################## # TRACKERS MODULES # diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index 6afe27da..d8a02f85 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -41,26 +41,26 @@ config_loader = None ################################## CORRELATION_TYPES_BY_OBJ = { - "chat": ["chat-subchannel", "chat-thread", "image", "user-account"], # message or direct correlation like cve, bitcoin, ... ??? - "chat-subchannel": ["chat", "chat-thread", "image", "message", "user-account"], - "chat-thread": ["chat", "chat-subchannel", "image", "message", "user-account"], # TODO user account + "chat": ["chat-subchannel", "chat-thread", "image", "message", "ocr", "user-account"], # message or direct correlation like cve, bitcoin, ... ??? + "chat-subchannel": ["chat", "chat-thread", "image", "message", "ocr", "user-account"], + "chat-thread": ["chat", "chat-subchannel", "image", "message", "ocr", "user-account"], # TODO user account "cookie-name": ["domain"], - "cryptocurrency": ["domain", "item", "message"], - "cve": ["domain", "item", "message"], - "decoded": ["domain", "item", "message"], + "cryptocurrency": ["domain", "item", "message", "ocr"], + "cve": ["domain", "item", "message", "ocr"], + "decoded": ["domain", "item", "message", "ocr"], "domain": ["cve", "cookie-name", "cryptocurrency", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"], "etag": ["domain"], "favicon": ["domain", "item"], # TODO Decoded "file-name": ["chat", "message"], "hhhash": ["domain"], - "image": ["chat", "message", "ocr", "user-account"], + "image": ["chat", "chat-subchannel", "chat-thread", "message", "ocr", "user-account"], # TODO subchannel + threads ???? "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"], # chat ??? - "message": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "pgp", "user-account"], # chat ?? - "ocr": ["image"], - "pgp": ["domain", "item", "message"], + "message": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"], + "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"], + "pgp": ["domain", "item", "message", "ocr"], "screenshot": ["domain", "item"], "title": ["domain", "item"], - "user-account": ["chat", "chat-subchannel", "chat-thread", "image", "message", "username"], + "user-account": ["chat", "chat-subchannel", "chat-thread", "image", "message", "ocr", "username"], "username": ["domain", "item", "message", "user-account"], } diff --git a/bin/lib/objects/Domains.py b/bin/lib/objects/Domains.py index 8d19aee7..9756d7cc 100755 --- a/bin/lib/objects/Domains.py +++ b/bin/lib/objects/Domains.py @@ -209,7 +209,7 @@ class Domain(AbstractObject): def get_screenshot(self): last_item = self.get_last_item_root() if last_item: - screenshot = self._get_external_correlation('item', '', last_item, 'screenshot').get('screenshot') + screenshot = self.get_obj_correlations('item', '', last_item, ['screenshot']).get('screenshot') if screenshot: return screenshot.pop()[1:] @@ -392,7 +392,7 @@ class Domain(AbstractObject): print(har) _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz') # Screenshot - screenshot = self._get_external_correlation('item', '', item_id, 'screenshot') + screenshot = self.get_obj_correlations('item', '', item_id, ['screenshot']) if screenshot and screenshot['screenshot']: screenshot = screenshot['screenshot'].pop()[1:] screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index ecf95cc0..104553b0 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -141,7 +141,7 @@ class Message(AbstractObject): # TODO get thread ID def _get_image_ocr(self, obj_id): - return bool(self._get_external_correlation('image', '', obj_id, 'ocr').get('ocr')) + return bool(self.get_correlation('ocr').get('ocr')) def get_images(self): images = [] diff --git a/bin/lib/objects/Ocrs.py b/bin/lib/objects/Ocrs.py index 9d48b4bd..e290e623 100755 --- a/bin/lib/objects/Ocrs.py +++ b/bin/lib/objects/Ocrs.py @@ -228,6 +228,14 @@ class Ocr(AbstractObject): def remove(self, val): return r_object.srem(f'ocr:{self.id}', val) + def update_correlation(self): + image_correl = self.get_obj_correlations('image', '', self.id) + for obj_type in image_correl: + if obj_type != 'ocr': + for obj_raw in image_correl[obj_type]: + obj_subtype, obj_id = obj_raw.split(':', 1) + self.add_correlation(obj_type, obj_subtype, obj_id) + def create(self, extracted_texts, tags=[]): r_object.sadd(f'{self.type}:all', self.id) for extracted in extracted_texts: @@ -235,7 +243,10 @@ class Ocr(AbstractObject): if len(text) > 1: str_coords = self.create_coord_str(bbox) self.add(str_coords, text) - self.add_correlation('image', '', self.id) + + # Correlations + self.update_correlation() + self.add_correlation('image', '', self.id) for tag in tags: self.add_tag(tag) diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 4548ba9f..1a87d1c8 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -225,11 +225,11 @@ class AbstractObject(ABC): ## Correlation ## - def _get_external_correlation(self, req_type, req_subtype, req_id, obj_type): + def get_obj_correlations(self, obj_type, obj_subtype, obj_id, filter_types=[]): """ Get object correlation """ - return get_correlations(req_type, req_subtype, req_id, filter_types=[obj_type]) + return get_correlations(obj_type, obj_subtype, obj_id, filter_types=filter_types) def get_correlation(self, obj_type): """ diff --git a/bin/modules/OcrExtractor.py b/bin/modules/OcrExtractor.py index 85df401f..deb732da 100755 --- a/bin/modules/OcrExtractor.py +++ b/bin/modules/OcrExtractor.py @@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule +from lib.ConfigLoader import ConfigLoader from lib import chats_viewer from lib.objects import Messages from lib.objects import Ocrs @@ -68,30 +69,45 @@ class OcrExtractor(AbstractModule): # Waiting time in seconds between to message processed self.pending_seconds = 1 + config_loader = ConfigLoader() + self.r_cache = config_loader.get_redis_conn("Redis_Cache") + # Send module state to logs self.logger.info(f'Module {self.module_name} initialized') + def is_cached(self): + return self.r_cache.exists(f'ocr:no:{self.obj.id}') + + def add_to_cache(self): + self.r_cache.setex(f'ocr:no:{self.obj.id}', 86400, 0) + def compute(self, message): image = self.get_obj() - path = image.get_filepath() - print(image) - - languages = get_model_languages(image) - print(languages) + print(image.id) ocr = Ocrs.Ocr(image.id) - ocr.delete() + if self.is_cached(): + return None + if not ocr.exists(): + path = image.get_filepath() + languages = get_model_languages(image) + print(languages) texts = Ocrs.extract_text(path, languages) if texts: + print('create') ocr = Ocrs.create(image.id, texts) self.add_message_to_queue(ocr) + # Save in cache + else: + print('no text detected') + self.add_to_cache() + else: + print('update correlation') + ocr.update_correlation() if __name__ == '__main__': module = OcrExtractor() module.run() - # from lib.objects import Images - # module.obj = Images.Image('') - # module.compute('') diff --git a/update/v5.5/Update.py b/update/v5.5/Update.py new file mode 100755 index 00000000..9688f4a8 --- /dev/null +++ b/update/v5.5/Update.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +sys.path.append(os.environ['AIL_HOME']) +################################## +# Import Project packages +################################## +from update.bin.ail_updater import AIL_Updater +from lib import ail_updates +from lib import chats_viewer + +class Updater(AIL_Updater): + """default Updater.""" + + def __init__(self, version): + super(Updater, self).__init__(version) + + +if __name__ == '__main__': + chats_viewer.fix_correlations_subchannel_message() + updater = Updater('v5.5') + updater.run_update() + diff --git a/update/v5.5/Update.sh b/update/v5.5/Update.sh new file mode 100755 index 00000000..d93fd9e7 --- /dev/null +++ b/update/v5.5/Update.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_ARDB. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment." && exit 1; + +export PATH=$AIL_HOME:$PATH +export PATH=$AIL_REDIS:$PATH +export PATH=$AIL_BIN:$PATH +export PATH=$AIL_FLASK:$PATH + +GREEN="\\033[1;32m" +DEFAULT="\\033[0;39m" + +echo -e $GREEN"Shutting down AIL ..."$DEFAULT +bash ${AIL_BIN}/LAUNCH.sh -ks +wait + +# SUBMODULES # +git submodule update + +echo "" +echo -e $GREEN"Updating python packages ..."$DEFAULT +echo "" +pip install -U easyocr + + +bash ${AIL_BIN}/LAUNCH.sh -lrv +bash ${AIL_BIN}/LAUNCH.sh -lkv + +echo "" +echo -e $GREEN"Updating AIL VERSION ..."$DEFAULT +echo "" +python ${AIL_HOME}/update/v5.5/Update.py +wait +echo "" +echo "" + +exit 0