From 5e10bfcf6a679c5dbee9e20de2ee1b49990e1740 Mon Sep 17 00:00:00 2001 From: terrtia Date: Tue, 28 Jan 2025 10:42:21 +0100 Subject: [PATCH] chg: [process chat text] process text files from chats + correlations --- bin/importer/FeederImporter.py | 13 ++-- bin/importer/feeders/abstract_chats_feeder.py | 59 +++++++++++++------ bin/lib/ail_core.py | 8 +++ bin/lib/chats_viewer.py | 12 ++-- bin/lib/correlations_engine.py | 6 +- bin/lib/objects/Items.py | 58 ++++++------------ bin/lib/objects/Messages.py | 25 ++++++++ bin/lib/objects/abstract_chat_object.py | 2 +- bin/modules/Global.py | 8 ++- bin/modules/Mixer.py | 2 +- .../chats_explorer/block_message.html | 20 ++++++- 11 files changed, 136 insertions(+), 77 deletions(-) diff --git a/bin/importer/FeederImporter.py b/bin/importer/FeederImporter.py index f3382be3..c2ea4481 100755 --- a/bin/importer/FeederImporter.py +++ b/bin/importer/FeederImporter.py @@ -105,10 +105,15 @@ class FeederImporter(AbstractImporter): objs_messages = [] for obj in objs: - if obj.type == 'item': # object save on disk as file (Items) - gzip64_content = feeder.get_gzip64_content() - relay_message = f'{feeder_name} {gzip64_content}' - objs_messages.append({'obj': obj, 'message': relay_message}) + # Text created + if obj.type == 'item': + if obj.exists(): + objs_messages.append({'obj': obj, 'message': feeder_name}) + # object save on disk as file (Items) + else: + gzip64_content = feeder.get_gzip64_content() + relay_message = f'{feeder_name} {gzip64_content}' + objs_messages.append({'obj': obj, 'message': relay_message}) elif obj.type == 'image': date = feeder.get_date() objs_messages.append({'obj': obj, 'message': f'{feeder_name} {date}'}) diff --git a/bin/importer/feeders/abstract_chats_feeder.py b/bin/importer/feeders/abstract_chats_feeder.py index 9d48c5d0..a3d88681 100755 --- a/bin/importer/feeders/abstract_chats_feeder.py +++ b/bin/importer/feeders/abstract_chats_feeder.py @@ -19,10 +19,12 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from importer.feeders.Default import DefaultFeeder +from lib.ail_core import get_chat_instance_name from lib.objects.Chats import Chat from lib.objects import ChatSubChannels from lib.objects import ChatThreads from lib.objects import Images +from lib.objects import Items from lib.objects import Messages from lib.objects import FilesNames # from lib.objects import Files @@ -87,6 +89,9 @@ class AbstractChatFeeder(DefaultFeeder, ABC): def get_message_id(self): return self.json_data['meta']['id'] + def get_media_id(self): + return self.json_data['meta'].get('media', {}).get('id') + def get_media_name(self): return self.json_data['meta'].get('media', {}).get('name') @@ -95,7 +100,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC): def get_date(self): if self.json_data['meta'].get('date'): - date = datetime.datetime.fromtimestamp( self.json_data['meta']['date']['timestamp']) + date = datetime.datetime.fromtimestamp(self.json_data['meta']['date']['timestamp']) date = date.strftime('%Y%m%d') else: date = datetime.date.today().strftime("%Y%m%d") @@ -154,12 +159,15 @@ class AbstractChatFeeder(DefaultFeeder, ABC): # channel id # thread id - # TODO sanitize obj type + # TODO sanitize obj type ################### CHECK IF IS MESSAGE BY DEFAULT obj_type = self.get_obj_type() - if obj_type == 'image': self.obj = Images.Image(self.json_data['data-sha256']) - + elif obj_type == 'text': + d = self.get_date() + instance_name = get_chat_instance_name(self.get_chat_instance_uuid()) + item_id = f'{instance_name}/{d[0:4]}/{d[4:6]}/{d[6:8]}/{self.json_data["data-sha256"]}.gz' + self.obj = Items.Item(item_id) else: obj_id = Messages.create_obj_id(self.get_chat_instance_uuid(), chat_id, message_id, timestamp, thread_id=thread_id) self.obj = Messages.Message(obj_id) @@ -195,8 +203,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC): return chat - ############################################################################################################################## - + ########################################################################################################### def process_chat(self, new_objs, obj, date, timestamp, feeder_timestamp, reply_id=None): meta = self.json_data['meta']['chat'] # todo replace me by function @@ -404,7 +411,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC): obj.add_reaction(reaction['reaction'], int(reaction['count'])) elif self.obj.type == 'chat': pass - else: + else: # IMAGE + ITEM chat_id = self.get_chat_id() thread_id = self.get_thread_id() channel_id = self.get_subchannel_id() @@ -416,19 +423,33 @@ class AbstractChatFeeder(DefaultFeeder, ABC): message.create('') objs.add(message) - if message.exists(): # TODO Correlation user-account image/filename ???? - obj = Images.create(self.get_message_content()) - obj.add(date, message) - obj.set_parent(obj_global_id=message.get_global_id()) - - # FILENAME - media_name = self.get_media_name() - if media_name: - FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) - + if message.exists(): + # REACTIONS for reaction in self.get_reactions(): message.add_reaction(reaction['reaction'], int(reaction['count'])) + if self.obj.type == 'image': + obj = Images.create(self.get_message_content()) + obj.add(date, message) + obj.set_parent(obj_global_id=message.get_global_id()) + + # FILENAME + media_name = self.get_media_name() + if media_name: + FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) + + elif self.obj.type == 'item': + obj = self.obj + if not obj.exists(): + obj.create(self.get_message_content()) + obj.add_correlation('message', '', message.id) + + # FILENAME + media_name = self.get_media_name() + if media_name: + file_name = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) + file_name.add_correlation('item', '', obj.id) + for obj in objs: # TODO PERF avoid parsing metas multiple times # TODO get created subchannel + thread @@ -443,7 +464,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC): user_account = self.process_sender(new_objs, obj, date, feeder_timestamp) if user_account: - # UserAccount---ChatObjects + # UserAccount---ChatObjects for obj_chat in curr_chats_objs: user_account.add_correlation(obj_chat.type, obj_chat.get_subtype(r_str=True), obj_chat.id) @@ -523,6 +544,6 @@ class AbstractChatFeeder(DefaultFeeder, ABC): if chat_obj.type == 'chat': obj.add_relationship(chat_obj.get_global_id(), 'in') - # -MENTION- # + # -MENTION- # return new_objs | objs diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index a1e6065c..c5456756 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -134,6 +134,14 @@ def unpack_correl_objs_id(obj_type, correl_objs_id, r_type='tuple'): ##-- AIL OBJECTS --## +def get_chat_instance_name(chat_instance): + if chat_instance == '00098785-7e70-5d12-a120-c5cdc1252b2b': + return 'telegram' + elif chat_instance == 'd2426e3f-22f3-5a57-9a98-d2ae9794e683': + return 'discord' + else: + return chat_instance + #### Redis #### def _parse_zscan(response): diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index f0f00409..cf57f1a3 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -344,7 +344,7 @@ def get_username_meta_from_global_id(username_global_id): ############################################################################### # TODO Pagination def list_messages_to_dict(l_messages_id, translation_target=None): - options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} + options = {'content', 'files', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} meta = {} curr_date = None for mess_id in l_messages_id: @@ -621,7 +621,7 @@ def _get_chat_card_meta_options(): return {'created_at', 'icon', 'info', 'nb_participants', 'origin_link', 'subchannels', 'tags_safe', 'threads', 'translation', 'username'} def _get_message_bloc_meta_options(): - return {'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'} + return {'chat', 'content', 'files', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'} def get_message_report(l_mess): # TODO Force language + translation translation_target = 'en' @@ -900,7 +900,7 @@ def api_get_message(message_id, translation_target=None): message = Messages.Message(message_id) if not message.exists(): return {"status": "error", "reason": "Unknown uuid"}, 404 - meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) + meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) if 'forwarded_from' in meta: chat = get_obj_chat_from_global_id(meta['forwarded_from']) meta['forwarded_from'] = chat.get_meta({'icon'}) @@ -993,7 +993,7 @@ def api_chat_messages(subtype, chat_id): if meta['subchannels']: meta['subchannels'] = get_subchannels_meta_from_global_id(meta['subchannels']) else: - options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} + options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} meta['messages'], _, _ = chat.get_messages(nb=-1, options=options) return meta, 200 @@ -1009,7 +1009,7 @@ def api_subchannel_messages(subtype, subchannel_id): meta['threads'] = get_threads_metas(meta['threads']) if meta.get('username'): meta['username'] = get_username_meta_from_global_id(meta['username']) - options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} + options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} meta['messages'], _, _ = subchannel.get_messages(nb=-1, options=options) return meta, 200 @@ -1018,7 +1018,7 @@ def api_thread_messages(subtype, thread_id): if not thread.exists(): return {"status": "error", "reason": "Unknown thread"}, 404 meta = thread.get_meta({'chat', 'nb_messages', 'nb_participants'}) - options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} + options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} meta['messages'], _, _ = thread.get_messages(nb=-1, options=options) return meta, 200 diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index cc14c01f..e4772a82 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -53,11 +53,11 @@ CORRELATION_TYPES_BY_OBJ = { "dom-hash": ["domain", "item"], "etag": ["domain"], "favicon": ["domain", "item"], # TODO Decoded - "file-name": ["chat", "message"], + "file-name": ["chat", "item", "message"], "hhhash": ["domain"], "image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ???? - "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "pgp", "screenshot", "title", "username"], # chat ??? - "message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"], + "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "message", "pgp", "screenshot", "title", "username"], # chat ??? + "message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "item", "ocr", "pgp", "user-account"], "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"], "pgp": ["domain", "item", "message", "ocr"], "qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ????? diff --git a/bin/lib/objects/Items.py b/bin/lib/objects/Items.py index bbac1b8b..d50a6155 100755 --- a/bin/lib/objects/Items.py +++ b/bin/lib/objects/Items.py @@ -59,6 +59,20 @@ class Item(AbstractObject): """ return item_basic.get_item_date(self.id, add_separator=separator) + def get_link(self, flask_context=False): + if flask_context: + url = url_for('objects_item.showItem', id=self.id) + else: + url = f'{baseurl}/object/item?id={self.id}' + return url + + def get_svg_icon(self): + if is_crawled(self.id): + color = 'red' + else: + color = '#332288' + return {'style': '', 'icon': '', 'color': color, 'radius': 5} + def get_source(self): """ Returns Item source/feeder name @@ -149,18 +163,13 @@ class Item(AbstractObject): if len(basename) > 255: new_basename = f'{basename[:215]}{str(uuid4())}.gz' self.id = rreplace(self.id, basename, new_basename, 1) - - - - - return self.id # # TODO: sanitize_id # # TODO: check if already exists ? # # TODO: check if duplicate - def save_on_disk(self, content, binary=True, compressed=False, b64=False): - if not binary: + def _save_on_disk(self, content, content_type='bytes', b64=False, compressed=False): + if not content_type == 'bytes': content = content.encode() if b64: content = base64.standard_b64decode(content) @@ -181,22 +190,10 @@ class Item(AbstractObject): # tags # origin # duplicate -> all item iterations ??? + # father # - def create(self, content, tags, father=None, duplicates=[], _save=True): - if _save: - self.save_on_disk(content, binary=True, compressed=False, base64=False) - - # # TODO: - # for tag in tags: - # self.add_tag(tag) - - if father: - pass - - for obj_id in duplicates: - for dup in duplicates[obj_id]: - self.add_duplicate(obj_id, dup['algo'], dup['similarity']) - + def create(self, content, content_type='bytes', b64=False, compressed=False): + self._save_on_disk(content, content_type=content_type, b64=b64, compressed=compressed) # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ... @@ -211,20 +208,6 @@ class Item(AbstractObject): #################################################################################### #################################################################################### - def get_link(self, flask_context=False): - if flask_context: - url = url_for('objects_item.showItem', id=self.id) - else: - url = f'{baseurl}/object/item?id={self.id}' - return url - - def get_svg_icon(self): - if is_crawled(self.id): - color = 'red' - else: - color = '#332288' - return {'style': '', 'icon': '', 'color': color, 'radius': 5} - def get_misp_object(self): obj = MISPObject('ail-leak', standalone=True) obj_date = self.get_date() @@ -242,9 +225,6 @@ class Item(AbstractObject): obj_attr.add_tag(tag) return obj - def exist_correlation(self): - pass - def is_crawled(self): return self.id.startswith('crawled') diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 9c007061..4bfb6757 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -180,6 +180,28 @@ class Message(AbstractObject): names.append(name[1:]) return names + def get_nb_files(self): + return self.get_nb_correlation('item') + + def get_files(self, file_names=None): + if not file_names: + file_names = self.get_files_names() + files = {} + nb_files = 0 + s_files = set() + for file_name in file_names: + files[file_name] = [] + for it in self.get_correlation_iter('file-name', '', file_name, 'item'): + files[file_name].append(it[1:]) + s_files.add(it[1:]) + nb_files += 1 + if nb_files < self.get_nb_files(): + files['undefined'] = [] + for f in self.get_correlation('item').get('item'): + if f[1:] not in s_files: + files['undefined'].append(f[1:]) + return files + def get_reactions(self): return r_object.hgetall(f'meta:reactions:{self.type}::{self.id}') @@ -322,6 +344,9 @@ class Message(AbstractObject): meta['qrcodes'] = self.get_qrcodes() if 'files-names' in options: meta['files-names'] = self.get_files_names() + if 'files' in options: + if meta.get('files-names'): + meta['files'] = self.get_files(file_names=meta['files-names']) if 'reactions' in options: meta['reactions'] = self.get_reactions() if 'language' in options: diff --git a/bin/lib/objects/abstract_chat_object.py b/bin/lib/objects/abstract_chat_object.py index f1949b5c..daf12d26 100755 --- a/bin/lib/objects/abstract_chat_object.py +++ b/bin/lib/objects/abstract_chat_object.py @@ -275,7 +275,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC): def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message message = Messages.Message(message[9:]) if not options: - options = {'barcodes', 'content', 'files-names', 'forwarded_from', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'} + options = {'barcodes', 'content', 'files', 'files-names', 'forwarded_from', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'} meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target) return meta diff --git a/bin/modules/Global.py b/bin/modules/Global.py index 71b00ad4..9aafc346 100755 --- a/bin/modules/Global.py +++ b/bin/modules/Global.py @@ -126,7 +126,11 @@ class Global(AbstractModule): return self.obj.id else: - self.logger.info(f"Empty Item: {message} not processed") + if self.obj.exists(): + self.add_message_to_queue(obj=self.obj, queue='Item') + else: + self.logger.info(f"Empty Item: {message} not processed") + elif self.obj.type == 'message' or self.obj.type == 'ocr': # TODO send to specific object queue => image, ... self.add_message_to_queue(obj=self.obj, queue='Item') @@ -217,8 +221,6 @@ class Global(AbstractModule): gunzipped_bytes_obj = fo.read() except Exception as e: self.logger.warning(f'Global; Invalid Gzip file: {filename}, {e}') - print(f'Global; Invalid Gzip file: {filename}, {e}') - return gunzipped_bytes_obj def rreplace(self, s, old, new, occurrence): diff --git a/bin/modules/Mixer.py b/bin/modules/Mixer.py index 222652fd..f83bf40f 100755 --- a/bin/modules/Mixer.py +++ b/bin/modules/Mixer.py @@ -102,7 +102,7 @@ class Mixer(AbstractModule): # feeder_name - object if len(splitted) == 1: # feeder_name - object (content already saved) feeder_name = message - gzip64encoded = None + gzip64encoded = '' # Feeder name in message: "feeder obj_id gzip64encoded" elif len(splitted) == 2: # gzip64encoded content diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html index 5ec06d40..1184937f 100644 --- a/var/www/templates/chats_explorer/block_message.html +++ b/var/www/templates/chats_explorer/block_message.html @@ -129,7 +129,25 @@ {% endif %} {% endfor %} {% endif %} - {% if message['files-names'] %} + {% if message['files'] %} + {% for file_name in message['files'] %} + {% if message['files'][file_name] | length > 1 %} +
+ {{ file_name }} + {% for item in message['files'][file_name] %} +
+ {{ loop.index }} + {% endfor %} +
+ {% else %} + +
+ {{ file_name }} +
+
+ {% endif %} + {% endfor %} + {% elif message['files-names'] %} {% for file_name in message['files-names'] %}
{{ file_name }}