chg: [process chat text] process text files from chats + correlations

This commit is contained in:
terrtia 2025-01-28 10:42:21 +01:00
parent bc83726212
commit 5e10bfcf6a
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
11 changed files with 136 additions and 77 deletions

View file

@ -105,7 +105,12 @@ class FeederImporter(AbstractImporter):
objs_messages = [] objs_messages = []
for obj in objs: for obj in objs:
if obj.type == 'item': # object save on disk as file (Items) # Text created
if obj.type == 'item':
if obj.exists():
objs_messages.append({'obj': obj, 'message': feeder_name})
# object save on disk as file (Items)
else:
gzip64_content = feeder.get_gzip64_content() gzip64_content = feeder.get_gzip64_content()
relay_message = f'{feeder_name} {gzip64_content}' relay_message = f'{feeder_name} {gzip64_content}'
objs_messages.append({'obj': obj, 'message': relay_message}) objs_messages.append({'obj': obj, 'message': relay_message})

View file

@ -19,10 +19,12 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages # Import Project packages
################################## ##################################
from importer.feeders.Default import DefaultFeeder from importer.feeders.Default import DefaultFeeder
from lib.ail_core import get_chat_instance_name
from lib.objects.Chats import Chat from lib.objects.Chats import Chat
from lib.objects import ChatSubChannels from lib.objects import ChatSubChannels
from lib.objects import ChatThreads from lib.objects import ChatThreads
from lib.objects import Images from lib.objects import Images
from lib.objects import Items
from lib.objects import Messages from lib.objects import Messages
from lib.objects import FilesNames from lib.objects import FilesNames
# from lib.objects import Files # from lib.objects import Files
@ -87,6 +89,9 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
def get_message_id(self): def get_message_id(self):
return self.json_data['meta']['id'] return self.json_data['meta']['id']
def get_media_id(self):
    """Return the id of the media described in the message metadata.

    Reads ``self.json_data['meta']['media']['id']``.

    Returns:
        The value stored under ``id``, or ``None`` when the ``media`` entry
        is missing, null/empty, or has no ``id`` key.

    Raises:
        KeyError: if ``self.json_data`` has no ``meta`` entry.
    """
    # ``or {}`` also covers an explicit null ``media`` value; a plain
    # ``.get('media', {})`` default only applies when the key is absent and
    # would then fail on ``None.get``.
    media = self.json_data['meta'].get('media') or {}
    return media.get('id')
def get_media_name(self): def get_media_name(self):
return self.json_data['meta'].get('media', {}).get('name') return self.json_data['meta'].get('media', {}).get('name')
@ -154,12 +159,15 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
# channel id # channel id
# thread id # thread id
# TODO sanitize obj type # TODO sanitize obj type ################### CHECK IF IS MESSAGE BY DEFAULT
obj_type = self.get_obj_type() obj_type = self.get_obj_type()
if obj_type == 'image': if obj_type == 'image':
self.obj = Images.Image(self.json_data['data-sha256']) self.obj = Images.Image(self.json_data['data-sha256'])
elif obj_type == 'text':
d = self.get_date()
instance_name = get_chat_instance_name(self.get_chat_instance_uuid())
item_id = f'{instance_name}/{d[0:4]}/{d[4:6]}/{d[6:8]}/{self.json_data["data-sha256"]}.gz'
self.obj = Items.Item(item_id)
else: else:
obj_id = Messages.create_obj_id(self.get_chat_instance_uuid(), chat_id, message_id, timestamp, thread_id=thread_id) obj_id = Messages.create_obj_id(self.get_chat_instance_uuid(), chat_id, message_id, timestamp, thread_id=thread_id)
self.obj = Messages.Message(obj_id) self.obj = Messages.Message(obj_id)
@ -195,8 +203,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
return chat return chat
############################################################################################################################## ###########################################################################################################
def process_chat(self, new_objs, obj, date, timestamp, feeder_timestamp, reply_id=None): def process_chat(self, new_objs, obj, date, timestamp, feeder_timestamp, reply_id=None):
meta = self.json_data['meta']['chat'] # todo replace me by function meta = self.json_data['meta']['chat'] # todo replace me by function
@ -404,7 +411,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
obj.add_reaction(reaction['reaction'], int(reaction['count'])) obj.add_reaction(reaction['reaction'], int(reaction['count']))
elif self.obj.type == 'chat': elif self.obj.type == 'chat':
pass pass
else: else: # IMAGE + ITEM
chat_id = self.get_chat_id() chat_id = self.get_chat_id()
thread_id = self.get_thread_id() thread_id = self.get_thread_id()
channel_id = self.get_subchannel_id() channel_id = self.get_subchannel_id()
@ -416,7 +423,12 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
message.create('') message.create('')
objs.add(message) objs.add(message)
if message.exists(): # TODO Correlation user-account image/filename ???? if message.exists():
# REACTIONS
for reaction in self.get_reactions():
message.add_reaction(reaction['reaction'], int(reaction['count']))
if self.obj.type == 'image':
obj = Images.create(self.get_message_content()) obj = Images.create(self.get_message_content())
obj.add(date, message) obj.add(date, message)
obj.set_parent(obj_global_id=message.get_global_id()) obj.set_parent(obj_global_id=message.get_global_id())
@ -426,8 +438,17 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
if media_name: if media_name:
FilesNames.FilesNames().create(media_name, date, message, file_obj=obj) FilesNames.FilesNames().create(media_name, date, message, file_obj=obj)
for reaction in self.get_reactions(): elif self.obj.type == 'item':
message.add_reaction(reaction['reaction'], int(reaction['count'])) obj = self.obj
if not obj.exists():
obj.create(self.get_message_content())
obj.add_correlation('message', '', message.id)
# FILENAME
media_name = self.get_media_name()
if media_name:
file_name = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj)
file_name.add_correlation('item', '', obj.id)
for obj in objs: # TODO PERF avoid parsing metas multiple times for obj in objs: # TODO PERF avoid parsing metas multiple times

View file

@ -134,6 +134,14 @@ def unpack_correl_objs_id(obj_type, correl_objs_id, r_type='tuple'):
##-- AIL OBJECTS --## ##-- AIL OBJECTS --##
def get_chat_instance_name(chat_instance):
    """Translate a chat instance identifier into a short name.

    Args:
        chat_instance: chat instance identifier to translate.

    Returns:
        ``'telegram'`` or ``'discord'`` for the two identifiers listed
        below; any other value is returned unchanged.
    """
    known_instances = (
        ('00098785-7e70-5d12-a120-c5cdc1252b2b', 'telegram'),
        ('d2426e3f-22f3-5a57-9a98-d2ae9794e683', 'discord'),
    )
    for instance_uuid, instance_name in known_instances:
        if chat_instance == instance_uuid:
            return instance_name
    # Unknown instance: fall back to the identifier itself
    return chat_instance
#### Redis #### #### Redis ####
def _parse_zscan(response): def _parse_zscan(response):

View file

@ -344,7 +344,7 @@ def get_username_meta_from_global_id(username_global_id):
############################################################################### ###############################################################################
# TODO Pagination # TODO Pagination
def list_messages_to_dict(l_messages_id, translation_target=None): def list_messages_to_dict(l_messages_id, translation_target=None):
options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} options = {'content', 'files', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
meta = {} meta = {}
curr_date = None curr_date = None
for mess_id in l_messages_id: for mess_id in l_messages_id:
@ -621,7 +621,7 @@ def _get_chat_card_meta_options():
return {'created_at', 'icon', 'info', 'nb_participants', 'origin_link', 'subchannels', 'tags_safe', 'threads', 'translation', 'username'} return {'created_at', 'icon', 'info', 'nb_participants', 'origin_link', 'subchannels', 'tags_safe', 'threads', 'translation', 'username'}
def _get_message_bloc_meta_options(): def _get_message_bloc_meta_options():
return {'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'} return {'chat', 'content', 'files', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'}
def get_message_report(l_mess): # TODO Force language + translation def get_message_report(l_mess): # TODO Force language + translation
translation_target = 'en' translation_target = 'en'
@ -900,7 +900,7 @@ def api_get_message(message_id, translation_target=None):
message = Messages.Message(message_id) message = Messages.Message(message_id)
if not message.exists(): if not message.exists():
return {"status": "error", "reason": "Unknown uuid"}, 404 return {"status": "error", "reason": "Unknown uuid"}, 404
meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
if 'forwarded_from' in meta: if 'forwarded_from' in meta:
chat = get_obj_chat_from_global_id(meta['forwarded_from']) chat = get_obj_chat_from_global_id(meta['forwarded_from'])
meta['forwarded_from'] = chat.get_meta({'icon'}) meta['forwarded_from'] = chat.get_meta({'icon'})
@ -993,7 +993,7 @@ def api_chat_messages(subtype, chat_id):
if meta['subchannels']: if meta['subchannels']:
meta['subchannels'] = get_subchannels_meta_from_global_id(meta['subchannels']) meta['subchannels'] = get_subchannels_meta_from_global_id(meta['subchannels'])
else: else:
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
meta['messages'], _, _ = chat.get_messages(nb=-1, options=options) meta['messages'], _, _ = chat.get_messages(nb=-1, options=options)
return meta, 200 return meta, 200
@ -1009,7 +1009,7 @@ def api_subchannel_messages(subtype, subchannel_id):
meta['threads'] = get_threads_metas(meta['threads']) meta['threads'] = get_threads_metas(meta['threads'])
if meta.get('username'): if meta.get('username'):
meta['username'] = get_username_meta_from_global_id(meta['username']) meta['username'] = get_username_meta_from_global_id(meta['username'])
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
meta['messages'], _, _ = subchannel.get_messages(nb=-1, options=options) meta['messages'], _, _ = subchannel.get_messages(nb=-1, options=options)
return meta, 200 return meta, 200
@ -1018,7 +1018,7 @@ def api_thread_messages(subtype, thread_id):
if not thread.exists(): if not thread.exists():
return {"status": "error", "reason": "Unknown thread"}, 404 return {"status": "error", "reason": "Unknown thread"}, 404
meta = thread.get_meta({'chat', 'nb_messages', 'nb_participants'}) meta = thread.get_meta({'chat', 'nb_messages', 'nb_participants'})
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'} options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
meta['messages'], _, _ = thread.get_messages(nb=-1, options=options) meta['messages'], _, _ = thread.get_messages(nb=-1, options=options)
return meta, 200 return meta, 200

View file

@ -53,11 +53,11 @@ CORRELATION_TYPES_BY_OBJ = {
"dom-hash": ["domain", "item"], "dom-hash": ["domain", "item"],
"etag": ["domain"], "etag": ["domain"],
"favicon": ["domain", "item"], # TODO Decoded "favicon": ["domain", "item"], # TODO Decoded
"file-name": ["chat", "message"], "file-name": ["chat", "item", "message"],
"hhhash": ["domain"], "hhhash": ["domain"],
"image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ???? "image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ????
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "pgp", "screenshot", "title", "username"], # chat ??? "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "message", "pgp", "screenshot", "title", "username"], # chat ???
"message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"], "message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "item", "ocr", "pgp", "user-account"],
"ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"], "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"],
"pgp": ["domain", "item", "message", "ocr"], "pgp": ["domain", "item", "message", "ocr"],
"qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ????? "qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ?????

View file

@ -59,6 +59,20 @@ class Item(AbstractObject):
""" """
return item_basic.get_item_date(self.id, add_separator=separator) return item_basic.get_item_date(self.id, add_separator=separator)
def get_link(self, flask_context=False):
    """Return the URL of the page showing this item.

    Args:
        flask_context: when true, the URL is built with ``url_for`` for the
            ``objects_item.showItem`` endpoint; otherwise it is assembled
            from ``baseurl`` and the item id.

    Returns:
        The URL as produced by the selected branch.
    """
    if not flask_context:
        return f'{baseurl}/object/item?id={self.id}'
    return url_for('objects_item.showItem', id=self.id)
def get_svg_icon(self):
    """Return the icon description used to draw this item.

    Returns:
        dict with the keys ``style``, ``icon``, ``color`` and ``radius``.
        ``color`` is ``'red'`` when ``is_crawled(self.id)`` is truthy and
        ``'#332288'`` otherwise; the other values are fixed.
    """
    # NOTE(review): this calls a bare ``is_crawled(...)`` name, not
    # ``self.is_crawled()`` -- assumes a helper of that name is in scope
    # where this method is defined; confirm.
    color = 'red' if is_crawled(self.id) else '#332288'
    return {'style': '', 'icon': '', 'color': color, 'radius': 5}
def get_source(self): def get_source(self):
""" """
Returns Item source/feeder name Returns Item source/feeder name
@ -149,18 +163,13 @@ class Item(AbstractObject):
if len(basename) > 255: if len(basename) > 255:
new_basename = f'{basename[:215]}{str(uuid4())}.gz' new_basename = f'{basename[:215]}{str(uuid4())}.gz'
self.id = rreplace(self.id, basename, new_basename, 1) self.id = rreplace(self.id, basename, new_basename, 1)
return self.id return self.id
# # TODO: sanitize_id # # TODO: sanitize_id
# # TODO: check if already exists ? # # TODO: check if already exists ?
# # TODO: check if duplicate # # TODO: check if duplicate
def save_on_disk(self, content, binary=True, compressed=False, b64=False): def _save_on_disk(self, content, content_type='bytes', b64=False, compressed=False):
if not binary: if not content_type == 'bytes':
content = content.encode() content = content.encode()
if b64: if b64:
content = base64.standard_b64decode(content) content = base64.standard_b64decode(content)
@ -181,22 +190,10 @@ class Item(AbstractObject):
# tags # tags
# origin # origin
# duplicate -> all item iterations ??? # duplicate -> all item iterations ???
# father
# #
def create(self, content, tags, father=None, duplicates=[], _save=True): def create(self, content, content_type='bytes', b64=False, compressed=False):
if _save: self._save_on_disk(content, content_type=content_type, b64=b64, compressed=compressed)
self.save_on_disk(content, binary=True, compressed=False, base64=False)
# # TODO:
# for tag in tags:
# self.add_tag(tag)
if father:
pass
for obj_id in duplicates:
for dup in duplicates[obj_id]:
self.add_duplicate(obj_id, dup['algo'], dup['similarity'])
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
# TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ... # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
@ -211,20 +208,6 @@ class Item(AbstractObject):
#################################################################################### ####################################################################################
#################################################################################### ####################################################################################
def get_link(self, flask_context=False):
if flask_context:
url = url_for('objects_item.showItem', id=self.id)
else:
url = f'{baseurl}/object/item?id={self.id}'
return url
def get_svg_icon(self):
if is_crawled(self.id):
color = 'red'
else:
color = '#332288'
return {'style': '', 'icon': '', 'color': color, 'radius': 5}
def get_misp_object(self): def get_misp_object(self):
obj = MISPObject('ail-leak', standalone=True) obj = MISPObject('ail-leak', standalone=True)
obj_date = self.get_date() obj_date = self.get_date()
@ -242,9 +225,6 @@ class Item(AbstractObject):
obj_attr.add_tag(tag) obj_attr.add_tag(tag)
return obj return obj
def exist_correlation(self):
pass
def is_crawled(self): def is_crawled(self):
return self.id.startswith('crawled') return self.id.startswith('crawled')

View file

@ -180,6 +180,28 @@ class Message(AbstractObject):
names.append(name[1:]) names.append(name[1:])
return names return names
def get_nb_files(self):
    """Return the value of ``self.get_nb_correlation('item')``.

    Presumably the number of item objects correlated with this message --
    confirm against ``get_nb_correlation``.
    """
    nb_items = self.get_nb_correlation('item')
    return nb_items
def get_files(self, file_names=None):
    """Group the items correlated with this message by file name.

    Args:
        file_names: iterable of file names to look up. When empty or
            ``None``, ``self.get_files_names()`` is used instead.

    Returns:
        dict mapping every file name to the list of values yielded by
        ``self.get_correlation_iter('file-name', '', file_name, 'item')``,
        each with its first character removed. When fewer distinct items
        were reached through file names than ``self.get_nb_files()``
        reports, the items of ``self.get_correlation('item')`` that were not
        reached are listed under the extra key ``'undefined'``.
    """
    if not file_names:
        file_names = self.get_files_names()
    files = {}
    named_items = set()
    for file_name in file_names:
        # [1:] drops the leading character of each correlation value
        # (presumably an empty-subtype separator -- confirm).
        item_ids = [it[1:] for it in self.get_correlation_iter('file-name', '', file_name, 'item')]
        files[file_name] = item_ids
        named_items.update(item_ids)
    # Compare DISTINCT items: an item listed under two file names must count
    # once, otherwise the double count can equal the total and hide items
    # that have no file name at all.
    if len(named_items) < self.get_nb_files():
        # ``or ()``: tolerate a correlation dict without an 'item' entry
        correlated_items = self.get_correlation('item').get('item') or ()
        files['undefined'] = [f[1:] for f in correlated_items if f[1:] not in named_items]
    return files
def get_reactions(self): def get_reactions(self):
return r_object.hgetall(f'meta:reactions:{self.type}::{self.id}') return r_object.hgetall(f'meta:reactions:{self.type}::{self.id}')
@ -322,6 +344,9 @@ class Message(AbstractObject):
meta['qrcodes'] = self.get_qrcodes() meta['qrcodes'] = self.get_qrcodes()
if 'files-names' in options: if 'files-names' in options:
meta['files-names'] = self.get_files_names() meta['files-names'] = self.get_files_names()
if 'files' in options:
if meta.get('files-names'):
meta['files'] = self.get_files(file_names=meta['files-names'])
if 'reactions' in options: if 'reactions' in options:
meta['reactions'] = self.get_reactions() meta['reactions'] = self.get_reactions()
if 'language' in options: if 'language' in options:

View file

@ -275,7 +275,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC):
def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message
message = Messages.Message(message[9:]) message = Messages.Message(message[9:])
if not options: if not options:
options = {'barcodes', 'content', 'files-names', 'forwarded_from', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'} options = {'barcodes', 'content', 'files', 'files-names', 'forwarded_from', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}
meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target) meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target)
return meta return meta

View file

@ -125,8 +125,12 @@ class Global(AbstractModule):
if r_result: if r_result:
return self.obj.id return self.obj.id
else:
if self.obj.exists():
self.add_message_to_queue(obj=self.obj, queue='Item')
else: else:
self.logger.info(f"Empty Item: {message} not processed") self.logger.info(f"Empty Item: {message} not processed")
elif self.obj.type == 'message' or self.obj.type == 'ocr': elif self.obj.type == 'message' or self.obj.type == 'ocr':
# TODO send to specific object queue => image, ... # TODO send to specific object queue => image, ...
self.add_message_to_queue(obj=self.obj, queue='Item') self.add_message_to_queue(obj=self.obj, queue='Item')
@ -217,8 +221,6 @@ class Global(AbstractModule):
gunzipped_bytes_obj = fo.read() gunzipped_bytes_obj = fo.read()
except Exception as e: except Exception as e:
self.logger.warning(f'Global; Invalid Gzip file: {filename}, {e}') self.logger.warning(f'Global; Invalid Gzip file: {filename}, {e}')
print(f'Global; Invalid Gzip file: {filename}, {e}')
return gunzipped_bytes_obj return gunzipped_bytes_obj
def rreplace(self, s, old, new, occurrence): def rreplace(self, s, old, new, occurrence):

View file

@ -102,7 +102,7 @@ class Mixer(AbstractModule):
# feeder_name - object # feeder_name - object
if len(splitted) == 1: # feeder_name - object (content already saved) if len(splitted) == 1: # feeder_name - object (content already saved)
feeder_name = message feeder_name = message
gzip64encoded = None gzip64encoded = ''
# Feeder name in message: "feeder obj_id gzip64encoded" # Feeder name in message: "feeder obj_id gzip64encoded"
elif len(splitted) == 2: # gzip64encoded content elif len(splitted) == 2: # gzip64encoded content

View file

@ -129,7 +129,25 @@
{% endif %} {% endif %}
{% endfor %} {% endfor %}
{% endif %} {% endif %}
{% if message['files-names'] %} {% if message['files'] %}
{% for file_name in message['files'] %}
{% if message['files'][file_name] | length > 1 %}
<div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
<i class="far fa-file fa-3x"></i> {{ file_name }}
{% for item in message['files'][file_name] %}
<br>
<a href="{{ url_for('objects_item.showItem') }}?id={{ item }}"><i class="far fa-file"></i> {{ loop.index }}</a>
{% endfor %}
</div>
{% else %}
<a href="{{ url_for('objects_item.showItem') }}?id={{ message['files'][file_name][0] }}">
<div class="flex-shrink-1 bg-white border-primary text-primary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
<i class="far fa-file fa-3x"></i> {{ file_name }}
</div>
</a>
{% endif %}
{% endfor %}
{% elif message['files-names'] %}
{% for file_name in message['files-names'] %} {% for file_name in message['files-names'] %}
<div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto"> <div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
<i class="far fa-file fa-3x"></i> {{ file_name }} <i class="far fa-file fa-3x"></i> {{ file_name }}