chg: [process chat text] process text files from chats + correlations

This commit is contained in:
terrtia 2025-01-28 10:42:21 +01:00
parent bc83726212
commit 5e10bfcf6a
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
11 changed files with 136 additions and 77 deletions

View file

@ -105,7 +105,12 @@ class FeederImporter(AbstractImporter):
objs_messages = []
for obj in objs:
if obj.type == 'item': # object save on disk as file (Items)
# Text created
if obj.type == 'item':
if obj.exists():
objs_messages.append({'obj': obj, 'message': feeder_name})
# object save on disk as file (Items)
else:
gzip64_content = feeder.get_gzip64_content()
relay_message = f'{feeder_name} {gzip64_content}'
objs_messages.append({'obj': obj, 'message': relay_message})

View file

@ -19,10 +19,12 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages
##################################
from importer.feeders.Default import DefaultFeeder
from lib.ail_core import get_chat_instance_name
from lib.objects.Chats import Chat
from lib.objects import ChatSubChannels
from lib.objects import ChatThreads
from lib.objects import Images
from lib.objects import Items
from lib.objects import Messages
from lib.objects import FilesNames
# from lib.objects import Files
@ -87,6 +89,9 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
def get_message_id(self):
    """Return the message identifier stored under ``meta -> id`` of the feeder JSON."""
    meta = self.json_data['meta']
    return meta['id']
def get_media_id(self):
    """Return ``meta -> media -> id``, or None when no media entry or id is present."""
    media = self.json_data['meta'].get('media', {})
    return media.get('id')
def get_media_name(self):
    """Return ``meta -> media -> name``, or None when no media entry or name is present."""
    media = self.json_data['meta'].get('media', {})
    return media.get('name')
@ -154,12 +159,15 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
# channel id
# thread id
# TODO sanitize obj type
# TODO sanitize obj type ################### CHECK IF IS MESSAGE BY DEFAULT
obj_type = self.get_obj_type()
if obj_type == 'image':
self.obj = Images.Image(self.json_data['data-sha256'])
elif obj_type == 'text':
d = self.get_date()
instance_name = get_chat_instance_name(self.get_chat_instance_uuid())
item_id = f'{instance_name}/{d[0:4]}/{d[4:6]}/{d[6:8]}/{self.json_data["data-sha256"]}.gz'
self.obj = Items.Item(item_id)
else:
obj_id = Messages.create_obj_id(self.get_chat_instance_uuid(), chat_id, message_id, timestamp, thread_id=thread_id)
self.obj = Messages.Message(obj_id)
@ -195,8 +203,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
return chat
##############################################################################################################################
###########################################################################################################
def process_chat(self, new_objs, obj, date, timestamp, feeder_timestamp, reply_id=None):
meta = self.json_data['meta']['chat'] # todo replace me by function
@ -404,7 +411,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
obj.add_reaction(reaction['reaction'], int(reaction['count']))
elif self.obj.type == 'chat':
pass
else:
else: # IMAGE + ITEM
chat_id = self.get_chat_id()
thread_id = self.get_thread_id()
channel_id = self.get_subchannel_id()
@ -416,7 +423,12 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
message.create('')
objs.add(message)
if message.exists(): # TODO Correlation user-account image/filename ????
if message.exists():
# REACTIONS
for reaction in self.get_reactions():
message.add_reaction(reaction['reaction'], int(reaction['count']))
if self.obj.type == 'image':
obj = Images.create(self.get_message_content())
obj.add(date, message)
obj.set_parent(obj_global_id=message.get_global_id())
@ -426,8 +438,17 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
if media_name:
FilesNames.FilesNames().create(media_name, date, message, file_obj=obj)
for reaction in self.get_reactions():
message.add_reaction(reaction['reaction'], int(reaction['count']))
elif self.obj.type == 'item':
obj = self.obj
if not obj.exists():
obj.create(self.get_message_content())
obj.add_correlation('message', '', message.id)
# FILENAME
media_name = self.get_media_name()
if media_name:
file_name = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj)
file_name.add_correlation('item', '', obj.id)
for obj in objs: # TODO PERF avoid parsing metas multiple times

View file

@ -134,6 +134,14 @@ def unpack_correl_objs_id(obj_type, correl_objs_id, r_type='tuple'):
##-- AIL OBJECTS --##
def get_chat_instance_name(chat_instance):
    """Translate a known chat instance UUID into its protocol name.

    :param chat_instance: chat instance UUID
    :return: 'telegram' or 'discord' for the two known UUIDs,
             otherwise the value that was passed in, unchanged
    """
    known_instances = (
        ('00098785-7e70-5d12-a120-c5cdc1252b2b', 'telegram'),
        ('d2426e3f-22f3-5a57-9a98-d2ae9794e683', 'discord'),
    )
    for instance_uuid, instance_name in known_instances:
        if chat_instance == instance_uuid:
            return instance_name
    return chat_instance
#### Redis ####
def _parse_zscan(response):

View file

@ -344,7 +344,7 @@ def get_username_meta_from_global_id(username_global_id):
###############################################################################
# TODO Pagination
def list_messages_to_dict(l_messages_id, translation_target=None):
options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
options = {'content', 'files', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
meta = {}
curr_date = None
for mess_id in l_messages_id:
@ -621,7 +621,7 @@ def _get_chat_card_meta_options():
return {'created_at', 'icon', 'info', 'nb_participants', 'origin_link', 'subchannels', 'tags_safe', 'threads', 'translation', 'username'}
def _get_message_bloc_meta_options():
return {'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'}
return {'chat', 'content', 'files', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'}
def get_message_report(l_mess): # TODO Force language + translation
translation_target = 'en'
@ -900,7 +900,7 @@ def api_get_message(message_id, translation_target=None):
message = Messages.Message(message_id)
if not message.exists():
return {"status": "error", "reason": "Unknown uuid"}, 404
meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
if 'forwarded_from' in meta:
chat = get_obj_chat_from_global_id(meta['forwarded_from'])
meta['forwarded_from'] = chat.get_meta({'icon'})
@ -993,7 +993,7 @@ def api_chat_messages(subtype, chat_id):
if meta['subchannels']:
meta['subchannels'] = get_subchannels_meta_from_global_id(meta['subchannels'])
else:
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
meta['messages'], _, _ = chat.get_messages(nb=-1, options=options)
return meta, 200
@ -1009,7 +1009,7 @@ def api_subchannel_messages(subtype, subchannel_id):
meta['threads'] = get_threads_metas(meta['threads'])
if meta.get('username'):
meta['username'] = get_username_meta_from_global_id(meta['username'])
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
meta['messages'], _, _ = subchannel.get_messages(nb=-1, options=options)
return meta, 200
@ -1018,7 +1018,7 @@ def api_thread_messages(subtype, thread_id):
if not thread.exists():
return {"status": "error", "reason": "Unknown thread"}, 404
meta = thread.get_meta({'chat', 'nb_messages', 'nb_participants'})
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
meta['messages'], _, _ = thread.get_messages(nb=-1, options=options)
return meta, 200

View file

@ -53,11 +53,11 @@ CORRELATION_TYPES_BY_OBJ = {
"dom-hash": ["domain", "item"],
"etag": ["domain"],
"favicon": ["domain", "item"], # TODO Decoded
"file-name": ["chat", "message"],
"file-name": ["chat", "item", "message"],
"hhhash": ["domain"],
"image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ????
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "pgp", "screenshot", "title", "username"], # chat ???
"message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"],
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "message", "pgp", "screenshot", "title", "username"], # chat ???
"message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "item", "ocr", "pgp", "user-account"],
"ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"],
"pgp": ["domain", "item", "message", "ocr"],
"qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ?????

View file

@ -59,6 +59,20 @@ class Item(AbstractObject):
"""
return item_basic.get_item_date(self.id, add_separator=separator)
def get_link(self, flask_context=False):
    """Return the URL of the page showing this item.

    :param flask_context: when truthy, build the URL with Flask's ``url_for``;
                          otherwise build it from the module-level ``baseurl``
    """
    if not flask_context:
        return f'{baseurl}/object/item?id={self.id}'
    return url_for('objects_item.showItem', id=self.id)
def get_svg_icon(self):
    """Return the SVG icon description of this item (red when the item is crawled)."""
    color = 'red' if is_crawled(self.id) else '#332288'
    # NOTE(review): 'icon' is an empty string as seen here - confirm no glyph was lost
    return {'style': '', 'icon': '', 'color': color, 'radius': 5}
def get_source(self):
"""
Returns Item source/feeder name
@ -149,18 +163,13 @@ class Item(AbstractObject):
if len(basename) > 255:
new_basename = f'{basename[:215]}{str(uuid4())}.gz'
self.id = rreplace(self.id, basename, new_basename, 1)
return self.id
# # TODO: sanitize_id
# # TODO: check if already exists ?
# # TODO: check if duplicate
def save_on_disk(self, content, binary=True, compressed=False, b64=False):
if not binary:
def _save_on_disk(self, content, content_type='bytes', b64=False, compressed=False):
if not content_type == 'bytes':
content = content.encode()
if b64:
content = base64.standard_b64decode(content)
@ -181,22 +190,10 @@ class Item(AbstractObject):
# tags
# origin
# duplicate -> all item iterations ???
# father
#
def create(self, content, tags, father=None, duplicates=[], _save=True):
if _save:
self.save_on_disk(content, binary=True, compressed=False, base64=False)
# # TODO:
# for tag in tags:
# self.add_tag(tag)
if father:
pass
for obj_id in duplicates:
for dup in duplicates[obj_id]:
self.add_duplicate(obj_id, dup['algo'], dup['similarity'])
def create(self, content, content_type='bytes', b64=False, compressed=False):
    """Create the item by handing ``content`` to ``_save_on_disk``.

    ``content_type``, ``b64`` and ``compressed`` are forwarded unchanged.
    """
    save_options = {'content_type': content_type, 'b64': b64, 'compressed': compressed}
    self._save_on_disk(content, **save_options)
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
# TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
@ -211,20 +208,6 @@ class Item(AbstractObject):
####################################################################################
####################################################################################
def get_link(self, flask_context=False):
if flask_context:
url = url_for('objects_item.showItem', id=self.id)
else:
url = f'{baseurl}/object/item?id={self.id}'
return url
def get_svg_icon(self):
if is_crawled(self.id):
color = 'red'
else:
color = '#332288'
return {'style': '', 'icon': '', 'color': color, 'radius': 5}
def get_misp_object(self):
obj = MISPObject('ail-leak', standalone=True)
obj_date = self.get_date()
@ -242,9 +225,6 @@ class Item(AbstractObject):
obj_attr.add_tag(tag)
return obj
def exist_correlation(self):
pass
def is_crawled(self):
    """Tell whether this item's id starts with the ``crawled`` prefix."""
    item_id = self.id
    return item_id.startswith('crawled')

View file

@ -180,6 +180,28 @@ class Message(AbstractObject):
names.append(name[1:])
return names
def get_nb_files(self):
    """Return the number of 'item' correlations of this message."""
    nb_items = self.get_nb_correlation('item')
    return nb_items
def get_files(self, file_names=None):
    """Group the items (files) correlated to this message by file name.

    :param file_names: file names to look up; when empty or None they are
                       fetched with ``get_files_names()``
    :return: dict mapping each file name to the list of item ids correlated
             to that file name (possibly empty). Items correlated to the
             message but to none of the file names are listed under the
             extra ``'undefined'`` key, which is only present when needed.
    """
    if not file_names:
        file_names = self.get_files_names()
    files = {}
    named_items = set()
    for file_name in file_names:
        # [1:] drops the first character of each correlation value
        # (presumably the ':' left by the empty subtype - verify against AbstractObject)
        item_ids = [it[1:] for it in self.get_correlation_iter('file-name', '', file_name, 'item')]
        files[file_name] = item_ids
        named_items.update(item_ids)
    # Compare UNIQUE items: the same item can be correlated to several file
    # names, and counting each (file name, item) pair would hide items that
    # have no file name at all.
    if len(named_items) < self.get_nb_files():
        correlated = self.get_correlation('item').get('item') or ()
        files['undefined'] = [f[1:] for f in correlated if f[1:] not in named_items]
    return files
def get_reactions(self):
return r_object.hgetall(f'meta:reactions:{self.type}::{self.id}')
@ -322,6 +344,9 @@ class Message(AbstractObject):
meta['qrcodes'] = self.get_qrcodes()
if 'files-names' in options:
meta['files-names'] = self.get_files_names()
if 'files' in options:
if meta.get('files-names'):
meta['files'] = self.get_files(file_names=meta['files-names'])
if 'reactions' in options:
meta['reactions'] = self.get_reactions()
if 'language' in options:

View file

@ -275,7 +275,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC):
def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message
message = Messages.Message(message[9:])
if not options:
options = {'barcodes', 'content', 'files-names', 'forwarded_from', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}
options = {'barcodes', 'content', 'files', 'files-names', 'forwarded_from', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}
meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target)
return meta

View file

@ -125,8 +125,12 @@ class Global(AbstractModule):
if r_result:
return self.obj.id
else:
if self.obj.exists():
self.add_message_to_queue(obj=self.obj, queue='Item')
else:
self.logger.info(f"Empty Item: {message} not processed")
elif self.obj.type == 'message' or self.obj.type == 'ocr':
# TODO send to specific object queue => image, ...
self.add_message_to_queue(obj=self.obj, queue='Item')
@ -217,8 +221,6 @@ class Global(AbstractModule):
gunzipped_bytes_obj = fo.read()
except Exception as e:
self.logger.warning(f'Global; Invalid Gzip file: {filename}, {e}')
print(f'Global; Invalid Gzip file: {filename}, {e}')
return gunzipped_bytes_obj
def rreplace(self, s, old, new, occurrence):

View file

@ -102,7 +102,7 @@ class Mixer(AbstractModule):
# feeder_name - object
if len(splitted) == 1: # feeder_name - object (content already saved)
feeder_name = message
gzip64encoded = None
gzip64encoded = ''
# Feeder name in message: "feeder obj_id gzip64encoded"
elif len(splitted) == 2: # gzip64encoded content

View file

@ -129,7 +129,25 @@
{% endif %}
{% endfor %}
{% endif %}
{% if message['files-names'] %}
{% if message['files'] %}
{# One box per file name; item ids are passed through url_for so they are URL-encoded #}
{% for file_name in message['files'] %}
    {% set file_items = message['files'][file_name] %}
    {% if file_items | length > 1 %}
        {# Several items share this file name: list one numbered link per item #}
        <div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
            <i class="far fa-file fa-3x"></i> {{ file_name }}
            {% for item in file_items %}
                <br>
                <a href="{{ url_for('objects_item.showItem', id=item) }}"><i class="far fa-file"></i> {{ loop.index }}</a>
            {% endfor %}
        </div>
    {% elif file_items | length == 1 %}
        {# Exactly one item: the whole box links to it #}
        <a href="{{ url_for('objects_item.showItem', id=file_items[0]) }}">
            <div class="flex-shrink-1 bg-white border-primary text-primary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
                <i class="far fa-file fa-3x"></i> {{ file_name }}
            </div>
        </a>
    {% else %}
        {# File name without any correlated item: show the name only, no link #}
        <div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
            <i class="far fa-file fa-3x"></i> {{ file_name }}
        </div>
    {% endif %}
{% endfor %}
{% elif message['files-names'] %}
{% for file_name in message['files-names'] %}
<div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
<i class="far fa-file fa-3x"></i> {{ file_name }}