mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-01-31 06:26:14 +00:00
chg: [process chat text] process text files from chats + correlations
This commit is contained in:
parent
bc83726212
commit
5e10bfcf6a
11 changed files with 136 additions and 77 deletions
|
@ -105,7 +105,12 @@ class FeederImporter(AbstractImporter):
|
|||
|
||||
objs_messages = []
|
||||
for obj in objs:
|
||||
if obj.type == 'item': # object save on disk as file (Items)
|
||||
# Text created
|
||||
if obj.type == 'item':
|
||||
if obj.exists():
|
||||
objs_messages.append({'obj': obj, 'message': feeder_name})
|
||||
# object saved on disk as a file (Items)
|
||||
else:
|
||||
gzip64_content = feeder.get_gzip64_content()
|
||||
relay_message = f'{feeder_name} {gzip64_content}'
|
||||
objs_messages.append({'obj': obj, 'message': relay_message})
|
||||
|
|
|
@ -19,10 +19,12 @@ sys.path.append(os.environ['AIL_BIN'])
|
|||
# Import Project packages
|
||||
##################################
|
||||
from importer.feeders.Default import DefaultFeeder
|
||||
from lib.ail_core import get_chat_instance_name
|
||||
from lib.objects.Chats import Chat
|
||||
from lib.objects import ChatSubChannels
|
||||
from lib.objects import ChatThreads
|
||||
from lib.objects import Images
|
||||
from lib.objects import Items
|
||||
from lib.objects import Messages
|
||||
from lib.objects import FilesNames
|
||||
# from lib.objects import Files
|
||||
|
@ -87,6 +89,9 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
def get_message_id(self):
    """Return the message id stored under json_data['meta']['id']."""
    meta = self.json_data['meta']
    return meta['id']
|
||||
|
||||
def get_media_id(self):
    """
    Return the id of the media attached to the message.

    :return: json_data['meta']['media']['id'], or None when the 'media'
             entry is missing, empty or null, or has no 'id'
    """
    # `.get('media', {})` only falls back when the key is absent; a feeder
    # sending "media": null would make the chained .get() raise.
    media = self.json_data['meta'].get('media') or {}
    return media.get('id')
|
||||
|
||||
def get_media_name(self):
    """
    Return the name of the media attached to the message.

    :return: json_data['meta']['media']['name'], or None when the 'media'
             entry is missing, empty or null, or has no 'name'
    """
    # `.get('media', {})` only falls back when the key is absent; a feeder
    # sending "media": null would make the chained .get() raise.
    media = self.json_data['meta'].get('media') or {}
    return media.get('name')
|
||||
|
||||
|
@ -154,12 +159,15 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
# channel id
|
||||
# thread id
|
||||
|
||||
# TODO sanitize obj type
|
||||
# TODO sanitize obj type ################### CHECK IF IS MESSAGE BY DEFAULT
|
||||
obj_type = self.get_obj_type()
|
||||
|
||||
if obj_type == 'image':
|
||||
self.obj = Images.Image(self.json_data['data-sha256'])
|
||||
|
||||
elif obj_type == 'text':
|
||||
d = self.get_date()
|
||||
instance_name = get_chat_instance_name(self.get_chat_instance_uuid())
|
||||
item_id = f'{instance_name}/{d[0:4]}/{d[4:6]}/{d[6:8]}/{self.json_data["data-sha256"]}.gz'
|
||||
self.obj = Items.Item(item_id)
|
||||
else:
|
||||
obj_id = Messages.create_obj_id(self.get_chat_instance_uuid(), chat_id, message_id, timestamp, thread_id=thread_id)
|
||||
self.obj = Messages.Message(obj_id)
|
||||
|
@ -195,8 +203,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
|
||||
return chat
|
||||
|
||||
##############################################################################################################################
|
||||
|
||||
###########################################################################################################
|
||||
|
||||
def process_chat(self, new_objs, obj, date, timestamp, feeder_timestamp, reply_id=None):
|
||||
meta = self.json_data['meta']['chat'] # todo replace me by function
|
||||
|
@ -404,7 +411,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
obj.add_reaction(reaction['reaction'], int(reaction['count']))
|
||||
elif self.obj.type == 'chat':
|
||||
pass
|
||||
else:
|
||||
else: # IMAGE + ITEM
|
||||
chat_id = self.get_chat_id()
|
||||
thread_id = self.get_thread_id()
|
||||
channel_id = self.get_subchannel_id()
|
||||
|
@ -416,7 +423,12 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
message.create('')
|
||||
objs.add(message)
|
||||
|
||||
if message.exists(): # TODO Correlation user-account image/filename ????
|
||||
if message.exists():
|
||||
# REACTIONS
|
||||
for reaction in self.get_reactions():
|
||||
message.add_reaction(reaction['reaction'], int(reaction['count']))
|
||||
|
||||
if self.obj.type == 'image':
|
||||
obj = Images.create(self.get_message_content())
|
||||
obj.add(date, message)
|
||||
obj.set_parent(obj_global_id=message.get_global_id())
|
||||
|
@ -426,8 +438,17 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
if media_name:
|
||||
FilesNames.FilesNames().create(media_name, date, message, file_obj=obj)
|
||||
|
||||
for reaction in self.get_reactions():
|
||||
message.add_reaction(reaction['reaction'], int(reaction['count']))
|
||||
elif self.obj.type == 'item':
|
||||
obj = self.obj
|
||||
if not obj.exists():
|
||||
obj.create(self.get_message_content())
|
||||
obj.add_correlation('message', '', message.id)
|
||||
|
||||
# FILENAME
|
||||
media_name = self.get_media_name()
|
||||
if media_name:
|
||||
file_name = FilesNames.FilesNames().create(media_name, date, message, file_obj=obj)
|
||||
file_name.add_correlation('item', '', obj.id)
|
||||
|
||||
for obj in objs: # TODO PERF avoid parsing metas multiple times
|
||||
|
||||
|
|
|
@ -134,6 +134,14 @@ def unpack_correl_objs_id(obj_type, correl_objs_id, r_type='tuple'):
|
|||
|
||||
##-- AIL OBJECTS --##
|
||||
|
||||
def get_chat_instance_name(chat_instance):
    """
    Translate a known chat instance UUID into its short name.

    :param chat_instance: chat instance UUID
    :return: 'telegram' or 'discord' for the two known UUIDs, otherwise the
             value passed in, unchanged
    """
    known_instances = (
        ('00098785-7e70-5d12-a120-c5cdc1252b2b', 'telegram'),
        ('d2426e3f-22f3-5a57-9a98-d2ae9794e683', 'discord'),
    )
    for instance_uuid, instance_name in known_instances:
        if chat_instance == instance_uuid:
            return instance_name
    # Unknown instance: fall back to the raw value
    return chat_instance
|
||||
|
||||
#### Redis ####
|
||||
|
||||
def _parse_zscan(response):
|
||||
|
|
|
@ -344,7 +344,7 @@ def get_username_meta_from_global_id(username_global_id):
|
|||
###############################################################################
|
||||
# TODO Pagination
|
||||
def list_messages_to_dict(l_messages_id, translation_target=None):
|
||||
options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
|
||||
options = {'content', 'files', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
|
||||
meta = {}
|
||||
curr_date = None
|
||||
for mess_id in l_messages_id:
|
||||
|
@ -621,7 +621,7 @@ def _get_chat_card_meta_options():
|
|||
return {'created_at', 'icon', 'info', 'nb_participants', 'origin_link', 'subchannels', 'tags_safe', 'threads', 'translation', 'username'}
|
||||
|
||||
def _get_message_bloc_meta_options():
|
||||
return {'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'}
|
||||
return {'chat', 'content', 'files', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions','thread', 'translation', 'user-account'}
|
||||
|
||||
def get_message_report(l_mess): # TODO Force language + translation
|
||||
translation_target = 'en'
|
||||
|
@ -900,7 +900,7 @@ def api_get_message(message_id, translation_target=None):
|
|||
message = Messages.Message(message_id)
|
||||
if not message.exists():
|
||||
return {"status": "error", "reason": "Unknown uuid"}, 404
|
||||
meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
|
||||
meta = message.get_meta({'barcodes', 'chat', 'container', 'content', 'files', 'files-names', 'forwarded_from', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
|
||||
if 'forwarded_from' in meta:
|
||||
chat = get_obj_chat_from_global_id(meta['forwarded_from'])
|
||||
meta['forwarded_from'] = chat.get_meta({'icon'})
|
||||
|
@ -993,7 +993,7 @@ def api_chat_messages(subtype, chat_id):
|
|||
if meta['subchannels']:
|
||||
meta['subchannels'] = get_subchannels_meta_from_global_id(meta['subchannels'])
|
||||
else:
|
||||
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
|
||||
options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
|
||||
meta['messages'], _, _ = chat.get_messages(nb=-1, options=options)
|
||||
return meta, 200
|
||||
|
||||
|
@ -1009,7 +1009,7 @@ def api_subchannel_messages(subtype, subchannel_id):
|
|||
meta['threads'] = get_threads_metas(meta['threads'])
|
||||
if meta.get('username'):
|
||||
meta['username'] = get_username_meta_from_global_id(meta['username'])
|
||||
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
|
||||
options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
|
||||
meta['messages'], _, _ = subchannel.get_messages(nb=-1, options=options)
|
||||
return meta, 200
|
||||
|
||||
|
@ -1018,7 +1018,7 @@ def api_thread_messages(subtype, thread_id):
|
|||
if not thread.exists():
|
||||
return {"status": "error", "reason": "Unknown thread"}, 404
|
||||
meta = thread.get_meta({'chat', 'nb_messages', 'nb_participants'})
|
||||
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
|
||||
options = {'content', 'files', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'user-account'}
|
||||
meta['messages'], _, _ = thread.get_messages(nb=-1, options=options)
|
||||
return meta, 200
|
||||
|
||||
|
|
|
@ -53,11 +53,11 @@ CORRELATION_TYPES_BY_OBJ = {
|
|||
"dom-hash": ["domain", "item"],
|
||||
"etag": ["domain"],
|
||||
"favicon": ["domain", "item"], # TODO Decoded
|
||||
"file-name": ["chat", "message"],
|
||||
"file-name": ["chat", "item", "message"],
|
||||
"hhhash": ["domain"],
|
||||
"image": ["barcode", "chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ????
|
||||
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "pgp", "screenshot", "title", "username"], # chat ???
|
||||
"message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"],
|
||||
"item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "file-name", "message", "pgp", "screenshot", "title", "username"], # chat ???
|
||||
"message": ["barcode", "chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "item", "ocr", "pgp", "user-account"],
|
||||
"ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"],
|
||||
"pgp": ["domain", "item", "message", "ocr"],
|
||||
"qrcode": ["chat", "cve", "cryptocurrency", "decoded", "domain", "image", "message", "screenshot"], # "chat-subchannel", "chat-thread" ?????
|
||||
|
|
|
@ -59,6 +59,20 @@ class Item(AbstractObject):
|
|||
"""
|
||||
return item_basic.get_item_date(self.id, add_separator=separator)
|
||||
|
||||
def get_link(self, flask_context=False):
    """
    Return the URL of the page showing this item.

    :param flask_context: when true, build the URL with url_for() on the
                          'objects_item.showItem' endpoint; otherwise build
                          it from baseurl
    :return: URL string
    """
    if not flask_context:
        return f'{baseurl}/object/item?id={self.id}'
    return url_for('objects_item.showItem', id=self.id)
|
||||
|
||||
def get_svg_icon(self):
    """
    Return the icon description used to draw this item.

    :return: dict with 'style', 'icon', 'color' and 'radius'; the color is
             'red' when is_crawled() is true for the item id, '#332288'
             otherwise
    """
    color = 'red' if is_crawled(self.id) else '#332288'
    return {'style': '', 'icon': '', 'color': color, 'radius': 5}
|
||||
|
||||
def get_source(self):
|
||||
"""
|
||||
Returns Item source/feeder name
|
||||
|
@ -149,18 +163,13 @@ class Item(AbstractObject):
|
|||
if len(basename) > 255:
|
||||
new_basename = f'{basename[:215]}{str(uuid4())}.gz'
|
||||
self.id = rreplace(self.id, basename, new_basename, 1)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return self.id
|
||||
|
||||
# # TODO: sanitize_id
|
||||
# # TODO: check if already exists ?
|
||||
# # TODO: check if duplicate
|
||||
def save_on_disk(self, content, binary=True, compressed=False, b64=False):
|
||||
if not binary:
|
||||
def _save_on_disk(self, content, content_type='bytes', b64=False, compressed=False):
|
||||
if not content_type == 'bytes':
|
||||
content = content.encode()
|
||||
if b64:
|
||||
content = base64.standard_b64decode(content)
|
||||
|
@ -181,22 +190,10 @@ class Item(AbstractObject):
|
|||
# tags
|
||||
# origin
|
||||
# duplicate -> all item iterations ???
|
||||
# father
|
||||
#
|
||||
def create(self, content, tags, father=None, duplicates=[], _save=True):
|
||||
if _save:
|
||||
self.save_on_disk(content, binary=True, compressed=False, base64=False)
|
||||
|
||||
# # TODO:
|
||||
# for tag in tags:
|
||||
# self.add_tag(tag)
|
||||
|
||||
if father:
|
||||
pass
|
||||
|
||||
for obj_id in duplicates:
|
||||
for dup in duplicates[obj_id]:
|
||||
self.add_duplicate(obj_id, dup['algo'], dup['similarity'])
|
||||
|
||||
def create(self, content, content_type='bytes', b64=False, compressed=False):
    """
    Create the item by handing its content to _save_on_disk().

    :param content: item content
    :param content_type: forwarded as-is to _save_on_disk() (default 'bytes')
    :param b64: forwarded as-is to _save_on_disk()
    :param compressed: forwarded as-is to _save_on_disk()
    """
    save_options = {'content_type': content_type, 'b64': b64, 'compressed': compressed}
    self._save_on_disk(content, **save_options)
|
||||
|
||||
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
|
||||
# TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
|
||||
|
@ -211,20 +208,6 @@ class Item(AbstractObject):
|
|||
####################################################################################
|
||||
####################################################################################
|
||||
|
||||
def get_link(self, flask_context=False):
|
||||
if flask_context:
|
||||
url = url_for('objects_item.showItem', id=self.id)
|
||||
else:
|
||||
url = f'{baseurl}/object/item?id={self.id}'
|
||||
return url
|
||||
|
||||
def get_svg_icon(self):
|
||||
if is_crawled(self.id):
|
||||
color = 'red'
|
||||
else:
|
||||
color = '#332288'
|
||||
return {'style': '', 'icon': '', 'color': color, 'radius': 5}
|
||||
|
||||
def get_misp_object(self):
|
||||
obj = MISPObject('ail-leak', standalone=True)
|
||||
obj_date = self.get_date()
|
||||
|
@ -242,9 +225,6 @@ class Item(AbstractObject):
|
|||
obj_attr.add_tag(tag)
|
||||
return obj
|
||||
|
||||
def exist_correlation(self):
|
||||
pass
|
||||
|
||||
def is_crawled(self):
    """Return True when the item id starts with 'crawled'."""
    item_id = self.id
    return item_id.startswith('crawled')
|
||||
|
||||
|
|
|
@ -180,6 +180,28 @@ class Message(AbstractObject):
|
|||
names.append(name[1:])
|
||||
return names
|
||||
|
||||
def get_nb_files(self):
    """Return the number of 'item' correlations of this message, as reported by get_nb_correlation()."""
    nb_items = self.get_nb_correlation('item')
    return nb_items
|
||||
|
||||
def get_files(self, file_names=None):
    """
    Group the items (files) correlated with this message by file name.

    :param file_names: file names to look up; when empty or None they are
                       fetched with self.get_files_names()
    :return: dict mapping each file name to the list of item ids correlated
             with that file name. When the message has more item
             correlations than distinct items found through the file names,
             the remaining items are listed under the 'undefined' key.
    """
    if not file_names:
        file_names = self.get_files_names()

    files = {}
    named_items = set()
    for file_name in file_names:
        # The first character of each correlation value is dropped
        # (presumably the subtype separator -- TODO confirm).
        item_ids = [it[1:] for it in self.get_correlation_iter('file-name', '', file_name, 'item')]
        files[file_name] = item_ids
        named_items.update(item_ids)

    # Compare against the number of *distinct* named items: an item linked
    # under several file names must not be counted more than once, otherwise
    # items without any file name could be silently left out.
    if len(named_items) < self.get_nb_files():
        correlated_items = self.get_correlation('item').get('item') or ()
        files['undefined'] = [f[1:] for f in correlated_items if f[1:] not in named_items]
    return files
|
||||
|
||||
def get_reactions(self):
    """Return the result of r_object.hgetall() on this object's 'meta:reactions:<type>::<id>' key."""
    reactions_key = f'meta:reactions:{self.type}::{self.id}'
    return r_object.hgetall(reactions_key)
|
||||
|
||||
|
@ -322,6 +344,9 @@ class Message(AbstractObject):
|
|||
meta['qrcodes'] = self.get_qrcodes()
|
||||
if 'files-names' in options:
|
||||
meta['files-names'] = self.get_files_names()
|
||||
if 'files' in options:
|
||||
if meta.get('files-names'):
|
||||
meta['files'] = self.get_files(file_names=meta['files-names'])
|
||||
if 'reactions' in options:
|
||||
meta['reactions'] = self.get_reactions()
|
||||
if 'language' in options:
|
||||
|
|
|
@ -275,7 +275,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC):
|
|||
def get_message_meta(self, message, timestamp=None, translation_target='', options=None):  # TODO handle file message
    """
    Return the meta of a message.

    :param message: message reference; its first 9 characters are stripped
                    to obtain the Message id (presumably a 'message::'
                    prefix -- TODO confirm against callers)
    :param timestamp: forwarded to Message.get_meta()
    :param translation_target: forwarded to Message.get_meta()
    :param options: set of meta options; when empty or None a default set
                    (including 'files') is used
    :return: result of Message.get_meta() for the requested options
    """
    message = Messages.Message(message[9:])
    if not options:
        options = {'barcodes', 'content', 'files', 'files-names', 'forwarded_from', 'images', 'language',
                   'link', 'parent', 'parent_meta', 'qrcodes', 'reactions', 'thread', 'translation',
                   'user-account'}
    meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target)
    return meta
|
||||
|
||||
|
|
|
@ -125,8 +125,12 @@ class Global(AbstractModule):
|
|||
if r_result:
|
||||
return self.obj.id
|
||||
|
||||
else:
|
||||
if self.obj.exists():
|
||||
self.add_message_to_queue(obj=self.obj, queue='Item')
|
||||
else:
|
||||
self.logger.info(f"Empty Item: {message} not processed")
|
||||
|
||||
elif self.obj.type == 'message' or self.obj.type == 'ocr':
|
||||
# TODO send to specific object queue => image, ...
|
||||
self.add_message_to_queue(obj=self.obj, queue='Item')
|
||||
|
@ -217,8 +221,6 @@ class Global(AbstractModule):
|
|||
gunzipped_bytes_obj = fo.read()
|
||||
except Exception as e:
|
||||
self.logger.warning(f'Global; Invalid Gzip file: {filename}, {e}')
|
||||
print(f'Global; Invalid Gzip file: {filename}, {e}')
|
||||
|
||||
return gunzipped_bytes_obj
|
||||
|
||||
def rreplace(self, s, old, new, occurrence):
|
||||
|
|
|
@ -102,7 +102,7 @@ class Mixer(AbstractModule):
|
|||
# feeder_name - object
|
||||
if len(splitted) == 1: # feeder_name - object (content already saved)
|
||||
feeder_name = message
|
||||
gzip64encoded = None
|
||||
gzip64encoded = ''
|
||||
|
||||
# Feeder name in message: "feeder obj_id gzip64encoded"
|
||||
elif len(splitted) == 2: # gzip64encoded content
|
||||
|
|
|
@ -129,7 +129,25 @@
|
|||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
{% if message['files-names'] %}
|
||||
{% if message['files'] %}
|
||||
{% for file_name in message['files'] %}
|
||||
{% if message['files'][file_name] | length > 1 %}
|
||||
<div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
|
||||
<i class="far fa-file fa-3x"></i> {{ file_name }}
|
||||
{% for item in message['files'][file_name] %}
|
||||
<br>
|
||||
<a href="{{ url_for('objects_item.showItem') }}?id={{ item }}"><i class="far fa-file"></i> {{ loop.index }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% else %}
|
||||
<a href="{{ url_for('objects_item.showItem') }}?id={{ message['files'][file_name][0] }}">
|
||||
<div class="flex-shrink-1 bg-white border-primary text-primary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
|
||||
<i class="far fa-file fa-3x"></i> {{ file_name }}
|
||||
</div>
|
||||
</a>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% elif message['files-names'] %}
|
||||
{% for file_name in message['files-names'] %}
|
||||
<div class="flex-shrink-1 bg-white border-primary text-secondary rounded py-2 px-3 ml-4 mb-3" style="overflow-x: auto">
|
||||
<i class="far fa-file fa-3x"></i> {{ file_name }}
|
||||
|
|
Loading…
Add table
Reference in a new issue