From 2db54def46625e10eb0bf0b038c5794d826037f5 Mon Sep 17 00:00:00 2001 From: terrtia Date: Mon, 25 Mar 2024 16:36:24 +0100 Subject: [PATCH] fix: [chat] fix subchannel-message correlation + fix empty message language detection --- bin/importer/feeders/abstract_chats_feeder.py | 56 ++++++++++++++++++- bin/lib/Language.py | 22 ++++++-- bin/lib/chats_viewer.py | 19 ++++++- bin/lib/objects/Messages.py | 2 +- bin/modules/Languages.py | 4 ++ tools/reprocess_objects.py | 37 ++++++++++-- update/v5.4/Update.py | 2 + 7 files changed, 126 insertions(+), 16 deletions(-) diff --git a/bin/importer/feeders/abstract_chats_feeder.py b/bin/importer/feeders/abstract_chats_feeder.py index 6b8f1041..8b337e9f 100755 --- a/bin/importer/feeders/abstract_chats_feeder.py +++ b/bin/importer/feeders/abstract_chats_feeder.py @@ -206,8 +206,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC): subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid()) thread = None - # TODO correlation with obj = message/image - subchannel.add(date) + subchannel.add(date, obj) if meta.get('date'): # TODO check if already exists subchannel.set_created_at(int(meta['date']['timestamp'])) @@ -358,7 +357,58 @@ class AbstractChatFeeder(DefaultFeeder, ABC): # CHAT chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id) - # Message forward + # # TODO HANDLE OTHERS OBJECT TYPE + # # TODO MAKE IT GENERIC FOR OTHERS CHATS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # # Message forward + Discussion + # if self.get_json_meta().get('forward'): + # discussion_id = self.get_json_meta().get('discussion') + # forward_from = self.get_message_forward() + # + # if discussion_id: # TODO HANDLE FORWARDED MESSAGES FROM EXTERNAL CHANNELS + # chat_forward_id = forward_from['from']['id'] + # message_forward_id = forward_from['from']['channel_post'] + # + # # if chat_forward_id == discussion_id: + # # linked_chat = Chat(chat_forward_id, self.get_chat_instance_uuid()) + # # if linked_chat.exists(): + # # # create thread + # # # add message replies for each childrens + # + # # TODO HANDLE THREAD + # # TODO Change FORWARD META FIELDS + # # meta['forward'] = {} + # # # CHAT ID + # # # SUBCHANNEL ID -> can be None + # # # Message ID + # + # # meta['forward']['origin'] + # # # same as 'forward' + # + # if self.get_json_meta().get('forward'): + # forward = self.get_message_forward() + # f_chat = forward['chat'] + # f_subchannel = forward.get('subchannel') + # f_id = forward.get('id') + # if not f_subchannel: + # chat_forward = Chat(f_chat, self.get_chat_instance_uuid()) + # if chat_forward.exists(): + # for chat_obj in chat_objs: + # if chat_obj.type == 'chat': + # chat_forward.add_relationship(chat_obj.get_global_id(), 'forward') + # # TODO LIST FORWARDED MESSAGES + # + # + # # Discord -> serverID + subchannel ID + message ID + # # Telegram -> chat ID + Message ID + # # + ORIGIN IDs + # + # + # + # # TODO create relationships graph + # + # + # # TODO REMOVE ME + # # Message forward # TODO handle subchannel + message ID # if self.get_json_meta().get('forward'): # forward_from = self.get_message_forward() # print('-----------------------------------------------------------') diff --git a/bin/lib/Language.py b/bin/lib/Language.py index f6c9ebfb..d7b8c1c8 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -265,7 +265,10 @@ def _get_html2text(content, ignore_links=False): h = html2text.HTML2Text() h.ignore_links = ignore_links h.ignore_images = ignore_links - return h.handle(content) + content = h.handle(content) + if content == '\n\n': + content = '' + return content def _clean_text_to_translate(content, html=False, keys_blocks=True): if html: @@ -482,14 +485,23 @@ class LanguagesDetector: return languages def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ???? + if not content: + return None content = _clean_text_to_translate(content, html=True) - # print('cleaned content', content) - # gcld3 - if len(content) < 100: + if not content: + return None + # DEBUG + # print('-------------------------------------------------------') + # print(content) + # print(len(content)) + # lexilang + if len(content) < 150: + # print('lexilang') languages = self.detect_lexilang(content) + # gcld3 else: # if len(content) >= 200 or not self.lt or force_gcld3: - # print('gcld3') + # print('gcld3') languages = self.detect_gcld3(content) # libretranslate # else: diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index e4b0d82c..6d660c9e 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -323,7 +323,6 @@ def get_username_meta_from_global_id(username_global_id): username = Usernames.Username(username_id, instance_uuid) return username.get_meta() - # TODO Filter ## Instance type ## Chats IDS @@ -380,6 +379,22 @@ def get_nb_messages_iterator(filters={}): nb_messages += chat.get_nb_messages() return nb_messages + +#### FIX #### + +def fix_correlations_subchannel_message(): + for instance_uuid in get_chat_service_instances(): + for chat_id in ChatServiceInstance(instance_uuid).get_chats(): + chat = Chats.Chat(chat_id, instance_uuid) + # subchannels + for subchannel_gid in chat.get_subchannels(): + _, _, subchannel_id = subchannel_gid.split(':', 2) + subchannel = ChatSubChannels.ChatSubChannel(subchannel_id, instance_uuid) + messages, _ = subchannel._get_messages(nb=-1) + for mess in messages: + _, _, message_id = mess[0].split(':', ) + subchannel.add_correlation('message', '', message_id) + #### API #### def api_get_chat_service_instance(chat_instance_uuid): @@ -392,6 +407,7 @@ def api_get_chat(chat_id, chat_instance_uuid, translation_target=None, nb=-1, pa chat = Chats.Chat(chat_id, chat_instance_uuid) if not chat.exists(): return {"status": "error", "reason": "Unknown chat"}, 404 + # print(chat.get_obj_language_stats()) meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target) if meta['username']: meta['username'] = get_username_meta_from_global_id(meta['username']) @@ -437,6 +453,7 @@ def api_get_subchannel(chat_id, chat_instance_uuid, translation_target=None, nb= subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid) if not subchannel.exists(): return {"status": "error", "reason": "Unknown subchannel"}, 404 + # print(subchannel.get_obj_language_stats()) meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target) if meta['chat']: meta['chat'] = get_chat_meta_from_global_id(meta['chat']) diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 6d57d2cf..fbcad8f7 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -113,7 +113,7 @@ class Message(AbstractObject): def get_subchannel(self): subchannel = self.get_correlation('chat-subchannel') if subchannel.get('chat-subchannel'): - return f'user-account:{subchannel["chat-subchannel"].pop()}' + return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}' def get_thread(self): for child in self.get_childrens(): diff --git a/bin/modules/Languages.py b/bin/modules/Languages.py index bff7b0ba..28fbdff6 100755 --- a/bin/modules/Languages.py +++ b/bin/modules/Languages.py @@ -33,6 +33,10 @@ class Languages(AbstractModule): for lang in obj.get_languages(min_probability=0.8, force_gcld3=True): print(lang) domain.add_language(lang) + # Detect Chat Message Language + # elif obj.type == 'message': + # lang = obj.detect_language() + # print(self.obj.id, lang) if __name__ == '__main__': diff --git a/tools/reprocess_objects.py b/tools/reprocess_objects.py index 6d0ffd16..678cf989 100755 --- a/tools/reprocess_objects.py +++ b/tools/reprocess_objects.py @@ -20,17 +20,39 @@ from lib.ail_core import is_object_type from lib import ail_queues from lib.objects import ail_objects -def reprocess_message_objects(object_type): - queue = ail_queues.AILQueue('FeederModuleImporter', -1) - for obj in ail_objects.obj_iterator(object_type, filters={}): - queue.send_message(obj.get_global_id(), message='reprocess') - queue.end() +# from modules.ApiKey import ApiKey +# from modules.Categ import Categ +# from modules.CreditCards import CreditCards +# from modules.DomClassifier import DomClassifier +# from modules.Global import Global +# from modules.Keys import Keys +# from modules.Onion import Onion +# from modules.Telegram import Telegram + +from modules.Languages import Languages + +MODULES = { + 'Languages': Languages +} + +def reprocess_message_objects(object_type, module_name=None): + if module_name: + module = MODULES[module_name]() + for obj in ail_objects.obj_iterator(object_type, filters={}): + module.obj = obj + module.compute(None) + else: + queue = ail_queues.AILQueue('FeederModuleImporter', -1) + for obj in ail_objects.obj_iterator(object_type, filters={}): + queue.send_message(obj.get_global_id(), message='reprocess') + queue.end() if __name__ == "__main__": parser = argparse.ArgumentParser(description='Reprocess AIL Objects') parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True) + parser.add_argument('-m', '--module', type=str, help='AIL Module Name') args = parser.parse_args() if not args.type: @@ -43,4 +65,7 @@ if __name__ == "__main__": if obj_type not in ['item', 'message']: # TODO image raise Exception(f'Currently not supported Object Type: {obj_type}') - reprocess_message_objects(obj_type) \ No newline at end of file + modulename = args.module + if modulename not in MODULES: + raise Exception(f'Currently not supported Module: {modulename}') + reprocess_message_objects(obj_type, module_name=modulename) diff --git a/update/v5.4/Update.py b/update/v5.4/Update.py index a10e4dc9..62e04cc6 100755 --- a/update/v5.4/Update.py +++ b/update/v5.4/Update.py @@ -10,6 +10,7 @@ sys.path.append(os.environ['AIL_HOME']) ################################## from update.bin.ail_updater import AIL_Updater from lib import ail_updates +from lib import chats_viewer class Updater(AIL_Updater): """default Updater.""" @@ -19,6 +20,7 @@ class Updater(AIL_Updater): if __name__ == '__main__': + chats_viewer.fix_correlations_subchannel_message() updater = Updater('v5.4') updater.run_update()