fix: [chat] fix subchannel-message correlation + fix empty message language detection

This commit is contained in:
terrtia 2024-03-25 16:36:24 +01:00
parent b9c37167ad
commit 2db54def46
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
7 changed files with 126 additions and 16 deletions

View file

@ -206,8 +206,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid()) subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid())
thread = None thread = None
# TODO correlation with obj = message/image subchannel.add(date, obj)
subchannel.add(date)
if meta.get('date'): # TODO check if already exists if meta.get('date'): # TODO check if already exists
subchannel.set_created_at(int(meta['date']['timestamp'])) subchannel.set_created_at(int(meta['date']['timestamp']))
@ -358,7 +357,58 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
# CHAT # CHAT
chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id) chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id)
# Message forward # # TODO HANDLE OTHERS OBJECT TYPE
# # TODO MAKE IT GENERIC FOR OTHERS CHATS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# # Message forward + Discussion
# if self.get_json_meta().get('forward'):
# discussion_id = self.get_json_meta().get('discussion')
# forward_from = self.get_message_forward()
#
# if discussion_id: # TODO HANDLE FORWARDED MESSAGES FROM EXTERNAL CHANNELS
# chat_forward_id = forward_from['from']['id']
# message_forward_id = forward_from['from']['channel_post']
#
# # if chat_forward_id == discussion_id:
# # linked_chat = Chat(chat_forward_id, self.get_chat_instance_uuid())
# # if linked_chat.exists():
# # # create thread
# # # add message replies for each childrens
#
# # TODO HANDLE THREAD
# # TODO Change FORWARD META FIELDS
# # meta['forward'] = {}
# # # CHAT ID
# # # SUBCHANNEL ID -> can be None
# # # Message ID
#
# # meta['forward']['origin']
# # # same as 'forward'
#
# if self.get_json_meta().get('forward'):
# forward = self.get_message_forward()
# f_chat = forward['chat']
# f_subchannel = forward.get('subchannel')
# f_id = forward.get('id')
# if not f_subchannel:
# chat_forward = Chat(f_chat, self.get_chat_instance_uuid())
# if chat_forward.exists():
# for chat_obj in chat_objs:
# if chat_obj.type == 'chat':
# chat_forward.add_relationship(chat_obj.get_global_id(), 'forward')
# # TODO LIST FORWARDED MESSAGES
#
#
# # Discord -> serverID + subchannel ID + message ID
# # Telegram -> chat ID + Message ID
# # + ORIGIN IDs
#
#
#
# # TODO create relationships graph
#
#
# # TODO REMOVE ME
# # Message forward # TODO handle subchannel + message ID
# if self.get_json_meta().get('forward'): # if self.get_json_meta().get('forward'):
# forward_from = self.get_message_forward() # forward_from = self.get_message_forward()
# print('-----------------------------------------------------------') # print('-----------------------------------------------------------')

View file

@ -265,7 +265,10 @@ def _get_html2text(content, ignore_links=False):
h = html2text.HTML2Text() h = html2text.HTML2Text()
h.ignore_links = ignore_links h.ignore_links = ignore_links
h.ignore_images = ignore_links h.ignore_images = ignore_links
return h.handle(content) content = h.handle(content)
if content == '\n\n':
content = ''
return content
def _clean_text_to_translate(content, html=False, keys_blocks=True): def _clean_text_to_translate(content, html=False, keys_blocks=True):
if html: if html:
@ -482,14 +485,23 @@ class LanguagesDetector:
return languages return languages
def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ???? def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ????
if not content:
return None
content = _clean_text_to_translate(content, html=True) content = _clean_text_to_translate(content, html=True)
# print('cleaned content', content) if not content:
# gcld3 return None
if len(content) < 100: # DEBUG
# print('-------------------------------------------------------')
# print(content)
# print(len(content))
# lexilang
if len(content) < 150:
# print('lexilang')
languages = self.detect_lexilang(content) languages = self.detect_lexilang(content)
# gcld3
else: else:
# if len(content) >= 200 or not self.lt or force_gcld3: # if len(content) >= 200 or not self.lt or force_gcld3:
# print('gcld3') # print('gcld3')
languages = self.detect_gcld3(content) languages = self.detect_gcld3(content)
# libretranslate # libretranslate
# else: # else:

View file

@ -323,7 +323,6 @@ def get_username_meta_from_global_id(username_global_id):
username = Usernames.Username(username_id, instance_uuid) username = Usernames.Username(username_id, instance_uuid)
return username.get_meta() return username.get_meta()
# TODO Filter # TODO Filter
## Instance type ## Instance type
## Chats IDS ## Chats IDS
@ -380,6 +379,22 @@ def get_nb_messages_iterator(filters={}):
nb_messages += chat.get_nb_messages() nb_messages += chat.get_nb_messages()
return nb_messages return nb_messages
#### FIX ####
def fix_correlations_subchannel_message():
    """Add the subchannel -> message correlation for every stored message.

    Iterates over every chat of every chat service instance, then over each
    chat's subchannels, and calls ``add_correlation('message', '', <id>)`` on
    the subchannel for each message returned by ``_get_messages(nb=-1)``.

    NOTE(review): presumably a one-off repair for data written before the
    correlation was created at import time -- confirm before re-running.
    """
    for instance_uuid in get_chat_service_instances():
        for chat_id in ChatServiceInstance(instance_uuid).get_chats():
            chat = Chats.Chat(chat_id, instance_uuid)
            # subchannels
            for subchannel_gid in chat.get_subchannels():
                # Global ids are unpacked as three ':'-separated fields; maxsplit=2
                # keeps any ':' that is part of the id itself in the last field.
                _, _, subchannel_id = subchannel_gid.split(':', 2)
                subchannel = ChatSubChannels.ChatSubChannel(subchannel_id, instance_uuid)
                messages, _ = subchannel._get_messages(nb=-1)
                for mess in messages:
                    # mess[0] is the message global id; split it the same way as the
                    # subchannel id so an id containing ':' does not raise ValueError.
                    _, _, message_id = mess[0].split(':', 2)
                    subchannel.add_correlation('message', '', message_id)
#### API #### #### API ####
def api_get_chat_service_instance(chat_instance_uuid): def api_get_chat_service_instance(chat_instance_uuid):
@ -392,6 +407,7 @@ def api_get_chat(chat_id, chat_instance_uuid, translation_target=None, nb=-1, pa
chat = Chats.Chat(chat_id, chat_instance_uuid) chat = Chats.Chat(chat_id, chat_instance_uuid)
if not chat.exists(): if not chat.exists():
return {"status": "error", "reason": "Unknown chat"}, 404 return {"status": "error", "reason": "Unknown chat"}, 404
# print(chat.get_obj_language_stats())
meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target) meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target)
if meta['username']: if meta['username']:
meta['username'] = get_username_meta_from_global_id(meta['username']) meta['username'] = get_username_meta_from_global_id(meta['username'])
@ -437,6 +453,7 @@ def api_get_subchannel(chat_id, chat_instance_uuid, translation_target=None, nb=
subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid) subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid)
if not subchannel.exists(): if not subchannel.exists():
return {"status": "error", "reason": "Unknown subchannel"}, 404 return {"status": "error", "reason": "Unknown subchannel"}, 404
# print(subchannel.get_obj_language_stats())
meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target) meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target)
if meta['chat']: if meta['chat']:
meta['chat'] = get_chat_meta_from_global_id(meta['chat']) meta['chat'] = get_chat_meta_from_global_id(meta['chat'])

View file

@ -113,7 +113,7 @@ class Message(AbstractObject):
def get_subchannel(self):
    """Return the global id of the subchannel correlated with this message.

    Looks up the 'chat-subchannel' correlation and, when it is non-empty,
    returns ``'chat-subchannel:<entry>'`` built from one entry popped from it.
    Returns None (implicitly) when the message has no such correlation.
    """
    subchannel = self.get_correlation('chat-subchannel')
    if subchannel.get('chat-subchannel'):
        # The prefix must match the correlated object type; 'user-account:'
        # produced a global id pointing at the wrong kind of object.
        return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}'
def get_thread(self): def get_thread(self):
for child in self.get_childrens(): for child in self.get_childrens():

View file

@ -33,6 +33,10 @@ class Languages(AbstractModule):
for lang in obj.get_languages(min_probability=0.8, force_gcld3=True): for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
print(lang) print(lang)
domain.add_language(lang) domain.add_language(lang)
# Detect Chat Message Language
# elif obj.type == 'message':
# lang = obj.detect_language()
# print(self.obj.id, lang)
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -20,17 +20,39 @@ from lib.ail_core import is_object_type
from lib import ail_queues from lib import ail_queues
from lib.objects import ail_objects from lib.objects import ail_objects
def reprocess_message_objects(object_type): # from modules.ApiKey import ApiKey
queue = ail_queues.AILQueue('FeederModuleImporter', -1) # from modules.Categ import Categ
for obj in ail_objects.obj_iterator(object_type, filters={}): # from modules.CreditCards import CreditCards
queue.send_message(obj.get_global_id(), message='reprocess') # from modules.DomClassifier import DomClassifier
queue.end() # from modules.Global import Global
# from modules.Keys import Keys
# from modules.Onion import Onion
# from modules.Telegram import Telegram
from modules.Languages import Languages
MODULES = {
'Languages': Languages
}
def reprocess_message_objects(object_type, module_name=None):
    """Reprocess every object of ``object_type``.

    Without ``module_name`` each object's global id is sent to the
    'FeederModuleImporter' queue with the message 'reprocess', and the queue
    is ended afterwards. With ``module_name`` the class registered under that
    key in ``MODULES`` is instantiated once and run on each object directly.

    :param object_type: object type passed to ``ail_objects.obj_iterator``
    :param module_name: optional key of ``MODULES``; an unknown key raises
        KeyError from the lookup
    """
    if not module_name:
        # Default path: hand every object back to the importer queue.
        importer_queue = ail_queues.AILQueue('FeederModuleImporter', -1)
        for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
            importer_queue.send_message(ail_obj.get_global_id(), message='reprocess')
        importer_queue.end()
        return

    # Single-module path: bypass the queue and call the module in-process.
    selected_module = MODULES[module_name]()
    for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
        selected_module.obj = ail_obj
        selected_module.compute(None)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Reprocess AIL Objects') parser = argparse.ArgumentParser(description='Reprocess AIL Objects')
parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True) parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True)
parser.add_argument('-m', '--module', type=str, help='AIL Module Name')
args = parser.parse_args() args = parser.parse_args()
if not args.type: if not args.type:
@ -43,4 +65,7 @@ if __name__ == "__main__":
if obj_type not in ['item', 'message']: # TODO image if obj_type not in ['item', 'message']: # TODO image
raise Exception(f'Currently not supported Object Type: {obj_type}') raise Exception(f'Currently not supported Object Type: {obj_type}')
reprocess_message_objects(obj_type) modulename = args.module
if modulename not in MODULES:
raise Exception(f'Currently not supported Module: {modulename}')
reprocess_message_objects(obj_type, module_name=modulename)

View file

@ -10,6 +10,7 @@ sys.path.append(os.environ['AIL_HOME'])
################################## ##################################
from update.bin.ail_updater import AIL_Updater from update.bin.ail_updater import AIL_Updater
from lib import ail_updates from lib import ail_updates
from lib import chats_viewer
class Updater(AIL_Updater): class Updater(AIL_Updater):
"""default Updater.""" """default Updater."""
@ -19,6 +20,7 @@ class Updater(AIL_Updater):
if __name__ == '__main__': if __name__ == '__main__':
chats_viewer.fix_correlations_subchannel_message()
updater = Updater('v5.4') updater = Updater('v5.4')
updater.run_update() updater.run_update()