mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-25 15:27:17 +00:00
fix: [chat] fix subchannel-message correlation + fix empty message language detection
This commit is contained in:
parent
b9c37167ad
commit
2db54def46
7 changed files with 126 additions and 16 deletions
|
@ -206,8 +206,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
||||||
subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid())
|
subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid())
|
||||||
thread = None
|
thread = None
|
||||||
|
|
||||||
# TODO correlation with obj = message/image
|
subchannel.add(date, obj)
|
||||||
subchannel.add(date)
|
|
||||||
|
|
||||||
if meta.get('date'): # TODO check if already exists
|
if meta.get('date'): # TODO check if already exists
|
||||||
subchannel.set_created_at(int(meta['date']['timestamp']))
|
subchannel.set_created_at(int(meta['date']['timestamp']))
|
||||||
|
@ -358,7 +357,58 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
||||||
# CHAT
|
# CHAT
|
||||||
chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id)
|
chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id)
|
||||||
|
|
||||||
# Message forward
|
# # TODO HANDLE OTHERS OBJECT TYPE
|
||||||
|
# # TODO MAKE IT GENERIC FOR OTHERS CHATS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
# # Message forward + Discussion
|
||||||
|
# if self.get_json_meta().get('forward'):
|
||||||
|
# discussion_id = self.get_json_meta().get('discussion')
|
||||||
|
# forward_from = self.get_message_forward()
|
||||||
|
#
|
||||||
|
# if discussion_id: # TODO HANDLE FORWARDED MESSAGES FROM EXTERNAL CHANNELS
|
||||||
|
# chat_forward_id = forward_from['from']['id']
|
||||||
|
# message_forward_id = forward_from['from']['channel_post']
|
||||||
|
#
|
||||||
|
# # if chat_forward_id == discussion_id:
|
||||||
|
# # linked_chat = Chat(chat_forward_id, self.get_chat_instance_uuid())
|
||||||
|
# # if linked_chat.exists():
|
||||||
|
# # # create thread
|
||||||
|
# # # add message replies for each childrens
|
||||||
|
#
|
||||||
|
# # TODO HANDLE THREAD
|
||||||
|
# # TODO Change FORWARD META FIELDS
|
||||||
|
# # meta['forward'] = {}
|
||||||
|
# # # CHAT ID
|
||||||
|
# # # SUBCHANNEL ID -> can be None
|
||||||
|
# # # Message ID
|
||||||
|
#
|
||||||
|
# # meta['forward']['origin']
|
||||||
|
# # # same as 'forward'
|
||||||
|
#
|
||||||
|
# if self.get_json_meta().get('forward'):
|
||||||
|
# forward = self.get_message_forward()
|
||||||
|
# f_chat = forward['chat']
|
||||||
|
# f_subchannel = forward.get('subchannel')
|
||||||
|
# f_id = forward.get('id')
|
||||||
|
# if not f_subchannel:
|
||||||
|
# chat_forward = Chat(f_chat, self.get_chat_instance_uuid())
|
||||||
|
# if chat_forward.exists():
|
||||||
|
# for chat_obj in chat_objs:
|
||||||
|
# if chat_obj.type == 'chat':
|
||||||
|
# chat_forward.add_relationship(chat_obj.get_global_id(), 'forward')
|
||||||
|
# # TODO LIST FORWARDED MESSAGES
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# # Discord -> serverID + subchannel ID + message ID
|
||||||
|
# # Telegram -> chat ID + Message ID
|
||||||
|
# # + ORIGIN IDs
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# # TODO create relationships graph
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# # TODO REMOVE ME
|
||||||
|
# # Message forward # TODO handle subchannel + message ID
|
||||||
# if self.get_json_meta().get('forward'):
|
# if self.get_json_meta().get('forward'):
|
||||||
# forward_from = self.get_message_forward()
|
# forward_from = self.get_message_forward()
|
||||||
# print('-----------------------------------------------------------')
|
# print('-----------------------------------------------------------')
|
||||||
|
|
|
@ -265,7 +265,10 @@ def _get_html2text(content, ignore_links=False):
|
||||||
h = html2text.HTML2Text()
|
h = html2text.HTML2Text()
|
||||||
h.ignore_links = ignore_links
|
h.ignore_links = ignore_links
|
||||||
h.ignore_images = ignore_links
|
h.ignore_images = ignore_links
|
||||||
return h.handle(content)
|
content = h.handle(content)
|
||||||
|
if content == '\n\n':
|
||||||
|
content = ''
|
||||||
|
return content
|
||||||
|
|
||||||
def _clean_text_to_translate(content, html=False, keys_blocks=True):
|
def _clean_text_to_translate(content, html=False, keys_blocks=True):
|
||||||
if html:
|
if html:
|
||||||
|
@ -482,14 +485,23 @@ class LanguagesDetector:
|
||||||
return languages
|
return languages
|
||||||
|
|
||||||
def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ????
|
def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ????
|
||||||
|
if not content:
|
||||||
|
return None
|
||||||
content = _clean_text_to_translate(content, html=True)
|
content = _clean_text_to_translate(content, html=True)
|
||||||
# print('cleaned content', content)
|
if not content:
|
||||||
# gcld3
|
return None
|
||||||
if len(content) < 100:
|
# DEBUG
|
||||||
|
# print('-------------------------------------------------------')
|
||||||
|
# print(content)
|
||||||
|
# print(len(content))
|
||||||
|
# lexilang
|
||||||
|
if len(content) < 150:
|
||||||
|
# print('lexilang')
|
||||||
languages = self.detect_lexilang(content)
|
languages = self.detect_lexilang(content)
|
||||||
|
# gcld3
|
||||||
else:
|
else:
|
||||||
# if len(content) >= 200 or not self.lt or force_gcld3:
|
# if len(content) >= 200 or not self.lt or force_gcld3:
|
||||||
# print('gcld3')
|
# print('gcld3')
|
||||||
languages = self.detect_gcld3(content)
|
languages = self.detect_gcld3(content)
|
||||||
# libretranslate
|
# libretranslate
|
||||||
# else:
|
# else:
|
||||||
|
|
|
@ -323,7 +323,6 @@ def get_username_meta_from_global_id(username_global_id):
|
||||||
username = Usernames.Username(username_id, instance_uuid)
|
username = Usernames.Username(username_id, instance_uuid)
|
||||||
return username.get_meta()
|
return username.get_meta()
|
||||||
|
|
||||||
|
|
||||||
# TODO Filter
|
# TODO Filter
|
||||||
## Instance type
|
## Instance type
|
||||||
## Chats IDS
|
## Chats IDS
|
||||||
|
@ -380,6 +379,22 @@ def get_nb_messages_iterator(filters={}):
|
||||||
nb_messages += chat.get_nb_messages()
|
nb_messages += chat.get_nb_messages()
|
||||||
return nb_messages
|
return nb_messages
|
||||||
|
|
||||||
|
|
||||||
|
#### FIX ####
|
||||||
|
|
||||||
|
def fix_correlations_subchannel_message():
    """Re-add the 'message' correlation on every subchannel.

    Iterates over every chat service instance, every chat of that instance
    and every subchannel of that chat, then calls
    ``add_correlation('message', '', <message id>)`` on the subchannel for
    each entry returned by ``subchannel._get_messages(nb=-1)``.
    """
    for instance_uuid in get_chat_service_instances():
        for chat_id in ChatServiceInstance(instance_uuid).get_chats():
            chat = Chats.Chat(chat_id, instance_uuid)
            # subchannels
            for subchannel_gid in chat.get_subchannels():
                # Three ':'-separated fields are expected; cap the split so an
                # id that itself contains ':' is kept whole.
                _, _, subchannel_id = subchannel_gid.split(':', 2)
                subchannel = ChatSubChannels.ChatSubChannel(subchannel_id, instance_uuid)
                messages, _ = subchannel._get_messages(nb=-1)
                for mess in messages:
                    # mess[0] is presumably the message global ID - TODO confirm.
                    # Same capped split as for the subchannel ID above: without
                    # maxsplit an extra ':' makes the 3-way unpacking raise.
                    _, _, message_id = mess[0].split(':', 2)
                    subchannel.add_correlation('message', '', message_id)
|
||||||
|
|
||||||
#### API ####
|
#### API ####
|
||||||
|
|
||||||
def api_get_chat_service_instance(chat_instance_uuid):
|
def api_get_chat_service_instance(chat_instance_uuid):
|
||||||
|
@ -392,6 +407,7 @@ def api_get_chat(chat_id, chat_instance_uuid, translation_target=None, nb=-1, pa
|
||||||
chat = Chats.Chat(chat_id, chat_instance_uuid)
|
chat = Chats.Chat(chat_id, chat_instance_uuid)
|
||||||
if not chat.exists():
|
if not chat.exists():
|
||||||
return {"status": "error", "reason": "Unknown chat"}, 404
|
return {"status": "error", "reason": "Unknown chat"}, 404
|
||||||
|
# print(chat.get_obj_language_stats())
|
||||||
meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target)
|
meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target)
|
||||||
if meta['username']:
|
if meta['username']:
|
||||||
meta['username'] = get_username_meta_from_global_id(meta['username'])
|
meta['username'] = get_username_meta_from_global_id(meta['username'])
|
||||||
|
@ -437,6 +453,7 @@ def api_get_subchannel(chat_id, chat_instance_uuid, translation_target=None, nb=
|
||||||
subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid)
|
subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid)
|
||||||
if not subchannel.exists():
|
if not subchannel.exists():
|
||||||
return {"status": "error", "reason": "Unknown subchannel"}, 404
|
return {"status": "error", "reason": "Unknown subchannel"}, 404
|
||||||
|
# print(subchannel.get_obj_language_stats())
|
||||||
meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target)
|
meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target)
|
||||||
if meta['chat']:
|
if meta['chat']:
|
||||||
meta['chat'] = get_chat_meta_from_global_id(meta['chat'])
|
meta['chat'] = get_chat_meta_from_global_id(meta['chat'])
|
||||||
|
|
|
@ -113,7 +113,7 @@ class Message(AbstractObject):
|
||||||
def get_subchannel(self):
    """Return the global ID of the sub-channel correlated with this message.

    Returns:
        ``'chat-subchannel:<value>'`` built from one entry of the
        'chat-subchannel' correlation, or ``None`` when there is none.
    """
    subchannel = self.get_correlation('chat-subchannel')
    if subchannel.get('chat-subchannel'):
        # The prefix must name the sub-channel object type, not 'user-account'.
        return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}'
    return None
|
||||||
|
|
||||||
def get_thread(self):
|
def get_thread(self):
|
||||||
for child in self.get_childrens():
|
for child in self.get_childrens():
|
||||||
|
|
|
@ -33,6 +33,10 @@ class Languages(AbstractModule):
|
||||||
for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
|
for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
|
||||||
print(lang)
|
print(lang)
|
||||||
domain.add_language(lang)
|
domain.add_language(lang)
|
||||||
|
# Detect Chat Message Language
|
||||||
|
# elif obj.type == 'message':
|
||||||
|
# lang = obj.detect_language()
|
||||||
|
# print(self.obj.id, lang)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -20,17 +20,39 @@ from lib.ail_core import is_object_type
|
||||||
from lib import ail_queues
|
from lib import ail_queues
|
||||||
from lib.objects import ail_objects
|
from lib.objects import ail_objects
|
||||||
|
|
||||||
def reprocess_message_objects(object_type):
|
# from modules.ApiKey import ApiKey
|
||||||
queue = ail_queues.AILQueue('FeederModuleImporter', -1)
|
# from modules.Categ import Categ
|
||||||
for obj in ail_objects.obj_iterator(object_type, filters={}):
|
# from modules.CreditCards import CreditCards
|
||||||
queue.send_message(obj.get_global_id(), message='reprocess')
|
# from modules.DomClassifier import DomClassifier
|
||||||
queue.end()
|
# from modules.Global import Global
|
||||||
|
# from modules.Keys import Keys
|
||||||
|
# from modules.Onion import Onion
|
||||||
|
# from modules.Telegram import Telegram
|
||||||
|
|
||||||
|
from modules.Languages import Languages
|
||||||
|
|
||||||
|
MODULES = {
|
||||||
|
'Languages': Languages
|
||||||
|
}
|
||||||
|
|
||||||
|
def reprocess_message_objects(object_type, module_name=None):
    """Reprocess every object of ``object_type``.

    With ``module_name`` set, the matching class from ``MODULES`` is
    instantiated once and its ``compute(None)`` is called for each object,
    in-process. Otherwise each object's global ID is sent to the
    'FeederModuleImporter' queue with the message 'reprocess'.
    """
    if not module_name:
        # Default path: hand every object back to the importer queue.
        importer_queue = ail_queues.AILQueue('FeederModuleImporter', -1)
        for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
            importer_queue.send_message(ail_obj.get_global_id(), message='reprocess')
        importer_queue.end()
        return

    # Single-module path: run the selected module directly on each object.
    runner = MODULES[module_name]()
    for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
        # Set the current object on the module before computing.
        runner.obj = ail_obj
        runner.compute(None)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Reprocess AIL Objects')
|
parser = argparse.ArgumentParser(description='Reprocess AIL Objects')
|
||||||
parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True)
|
parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True)
|
||||||
|
parser.add_argument('-m', '--module', type=str, help='AIL Module Name')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if not args.type:
|
if not args.type:
|
||||||
|
@ -43,4 +65,7 @@ if __name__ == "__main__":
|
||||||
if obj_type not in ['item', 'message']: # TODO image
|
if obj_type not in ['item', 'message']: # TODO image
|
||||||
raise Exception(f'Currently not supported Object Type: {obj_type}')
|
raise Exception(f'Currently not supported Object Type: {obj_type}')
|
||||||
|
|
||||||
reprocess_message_objects(obj_type)
|
modulename = args.module
# -m/--module is optional (args.module is None when omitted): only validate
# the name when one was given, so reprocessing without a module still runs.
if modulename and modulename not in MODULES:
    raise Exception(f'Currently not supported Module: {modulename}')
reprocess_message_objects(obj_type, module_name=modulename)
|
||||||
|
|
|
@ -10,6 +10,7 @@ sys.path.append(os.environ['AIL_HOME'])
|
||||||
##################################
|
##################################
|
||||||
from update.bin.ail_updater import AIL_Updater
|
from update.bin.ail_updater import AIL_Updater
|
||||||
from lib import ail_updates
|
from lib import ail_updates
|
||||||
|
from lib import chats_viewer
|
||||||
|
|
||||||
class Updater(AIL_Updater):
|
class Updater(AIL_Updater):
|
||||||
"""default Updater."""
|
"""default Updater."""
|
||||||
|
@ -19,6 +20,7 @@ class Updater(AIL_Updater):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
chats_viewer.fix_correlations_subchannel_message()
|
||||||
updater = Updater('v5.4')
|
updater = Updater('v5.4')
|
||||||
updater.run_update()
|
updater.run_update()
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue