chg: [lang] improve language detection + UI: manual translation and detection

This commit is contained in:
terrtia 2024-03-08 15:26:06 +01:00
parent 7acac4dc0c
commit 197ff0222d
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
10 changed files with 164 additions and 51 deletions

View file

@ -7,6 +7,7 @@ import sys
import html2text import html2text
import gcld3 import gcld3
from lexilang.detector import detect as lexilang_detect
from libretranslatepy import LibreTranslateAPI from libretranslatepy import LibreTranslateAPI
sys.path.append(os.environ['AIL_BIN']) sys.path.append(os.environ['AIL_BIN'])
@ -342,18 +343,31 @@ def remove_obj_language(language, obj_type, obj_subtype, obj_id):
obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}' obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}'
r_lang.srem(f'obj:lang:{obj_global_id}', language) r_lang.srem(f'obj:lang:{obj_global_id}', language)
delete_obj_translation(obj_global_id, language)
r_lang.srem(f'langs:{obj_type}:{obj_subtype}:{language}', obj_global_id) r_lang.srem(f'langs:{obj_type}:{obj_subtype}:{language}', obj_global_id)
if not r_lang.exists(f'langs:{obj_type}:{obj_subtype}:{language}'): if not r_lang.exists(f'langs:{obj_type}:{obj_subtype}:{language}'):
r_lang.srem(f'objs:lang:{obj_type}:{obj_subtype}', language) r_lang.srem(f'objs:lang:{obj_type}:{obj_subtype}', language)
r_lang.srem(f'languages:{language}', f'{obj_type}:{obj_subtype}') r_lang.srem(f'languages:{language}', f'{obj_type}:{obj_subtype}')
if not r_lang.exists(f'objs:lang:{obj_type}:{obj_subtype}'): if not r_lang.exists(f'objs:lang:{obj_type}:{obj_subtype}'):
if r_lang.scard(f'objs:langs:{obj_type}', language) <= 1: if r_lang.scard(f'objs:langs:{obj_type}') <= 1:
r_lang.srem(f'objs:langs:{obj_type}', language) r_lang.srem(f'objs:langs:{obj_type}', language)
# TODO handle fields
def detect_obj_language(obj_type, obj_subtype, obj_id, content):
    """Detect the language of `content` and store it as the object's language.

    A single-language LanguagesDetector is run over `content`. When a language
    is found, every previously stored language that differs from it is removed
    and the detected language is registered for the object.

    :param obj_type: object type (first part of the object's global id)
    :param obj_subtype: object subtype (second part of the global id)
    :param obj_id: object id (last part of the global id)
    :param content: text to run the detection on
    :return: the detected language, or None when nothing was detected
    """
    detector = LanguagesDetector(nb_langs=1)
    detected = detector.detect(content)
    if not detected:
        return None
    language = detected[0]

    # Copy into a list: the stored languages may come back as a set, which
    # cannot be indexed and must not be mutated while we iterate over it.
    previous_langs = list(get_obj_languages(obj_type, obj_subtype, obj_id) or ())
    for previous_lang in previous_langs:
        if previous_lang != language:
            # Remove the outdated language, not the freshly detected one.
            remove_obj_language(previous_lang, obj_type, obj_subtype, obj_id)
    if language not in previous_langs:
        add_obj_language(language, obj_type, obj_subtype, obj_id)
    return language
## Translation ## Translation
def _get_obj_translation(obj_global_id, language, field=''): def _get_obj_translation(obj_global_id, language, field=''):
@ -364,6 +378,7 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel
Returns translated content Returns translated content
""" """
translation = r_cache.get(f'translation:{language}:{obj_global_id}:{field}') translation = r_cache.get(f'translation:{language}:{obj_global_id}:{field}')
r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 0)
if translation: if translation:
# DEBUG # DEBUG
# print('cache') # print('cache')
@ -372,7 +387,10 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel
# TODO HANDLE FIELDS TRANSLATION # TODO HANDLE FIELDS TRANSLATION
translation = _get_obj_translation(obj_global_id, language, field=field) translation = _get_obj_translation(obj_global_id, language, field=field)
if not translation: if not translation:
translation = LanguageTranslator().translate(content, source=source, target=language) source, translation = LanguageTranslator().translate(content, source=source, target=language)
if source and translation:
obj_type, subtype, obj_id = obj_global_id.split(':', 2)
add_obj_language(source, obj_type, subtype, obj_id)
if translation: if translation:
r_cache.set(f'translation:{language}:{obj_global_id}:{field}', translation) r_cache.set(f'translation:{language}:{obj_global_id}:{field}', translation)
r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 300) r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 300)
@ -380,10 +398,14 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel
# TODO Force to edit ???? # TODO Force to edit ????
def set_obj_translation(obj_global_id, language, translation, field=''):
    """Store a translation for an object and invalidate its cached copy.

    :param obj_global_id: global id of the object ('type:subtype:id')
    :param language: language of the stored translation
    :param translation: translated text
    :param field: optional object field the translation applies to
    :return: result of the underlying hash write
    """
    # The cache key must include the field, exactly like the key written by
    # get_obj_translation(); otherwise a field translation stays stale in cache.
    r_cache.delete(f'translation:{language}:{obj_global_id}:{field}')
    return r_lang.hset(f'tr:{obj_global_id}:{field}', language, translation)
def delete_obj_translation(obj_global_id, language, field=''):
    """Delete the stored translation of an object for one language.

    :param obj_global_id: global id of the object ('type:subtype:id')
    :param language: language of the translation to delete
    :param field: optional object field the translation applies to
    """
    # Drop the cached copy as well; the cache key includes the field, matching
    # the key used by get_obj_translation().
    r_cache.delete(f'translation:{language}:{obj_global_id}:{field}')
    r_lang.hdel(f'tr:{obj_global_id}:{field}', language)
## --LANGUAGE ENGINE-- ## ## --LANGUAGE ENGINE-- ##
@ -410,11 +432,22 @@ class LanguagesDetector:
if self.min_len > 0: if self.min_len > 0:
if len(content) < self.min_len: if len(content) < self.min_len:
return languages return languages
# p = self.detector.FindTopNMostFreqLangs(content, num_langs=3)
# for lang in p:
# print(lang.language, lang.probability, lang.proportion, lang.is_reliable)
# print('------------------------------------------------')
for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs): for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs):
if lang.proportion >= self.min_proportion and lang.probability >= self.min_probability and lang.is_reliable: if lang.proportion >= self.min_proportion and lang.probability >= self.min_probability and lang.is_reliable:
languages.append(lang.language) languages.append(lang.language)
return languages return languages
def detect_lexilang(self, content):  # TODO clean text ??? - TODO REMOVE SEPARATOR
    """Detect the language of `content` with lexilang.

    :param content: text to analyse
    :return: one-element list holding the detected language, or an empty
             list when the reported probability is not strictly positive
    """
    language, prob = lexilang_detect(content)
    return [language] if prob > 0 else []
def detect_libretranslate(self, content): def detect_libretranslate(self, content):
languages = [] languages = []
try: try:
@ -431,19 +464,26 @@ class LanguagesDetector:
languages.append(language) languages.append(language)
return languages return languages
def detect(self, content, force_gcld3=False):  # TODO detect length between 20-200 ????
    """Detect the language(s) of `content`.

    The content is cleaned first (HTML handling enabled). Texts shorter than
    100 characters are handed to lexilang; longer ones go to gcld3.

    :param content: raw text to analyse
    :param force_gcld3: when True, always use gcld3, whatever the text length
    :return: list of detected languages (possibly empty)
    """
    content = _clean_text_to_translate(content, html=True)
    # Honour force_gcld3: the flag is part of the signature and used to select
    # gcld3 unconditionally, so it must not be silently ignored.
    if len(content) < 100 and not force_gcld3:
        return self.detect_lexilang(content)
    # TODO(review): the libretranslate detection path is currently disabled
    return self.detect_gcld3(content)
class LanguageTranslator: class LanguageTranslator:
def __init__(self): def __init__(self):
self.lt = LibreTranslateAPI(get_translator_instance()) self.lt = LibreTranslateAPI(get_translator_instance())
self.ld = LanguagesDetector(nb_langs=1)
def languages(self): def languages(self):
languages = [] languages = []
@ -473,13 +513,13 @@ class LanguageTranslator:
return language[0].get('language') return language[0].get('language')
def detect(self, content):
    """Return the first language found by the wrapped LanguagesDetector.

    :param content: text to analyse
    :return: the first detected language, or None when nothing was detected
    """
    candidates = self.ld.detect(content)
    if not candidates:
        return None
    return candidates[0]
def translate(self, content, source=None, target="en"): # TODO source target def translate(self, content, source=None, target="en"): # TODO source target
if target not in get_translation_languages(): if target not in get_translation_languages():
@ -498,9 +538,9 @@ class LanguageTranslator:
translation = None translation = None
# TODO LOG and display error # TODO LOG and display error
if translation == content: if translation == content:
print('EQUAL') # print('EQUAL')
translation = None translation = None
return translation return source, translation
LIST_LANGUAGES = {} LIST_LANGUAGES = {}

View file

@ -404,18 +404,33 @@ def api_get_message(message_id, translation_target=None):
message = Messages.Message(message_id) message = Messages.Message(message_id)
if not message.exists(): if not message.exists():
return {"status": "error", "reason": "Unknown uuid"}, 404 return {"status": "error", "reason": "Unknown uuid"}, 404
meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
return meta, 200 return meta, 200
def api_message_detect_language(message_id):
    """Run language detection on a message.

    :param message_id: id of the message
    :return: (payload, HTTP status) - the detected language with 200, or an
             error payload with 404 when the message does not exist
    """
    message = Messages.Message(message_id)
    if message.exists():
        return {"language": message.detect_language()}, 200
    return {"status": "error", "reason": "Unknown uuid"}, 404
def api_manually_translate_message(message_id, source, translation_target, translation):
    """Manually set the source language of a message and, optionally, a translation.

    :param message_id: id of the message
    :param source: language to record as the message language
    :param translation_target: language of the supplied translation
    :param translation: manual translation text; falsy to only update the language
    :return: (payload, HTTP status) - (None, 200) on success, otherwise an
             error payload with 404 (unknown message) or 400 (invalid input)
    """
    message = Messages.Message(message_id)
    if not message.exists():
        return {"status": "error", "reason": "Unknown uuid"}, 404
    if translation and len(translation) > 200000:  # TODO REVIEW LIMIT
        return {"status": "error", "reason": "Max Size reached"}, 400
    all_languages = Language.get_translation_languages()
    if source not in all_languages:
        return {"status": "error", "reason": "Unknown source Language"}, 400

    message_language = message.get_language()
    if not message_language:
        # No language recorded yet: there is nothing to remove, only add.
        # NOTE(review): passing None as the language to remove presumably
        # fails or writes bogus keys in the backing store - confirm.
        message.add_language(source)
    elif message_language != source:
        message.edit_language(message_language, source)

    if translation:
        if translation_target not in all_languages:
            return {"status": "error", "reason": "Unknown target Language"}, 400
        message.set_translation(translation_target, translation)
    # TODO SANITIZE translation
    return None, 200

View file

@ -175,6 +175,13 @@ class Message(AbstractObject):
# message media # message media
# flag is deleted -> event or missing from feeder pass ??? # flag is deleted -> event or missing from feeder pass ???
def get_language(self):
    """Return one language of this message, or None when it has none."""
    languages = self.get_languages()
    return languages.pop() if languages else None
def get_translation(self, content=None, source=None, target='fr'): def get_translation(self, content=None, source=None, target='fr'):
""" """
Returns translated content Returns translated content
@ -289,8 +296,14 @@ class Message(AbstractObject):
meta['files-names'] = self.get_files_names() meta['files-names'] = self.get_files_names()
if 'reactions' in options: if 'reactions' in options:
meta['reactions'] = self.get_reactions() meta['reactions'] = self.get_reactions()
if 'language' in options:
meta['language'] = self.get_language()
if 'translation' in options and translation_target: if 'translation' in options and translation_target:
meta['translation'] = self.translate(content=meta.get('content'), target=translation_target) if meta.get('language'):
source = meta['language']
else:
source = None
meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target)
# meta['encoding'] = None # meta['encoding'] = None
return meta return meta

View file

@ -226,7 +226,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC):
def get_message_meta(self, message, timestamp=None, translation_target='', options=None):  # TODO handle file message
    """Build the metadata dict of a chat message.

    :param message: message reference; its first 9 characters are dropped to
                    obtain the message id (presumably a 'message::' prefix -
                    TODO confirm)
    :param timestamp: forwarded to Message.get_meta()
    :param translation_target: target language forwarded to Message.get_meta()
    :param options: set of metadata fields to include; a default set is used
                    when empty or None
    :return: metadata returned by Message.get_meta()
    """
    if not options:
        options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
    msg_obj = Messages.Message(message[9:])
    return msg_obj.get_meta(options=options, timestamp=timestamp, translation_target=translation_target)

View file

@ -25,7 +25,7 @@ from lib import Duplicate
from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, get_obj_translation, set_obj_translation from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_translation, set_obj_translation, delete_obj_translation
from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers
logging.config.dictConfig(ail_logger.get_config(name='ail')) logging.config.dictConfig(ail_logger.get_config(name='ail'))
@ -313,12 +313,22 @@ class AbstractObject(ABC):
def remove_language(self, language):
    """Remove `language` from this object's languages."""
    subtype = self.get_subtype(r_str=True)
    return remove_obj_language(language, self.type, subtype, self.id)
def edit_language(self, old_language, new_language):
    """Replace this object's language `old_language` by `new_language`.

    :param old_language: language to remove; skipped when falsy (e.g. the
                         object had no language yet)
    :param new_language: language to register
    """
    # Do not try to remove a language that was never set.
    if old_language:
        self.remove_language(old_language)
    self.add_language(new_language)
def detect_language(self, field=''):
    """Detect the language of this object's content and store it.

    :param field: currently unused
    :return: value returned by detect_obj_language()
    """
    content = self.get_content()
    return detect_obj_language(self.type, self.get_subtype(r_str=True), self.id, content)
def get_translation(self, language, field=''):
    """Return this object's translation in `language` for the given field."""
    global_id = self.get_global_id()
    return get_obj_translation(global_id, language, field=field)
def set_translation(self, language, translation, field=''):
    """Store `translation` as this object's translation in `language`."""
    global_id = self.get_global_id()
    return set_obj_translation(global_id, language, translation, field=field)
def delete_translation(self, language, field=''):
    """Delete this object's stored translation in `language`."""
    global_id = self.get_global_id()
    return delete_obj_translation(global_id, language, field=field)
def translate(self, content=None, field='', source=None, target='en'): def translate(self, content=None, field='', source=None, target='en'):
global_id = self.get_global_id() global_id = self.get_global_id()
if not content: if not content:

View file

@ -44,6 +44,7 @@ scrapy-splash>=0.7.2
# Languages # Languages
gcld3 gcld3
libretranslatepy libretranslatepy
lexilang
#Graph #Graph
numpy>1.18.1 numpy>1.18.1

View file

@ -24,6 +24,7 @@ echo ""
echo -e $GREEN"Updating python packages ..."$DEFAULT echo -e $GREEN"Updating python packages ..."$DEFAULT
echo "" echo ""
pip install -U pylacus pip install -U pylacus
pip install -U lexilang
bash ${AIL_BIN}/LAUNCH.sh -lrv bash ${AIL_BIN}/LAUNCH.sh -lrv

View file

@ -240,11 +240,24 @@ def objects_message():
@login_read_only @login_read_only
def objects_message_translate(): def objects_message_translate():
message_id = request.form.get('id') message_id = request.form.get('id')
source = request.form.get('language_target')
target = request.form.get('target') target = request.form.get('target')
translation = request.form.get('translation') translation = request.form.get('translation')
if target == "Don't Translate": if target == "Don't Translate":
target = None target = None
resp = chats_viewer.api_manually_translate_message(message_id, target, translation) resp = chats_viewer.api_manually_translate_message(message_id, source, target, translation)
if resp[1] != 200:
return create_json_response(resp[0], resp[1])
else:
return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target))
@chats_explorer.route("/objects/message/detect/language", methods=['GET'])
@login_required
@login_read_only
def objects_message_detect_language():
message_id = request.args.get('id')
target = request.args.get('target')
resp = chats_viewer.api_message_detect_language(message_id)
if resp[1] != 200: if resp[1] != 200:
return create_json_response(resp[0], resp[1]) return create_json_response(resp[0], resp[1])
else: else:

View file

@ -81,24 +81,6 @@
<hr class="m-1"> <hr class="m-1">
<pre class="my-0 text-secondary">{{ message['translation'] }}</pre> <pre class="my-0 text-secondary">{{ message['translation'] }}</pre>
{% set mess_id_escape= message['id'] | replace("/", "_") %}
<button class="btn btn-light p-0" type="button" data-toggle="collapse" data-target="#collapseTrans{{ mess_id_escape }}" aria-expanded="false" aria-controls="collapseTrans{{ mess_id_escape }}">
<i class="fas fa-language"></i>
</button>
<div class="collapse" id="collapseTrans{{ mess_id_escape }}">
<div class="card card-body">
<form method="post" action="{{ url_for('chats_explorer.objects_message_translate') }}" target="_blank">
<input type="text" id="id" name="id" value="{{message['id']}}" hidden>
<input type="text" id="target" name="target" value="{{translation_target}}" hidden>
<span>{{translation_target}}:</span>
<textarea class="form-control" id="translation" name="translation">{{ message['translation'] }}</textarea>
<button class="btn btn-primary" type="submit">
<i class="fas fa-pen-alt"> Manual Translation</i>
</button>
</form>
</div>
</div>
{% endif %} {% endif %}
{% for reaction in message['reactions'] %} {% for reaction in message['reactions'] %}
<span class="border rounded px-1">{{ reaction }} {{ message['reactions'][reaction] }}</span> <span class="border rounded px-1">{{ reaction }} {{ message['reactions'][reaction] }}</span>
@ -113,10 +95,47 @@
<span class="badge badge-{{ bootstrap_label[loop.index0 % 5] }}">{{ tag }}</span> <span class="badge badge-{{ bootstrap_label[loop.index0 % 5] }}">{{ tag }}</span>
{% endfor %} {% endfor %}
<div class=""> <div class="">
{% set mess_id_escape= message['id'] | replace("/", "_") %}
<span class="btn btn-outline-dark p-0 px-1" type="button" data-toggle="collapse" data-target="#collapseTrans{{ mess_id_escape }}" aria-expanded="false" aria-controls="collapseTrans{{ mess_id_escape }}">
<i class="fas fa-language"></i> {% if message['language'] %}{{ message['language'] }}{% endif %}
</span>
<div class="collapse" id="collapseTrans{{ mess_id_escape }}">
<div class="card card-body">
<form method="post" action="{{ url_for('chats_explorer.objects_message_translate') }}" target="_blank">
<input type="text" id="id" name="id" value="{{message['id']}}" hidden>
<span class="badge badge-primary">Source:</span>
<span class="">
<select id="language_target" name="language_target" class="form-select" aria-label="Message Language" onchange="$('#translation').val('');">
<option selected value="{{ message['language'] }}">{{ message['language'] }}</option>
{% for language in translation_languages %}
<option value="{{ language }}">{{ translation_languages[language] }}</option>
{% endfor %}
</select>
</span>
{% if translation_target %}
<input type="text" id="target" name="target" value="{{translation_target}}" hidden>
&nbsp;&nbsp;&nbsp;&nbsp;<span class="badge badge-primary">Target:</span><span>{{translation_target}}</span>
<textarea class="form-control" id="translation" name="translation">{{ message['translation'] }}</textarea>
<button class="btn btn-dark" type="submit">
<i class="fas fa-pen-alt"> Update Language or Translation</i>
</button>
{% else %}
<button class="btn btn-dark" type="submit">
<i class="fas fa-pen-alt"> Update Language</i>
</button>
{% endif %}
</form>
<div>
<a class="btn btn-primary" href="{{ url_for('chats_explorer.objects_message_detect_language')}}?id={{ message['id'] }}">
<i class="fas fa-redo"></i> Detect Language
</a>
</div>
</div>
</div>
<a class="btn btn-light btn-sm text-secondary px-1" href="{{ url_for('correlation.show_correlation')}}?type={{ message['type'] }}&subtype={{ message['subtype'] }}&id={{ message['id'] }}"><i class="fas fa-project-diagram"></i></a> <a class="btn btn-light btn-sm text-secondary px-1" href="{{ url_for('correlation.show_correlation')}}?type={{ message['type'] }}&subtype={{ message['subtype'] }}&id={{ message['id'] }}"><i class="fas fa-project-diagram"></i></a>
<a class="btn btn-light btn-sm text-secondary px-1" href="{{ message['link'] }}"><i class="fas fa-eye"></i></a> <a class="btn btn-light btn-sm text-secondary px-1" href="{{ message['link'] }}"><i class="fas fa-eye"></i></a>
</div> </div>
</div> </div>
</div> </div>

View file

@ -10,6 +10,7 @@
<option selected>Don't Translate</option> <option selected>Don't Translate</option>
{% else %} {% else %}
<option selected value="{{ translation_target }}">{{ translation_target }}</option> <option selected value="{{ translation_target }}">{{ translation_target }}</option>
<option>Don't Translate</option>
{% endif %} {% endif %}
{% for language in translation_languages %} {% for language in translation_languages %}
<option value="{{ language }}">{{ translation_languages[language] }}</option> <option value="{{ language }}">{{ translation_languages[language] }}</option>