mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 00:28:22 +00:00
chg: [lang] improve language detection + UI: manual translation and detection
This commit is contained in:
parent
7acac4dc0c
commit
197ff0222d
10 changed files with 164 additions and 51 deletions
|
@ -7,6 +7,7 @@ import sys
|
|||
import html2text
|
||||
|
||||
import gcld3
|
||||
from lexilang.detector import detect as lexilang_detect
|
||||
from libretranslatepy import LibreTranslateAPI
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
|
@ -342,18 +343,31 @@ def remove_obj_language(language, obj_type, obj_subtype, obj_id):
|
|||
obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}'
|
||||
r_lang.srem(f'obj:lang:{obj_global_id}', language)
|
||||
|
||||
delete_obj_translation(obj_global_id, language)
|
||||
|
||||
r_lang.srem(f'langs:{obj_type}:{obj_subtype}:{language}', obj_global_id)
|
||||
if not r_lang.exists(f'langs:{obj_type}:{obj_subtype}:{language}'):
|
||||
r_lang.srem(f'objs:lang:{obj_type}:{obj_subtype}', language)
|
||||
r_lang.srem(f'languages:{language}', f'{obj_type}:{obj_subtype}')
|
||||
if not r_lang.exists(f'objs:lang:{obj_type}:{obj_subtype}'):
|
||||
if r_lang.scard(f'objs:langs:{obj_type}', language) <= 1:
|
||||
if r_lang.scard(f'objs:langs:{obj_type}') <= 1:
|
||||
r_lang.srem(f'objs:langs:{obj_type}', language)
|
||||
|
||||
def edit_obj_language(language, obj_type, obj_subtype, obj_id):
    """Remove ``language`` from an object, then register it again.

    Both steps are delegated to ``remove_obj_language`` and
    ``add_obj_language`` with the same arguments.

    NOTE(review): the same ``language`` is removed and re-added; no separate
    "old" language is passed in -- confirm this is the intended behavior.
    """
    target = (obj_type, obj_subtype, obj_id)
    remove_obj_language(language, *target)
    add_obj_language(language, *target)
|
||||
|
||||
# TODO handle fields
|
||||
def detect_obj_language(obj_type, obj_subtype, obj_id, content):
    """Detect the language of ``content`` and store it as the object's language.

    A single-language ``LanguagesDetector`` is run on ``content``. When a
    language is found it replaces the language previously stored for the
    object (if any and if different), or is simply added when nothing was
    stored yet.

    :param obj_type: object type, first part of the object global id
    :param obj_subtype: object subtype, second part of the object global id
    :param obj_id: object id, last part of the object global id
    :param content: text to run the detection on
    :return: the detected language code, or None when nothing was detected
    """
    detected = LanguagesDetector(nb_langs=1).detect(content)
    if not detected:
        return None
    language = detected[0]

    stored_langs = get_obj_languages(obj_type, obj_subtype, obj_id)
    if not stored_langs:
        add_obj_language(language, obj_type, obj_subtype, obj_id)
        return language

    # next(iter(...)) works whether the stored languages come back as a list
    # or as a set (a set is not subscriptable).
    previous_lang = next(iter(stored_langs))
    if language != previous_lang:
        # Drop the stale language, not the freshly detected one, otherwise
        # the object ends up tagged with both.
        remove_obj_language(previous_lang, obj_type, obj_subtype, obj_id)
        add_obj_language(language, obj_type, obj_subtype, obj_id)
    return language
|
||||
|
||||
## Translation
|
||||
def _get_obj_translation(obj_global_id, language, field=''):
|
||||
|
@ -364,6 +378,7 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel
|
|||
Returns translated content
|
||||
"""
|
||||
translation = r_cache.get(f'translation:{language}:{obj_global_id}:{field}')
|
||||
r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 0)
|
||||
if translation:
|
||||
# DEBUG
|
||||
# print('cache')
|
||||
|
@ -372,7 +387,10 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel
|
|||
# TODO HANDLE FIELDS TRANSLATION
|
||||
translation = _get_obj_translation(obj_global_id, language, field=field)
|
||||
if not translation:
|
||||
translation = LanguageTranslator().translate(content, source=source, target=language)
|
||||
source, translation = LanguageTranslator().translate(content, source=source, target=language)
|
||||
if source and translation:
|
||||
obj_type, subtype, obj_id = obj_global_id.split(':', 2)
|
||||
add_obj_language(source, obj_type, subtype, obj_id)
|
||||
if translation:
|
||||
r_cache.set(f'translation:{language}:{obj_global_id}:{field}', translation)
|
||||
r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 300)
|
||||
|
@ -380,10 +398,14 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel
|
|||
|
||||
|
||||
# TODO Force to edit ????
|
||||
|
||||
def set_obj_translation(obj_global_id, language, translation, field=''):
    """Store a manual translation for an object and invalidate its cached copy.

    :param obj_global_id: object global id (``type:subtype:id``)
    :param language: language code the translation is written in
    :param translation: translated text to save
    :param field: optional field name; part of both the cache and storage keys
    :return: the result of the ``hset`` call on ``tr:{obj_global_id}:{field}``
    """
    # The cache entry is written by get_obj_translation with the field as the
    # last key component, so the field must be part of the key deleted here;
    # otherwise a stale translation is served for non-empty fields.
    r_cache.delete(f'translation:{language}:{obj_global_id}:{field}')
    return r_lang.hset(f'tr:{obj_global_id}:{field}', language, translation)
|
||||
|
||||
def delete_obj_translation(obj_global_id, language, field=''):
    """Delete the stored translation of an object and its cached copy.

    :param obj_global_id: object global id (``type:subtype:id``)
    :param language: language code of the translation to delete
    :param field: optional field name; part of both the cache and storage keys
    """
    # Use the same key layout as the cache writer in get_obj_translation
    # (field included), so the cached entry for this field is really removed.
    r_cache.delete(f'translation:{language}:{obj_global_id}:{field}')
    r_lang.hdel(f'tr:{obj_global_id}:{field}', language)
|
||||
|
||||
## --LANGUAGE ENGINE-- ##
|
||||
|
||||
|
@ -410,11 +432,22 @@ class LanguagesDetector:
|
|||
if self.min_len > 0:
|
||||
if len(content) < self.min_len:
|
||||
return languages
|
||||
# p = self.detector.FindTopNMostFreqLangs(content, num_langs=3)
|
||||
# for lang in p:
|
||||
# print(lang.language, lang.probability, lang.proportion, lang.is_reliable)
|
||||
# print('------------------------------------------------')
|
||||
for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs):
|
||||
if lang.proportion >= self.min_proportion and lang.probability >= self.min_probability and lang.is_reliable:
|
||||
languages.append(lang.language)
|
||||
return languages
|
||||
|
||||
def detect_lexilang(self, content):  # TODO clean text ??? - TODO REMOVE SEPARATOR
    """Detect the language of ``content`` with lexilang.

    :return: a one-element list with the detected language when the reported
             probability is strictly positive, an empty list otherwise
    """
    language, prob = lexilang_detect(content)
    return [language] if prob > 0 else []
|
||||
|
||||
def detect_libretranslate(self, content):
|
||||
languages = []
|
||||
try:
|
||||
|
@ -431,19 +464,26 @@ class LanguagesDetector:
|
|||
languages.append(language)
|
||||
return languages
|
||||
|
||||
def detect(self, content, force_gcld3=False):
|
||||
def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ????
|
||||
content = _clean_text_to_translate(content, html=True)
|
||||
# print('cleaned content', content)
|
||||
# gcld3
|
||||
if len(content) >= 200 or not self.lt or force_gcld3:
|
||||
language = self.detect_gcld3(content)
|
||||
# libretranslate
|
||||
if len(content) < 100:
|
||||
languages = self.detect_lexilang(content)
|
||||
else:
|
||||
language = self.detect_libretranslate(content)
|
||||
return language
|
||||
# if len(content) >= 200 or not self.lt or force_gcld3:
|
||||
# print('gcld3')
|
||||
languages = self.detect_gcld3(content)
|
||||
# libretranslate
|
||||
# else:
|
||||
# languages = self.detect_libretranslate(content)
|
||||
return languages
|
||||
|
||||
class LanguageTranslator:
|
||||
|
||||
def __init__(self):
    """Create the LibreTranslate client (``self.lt``) and a single-language
    detector (``self.ld``)."""
    instance = get_translator_instance()
    self.lt = LibreTranslateAPI(instance)
    self.ld = LanguagesDetector(nb_langs=1)
|
||||
|
||||
def languages(self):
|
||||
languages = []
|
||||
|
@ -473,13 +513,13 @@ class LanguageTranslator:
|
|||
return language[0].get('language')
|
||||
|
||||
def detect(self, content):
|
||||
# gcld3
|
||||
if len(content) >= 200:
|
||||
language = self.detect_gcld3(content)
|
||||
# libretranslate
|
||||
else:
|
||||
language = self.detect_libretranslate(content)
|
||||
return language
|
||||
# print('++++++++++++++++++++++++++++++++++++++++++++++++++++++')
|
||||
# print(content)
|
||||
language = self.ld.detect(content)
|
||||
if language:
|
||||
# print(language[0])
|
||||
# print('##############################################################')
|
||||
return language[0]
|
||||
|
||||
def translate(self, content, source=None, target="en"): # TODO source target
|
||||
if target not in get_translation_languages():
|
||||
|
@ -498,9 +538,9 @@ class LanguageTranslator:
|
|||
translation = None
|
||||
# TODO LOG and display error
|
||||
if translation == content:
|
||||
print('EQUAL')
|
||||
# print('EQUAL')
|
||||
translation = None
|
||||
return translation
|
||||
return source, translation
|
||||
|
||||
|
||||
LIST_LANGUAGES = {}
|
||||
|
|
|
@ -404,18 +404,33 @@ def api_get_message(message_id, translation_target=None):
|
|||
message = Messages.Message(message_id)
|
||||
if not message.exists():
|
||||
return {"status": "error", "reason": "Unknown uuid"}, 404
|
||||
meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
|
||||
meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target)
|
||||
return meta, 200
|
||||
|
||||
def api_manually_translate_message(message_id, translation_target, translation):
|
||||
def api_message_detect_language(message_id):
    """Run language detection on a message.

    :param message_id: id of the message to analyse
    :return: ``({"language": <detected language>}, 200)`` on success, or an
             error dict with status code 404 when the message does not exist
    """
    message = Messages.Message(message_id)
    if message.exists():
        return {"language": message.detect_language()}, 200
    return {"status": "error", "reason": "Unknown uuid"}, 404
|
||||
|
||||
def api_manually_translate_message(message_id, source, translation_target, translation):
|
||||
message = Messages.Message(message_id)
|
||||
if not message.exists():
|
||||
return {"status": "error", "reason": "Unknown uuid"}, 404
|
||||
if len(translation) > 200000: # TODO REVIEW LIMIT
|
||||
return {"status": "error", "reason": "Max Size reached"}, 400
|
||||
if translation_target not in Language.get_translation_languages():
|
||||
return {"status": "error", "reason": "Unknown Language"}, 400
|
||||
if translation:
|
||||
if len(translation) > 200000: # TODO REVIEW LIMIT
|
||||
return {"status": "error", "reason": "Max Size reached"}, 400
|
||||
all_languages = Language.get_translation_languages()
|
||||
if source not in all_languages:
|
||||
print(source)
|
||||
return {"status": "error", "reason": "Unknown source Language"}, 400
|
||||
message_language = message.get_language()
|
||||
if message_language != source:
|
||||
message.edit_language(message_language, source)
|
||||
if translation:
|
||||
if translation_target not in all_languages:
|
||||
return {"status": "error", "reason": "Unknown target Language"}, 400
|
||||
message.set_translation(translation_target, translation)
|
||||
# TODO SANITYZE translation
|
||||
return None, 200
|
||||
|
|
|
@ -175,6 +175,13 @@ class Message(AbstractObject):
|
|||
# message media
|
||||
# flag is deleted -> event or missing from feeder pass ???
|
||||
|
||||
def get_language(self):
    """Return one language of this message, or None when it has none.

    The value is taken with ``pop()`` from the collection returned by
    ``self.get_languages()``.
    """
    languages = self.get_languages()
    return languages.pop() if languages else None
|
||||
|
||||
def get_translation(self, content=None, source=None, target='fr'):
|
||||
"""
|
||||
Returns translated content
|
||||
|
@ -289,8 +296,14 @@ class Message(AbstractObject):
|
|||
meta['files-names'] = self.get_files_names()
|
||||
if 'reactions' in options:
|
||||
meta['reactions'] = self.get_reactions()
|
||||
if 'language' in options:
|
||||
meta['language'] = self.get_language()
|
||||
if 'translation' in options and translation_target:
|
||||
meta['translation'] = self.translate(content=meta.get('content'), target=translation_target)
|
||||
if meta.get('language'):
|
||||
source = meta['language']
|
||||
else:
|
||||
source = None
|
||||
meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target)
|
||||
|
||||
# meta['encoding'] = None
|
||||
return meta
|
||||
|
|
|
@ -226,7 +226,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC):
|
|||
def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message
|
||||
message = Messages.Message(message[9:])
|
||||
if not options:
|
||||
options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
|
||||
options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}
|
||||
meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target)
|
||||
return meta
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ from lib import Duplicate
|
|||
from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation
|
||||
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
|
||||
from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship
|
||||
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, get_obj_translation, set_obj_translation
|
||||
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_translation, set_obj_translation, delete_obj_translation
|
||||
from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers
|
||||
|
||||
logging.config.dictConfig(ail_logger.get_config(name='ail'))
|
||||
|
@ -313,12 +313,22 @@ class AbstractObject(ABC):
|
|||
def remove_language(self, language):
    """Remove ``language`` from this object by delegating to
    ``remove_obj_language`` with the object's type, subtype and id."""
    subtype = self.get_subtype(r_str=True)
    return remove_obj_language(language, self.type, subtype, self.id)
|
||||
|
||||
def edit_language(self, old_language, new_language):
    """Replace the language of this object.

    :param old_language: language currently set on the object; may be None or
                         empty when the object has no language yet, in which
                         case nothing is removed
    :param new_language: language to set on the object
    """
    # Callers pass the result of get_language(), which is None for an object
    # without any language: there is nothing to remove in that case.
    if old_language:
        self.remove_language(old_language)
    self.add_language(new_language)
|
||||
|
||||
def detect_language(self, field=''):
    """Detect and store the language of this object's content.

    Delegates to ``detect_obj_language`` with the object's type, subtype, id
    and ``self.get_content()``. ``field`` is accepted but not used here.
    """
    subtype = self.get_subtype(r_str=True)
    content = self.get_content()
    return detect_obj_language(self.type, subtype, self.id, content)
|
||||
|
||||
def get_translation(self, language, field=''):
    """Return the translation of this object in ``language`` for ``field``,
    as provided by ``get_obj_translation``."""
    global_id = self.get_global_id()
    return get_obj_translation(global_id, language, field=field)
|
||||
|
||||
def set_translation(self, language, translation, field=''):
    """Save ``translation`` as this object's translation in ``language`` for
    ``field``; returns whatever ``set_obj_translation`` returns."""
    global_id = self.get_global_id()
    return set_obj_translation(global_id, language, translation, field=field)
|
||||
|
||||
def delete_translation(self, language, field=''):
    """Delete this object's translation in ``language`` for ``field``;
    returns whatever ``delete_obj_translation`` returns."""
    global_id = self.get_global_id()
    return delete_obj_translation(global_id, language, field=field)
|
||||
|
||||
def translate(self, content=None, field='', source=None, target='en'):
|
||||
global_id = self.get_global_id()
|
||||
if not content:
|
||||
|
|
|
@ -44,6 +44,7 @@ scrapy-splash>=0.7.2
|
|||
# Languages
|
||||
gcld3
|
||||
libretranslatepy
|
||||
lexilang
|
||||
|
||||
#Graph
|
||||
numpy>1.18.1
|
||||
|
|
|
@ -24,6 +24,7 @@ echo ""
|
|||
echo -e $GREEN"Updating python packages ..."$DEFAULT
|
||||
echo ""
|
||||
pip install -U pylacus
|
||||
pip install -U lexilang
|
||||
|
||||
|
||||
bash ${AIL_BIN}/LAUNCH.sh -lrv
|
||||
|
|
|
@ -240,11 +240,24 @@ def objects_message():
|
|||
@login_read_only
|
||||
def objects_message_translate():
|
||||
message_id = request.form.get('id')
|
||||
source = request.form.get('language_target')
|
||||
target = request.form.get('target')
|
||||
translation = request.form.get('translation')
|
||||
if target == "Don't Translate":
|
||||
target = None
|
||||
resp = chats_viewer.api_manually_translate_message(message_id, target, translation)
|
||||
resp = chats_viewer.api_manually_translate_message(message_id, source, target, translation)
|
||||
if resp[1] != 200:
|
||||
return create_json_response(resp[0], resp[1])
|
||||
else:
|
||||
return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target))
|
||||
|
||||
@chats_explorer.route("/objects/message/detect/language", methods=['GET'])
|
||||
@login_required
|
||||
@login_read_only
|
||||
def objects_message_detect_language():
|
||||
message_id = request.args.get('id')
|
||||
target = request.args.get('target')
|
||||
resp = chats_viewer.api_message_detect_language(message_id)
|
||||
if resp[1] != 200:
|
||||
return create_json_response(resp[0], resp[1])
|
||||
else:
|
||||
|
|
|
@ -81,24 +81,6 @@
|
|||
<hr class="m-1">
|
||||
<pre class="my-0 text-secondary">{{ message['translation'] }}</pre>
|
||||
|
||||
{% set mess_id_escape= message['id'] | replace("/", "_") %}
|
||||
<button class="btn btn-light p-0" type="button" data-toggle="collapse" data-target="#collapseTrans{{ mess_id_escape }}" aria-expanded="false" aria-controls="collapseTrans{{ mess_id_escape }}">
|
||||
<i class="fas fa-language"></i>
|
||||
</button>
|
||||
<div class="collapse" id="collapseTrans{{ mess_id_escape }}">
|
||||
<div class="card card-body">
|
||||
<form method="post" action="{{ url_for('chats_explorer.objects_message_translate') }}" target="_blank">
|
||||
<input type="text" id="id" name="id" value="{{message['id']}}" hidden>
|
||||
<input type="text" id="target" name="target" value="{{translation_target}}" hidden>
|
||||
<span>{{translation_target}}:</span>
|
||||
<textarea class="form-control" id="translation" name="translation">{{ message['translation'] }}</textarea>
|
||||
<button class="btn btn-primary" type="submit">
|
||||
<i class="fas fa-pen-alt"> Manual Translation</i>
|
||||
</button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endif %}
|
||||
{% for reaction in message['reactions'] %}
|
||||
<span class="border rounded px-1">{{ reaction }} {{ message['reactions'][reaction] }}</span>
|
||||
|
@ -113,10 +95,47 @@
|
|||
<span class="badge badge-{{ bootstrap_label[loop.index0 % 5] }}">{{ tag }}</span>
|
||||
{% endfor %}
|
||||
<div class="">
|
||||
|
||||
{% set mess_id_escape= message['id'] | replace("/", "_") %}
|
||||
<span class="btn btn-outline-dark p-0 px-1" type="button" data-toggle="collapse" data-target="#collapseTrans{{ mess_id_escape }}" aria-expanded="false" aria-controls="collapseTrans{{ mess_id_escape }}">
|
||||
<i class="fas fa-language"></i> {% if message['language'] %}{{ message['language'] }}{% endif %}
|
||||
</span>
|
||||
<div class="collapse" id="collapseTrans{{ mess_id_escape }}">
|
||||
<div class="card card-body">
|
||||
<form method="post" action="{{ url_for('chats_explorer.objects_message_translate') }}" target="_blank">
|
||||
<input type="text" id="id" name="id" value="{{message['id']}}" hidden>
|
||||
<span class="badge badge-primary">Source:</span>
|
||||
<span class="">
|
||||
<select id="language_target" name="language_target" class="form-select" aria-label="Message Language" onchange="$('#translation').val('');">
|
||||
<option selected value="{{ message['language'] }}">{{ message['language'] }}</option>
|
||||
{% for language in translation_languages %}
|
||||
<option value="{{ language }}">{{ translation_languages[language] }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</span>
|
||||
{% if translation_target %}
|
||||
<input type="text" id="target" name="target" value="{{translation_target}}" hidden>
|
||||
<span class="badge badge-primary">Target:</span><span>{{translation_target}}</span>
|
||||
<textarea class="form-control" id="translation" name="translation">{{ message['translation'] }}</textarea>
|
||||
<button class="btn btn-dark" type="submit">
|
||||
<i class="fas fa-pen-alt"> Update Language or Translation</i>
|
||||
</button>
|
||||
{% else %}
|
||||
<button class="btn btn-dark" type="submit">
|
||||
<i class="fas fa-pen-alt"> Update Language</i>
|
||||
</button>
|
||||
{% endif %}
|
||||
</form>
|
||||
<div>
|
||||
<a class="btn btn-primary" href="{{ url_for('chats_explorer.objects_message_detect_language')}}?id={{ message['id'] }}">
|
||||
<i class="fas fa-redo"></i> Detect Language
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<a class="btn btn-light btn-sm text-secondary px-1" href="{{ url_for('correlation.show_correlation')}}?type={{ message['type'] }}&subtype={{ message['subtype'] }}&id={{ message['id'] }}"><i class="fas fa-project-diagram"></i></a>
|
||||
<a class="btn btn-light btn-sm text-secondary px-1" href="{{ message['link'] }}"><i class="fas fa-eye"></i></a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
<option selected>Don't Translate</option>
|
||||
{% else %}
|
||||
<option selected value="{{ translation_target }}">{{ translation_target }}</option>
|
||||
<option>Don't Translate</option>
|
||||
{% endif %}
|
||||
{% for language in translation_languages %}
|
||||
<option value="{{ language }}">{{ translation_languages[language] }}</option>
|
||||
|
|
Loading…
Reference in a new issue