fix: [ocr] filter ocr supported languages + fix type of object accepted by the tracker

This commit is contained in:
terrtia 2024-04-26 10:31:31 +02:00
parent 26f9e84d97
commit 35502d955f
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 29 additions and 10 deletions

View file

@ -81,10 +81,10 @@ def get_default_correlation_objects():
return AIL_OBJECTS_CORRELATIONS_DEFAULT return AIL_OBJECTS_CORRELATIONS_DEFAULT
def get_obj_queued(): def get_obj_queued():
return ['item', 'image'] return ['item', 'image', 'message', 'ocr']
def get_objects_tracked(): def get_objects_tracked():
return ['decoded', 'item', 'pgp', 'message', 'title'] return ['decoded', 'item', 'pgp', 'message', 'ocr', 'title']
def get_objects_retro_hunted(): def get_objects_retro_hunted():
return ['decoded', 'item', 'message'] return ['decoded', 'item', 'message']

View file

@ -296,14 +296,24 @@ def extract_text(image_path, languages, threshold=0.2):
extracted.append((bbox, text)) extracted.append((bbox, text))
return extracted return extracted
# TODO OCRS Class
def get_ids(): def get_ocr_languages():
return r_object.smembers(f'ocr:all') return {'af', 'ar', 'as', 'az', 'be', 'bg', 'bh', 'bs', 'cs', 'cy', 'da', 'de', 'en', 'es', 'et', 'fa', 'fr', 'ga', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'kn', 'ko', 'ku', 'la', 'lt', 'lv', 'mi', 'mn', 'mr', 'ms', 'mt', 'ne', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'zh'}
def get_all_ocrs_objects(filters={}):
for obj_id in get_ids(): def sanityze_ocr_languages(languages, ocr_languages=None):
yield Ocr(obj_id) langs = set()
if not ocr_languages:
ocr_languages = get_ocr_languages()
for lang in languages:
if lang in ocr_languages:
if lang == 'zh':
langs.add('ch_sim')
elif lang == 'sr':
langs.add('rs_latin')
else:
langs.add(lang)
return langs
class Ocrs(AbstractDaterangeObjects): class Ocrs(AbstractDaterangeObjects):
""" """

View file

@ -22,8 +22,9 @@ from lib import chats_viewer
from lib.objects import Messages from lib.objects import Messages
from lib.objects import Ocrs from lib.objects import Ocrs
# Default to eng # Default to eng
def get_model_languages(obj, add_en=True): def get_model_languages(obj, ocr_languages, add_en=True):
if add_en: if add_en:
model_languages = {'en'} model_languages = {'en'}
else: else:
@ -53,6 +54,8 @@ def get_model_languages(obj, add_en=True):
model_languages.add(lang) model_languages.add(lang)
return model_languages return model_languages
model_languages = Ocrs.sanityze_ocr_languages(model_languages, ocr_languages=ocr_languages)
return model_languages return model_languages
# TODO thread # TODO thread
@ -72,6 +75,8 @@ class OcrExtractor(AbstractModule):
config_loader = ConfigLoader() config_loader = ConfigLoader()
self.r_cache = config_loader.get_redis_conn("Redis_Cache") self.r_cache = config_loader.get_redis_conn("Redis_Cache")
self.ocr_languages = Ocrs.get_ocr_languages()
# Send module state to logs # Send module state to logs
self.logger.info(f'Module {self.module_name} initialized') self.logger.info(f'Module {self.module_name} initialized')
@ -95,7 +100,7 @@ class OcrExtractor(AbstractModule):
if not ocr.exists(): if not ocr.exists():
path = image.get_filepath() path = image.get_filepath()
languages = get_model_languages(image) languages = get_model_languages(image, self.ocr_languages)
print(image.id, languages) print(image.id, languages)
texts = Ocrs.extract_text(path, languages) texts = Ocrs.extract_text(path, languages)
if texts: if texts:

View file

@ -140,6 +140,10 @@
<input class="custom-control-input" type="checkbox" name="message_obj" id="message_obj" checked=""> <input class="custom-control-input" type="checkbox" name="message_obj" id="message_obj" checked="">
<label class="custom-control-label" for="message_obj"><i class="fas fa-comment-dots"></i>&nbsp;Message <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Messages from Chats"></i></label> <label class="custom-control-label" for="message_obj"><i class="fas fa-comment-dots"></i>&nbsp;Message <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Messages from Chats"></i></label>
</div> </div>
{# OCR object switch; icon layout mirrors the Message switch: type icon first, info-circle as tooltip trigger #}
<div class="custom-control custom-switch mt-1">
<input class="custom-control-input" type="checkbox" name="ocr_obj" id="ocr_obj" checked="">
<label class="custom-control-label" for="ocr_obj"><i class="fas fa-expand"></i>&nbsp;OCR <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Text extracted from Images"></i></label>
</div>
{# <div class="custom-control custom-switch mt-1">#} {# <div class="custom-control custom-switch mt-1">#}
{# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#} {# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#}