chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images

This commit is contained in:
terrtia 2024-04-10 16:43:54 +02:00
parent 61701e2fcc
commit ed13e8bca4
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
10 changed files with 142 additions and 33 deletions

View file

@ -330,6 +330,11 @@ def get_obj_languages(obj_type, obj_subtype, obj_id):
def get_obj_language_stats(obj_type, obj_subtype, obj_id):
    """Return every language recorded for an object together with its score.

    Reads the whole sorted set ``obj:langs:stat:<type>:<subtype>:<id>``
    (range 0..-1) with ``withscores=True``.
    """
    stats_key = f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}'
    return r_lang.zrange(stats_key, 0, -1, withscores=True)
def get_obj_main_language(obj_type, obj_subtype, obj_id):
    """Return the top-scored language of an object, or None if none is stored.

    Only the first entry of the reverse-ordered sorted set
    ``obj:langs:stat:<type>:<subtype>:<id>`` is fetched.
    """
    top_languages = r_lang.zrevrange(f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}', 0, 0)
    return top_languages[0] if top_languages else None
# TODO ADD language to CHAT GLOBAL SET # TODO ADD language to CHAT GLOBAL SET
def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s) def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s)
if not obj_subtype: if not obj_subtype:

View file

@ -288,6 +288,10 @@ def get_obj_chat(chat_type, chat_subtype, chat_id):
elif chat_type == 'chat-thread': elif chat_type == 'chat-thread':
return ChatThreads.ChatThread(chat_id, chat_subtype) return ChatThreads.ChatThread(chat_id, chat_subtype)
def get_obj_chat_from_global_id(chat_gid):
    """Resolve a ``type:subtype:id`` global id through get_obj_chat()."""
    # maxsplit=2: any further ':' stays inside the id part.
    gid_parts = chat_gid.split(':', 2)
    chat_type, chat_subtype, chat_id = gid_parts
    return get_obj_chat(chat_type, chat_subtype, chat_id)
def get_obj_chat_meta(obj_chat, new_options=set()): def get_obj_chat_meta(obj_chat, new_options=set()):
options = {} options = {}
if obj_chat.type == 'chat': if obj_chat.type == 'chat':

View file

@ -50,7 +50,7 @@ class Image(AbstractDaterangeObject):
if flask_context: if flask_context:
url = url_for('correlation.show_correlation', type=self.type, id=self.id) url = url_for('correlation.show_correlation', type=self.type, id=self.id)
else: else:
url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' url = f'/correlation/show?type={self.type}&id={self.id}'
return url return url
def get_svg_icon(self): def get_svg_icon(self):
@ -109,6 +109,20 @@ class Image(AbstractDaterangeObject):
def get_screenshot_dir():
    """Return the module-level IMAGE_FOLDER path."""
    return IMAGE_FOLDER
def get_all_images():
    """Return the ids of all files found under the screenshot directory.

    An id is the file path with the IMAGE_FOLDER prefix removed and every
    '/' stripped, i.e. the nested sub-directory names and the file name
    concatenated.
    """
    return [
        f'{folder}{filename}'.replace(IMAGE_FOLDER, '').replace('/', '')
        for folder, _subdirs, filenames in os.walk(get_screenshot_dir())
        for filename in filenames
    ]
def get_all_images_objects(filters={}):
    """Lazily yield an Image object for every id from get_all_images().

    ``filters`` is accepted but not used by this function.
    """
    yield from map(Image, get_all_images())
def create(content, size_limit=5000000, b64=False, force=False): def create(content, size_limit=5000000, b64=False, force=False):
size = (len(content)*3) / 4 size = (len(content)*3) / 4
@ -134,5 +148,6 @@ class Images(AbstractDaterangeObjects):
# if __name__ == '__main__': # if __name__ == '__main__':
# print(json.dumps(get_all_images()))
# name_to_search = '29ba' # name_to_search = '29ba'
# print(search_screenshots_by_name(name_to_search)) # print(search_screenshots_by_name(name_to_search))

View file

@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN'])
################################## ##################################
from lib.objects.abstract_object import AbstractObject from lib.objects.abstract_object import AbstractObject
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from packages import Date
# from lib import Language # from lib import Language
# from lib.data_retention_engine import update_obj_date, get_obj_date_first # from lib.data_retention_engine import update_obj_date, get_obj_date_first
@ -49,10 +50,24 @@ class Ocr(AbstractObject):
global_id = self.get_global_id() global_id = self.get_global_id()
content = r_cache.get(f'content:{global_id}') content = r_cache.get(f'content:{global_id}')
if not content: if not content:
content = '' dict_content = {}
for extracted in r_object.smembers(f'ocr:{self.id}'): for extracted in r_object.smembers(f'ocr:{self.id}'):
text = extracted.split(':', 4)[-1] extracted = extracted.split(':', 4)
content = f'{content}\n{text}' x, y = extracted[0].split(',', 1)
# get text line, y +- 20
rounded_y = round(int(y) / 20) * 20
if rounded_y not in dict_content:
dict_content[rounded_y] = []
dict_content[rounded_y].append((int(x), int(y), extracted[-1]))
content = ''
l_key = sorted(dict_content.keys())
for key in l_key:
dict_content[key] = sorted(dict_content[key], key=lambda c: c[0])
for text in dict_content[key]:
content = f'{content} {text[2]}'
content = f'{content}\n'
# Set Cache # Set Cache
if content: if content:
global_id = self.get_global_id() global_id = self.get_global_id()
@ -66,8 +81,18 @@ class Ocr(AbstractObject):
return content.encode() return content.encode()
def get_date(self):  # TODO
    """Return the value of Date.get_today_date_str().

    Placeholder (still marked TODO): the date is not read from the object.
    """
    today = Date.get_today_date_str()
    return today
def get_source(self):  # TODO
    """
    Return the source/feeder name.

    Currently the constant 'ocr'.
    """
    source = 'ocr'
    return source
    # l_source = self.id.split('/')[:-2]
    # return os.path.join(*l_source)
def get_basename(self):  # TODO
    """Return the object's basename; currently the constant 'ocr'."""
    basename = 'ocr'
    return basename
def get_link(self, flask_context=False): def get_link(self, flask_context=False):
if flask_context: if flask_context:
@ -77,7 +102,7 @@ class Ocr(AbstractObject):
return url return url
def get_svg_icon(self):
    """Return the icon description (style, glyph, color, radius) for this object."""
    icon = {
        'style': 'fas',
        'icon': '\uf065',
        'color': 'yellow',
        'radius': 5,
    }
    return icon
def get_image_path(self): def get_image_path(self):
rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:]) rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:])
@ -138,18 +163,17 @@ class Ocr(AbstractObject):
# meta['language'] = self.get_language() # meta['language'] = self.get_language()
return meta return meta
def get_objs_container(self):
    """Return the global ids of the chat objects this object is correlated with.

    Looks up the first correlated 'chat', 'chat-subchannel' and
    'chat-thread' object and collects the ones that exist.

    :return: set of global id strings (possibly empty)
    """
    objs_containers = set()
    for container_type in ('chat', 'chat-subchannel', 'chat-thread'):
        container = self.get_first_correlation(container_type)
        # get_first_correlation() gives None when nothing is correlated;
        # never let that None end up in the returned set.
        if container:
            objs_containers.add(container)
    return objs_containers
def create_coord_str(self, bbox): def create_coord_str(self, bbox):
c1, c2, c3, c4 = bbox c1, c2, c3, c4 = bbox
@ -195,8 +219,10 @@ class Ocr(AbstractObject):
return r_object.srem(f'ocr:{self.id}', val) return r_object.srem(f'ocr:{self.id}', val)
def create(self, extracted_texts, tags=[]): def create(self, extracted_texts, tags=[]):
r_object.sadd(f'{self.type}:all', self.id)
for extracted in extracted_texts: for extracted in extracted_texts:
bbox, text = extracted bbox, text = extracted
if len(text) > 1:
str_coords = self.create_coord_str(bbox) str_coords = self.create_coord_str(bbox)
self.add(str_coords, text) self.add(str_coords, text)
self.add_correlation('image', '', self.id) self.add_correlation('image', '', self.id)
@ -206,7 +232,7 @@ class Ocr(AbstractObject):
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
def delete(self):
    """Remove the stored OCR entries of this object (test helper, unclean).

    Deletes the ``ocr:<id>`` set holding the extracted texts and also
    removes the id from the ``<type>:all`` index filled by create(), so
    the id is no longer listed once its content is gone.
    Correlations and other metadata are NOT cleaned up.
    """
    r_object.delete(f'ocr:{self.id}')
    # Keep the global index consistent with create(), which sadd()s the id.
    r_object.srem(f'{self.type}:all', self.id)
def draw_bounding_boxs(self): def draw_bounding_boxs(self):
img = Image.open(self.get_image_path()).convert("RGBA") img = Image.open(self.get_image_path()).convert("RGBA")
@ -233,8 +259,9 @@ def create(obj_id, detections, tags=[]):
# TODO preload languages # TODO preload languages
def extract_text(image_path, languages, threshold=0.2): def extract_text(image_path, languages, threshold=0.2):
import easyocr import easyocr
reader = easyocr.Reader(languages) reader = easyocr.Reader(languages, verbose=False)
texts = reader.readtext(image_path) texts = reader.readtext(image_path)
# print(texts)
extracted = [] extracted = []
for bbox, text, score in texts: for bbox, text, score in texts:
if score > threshold: if score > threshold:
@ -242,3 +269,11 @@ def extract_text(image_path, languages, threshold=0.2):
return extracted return extracted
# TODO OCRS Class # TODO OCRS Class
def get_ids():
    """Return the members of the ``ocr:all`` set."""
    all_ocrs_key = 'ocr:all'
    return r_object.smembers(all_ocrs_key)
def get_all_ocrs_objects(filters={}):
    """Lazily yield an Ocr object for every id returned by get_ids().

    ``filters`` is accepted but not used by this function.
    """
    yield from map(Ocr, get_ids())

View file

@ -25,7 +25,7 @@ from lib import Duplicate
from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation, get_obj_main_language
from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers
logging.config.dictConfig(ail_logger.get_config(name='ail')) logging.config.dictConfig(ail_logger.get_config(name='ail'))
@ -237,6 +237,11 @@ class AbstractObject(ABC):
""" """
return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type]) return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type])
def get_first_correlation(self, obj_type):
    """Return the global id of one correlated object of ``obj_type``, or None.

    The result is ``<obj_type>:<value>`` where the value is taken with
    ``.pop()`` from the correlations returned by get_correlation().
    """
    correlated = self.get_correlation(obj_type).get(obj_type)
    if not correlated:
        return None
    return f'{obj_type}:{correlated.pop()}'
def get_correlations(self, filter_types=[], unpack=False): def get_correlations(self, filter_types=[], unpack=False):
""" """
Get object correlations Get object correlations
@ -330,6 +335,9 @@ class AbstractObject(ABC):
def get_obj_language_stats(self): def get_obj_language_stats(self):
return get_obj_language_stats(self.type, self.get_subtype(r_str=True), self.id) return get_obj_language_stats(self.type, self.get_subtype(r_str=True), self.id)
def get_main_language(self):
    """Return get_obj_main_language() for this object's type, subtype and id."""
    subtype = self.get_subtype(r_str=True)
    return get_obj_main_language(self.type, subtype, self.id)
def get_translation(self, language, field=''): def get_translation(self, language, field=''):
return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container()) return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container())

View file

@ -296,6 +296,8 @@ def is_filtered(obj, filters):
def obj_iterator(obj_type, filters): def obj_iterator(obj_type, filters):
if obj_type == 'decoded': if obj_type == 'decoded':
return get_all_decodeds_objects(filters=filters) return get_all_decodeds_objects(filters=filters)
elif obj_type == 'image':
return Images.get_all_images_objects(filters=filters)
elif obj_type == 'item': elif obj_type == 'item':
return get_all_items_objects(filters=filters) return get_all_items_objects(filters=filters)
elif obj_type == 'pgp': elif obj_type == 'pgp':

View file

@ -128,7 +128,7 @@ class Global(AbstractModule):
else: else:
self.logger.info(f"Empty Item: {message} not processed") self.logger.info(f"Empty Item: {message} not processed")
elif self.obj.type == 'message': elif self.obj.type == 'message' or self.obj.type == 'ocr':
# TODO send to specific object queue => image, ... # TODO send to specific object queue => image, ...
self.add_message_to_queue(obj=self.obj, queue='Item') self.add_message_to_queue(obj=self.obj, queue='Item')
elif self.obj.type == 'image': elif self.obj.type == 'image':

View file

@ -17,8 +17,45 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages # Import Project packages
################################## ##################################
from modules.abstract_module import AbstractModule from modules.abstract_module import AbstractModule
from lib import chats_viewer
from lib.objects import Messages
from lib.objects import Ocrs from lib.objects import Ocrs
# Default to eng
def get_model_languages(obj, add_en=True):
    """Pick the languages to use for an object.

    Starts from {'en'} (or an empty set when ``add_en`` is false), then
    adds the first language found, looking in order at:
    the correlated message, the correlated chat-subchannel, the
    correlated chat.

    :param obj: object exposing get_first_correlation()
    :param add_en: include 'en' in the result
    :return: set of language codes
    """
    model_languages = {'en'} if add_en else set()

    # NOTE(review): nesting rebuilt from a flattened diff; the early returns
    # are assumed to fire only when a language was found -- confirm.
    message_gid = obj.get_first_correlation('message')
    if message_gid:
        # Message is built from the id part of the 'type:subtype:id' global id.
        message = Messages.Message(message_gid.split(':', 2)[-1])
        lang = message.get_language()
        if lang:
            model_languages.add(lang)
            return model_languages

    # Fall back to the main language of the subchannel, then of the chat.
    for container_type in ('chat-subchannel', 'chat'):
        container_gid = obj.get_first_correlation(container_type)
        if not container_gid:
            continue
        container = chats_viewer.get_obj_chat_from_global_id(container_gid)
        lang = container.get_main_language()
        if lang:
            model_languages.add(lang)
            return model_languages

    return model_languages
# TODO thread
class OcrExtractor(AbstractModule): class OcrExtractor(AbstractModule):
""" """
@ -36,16 +73,16 @@ class OcrExtractor(AbstractModule):
def compute(self, message): def compute(self, message):
image = self.get_obj() image = self.get_obj()
print(image)
path = image.get_filepath() path = image.get_filepath()
languages = ['en', 'ru'] print(image)
languages = get_model_languages(image)
print(languages)
ocr = Ocrs.Ocr(image.id) ocr = Ocrs.Ocr(image.id)
ocr.delete()
if not ocr.exists(): if not ocr.exists():
# TODO Get Language to extract -> add en by default
texts = Ocrs.extract_text(path, languages) texts = Ocrs.extract_text(path, languages)
print(texts)
if texts: if texts:
ocr = Ocrs.create(image.id, texts) ocr = Ocrs.create(image.id, texts)
self.add_message_to_queue(ocr) self.add_message_to_queue(ocr)
@ -55,3 +92,6 @@ if __name__ == '__main__':
module = OcrExtractor() module = OcrExtractor()
module.run() module.run()
# from lib.objects import Images
# module.obj = Images.Image('')
# module.compute('')

View file

@ -62,7 +62,7 @@ if __name__ == "__main__":
obj_type = args.type obj_type = args.type
if not is_object_type(obj_type): if not is_object_type(obj_type):
raise Exception(f'Invalid Object Type: {obj_type}') raise Exception(f'Invalid Object Type: {obj_type}')
if obj_type not in ['item', 'message']: # TODO image if obj_type not in ['image', 'item', 'message']:
raise Exception(f'Currently not supported Object Type: {obj_type}') raise Exception(f'Currently not supported Object Type: {obj_type}')
modulename = args.module modulename = args.module

View file

@ -92,7 +92,7 @@
</li> </li>
</ul> </ul>
{% with obj_type='image', obj_id=meta['id'], obj_subtype='' %} {% with obj_type='ocr', obj_id=meta['id'], obj_subtype='' %}
{% include 'modals/investigations_register_obj.html' %} {% include 'modals/investigations_register_obj.html' %}
{% endwith %} {% endwith %}
<button type="button" class="btn btn-primary" data-toggle="modal" data-target="#investigations_register_obj_modal"> <button type="button" class="btn btn-primary" data-toggle="modal" data-target="#investigations_register_obj_modal">