mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-22 22:27:17 +00:00
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images
This commit is contained in:
parent
61701e2fcc
commit
ed13e8bca4
10 changed files with 142 additions and 33 deletions
|
@ -330,6 +330,11 @@ def get_obj_languages(obj_type, obj_subtype, obj_id):
|
||||||
def get_obj_language_stats(obj_type, obj_subtype, obj_id):
|
def get_obj_language_stats(obj_type, obj_subtype, obj_id):
|
||||||
return r_lang.zrange(f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}', 0, -1, withscores=True)
|
return r_lang.zrange(f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}', 0, -1, withscores=True)
|
||||||
|
|
||||||
|
def get_obj_main_language(obj_type, obj_subtype, obj_id):
    """
    Return the highest-scored language recorded for an object.

    Reads the sorted set ``obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}``
    in descending score order and keeps only its first member.

    :return: the top language, or None when the sorted set is empty
    """
    stats_key = f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}'
    top_language = r_lang.zrevrange(stats_key, 0, 0)
    return top_language[0] if top_language else None
|
||||||
|
|
||||||
# TODO ADD language to CHAT GLOBAL SET
|
# TODO ADD language to CHAT GLOBAL SET
|
||||||
def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s)
|
def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s)
|
||||||
if not obj_subtype:
|
if not obj_subtype:
|
||||||
|
|
|
@ -288,6 +288,10 @@ def get_obj_chat(chat_type, chat_subtype, chat_id):
|
||||||
elif chat_type == 'chat-thread':
|
elif chat_type == 'chat-thread':
|
||||||
return ChatThreads.ChatThread(chat_id, chat_subtype)
|
return ChatThreads.ChatThread(chat_id, chat_subtype)
|
||||||
|
|
||||||
|
def get_obj_chat_from_global_id(chat_gid):
    """
    Build a chat object from its global id.

    The global id is split on the first two ':' only, giving
    ``type:subtype:id``; any further ':' stays part of the id.

    :param chat_gid: global id string of the chat object
    :return: whatever get_obj_chat() returns for the parsed triple
    """
    gid_parts = chat_gid.split(':', 2)
    # Unpacking raises ValueError when the global id has fewer than 3 fields.
    chat_type, chat_subtype, chat_id = gid_parts
    return get_obj_chat(chat_type, chat_subtype, chat_id)
|
||||||
|
|
||||||
def get_obj_chat_meta(obj_chat, new_options=set()):
|
def get_obj_chat_meta(obj_chat, new_options=set()):
|
||||||
options = {}
|
options = {}
|
||||||
if obj_chat.type == 'chat':
|
if obj_chat.type == 'chat':
|
||||||
|
|
|
@ -50,7 +50,7 @@ class Image(AbstractDaterangeObject):
|
||||||
if flask_context:
|
if flask_context:
|
||||||
url = url_for('correlation.show_correlation', type=self.type, id=self.id)
|
url = url_for('correlation.show_correlation', type=self.type, id=self.id)
|
||||||
else:
|
else:
|
||||||
url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
|
url = f'/correlation/show?type={self.type}&id={self.id}'
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def get_svg_icon(self):
|
def get_svg_icon(self):
|
||||||
|
@ -109,6 +109,20 @@ class Image(AbstractDaterangeObject):
|
||||||
def get_screenshot_dir():
|
def get_screenshot_dir():
|
||||||
return IMAGE_FOLDER
|
return IMAGE_FOLDER
|
||||||
|
|
||||||
|
def get_all_images():
    """
    List the ids of all image files stored under the screenshot directory.

    The directory returned by get_screenshot_dir() is walked recursively.
    An image id is the file path relative to that directory with every
    path separator removed.

    :return: list of image ids (str), in os.walk() order
    """
    image_dir = get_screenshot_dir()
    images = []
    for root, _dirs, files in os.walk(image_dir):
        for filename in files:
            # Join with a real separator and take the path relative to the
            # walked directory, instead of string-replacing the folder name:
            # this works whether or not the configured folder ends with a
            # separator and never strips a matching substring further down
            # the path.
            rel_path = os.path.relpath(os.path.join(root, filename), image_dir)
            images.append(rel_path.replace(os.sep, ''))
    return images
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_images_objects(filters=None):
    """
    Yield an Image object for every image id returned by get_all_images().

    :param filters: accepted for signature parity with the other object
        iterators; currently ignored. Defaults to None rather than a shared
        mutable dict.
    :return: generator of Image objects
    """
    for image_id in get_all_images():
        yield Image(image_id)
|
||||||
|
|
||||||
|
|
||||||
def create(content, size_limit=5000000, b64=False, force=False):
|
def create(content, size_limit=5000000, b64=False, force=False):
|
||||||
size = (len(content)*3) / 4
|
size = (len(content)*3) / 4
|
||||||
|
@ -134,5 +148,6 @@ class Images(AbstractDaterangeObjects):
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
# if __name__ == '__main__':
|
||||||
|
# print(json.dumps(get_all_images()))
|
||||||
# name_to_search = '29ba'
|
# name_to_search = '29ba'
|
||||||
# print(search_screenshots_by_name(name_to_search))
|
# print(search_screenshots_by_name(name_to_search))
|
||||||
|
|
|
@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN'])
|
||||||
##################################
|
##################################
|
||||||
from lib.objects.abstract_object import AbstractObject
|
from lib.objects.abstract_object import AbstractObject
|
||||||
from lib.ConfigLoader import ConfigLoader
|
from lib.ConfigLoader import ConfigLoader
|
||||||
|
from packages import Date
|
||||||
# from lib import Language
|
# from lib import Language
|
||||||
# from lib.data_retention_engine import update_obj_date, get_obj_date_first
|
# from lib.data_retention_engine import update_obj_date, get_obj_date_first
|
||||||
|
|
||||||
|
@ -49,10 +50,24 @@ class Ocr(AbstractObject):
|
||||||
global_id = self.get_global_id()
|
global_id = self.get_global_id()
|
||||||
content = r_cache.get(f'content:{global_id}')
|
content = r_cache.get(f'content:{global_id}')
|
||||||
if not content:
|
if not content:
|
||||||
content = ''
|
dict_content = {}
|
||||||
for extracted in r_object.smembers(f'ocr:{self.id}'):
|
for extracted in r_object.smembers(f'ocr:{self.id}'):
|
||||||
text = extracted.split(':', 4)[-1]
|
extracted = extracted.split(':', 4)
|
||||||
content = f'{content}\n{text}'
|
x, y = extracted[0].split(',', 1)
|
||||||
|
# get text line, y +- 20
|
||||||
|
rounded_y = round(int(y) / 20) * 20
|
||||||
|
if rounded_y not in dict_content:
|
||||||
|
dict_content[rounded_y] = []
|
||||||
|
dict_content[rounded_y].append((int(x), int(y), extracted[-1]))
|
||||||
|
|
||||||
|
content = ''
|
||||||
|
l_key = sorted(dict_content.keys())
|
||||||
|
for key in l_key:
|
||||||
|
dict_content[key] = sorted(dict_content[key], key=lambda c: c[0])
|
||||||
|
for text in dict_content[key]:
|
||||||
|
content = f'{content} {text[2]}'
|
||||||
|
content = f'{content}\n'
|
||||||
|
|
||||||
# Set Cache
|
# Set Cache
|
||||||
if content:
|
if content:
|
||||||
global_id = self.get_global_id()
|
global_id = self.get_global_id()
|
||||||
|
@ -66,8 +81,18 @@ class Ocr(AbstractObject):
|
||||||
return content.encode()
|
return content.encode()
|
||||||
|
|
||||||
def get_date(self): # TODO
|
def get_date(self): # TODO
|
||||||
timestamp = self.get_timestamp()
|
return Date.get_today_date_str()
|
||||||
return datetime.utcfromtimestamp(float(timestamp)).strftime('%Y%m%d')
|
|
||||||
|
def get_source(self):  # TODO
    """
    Return the source/feeder name of this object.

    Placeholder implementation: always returns the constant 'ocr'.
    """
    # TODO: a path-derived source (the id split on '/', minus its last two
    # components) was sketched here but is not enabled.
    return 'ocr'
|
||||||
|
|
||||||
|
def get_basename(self):  # TODO
    """
    Return the basename of this object.

    Placeholder implementation: always returns the constant 'ocr'.
    """
    basename = 'ocr'
    return basename
|
||||||
|
|
||||||
def get_link(self, flask_context=False):
|
def get_link(self, flask_context=False):
|
||||||
if flask_context:
|
if flask_context:
|
||||||
|
@ -77,7 +102,7 @@ class Ocr(AbstractObject):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def get_svg_icon(self):
|
def get_svg_icon(self):
|
||||||
return {'style': 'fas', 'icon': '\uf20a', 'color': 'yellow', 'radius': 5}
|
return {'style': 'fas', 'icon': '\uf065', 'color': 'yellow', 'radius': 5}
|
||||||
|
|
||||||
def get_image_path(self):
|
def get_image_path(self):
|
||||||
rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:])
|
rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:])
|
||||||
|
@ -138,18 +163,17 @@ class Ocr(AbstractObject):
|
||||||
# meta['language'] = self.get_language()
|
# meta['language'] = self.get_language()
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
def get_objs_container(self):
    """
    Collect the global ids of the containers this object is correlated with.

    Looks up the first correlated 'chat', 'chat-subchannel' and
    'chat-thread' through get_first_correlation() and keeps the ones that
    exist.

    :return: set of container global ids; empty when nothing is correlated
    """
    objs_containers = set()
    for container_type in ('chat', 'chat-subchannel', 'chat-thread'):
        container = self.get_first_correlation(container_type)
        # get_first_correlation() yields None when there is no correlation of
        # that type; never let None leak into the container set (the chat
        # lookup used to be added unguarded).
        if container:
            objs_containers.add(container)
    return objs_containers
|
|
||||||
|
|
||||||
def create_coord_str(self, bbox):
|
def create_coord_str(self, bbox):
|
||||||
c1, c2, c3, c4 = bbox
|
c1, c2, c3, c4 = bbox
|
||||||
|
@ -195,8 +219,10 @@ class Ocr(AbstractObject):
|
||||||
return r_object.srem(f'ocr:{self.id}', val)
|
return r_object.srem(f'ocr:{self.id}', val)
|
||||||
|
|
||||||
def create(self, extracted_texts, tags=[]):
|
def create(self, extracted_texts, tags=[]):
|
||||||
|
r_object.sadd(f'{self.type}:all', self.id)
|
||||||
for extracted in extracted_texts:
|
for extracted in extracted_texts:
|
||||||
bbox, text = extracted
|
bbox, text = extracted
|
||||||
|
if len(text) > 1:
|
||||||
str_coords = self.create_coord_str(bbox)
|
str_coords = self.create_coord_str(bbox)
|
||||||
self.add(str_coords, text)
|
self.add(str_coords, text)
|
||||||
self.add_correlation('image', '', self.id)
|
self.add_correlation('image', '', self.id)
|
||||||
|
@ -206,7 +232,7 @@ class Ocr(AbstractObject):
|
||||||
|
|
||||||
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
|
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
|
||||||
def delete(self):
|
def delete(self):
|
||||||
pass
|
r_object.delete(f'ocr:{self.id}')
|
||||||
|
|
||||||
def draw_bounding_boxs(self):
|
def draw_bounding_boxs(self):
|
||||||
img = Image.open(self.get_image_path()).convert("RGBA")
|
img = Image.open(self.get_image_path()).convert("RGBA")
|
||||||
|
@ -233,8 +259,9 @@ def create(obj_id, detections, tags=[]):
|
||||||
# TODO preload languages
|
# TODO preload languages
|
||||||
def extract_text(image_path, languages, threshold=0.2):
|
def extract_text(image_path, languages, threshold=0.2):
|
||||||
import easyocr
|
import easyocr
|
||||||
reader = easyocr.Reader(languages)
|
reader = easyocr.Reader(languages, verbose=False)
|
||||||
texts = reader.readtext(image_path)
|
texts = reader.readtext(image_path)
|
||||||
|
# print(texts)
|
||||||
extracted = []
|
extracted = []
|
||||||
for bbox, text, score in texts:
|
for bbox, text, score in texts:
|
||||||
if score > threshold:
|
if score > threshold:
|
||||||
|
@ -242,3 +269,11 @@ def extract_text(image_path, languages, threshold=0.2):
|
||||||
return extracted
|
return extracted
|
||||||
|
|
||||||
# TODO OCRS Class
|
# TODO OCRS Class
|
||||||
|
|
||||||
|
def get_ids():
    """
    Return the members of the 'ocr:all' set, i.e. the ids registered by
    Ocr.create().
    """
    all_ocr_key = 'ocr:all'
    return r_object.smembers(all_ocr_key)
|
||||||
|
|
||||||
|
def get_all_ocrs_objects(filters=None):
    """
    Yield an Ocr object for every id returned by get_ids().

    :param filters: accepted for signature parity with the other object
        iterators; currently ignored. Defaults to None rather than a shared
        mutable dict.
    :return: generator of Ocr objects
    """
    for obj_id in get_ids():
        yield Ocr(obj_id)
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ from lib import Duplicate
|
||||||
from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation
|
from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation
|
||||||
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
|
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
|
||||||
from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship
|
from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship
|
||||||
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation
|
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation, get_obj_main_language
|
||||||
from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers
|
from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers
|
||||||
|
|
||||||
logging.config.dictConfig(ail_logger.get_config(name='ail'))
|
logging.config.dictConfig(ail_logger.get_config(name='ail'))
|
||||||
|
@ -237,6 +237,11 @@ class AbstractObject(ABC):
|
||||||
"""
|
"""
|
||||||
return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type])
|
return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type])
|
||||||
|
|
||||||
|
def get_first_correlation(self, obj_type):
    """
    Return one correlated object of the given type as a global id.

    :param obj_type: object type to look up among this object's correlations
    :return: ``f'{obj_type}:{member}'`` for one member of the correlation
        collection, or None when there is no correlation of that type.
        NOTE(review): members are presumably 'subtype:id' strings, making
        the result a full 'type:subtype:id' global id - confirm against
        get_correlations().
    """
    correlated = self.get_correlation(obj_type).get(obj_type)
    if not correlated:
        return None
    # Peek at one member instead of pop(): the collection handed back by
    # get_correlation() is left untouched. Which member comes first is
    # whatever the collection's iteration order gives.
    return f'{obj_type}:{next(iter(correlated))}'
|
||||||
|
|
||||||
def get_correlations(self, filter_types=[], unpack=False):
|
def get_correlations(self, filter_types=[], unpack=False):
|
||||||
"""
|
"""
|
||||||
Get object correlations
|
Get object correlations
|
||||||
|
@ -330,6 +335,9 @@ class AbstractObject(ABC):
|
||||||
def get_obj_language_stats(self):
|
def get_obj_language_stats(self):
|
||||||
return get_obj_language_stats(self.type, self.get_subtype(r_str=True), self.id)
|
return get_obj_language_stats(self.type, self.get_subtype(r_str=True), self.id)
|
||||||
|
|
||||||
|
def get_main_language(self):
    """
    Return this object's main language, as given by get_obj_main_language()
    for its type, string subtype and id.
    """
    subtype = self.get_subtype(r_str=True)
    return get_obj_main_language(self.type, subtype, self.id)
|
||||||
|
|
||||||
def get_translation(self, language, field=''):
|
def get_translation(self, language, field=''):
|
||||||
return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container())
|
return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container())
|
||||||
|
|
||||||
|
|
|
@ -296,6 +296,8 @@ def is_filtered(obj, filters):
|
||||||
def obj_iterator(obj_type, filters):
|
def obj_iterator(obj_type, filters):
|
||||||
if obj_type == 'decoded':
|
if obj_type == 'decoded':
|
||||||
return get_all_decodeds_objects(filters=filters)
|
return get_all_decodeds_objects(filters=filters)
|
||||||
|
elif obj_type == 'image':
|
||||||
|
return Images.get_all_images_objects(filters=filters)
|
||||||
elif obj_type == 'item':
|
elif obj_type == 'item':
|
||||||
return get_all_items_objects(filters=filters)
|
return get_all_items_objects(filters=filters)
|
||||||
elif obj_type == 'pgp':
|
elif obj_type == 'pgp':
|
||||||
|
|
|
@ -128,7 +128,7 @@ class Global(AbstractModule):
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.logger.info(f"Empty Item: {message} not processed")
|
self.logger.info(f"Empty Item: {message} not processed")
|
||||||
elif self.obj.type == 'message':
|
elif self.obj.type == 'message' or self.obj.type == 'ocr':
|
||||||
# TODO send to specific object queue => image, ...
|
# TODO send to specific object queue => image, ...
|
||||||
self.add_message_to_queue(obj=self.obj, queue='Item')
|
self.add_message_to_queue(obj=self.obj, queue='Item')
|
||||||
elif self.obj.type == 'image':
|
elif self.obj.type == 'image':
|
||||||
|
|
|
@ -17,8 +17,45 @@ sys.path.append(os.environ['AIL_BIN'])
|
||||||
# Import Project packages
|
# Import Project packages
|
||||||
##################################
|
##################################
|
||||||
from modules.abstract_module import AbstractModule
|
from modules.abstract_module import AbstractModule
|
||||||
|
from lib import chats_viewer
|
||||||
|
from lib.objects import Messages
|
||||||
from lib.objects import Ocrs
|
from lib.objects import Ocrs
|
||||||
|
|
||||||
|
# Default to eng
|
||||||
|
def get_model_languages(obj, add_en=True):
    """
    Choose the language codes to load in the OCR model for an object.

    Sources are tried from most to least specific, and the first one that
    yields a language wins:
      1. the language of the first correlated message,
      2. the main language of the first correlated chat-subchannel,
      3. the main language of the first correlated chat.

    :param obj: object exposing get_first_correlation(obj_type)
    :param add_en: when True (default), 'en' is always part of the result
    :return: set of language codes; {'en'} or an empty set when no
        correlated source provides a language
    """
    model_languages = {'en'} if add_en else set()

    message_gid = obj.get_first_correlation('message')
    if message_gid:
        # The message id is everything after 'type:subtype:' in the global id.
        message = Messages.Message(message_gid.split(':', 2)[-1])
        lang = message.get_language()
        if lang:
            model_languages.add(lang)
            return model_languages

    # Fall back on the chat containers; a container without a main language
    # does not stop the search.
    for container_type in ('chat-subchannel', 'chat'):
        container_gid = obj.get_first_correlation(container_type)
        if not container_gid:
            continue
        container = chats_viewer.get_obj_chat_from_global_id(container_gid)
        lang = container.get_main_language()
        if lang:
            model_languages.add(lang)
            return model_languages

    return model_languages
|
||||||
|
|
||||||
|
# TODO thread
|
||||||
|
|
||||||
|
|
||||||
class OcrExtractor(AbstractModule):
|
class OcrExtractor(AbstractModule):
|
||||||
"""
|
"""
|
||||||
|
@ -36,16 +73,16 @@ class OcrExtractor(AbstractModule):
|
||||||
|
|
||||||
def compute(self, message):
|
def compute(self, message):
|
||||||
image = self.get_obj()
|
image = self.get_obj()
|
||||||
print(image)
|
|
||||||
path = image.get_filepath()
|
path = image.get_filepath()
|
||||||
languages = ['en', 'ru']
|
print(image)
|
||||||
|
|
||||||
|
languages = get_model_languages(image)
|
||||||
|
print(languages)
|
||||||
|
|
||||||
ocr = Ocrs.Ocr(image.id)
|
ocr = Ocrs.Ocr(image.id)
|
||||||
|
ocr.delete()
|
||||||
if not ocr.exists():
|
if not ocr.exists():
|
||||||
# TODO Get Language to extract -> add en by default
|
|
||||||
|
|
||||||
texts = Ocrs.extract_text(path, languages)
|
texts = Ocrs.extract_text(path, languages)
|
||||||
print(texts)
|
|
||||||
if texts:
|
if texts:
|
||||||
ocr = Ocrs.create(image.id, texts)
|
ocr = Ocrs.create(image.id, texts)
|
||||||
self.add_message_to_queue(ocr)
|
self.add_message_to_queue(ocr)
|
||||||
|
@ -55,3 +92,6 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
module = OcrExtractor()
|
module = OcrExtractor()
|
||||||
module.run()
|
module.run()
|
||||||
|
# from lib.objects import Images
|
||||||
|
# module.obj = Images.Image('')
|
||||||
|
# module.compute('')
|
||||||
|
|
|
@ -62,7 +62,7 @@ if __name__ == "__main__":
|
||||||
obj_type = args.type
|
obj_type = args.type
|
||||||
if not is_object_type(obj_type):
|
if not is_object_type(obj_type):
|
||||||
raise Exception(f'Invalid Object Type: {obj_type}')
|
raise Exception(f'Invalid Object Type: {obj_type}')
|
||||||
if obj_type not in ['item', 'message']: # TODO image
|
if obj_type not in ['image', 'item', 'message']:
|
||||||
raise Exception(f'Currently not supported Object Type: {obj_type}')
|
raise Exception(f'Currently not supported Object Type: {obj_type}')
|
||||||
|
|
||||||
modulename = args.module
|
modulename = args.module
|
||||||
|
|
|
@ -92,7 +92,7 @@
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
{% with obj_type='image', obj_id=meta['id'], obj_subtype='' %}
|
{% with obj_type='ocr', obj_id=meta['id'], obj_subtype='' %}
|
||||||
{% include 'modals/investigations_register_obj.html' %}
|
{% include 'modals/investigations_register_obj.html' %}
|
||||||
{% endwith %}
|
{% endwith %}
|
||||||
<button type="button" class="btn btn-primary" data-toggle="modal" data-target="#investigations_register_obj_modal">
|
<button type="button" class="btn btn-primary" data-toggle="modal" data-target="#investigations_register_obj_modal">
|
||||||
|
|
Loading…
Reference in a new issue