ail-framework/bin/lib/objects/Ocrs.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys

from datetime import datetime
from io import BytesIO
from PIL import Image
from PIL import ImageDraw

from pymisp import MISPObject

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects
from lib.ConfigLoader import ConfigLoader
from packages import Date
# from lib import Language
# from lib.data_retention_engine import update_obj_date, get_obj_date_first

from flask import url_for

# Module-level handles, created once at import time from the AIL configuration.
config_loader = ConfigLoader()
# Cache connection: holds the rebuilt OCR content (see Ocr.get_content)
r_cache = config_loader.get_redis_conn("Redis_Cache")
# Objects database: holds the ocr:<id> sets
r_object = config_loader.get_db_conn("Kvrocks_Objects")
# Base URL used to build links outside of a flask context (see Ocr.get_link)
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
# Directory the image paths are joined to (see Ocr.get_image_path)
IMAGE_FOLDER = config_loader.get_files_directory('images')
# The loader is not needed anymore, drop the reference
config_loader = None

# Storage format, one member of the SET ocr:<id> per extracted text region:
# SET x1,y1:x2,y2:x3,y3:x4,y4:extracted_text

class Ocr(AbstractDaterangeObject):
    """
    AIL OCR Object: text regions extracted from an image.

    Every region is one member of the ``ocr:<id>`` set, encoded as
    ``x1,y1:x2,y2:x3,y3:x4,y4:extracted_text`` (the four corners of the
    bounding box followed by the text). The object id is also used as the
    id of the image the text was extracted from.
    """

    def __init__(self, id):
        super(Ocr, self).__init__('ocr', id)

    def exists(self):
        """
        Returns the result of the existence check on the ``ocr:<id>`` key
        """
        return r_object.exists(f'ocr:{self.id}')

    def get_content(self, r_type='str'):
        """
        Returns the extracted text, rebuilt line by line.

        Regions are grouped in lines on the y of their first corner, rounded
        to a multiple of 20. Lines are ordered by this rounded y, regions of
        a line by the x of their first corner. A non-empty result is cached
        for 300 seconds.

        :param r_type: 'str' returns the content as is, 'bytes' returns it encoded
        :return: the content; None if r_type is 'bytes' and there is no
                 content, or if r_type is unknown
        """
        cache_key = f'content:{self.get_global_id()}'
        content = r_cache.get(cache_key)
        if not content:
            lines = {}
            for extracted in r_object.smembers(f'ocr:{self.id}'):
                extracted = extracted.split(':', 4)
                x, y = extracted[0].split(',', 1)
                # get text line, y +- 20
                rounded_y = round(int(y) / 20) * 20
                lines.setdefault(rounded_y, []).append((int(x), int(y), extracted[-1]))

            # One output line per group, built with join rather than by
            # growing a string region after region.
            content = ''.join(
                '      '.join(text[2] for text in sorted(lines[key], key=lambda c: c[0])) + '\n'
                for key in sorted(lines)
            )

            # Set Cache
            if content:
                r_cache.set(cache_key, content)
                r_cache.expire(cache_key, 300)

        if r_type == 'str':
            return content
        elif r_type == 'bytes':
            if content:
                return content.encode()
        return None

    def get_date(self): # TODO
        """
        Returns Date.get_today_date_str(); placeholder, see the TODO
        """
        return Date.get_today_date_str()

    def get_source(self): # TODO
        """
        Returns source/feeder name
        """
        return 'ocr'
        # l_source = self.id.split('/')[:-2]
        # return os.path.join(*l_source)

    def get_basename(self):  # TODO
        return 'ocr'

    def get_language(self):
        """
        Returns one of the languages given by get_languages(), None if there is none
        """
        languages = self.get_languages()
        if languages:
            return languages.pop()
        else:
            return None

    def get_link(self, flask_context=False):
        """
        Returns the URL of the correlation view of this object

        :param flask_context: build the URL with flask url_for instead of the configured base URL
        """
        if flask_context:
            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
        else:
            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
        return url

    def get_svg_icon(self):
        return {'style': 'fas', 'icon': '\uf065', 'color': 'yellow', 'radius': 5}

    def get_image_path(self):
        """
        Returns the resolved path of the image.

        The id is split in six directories of 2 characters followed by the
        remaining characters, and joined to IMAGE_FOLDER.
        """
        rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:])
        filename = os.path.join(IMAGE_FOLDER, rel_path)
        return os.path.realpath(filename)

    def get_misp_object(self):  # TODO
        """
        Returns a MISP object with first_seen set when a date is available.

        No attribute is exported yet, see the TODO.
        """
        # NOTE(review): 'instant-message' looks inherited from the message object - confirm
        obj = MISPObject('instant-message', standalone=True)
        obj_date = self.get_date()
        if obj_date:
            obj.first_seen = obj_date
        else:
            self.logger.warning(
                f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={obj_date}')

        # obj_attrs = [obj.add_attribute('first-seen', value=obj_date),
        #              obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()),
        #              obj.add_attribute('sensor', value=get_ail_uuid())]
        obj_attrs = []
        for obj_attr in obj_attrs:
            for tag in self.get_tags():
                obj_attr.add_tag(tag)
        return obj

    # options: set of optional meta fields
    def get_meta(self, options=None, translation_target=''):
        """
        Returns the metadata of the object; 'tags' and 'content' are always set.

        :type options: set
        :param options: optional fields: 'investigations', 'link', 'icon',
                        'img', 'map', 'language', 'translation'
        :param translation_target: target language, needed by the 'translation' option
        """
        if options is None:
            options = set()
        meta = self._get_meta(options=options)
        meta['tags'] = self.get_tags()
        meta['content'] = self.get_content()

        # optional meta fields
        if 'investigations' in options:
            meta['investigations'] = self.get_investigations()
        if 'link' in options:
            meta['link'] = self.get_link(flask_context=True)
        if 'icon' in options:
            meta['svg_icon'] = self.get_svg_icon()
        if 'img' in options:
            meta['img'] = self.draw_bounding_boxs()
        if 'map' in options:
            meta['map'] = self.get_img_map_coords()
        if 'language' in options:
            meta['language'] = self.get_language()
        if 'translation' in options and translation_target:
            source = meta.get('language') or None
            meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target)
            # read the language again once the translation is done
            if 'language' in options:
                meta['language'] = self.get_language()
        return meta

    def get_objs_container(self):
        """
        Returns the set of first correlated chat, chat-subchannel and chat-thread.

        Missing correlations are skipped; the chat is now guarded like the
        two others, so no empty value ends up in the set.
        """
        objs_containers = set()
        for container_type in ('chat', 'chat-subchannel', 'chat-thread'):
            container = self.get_first_correlation(container_type)
            if container:
                objs_containers.add(container)
        return objs_containers

    def create_coord_str(self, bbox):
        """
        Returns the four corners of a bounding box as 'x1,y1:x2,y2:x3,y3:x4,y4'

        :param bbox: four (x, y) corners, values are truncated with int()
        """
        c1, c2, c3, c4 = bbox
        return ':'.join(f'{int(x)},{int(y)}' for x, y in (c1, c2, c3, c4))

    def _unpack_coord(self, coord):
        return coord.split(',', 1)

    def get_coords(self):
        """
        Returns the bounding box of every stored region, as a list of (x, y) int tuples
        """
        coords = []
        for extracted in r_object.smembers(f'ocr:{self.id}'):
            coord = []
            # the last field is the text, the others are the corners
            bbox = extracted.split(':', 4)[:-1]
            for c in bbox:
                x, y = self._unpack_coord(c)
                coord.append((int(x), int(y)))
            coords.append(coord)
        return coords

    def get_img_map_coords(self):
        """
        Returns a list of ('x1,y1,x2,y2,x3,y3,x4,y4', text) tuples, one per stored region
        """
        coords = []
        for extracted in r_object.smembers(f'ocr:{self.id}'):
            extract = extracted.split(':', 4)
            x1, y1 = self._unpack_coord(extract[0])
            x2, y2 = self._unpack_coord(extract[1])
            x3, y3 = self._unpack_coord(extract[2])
            x4, y4 = self._unpack_coord(extract[3])
            coords.append((f'{x1},{y1},{x2},{y2},{x3},{y3},{x4},{y4}', extract[4]))
        return coords

    def edit_text(self, coordinates, text, new_text, new_coordinates=None):
        # Not implemented
        pass

    def add_text(self, coordinates, text):
        """
        Adds a region to the ``ocr:<id>`` set

        :param coordinates: corners string, as built by create_coord_str
        :param text: extracted text
        """
        val = f'{coordinates}:{text}'
        return r_object.sadd(f'ocr:{self.id}', val)

    def remove_text(self, val):
        """
        Removes a region from the ``ocr:<id>`` set

        :param val: the complete stored member (corners and text)
        """
        return r_object.srem(f'ocr:{self.id}', val)

    def update_correlation(self, date=None):
        """
        Copies the correlations of the image with the same id to this object.

        Correlations of the image with 'ocr' objects are not copied.

        :param date: if set, passed to self.add() first
        """
        if date:
            self.add(date, None)
        image_correl = self.get_obj_correlations('image', '', self.id)
        for obj_type in image_correl:
            if obj_type != 'ocr':
                for obj_raw in image_correl[obj_type]:
                    obj_subtype, obj_id = obj_raw.split(':', 1)
                    self.add_correlation(obj_type, obj_subtype, obj_id)

    def create(self, extracted_texts, tags=None):
        """
        Saves the extracted texts, then adds the correlations and the tags.

        :param extracted_texts: iterable of (bbox, text); texts of less than 2 characters are ignored
        :param tags: optional iterable of tags to add
        :return: the object id, None if no text was saved
        """
        # r_object.sadd(f'{self.type}:all', self.id)
        created = False
        for bbox, text in extracted_texts:
            if len(text) > 1:
                self.add_text(self.create_coord_str(bbox), text)
                created = True

        if not created:
            return None

        # Correlations
        self._copy_from('image', self.id)
        self.update_correlation()
        self.add_correlation('image', '', self.id)

        # tags defaults to None instead of a shared mutable list
        for tag in tags or ():
            self.add_tag(tag)
        return self.id

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    def delete(self):
        r_object.delete(f'ocr:{self.id}')

    def draw_bounding_boxs(self):
        """
        Returns the image as PNG bytes, with every bounding box drawn in yellow
        """
        # convert() returns a new image, so the file can be closed right away
        with Image.open(self.get_image_path()) as img_file:
            img = img_file.convert("RGBA")
        draw = ImageDraw.Draw(img)
        for bbox in self.get_coords():
            c1, c2, c3, c4 = bbox
            for start, end in ((c1, c2), (c2, c3), (c3, c4), (c4, c1)):
                draw.line((tuple(start), tuple(end)), fill="yellow", width=2)
        # img.show()
        with BytesIO() as buff:
            img.save(buff, "PNG")
            return buff.getvalue()


def create(obj_id, detections, tags=None):
    """
    Create the OCR object of an image, unless it already exists.

    :param obj_id: id of the object
    :param detections: iterable of (bbox, text), passed to Ocr.create
    :param tags: optional iterable of tags to add to the new object
    :return: the new Ocr object, None if it already exists or if nothing was saved
    """
    # None replaces the shared mutable default; Ocr.create still receives a list
    if tags is None:
        tags = []
    obj = Ocr(obj_id)
    if not obj.exists():
        obj_id = obj.create(detections, tags=tags)
        if obj_id:
            return obj
    return None

# TODO preload languages
def extract_text(image_path, languages, threshold=0.2):
    """
    Run easyocr on an image and keep the detections scoring above a threshold.

    A new easyocr.Reader is built on every call.

    :param image_path: image given to Reader.readtext
    :param languages: languages given to easyocr.Reader
    :param threshold: a detection is kept only if its score is strictly greater
    :return: list of (bbox, text)
    """
    import easyocr
    ocr_reader = easyocr.Reader(languages, verbose=False)
    detections = ocr_reader.readtext(image_path)
    return [(bbox, text) for bbox, text, score in detections if score > threshold]


def get_ocr_languages():
    """
    Return the language codes accepted for OCR, as a new set on every call.
    """
    codes = (
        'af ar as az be bg bh bs cs cy da de en es et fa fr ga hi hr hu id is it '
        'ja kn ko ku la lt lv mi mn mr ms mt ne nl no oc pi pl pt ro ru sk sl sq '
        'sr sv sw ta te th tl tr ug uk ur uz vi zh'
    )
    return set(codes.split())


def sanityze_ocr_languages(languages, ocr_languages=None):
    """
    Filter language codes and rename the ones whose OCR model has another name.

    :param languages: iterable of language codes
    :param ocr_languages: accepted codes; get_ocr_languages() is used when empty or None
    :return: set of accepted languages, 'zh' replaced by 'ch_sim' and 'sr' by 'rs_latin'
    """
    accepted = ocr_languages if ocr_languages else get_ocr_languages()
    model_names = {'zh': 'ch_sim', 'sr': 'rs_latin'}
    return {model_names.get(code, code) for code in languages if code in accepted}

class Ocrs(AbstractDaterangeObjects):
    """
        Collection of the OCR objects ('ocr' type, Ocr class)
    """
    def __init__(self):
        super(Ocrs, self).__init__('ocr', Ocr)

    def sanitize_id_to_search(self, name_to_search):
        # TODO: the name to search is returned unchanged for now
        return name_to_search


#### API ####
def api_get_ocr(obj_id, translation_target=None):
    """
    Return the metadata of an OCR object with an HTTP-like status code.

    :param obj_id: id of the OCR object
    :param translation_target: target language passed to Ocr.get_meta
    :return: (meta, 200), or (error dict, 404) if the object does not exist
    """
    ocr = Ocr(obj_id)
    if ocr.exists():
        options = {'content', 'icon', 'img', 'language', 'link', 'map', 'translation'}
        return ocr.get_meta(options, translation_target=translation_target), 200
    return {"status": "error", "reason": "Unknown ocr"}, 404
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`#!/usr/bin/env python3`
			`# --coding:UTF-8 -`

			`import os`
			`import sys`

			`from datetime import datetime`
			`from io import BytesIO`
			`from PIL import Image`
			`from PIL import ImageDraw`

			`from pymisp import MISPObject`

			`sys.path.append(os.environ['AIL_BIN'])`
			`##################################`
			`# Import Project packages`
			`##################################`
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`from lib.ConfigLoader import ConfigLoader`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`from packages import Date`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`# from lib import Language`
			`# from lib.data_retention_engine import update_obj_date, get_obj_date_first`

			`from flask import url_for`

			`config_loader = ConfigLoader()`
			`r_cache = config_loader.get_redis_conn("Redis_Cache")`
			`r_object = config_loader.get_db_conn("Kvrocks_Objects")`
			`baseurl = config_loader.get_config_str("Notifications", "ail_domain")`
			`IMAGE_FOLDER = config_loader.get_files_directory('images')`
			`config_loader = None`

			`# SET x1,y1:x2,y2:x3,y3:x4,y4:extracted_text`

chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`class Ocr(AbstractDaterangeObject):`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`"""`
			`AIL Message Object. (strings)`
			`"""`

			`def __init__(self, id):`
			`super(Ocr, self).__init__('ocr', id)`

			`def exists(self):`
			`return r_object.exists(f'ocr:{self.id}')`

			`def get_content(self, r_type='str'):`
			`"""`
			`Returns content`
			`"""`
			`global_id = self.get_global_id()`
			`content = r_cache.get(f'content:{global_id}')`
			`if not content:`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`dict_content = {}`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`for extracted in r_object.smembers(f'ocr:{self.id}'):`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`extracted = extracted.split(':', 4)`
			`x, y = extracted[0].split(',', 1)`
			`# get text line, y +- 20`
			`rounded_y = round(int(y) / 20) * 20`
			`if rounded_y not in dict_content:`
			`dict_content[rounded_y] = []`
			`dict_content[rounded_y].append((int(x), int(y), extracted[-1]))`

			`content = ''`
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`new_line = True`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`l_key = sorted(dict_content.keys())`
			`for key in l_key:`
			`dict_content[key] = sorted(dict_content[key], key=lambda c: c[0])`
			`for text in dict_content[key]:`
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`if new_line:`
			`content = f'{content}{text[2]}'`
			`new_line = False`
			`else:`
			`content = f'{content} {text[2]}'`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`content = f'{content}\n'`
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`new_line = True`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`# Set Cache`
			`if content:`
			`global_id = self.get_global_id()`
			`r_cache.set(f'content:{global_id}', content)`
			`r_cache.expire(f'content:{global_id}', 300)`

			`if r_type == 'str':`
			`return content`
			`elif r_type == 'bytes':`
			`if content:`
			`return content.encode()`

			`def get_date(self): # TODO`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`return Date.get_today_date_str()`

			`def get_source(self): # TODO`
			`"""`
			`Returns source/feeder name`
			`"""`
			`return 'ocr'`
			`# l_source = self.id.split('/')[:-2]`
			`# return os.path.join(*l_source)`

			`def get_basename(self): # TODO`
			`return 'ocr'`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`def get_language(self):`
			`languages = self.get_languages()`
			`if languages:`
			`return languages.pop()`
			`else:`
			`return None`

chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`def get_link(self, flask_context=False):`
			`if flask_context:`
			`url = url_for('correlation.show_correlation', type=self.type, id=self.id)`
			`else:`
			`url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'`
			`return url`

			`def get_svg_icon(self):`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`return {'style': 'fas', 'icon': '\uf065', 'color': 'yellow', 'radius': 5}`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
			`def get_image_path(self):`
			`rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:])`
			`filename = os.path.join(IMAGE_FOLDER, rel_path)`
			`return os.path.realpath(filename)`

			`def get_misp_object(self): # TODO`
			`obj = MISPObject('instant-message', standalone=True)`
			`obj_date = self.get_date()`
			`if obj_date:`
			`obj.first_seen = obj_date`
			`else:`
			`self.logger.warning(`
			`f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={obj_date}')`

			`# obj_attrs = [obj.add_attribute('first-seen', value=obj_date),`
			`# obj.add_attribute('raw-data', value=self.id, data=self.get_raw_content()),`
			`# obj.add_attribute('sensor', value=get_ail_uuid())]`
			`obj_attrs = []`
			`for obj_attr in obj_attrs:`
			`for tag in self.get_tags():`
			`obj_attr.add_tag(tag)`
			`return obj`

			`# options: set of optional meta fields`
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`def get_meta(self, options=None, translation_target=''):`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`"""`
			`:type options: set`
			`"""`
			`if options is None:`
			`options = set()`
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`meta = self._get_meta(options=options)`
chg: [correlation graph] add date first/last seen separator + fix display of ocr object tags 2024-04-26 09:52:17 +00:00			`meta['tags'] = self.get_tags()`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`meta['content'] = self.get_content()`

			`# optional meta fields`
			`if 'investigations' in options:`
			`meta['investigations'] = self.get_investigations()`
			`if 'link' in options:`
			`meta['link'] = self.get_link(flask_context=True)`
			`if 'icon' in options:`
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`meta['svg_icon'] = self.get_svg_icon()`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`if 'img' in options:`
			`meta['img'] = self.draw_bounding_boxs()`
			`if 'map' in options:`
			`meta['map'] = self.get_img_map_coords()`
chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00			`if 'language' in options:`
			`meta['language'] = self.get_language()`
			`if 'translation' in options and translation_target:`
			`if meta.get('language'):`
			`source = meta['language']`
			`else:`
			`source = None`
			`meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target)`
			`if 'language' in options:`
			`meta['language'] = self.get_language()`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`return meta`

chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`def get_objs_container(self):`
			`objs_containers = set()`
			`# chat`
			`objs_containers.add(self.get_first_correlation('chat'))`
			`subchannel = self.get_first_correlation('chat-subchannel')`
			`if subchannel:`
			`objs_containers.add(subchannel)`
			`thread = self.get_first_correlation('chat-thread')`
			`if thread:`
			`objs_containers.add(thread)`
			`return objs_containers`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
			`def create_coord_str(self, bbox):`
			`c1, c2, c3, c4 = bbox`
			`x1, y1 = c1`
			`x2, y2 = c2`
			`x3, y3 = c3`
			`x4, y4 = c4`
			`return f'{int(x1)},{int(y1)}:{int(x2)},{int(y2)}:{int(x3)},{int(y3)}:{int(x4)},{int(y4)}'`

			`def _unpack_coord(self, coord):`
			`return coord.split(',', 1)`

			`def get_coords(self):`
			`coords = []`
			`for extracted in r_object.smembers(f'ocr:{self.id}'):`
			`coord = []`
			`bbox = extracted.split(':', 4)[:-1]`
			`for c in bbox:`
			`x, y = self._unpack_coord(c)`
			`coord.append((int(x), int(y)))`
			`coords.append(coord)`
			`return coords`

			`def get_img_map_coords(self):`
			`coords = []`
			`for extracted in r_object.smembers(f'ocr:{self.id}'):`
			`extract = extracted.split(':', 4)`
			`x1, y1 = self._unpack_coord(extract[0])`
			`x2, y2 = self._unpack_coord(extract[1])`
			`x3, y3 = self._unpack_coord(extract[2])`
			`x4, y4 = self._unpack_coord(extract[3])`
			`coords.append((f'{x1},{y1},{x2},{y2},{x3},{y3},{x4},{y4}', extract[4]))`
			`return coords`

chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`def edit_text(self, coordinates, text, new_text, new_coordinates=None):`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`pass`

chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`def add_text(self, coordinates, text):`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`val = f'{coordinates}:{text}'`
			`return r_object.sadd(f'ocr:{self.id}', val)`

chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`def remove_text(self, val):`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`return r_object.srem(f'ocr:{self.id}', val)`

chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`def update_correlation(self, date=None):`
			`if date:`
			`self.add(date, None)`
chg: [ocr] add cache + correlation ocr-chats-messages + launch ocr extractor by default 2024-04-24 12:43:11 +00:00			`image_correl = self.get_obj_correlations('image', '', self.id)`
			`for obj_type in image_correl:`
			`if obj_type != 'ocr':`
			`for obj_raw in image_correl[obj_type]:`
			`obj_subtype, obj_id = obj_raw.split(':', 1)`
			`self.add_correlation(obj_type, obj_subtype, obj_id)`

chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`def create(self, extracted_texts, tags=[]):`
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`# r_object.sadd(f'{self.type}:all', self.id)`
			`created = False`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`for extracted in extracted_texts:`
			`bbox, text = extracted`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`if len(text) > 1:`
			`str_coords = self.create_coord_str(bbox)`
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`self.add_text(str_coords, text)`
			`created = True`
chg: [ocr] add cache + correlation ocr-chats-messages + launch ocr extractor by default 2024-04-24 12:43:11 +00:00
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`if created:`
			`# Correlations`
			`self._copy_from('image', self.id)`
			`self.update_correlation()`
			`self.add_correlation('image', '', self.id)`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`for tag in tags:`
			`self.add_tag(tag)`
			`return self.id`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
			`# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\`
			`def delete(self):`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`r_object.delete(f'ocr:{self.id}')`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
			`def draw_bounding_boxs(self):`
			`img = Image.open(self.get_image_path()).convert("RGBA")`
			`draw = ImageDraw.Draw(img)`
			`for bbox in self.get_coords():`
			`c1, c2, c3, c4 = bbox`
chg: [perf] reduce memory usage 2024-04-09 12:22:11 +00:00			`draw.line((tuple(c1), tuple(c2)), fill="yellow", width=2)`
			`draw.line((tuple(c2), tuple(c3)), fill="yellow", width=2)`
			`draw.line((tuple(c3), tuple(c4)), fill="yellow", width=2)`
			`draw.line((tuple(c4), tuple(c1)), fill="yellow", width=2)`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`# img.show()`
			`buff = BytesIO()`
			`img.save(buff, "PNG")`
			`return buff.getvalue()`


			`def create(obj_id, detections, tags=[]):`
			`obj = Ocr(obj_id)`
			`if not obj.exists():`
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`obj_id = obj.create(detections, tags=tags)`
			`if obj_id:`
			`return obj`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00
			`# TODO preload languages`
			`def extract_text(image_path, languages, threshold=0.2):`
			`import easyocr`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`reader = easyocr.Reader(languages, verbose=False)`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`texts = reader.readtext(image_path)`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00			`# print(texts)`
chg: [ocr] extract text from image + add ocr object 2024-04-08 15:16:07 +00:00			`extracted = []`
			`for bbox, text, score in texts:`
			`if score > threshold:`
			`extracted.append((bbox, text))`
			`return extracted`

chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00
fix: [ocr] filter ocr supported languages + fix type of object accepted by the tracker 2024-04-26 08:31:31 +00:00			`def get_ocr_languages():`
			`return {'af', 'ar', 'as', 'az', 'be', 'bg', 'bh', 'bs', 'cs', 'cy', 'da', 'de', 'en', 'es', 'et', 'fa', 'fr', 'ga', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'kn', 'ko', 'ku', 'la', 'lt', 'lv', 'mi', 'mn', 'mr', 'ms', 'mt', 'ne', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'zh'}`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00
fix: [ocr] filter ocr supported languages + fix type of object accepted by the tracker 2024-04-26 08:31:31 +00:00
			`def sanityze_ocr_languages(languages, ocr_languages=None):`
			`langs = set()`
			`if not ocr_languages:`
			`ocr_languages = get_ocr_languages()`
			`for lang in languages:`
			`if lang in ocr_languages:`
			`if lang == 'zh':`
			`langs.add('ch_sim')`
			`elif lang == 'sr':`
			`langs.add('rs_latin')`
			`else:`
			`langs.add(lang)`
			`return langs`
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images 2024-04-10 14:43:54 +00:00
chg: [ocr] ocr daterange object, get ocrs by daterange + fixs 2024-04-25 12:18:22 +00:00			`class Ocrs(AbstractDaterangeObjects):`
			`"""`
			`OCR Objects`
			`"""`
			`def __init__(self):`
			`super().__init__('ocr', Ocr)`

			`def sanitize_id_to_search(self, name_to_search):`
			`return name_to_search # TODO`

chg: [ocr] detect and translate language + show ocr view + add languages blueprint 2024-04-11 10:15:47 +00:00
			`#### API ####`
			`def api_get_ocr(obj_id, translation_target=None):`
			`ocr = Ocr(obj_id)`
			`if not ocr.exists():`
			`return {"status": "error", "reason": "Unknown ocr"}, 404`
			`meta = ocr.get_meta({'content', 'icon', 'img', 'language', 'link', 'map', 'translation'}, translation_target=translation_target)`
			`return meta, 200`