From 20c98de0fa64641d306076b3bd0d96bd9e451db9 Mon Sep 17 00:00:00 2001 From: terrtia Date: Thu, 25 Apr 2024 14:18:22 +0200 Subject: [PATCH] chg: [ocr] ocr daterange object, get ocrs by daterange + fixs --- bin/importer/FeederImporter.py | 3 + bin/importer/feeders/Default.py | 3 + bin/importer/feeders/abstract_chats_feeder.py | 8 + bin/lib/objects/Ocrs.py | 54 +- bin/lib/objects/abstract_daterange_object.py | 15 +- bin/modules/Global.py | 2 +- bin/modules/Mixer.py | 2 +- bin/modules/OcrExtractor.py | 15 +- var/www/blueprints/objects_ocr.py | 44 ++ .../templates/objects/ocr/OcrDaterange.html | 602 ++++++++++++++++++ .../templates/sidebars/sidebar_objects.html | 6 + 11 files changed, 726 insertions(+), 28 deletions(-) create mode 100644 var/www/templates/objects/ocr/OcrDaterange.html diff --git a/bin/importer/FeederImporter.py b/bin/importer/FeederImporter.py index 8532f488..f3382be3 100755 --- a/bin/importer/FeederImporter.py +++ b/bin/importer/FeederImporter.py @@ -109,6 +109,9 @@ class FeederImporter(AbstractImporter): gzip64_content = feeder.get_gzip64_content() relay_message = f'{feeder_name} {gzip64_content}' objs_messages.append({'obj': obj, 'message': relay_message}) + elif obj.type == 'image': + date = feeder.get_date() + objs_messages.append({'obj': obj, 'message': f'{feeder_name} {date}'}) else: # Messages save on DB if obj.exists() and obj.type != 'chat': objs_messages.append({'obj': obj, 'message': feeder_name}) diff --git a/bin/importer/feeders/Default.py b/bin/importer/feeders/Default.py index ca0861a1..70a96fa5 100755 --- a/bin/importer/feeders/Default.py +++ b/bin/importer/feeders/Default.py @@ -41,6 +41,9 @@ class DefaultFeeder: def get_source(self): return self.json_data.get('source') + def get_date(self): + return datetime.date.today().strftime("%Y%m%d") + def get_json_data(self): """ Return the JSON data, diff --git a/bin/importer/feeders/abstract_chats_feeder.py b/bin/importer/feeders/abstract_chats_feeder.py index 8b337e9f..b48fe4ad 100755 --- a/bin/importer/feeders/abstract_chats_feeder.py +++ b/bin/importer/feeders/abstract_chats_feeder.py @@ -92,6 +92,14 @@ class AbstractChatFeeder(DefaultFeeder, ABC): def get_reactions(self): return self.json_data['meta'].get('reactions', []) + def get_date(self): + if self.json_data['meta'].get('date'): + date = datetime.datetime.fromtimestamp( self.json_data['meta']['date']['timestamp']) + date = date.strftime('%Y%m%d') + else: + date = datetime.date.today().strftime("%Y%m%d") + return date + def get_message_timestamp(self): if not self.json_data['meta'].get('date'): return None diff --git a/bin/lib/objects/Ocrs.py b/bin/lib/objects/Ocrs.py index e290e623..d8da0aa9 100755 --- a/bin/lib/objects/Ocrs.py +++ b/bin/lib/objects/Ocrs.py @@ -15,7 +15,7 @@ sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## -from lib.objects.abstract_object import AbstractObject +from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects from lib.ConfigLoader import ConfigLoader from packages import Date # from lib import Language @@ -32,7 +32,7 @@ config_loader = None # SET x1,y1:x2,y2:x3,y3:x4,y4:extracted_text -class Ocr(AbstractObject): +class Ocr(AbstractDaterangeObject): """ AIL Message Object. (strings) """ @@ -147,7 +147,7 @@ class Ocr(AbstractObject): """ if options is None: options = set() - meta = self.get_default_meta(tags=True) + meta = self._get_meta(options=options) meta['content'] = self.get_content() # optional meta fields @@ -218,17 +218,19 @@ class Ocr(AbstractObject): coords.append((f'{x1},{y1},{x2},{y2},{x3},{y3},{x4},{y4}', extract[4])) return coords - def edit(self, coordinates, text, new_text, new_coordinates=None): + def edit_text(self, coordinates, text, new_text, new_coordinates=None): pass - def add(self, coordinates, text): + def add_text(self, coordinates, text): val = f'{coordinates}:{text}' return r_object.sadd(f'ocr:{self.id}', val) - def remove(self, val): + def remove_text(self, val): return r_object.srem(f'ocr:{self.id}', val) - def update_correlation(self): + def update_correlation(self, date=None): + if date: + self.add(date, None) image_correl = self.get_obj_correlations('image', '', self.id) for obj_type in image_correl: if obj_type != 'ocr': @@ -237,19 +239,24 @@ class Ocr(AbstractObject): self.add_correlation(obj_type, obj_subtype, obj_id) def create(self, extracted_texts, tags=[]): - r_object.sadd(f'{self.type}:all', self.id) + # r_object.sadd(f'{self.type}:all', self.id) + created = False for extracted in extracted_texts: bbox, text = extracted if len(text) > 1: str_coords = self.create_coord_str(bbox) - self.add(str_coords, text) + self.add_text(str_coords, text) + created = True - # Correlations - self.update_correlation() - self.add_correlation('image', '', self.id) + if created: + # Correlations + self._copy_from('image', self.id) + self.update_correlation() + self.add_correlation('image', '', self.id) - for tag in tags: - self.add_tag(tag) + for tag in tags: + self.add_tag(tag) + return self.id # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ def delete(self): @@ -273,9 +280,9 @@ class Ocr(AbstractObject): def create(obj_id, detections, tags=[]): obj = Ocr(obj_id) if not obj.exists(): - obj.create(detections, tags=tags) - # TODO Edit - return obj + obj_id = obj.create(detections, tags=tags) + if obj_id: + return obj # TODO preload languages def extract_text(image_path, languages, threshold=0.2): @@ -298,6 +305,16 @@ def get_all_ocrs_objects(filters={}): for obj_id in get_ids(): yield Ocr(obj_id) +class Ocrs(AbstractDaterangeObjects): + """ + OCR Objects + """ + def __init__(self): + super().__init__('ocr', Ocr) + + def sanitize_id_to_search(self, name_to_search): + return name_to_search # TODO + #### API #### def api_get_ocr(obj_id, translation_target=None): @@ -306,6 +323,3 @@ def api_get_ocr(obj_id, translation_target=None): return {"status": "error", "reason": "Unknown ocr"}, 404 meta = ocr.get_meta({'content', 'icon', 'img', 'language', 'link', 'map', 'translation'}, translation_target=translation_target) return meta, 200 - - - diff --git a/bin/lib/objects/abstract_daterange_object.py b/bin/lib/objects/abstract_daterange_object.py index 8c2b6b1f..b749e40c 100755 --- a/bin/lib/objects/abstract_daterange_object.py +++ b/bin/lib/objects/abstract_daterange_object.py @@ -71,7 +71,7 @@ class AbstractDaterangeObject(AbstractObject, ABC): else: return last_seen - def get_nb_seen(self): # TODO REPLACE ME -> correlation image + def get_nb_seen(self): # TODO REPLACE ME -> correlation image chats return self.get_nb_correlation('item') + self.get_nb_correlation('message') def get_nb_seen_by_date(self, date): @@ -127,6 +127,19 @@ class AbstractDaterangeObject(AbstractObject, ABC): def _add_create(self): r_object.sadd(f'{self.type}:all', self.id) + def _copy_from(self, obj_type, obj_id): + first_seen = r_object.hget(f'meta:{obj_type}:{obj_id}', 'first_seen') + last_seen = r_object.hget(f'meta:{obj_type}:{obj_id}', 'last_seen') + if first_seen and last_seen: + for date in Date.get_daterange(first_seen, last_seen): + nb = r_object.zscore(f'{obj_type}:date:{date}', self.id) + r_object.zincrby(f'{self.type}:date:{date}', nb, self.id) + update_obj_date(first_seen, self.type) + update_obj_date(last_seen, self.type) + self._add_create() + self.set_first_seen(first_seen) + self.set_last_seen(last_seen) + def _add(self, date, obj): # TODO OBJ=None if not self.exists(): self._add_create() diff --git a/bin/modules/Global.py b/bin/modules/Global.py index f442a226..0dfafde1 100755 --- a/bin/modules/Global.py +++ b/bin/modules/Global.py @@ -132,7 +132,7 @@ class Global(AbstractModule): # TODO send to specific object queue => image, ... self.add_message_to_queue(obj=self.obj, queue='Item') elif self.obj.type == 'image': - self.add_message_to_queue(obj=self.obj, queue='Image') + self.add_message_to_queue(obj=self.obj, queue='Image', message=message) else: self.logger.critical(f"Empty obj: {self.obj} {message} not processed") diff --git a/bin/modules/Mixer.py b/bin/modules/Mixer.py index 8d9d513c..659874fe 100755 --- a/bin/modules/Mixer.py +++ b/bin/modules/Mixer.py @@ -218,7 +218,7 @@ class Mixer(AbstractModule): if self.obj.type == 'item': self.add_message_to_queue(obj=self.obj, message=gzip64encoded) else: - self.add_message_to_queue(obj=self.obj) + self.add_message_to_queue(obj=self.obj, message=gzip64encoded) if __name__ == "__main__": diff --git a/bin/modules/OcrExtractor.py b/bin/modules/OcrExtractor.py index 266d65a3..e3285c8e 100755 --- a/bin/modules/OcrExtractor.py +++ b/bin/modules/OcrExtractor.py @@ -83,7 +83,7 @@ class OcrExtractor(AbstractModule): def compute(self, message): image = self.get_obj() - print(image.id) + date = message ocr = Ocrs.Ocr(image.id) if self.is_cached(): @@ -96,19 +96,24 @@ class OcrExtractor(AbstractModule): if not ocr.exists(): path = image.get_filepath() languages = get_model_languages(image) - print(languages) + print(image.id, languages) texts = Ocrs.extract_text(path, languages) if texts: print('create') ocr = Ocrs.create(image.id, texts) - self.add_message_to_queue(ocr) + if ocr: + self.add_message_to_queue(ocr) + else: + print('no text') + self.add_to_cache() # Save in cache else: print('no text detected') self.add_to_cache() else: - print('update correlation') - ocr.update_correlation() + # print(image.id) + # print('update correlation', date) + ocr.update_correlation(date=date) if __name__ == '__main__': diff --git a/var/www/blueprints/objects_ocr.py b/var/www/blueprints/objects_ocr.py index eca7f1fe..87a827b3 100644 --- a/var/www/blueprints/objects_ocr.py +++ b/var/www/blueprints/objects_ocr.py @@ -25,6 +25,8 @@ from lib import Language from lib import Tag from lib.objects import Ocrs +from packages import Date + # ============ BLUEPRINT ============ objects_ocr = Blueprint('objects_ocr', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/ocr')) @@ -49,6 +51,48 @@ def ocr_image(filename): return send_file(BytesIO(ocr.draw_bounding_boxs()), mimetype='image/png') +@objects_ocr.route("/objects/ocrs", methods=['GET']) +@login_required +@login_read_only +def objects_ocrs(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + show_objects = request.args.get('show_objects') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + + if show_objects: + dict_objects = Ocrs.Ocrs().api_get_meta_by_daterange(date_from, date_to) + else: + dict_objects = {} + + return render_template("OcrDaterange.html", date_from=date_from, date_to=date_to, + dict_objects=dict_objects, show_objects=show_objects) + + +@objects_ocr.route("/objects/ocrs/post", methods=['POST']) +@login_required +@login_read_only +def objects_ocrs_post(): + date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + show_objects = request.form.get('show_objects') + return redirect(url_for('objects_ocr.objects_ocrs', date_from=date_from, date_to=date_to, show_objects=show_objects)) + + +@objects_ocr.route("/objects/ocrs/range/json", methods=['GET']) +@login_required +@login_read_only +def objects_ocrs_range_json(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + return jsonify(Ocrs.Ocrs().api_get_chart_nb_by_daterange(date_from, date_to)) + + @objects_ocr.route("/objects/ocr", methods=['GET']) @login_required @login_read_only diff --git a/var/www/templates/objects/ocr/OcrDaterange.html b/var/www/templates/objects/ocr/OcrDaterange.html new file mode 100644 index 00000000..abc3b26e --- /dev/null +++ b/var/www/templates/objects/ocr/OcrDaterange.html @@ -0,0 +1,602 @@ + + + + + Ocrs - AIL + + + + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'sidebars/sidebar_objects.html' %} + +
+ +
+
+
+ +{# {% include 'image/block_images_search.html' %}#} + +
+ + +
+ +
+
+
Select a date range :
+
+
+
+ +
+
+
+ +
+
+ + +
+ +
+
+
+ +
+
+
+
+
+
+ + {% if dict_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }} Ocrs Name:

+ {% else %} +

{{ date_from }} to {{ date_to }} Ocrs Name:

+ {% endif %} + + + + + + + + + + + + {% for obj_id in dict_objects %} + + + + + + + + {% endfor %} + +
First SeenLast SeenTotalLast days
{{ dict_objects[obj_id]['id'] }}{{ dict_objects[obj_id]['first_seen'] }}{{ dict_objects[obj_id]['last_seen'] }}{{ dict_objects[obj_id]['nb_seen'] }}
+ + + {% else %} + {% if show_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }}, No OCR

+ {% else %} +

{{ date_from }} to {{ date_to }}, No OCR

+ {% endif %} + {% endif %} + {% endif %} +
+ +
+
+ + + + + + + + + + + + + + + + + diff --git a/var/www/templates/sidebars/sidebar_objects.html b/var/www/templates/sidebars/sidebar_objects.html index ddba8474..93c86675 100644 --- a/var/www/templates/sidebars/sidebar_objects.html +++ b/var/www/templates/sidebars/sidebar_objects.html @@ -82,6 +82,12 @@ Image +