From 7fd8ae4a81ec54c9c1bc2d616e2b688a74951710 Mon Sep 17 00:00:00 2001 From: terrtia Date: Wed, 24 Apr 2024 15:16:18 +0200 Subject: [PATCH] chg: [reprocess tool] add OcrExtractor module + filter image gif --- bin/lib/objects/Images.py | 9 +++++++++ bin/modules/OcrExtractor.py | 4 ++++ tools/reprocess_objects.py | 8 +++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/bin/lib/objects/Images.py b/bin/lib/objects/Images.py index 391a2431..0e8e1d7a 100755 --- a/bin/lib/objects/Images.py +++ b/bin/lib/objects/Images.py @@ -2,6 +2,7 @@ # -*-coding:UTF-8 -* import base64 +import magic import os import sys @@ -64,6 +65,14 @@ class Image(AbstractDaterangeObject): filename = os.path.join(IMAGE_FOLDER, self.get_rel_path()) return os.path.realpath(filename) + def is_gif(self, filepath=None): + if not filepath: + filepath = self.get_filepath() + mime = magic.from_file(filepath, mime=True) + if mime == 'image/gif': + return True + return False + def get_file_content(self): filepath = self.get_filepath() with open(filepath, 'rb') as f: diff --git a/bin/modules/OcrExtractor.py b/bin/modules/OcrExtractor.py index deb732da..266d65a3 100755 --- a/bin/modules/OcrExtractor.py +++ b/bin/modules/OcrExtractor.py @@ -89,6 +89,10 @@ class OcrExtractor(AbstractModule): if self.is_cached(): return None + if self.obj.is_gif(): + self.logger.warning(f'Ignoring GIF: {self.obj.id}') + return None + if not ocr.exists(): path = image.get_filepath() languages = get_model_languages(image) diff --git a/tools/reprocess_objects.py b/tools/reprocess_objects.py index a832487a..b41f59e3 100755 --- a/tools/reprocess_objects.py +++ b/tools/reprocess_objects.py @@ -30,15 +30,21 @@ from lib.objects import ail_objects # from modules.Telegram import Telegram from modules.Languages import Languages +from modules.OcrExtractor import OcrExtractor MODULES = { - 'Languages': Languages + 'Languages': Languages, + 'OcrExtractor': OcrExtractor + } def reprocess_message_objects(object_type, module_name=None): if module_name: module = MODULES[module_name]() for obj in ail_objects.obj_iterator(object_type, filters={}): + if not obj.exists(): + print(f'ERROR: object does not exist, {obj.id}') + continue module.obj = obj module.compute(None) else: