From 0cb7431e10388439877aa5c5c269f27b7eae8157 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Mon, 21 Aug 2023 15:49:32 +0200
Subject: [PATCH 01/15] chg: [modules] crawl pasties domains

---
 bin/lib/ConfigLoader.py        |   1 +
 bin/lib/regex_helper.py        |  28 +++++++
 bin/modules/Pasties.py         | 144 +++++++++++++++++++++++++++++++++
 bin/modules/Zerobins.py        |  71 ----------------
 bin/modules/abstract_module.py |   3 +
 configs/modules.cfg            |   2 +-
 6 files changed, 177 insertions(+), 72 deletions(-)
 create mode 100755 bin/modules/Pasties.py
 delete mode 100755 bin/modules/Zerobins.py

diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py
index 5be8f492..6ecd4b02 100755
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@@ -83,6 +83,7 @@ class ConfigLoader(object):
         else:
             return []
 
+
 # # # # Directory Config # # # #
 
 config_loader = ConfigLoader()
diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py
index 41ba4e98..6f877823 100755
--- a/bin/lib/regex_helper.py
+++ b/bin/lib/regex_helper.py
@@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
         proc.terminate()
         sys.exit(0)
 
+def _regex_match(r_key, regex, content):
+    if re.match(regex, content):
+        r_serv_cache.set(r_key, 1)
+        r_serv_cache.expire(r_key, 360)
+
+def regex_match(r_key, regex, item_id, content, max_time=30):
+    proc = Proc(target=_regex_match, args=(r_key, regex, content))
+    try:
+        proc.start()
+        proc.join(max_time)
+        if proc.is_alive():
+            proc.terminate()
+            # Statistics.incr_module_timeout_statistic(r_key)
+            err_mess = f"{r_key}: processing timeout: {item_id}"
+            logger.info(err_mess)
+            return False
+        else:
+            if r_serv_cache.exists(r_key):
+                r_serv_cache.delete(r_key)
+                return True
+            else:
+                r_serv_cache.delete(r_key)
+                return False
+    except KeyboardInterrupt:
+        print("Caught KeyboardInterrupt, terminating regex worker")
+        proc.terminate()
+        sys.exit(0)
+
 def _regex_search(r_key, regex, content):
     if re.search(regex, content):
         r_serv_cache.set(r_key, 1)
diff --git a/bin/modules/Pasties.py b/bin/modules/Pasties.py
new file mode 100755
index 00000000..ce2eff10
--- /dev/null
+++ b/bin/modules/Pasties.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The Pasties Module
+======================
+This module spots domain-pasties services for further processing
+"""
+
+##################################
+# Import External packages
+##################################
+import os
+import sys
+import time
+
+from pyfaup.faup import Faup
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+from lib import crawlers
+
+# TODO add url validator
+
+pasties_blocklist_urls = set()
+pasties_domains = {}
+
+class Pasties(AbstractModule):
+    """
+    Pasties module for AIL framework
+    """
+
+    def __init__(self):
+        super(Pasties, self).__init__()
+        self.faup = Faup()
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.pasties = {}
+        self.urls_blocklist = set()
+        self.load_pasties_domains()
+
+        # Send module state to logs
+        self.logger.info(f'Module {self.module_name} initialized')
+
+    def load_pasties_domains(self):
+        self.pasties = {}
+        self.urls_blocklist = set()
+
+        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
+        if os.path.exists(domains_pasties):
+            with open(domains_pasties) as f:
+                for line in f:
+                    url = line.strip()
+                    if url: # TODO validate line
+                        self.faup.decode(url)
+                        url_decoded = self.faup.get()
+                        host = url_decoded['host']
+                        # if url_decoded.get('port', ''):
+                        #     host = f'{host}:{url_decoded["port"]}'
+                        path = url_decoded.get('resource_path', '')
+                        # print(url_decoded)
+                        if path and path != '/':
+                            if path[-1] != '/':
+                                path = f'{path}/'
+                        else:
+                            path = None
+
+                        if host in self.pasties:
+                            if path:
+                                self.pasties[host].add(path)
+                        else:
+                            if path:
+                                self.pasties[host] = {path}
+                            else:
+                                self.pasties[host] = set()
+
+        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
+        if os.path.exists(url_blocklist):
+            with open(url_blocklist) as f:
+                for line in f:
+                    url = line.strip()
+                    self.faup.decode(url)
+                    url_decoded = self.faup.get()
+                    host = url_decoded['host']
+                    # if url_decoded.get('port', ''):
+                    #     host = f'{host}:{url_decoded["port"]}'
+                    path = url_decoded.get('resource_path', '')
+                    url = f'{host}{path}'
+                    if url_decoded['query_string']:
+                        url = url + url_decoded['query_string']
+                    self.urls_blocklist.add(url)
+
+    def send_to_crawler(self, url, obj_id):
+        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
+            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
+            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)
+            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)
+
+    def compute(self, message):
+        url, item_id = message.split()
+
+        self.faup.decode(url)
+        url_decoded = self.faup.get()
+        # print(url_decoded)
+        url_host = url_decoded['host']
+        # if url_decoded.get('port', ''):
+        #     url_host = f'{url_host}:{url_decoded["port"]}'
+        path = url_decoded.get('resource_path', '')
+        if url_host in self.pasties:
+            if url.startswith('http://'):
+                if url[7:] in self.urls_blocklist:
+                    return None
+            elif url.startswith('https://'):
+                if url[8:] in self.urls_blocklist:
+                    return None
+            else:
+                if url in self.urls_blocklist:
+                    return None
+
+            if not self.pasties[url_host]:
+                if path and path != '/':
+                    print('send to crawler', url_host, url)
+                    self.send_to_crawler(url, item_id)
+            else:
+                if path.endswith('/'):
+                    path_end = path[:-1]
+                else:
+                    path_end = f'{path}/'
+                for url_path in self.pasties[url_host]:
+                    if path.startswith(url_path):
+                        if url_path != path and url_path != path_end:
+                            print('send to crawler', url_path, url)
+                            self.send_to_crawler(url, item_id)
+                            break
+
+
+if __name__ == '__main__':
+    module = Pasties()
+    module.run()
diff --git a/bin/modules/Zerobins.py b/bin/modules/Zerobins.py
deleted file mode 100755
index f3fcea5a..00000000
--- a/bin/modules/Zerobins.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-"""
-The Zerobins Module
-======================
-This module spots zerobins-like services for further processing
-"""
-
-##################################
-# Import External packages
-##################################
-import os
-import re
-import sys
-
-sys.path.append(os.environ['AIL_BIN'])
-##################################
-# Import Project packages
-##################################
-from modules.abstract_module import AbstractModule
-from lib import crawlers
-
-
-class Zerobins(AbstractModule):
-    """
-    Zerobins module for AIL framework
-    """
-
-    def __init__(self):
-        super(Zerobins, self).__init__()
-
-        binz = [
-            r'^https:\/\/(zerobin||privatebin)\..*$',  # historical ones
-            ]
-
-        self.regex = re.compile('|'.join(binz))
-
-        # Pending time between two computation (computeNone) in seconds
-        self.pending_seconds = 10
-
-        # Send module state to logs
-        self.logger.info(f'Module {self.module_name} initialized')
-
-    def computeNone(self):
-        """
-        Compute when no message in queue
-        """
-        self.logger.debug("No message in queue")
-
-    def compute(self, message):
-        """
-        Compute a message in queue
-        """
-        url, item_id = message.split()
-
-        # Extract zerobins addresses
-        matching_binz = self.regex_findall(self.regex, item_id, url)
-
-        if len(matching_binz) > 0:
-            for bin_url in matching_binz:
-                print(f'send {bin_url} to crawler')
-                # TODO Change priority ???
-                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
-                                     parent='manual', priority=60)
-
-        self.logger.debug("Compute message in queue")
-
-
-if __name__ == '__main__':
-    module = Zerobins()
-    module.run()
diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py
index 0a1a12cd..164e77b3 100644
--- a/bin/modules/abstract_module.py
+++ b/bin/modules/abstract_module.py
@@ -92,6 +92,9 @@ class AbstractModule(ABC):
     def get_available_queues(self):
         return self.queue.get_out_queues()
 
+    def regex_match(self, regex, obj_id, content):
+        return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
+
     def regex_search(self, regex, obj_id, content):
         return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
 
diff --git a/configs/modules.cfg b/configs/modules.cfg
index b0b1f6df..3ce4f0ae 100644
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@@ -162,7 +162,7 @@ publish = Importers,Tags
 subscribe = Item
 publish = Tags
 
-[Zerobins]
+[Pasties]
 subscribe = Url
 
 # [My_Module_Name]

From 045aab6f3425ef9f3b2ca20cf69acbde6e0ae52e Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Mon, 21 Aug 2023 15:52:33 +0200
Subject: [PATCH 02/15] fix: [module pasties] fix module name

---
 bin/LAUNCH.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 00c224e4..39640a71 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -267,7 +267,7 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "LibInjection" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./LibInjection.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Zerobins" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Zerobins.py; read x"
+    screen -S "Script_AIL" -X screen -t "Pasties" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Pasties.py; read x"
     sleep 0.1
 
     screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x"

From f44c5509da842be5ec0756d042fad0d5d7d0a005 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Wed, 23 Aug 2023 11:16:22 +0200
Subject: [PATCH 03/15] chg: [titles] add yara tracker on title + tags domains
 if unsafe title tags

---
 bin/crawlers/Crawler.py                   | 9 +++++++++
 bin/lib/Tracker.py                        | 4 ++--
 bin/lib/ail_core.py                       | 2 +-
 bin/lib/objects/Titles.py                 | 3 ++-
 var/www/templates/hunter/tracker_add.html | 4 ++++
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 7f2c3df9..c22f6ccf 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -22,6 +22,7 @@ from lib.objects.Domains import Domain
 from lib.objects.Items import Item
 from lib.objects import Screenshots
 from lib.objects import Titles
+from trackers.Tracker_Yara import Tracker_Yara
 
 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
 
@@ -35,6 +36,8 @@ class Crawler(AbstractModule):
         # Waiting time in seconds between to message processed
         self.pending_seconds = 1
 
+        self.tracker_yara = Tracker_Yara(queue=False)
+
         config_loader = ConfigLoader()
 
         self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@@ -283,6 +286,12 @@ class Crawler(AbstractModule):
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item_id)
+                # Tracker
+                self.tracker_yara.compute(title.get_id(), obj_type=title.get_type())
+                if not title.is_tags_safe():
+                    unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
+                    self.domain.add_tag(unsafe_tag)
+                    item.add_tag(unsafe_tag)
 
             # SCREENSHOT
             if self.screenshot:
diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py
index f1ea8905..c06e303d 100755
--- a/bin/lib/Tracker.py
+++ b/bin/lib/Tracker.py
@@ -923,7 +923,7 @@ def api_add_tracker(dict_input, user_id):
     # Filters # TODO MOVE ME
     filters = dict_input.get('filters', {})
     if filters:
-        if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
+        if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
             filters = {}
         for obj_type in filters:
             if obj_type not in get_objects_tracked():
@@ -998,7 +998,7 @@ def api_edit_tracker(dict_input, user_id):
     # Filters # TODO MOVE ME
     filters = dict_input.get('filters', {})
     if filters:
-        if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
+        if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
             if not filters['decoded'] and not filters['item']:
                 filters = {}
         for obj_type in filters:
diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py
index 75520a2b..9a7d9557 100755
--- a/bin/lib/ail_core.py
+++ b/bin/lib/ail_core.py
@@ -50,7 +50,7 @@ def get_object_all_subtypes(obj_type):
     return []
 
 def get_objects_tracked():
-    return ['decoded', 'item', 'pgp']
+    return ['decoded', 'item', 'pgp', 'title']
 
 def get_objects_retro_hunted():
     return ['decoded', 'item']
diff --git a/bin/lib/objects/Titles.py b/bin/lib/objects/Titles.py
index 9f88426c..1a29d58e 100755
--- a/bin/lib/objects/Titles.py
+++ b/bin/lib/objects/Titles.py
@@ -45,6 +45,8 @@ class Title(AbstractDaterangeObject):
     def get_content(self, r_type='str'):
         if r_type == 'str':
             return self._get_field('content')
+        elif r_type == 'bytes':
+            return self._get_field('content').encode()
 
     def get_link(self, flask_context=False):
         if flask_context:
@@ -122,4 +124,3 @@ class Titles(AbstractDaterangeObjects):
 #     #     print(r)
 #     r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False)
 #     print(r)
-
diff --git a/var/www/templates/hunter/tracker_add.html b/var/www/templates/hunter/tracker_add.html
index 7cc690ba..05266fa4 100644
--- a/var/www/templates/hunter/tracker_add.html
+++ b/var/www/templates/hunter/tracker_add.html
@@ -132,6 +132,10 @@
                                                             </div>
                                                         </div>
                                                     </div>
+                                                    <div class="custom-control custom-switch mt-1">
+                                                        <input class="custom-control-input" type="checkbox" name="title_obj" id="title_obj" checked="">
+                                                        <label class="custom-control-label" for="title_obj"><i class="fas fa-lock-open"></i>&nbsp;Decoded <i class="fas fa-heading text-info" data-toggle="tooltip" data-placement="right" title="Title that has been extracted from a HTML page"></i></label>
+                                                    </div>
 
 {#                                                    <div class="custom-control custom-switch mt-1">#}
 {#                                                        <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#}

From 46c721590d83301b46999fed645ce16c1cfaff40 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Wed, 23 Aug 2023 11:21:22 +0200
Subject: [PATCH 04/15] fix: [tracker objs filter] fix title icon

---
 var/www/templates/hunter/tracker_add.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/var/www/templates/hunter/tracker_add.html b/var/www/templates/hunter/tracker_add.html
index 05266fa4..4f8c6f3e 100644
--- a/var/www/templates/hunter/tracker_add.html
+++ b/var/www/templates/hunter/tracker_add.html
@@ -134,7 +134,7 @@
                                                     </div>
                                                     <div class="custom-control custom-switch mt-1">
                                                         <input class="custom-control-input" type="checkbox" name="title_obj" id="title_obj" checked="">
-                                                        <label class="custom-control-label" for="title_obj"><i class="fas fa-lock-open"></i>&nbsp;Decoded <i class="fas fa-heading text-info" data-toggle="tooltip" data-placement="right" title="Title that has been extracted from a HTML page"></i></label>
+                                                        <label class="custom-control-label" for="title_obj"><i class="fas fa-heading"></i>&nbsp;Title <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Title that has been extracted from a HTML page"></i></label>
                                                     </div>
 
 {#                                                    <div class="custom-control custom-switch mt-1">#}

From 2145eb7b8a89fafd4c7631a23f3de01bd1a87570 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Wed, 23 Aug 2023 11:46:37 +0200
Subject: [PATCH 05/15] fix: [title] fix None title

---
 bin/lib/crawlers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 3e61ed88..6e9132d2 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -234,7 +234,9 @@ def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
-        return str(title.string)
+        title = title.string
+        if title:
+            return str(title)
     return ''
 
 def extract_description_from_html(html):
@@ -2022,4 +2024,4 @@ if __name__ == '__main__':
     # _reprocess_all_hars_cookie_name()
     # _reprocess_all_hars_etag()
     # _gzip_all_hars()
-    _reprocess_all_hars_hhhashs()
+    # _reprocess_all_hars_hhhashs()

From 4e3784922c3dc420828f95cfe6afa63e772194c0 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Wed, 23 Aug 2023 11:47:39 +0200
Subject: [PATCH 06/15] fix: typo

---
 bin/lib/crawlers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 6e9132d2..18b1eeac 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -2012,7 +2012,7 @@ def test_ail_crawlers():
 # TODO MOVE ME IN CRAWLER OR FLASK
 load_blacklist()
 
-if __name__ == '__main__':
+# if __name__ == '__main__':
     # delete_captures()
 
     # item_id = 'crawled/2023/02/20/data.gz'

From c01b806ae30c6304bee5203b6fd46b389cbf1c2b Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Thu, 24 Aug 2023 11:11:57 +0200
Subject: [PATCH 07/15] chg: [mail exporter] add obj content extract for each
 yara rule match

---
 bin/exporter/MailExporter.py | 20 ++++++++++----
 bin/lib/Tracker.py           |  3 +-
 bin/trackers/Tracker_Yara.py | 53 ++++++++++++++++++++++++++++++++++--
 3 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/bin/exporter/MailExporter.py b/bin/exporter/MailExporter.py
index c4d3f5b5..40ee1708 100755
--- a/bin/exporter/MailExporter.py
+++ b/bin/exporter/MailExporter.py
@@ -124,16 +124,26 @@ class MailExporterTracker(MailExporter):
     def __init__(self, host=None, port=None, password=None, user='', sender=''):
         super().__init__(host=host, port=port, password=password, user=user, sender=sender)
 
-    def export(self, tracker, obj):  # TODO match
+    def export(self, tracker, obj, matches=None):  # None, not a shared mutable [] default
         tracker_type = tracker.get_type()
         tracker_name = tracker.get_tracked()
-        subject = f'AIL Framework Tracker: {tracker_name}'  # TODO custom subject
+        description = tracker.get_description()
+        if not description:
+            description = tracker_name
+
+        subject = f'AIL Framework Tracker: {description}'
         body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
         body += f'Item: {obj.id}\nurl:{obj.get_link()}'
 
-        # TODO match option
-        # if match:
-        #     body += f'Tracker Match:\n\n{escape(match)}'
+        if matches:
+            body += '\n'
+            nb = 1
+            for match in matches:
+                body += f'\nMatch {nb}: {match[0]}\nExtract:\n{match[1]}\n\n'
+                nb += 1
+        else:
+            body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
+            body += f'Item: {obj.id}\nurl:{obj.get_link()}'
 
         for mail in tracker.get_mails():
             self._export(mail, subject, body)
diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py
index c06e303d..4baa3e5f 100755
--- a/bin/lib/Tracker.py
+++ b/bin/lib/Tracker.py
@@ -248,7 +248,8 @@ class Tracker:
         return self._get_field('user_id')
 
     def webhook_export(self):
-        return r_tracker.hexists(f'tracker:{self.uuid}', 'webhook')
+        webhook = self.get_webhook()
+        return bool(webhook)  # True only for a non-empty webhook; never leaks the URL itself
 
     def get_webhook(self):
         return r_tracker.hget(f'tracker:{self.uuid}', 'webhook')
diff --git a/bin/trackers/Tracker_Yara.py b/bin/trackers/Tracker_Yara.py
index fab397d1..1cebeaa6 100755
--- a/bin/trackers/Tracker_Yara.py
+++ b/bin/trackers/Tracker_Yara.py
@@ -73,8 +73,56 @@ class Tracker_Yara(AbstractModule):
             print(f'{self.obj.get_id()}: yara scanning timed out')
             self.redis_logger.info(f'{self.obj.get_id()}: yara scanning timed out')
 
+    def convert_byte_offset_to_string(self, b_content, offset):
+        byte_chunk = b_content[:offset + 1]
+        try:
+            string_chunk = byte_chunk.decode()
+            offset = len(string_chunk) - 1
+            return offset
+        except UnicodeDecodeError:
+            return self.convert_byte_offset_to_string(b_content, offset - 1)
+
+    def extract_matches(self, data, limit=500, lines=5):
+        matches = []
+        content = self.obj.get_content()
+        l_content = len(content)
+        b_content = content.encode()
+        for string_match in data.get('strings'):
+            for string_match_instance in string_match.instances:
+                start = string_match_instance.offset
+                value = string_match_instance.matched_data.decode(errors='replace')  # match may split a multi-byte char
+                end = start + string_match_instance.matched_length
+                # str
+                start = self.convert_byte_offset_to_string(b_content, start)
+                end = self.convert_byte_offset_to_string(b_content, end)
+
+                # Start
+                if start > limit:
+                    i_start = start - limit
+                else:
+                    i_start = 0
+                str_start = content[i_start:start].splitlines()
+                if len(str_start) > lines:
+                    str_start = '\n'.join(str_start[-lines + 1:])
+                else:
+                    str_start = content[i_start:start]
+
+                # End
+                if end + limit > l_content:
+                    i_end = l_content
+                else:
+                    i_end = end + limit
+                str_end = content[end:i_end].splitlines()
+                if len(str_end) > lines:
+                    str_end = '\n'.join(str_end[:lines + 1])
+                else:
+                    str_end = content[end:i_end]
+                matches.append((value, f'{str_start}{value}{str_end}'))
+        return matches
+
     def yara_rules_match(self, data):
         tracker_name = data['namespace']
+        matches = None
         obj_id = self.obj.get_id()
         for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type('yara', self.obj.get_type(), tracker_name):
             tracker = Tracker.Tracker(tracker_uuid)
@@ -96,8 +144,9 @@ class Tracker_Yara(AbstractModule):
 
             # Mails
             if tracker.mail_export():
-                # TODO add matches + custom subjects
-                self.exporters['mail'].export(tracker, self.obj)
+                if not matches:
+                    matches = self.extract_matches(data)
+                self.exporters['mail'].export(tracker, self.obj, matches)
 
             # Webhook
             if tracker.webhook_export():

From 546d6538fd25cbf701b220b4440699f776367cb7 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Thu, 24 Aug 2023 14:37:50 +0200
Subject: [PATCH 08/15] chg: [mail exporter] add obj content extract for each
 regex match

---
 bin/exporter/MailExporter.py  |  1 +
 bin/trackers/Tracker_Regex.py | 51 ++++++++++++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/bin/exporter/MailExporter.py b/bin/exporter/MailExporter.py
index 40ee1708..41074d7b 100755
--- a/bin/exporter/MailExporter.py
+++ b/bin/exporter/MailExporter.py
@@ -145,5 +145,6 @@ class MailExporterTracker(MailExporter):
             body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
             body += f'Item: {obj.id}\nurl:{obj.get_link()}'
 
+        # print(body)
         for mail in tracker.get_mails():
             self._export(mail, subject, body)
diff --git a/bin/trackers/Tracker_Regex.py b/bin/trackers/Tracker_Regex.py
index 5cc06410..db35f239 100755
--- a/bin/trackers/Tracker_Regex.py
+++ b/bin/trackers/Tracker_Regex.py
@@ -41,6 +41,8 @@ class Tracker_Regex(AbstractModule):
         self.tracked_regexs = Tracker.get_tracked_regexs()
         self.last_refresh = time.time()
 
+        self.obj = None
+
         # Exporter
         self.exporters = {'mail': MailExporterTracker(),
                           'webhook': WebHookExporterTracker()}
@@ -56,6 +58,7 @@ class Tracker_Regex(AbstractModule):
             print('Tracked regex refreshed')
 
         obj = ail_objects.get_object(obj_type, subtype, obj_id)
+        self.obj = obj
         obj_id = obj.get_id()
         obj_type = obj.get_type()
 
@@ -66,12 +69,46 @@ class Tracker_Regex(AbstractModule):
         content = obj.get_content()
 
         for dict_regex in self.tracked_regexs[obj_type]:
-            matched = self.regex_findall(dict_regex['regex'], obj_id, content)
-            if matched:
-                self.new_tracker_found(dict_regex['tracked'], 'regex', obj)
+            matches = self.regex_finditer(dict_regex['regex'], obj_id, content)
+            if matches:
+                self.new_tracker_found(dict_regex['tracked'], 'regex', obj, matches)
 
-    def new_tracker_found(self, tracker_name, tracker_type, obj):
+    def extract_matches(self, re_matches, limit=500, lines=5):
+        matches = []
+        content = self.obj.get_content()
+        l_content = len(content)
+        for match in re_matches:
+            start = match[0]
+            value = match[2]
+            end = match[1]
+
+            # Start
+            if start > limit:
+                i_start = start - limit
+            else:
+                i_start = 0
+            str_start = content[i_start:start].splitlines()
+            if len(str_start) > lines:
+                str_start = '\n'.join(str_start[-lines + 1:])
+            else:
+                str_start = content[i_start:start]
+
+            # End
+            if end + limit > l_content:
+                i_end = l_content
+            else:
+                i_end = end + limit
+            str_end = content[end:i_end].splitlines()
+            if len(str_end) > lines:
+                str_end = '\n'.join(str_end[:lines + 1])
+            else:
+                str_end = content[end:i_end]
+            matches.append((value, f'{str_start}{value}{str_end}'))
+        return matches
+
+    def new_tracker_found(self, tracker_name, tracker_type, obj, re_matches):
         obj_id = obj.get_id()
+        matches = None
         for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type(tracker_type, obj.get_type(), tracker_name):
             tracker = Tracker.Tracker(tracker_uuid)
 
@@ -93,8 +130,9 @@ class Tracker_Regex(AbstractModule):
                     obj.add_tag(tag)
 
             if tracker.mail_export():
-                # TODO add matches + custom subjects
-                self.exporters['mail'].export(tracker, obj)
+                if not matches:
+                    matches = self.extract_matches(re_matches)
+                self.exporters['mail'].export(tracker, obj, matches)
 
             if tracker.webhook_export():
                 self.exporters['webhook'].export(tracker, obj)
@@ -103,4 +141,3 @@ class Tracker_Regex(AbstractModule):
 if __name__ == "__main__":
     module = Tracker_Regex()
     module.run()
-    # module.compute('submitted/2023/05/02/submitted_b1e518f1-703b-40f6-8238-d1c22888197e.gz')

From 24969610cc4d5c04845e65dfaf9a5592487a0954 Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Tue, 29 Aug 2023 11:59:39 +0200
Subject: [PATCH 09/15] fix: [items source] fix empty sources list

---
 bin/lib/item_basic.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py
index 71fa5378..b35d126e 100755
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@@ -209,7 +209,10 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt
         l_dir = os.listdir(directory)
     # empty directory
     if not l_dir:
-        return l_sources_name.add(source_name)
+        if source_name:
+            return l_sources_name.add(source_name) or l_sources_name  # set.add() returns None: return the set
+        else:
+            return l_sources_name
     else:
         for src_name in l_dir:
             if len(src_name) == 4:

From 099253f8546237b6164f90e78b16d5444fbf3fbb Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Tue, 29 Aug 2023 13:50:16 +0200
Subject: [PATCH 10/15] fix: [json importer] fix empty source name

---
 bin/importer/feeders/Default.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bin/importer/feeders/Default.py b/bin/importer/feeders/Default.py
index 482d06b4..100ed1e6 100755
--- a/bin/importer/feeders/Default.py
+++ b/bin/importer/feeders/Default.py
@@ -24,8 +24,12 @@ class DefaultFeeder:
         Return feeder name. first part of the item_id and display in the UI
         """
         if not self.name:
-            return self.get_source()
-        return self.name
+            name = self.get_source()
+        else:
+            name = self.name
+        if not name:
+            name = 'default'
+        return name
 
     def get_source(self):
         return self.json_data.get('source')

From 7c73f0944a1a4b8ba052563f6bc0b03374c6ffdf Mon Sep 17 00:00:00 2001
From: Terrtia <or1994@hotmail.fr>
Date: Tue, 29 Aug 2023 14:03:26 +0200
Subject: [PATCH 11/15] fix: [items source] filter invalid item sources

---
 bin/lib/item_basic.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py
index b35d126e..25420106 100755
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@@ -204,7 +204,11 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt
     if not l_sources_name:
         l_sources_name = set()
     if source_name:
-        l_dir = os.listdir(os.path.join(directory, source_name))
+        path = os.path.join(directory, source_name)
+        if os.path.isdir(path):
+            l_dir = os.listdir(os.path.join(directory, source_name))
+        else:
+            l_dir = []
     else:
         l_dir = os.listdir(directory)
     # empty directory
@@ -215,7 +219,7 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt
             return l_sources_name
     else:
         for src_name in l_dir:
-            if len(src_name) == 4:
+            if len(src_name) == 4 and source_name:
                 # try:
                 int(src_name)
                 to_add = os.path.join(source_name)

From ed0423118e9facb55fff0d3ef381e688aeb0ade0 Mon Sep 17 00:00:00 2001
From: Jean-Louis Huynen <huynenjl@gmail.com>
Date: Thu, 31 Aug 2023 15:42:44 +0200
Subject: [PATCH 12/15] chg: [crawlers] submit a single cookie to the crawler
 task API

---
 bin/lib/crawlers.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 18b1eeac..3a0a9f19 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -1692,6 +1692,18 @@ def api_add_crawler_task(data, user_id=None):
                 return {'error': 'The access to this cookiejar is restricted'}, 403
         cookiejar_uuid = cookiejar.uuid
 
+    cookie = data.get('cookie', None)
+    if not cookiejar_uuid and cookie:
+        # Create new cookiejar
+        cookiejar_uuid = create_cookiejar(user_id, "single-shot cookiejar", 1, None)
+        cookiejar = Cookiejar(cookiejar_uuid)
+        try:
+            name = cookie.get('name')
+            value = cookie.get('value')
+            cookiejar.add_cookie(name, value, None, None, None, None, None)
+        except KeyError:
+            return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
+
     frequency = data.get('frequency', None)
     if frequency:
         if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:

From 68c17c3fbcc20b9e63a3b97d0faac092a970dd10 Mon Sep 17 00:00:00 2001
From: Jean-Louis Huynen <huynenjl@gmail.com>
Date: Thu, 31 Aug 2023 16:13:20 +0200
Subject: [PATCH 13/15] chg: [crawlers] submit cookies to the crawler task API

---
 bin/lib/crawlers.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 3a0a9f19..67f868f0 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -1692,17 +1692,18 @@ def api_add_crawler_task(data, user_id=None):
                 return {'error': 'The access to this cookiejar is restricted'}, 403
         cookiejar_uuid = cookiejar.uuid
 
-    cookie = data.get('cookie', None)
-    if not cookiejar_uuid and cookie:
+    cookies = data.get('cookies', None)
+    if not cookiejar_uuid and cookies:
         # Create new cookiejar
         cookiejar_uuid = create_cookiejar(user_id, "single-shot cookiejar", 1, None)
         cookiejar = Cookiejar(cookiejar_uuid)
-        try:
-            name = cookie.get('name')
-            value = cookie.get('value')
-            cookiejar.add_cookie(name, value, None, None, None, None, None)
-        except KeyError:
-            return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
+        for cookie in cookies:
+            try:
+                name = cookie.get('name')
+                value = cookie.get('value')
+                cookiejar.add_cookie(name, value, None, None, None, None, None)
+            except KeyError:
+                return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
 
     frequency = data.get('frequency', None)
     if frequency:

From fee3332edbe223106eb5a233746198fe7f174679 Mon Sep 17 00:00:00 2001
From: terrtia <or1994@hotmail.fr>
Date: Fri, 29 Sep 2023 15:43:37 +0200
Subject: [PATCH 14/15] fix: [tracker] delete yara rule, fix filter by object
 type

---
 bin/lib/Tracker.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py
index 4baa3e5f..9c4702ae 100755
--- a/bin/lib/Tracker.py
+++ b/bin/lib/Tracker.py
@@ -2,6 +2,8 @@
 # -*-coding:UTF-8 -*
 import json
 import os
+import logging
+import logging.config
 import re
 import sys
 import time
@@ -24,11 +26,16 @@ sys.path.append(os.environ['AIL_BIN'])
 ##################################
 from packages import Date
 from lib.ail_core import get_objects_tracked, get_object_all_subtypes, get_objects_retro_hunted
+from lib import ail_logger
 from lib import ConfigLoader
 from lib import item_basic
 from lib import Tag
 from lib.Users import User
 
+# LOGS
+logging.config.dictConfig(ail_logger.get_config(name='modules'))
+logger = logging.getLogger()
+
 config_loader = ConfigLoader.ConfigLoader()
 r_cache = config_loader.get_redis_conn("Redis_Cache")
 
@@ -561,9 +568,7 @@ class Tracker:
                     os.remove(filepath)
 
         # Filters
-        filters = self.get_filters()
-        if not filters:
-            filters = get_objects_tracked()
+        filters = get_objects_tracked()
         for obj_type in filters:
             r_tracker.srem(f'trackers:objs:{tracker_type}:{obj_type}', tracked)
             r_tracker.srem(f'trackers:uuid:{tracker_type}:{tracked}', f'{self.uuid}:{obj_type}')
@@ -1152,7 +1157,11 @@ def get_tracked_yara_rules():
     for obj_type in get_objects_tracked():
         rules = {}
         for tracked in _get_tracked_by_obj_type('yara', obj_type):
-            rules[tracked] = os.path.join(get_yara_rules_dir(), tracked)
+            rule = os.path.join(get_yara_rules_dir(), tracked)
+            if not os.path.exists(rule):
+                logger.critical(f"Yara rule don't exists {tracked} : {obj_type}")
+            else:
+                rules[tracked] = rule
         to_track[obj_type] = yara.compile(filepaths=rules)
     print(to_track)
     return to_track

From fb4a74b45a49dc968bf823866e06a75fdc8f92d5 Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Tue, 3 Oct 2023 11:56:01 +0200
Subject: [PATCH 15/15] fix: [dep] Pinning flask to < 3.0 due to Werkzeug 3.0
 issues: https://stackoverflow.com/a/77215455

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8bb16553..8e9bb803 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -67,7 +67,7 @@ pylibinjection>=0.2.4
 phonenumbers>8.12.1
 
 # Web
-flask>=1.1.4
+flask==2.3.3
 flask-login
 bcrypt>3.1.6