From 0cb7431e10388439877aa5c5c269f27b7eae8157 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 21 Aug 2023 15:49:32 +0200 Subject: [PATCH 01/15] chg: [modules] crawl pasties domains --- bin/lib/ConfigLoader.py | 1 + bin/lib/regex_helper.py | 28 +++++++ bin/modules/Pasties.py | 144 +++++++++++++++++++++++++++++++++ bin/modules/Zerobins.py | 71 ---------------- bin/modules/abstract_module.py | 3 + configs/modules.cfg | 2 +- 6 files changed, 177 insertions(+), 72 deletions(-) create mode 100755 bin/modules/Pasties.py delete mode 100755 bin/modules/Zerobins.py diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py index 5be8f492..6ecd4b02 100755 --- a/bin/lib/ConfigLoader.py +++ b/bin/lib/ConfigLoader.py @@ -83,6 +83,7 @@ class ConfigLoader(object): else: return [] + # # # # Directory Config # # # # config_loader = ConfigLoader() diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py index 41ba4e98..6f877823 100755 --- a/bin/lib/regex_helper.py +++ b/bin/lib/regex_helper.py @@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30): proc.terminate() sys.exit(0) +def _regex_match(r_key, regex, content): + if re.match(regex, content): + r_serv_cache.set(r_key, 1) + r_serv_cache.expire(r_key, 360) + +def regex_match(r_key, regex, item_id, content, max_time=30): + proc = Proc(target=_regex_match, args=(r_key, regex, content)) + try: + proc.start() + proc.join(max_time) + if proc.is_alive(): + proc.terminate() + # Statistics.incr_module_timeout_statistic(r_key) + err_mess = f"{r_key}: processing timeout: {item_id}" + logger.info(err_mess) + return False + else: + if r_serv_cache.exists(r_key): + r_serv_cache.delete(r_key) + return True + else: + r_serv_cache.delete(r_key) + return False + except KeyboardInterrupt: + print("Caught KeyboardInterrupt, terminating regex worker") + proc.terminate() + sys.exit(0) + def _regex_search(r_key, regex, content): if re.search(regex, content): r_serv_cache.set(r_key, 1) diff --git a/bin/modules/Pasties.py b/bin/modules/Pasties.py new file mode 100755 index 00000000..ce2eff10 --- /dev/null +++ b/bin/modules/Pasties.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* +""" +The Pasties Module +====================== +This module spots domain-pasties services for further processing +""" + +################################## +# Import External packages +################################## +import os +import sys +import time + +from pyfaup.faup import Faup + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from modules.abstract_module import AbstractModule +from lib.ConfigLoader import ConfigLoader +from lib import crawlers + +# TODO add url validator + +pasties_blocklist_urls = set() +pasties_domains = {} + +class Pasties(AbstractModule): + """ + Pasties module for AIL framework + """ + + def __init__(self): + super(Pasties, self).__init__() + self.faup = Faup() + + config_loader = ConfigLoader() + self.r_cache = config_loader.get_redis_conn("Redis_Cache") + + self.pasties = {} + self.urls_blocklist = set() + self.load_pasties_domains() + + # Send module state to logs + self.logger.info(f'Module {self.module_name} initialized') + + def load_pasties_domains(self): + self.pasties = {} + self.urls_blocklist = set() + + domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties') + if os.path.exists(domains_pasties): + with open(domains_pasties) as f: + for line in f: + url = line.strip() + if url: # TODO validate line + self.faup.decode(url) + url_decoded = self.faup.get() + host = url_decoded['host'] + # if url_decoded.get('port', ''): + # host = f'{host}:{url_decoded["port"]}' + path = url_decoded.get('resource_path', '') + # print(url_decoded) + if path and path != '/': + if path[-1] != '/': + path = f'{path}/' + else: + path = None + + if host in self.pasties: + if path: + self.pasties[host].add(path) + else: + if path: + self.pasties[host] = {path} + else: + self.pasties[host] = set() + + url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist') + if os.path.exists(url_blocklist): + with open(url_blocklist) as f: + for line in f: + url = line.strip() + self.faup.decode(url) + url_decoded = self.faup.get() + host = url_decoded['host'] + # if url_decoded.get('port', ''): + # host = f'{host}:{url_decoded["port"]}' + path = url_decoded.get('resource_path', '') + url = f'{host}{path}' + if url_decoded['query_string']: + url = url + url_decoded['query_string'] + self.urls_blocklist.add(url) + + def send_to_crawler(self, url, obj_id): + if not self.r_cache.exists(f'{self.module_name}:url:{url}'): + self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time())) + self.r_cache.expire(f'{self.module_name}:url:{url}', 86400) + crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id) + + def compute(self, message): + url, item_id = message.split() + + self.faup.decode(url) + url_decoded = self.faup.get() + # print(url_decoded) + url_host = url_decoded['host'] + # if url_decoded.get('port', ''): + # url_host = f'{url_host}:{url_decoded["port"]}' + path = url_decoded.get('resource_path', '') + if url_host in self.pasties: + if url.startswith('http://'): + if url[7:] in self.urls_blocklist: + return None + elif url.startswith('https://'): + if url[8:] in self.urls_blocklist: + return None + else: + if url in self.urls_blocklist: + return None + + if not self.pasties[url_host]: + if path and path != '/': + print('send to crawler', url_host, url) + self.send_to_crawler(url, item_id) + else: + if path.endswith('/'): + path_end = path[:-1] + else: + path_end = f'{path}/' + for url_path in self.pasties[url_host]: + if path.startswith(url_path): + if url_path != path and url_path != path_end: + print('send to crawler', url_path, url) + self.send_to_crawler(url, item_id) + break + + +if __name__ == '__main__': + module = Pasties() + module.run() diff --git a/bin/modules/Zerobins.py b/bin/modules/Zerobins.py deleted file mode 100755 index f3fcea5a..00000000 --- a/bin/modules/Zerobins.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* -""" -The Zerobins Module -====================== -This module spots zerobins-like services for further processing -""" - -################################## -# Import External packages -################################## -import os -import re -import sys - -sys.path.append(os.environ['AIL_BIN']) -################################## -# Import Project packages -################################## -from modules.abstract_module import AbstractModule -from lib import crawlers - - -class Zerobins(AbstractModule): - """ - Zerobins module for AIL framework - """ - - def __init__(self): - super(Zerobins, self).__init__() - - binz = [ - r'^https:\/\/(zerobin||privatebin)\..*$', # historical ones - ] - - self.regex = re.compile('|'.join(binz)) - - # Pending time between two computation (computeNone) in seconds - self.pending_seconds = 10 - - # Send module state to logs - self.logger.info(f'Module {self.module_name} initialized') - - def computeNone(self): - """ - Compute when no message in queue - """ - self.logger.debug("No message in queue") - - def compute(self, message): - """ - Compute a message in queue - """ - url, item_id = message.split() - - # Extract zerobins addresses - matching_binz = self.regex_findall(self.regex, item_id, url) - - if len(matching_binz) > 0: - for bin_url in matching_binz: - print(f'send {bin_url} to crawler') - # TODO Change priority ??? - crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor', - parent='manual', priority=60) - - self.logger.debug("Compute message in queue") - - -if __name__ == '__main__': - module = Zerobins() - module.run() diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py index 0a1a12cd..164e77b3 100644 --- a/bin/modules/abstract_module.py +++ b/bin/modules/abstract_module.py @@ -92,6 +92,9 @@ class AbstractModule(ABC): def get_available_queues(self): return self.queue.get_out_queues() + def regex_match(self, regex, obj_id, content): + return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time) + def regex_search(self, regex, obj_id, content): return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time) diff --git a/configs/modules.cfg b/configs/modules.cfg index b0b1f6df..3ce4f0ae 100644 --- a/configs/modules.cfg +++ b/configs/modules.cfg @@ -162,7 +162,7 @@ publish = Importers,Tags subscribe = Item publish = Tags -[Zerobins] +[Pasties] subscribe = Url # [My_Module_Name] From 045aab6f3425ef9f3b2ca20cf69acbde6e0ae52e Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 21 Aug 2023 15:52:33 +0200 Subject: [PATCH 02/15] fix: [module pasties] fix module name --- bin/LAUNCH.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 00c224e4..39640a71 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -267,7 +267,7 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "LibInjection" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./LibInjection.py; read x" sleep 0.1 - screen -S "Script_AIL" -X screen -t "Zerobins" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Zerobins.py; read x" + screen -S "Script_AIL" -X screen -t "Pasties" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Pasties.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x" From f44c5509da842be5ec0756d042fad0d5d7d0a005 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 23 Aug 2023 11:16:22 +0200 Subject: [PATCH 03/15] chg: [titles] add yara tracker on title + tags domains if unsafe title tags --- bin/crawlers/Crawler.py | 9 +++++++++ bin/lib/Tracker.py | 4 ++-- bin/lib/ail_core.py | 2 +- bin/lib/objects/Titles.py | 3 ++- var/www/templates/hunter/tracker_add.html | 4 ++++ 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py index 7f2c3df9..c22f6ccf 100755 --- a/bin/crawlers/Crawler.py +++ b/bin/crawlers/Crawler.py @@ -22,6 +22,7 @@ from lib.objects.Domains import Domain from lib.objects.Items import Item from lib.objects import Screenshots from lib.objects import Titles +from trackers.Tracker_Yara import Tracker_Yara logging.config.dictConfig(ail_logger.get_config(name='crawlers')) @@ -35,6 +36,8 @@ class Crawler(AbstractModule): # Waiting time in seconds between to message processed self.pending_seconds = 1 + self.tracker_yara = Tracker_Yara(queue=False) + config_loader = ConfigLoader() self.default_har = config_loader.get_config_boolean('Crawler', 'default_har') @@ -283,6 +286,12 @@ class Crawler(AbstractModule): if title_content: title = Titles.create_title(title_content) title.add(item.get_date(), item_id) + # Tracker + self.tracker_yara.compute(title.get_id(), obj_type=title.get_type()) + if not title.is_tags_safe(): + unsafe_tag = 'dark-web:topic="pornography-child-exploitation"' + self.domain.add_tag(unsafe_tag) + item.add_tag(unsafe_tag) # SCREENSHOT if self.screenshot: diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py index f1ea8905..c06e303d 100755 --- a/bin/lib/Tracker.py +++ b/bin/lib/Tracker.py @@ -923,7 +923,7 @@ def api_add_tracker(dict_input, user_id): # Filters # TODO MOVE ME filters = dict_input.get('filters', {}) if filters: - if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}: + if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}: filters = {} for obj_type in filters: if obj_type not in get_objects_tracked(): @@ -998,7 +998,7 @@ def api_edit_tracker(dict_input, user_id): # Filters # TODO MOVE ME filters = dict_input.get('filters', {}) if filters: - if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}: + if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}: if not filters['decoded'] and not filters['item']: filters = {} for obj_type in filters: diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 75520a2b..9a7d9557 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -50,7 +50,7 @@ def get_object_all_subtypes(obj_type): return [] def get_objects_tracked(): - return ['decoded', 'item', 'pgp'] + return ['decoded', 'item', 'pgp', 'title'] def get_objects_retro_hunted(): return ['decoded', 'item'] diff --git a/bin/lib/objects/Titles.py b/bin/lib/objects/Titles.py index 9f88426c..1a29d58e 100755 --- a/bin/lib/objects/Titles.py +++ b/bin/lib/objects/Titles.py @@ -45,6 +45,8 @@ class Title(AbstractDaterangeObject): def get_content(self, r_type='str'): if r_type == 'str': return self._get_field('content') + elif r_type == 'bytes': + return self._get_field('content').encode() def get_link(self, flask_context=False): if flask_context: @@ -122,4 +124,3 @@ class Titles(AbstractDaterangeObjects): # # print(r) # r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False) # print(r) - diff --git a/var/www/templates/hunter/tracker_add.html b/var/www/templates/hunter/tracker_add.html index 7cc690ba..05266fa4 100644 --- a/var/www/templates/hunter/tracker_add.html +++ b/var/www/templates/hunter/tracker_add.html @@ -132,6 +132,10 @@ +
+ + +
{#
#} {# #} From 46c721590d83301b46999fed645ce16c1cfaff40 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 23 Aug 2023 11:21:22 +0200 Subject: [PATCH 04/15] fix: [tracker objs filter] fix title icon --- var/www/templates/hunter/tracker_add.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/var/www/templates/hunter/tracker_add.html b/var/www/templates/hunter/tracker_add.html index 05266fa4..4f8c6f3e 100644 --- a/var/www/templates/hunter/tracker_add.html +++ b/var/www/templates/hunter/tracker_add.html @@ -134,7 +134,7 @@
- +
{#
#} From 2145eb7b8a89fafd4c7631a23f3de01bd1a87570 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 23 Aug 2023 11:46:37 +0200 Subject: [PATCH 05/15] fix: [title] fix None title --- bin/lib/crawlers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 3e61ed88..6e9132d2 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -234,7 +234,9 @@ def extract_title_from_html(html): soup = BeautifulSoup(html, 'html.parser') title = soup.title if title: - return str(title.string) + title = title.string + if title: + return str(title) return '' def extract_description_from_html(html): @@ -2022,4 +2024,4 @@ if __name__ == '__main__': # _reprocess_all_hars_cookie_name() # _reprocess_all_hars_etag() # _gzip_all_hars() - _reprocess_all_hars_hhhashs() + # _reprocess_all_hars_hhhashs() From 4e3784922c3dc420828f95cfe6afa63e772194c0 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 23 Aug 2023 11:47:39 +0200 Subject: [PATCH 06/15] fix: typo --- bin/lib/crawlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 6e9132d2..18b1eeac 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -2012,7 +2012,7 @@ def test_ail_crawlers(): # TODO MOVE ME IN CRAWLER OR FLASK load_blacklist() -if __name__ == '__main__': +# if __name__ == '__main__': # delete_captures() # item_id = 'crawled/2023/02/20/data.gz' From c01b806ae30c6304bee5203b6fd46b389cbf1c2b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 24 Aug 2023 11:11:57 +0200 Subject: [PATCH 07/15] chg: [mail exporter] add obj content extract for each yara rule match --- bin/exporter/MailExporter.py | 20 ++++++++++---- bin/lib/Tracker.py | 3 +- bin/trackers/Tracker_Yara.py | 53 ++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/bin/exporter/MailExporter.py b/bin/exporter/MailExporter.py index c4d3f5b5..40ee1708 100755 --- a/bin/exporter/MailExporter.py +++ b/bin/exporter/MailExporter.py @@ -124,16 +124,26 @@ class MailExporterTracker(MailExporter): def __init__(self, host=None, port=None, password=None, user='', sender=''): super().__init__(host=host, port=port, password=password, user=user, sender=sender) - def export(self, tracker, obj): # TODO match + def export(self, tracker, obj, matches=[]): tracker_type = tracker.get_type() tracker_name = tracker.get_tracked() - subject = f'AIL Framework Tracker: {tracker_name}' # TODO custom subject + description = tracker.get_description() + if not description: + description = tracker_name + + subject = f'AIL Framework Tracker: {description}' body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n" body += f'Item: {obj.id}\nurl:{obj.get_link()}' - # TODO match option - # if match: - # body += f'Tracker Match:\n\n{escape(match)}' + if matches: + body += '\n' + nb = 1 + for match in matches: + body += f'\nMatch {nb}: {match[0]}\nExtract:\n{match[1]}\n\n' + nb += 1 + else: + body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n" + body += f'Item: {obj.id}\nurl:{obj.get_link()}' for mail in tracker.get_mails(): self._export(mail, subject, body) diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py index c06e303d..4baa3e5f 100755 --- a/bin/lib/Tracker.py +++ b/bin/lib/Tracker.py @@ -248,7 +248,8 @@ class Tracker: return self._get_field('user_id') def webhook_export(self): - return r_tracker.hexists(f'tracker:{self.uuid}', 'webhook') + webhook = self.get_webhook() + return webhook is not None and webhook def get_webhook(self): return r_tracker.hget(f'tracker:{self.uuid}', 'webhook') diff --git a/bin/trackers/Tracker_Yara.py b/bin/trackers/Tracker_Yara.py index fab397d1..1cebeaa6 100755 --- a/bin/trackers/Tracker_Yara.py +++ b/bin/trackers/Tracker_Yara.py @@ -73,8 +73,56 @@ class Tracker_Yara(AbstractModule): print(f'{self.obj.get_id()}: yara scanning timed out') self.redis_logger.info(f'{self.obj.get_id()}: yara scanning timed out') + def convert_byte_offset_to_string(self, b_content, offset): + byte_chunk = b_content[:offset + 1] + try: + string_chunk = byte_chunk.decode() + offset = len(string_chunk) - 1 + return offset + except UnicodeDecodeError: + return self.convert_byte_offset_to_string(b_content, offset - 1) + + def extract_matches(self, data, limit=500, lines=5): + matches = [] + content = self.obj.get_content() + l_content = len(content) + b_content = content.encode() + for string_match in data.get('strings'): + for string_match_instance in string_match.instances: + start = string_match_instance.offset + value = string_match_instance.matched_data.decode() + end = start + string_match_instance.matched_length + # str + start = self.convert_byte_offset_to_string(b_content, start) + end = self.convert_byte_offset_to_string(b_content, end) + + # Start + if start > limit: + i_start = start - limit + else: + i_start = 0 + str_start = content[i_start:start].splitlines() + if len(str_start) > lines: + str_start = '\n'.join(str_start[-lines + 1:]) + else: + str_start = content[i_start:start] + + # End + if end + limit > l_content: + i_end = l_content + else: + i_end = end + limit + str_end = content[end:i_end].splitlines() + if len(str_end) > lines: + str_end = '\n'.join(str_end[:lines + 1]) + else: + str_end = content[end:i_end] + matches.append((value, f'{str_start}{value}{str_end}')) + return matches + def yara_rules_match(self, data): tracker_name = data['namespace'] + matches = None obj_id = self.obj.get_id() for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type('yara', self.obj.get_type(), tracker_name): tracker = Tracker.Tracker(tracker_uuid) @@ -96,8 +144,9 @@ class Tracker_Yara(AbstractModule): # Mails if tracker.mail_export(): - # TODO add matches + custom subjects - self.exporters['mail'].export(tracker, self.obj) + if not matches: + matches = self.extract_matches(data) + self.exporters['mail'].export(tracker, self.obj, matches) # Webhook if tracker.webhook_export(): From 546d6538fd25cbf701b220b4440699f776367cb7 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 24 Aug 2023 14:37:50 +0200 Subject: [PATCH 08/15] chg: [mail exporter] add obj content extract for each regex match --- bin/exporter/MailExporter.py | 1 + bin/trackers/Tracker_Regex.py | 51 ++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/bin/exporter/MailExporter.py b/bin/exporter/MailExporter.py index 40ee1708..41074d7b 100755 --- a/bin/exporter/MailExporter.py +++ b/bin/exporter/MailExporter.py @@ -145,5 +145,6 @@ class MailExporterTracker(MailExporter): body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n" body += f'Item: {obj.id}\nurl:{obj.get_link()}' + # print(body) for mail in tracker.get_mails(): self._export(mail, subject, body) diff --git a/bin/trackers/Tracker_Regex.py b/bin/trackers/Tracker_Regex.py index 5cc06410..db35f239 100755 --- a/bin/trackers/Tracker_Regex.py +++ b/bin/trackers/Tracker_Regex.py @@ -41,6 +41,8 @@ class Tracker_Regex(AbstractModule): self.tracked_regexs = Tracker.get_tracked_regexs() self.last_refresh = time.time() + self.obj = None + # Exporter self.exporters = {'mail': MailExporterTracker(), 'webhook': WebHookExporterTracker()} @@ -56,6 +58,7 @@ class Tracker_Regex(AbstractModule): print('Tracked regex refreshed') obj = ail_objects.get_object(obj_type, subtype, obj_id) + self.obj = obj obj_id = obj.get_id() obj_type = obj.get_type() @@ -66,12 +69,46 @@ class Tracker_Regex(AbstractModule): content = obj.get_content() for dict_regex in self.tracked_regexs[obj_type]: - matched = self.regex_findall(dict_regex['regex'], obj_id, content) - if matched: - self.new_tracker_found(dict_regex['tracked'], 'regex', obj) + matches = self.regex_finditer(dict_regex['regex'], obj_id, content) + if matches: + self.new_tracker_found(dict_regex['tracked'], 'regex', obj, matches) - def new_tracker_found(self, tracker_name, tracker_type, obj): + def extract_matches(self, re_matches, limit=500, lines=5): + matches = [] + content = self.obj.get_content() + l_content = len(content) + for match in re_matches: + start = match[0] + value = match[2] + end = match[1] + + # Start + if start > limit: + i_start = start - limit + else: + i_start = 0 + str_start = content[i_start:start].splitlines() + if len(str_start) > lines: + str_start = '\n'.join(str_start[-lines + 1:]) + else: + str_start = content[i_start:start] + + # End + if end + limit > l_content: + i_end = l_content + else: + i_end = end + limit + str_end = content[end:i_end].splitlines() + if len(str_end) > lines: + str_end = '\n'.join(str_end[:lines + 1]) + else: + str_end = content[end:i_end] + matches.append((value, f'{str_start}{value}{str_end}')) + return matches + + def new_tracker_found(self, tracker_name, tracker_type, obj, re_matches): obj_id = obj.get_id() + matches = None for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type(tracker_type, obj.get_type(), tracker_name): tracker = Tracker.Tracker(tracker_uuid) @@ -93,8 +130,9 @@ class Tracker_Regex(AbstractModule): obj.add_tag(tag) if tracker.mail_export(): - # TODO add matches + custom subjects - self.exporters['mail'].export(tracker, obj) + if not matches: + matches = self.extract_matches(re_matches) + self.exporters['mail'].export(tracker, obj, matches) if tracker.webhook_export(): self.exporters['webhook'].export(tracker, obj) @@ -103,4 +141,3 @@ class Tracker_Regex(AbstractModule): if __name__ == "__main__": module = Tracker_Regex() module.run() - # module.compute('submitted/2023/05/02/submitted_b1e518f1-703b-40f6-8238-d1c22888197e.gz') From 24969610cc4d5c04845e65dfaf9a5592487a0954 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 29 Aug 2023 11:59:39 +0200 Subject: [PATCH 09/15] fix: [items source] fix empty sources list --- bin/lib/item_basic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py index 71fa5378..b35d126e 100755 --- a/bin/lib/item_basic.py +++ b/bin/lib/item_basic.py @@ -209,7 +209,10 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt l_dir = os.listdir(directory) # empty directory if not l_dir: - return l_sources_name.add(source_name) + if source_name: + return l_sources_name.add(source_name) + else: + return l_sources_name else: for src_name in l_dir: if len(src_name) == 4: From 099253f8546237b6164f90e78b16d5444fbf3fbb Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 29 Aug 2023 13:50:16 +0200 Subject: [PATCH 10/15] fix: [json importer] fix empty source name --- bin/importer/feeders/Default.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/importer/feeders/Default.py b/bin/importer/feeders/Default.py index 482d06b4..100ed1e6 100755 --- a/bin/importer/feeders/Default.py +++ b/bin/importer/feeders/Default.py @@ -24,8 +24,12 @@ class DefaultFeeder: Return feeder name. first part of the item_id and display in the UI """ if not self.name: - return self.get_source() - return self.name + name = self.get_source() + else: + name = self.name + if not name: + name = 'default' + return name def get_source(self): return self.json_data.get('source') From 7c73f0944a1a4b8ba052563f6bc0b03374c6ffdf Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 29 Aug 2023 14:03:26 +0200 Subject: [PATCH 11/15] fix: [items source] filter invalid item sources --- bin/lib/item_basic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py index b35d126e..25420106 100755 --- a/bin/lib/item_basic.py +++ b/bin/lib/item_basic.py @@ -204,7 +204,11 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt if not l_sources_name: l_sources_name = set() if source_name: - l_dir = os.listdir(os.path.join(directory, source_name)) + path = os.path.join(directory, source_name) + if os.path.isdir(path): + l_dir = os.listdir(os.path.join(directory, source_name)) + else: + l_dir = [] else: l_dir = os.listdir(directory) # empty directory @@ -215,7 +219,7 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt return l_sources_name else: for src_name in l_dir: - if len(src_name) == 4: + if len(src_name) == 4 and source_name: # try: int(src_name) to_add = os.path.join(source_name) From ed0423118e9facb55fff0d3ef381e688aeb0ade0 Mon Sep 17 00:00:00 2001 From: Jean-Louis Huynen Date: Thu, 31 Aug 2023 15:42:44 +0200 Subject: [PATCH 12/15] chg: [crawlers] submit a single cookie to the crawler task API --- bin/lib/crawlers.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 18b1eeac..3a0a9f19 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -1692,6 +1692,18 @@ def api_add_crawler_task(data, user_id=None): return {'error': 'The access to this cookiejar is restricted'}, 403 cookiejar_uuid = cookiejar.uuid + cookie = data.get('cookie', None) + if not cookiejar_uuid and cookie: + # Create new cookiejar + cookiejar_uuid = create_cookiejar(user_id, "single-shot cookiejar", 1, None) + cookiejar = Cookiejar(cookiejar_uuid) + try: + name = cookie.get('name') + value = cookie.get('value') + cookiejar.add_cookie(name, value, None, None, None, None, None) + except KeyError: + return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400 + frequency = data.get('frequency', None) if frequency: if frequency not in ['monthly', 'weekly', 'daily', 'hourly']: From 68c17c3fbcc20b9e63a3b97d0faac092a970dd10 Mon Sep 17 00:00:00 2001 From: Jean-Louis Huynen Date: Thu, 31 Aug 2023 16:13:20 +0200 Subject: [PATCH 13/15] chg: [crawlers] submit cookies to the crawler task API --- bin/lib/crawlers.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 3a0a9f19..67f868f0 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -1692,17 +1692,18 @@ def api_add_crawler_task(data, user_id=None): return {'error': 'The access to this cookiejar is restricted'}, 403 cookiejar_uuid = cookiejar.uuid - cookie = data.get('cookie', None) - if not cookiejar_uuid and cookie: + cookies = data.get('cookies', None) + if not cookiejar_uuid and cookies: # Create new cookiejar cookiejar_uuid = create_cookiejar(user_id, "single-shot cookiejar", 1, None) cookiejar = Cookiejar(cookiejar_uuid) - try: - name = cookie.get('name') - value = cookie.get('value') - cookiejar.add_cookie(name, value, None, None, None, None, None) - except KeyError: - return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400 + for cookie in cookies: + try: + name = cookie.get('name') + value = cookie.get('value') + cookiejar.add_cookie(name, value, None, None, None, None, None) + except KeyError: + return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400 frequency = data.get('frequency', None) if frequency: From fee3332edbe223106eb5a233746198fe7f174679 Mon Sep 17 00:00:00 2001 From: terrtia Date: Fri, 29 Sep 2023 15:43:37 +0200 Subject: [PATCH 14/15] fix: [tracker] delete yara rule, fix filter by object type --- bin/lib/Tracker.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py index 4baa3e5f..9c4702ae 100755 --- a/bin/lib/Tracker.py +++ b/bin/lib/Tracker.py @@ -2,6 +2,8 @@ # -*-coding:UTF-8 -* import json import os +import logging +import logging.config import re import sys import time @@ -24,11 +26,16 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from packages import Date from lib.ail_core import get_objects_tracked, get_object_all_subtypes, get_objects_retro_hunted +from lib import ail_logger from lib import ConfigLoader from lib import item_basic from lib import Tag from lib.Users import User +# LOGS +logging.config.dictConfig(ail_logger.get_config(name='modules')) +logger = logging.getLogger() + config_loader = ConfigLoader.ConfigLoader() r_cache = config_loader.get_redis_conn("Redis_Cache") @@ -561,9 +568,7 @@ class Tracker: os.remove(filepath) # Filters - filters = self.get_filters() - if not filters: - filters = get_objects_tracked() + filters = get_objects_tracked() for obj_type in filters: r_tracker.srem(f'trackers:objs:{tracker_type}:{obj_type}', tracked) r_tracker.srem(f'trackers:uuid:{tracker_type}:{tracked}', f'{self.uuid}:{obj_type}') @@ -1152,7 +1157,11 @@ def get_tracked_yara_rules(): for obj_type in get_objects_tracked(): rules = {} for tracked in _get_tracked_by_obj_type('yara', obj_type): - rules[tracked] = os.path.join(get_yara_rules_dir(), tracked) + rule = os.path.join(get_yara_rules_dir(), tracked) + if not os.path.exists(rule): + logger.critical(f"Yara rule don't exists {tracked} : {obj_type}") + else: + rules[tracked] = rule to_track[obj_type] = yara.compile(filepaths=rules) print(to_track) return to_track From fb4a74b45a49dc968bf823866e06a75fdc8f92d5 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Tue, 3 Oct 2023 11:56:01 +0200 Subject: [PATCH 15/15] fix: [dep] Pinning flask to < 3.0 due to Werkzeug 3.0 issues: https://stackoverflow.com/a/77215455 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8bb16553..8e9bb803 100644 --- a/requirements.txt +++ b/requirements.txt @@ -67,7 +67,7 @@ pylibinjection>=0.2.4 phonenumbers>8.12.1 # Web -flask>=1.1.4 +flask==2.3.3 flask-login bcrypt>3.1.6