diff --git a/bin/lib/module_extractor.py b/bin/lib/module_extractor.py index 887bf09a..c801168c 100755 --- a/bin/lib/module_extractor.py +++ b/bin/lib/module_extractor.py @@ -40,6 +40,11 @@ r_key = regex_helper.generate_redis_cache_key('extractor') # TODO UI Link +CORRELATION_TO_EXTRACT = { + 'item': ['cve', 'cryptocurrency', 'title', 'username'], + 'message': ['cve', 'cryptocurrency', 'username'] +} + MODULES = { 'infoleak:automatic-detection="credit-card"': CreditCards(queue=False), 'infoleak:automatic-detection="iban"': Iban(queue=False), @@ -57,9 +62,9 @@ tools = Tools(queue=False) for tool_name in tools.get_tools(): MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools -def get_correl_match(extract_type, obj_id, content): +def get_correl_match(extract_type, obj, content): extracted = [] - correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type) + correl = correlations_engine.get_correlation_by_correl_type(obj.type, obj.get_subtype(r_str=True), obj.id, extract_type) to_extract = [] map_subtype = {} map_value_id = {} @@ -75,18 +80,18 @@ def get_correl_match(extract_type, obj_id, content): sha256_val = sha256(value.encode()).hexdigest() map_value_id[sha256_val] = value if to_extract: - objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content) - for obj in objs: - if map_subtype.get(obj[2]): - subtype = map_subtype[obj[2]] + objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj.get_global_id(), content) + for ob in objs: + if map_subtype.get(ob[2]): + subtype = map_subtype[ob[2]] else: subtype = '' - sha256_val = sha256(obj[2].encode()).hexdigest() + sha256_val = sha256(ob[2].encode()).hexdigest() value_id = map_value_id.get(sha256_val) if not value_id: logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}') value_id = 'ERROR' - extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{value_id}']) + extracted.append([ob[0], ob[1], ob[2], f'{extract_type}:{subtype}:{value_id}']) return extracted def _get_yara_match(data): @@ -100,7 +105,7 @@ def _get_yara_match(data): return yara.CALLBACK_CONTINUE def _get_word_regex(word): - return '(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + return '(?i)(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' def convert_byte_offset_to_string(b_content, offset): byte_chunk = b_content[:offset + 1] @@ -115,17 +120,18 @@ def convert_byte_offset_to_string(b_content, offset): # TODO RETRO HUNTS # TODO TRACKER TYPE IN UI -def get_tracker_match(obj_id, content): +def get_tracker_match(obj, content): extracted = [] extracted_yara = [] - trackers = Tracker.get_obj_trackers('item', '', obj_id) + obj_gid = obj.get_global_id() + trackers = Tracker.get_obj_trackers(obj.type, obj.get_subtype(r_str=True), obj.id) for tracker_uuid in trackers: tracker = Tracker.Tracker(tracker_uuid) tracker_type = tracker.get_type() # print(tracker_type) tracked = tracker.get_tracked() if tracker_type == 'regex': # TODO Improve word detection -> word delimiter - regex_match = regex_helper.regex_finditer(r_key, tracked, obj_id, content) + regex_match = regex_helper.regex_finditer(r_key, tracked, obj_gid, content) for match in regex_match: extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}']) elif tracker_type == 'yara': @@ -147,13 +153,13 @@ def get_tracker_match(obj_id, content): words = [tracked] for word in words: regex = _get_word_regex(word) - regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content) + regex_match = regex_helper.regex_finditer(r_key, regex, obj_gid, content) # print(regex_match) for match in regex_match: extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}']) # Retro Hunt - retro_hunts = Tracker.get_obj_retro_hunts('item', '', obj_id) + retro_hunts = Tracker.get_obj_retro_hunts(obj.type, obj.get_subtype(r_str=True), obj.id) for retro_uuid in retro_hunts: retro_hunt = Tracker.RetroHunt(retro_uuid) rule = retro_hunt.get_rule(r_compile=True) @@ -182,35 +188,36 @@ def get_tracker_match(obj_id, content): # Type:subtype:id # tag:iban # tracker:uuid - -def extract(obj_id, content=None): - item = Item(obj_id) - if not item.exists(): +# def extract(obj_id, content=None): +def extract(obj_type, subtype, obj_id, content=None): + obj = ail_objects.get_object(obj_type, subtype, obj_id) + if not obj.exists(): return [] + obj_gid = obj.get_global_id() # CHECK CACHE - cached = r_cache.get(f'extractor:cache:{obj_id}') + cached = r_cache.get(f'extractor:cache:{obj_gid}') # cached = None if cached: - r_cache.expire(f'extractor:cache:{obj_id}', 300) + r_cache.expire(f'extractor:cache:{obj_gid}', 300) return json.loads(cached) if not content: - content = item.get_content() + content = obj.get_content() - extracted = get_tracker_match(obj_id, content) + extracted = get_tracker_match(obj, content) # print(item.get_tags()) - for tag in item.get_tags(): + for tag in obj.get_tags(): if MODULES.get(tag): # print(tag) module = MODULES.get(tag) - matches = module.extract(obj_id, content, tag) + matches = module.extract(obj, content, tag) if matches: extracted = extracted + matches - for obj_t in ['cve', 'cryptocurrency', 'title', 'username']: # Decoded, PGP->extract bloc - matches = get_correl_match(obj_t, obj_id, content) + for obj_t in CORRELATION_TO_EXTRACT[obj.type]: + matches = get_correl_match(obj_t, obj, content) if matches: extracted = extracted + matches @@ -221,8 +228,8 @@ def extract(obj_id, content=None): # Save In Cache if extracted: extracted_dump = json.dumps(extracted) - r_cache.set(f'extractor:cache:{obj_id}', extracted_dump) - r_cache.expire(f'extractor:cache:{obj_id}', 300) # TODO Reduce CACHE ??????????????? + r_cache.set(f'extractor:cache:{obj_gid}', extracted_dump) + r_cache.expire(f'extractor:cache:{obj_gid}', 300) # TODO Reduce CACHE ??????????????? return extracted @@ -271,15 +278,7 @@ def get_extracted_by_match(extracted): # if __name__ == '__main__': # t0 = time.time() -# obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b' -# obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd' -# obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz' -# # obj_id = 'tests/2021/01/01/credit_cards.gz' -# # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5' -# obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz' -# obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz' # obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8' -# # extract(obj_id) # # # get_obj_correl('cve', obj_id, content) diff --git a/bin/modules/CreditCards.py b/bin/modules/CreditCards.py index c237bf05..bd0efe3b 100755 --- a/bin/modules/CreditCards.py +++ b/bin/modules/CreditCards.py @@ -58,9 +58,9 @@ class CreditCards(AbstractModule): if lib_refine.is_luhn_valid(clean_card): return clean_card - def extract(self, obj_id, content, tag): + def extract(self, obj, content, tag): extracted = [] - cards = self.regex_finditer(self.regex, obj_id, content) + cards = self.regex_finditer(self.regex, obj.get_global_id(), content) for card in cards: start, end, value = card if self.get_valid_card(value): diff --git a/bin/modules/Iban.py b/bin/modules/Iban.py index 7b0c66d0..8c0fafb0 100755 --- a/bin/modules/Iban.py +++ b/bin/modules/Iban.py @@ -62,9 +62,9 @@ class Iban(AbstractModule): return True return False - def extract(self, obj_id, content, tag): + def extract(self, obj, content, tag): extracted = [] - ibans = self.regex_finditer(self.iban_regex, obj_id, content) + ibans = self.regex_finditer(self.iban_regex, obj.get_global_id(), content) for iban in ibans: start, end, value = iban value = ''.join(e for e in value if e.isalnum()) diff --git a/bin/modules/Mail.py b/bin/modules/Mail.py index 8a3a66a4..a87aec46 100755 --- a/bin/modules/Mail.py +++ b/bin/modules/Mail.py @@ -118,10 +118,10 @@ class Mail(AbstractModule): print(e) return valid_mxdomain - def extract(self, obj_id, content, tag): + def extract(self, obj, content, tag): extracted = [] mxdomains = {} - mails = self.regex_finditer(self.email_regex, obj_id, content) + mails = self.regex_finditer(self.email_regex, obj.get_global_id(), content) for mail in mails: start, end, value = mail mxdomain = value.rsplit('@', 1)[1].lower() diff --git a/bin/modules/Onion.py b/bin/modules/Onion.py index 34e50247..2e11431e 100755 --- a/bin/modules/Onion.py +++ b/bin/modules/Onion.py @@ -55,9 +55,9 @@ class Onion(AbstractModule): # TEMP var: SAVE I2P Domain (future I2P crawler) # self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p") - def extract(self, obj_id, content, tag): + def extract(self, obj, content, tag): extracted = [] - onions = self.regex_finditer(self.onion_regex, obj_id, content) + onions = self.regex_finditer(self.onion_regex, obj.get_global_id(), content) for onion in onions: start, end, value = onion url_unpack = crawlers.unpack_url(value) diff --git a/bin/modules/Phone.py b/bin/modules/Phone.py index 8eae16cc..22686649 100755 --- a/bin/modules/Phone.py +++ b/bin/modules/Phone.py @@ -41,9 +41,9 @@ class Phone(AbstractModule): # Waiting time in seconds between to message processed self.pending_seconds = 1 - def extract(self, obj_id, content, tag): + def extract(self, obj, content, tag): extracted = [] - phones = self.regex_phone_iter('ZZ', obj_id, content) + phones = self.regex_phone_iter('ZZ', obj.get_global_id(), content) for phone in phones: extracted.append([phone[0], phone[1], phone[2], f'tag:{tag}']) return extracted diff --git a/var/www/blueprints/chats_explorer.py b/var/www/blueprints/chats_explorer.py index ad3e3d4c..1a53ef8f 100644 --- a/var/www/blueprints/chats_explorer.py +++ b/var/www/blueprints/chats_explorer.py @@ -23,6 +23,7 @@ from lib import ail_core from lib import chats_viewer from lib import Language from lib import Tag +from lib import module_extractor # ============ BLUEPRINT ============ chats_explorer = Blueprint('chats_explorer', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/chats_explorer')) @@ -235,6 +236,10 @@ def objects_message(): else: message = message[0] languages = Language.get_translation_languages() + extracted = module_extractor.extract('message', '', message['id'], content=message['content']) + extracted_matches = module_extractor.get_extracted_by_match(extracted) + message['extracted'] = extracted + message['extracted_matches'] = extracted_matches return render_template('ChatMessage.html', meta=message, bootstrap_label=bootstrap_label, translation_languages=languages, translation_target=target, modal_add_tags=Tag.get_modal_add_tags(message['id'], object_type='message')) diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py index 29e6a2de..186bd969 100644 --- a/var/www/blueprints/objects_item.py +++ b/var/www/blueprints/objects_item.py @@ -85,7 +85,7 @@ def showItem(): # # TODO: support post else: meta['investigations'] = [] - extracted = module_extractor.extract(item.id, content=meta['content']) + extracted = module_extractor.extract('item', '', item.id, content=meta['content']) extracted_matches = module_extractor.get_extracted_by_match(extracted) return render_template("show_item.html", bootstrap_label=bootstrap_label, diff --git a/var/www/templates/chats_explorer/ChatMessage.html b/var/www/templates/chats_explorer/ChatMessage.html index 1e314541..c853698f 100644 --- a/var/www/templates/chats_explorer/ChatMessage.html +++ b/var/www/templates/chats_explorer/ChatMessage.html @@ -14,8 +14,8 @@ - - + + @@ -134,6 +134,65 @@ + {% if meta['extracted_matches'] %} +
Type | +ID | +Extracted | +
---|---|---|
+ + {{ meta['extracted_matches'][match]['subtype'] }} + | +{{ meta['extracted_matches'][match]['id'] }} | +
+ {% for row in meta['extracted_matches'][match]['matches'] %}
+ {{ row[2] }} + {% endfor %} + |
+
{{ message['content'] }}+ {% if not message['extracted'] %} +
{{ message['content'] }}+ {% else %} +
{{ message['content'][:message['extracted'][0][0]] }}{% for row in message['extracted'] %}{{ message['content'][row[0]:row[1]] }}{% if loop.index + 1 > message['extracted']|length %}{{ message['content'][message['extracted'][-1][1]:] }}{% else %}{{ message['content'][row[1]:message['extracted'][loop.index][0]] }}{% endif %}{% endfor %}
+ {% endif %}
{% if message['translation'] %}
{{ message['translation'] }}