From 0fa27c6a518d761380b240ed4d92760c1a549c92 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 23 Feb 2023 16:25:15 +0100
Subject: [PATCH] chg: [extractor] add cache + UI extractor + word/set extractor

---
 bin/lib/module_extractor.py                   | 163 +++++++++++++-----
 bin/modules/CreditCards.py                    |   2 +-
 bin/modules/Iban.py                           |   3 +-
 bin/modules/Mail.py                           |   2 +-
 bin/modules/Onion.py                          |   2 +-
 bin/modules/Tools.py                          |   6 +-
 var/www/blueprints/objects_item.py            |  10 +-
 var/www/templates/objects/item/show_item.html |  89 ++++++++--
 8 files changed, 208 insertions(+), 69 deletions(-)

diff --git a/bin/lib/module_extractor.py b/bin/lib/module_extractor.py
index be596621..fe81f9cb 100755
--- a/bin/lib/module_extractor.py
+++ b/bin/lib/module_extractor.py
@@ -1,17 +1,19 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
-
+import json
 import os
 import sys
 import time
 import yara
 
+from operator import itemgetter
+
 sys.path.append(os.environ['AIL_BIN'])
 ##################################
 # Import Project packages
 ##################################
-import lib.objects.ail_objects
+from lib.objects import ail_objects
 from lib.objects.Items import Item
 from lib import correlations_engine
 from lib import regex_helper
@@ -25,23 +27,19 @@ from modules.Mail import Mail
 from modules.Onion import Onion
 from modules.Tools import Tools
 
-creditCards = CreditCards()
-ibans = Iban()
-mails = Mail()
-onions = Onion()
-tools = Tools()
-
 config_loader = ConfigLoader()
 r_cache = config_loader.get_redis_conn("Redis_Cache")
 config_loader = None
 
 r_key = regex_helper.generate_redis_cache_key('extractor')
 
+# TODO UI Link
+
 MODULES = {
-    'infoleak:automatic-detection="credit-card"': creditCards,
-    'infoleak:automatic-detection="iban"': ibans,
-    'infoleak:automatic-detection="mail"': mails,
-    'infoleak:automatic-detection="onion"': onions,
+    'infoleak:automatic-detection="credit-card"': CreditCards(),
+    'infoleak:automatic-detection="iban"': Iban(),
+    'infoleak:automatic-detection="mail"': Mail(),
+    'infoleak:automatic-detection="onion"': Onion(),
     # APIkey ???
     # Credentials
     # Zerobins
@@ -49,20 +47,28 @@ MODULES = {
     # SQL Injetction / Libinjection ???
 }
+tools = Tools()
 for tool_name in tools.get_tools():
     MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools
 
 
-def get_correl_match(extract_type, obj_id, content, filter_subtypes=['']):
+def get_correl_match(extract_type, obj_id, content):
+    extracted = []
     correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
     to_extract = []
+    map_subtype = {}
     for c in correl:
         subtype, value = c.split(':', 1)
-        # if subtype in filter_subtypes:
+        map_subtype[value] = subtype
         to_extract.append(value)
     if to_extract:
-        return regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
-    else:
-        return []
+        objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
+        for obj in objs:
+            if map_subtype[obj[2]]:
+                subtype = map_subtype[obj[2]]
+            else:
+                subtype = ''
+            extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])
+    return extracted
 
 def _get_yara_match(data):
     for row in data.get('strings'):
@@ -73,14 +79,27 @@ def _get_yara_match(data):
         r_cache.expire(f'extractor:yara:match:{r_key}', 300)
     return yara.CALLBACK_CONTINUE
 
+def _get_word_regex(word):
+    return '(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'
+
 # TODO RETRO HUNTS
+# TODO TRACKER TYPE IN UI
 def get_tracker_match(obj_id, content):
+    cached = r_cache.get(f'extractor:cache:{obj_id}')
+    if cached:
+        r_cache.expire(f'extractor:cache:{obj_id}', 300)
+        return json.loads(cached)
+
+    extracted = []
     trackers = Tracker.get_obj_all_trackers('item', '', obj_id)
     for tracker_uuid in trackers:
         tracker_type = Tracker.get_tracker_type(tracker_uuid)
+        print(tracker_type)
         tracker = Tracker.get_tracker_by_uuid(tracker_uuid)
-        if tracker_type == 'regex':
-            return regex_helper.regex_finditer(r_key, tracker, obj_id, content)
+        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
+            regex_match = regex_helper.regex_finditer(r_key, tracker, obj_id, content)
+            for match in regex_match:
+                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])
         elif tracker_type == 'yara':
             rule = Tracker.get_yara_rule_by_uuid(tracker_uuid)
             rule.match(data=content, callback=_get_yara_match,
@@ -90,22 +109,39 @@
             extracted = []
             for match in yara_match:
                 start, end, value = match.split(':', 2)
-                extracted.append((int(start), int(end), value))
-            return extracted
+                extracted.append([int(start), int(end), value, f'tracker:{tracker_uuid}'])
 
-        # elif tracker_type == 'term': # TODO
-        #
-        # elif tracker_type == '':
-    return []
+        elif tracker_type == 'word' or tracker_type == 'set':
+            if tracker_type == 'set':
+                tracker = tracker.rsplit(';', 1)[0]
+                words = tracker.split(',')
+            else:
+                words = [tracker]
+            for word in words:
+                regex = _get_word_regex(word)
+                regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
+                print(regex_match)
+                for match in regex_match:
+                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])
+    # Save In Cache
+    if extracted:
+        extracted_dump = json.dumps(extracted)
+        r_cache.set(f'extractor:cache:{obj_id}', extracted_dump)
+        r_cache.expire(f'extractor:cache:{obj_id}', 300)   # TODO Reduce CACHE ???????????????
+
+    return extracted
+
+# Type:subtype:id
+# tag:iban
+# tracker:uuid
 def extract(obj_id, content=None):
     item = Item(obj_id)
     if not content:
         content = item.get_content()
 
-    extracted = []
-    extracted = extracted + get_tracker_match(obj_id, content)
+    extracted = get_tracker_match(obj_id, content)
 
     # print(item.get_tags())
     for tag in item.get_tags():
@@ -116,33 +152,70 @@
         if matches:
             extracted = extracted + matches
 
-    for obj_t in ['cve', 'cryptocurrency', 'username']: # Decoded, PGP->extract bloc
+    for obj_t in ['cve', 'cryptocurrency', 'username']:  # Decoded, PGP->extract bloc
         matches = get_correl_match(obj_t, obj_id, content)
         if matches:
             extracted = extracted + matches
 
-    from operator import itemgetter
-
+    # SORT By Start Pos
     extracted = sorted(extracted, key=itemgetter(0))
-    print(extracted)
+    # print(extracted)
 
     return extracted
 
+# TODO ADD LINK UI
+def get_extracted_by_match(extracted):
+    matches = {}
+    for start, end, value, str_obj in extracted:
 
-if __name__ == '__main__':
-    t0 = time.time()
-    obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
-    obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
-    obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
-    # obj_id = 'tests/2021/01/01/credit_cards.gz'
-    # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
-    obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
-    obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
+        if str_obj not in matches:
+            matches[str_obj] = {}
+            ob_type, row_id = str_obj.split(':', 1)
+            if ob_type == 'tag':  # TODO put me in object class
+                matches[str_obj]['subtype'] = 'tag'
+                matches[str_obj]['id'] = row_id
+                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5}
+                matches[str_obj]['link'] = ''
+            elif ob_type == 'tracker':  # TODO put me in object class
+                matches[str_obj]['subtype'] = 'tracker'
+                matches[str_obj]['id'] = row_id
+                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5}
+                matches[str_obj]['link'] = ''
+            else:
+                row_id = row_id.split(':', 1)
+                if len(row_id) == 2:
+                    subtype = row_id[0]
+                    obj_id = row_id[1]
+                else:
+                    subtype = ''
+                    obj_id = row_id[0]
+                matches[str_obj]['subtype'] = subtype
+                matches[str_obj]['id'] = obj_id
+                matches[str_obj]['icon'] = ail_objects.get_object_svg(ob_type, subtype, obj_id)
+                matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id)
 
-    extract(obj_id)
-    # get_obj_correl('cve', obj_id, content)
-    # r = get_tracker_match(obj_id, content)
-    # print(r)
+            matches[str_obj]['matches'] = []
 
-    print(time.time() - t0)
+        match = [start, end, value]
+        matches[str_obj]['matches'].append(match)
+    return matches
+
+# if __name__ == '__main__':
+#     t0 = time.time()
#     obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
+#     obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
+#     obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
+#     # obj_id = 'tests/2021/01/01/credit_cards.gz'
+#     # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
+#     obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
+#     obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
+#     obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
+#
+#     extract(obj_id)
+#
+#     # get_obj_correl('cve', obj_id, content)
+#     # r = get_tracker_match(obj_id, content)
+#     # print(r)
+#
+#     print(time.time() - t0)
diff --git a/bin/modules/CreditCards.py b/bin/modules/CreditCards.py
index 6ea60f1e..33bdebe5 100755
--- a/bin/modules/CreditCards.py
+++ b/bin/modules/CreditCards.py
@@ -64,7 +64,7 @@ class CreditCards(AbstractModule):
         for card in cards:
             start, end, value = card
             if self.get_valid_card(value):
-                extracted.append(card)
+                extracted.append([start, end, value, f'tag:{tag}'])
         return extracted
 
     def compute(self, message, r_result=False):
diff --git a/bin/modules/Iban.py b/bin/modules/Iban.py
index 83ad6b12..0d7597f2 100755
--- a/bin/modules/Iban.py
+++ b/bin/modules/Iban.py
@@ -69,8 +69,7 @@ class Iban(AbstractModule):
             start, end, value = iban
             value = ''.join(e for e in value if e.isalnum())
             if self.is_valid_iban(value):
-                print(value)
-                extracted.append(iban)
+                extracted.append([start, end, value, f'tag:{tag}'])
         return extracted
 
     def compute(self, message):
diff --git a/bin/modules/Mail.py b/bin/modules/Mail.py
index 9982f903..af704599 100755
--- a/bin/modules/Mail.py
+++ b/bin/modules/Mail.py
@@ -130,7 +130,7 @@ class Mail(AbstractModule):
             mxdomains[mxdomain].append(mail)
         for mx in self.check_mx_record(mxdomains.keys()):
             for row in mxdomains[mx]:
-                extracted.append(row)
+                extracted.append([row[0], row[1], row[2], f'tag:{tag}'])
         return extracted
 
     # # TODO: sanitize mails
diff --git a/bin/modules/Onion.py b/bin/modules/Onion.py
index 15120098..e8bb5c7a 100755
--- a/bin/modules/Onion.py
+++ b/bin/modules/Onion.py
@@ -62,7 +62,7 @@ class Onion(AbstractModule):
             url_unpack = crawlers.unpack_url(value)
             domain = url_unpack['domain']
             if crawlers.is_valid_onion_domain(domain):
-                extracted.append(onion)
+                extracted.append([start, end, value, f'tag:{tag}'])
         return extracted
 
     def compute(self, message):
diff --git a/bin/modules/Tools.py b/bin/modules/Tools.py
index 7af97301..0803ec2b 100755
--- a/bin/modules/Tools.py
+++ b/bin/modules/Tools.py
@@ -409,8 +409,12 @@ class Tools(AbstractModule):
         return TOOLS.keys()
 
     def extract(self, obj_id, content, tag):
+        extracted = []
         tool_name = tag.rsplit('"', 2)[1][:-5]
-        return self.regex_finditer(TOOLS[tool_name]['regex'], obj_id, content)
+        tools = self.regex_finditer(TOOLS[tool_name]['regex'], obj_id, content)
+        for tool in tools:
+            extracted.append([tool[0], tool[1], tool[2], f'tag:{tag}'])
+        return extracted
 
     def compute(self, message):
         item = Item(message)
diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py
index 442f8caa..67b05648 100644
--- a/var/www/blueprints/objects_item.py
+++ b/var/www/blueprints/objects_item.py
@@ -67,7 +67,7 @@ def showItem(): # # TODO: support post
     abort(404)
 
     item = Item(item_id)
-    meta = item.get_meta(options=['content', 'crawler', 'duplicates', 'lines', 'size'])
+    meta = item.get_meta(options={'content', 'crawler', 'duplicates', 'lines', 'size'})
     meta['name'] = meta['id'].replace('/', ' / ')
 
     meta['father'] = item_basic.get_item_parent(item_id)
@@ -76,11 +76,13 @@ def showItem(): # # TODO: support post
     meta['hive_case'] = Export.get_item_hive_cases(item_id)
 
     extracted = module_extractor.extract(item.id, content=meta['content'])
+    extracted_matches = module_extractor.get_extracted_by_match(extracted)
 
     return render_template("show_item.html", bootstrap_label=bootstrap_label,
-                           modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'),
-                           is_hive_connected=Export.get_item_hive_cases(item_id),
-                           meta=meta, extracted=extracted)
+                           modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'),
+                           is_hive_connected=Export.get_item_hive_cases(item_id),
+                           meta=meta,
+                           extracted=extracted, extracted_matches=extracted_matches)
 
 
 # kvrocks data
diff --git a/var/www/templates/objects/item/show_item.html b/var/www/templates/objects/item/show_item.html
index 99e5f23c..18499a78 100644
--- a/var/www/templates/objects/item/show_item.html
+++ b/var/www/templates/objects/item/show_item.html
@@ -20,6 +20,9 @@
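
For reviewers, a minimal standalone sketch of how the extracted list produced by the new extract() in this patch (entries of the form [start, end, value, 'type:subtype:id'], sorted by start offset) might be consumed, for example to highlight matches in the raw item content the way the updated show_item.html view does. The highlight_matches helper, the <mark> markup and the sample data below are illustrative assumptions for this note, not code from the commit; only the entry format comes from the patch itself.

# Illustrative sketch only: consumes [start, end, value, str_obj] entries as
# built by module_extractor.extract(); not part of this patch.
def highlight_matches(content, extracted):
    """Wrap each extracted match in a <mark> tag, assuming start/end are
    Python slice offsets into `content` and entries are sorted by start."""
    out = []
    cursor = 0
    for start, end, value, str_obj in extracted:
        if start < cursor:  # skip overlapping matches
            continue
        out.append(content[cursor:start])
        out.append(f'<mark title="{str_obj}">{content[start:end]}</mark>')
        cursor = end
    out.append(content[cursor:])
    return ''.join(out)


if __name__ == '__main__':
    # Hypothetical extraction result in the format returned by extract()
    content = 'contact: admin@example.com on example2kgh3azu7.onion'
    extracted = [
        [9, 26, 'admin@example.com', 'tag:infoleak:automatic-detection="mail"'],
        [30, 52, 'example2kgh3azu7.onion', 'tag:infoleak:automatic-detection="onion"'],
    ]
    print(highlight_matches(content, extracted))

The actual rendering is presumably done in the show_item.html template from the extracted_matches dict; the sketch only documents the offset-based list format the UI receives.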