ail-framework/bin/lib/module_extractor.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import os
import sys
import time

import yara

from operator import itemgetter

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader

from lib import Tracker

from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Tools import Tools

config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None

r_key = regex_helper.generate_redis_cache_key('extractor')

# TODO UI Link

MODULES = {
    'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
    'infoleak:automatic-detection="iban"': Iban(queue=False),
    'infoleak:automatic-detection="mail"': Mail(queue=False),
    'infoleak:automatic-detection="onion"': Onion(queue=False),
    # APIkey ???
    # Credentials
    # Zerobins
    # CERTIFICATE + KEYS ???
    # SQL Injetction / Libinjection ???

}
tools = Tools(queue=False)
for tool_name in tools.get_tools():
    MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools

def get_correl_match(extract_type, obj_id, content):
    extracted = []
    correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
    to_extract = []
    map_subtype = {}
    for c in correl:
        subtype, value = c.split(':', 1)
        map_subtype[value] = subtype
        to_extract.append(value)
    if to_extract:
        objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
        for obj in objs:
            if map_subtype[obj[2]]:
                subtype = map_subtype[obj[2]]
            else:
                subtype = ''
            extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])
    return extracted

def _get_yara_match(data):
    for row in data.get('strings'):
        start, i, value = row
        value = value.decode()
        end = start + len(value)
        r_cache.sadd(f'extractor:yara:match:{r_key}', f'{start}:{end}:{value}')
        r_cache.expire(f'extractor:yara:match:{r_key}', 300)
    return yara.CALLBACK_CONTINUE

def _get_word_regex(word):
    return '(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'

# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(obj_id, content):
    cached = r_cache.get(f'extractor:cache:{obj_id}')
    if cached:
        r_cache.expire(f'extractor:cache:{obj_id}', 300)
        return json.loads(cached)

    extracted = []
    trackers = Tracker.get_obj_all_trackers('item', '', obj_id)
    for tracker_uuid in trackers:
        tracker_type = Tracker.get_tracker_type(tracker_uuid)
        print(tracker_type)
        tracker = Tracker.get_tracker_by_uuid(tracker_uuid)
        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
            regex_match = regex_helper.regex_finditer(r_key, tracker, obj_id, content)
            for match in regex_match:
                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])
        elif tracker_type == 'yara':
            rule = Tracker.get_yara_rule_by_uuid(tracker_uuid)
            rule.match(data=content, callback=_get_yara_match,
                       which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
            yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
            r_cache.delete(f'extractor:yara:match:{r_key}')
            extracted = []
            for match in yara_match:
                start, end, value = match.split(':', 2)
                extracted.append([int(start), int(end), value, f'tracker:{tracker_uuid}'])

        elif tracker_type == 'word' or tracker_type == 'set':
            if tracker_type == 'set':
                tracker = tracker.rsplit(';', 1)[0]
                words = tracker.split(',')
            else:
                words = [tracker]
            for word in words:
                regex = _get_word_regex(word)
                regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
                print(regex_match)
                for match in regex_match:
                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])

    # Save In Cache
    if extracted:
        extracted_dump = json.dumps(extracted)
        r_cache.set(f'extractor:cache:{obj_id}', extracted_dump)
        r_cache.expire(f'extractor:cache:{obj_id}', 300)  # TODO Reduce CACHE ???????????????

    return extracted

# Type:subtype:id
# tag:iban
# tracker:uuid

def extract(obj_id, content=None):
    item = Item(obj_id)
    if not content:
        content = item.get_content()

    extracted = get_tracker_match(obj_id, content)

    # print(item.get_tags())
    for tag in item.get_tags():
        if MODULES.get(tag):
            # print(tag)
            module = MODULES.get(tag)
            matches = module.extract(obj_id, content, tag)
            if matches:
                extracted = extracted + matches

    for obj_t in ['cve', 'cryptocurrency', 'username']:  # Decoded, PGP->extract bloc
        matches = get_correl_match(obj_t, obj_id, content)
        if matches:
            extracted = extracted + matches

    # SORT By Start Pos
    extracted = sorted(extracted, key=itemgetter(0))
    # print(extracted)
    return extracted

# TODO ADD LINK UI
def get_extracted_by_match(extracted):
    matches = {}
    for start, end, value, str_obj in extracted:

        if str_obj not in matches:
            matches[str_obj] = {}
            ob_type, row_id = str_obj.split(':', 1)
            if ob_type == 'tag':  # TODO put me in object class
                matches[str_obj]['subtype'] = 'tag'
                matches[str_obj]['id'] = row_id
                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5}
                matches[str_obj]['link'] = ''
            elif ob_type == 'tracker':  # TODO put me in object class
                matches[str_obj]['subtype'] = 'tracker'
                matches[str_obj]['id'] = row_id
                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5}
                matches[str_obj]['link'] = ''
            else:
                row_id = row_id.split(':', 1)
                if len(row_id) == 2:
                    subtype = row_id[0]
                    obj_id = row_id[1]
                else:
                    subtype = ''
                    obj_id = row_id[0]
                matches[str_obj]['subtype'] = subtype
                matches[str_obj]['id'] = obj_id
                matches[str_obj]['icon'] = ail_objects.get_object_svg(ob_type, subtype, obj_id)
                matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id)

            matches[str_obj]['matches'] = []

        match = [start, end, value]
        matches[str_obj]['matches'].append(match)
    return matches


# if __name__ == '__main__':
#     t0 = time.time()
#     obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
#     obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
#     obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
#     # obj_id = 'tests/2021/01/01/credit_cards.gz'
#     # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
#     obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
#     obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
#     obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
#
#     extract(obj_id)
#
#     # get_obj_correl('cve', obj_id, content)
#     # r = get_tracker_match(obj_id, content)
#     # print(r)
#
#     print(time.time() - t0)
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`#!/usr/bin/env python3`
			`# --coding:UTF-8 -`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`import json`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`import os`
			`import sys`
			`import time`

			`import yara`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`from operator import itemgetter`

chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`sys.path.append(os.environ['AIL_BIN'])`
			`##################################`
			`# Import Project packages`
			`##################################`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`from lib.objects import ail_objects`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`from lib.objects.Items import Item`
			`from lib import correlations_engine`
			`from lib import regex_helper`
			`from lib.ConfigLoader import ConfigLoader`

			`from lib import Tracker`

			`from modules.CreditCards import CreditCards`
			`from modules.Iban import Iban`
			`from modules.Mail import Mail`
			`from modules.Onion import Onion`
			`from modules.Tools import Tools`

			`config_loader = ConfigLoader()`
			`r_cache = config_loader.get_redis_conn("Redis_Cache")`
			`config_loader = None`

			`r_key = regex_helper.generate_redis_cache_key('extractor')`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`# TODO UI Link`

chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`MODULES = {`
chg: [AIL queues] rewrite module queues. remove PUBSUB 2023-04-13 12:25:02 +00:00			`'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),`
			`'infoleak:automatic-detection="iban"': Iban(queue=False),`
			`'infoleak:automatic-detection="mail"': Mail(queue=False),`
			`'infoleak:automatic-detection="onion"': Onion(queue=False),`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`# APIkey ???`
			`# Credentials`
			`# Zerobins`
			`# CERTIFICATE + KEYS ???`
			`# SQL Injetction / Libinjection ???`

			`}`
chg: [AIL queues] rewrite module queues. remove PUBSUB 2023-04-13 12:25:02 +00:00			`tools = Tools(queue=False)`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`for tool_name in tools.get_tools():`
			`MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`def get_correl_match(extract_type, obj_id, content):`
			`extracted = []`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)`
			`to_extract = []`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`map_subtype = {}`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`for c in correl:`
			`subtype, value = c.split(':', 1)`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`map_subtype[value] = subtype`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`to_extract.append(value)`
			`if to_extract:`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`objs = regex_helper.regex_finditer(r_key, '\|'.join(to_extract), obj_id, content)`
			`for obj in objs:`
			`if map_subtype[obj[2]]:`
			`subtype = map_subtype[obj[2]]`
			`else:`
			`subtype = ''`
			`extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])`
			`return extracted`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00
			`def _get_yara_match(data):`
			`for row in data.get('strings'):`
			`start, i, value = row`
			`value = value.decode()`
			`end = start + len(value)`
			`r_cache.sadd(f'extractor:yara:match:{r_key}', f'{start}:{end}:{value}')`
			`r_cache.expire(f'extractor:yara:match:{r_key}', 300)`
			`return yara.CALLBACK_CONTINUE`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`def _get_word_regex(word):`
			`return '(?:^\|(?<=[\&\~\:\;\,\.\(\)\{\}\\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\\n\r\t\s]))' + word + '(?:$\|(?=[\&\~\:\;\,\.\(\)\{\}\\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\\n\r\t\s]))'`

chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`# TODO RETRO HUNTS`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`# TODO TRACKER TYPE IN UI`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`def get_tracker_match(obj_id, content):`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`cached = r_cache.get(f'extractor:cache:{obj_id}')`
			`if cached:`
			`r_cache.expire(f'extractor:cache:{obj_id}', 300)`
			`return json.loads(cached)`

			`extracted = []`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`trackers = Tracker.get_obj_all_trackers('item', '', obj_id)`
			`for tracker_uuid in trackers:`
			`tracker_type = Tracker.get_tracker_type(tracker_uuid)`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`print(tracker_type)`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`tracker = Tracker.get_tracker_by_uuid(tracker_uuid)`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`if tracker_type == 'regex': # TODO Improve word detection -> word delimiter`
			`regex_match = regex_helper.regex_finditer(r_key, tracker, obj_id, content)`
			`for match in regex_match:`
			`extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`elif tracker_type == 'yara':`
			`rule = Tracker.get_yara_rule_by_uuid(tracker_uuid)`
			`rule.match(data=content, callback=_get_yara_match,`
			`which_callbacks=yara.CALLBACK_MATCHES, timeout=30)`
			`yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')`
			`r_cache.delete(f'extractor:yara:match:{r_key}')`
			`extracted = []`
			`for match in yara_match:`
			`start, end, value = match.split(':', 2)`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`extracted.append([int(start), int(end), value, f'tracker:{tracker_uuid}'])`

			`elif tracker_type == 'word' or tracker_type == 'set':`
			`if tracker_type == 'set':`
			`tracker = tracker.rsplit(';', 1)[0]`
			`words = tracker.split(',')`
			`else:`
			`words = [tracker]`
			`for word in words:`
			`regex = _get_word_regex(word)`
			`regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)`
			`print(regex_match)`
			`for match in regex_match:`
			`extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])`

			`# Save In Cache`
			`if extracted:`
			`extracted_dump = json.dumps(extracted)`
			`r_cache.set(f'extractor:cache:{obj_id}', extracted_dump)`
			`r_cache.expire(f'extractor:cache:{obj_id}', 300) # TODO Reduce CACHE ???????????????`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`return extracted`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`# Type:subtype:id`
			`# tag:iban`
			`# tracker:uuid`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00
			`def extract(obj_id, content=None):`
			`item = Item(obj_id)`
			`if not content:`
			`content = item.get_content()`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`extracted = get_tracker_match(obj_id, content)`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00
			`# print(item.get_tags())`
			`for tag in item.get_tags():`
			`if MODULES.get(tag):`
			`# print(tag)`
			`module = MODULES.get(tag)`
			`matches = module.extract(obj_id, content, tag)`
			`if matches:`
			`extracted = extracted + matches`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`for obj_t in ['cve', 'cryptocurrency', 'username']: # Decoded, PGP->extract bloc`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`matches = get_correl_match(obj_t, obj_id, content)`
			`if matches:`
			`extracted = extracted + matches`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`# SORT By Start Pos`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`extracted = sorted(extracted, key=itemgetter(0))`
chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`# print(extracted)`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00			`return extracted`

chg: [extractor] add cache + UI extractor + word/set extractor 2023-02-23 15:25:15 +00:00			`# TODO ADD LINK UI`
			`def get_extracted_by_match(extracted):`
			`matches = {}`
			`for start, end, value, str_obj in extracted:`

			`if str_obj not in matches:`
			`matches[str_obj] = {}`
			`ob_type, row_id = str_obj.split(':', 1)`
			`if ob_type == 'tag': # TODO put me in object class`
			`matches[str_obj]['subtype'] = 'tag'`
			`matches[str_obj]['id'] = row_id`
			`matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5}`
			`matches[str_obj]['link'] = ''`
			`elif ob_type == 'tracker': # TODO put me in object class`
			`matches[str_obj]['subtype'] = 'tracker'`
			`matches[str_obj]['id'] = row_id`
			`matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5}`
			`matches[str_obj]['link'] = ''`
			`else:`
			`row_id = row_id.split(':', 1)`
			`if len(row_id) == 2:`
			`subtype = row_id[0]`
			`obj_id = row_id[1]`
			`else:`
			`subtype = ''`
			`obj_id = row_id[0]`
			`matches[str_obj]['subtype'] = subtype`
			`matches[str_obj]['id'] = obj_id`
			`matches[str_obj]['icon'] = ail_objects.get_object_svg(ob_type, subtype, obj_id)`
			`matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id)`

			`matches[str_obj]['matches'] = []`

			`match = [start, end, value]`
			`matches[str_obj]['matches'].append(match)`
			`return matches`


			`# if __name__ == '__main__':`
			`# t0 = time.time()`
			`# obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'`
			`# obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'`
			`# obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'`
			`# # obj_id = 'tests/2021/01/01/credit_cards.gz'`
			`# # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'`
			`# obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'`
			`# obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'`
			`# obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'`
			`#`
			`# extract(obj_id)`
			`#`
			`# # get_obj_correl('cve', obj_id, content)`
			`# # r = get_tracker_match(obj_id, content)`
			`# # print(r)`
			`#`
			`# print(time.time() - t0)`
chg: [DB Migration] UI: Extract + highlight leaks and trackers match, Data Retention save object first/last date, Refactor Tools 2022-12-19 15:38:20 +00:00