ail-framework/bin/lib/module_extractor.py

322 lines
12 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import logging
import os
import sys
import yara
from hashlib import sha256
from operator import itemgetter
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ail_users import get_user_org
from lib.objects import ail_objects
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader
from lib import Tracker
from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Phone import Phone
from modules.Tools import Tools
logger = logging.getLogger()
config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None
r_key = regex_helper.generate_redis_cache_key('extractor')
# TODO UI Link
CORRELATION_TO_EXTRACT = {
'item': ['cve', 'cryptocurrency', 'title', 'username'],
'message': ['cve', 'cryptocurrency', 'username']
}
MODULES = {
'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
'infoleak:automatic-detection="iban"': Iban(queue=False),
'infoleak:automatic-detection="mail"': Mail(queue=False),
'infoleak:automatic-detection="onion"': Onion(queue=False),
'infoleak:automatic-detection="phone-number"': Phone(queue=False),
# APIkey ???
# Credentials
# Zerobins
# CERTIFICATE + KEYS ???
# SQL Injetction / Libinjection ???
}
tools = Tools(queue=False)
for tool_name in tools.get_tools():
MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools
def merge_overlap(extracted):
merged = []
curr_start, curr_end, curr_string_match, curr_obj_ref = extracted[0]
curr_obj_ref = [(curr_obj_ref, curr_string_match)]
for start, end, mstring, ref in extracted[1:]:
# overlap
if start <= curr_end:
curr_string_match += mstring[curr_end - start:]
curr_end = max(curr_end, end)
curr_obj_ref.append((ref, mstring))
else:
merged.append((curr_start, curr_end, curr_string_match, curr_obj_ref))
curr_start, curr_end, curr_string_match, curr_obj_ref = start, end, mstring, [(ref, mstring)]
merged.append((curr_start, curr_end, curr_string_match, curr_obj_ref))
return merged
def get_correl_match(extract_type, obj, content):
extracted = []
correl = correlations_engine.get_correlation_by_correl_type(obj.type, obj.get_subtype(r_str=True), obj.id, extract_type)
to_extract = []
map_subtype = {}
map_value_id = {}
for c in correl:
subtype, value = c.split(':', 1)
if extract_type == 'title':
title = Title(value).get_content()
to_extract.append(title)
sha256_val = sha256(title.encode()).hexdigest()
else:
map_subtype[value] = subtype
to_extract.append(value)
sha256_val = sha256(value.encode()).hexdigest()
map_value_id[sha256_val] = value
if to_extract:
objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj.get_global_id(), content)
if extract_type == 'title' and objs:
objs = [objs[0]]
for ob in objs:
if map_subtype.get(ob[2]):
subtype = map_subtype[ob[2]]
else:
subtype = ''
sha256_val = sha256(ob[2].encode()).hexdigest()
value_id = map_value_id.get(sha256_val)
if not value_id:
logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}')
value_id = 'ERROR'
extracted.append([ob[0], ob[1], ob[2], f'{extract_type}:{subtype}:{value_id}'])
return extracted
def _get_yara_match(data):
for string_match in data.get('strings'):
for string_match_instance in string_match.instances:
start = string_match_instance.offset
value = string_match_instance.matched_data.decode()
end = start + string_match_instance.matched_length
r_cache.sadd(f'extractor:yara:match:{r_key}', f'{start}:{end}:{value}')
r_cache.expire(f'extractor:yara:match:{r_key}', 300)
return yara.CALLBACK_CONTINUE
def _get_word_regex(word):
return '(?i)(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'
def convert_byte_offset_to_string(b_content, offset):
byte_chunk = b_content[:offset + 1]
2023-08-08 08:36:58 +00:00
try:
string_chunk = byte_chunk.decode()
offset = len(string_chunk) - 1
return offset
2023-08-08 08:36:58 +00:00
except UnicodeDecodeError as e:
logger.error(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}')
return convert_byte_offset_to_string(b_content, offset - 1)
# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(user_org, user_id, obj, content):
extracted = []
extracted_yara = []
obj_gid = obj.get_global_id()
trackers = Tracker.get_obj_trackers(obj.type, obj.get_subtype(r_str=True), obj.id)
for tracker_uuid in trackers:
tracker = Tracker.Tracker(tracker_uuid)
if not tracker.check_level(user_org, user_id):
continue
tracker_type = tracker.get_type()
# print(tracker_type)
tracked = tracker.get_tracked()
if tracker_type == 'regex': # TODO Improve word detection -> word delimiter
regex_match = regex_helper.regex_finditer(r_key, tracked, obj_gid, content)
for match in regex_match:
extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
elif tracker_type == 'yara':
rule = tracker.get_rule()
rule.match(data=content.encode(), callback=_get_yara_match,
which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
r_cache.delete(f'extractor:yara:match:{r_key}')
extracted = []
for match in yara_match:
start, end, value = match.split(':', 2)
extracted_yara.append([int(start), int(end), value, f'tracker:{tracker.uuid}'])
elif tracker_type == 'word' or tracker_type == 'set':
if tracker_type == 'set':
tracked = tracked.rsplit(';', 1)[0]
words = tracked.split(',')
else:
words = [tracked]
for word in words:
regex = _get_word_regex(word)
regex_match = regex_helper.regex_finditer(r_key, regex, obj_gid, content)
# print(regex_match)
for match in regex_match:
extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
# Retro Hunt
retro_hunts = Tracker.get_obj_retro_hunts(obj.type, obj.get_subtype(r_str=True), obj.id)
for retro_uuid in retro_hunts:
retro_hunt = Tracker.RetroHunt(retro_uuid)
if not retro_hunt.check_level(user_org):
continue
rule = retro_hunt.get_rule(r_compile=True)
rule.match(data=content.encode(), callback=_get_yara_match,
which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
r_cache.delete(f'extractor:yara:match:{r_key}')
extracted = []
for match in yara_match:
start, end, value = match.split(':', 2)
extracted_yara.append([int(start), int(end), value, f'retro_hunt:{retro_hunt.uuid}'])
# Convert byte offset to string offset
if extracted_yara:
b_content = content.encode()
if len(b_content) == len(content):
extracted[0:0] = extracted_yara
else:
for yara_m in extracted_yara:
start = convert_byte_offset_to_string(b_content, yara_m[0])
end = convert_byte_offset_to_string(b_content, yara_m[1])
extracted.append([int(start), int(end), yara_m[2], yara_m[3]])
return extracted
# Type:subtype:id
# tag:iban
# tracker:uuid
# def extract(obj_id, content=None):
def extract(user_id, obj_type, subtype, obj_id, content=None):
obj = ail_objects.get_object(obj_type, subtype, obj_id)
if not obj.exists():
return []
obj_gid = obj.get_global_id()
user_org = get_user_org(user_id)
# CHECK CACHE
cached = r_cache.get(f'extractor:cache:{obj_gid}:{user_org}:{user_id}')
# cached = None
if cached:
r_cache.expire(f'extractor:cache:{obj_gid}:{user_org}:{user_id}', 300)
return json.loads(cached)
if not content:
content = obj.get_content()
extracted = get_tracker_match(user_org, user_id, obj, content)
# print(item.get_tags())
for tag in obj.get_tags():
if MODULES.get(tag):
# print(tag)
module = MODULES.get(tag)
matches = module.extract(obj, content, tag)
if matches:
extracted = extracted + matches
for obj_t in CORRELATION_TO_EXTRACT[obj.type]:
matches = get_correl_match(obj_t, obj, content)
if matches:
extracted = extracted + matches
# SORT By Start Pos
if extracted:
extracted = sorted(extracted, key=itemgetter(0))
extracted = merge_overlap(extracted)
# Save In Cache
if extracted:
extracted_dump = json.dumps(extracted)
r_cache.set(f'extractor:cache:{obj_gid}:{user_org}:{user_id}', extracted_dump)
r_cache.expire(f'extractor:cache:{obj_gid}:{user_org}:{user_id}', 300) # TODO Reduce CACHE ???????????????
return extracted
# TODO ADD LINK UI
def get_extracted_by_match(extracted):
matches = {}
for start, end, value, raw_objs in extracted:
for raw in raw_objs:
str_obj, str_match = raw
if str_obj not in matches:
matches[str_obj] = {}
ob_type, row_id = str_obj.split(':', 1)
if ob_type == 'tag': # TODO put me in object class
matches[str_obj]['subtype'] = 'tag'
matches[str_obj]['id'] = row_id
matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5}
matches[str_obj]['link'] = ''
elif ob_type == 'tracker': # TODO put me in object class
matches[str_obj]['subtype'] = 'tracker'
matches[str_obj]['id'] = row_id
matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5}
matches[str_obj]['link'] = ''
elif ob_type == 'retro_hunt': # TODO put me in object class
matches[str_obj]['subtype'] = 'retro_hunt'
matches[str_obj]['id'] = row_id
matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#008107', 'radius': 5}
matches[str_obj]['link'] = ''
else:
row_id = row_id.split(':', 1)
if len(row_id) == 2:
subtype = row_id[0]
obj_id = row_id[1]
else:
subtype = ''
obj_id = row_id[0]
matches[str_obj]['subtype'] = subtype
matches[str_obj]['id'] = obj_id
matches[str_obj]['icon'] = ail_objects.get_object_svg(ob_type, subtype, obj_id)
matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id)
matches[str_obj]['matches'] = []
match = [start, end, str_match]
matches[str_obj]['matches'].append(match)
return matches
# if __name__ == '__main__':
# t0 = time.time()
# obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
# extract(obj_id)
#
# # get_obj_correl('cve', obj_id, content)
# # r = get_tracker_match(obj_id, content)
# # print(r)
#
# print(time.time() - t0)