chg: [core] merge master + fix object subtype correlation stats

This commit is contained in:
terrtia 2023-10-12 13:53:00 +02:00
commit c5cef5fd00
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
30 changed files with 415 additions and 191 deletions

View file

@ -267,8 +267,8 @@ function launching_scripts {
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "LibInjection" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./LibInjection.py; read x" screen -S "Script_AIL" -X screen -t "LibInjection" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./LibInjection.py; read x"
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "Zerobins" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Zerobins.py; read x" # screen -S "Script_AIL" -X screen -t "Pasties" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Pasties.py; read x"
sleep 0.1 # sleep 0.1
screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x" screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x"
sleep 0.1 sleep 0.1

View file

@ -22,6 +22,7 @@ from lib.objects.Domains import Domain
from lib.objects.Items import Item from lib.objects.Items import Item
from lib.objects import Screenshots from lib.objects import Screenshots
from lib.objects import Titles from lib.objects import Titles
from trackers.Tracker_Yara import Tracker_Yara
logging.config.dictConfig(ail_logger.get_config(name='crawlers')) logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
@ -35,6 +36,8 @@ class Crawler(AbstractModule):
# Waiting time in seconds between to message processed # Waiting time in seconds between to message processed
self.pending_seconds = 1 self.pending_seconds = 1
self.tracker_yara = Tracker_Yara(queue=False)
config_loader = ConfigLoader() config_loader = ConfigLoader()
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har') self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@ -284,6 +287,12 @@ class Crawler(AbstractModule):
if title_content: if title_content:
title = Titles.create_title(title_content) title = Titles.create_title(title_content)
title.add(item.get_date(), item) title.add(item.get_date(), item)
# Tracker
self.tracker_yara.compute_manual(title)
if not title.is_tags_safe():
unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item.add_tag(unsafe_tag)
# SCREENSHOT # SCREENSHOT
if self.screenshot: if self.screenshot:

View file

@ -124,16 +124,27 @@ class MailExporterTracker(MailExporter):
def __init__(self, host=None, port=None, password=None, user='', sender=''): def __init__(self, host=None, port=None, password=None, user='', sender=''):
super().__init__(host=host, port=port, password=password, user=user, sender=sender) super().__init__(host=host, port=port, password=password, user=user, sender=sender)
def export(self, tracker, obj): # TODO match def export(self, tracker, obj, matches=[]):
tracker_type = tracker.get_type() tracker_type = tracker.get_type()
tracker_name = tracker.get_tracked() tracker_name = tracker.get_tracked()
subject = f'AIL Framework Tracker: {tracker_name}' # TODO custom subject description = tracker.get_description()
if not description:
description = tracker_name
subject = f'AIL Framework Tracker: {description}'
body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n" body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
body += f'Item: {obj.id}\nurl:{obj.get_link()}' body += f'Item: {obj.id}\nurl:{obj.get_link()}'
# TODO match option if matches:
# if match: body += '\n'
# body += f'Tracker Match:\n\n{escape(match)}' nb = 1
for match in matches:
body += f'\nMatch {nb}: {match[0]}\nExtract:\n{match[1]}\n\n'
nb += 1
else:
body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
body += f'Item: {obj.id}\nurl:{obj.get_link()}'
# print(body)
for mail in tracker.get_mails(): for mail in tracker.get_mails():
self._export(mail, subject, body) self._export(mail, subject, body)

View file

@ -31,8 +31,12 @@ class DefaultFeeder:
Return feeder name. first part of the item_id and display in the UI Return feeder name. first part of the item_id and display in the UI
""" """
if not self.name: if not self.name:
return self.get_source() name = self.get_source()
return self.name else:
name = self.name
if not name:
name = 'default'
return name
def get_source(self): def get_source(self):
return self.json_data.get('source') return self.json_data.get('source')

View file

@ -83,6 +83,7 @@ class ConfigLoader(object):
else: else:
return [] return []
# # # # Directory Config # # # # # # # # Directory Config # # # #
config_loader = ConfigLoader() config_loader = ConfigLoader()

View file

@ -2,6 +2,8 @@
# -*-coding:UTF-8 -* # -*-coding:UTF-8 -*
import json import json
import os import os
import logging
import logging.config
import re import re
import sys import sys
import time import time
@ -24,11 +26,16 @@ sys.path.append(os.environ['AIL_BIN'])
################################## ##################################
from packages import Date from packages import Date
from lib.ail_core import get_objects_tracked, get_object_all_subtypes, get_objects_retro_hunted from lib.ail_core import get_objects_tracked, get_object_all_subtypes, get_objects_retro_hunted
from lib import ail_logger
from lib import ConfigLoader from lib import ConfigLoader
from lib import item_basic from lib import item_basic
from lib import Tag from lib import Tag
from lib.Users import User from lib.Users import User
# LOGS
logging.config.dictConfig(ail_logger.get_config(name='modules'))
logger = logging.getLogger()
config_loader = ConfigLoader.ConfigLoader() config_loader = ConfigLoader.ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache") r_cache = config_loader.get_redis_conn("Redis_Cache")
@ -248,7 +255,8 @@ class Tracker:
return self._get_field('user_id') return self._get_field('user_id')
def webhook_export(self): def webhook_export(self):
return r_tracker.hexists(f'tracker:{self.uuid}', 'webhook') webhook = self.get_webhook()
return webhook is not None and webhook
def get_webhook(self): def get_webhook(self):
return r_tracker.hget(f'tracker:{self.uuid}', 'webhook') return r_tracker.hget(f'tracker:{self.uuid}', 'webhook')
@ -560,9 +568,7 @@ class Tracker:
os.remove(filepath) os.remove(filepath)
# Filters # Filters
filters = self.get_filters() filters = get_objects_tracked()
if not filters:
filters = get_objects_tracked()
for obj_type in filters: for obj_type in filters:
r_tracker.srem(f'trackers:objs:{tracker_type}:{obj_type}', tracked) r_tracker.srem(f'trackers:objs:{tracker_type}:{obj_type}', tracked)
r_tracker.srem(f'trackers:uuid:{tracker_type}:{tracked}', f'{self.uuid}:{obj_type}') r_tracker.srem(f'trackers:uuid:{tracker_type}:{tracked}', f'{self.uuid}:{obj_type}')
@ -923,7 +929,7 @@ def api_add_tracker(dict_input, user_id):
# Filters # TODO MOVE ME # Filters # TODO MOVE ME
filters = dict_input.get('filters', {}) filters = dict_input.get('filters', {})
if filters: if filters:
if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}: if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
filters = {} filters = {}
for obj_type in filters: for obj_type in filters:
if obj_type not in get_objects_tracked(): if obj_type not in get_objects_tracked():
@ -998,7 +1004,7 @@ def api_edit_tracker(dict_input, user_id):
# Filters # TODO MOVE ME # Filters # TODO MOVE ME
filters = dict_input.get('filters', {}) filters = dict_input.get('filters', {})
if filters: if filters:
if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}: if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
if not filters['decoded'] and not filters['item']: if not filters['decoded'] and not filters['item']:
filters = {} filters = {}
for obj_type in filters: for obj_type in filters:
@ -1151,7 +1157,11 @@ def get_tracked_yara_rules():
for obj_type in get_objects_tracked(): for obj_type in get_objects_tracked():
rules = {} rules = {}
for tracked in _get_tracked_by_obj_type('yara', obj_type): for tracked in _get_tracked_by_obj_type('yara', obj_type):
rules[tracked] = os.path.join(get_yara_rules_dir(), tracked) rule = os.path.join(get_yara_rules_dir(), tracked)
if not os.path.exists(rule):
logger.critical(f"Yara rule don't exists {tracked} : {obj_type}")
else:
rules[tracked] = rule
to_track[obj_type] = yara.compile(filepaths=rules) to_track[obj_type] = yara.compile(filepaths=rules)
print(to_track) print(to_track)
return to_track return to_track

View file

@ -52,7 +52,7 @@ def get_object_all_subtypes(obj_type):
return [] return []
def get_objects_tracked(): def get_objects_tracked():
return ['decoded', 'item', 'pgp'] return ['decoded', 'item', 'pgp', 'title']
def get_objects_retro_hunted(): def get_objects_retro_hunted():
return ['decoded', 'item'] return ['decoded', 'item']

View file

@ -234,7 +234,9 @@ def extract_title_from_html(html):
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
title = soup.title title = soup.title
if title: if title:
return str(title.string) title = title.string
if title:
return str(title)
return '' return ''
def extract_description_from_html(html): def extract_description_from_html(html):
@ -1690,6 +1692,19 @@ def api_add_crawler_task(data, user_id=None):
return {'error': 'The access to this cookiejar is restricted'}, 403 return {'error': 'The access to this cookiejar is restricted'}, 403
cookiejar_uuid = cookiejar.uuid cookiejar_uuid = cookiejar.uuid
cookies = data.get('cookies', None)
if not cookiejar_uuid and cookies:
# Create new cookiejar
cookiejar_uuid = create_cookiejar(user_id, "single-shot cookiejar", 1, None)
cookiejar = Cookiejar(cookiejar_uuid)
for cookie in cookies:
try:
name = cookie.get('name')
value = cookie.get('value')
cookiejar.add_cookie(name, value, None, None, None, None, None)
except KeyError:
return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
frequency = data.get('frequency', None) frequency = data.get('frequency', None)
if frequency: if frequency:
if frequency not in ['monthly', 'weekly', 'daily', 'hourly']: if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
@ -2010,7 +2025,7 @@ def test_ail_crawlers():
# TODO MOVE ME IN CRAWLER OR FLASK # TODO MOVE ME IN CRAWLER OR FLASK
load_blacklist() load_blacklist()
if __name__ == '__main__': # if __name__ == '__main__':
# delete_captures() # delete_captures()
# item_id = 'crawled/2023/02/20/data.gz' # item_id = 'crawled/2023/02/20/data.gz'
@ -2022,4 +2037,4 @@ if __name__ == '__main__':
# _reprocess_all_hars_cookie_name() # _reprocess_all_hars_cookie_name()
# _reprocess_all_hars_etag() # _reprocess_all_hars_etag()
# _gzip_all_hars() # _gzip_all_hars()
_reprocess_all_hars_hhhashs() # _reprocess_all_hars_hhhashs()

View file

@ -204,15 +204,22 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt
if not l_sources_name: if not l_sources_name:
l_sources_name = set() l_sources_name = set()
if source_name: if source_name:
l_dir = os.listdir(os.path.join(directory, source_name)) path = os.path.join(directory, source_name)
if os.path.isdir(path):
l_dir = os.listdir(os.path.join(directory, source_name))
else:
l_dir = []
else: else:
l_dir = os.listdir(directory) l_dir = os.listdir(directory)
# empty directory # empty directory
if not l_dir: if not l_dir:
return l_sources_name.add(source_name) if source_name:
return l_sources_name.add(source_name)
else:
return l_sources_name
else: else:
for src_name in l_dir: for src_name in l_dir:
if len(src_name) == 4: if len(src_name) == 4 and source_name:
# try: # try:
int(src_name) int(src_name)
to_add = os.path.join(source_name) to_add = os.path.join(source_name)

View file

@ -85,9 +85,6 @@ class CookieName(AbstractDaterangeObject):
meta['content'] = self.get_content() meta['content'] = self.get_content()
return meta return meta
def add(self, date, obj_id): # date = HAR Date
self._add(date, 'domain', '', obj_id)
def create(self, content, _first_seen=None, _last_seen=None): def create(self, content, _first_seen=None, _last_seen=None):
if not isinstance(content, str): if not isinstance(content, str):
content = content.decode() content = content.decode()

View file

@ -79,9 +79,6 @@ class Cve(AbstractDaterangeObject):
meta['tags'] = self.get_tags(r_list=True) meta['tags'] = self.get_tags(r_list=True)
return meta return meta
def add(self, date, item_id):
self._add(date, 'item', '', item_id)
def get_cve_search(self): def get_cve_search(self):
try: try:
response = requests.get(f'https://cvepremium.circl.lu/api/cve/{self.id}', timeout=10) response = requests.get(f'https://cvepremium.circl.lu/api/cve/{self.id}', timeout=10)

View file

@ -239,8 +239,8 @@ class Decoded(AbstractDaterangeObject):
return True return True
def add(self, algo_name, date, obj_id, mimetype=None): def add(self, date, obj, algo_name, mimetype=None):
self._add(date, 'item', '', obj_id) self._add(date, obj)
if not mimetype: if not mimetype:
mimetype = self.get_mimetype() mimetype = self.get_mimetype()
@ -460,7 +460,7 @@ def get_all_decodeds_objects(filters={}):
############################################################################ ############################################################################
def sanityze_decoder_names(decoder_name): def sanityze_decoder_names(decoder_name):
if decoder_name not in Decodeds.get_algos(): if decoder_name not in get_algos():
return None return None
else: else:
return decoder_name return decoder_name

View file

@ -85,9 +85,6 @@ class Etag(AbstractDaterangeObject):
meta['content'] = self.get_content() meta['content'] = self.get_content()
return meta return meta
def add(self, date, obj_id): # date = HAR Date
self._add(date, 'domain', '', obj_id)
def create(self, content, _first_seen=None, _last_seen=None): def create(self, content, _first_seen=None, _last_seen=None):
if not isinstance(content, str): if not isinstance(content, str):
content = content.decode() content = content.decode()

View file

@ -86,9 +86,6 @@ class Favicon(AbstractDaterangeObject):
# def get_links(self): # def get_links(self):
# # TODO GET ALL URLS FROM CORRELATED ITEMS # # TODO GET ALL URLS FROM CORRELATED ITEMS
def add(self, date, obj_id): # TODO correlation base 64 -> calc md5
self._add(date, 'domain', '', obj_id)
def create(self, content, _first_seen=None, _last_seen=None): def create(self, content, _first_seen=None, _last_seen=None):
if not isinstance(content, str): if not isinstance(content, str):
content = content.decode() content = content.decode()

View file

@ -86,9 +86,6 @@ class HHHash(AbstractDaterangeObject):
meta['content'] = self.get_content() meta['content'] = self.get_content()
return meta return meta
def add(self, date, obj_id): # date = HAR Date
self._add(date, 'domain', '', obj_id)
def create(self, hhhash_header, _first_seen=None, _last_seen=None): # TODO CREATE ADD FUNCTION -> urls set def create(self, hhhash_header, _first_seen=None, _last_seen=None): # TODO CREATE ADD FUNCTION -> urls set
self._set_field('content', hhhash_header) self._set_field('content', hhhash_header)
self._create() self._create()

View file

@ -175,7 +175,7 @@ class Message(AbstractObject):
if options is None: if options is None:
options = set() options = set()
meta = self.get_default_meta(tags=True) meta = self.get_default_meta(tags=True)
meta['date'] = self.get_date() # TODO replace me by timestamp ?????? meta['date'] = self.get_date()
meta['source'] = self.get_source() meta['source'] = self.get_source()
# optional meta fields # optional meta fields
if 'content' in options: if 'content' in options:

View file

@ -45,6 +45,8 @@ class Title(AbstractDaterangeObject):
def get_content(self, r_type='str'): def get_content(self, r_type='str'):
if r_type == 'str': if r_type == 'str':
return self._get_field('content') return self._get_field('content')
elif r_type == 'bytes':
return self._get_field('content').encode()
def get_link(self, flask_context=False): def get_link(self, flask_context=False):
if flask_context: if flask_context:
@ -82,9 +84,6 @@ class Title(AbstractDaterangeObject):
meta['content'] = self.get_content() meta['content'] = self.get_content()
return meta return meta
def add(self, date, item_id):
self._add(date, 'item', '', item_id)
def create(self, content, _first_seen=None, _last_seen=None): def create(self, content, _first_seen=None, _last_seen=None):
self._set_field('content', content) self._set_field('content', content)
self._create() self._create()
@ -122,4 +121,3 @@ class Titles(AbstractDaterangeObjects):
# # print(r) # # print(r)
# r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False) # r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False)
# print(r) # print(r)

View file

@ -125,9 +125,7 @@ class AbstractDaterangeObject(AbstractObject, ABC):
def _add_create(self): def _add_create(self):
r_object.sadd(f'{self.type}:all', self.id) r_object.sadd(f'{self.type}:all', self.id)
# TODO don't increase nb if same hash in item with different encoding def _add(self, date, obj):
# if hash already in item
def _add(self, date, obj_type, subtype, obj_id):
if not self.exists(): if not self.exists():
self._add_create() self._add_create()
self.set_first_seen(date) self.set_first_seen(date)
@ -136,26 +134,22 @@ class AbstractDaterangeObject(AbstractObject, ABC):
self.update_daterange(date) self.update_daterange(date)
update_obj_date(date, self.type) update_obj_date(date, self.type)
if obj_type == 'item': if obj:
# NB Object seen by day TODO
if not self.is_correlated(obj_type, subtype, obj_id): # nb seen by day
r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
# Correlations # Correlations
self.add_correlation(obj_type, subtype, obj_id) self.add_correlation(obj.type, obj.get_subtype(r_str=True), obj.get_id())
if is_crawled(obj_id): # Domain # Stats NB by day: # TODO Don't increase on reprocess
domain = get_item_domain(obj_id)
self.add_correlation('domain', '', domain)
else:
# Correlations
self.add_correlation(obj_type, subtype, obj_id)
# TODO Don't increase on reprocess
r_object.zincrby(f'{self.type}:date:{date}', 1, self.id) r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
# r_object.zincrby(f'{self.type}:obj:{obj_type}', 1, self.id)
# 1 Domain by day / 1 HAR by day if obj.type == 'item':
# Domain check / file created -> issue with scheduler item_id = obj.get_id()
# domain
if is_crawled(item_id):
domain = get_item_domain(item_id)
self.add_correlation('domain', '', domain)
def add(self, date, obj):
self._add(date, obj)
# TODO:ADD objects + Stats # TODO:ADD objects + Stats
def _create(self, first_seen=None, last_seen=None): def _create(self, first_seen=None, last_seen=None):

View file

@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
proc.terminate() proc.terminate()
sys.exit(0) sys.exit(0)
def _regex_match(r_key, regex, content):
    """Worker body: flag ``r_key`` in the cache when ``regex`` matches ``content``.

    Uses ``re.match``, so the pattern must match at the beginning of
    ``content``. On a match the key is set to 1 with a 360 second expiry;
    otherwise nothing is written.
    """
    found = re.match(regex, content)
    if not found:
        return
    r_serv_cache.set(r_key, 1)
    r_serv_cache.expire(r_key, 360)
def regex_match(r_key, regex, item_id, content, max_time=30):
    """Run ``_regex_match`` in a separate ``Proc`` worker with a time limit.

    :param r_key: cache key the worker sets when the regex matches
    :param regex: pattern handed to the worker
    :param item_id: identifier used only in the timeout log message
    :param content: text to test
    :param max_time: seconds to wait for the worker before terminating it
    :return: True if the worker flagged a match, False on no match or timeout
    """
    # NOTE(review): Proc is presumably multiprocessing.Process - confirm against the file imports
    worker = Proc(target=_regex_match, args=(r_key, regex, content))
    try:
        worker.start()
        worker.join(max_time)
        if not worker.is_alive():
            # Worker finished in time: the key's presence is the result.
            # The key is always removed so it cannot leak into a later call.
            matched = r_serv_cache.exists(r_key)
            r_serv_cache.delete(r_key)
            return bool(matched)
        # Still running after max_time: give up on this content
        worker.terminate()
        # Statistics.incr_module_timeout_statistic(r_key)
        err_mess = f"{r_key}: processing timeout: {item_id}"
        logger.info(err_mess)
        return False
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        worker.terminate()
        sys.exit(0)
def _regex_search(r_key, regex, content): def _regex_search(r_key, regex, content):
if re.search(regex, content): if re.search(regex, content):
r_serv_cache.set(r_key, 1) r_serv_cache.set(r_key, 1)

View file

@ -54,7 +54,7 @@ class CveModule(AbstractModule):
date = item.get_date() date = item.get_date()
for cve_id in cves: for cve_id in cves:
cve = Cves.Cve(cve_id) cve = Cves.Cve(cve_id)
cve.add(date, item_id) cve.add(date, item)
warning = f'{item_id} contains CVEs {cves}' warning = f'{item_id} contains CVEs {cves}'
print(warning) print(warning)

View file

@ -21,7 +21,6 @@ sys.path.append(os.environ['AIL_BIN'])
################################## ##################################
from modules.abstract_module import AbstractModule from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from lib.objects.Items import Item
from lib.objects.Decodeds import Decoded from lib.objects.Decodeds import Decoded
from trackers.Tracker_Term import Tracker_Term from trackers.Tracker_Term import Tracker_Term
from trackers.Tracker_Regex import Tracker_Regex from trackers.Tracker_Regex import Tracker_Regex
@ -87,17 +86,16 @@ class Decoder(AbstractModule):
self.logger.info(f'Module {self.module_name} initialized') self.logger.info(f'Module {self.module_name} initialized')
def compute(self, message): def compute(self, message):
item = self.get_obj() content = self.obj.get_content()
content = item.get_content() date = self.obj.get_date()
date = item.get_date()
new_decodeds = [] new_decodeds = []
for decoder in self.decoder_order: for decoder in self.decoder_order:
find = False find = False
dname = decoder['name'] dname = decoder['name']
encodeds = self.regex_findall(decoder['regex'], item.id, content) encodeds = self.regex_findall(decoder['regex'], self.obj.id, content)
# PERF remove encoded from item content # PERF remove encoded from obj content
for encoded in encodeds: for encoded in encodeds:
content = content.replace(encoded, '', 1) content = content.replace(encoded, '', 1)
encodeds = set(encodeds) encodeds = set(encodeds)
@ -113,19 +111,19 @@ class Decoder(AbstractModule):
if not decoded.exists(): if not decoded.exists():
mimetype = decoded.guess_mimetype(decoded_file) mimetype = decoded.guess_mimetype(decoded_file)
if not mimetype: if not mimetype:
print(sha1_string, item.id) print(sha1_string, self.obj.id)
raise Exception(f'Invalid mimetype: {decoded.id} {item.id}') raise Exception(f'Invalid mimetype: {decoded.id} {self.obj.id}')
decoded.save_file(decoded_file, mimetype) decoded.save_file(decoded_file, mimetype)
new_decodeds.append(decoded.id) new_decodeds.append(decoded.id)
else: else:
mimetype = decoded.get_mimetype() mimetype = decoded.get_mimetype()
decoded.add(dname, date, item.id, mimetype=mimetype) decoded.add(date, self.obj, dname, mimetype=mimetype)
# new_decodeds.append(decoded.id) # new_decodeds.append(decoded.id)
self.logger.info(f'{item.id} : {dname} - {decoded.id} - {mimetype}') self.logger.info(f'{self.obj.id} : {dname} - {decoded.id} - {mimetype}')
if find: if find:
self.logger.info(f'{item.id} - {dname}') self.logger.info(f'{self.obj.id} - {dname}')
# Send to Tags # Send to Tags
tag = f'infoleak:automatic-detection="{dname}"' tag = f'infoleak:automatic-detection="{dname}"'
@ -134,12 +132,13 @@ class Decoder(AbstractModule):
#################### ####################
# TRACKERS DECODED # TRACKERS DECODED
for decoded_id in new_decodeds: for decoded_id in new_decodeds:
decoded = Decoded(decoded_id)
try: try:
self.tracker_term.compute(decoded_id, obj_type='decoded') self.tracker_term.compute_manual(decoded)
self.tracker_regex.compute(decoded_id, obj_type='decoded') self.tracker_regex.compute_manual(decoded)
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
self.tracker_yara.compute(decoded_id, obj_type='decoded') self.tracker_yara.compute_manual(decoded)
if __name__ == '__main__': if __name__ == '__main__':

144
bin/modules/Pasties.py Executable file
View file

@ -0,0 +1,144 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Pasties Module
======================
This module spots domain-pasties services for further processing
"""
##################################
# Import External packages
##################################
import os
import sys
import time
from pyfaup.faup import Faup
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers
# TODO add url validator
pasties_blocklist_urls = set()
pasties_domains = {}
class Pasties(AbstractModule):
    """
    Pasties module for AIL framework

    Loads a list of pastie-hosting domains (with optional path prefixes) and a
    URL blocklist from files under ``$AIL_HOME/files``, then queues a crawler
    task for each incoming URL whose host is one of those domains.
    """

    def __init__(self):
        super(Pasties, self).__init__()
        self.faup = Faup()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        # host -> set of path prefixes (each ending with '/');
        # an empty set means no prefix restriction was configured for the host
        self.pasties = {}
        # blocked URLs, stored without scheme: host + path (+ query string)
        self.urls_blocklist = set()
        self.load_pasties_domains()

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def load_pasties_domains(self):
        """(Re)load ``self.pasties`` and ``self.urls_blocklist`` from disk.

        Both files are optional; a missing file leaves the matching
        collection empty. Blank lines are ignored in both files.
        """
        self.pasties = {}
        self.urls_blocklist = set()

        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
        if os.path.exists(domains_pasties):
            with open(domains_pasties) as f:
                for line in f:
                    url = line.strip()
                    if not url:  # TODO validate line
                        continue
                    self.faup.decode(url)
                    url_decoded = self.faup.get()
                    host = url_decoded['host']
                    # if url_decoded.get('port', ''):
                    #     host = f'{host}:{url_decoded["port"]}'
                    path = url_decoded.get('resource_path', '')
                    # print(url_decoded)
                    if path and path != '/':
                        # Normalise prefixes so they always end with '/'
                        if path[-1] != '/':
                            path = f'{path}/'
                    else:
                        path = None

                    # Register the host even when no path prefix is given
                    host_paths = self.pasties.setdefault(host, set())
                    if path:
                        host_paths.add(path)

        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
        if os.path.exists(url_blocklist):
            with open(url_blocklist) as f:
                for line in f:
                    url = line.strip()
                    if not url:
                        # Skip blank lines instead of decoding an empty URL
                        continue
                    self.faup.decode(url)
                    url_decoded = self.faup.get()
                    host = url_decoded['host']
                    # if url_decoded.get('port', ''):
                    #     host = f'{host}:{url_decoded["port"]}'
                    path = url_decoded.get('resource_path', '')
                    url = f'{host}{path}'
                    query_string = url_decoded.get('query_string')
                    if query_string:
                        url = url + query_string
                    self.urls_blocklist.add(url)

    def send_to_crawler(self, url, obj_id):
        """Create a crawler task for ``url`` unless one was queued recently.

        A cache key with an 86400 second expiry records each queued URL, so
        the same URL is not sent again while that key exists.
        """
        cache_key = f'{self.module_name}:url:{url}'
        if not self.r_cache.exists(cache_key):
            self.r_cache.set(cache_key, int(time.time()))
            self.r_cache.expire(cache_key, 86400)
            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)

    def compute(self, message):
        """Queue ``message``'s URL for crawling when it points at a pastie.

        :param message: the URL to check; only the first whitespace-separated
            token is used, any trailing fields are ignored
        """
        # str.split() returns a list: keep the URL token, not the list itself
        tokens = message.split()
        if not tokens:
            return None
        url = tokens[0]

        self.faup.decode(url)
        url_decoded = self.faup.get()
        # print(url_decoded)
        url_host = url_decoded['host']
        # if url_decoded.get('port', ''):
        #     url_host = f'{url_host}:{url_decoded["port"]}'
        # Fall back to '' so the string methods below never see None
        path = url_decoded.get('resource_path') or ''

        if url_host not in self.pasties:
            return None

        # Blocklist entries are stored without scheme
        if url.startswith('http://'):
            url_no_scheme = url[7:]
        elif url.startswith('https://'):
            url_no_scheme = url[8:]
        else:
            url_no_scheme = url
        if url_no_scheme in self.urls_blocklist:
            return None

        host_paths = self.pasties[url_host]
        if not host_paths:
            # No prefix restriction: crawl anything except the site root
            if path and path != '/':
                print('send to crawler', url_host, url)
                self.send_to_crawler(url, self.obj.id)
            return None

        if path.endswith('/'):
            path_end = path[:-1]
        else:
            path_end = f'{path}/'
        for url_path in host_paths:
            if path.startswith(url_path):
                # Skip the prefix page itself, only crawl what is below it
                if url_path != path and url_path != path_end:
                    print('send to crawler', url_path, url)
                    self.send_to_crawler(url, self.obj.id)
                    break
        return None


if __name__ == '__main__':
    module = Pasties()
    module.run()

View file

@ -24,7 +24,6 @@ sys.path.append(os.environ['AIL_BIN'])
################################## ##################################
from modules.abstract_module import AbstractModule from modules.abstract_module import AbstractModule
from lib.objects import Pgps from lib.objects import Pgps
from lib.objects.Items import Item
from trackers.Tracker_Term import Tracker_Term from trackers.Tracker_Term import Tracker_Term
from trackers.Tracker_Regex import Tracker_Regex from trackers.Tracker_Regex import Tracker_Regex
from trackers.Tracker_Yara import Tracker_Yara from trackers.Tracker_Yara import Tracker_Yara
@ -61,7 +60,6 @@ class PgpDump(AbstractModule):
self.tracker_yara = Tracker_Yara(queue=False) self.tracker_yara = Tracker_Yara(queue=False)
# init # init
self.item_id = None
self.keys = set() self.keys = set()
self.private_keys = set() self.private_keys = set()
self.names = set() self.names = set()
@ -93,11 +91,11 @@ class PgpDump(AbstractModule):
print() print()
pgp_block = self.remove_html(pgp_block) pgp_block = self.remove_html(pgp_block)
# Remove Version # Remove Version
versions = self.regex_findall(self.reg_tool_version, self.item_id, pgp_block) versions = self.regex_findall(self.reg_tool_version, self.obj.id, pgp_block)
for version in versions: for version in versions:
pgp_block = pgp_block.replace(version, '') pgp_block = pgp_block.replace(version, '')
# Remove Comment # Remove Comment
comments = self.regex_findall(self.reg_block_comment, self.item_id, pgp_block) comments = self.regex_findall(self.reg_block_comment, self.obj.id, pgp_block)
for comment in comments: for comment in comments:
pgp_block = pgp_block.replace(comment, '') pgp_block = pgp_block.replace(comment, '')
# Remove Empty Lines # Remove Empty Lines
@ -130,7 +128,7 @@ class PgpDump(AbstractModule):
try: try:
output = output.decode() output = output.decode()
except UnicodeDecodeError: except UnicodeDecodeError:
self.logger.error(f'Error PgpDump UnicodeDecodeError: {self.item_id}') self.logger.error(f'Error PgpDump UnicodeDecodeError: {self.obj.id}')
output = '' output = ''
return output return output
@ -145,7 +143,7 @@ class PgpDump(AbstractModule):
private = True private = True
else: else:
private = False private = False
users = self.regex_findall(self.reg_user_id, self.item_id, pgpdump_output) users = self.regex_findall(self.reg_user_id, self.obj.id, pgpdump_output)
for user in users: for user in users:
# avoid key injection in user_id: # avoid key injection in user_id:
pgpdump_output.replace(user, '', 1) pgpdump_output.replace(user, '', 1)
@ -159,7 +157,7 @@ class PgpDump(AbstractModule):
name = user name = user
self.names.add(name) self.names.add(name)
keys = self.regex_findall(self.reg_key_id, self.item_id, pgpdump_output) keys = self.regex_findall(self.reg_key_id, self.obj.id, pgpdump_output)
for key_id in keys: for key_id in keys:
key_id = key_id.replace('Key ID - ', '', 1) key_id = key_id.replace('Key ID - ', '', 1)
if key_id != '0x0000000000000000': if key_id != '0x0000000000000000':
@ -171,28 +169,26 @@ class PgpDump(AbstractModule):
print('symmetrically encrypted') print('symmetrically encrypted')
def compute(self, message): def compute(self, message):
item = self.get_obj() content = self.obj.get_content()
self.item_id = item.get_id()
content = item.get_content()
pgp_blocks = [] pgp_blocks = []
# Public Block # Public Block
for pgp_block in self.regex_findall(self.reg_pgp_public_blocs, self.item_id, content): for pgp_block in self.regex_findall(self.reg_pgp_public_blocs, self.obj.id, content):
# content = content.replace(pgp_block, '') # content = content.replace(pgp_block, '')
pgp_block = self.sanitize_pgp_block(pgp_block) pgp_block = self.sanitize_pgp_block(pgp_block)
pgp_blocks.append(pgp_block) pgp_blocks.append(pgp_block)
# Private Block # Private Block
for pgp_block in self.regex_findall(self.reg_pgp_private_blocs, self.item_id, content): for pgp_block in self.regex_findall(self.reg_pgp_private_blocs, self.obj.id, content):
# content = content.replace(pgp_block, '') # content = content.replace(pgp_block, '')
pgp_block = self.sanitize_pgp_block(pgp_block) pgp_block = self.sanitize_pgp_block(pgp_block)
pgp_blocks.append(pgp_block) pgp_blocks.append(pgp_block)
# Signature # Signature
for pgp_block in self.regex_findall(self.reg_pgp_signature, self.item_id, content): for pgp_block in self.regex_findall(self.reg_pgp_signature, self.obj.id, content):
# content = content.replace(pgp_block, '') # content = content.replace(pgp_block, '')
pgp_block = self.sanitize_pgp_block(pgp_block) pgp_block = self.sanitize_pgp_block(pgp_block)
pgp_blocks.append(pgp_block) pgp_blocks.append(pgp_block)
# Message # Message
for pgp_block in self.regex_findall(self.reg_pgp_message, self.item_id, content): for pgp_block in self.regex_findall(self.reg_pgp_message, self.obj.id, content):
pgp_block = self.sanitize_pgp_block(pgp_block) pgp_block = self.sanitize_pgp_block(pgp_block)
pgp_blocks.append(pgp_block) pgp_blocks.append(pgp_block)
@ -206,26 +202,26 @@ class PgpDump(AbstractModule):
self.extract_id_from_pgpdump_output(pgpdump_output) self.extract_id_from_pgpdump_output(pgpdump_output)
if self.keys or self.names or self.mails: if self.keys or self.names or self.mails:
print(self.item_id) print(self.obj.id)
date = item.get_date() date = self.obj.get_date()
for key in self.keys: for key in self.keys:
pgp = Pgps.Pgp(key, 'key') pgp = Pgps.Pgp(key, 'key')
pgp.add(date, item) pgp.add(date, self.obj)
print(f' key: {key}') print(f' key: {key}')
for name in self.names: for name in self.names:
pgp = Pgps.Pgp(name, 'name') pgp = Pgps.Pgp(name, 'name')
pgp.add(date, item) pgp.add(date, self.obj)
print(f' name: {name}') print(f' name: {name}')
self.tracker_term.compute(name, obj_type='pgp', subtype='name') self.tracker_term.compute_manual(pgp)
self.tracker_regex.compute(name, obj_type='pgp', subtype='name') self.tracker_regex.compute_manual(pgp)
self.tracker_yara.compute(name, obj_type='pgp', subtype='name') self.tracker_yara.compute_manual(pgp)
for mail in self.mails: for mail in self.mails:
pgp = Pgps.Pgp(mail, 'mail') pgp = Pgps.Pgp(mail, 'mail')
pgp.add(date, item) pgp.add(date, self.obj)
print(f' mail: {mail}') print(f' mail: {mail}')
self.tracker_term.compute(mail, obj_type='pgp', subtype='mail') self.tracker_term.compute_manual(pgp)
self.tracker_regex.compute(mail, obj_type='pgp', subtype='mail') self.tracker_regex.compute_manual(pgp)
self.tracker_yara.compute(mail, obj_type='pgp', subtype='mail') self.tracker_yara.compute_manual(pgp)
# Keys extracted from PGP PRIVATE KEY BLOCK # Keys extracted from PGP PRIVATE KEY BLOCK
for key in self.private_keys: for key in self.private_keys:
@ -241,4 +237,3 @@ class PgpDump(AbstractModule):
if __name__ == '__main__': if __name__ == '__main__':
module = PgpDump() module = PgpDump()
module.run() module.run()

View file

@ -1,72 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Zerobins Module
======================
This module spots zerobins-like services for further processing
"""
##################################
# Import External packages
##################################
import os
import re
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib import crawlers
class Zerobins(AbstractModule):
    """
    Zerobins module for AIL framework

    Spots zerobin/privatebin-like URLs in the messages it receives and
    creates a crawler task for each of them.
    """

    def __init__(self):
        super(Zerobins, self).__init__()

        # The group is non-capturing on purpose: findall-style matching
        # returns the group's text instead of the whole URL when a pattern
        # contains a capturing group, and the matches below are used as URLs.
        # NOTE(review): assumes self.regex_findall follows re.findall
        # semantics - confirm against regex_helper.
        binz = [
            r'^https:\/\/(?:zerobin|privatebin)\..*$',  # historical ones
        ]

        self.regex = re.compile('|'.join(binz))

        # Pending time between two computations (computeNone) in seconds
        self.pending_seconds = 10

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def computeNone(self):
        """
        Compute when no message in queue
        """
        self.logger.debug("No message in queue")

    def compute(self, message):
        """
        Compute a message in queue

        :param message: URL to check against the zerobin-like patterns
        """
        url = message
        item = self.get_obj()

        # Extract zerobins addresses
        matching_binz = self.regex_findall(self.regex, item.get_id(), url)

        if matching_binz:
            for bin_url in matching_binz:
                print(f'send {bin_url} to crawler')
                # TODO Change priority ???
                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
                                     parent='manual', priority=60)

        self.logger.debug("Compute message in queue")
if __name__ == '__main__':
    # Script entry point: build the module and start its processing loop.
    Zerobins().run()

View file

@ -117,6 +117,9 @@ class AbstractModule(ABC):
def get_available_queues(self): def get_available_queues(self):
return self.queue.get_out_queues() return self.queue.get_out_queues()
def regex_match(self, regex, obj_id, content):
    """
    Run regex_helper.regex_match on `content` for this module.

    The call is made with this module's cache key (self.r_cache_key) and is
    given self.max_execution_time as its `max_time`.

    :param regex: regex forwarded to the helper
    :param obj_id: id of the object the content belongs to
    :param content: text forwarded to the helper
    :return: whatever regex_helper.regex_match returns
    """
    cache_key = self.r_cache_key
    timeout = self.max_execution_time
    return regex_helper.regex_match(cache_key, regex, obj_id, content, max_time=timeout)
def regex_search(self, regex, obj_id, content): def regex_search(self, regex, obj_id, content):
return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time) return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
@ -201,6 +204,10 @@ class AbstractModule(ABC):
""" """
pass pass
def compute_manual(self, obj, message=None):
    """
    Run compute() on an object supplied directly by the caller.

    :param obj: object stored in self.obj before compute() is called
    :param message: message forwarded to compute() (None by default)
    :return: the value returned by compute()
    """
    # compute() reads the current object from self.obj
    self.obj = obj
    result = self.compute(message)
    return result
def computeNone(self): def computeNone(self):
""" """
Method of the Module when there is no message Method of the Module when there is no message

View file

@ -41,6 +41,8 @@ class Tracker_Regex(AbstractModule):
self.tracked_regexs = Tracker.get_tracked_regexs() self.tracked_regexs = Tracker.get_tracked_regexs()
self.last_refresh = time.time() self.last_refresh = time.time()
self.obj = None
# Exporter # Exporter
self.exporters = {'mail': MailExporterTracker(), self.exporters = {'mail': MailExporterTracker(),
'webhook': WebHookExporterTracker()} 'webhook': WebHookExporterTracker()}
@ -66,12 +68,46 @@ class Tracker_Regex(AbstractModule):
content = obj.get_content() content = obj.get_content()
for dict_regex in self.tracked_regexs[obj_type]: for dict_regex in self.tracked_regexs[obj_type]:
matched = self.regex_findall(dict_regex['regex'], obj_id, content) matches = self.regex_finditer(dict_regex['regex'], obj_id, content)
if matched: if matches:
self.new_tracker_found(dict_regex['tracked'], 'regex', obj) self.new_tracker_found(dict_regex['tracked'], 'regex', obj, matches)
def extract_matches(self, re_matches, limit=500, lines=5):
    """
    Build, for each regex match, the matched value with its surrounding text.

    :param re_matches: iterable of indexable matches read as
                       (start, end, value), offsets into the object content
    :param limit: maximum number of characters of context taken on each side
    :param lines: line-count threshold above which the context is trimmed
    :return: list of (value, context) tuples, context being
             '<text before><value><text after>'
    """
    text = self.obj.get_content()
    text_len = len(text)
    extracted = []
    for re_match in re_matches:
        start = re_match[0]
        value = re_match[2]
        end = re_match[1]

        # Text before the match, at most `limit` characters
        before = text[max(start - limit, 0):start]
        before_lines = before.splitlines()
        if len(before_lines) > lines:
            # NOTE(review): keeps the last `lines - 1` lines only
            before = '\n'.join(before_lines[-lines + 1:])

        # Text after the match, at most `limit` characters
        after = text[end:min(end + limit, text_len)]
        after_lines = after.splitlines()
        if len(after_lines) > lines:
            # NOTE(review): keeps the first `lines + 1` lines
            after = '\n'.join(after_lines[:lines + 1])

        extracted.append((value, f'{before}{value}{after}'))
    return extracted
def new_tracker_found(self, tracker_name, tracker_type, obj, re_matches):
obj_id = obj.get_id() obj_id = obj.get_id()
matches = None
for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type(tracker_type, obj.get_type(), tracker_name): for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type(tracker_type, obj.get_type(), tracker_name):
tracker = Tracker.Tracker(tracker_uuid) tracker = Tracker.Tracker(tracker_uuid)
@ -92,8 +128,9 @@ class Tracker_Regex(AbstractModule):
obj.add_tag(tag) obj.add_tag(tag)
if tracker.mail_export(): if tracker.mail_export():
# TODO add matches + custom subjects if not matches:
self.exporters['mail'].export(tracker, obj) matches = self.extract_matches(re_matches)
self.exporters['mail'].export(tracker, obj, matches)
if tracker.webhook_export(): if tracker.webhook_export():
self.exporters['webhook'].export(tracker, obj) self.exporters['webhook'].export(tracker, obj)
@ -102,4 +139,3 @@ class Tracker_Regex(AbstractModule):
if __name__ == "__main__": if __name__ == "__main__":
module = Tracker_Regex() module = Tracker_Regex()
module.run() module.run()
# module.compute('submitted/2023/05/02/submitted_b1e518f1-703b-40f6-8238-d1c22888197e.gz')

View file

@ -73,8 +73,56 @@ class Tracker_Yara(AbstractModule):
print(f'{self.obj.get_id()}: yara scanning timed out') print(f'{self.obj.get_id()}: yara scanning timed out')
self.redis_logger.info(f'{self.obj.get_id()}: yara scanning timed out') self.redis_logger.info(f'{self.obj.get_id()}: yara scanning timed out')
def convert_byte_offset_to_string(self, b_content, offset):
    """
    Convert a byte offset in encoded content into a character offset.

    :param b_content: content encoded as bytes (default str.encode() codec)
    :param offset: byte offset into b_content
    :return: index, in the decoded string, of the character holding the byte
             at `offset`; the decoded length when `offset` is at or past the
             end of b_content; 0 when `offset` is 0 or negative
    """
    if offset <= 0:
        return 0
    # The character index equals the number of characters fully encoded
    # before `offset`. errors='ignore' drops a multi-byte sequence that the
    # slice cuts in half, so an offset inside a character maps to that
    # character instead of raising.
    return len(b_content[:offset].decode(errors='ignore'))
def extract_matches(self, data, limit=500, lines=5):
    """
    Build, for each matched string instance, the value with its surrounding text.

    :param data: match data; data.get('strings') is iterated and each entry's
                 `instances` provide `offset`, `matched_data` and
                 `matched_length` (byte based)
    :param limit: maximum number of characters of context taken on each side
    :param lines: line-count threshold above which the context is trimmed
    :return: list of (value, context) tuples, context being
             '<text before><value><text after>'
    """
    text = self.obj.get_content()
    text_len = len(text)
    raw = text.encode()
    extracted = []
    for rule_string in data.get('strings'):
        for instance in rule_string.instances:
            byte_start = instance.offset
            value = instance.matched_data.decode()
            byte_end = byte_start + instance.matched_length

            # Offsets are expressed in bytes: convert them to str offsets
            start = self.convert_byte_offset_to_string(raw, byte_start)
            end = self.convert_byte_offset_to_string(raw, byte_end)

            # Text before the match, at most `limit` characters
            before = text[max(start - limit, 0):start]
            before_lines = before.splitlines()
            if len(before_lines) > lines:
                # NOTE(review): keeps the last `lines - 1` lines only
                before = '\n'.join(before_lines[-lines + 1:])

            # Text after the match, at most `limit` characters
            after = text[end:min(end + limit, text_len)]
            after_lines = after.splitlines()
            if len(after_lines) > lines:
                # NOTE(review): keeps the first `lines + 1` lines
                after = '\n'.join(after_lines[:lines + 1])

            extracted.append((value, f'{before}{value}{after}'))
    return extracted
def yara_rules_match(self, data): def yara_rules_match(self, data):
tracker_name = data['namespace'] tracker_name = data['namespace']
matches = None
obj_id = self.obj.get_id() obj_id = self.obj.get_id()
for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type('yara', self.obj.get_type(), tracker_name): for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type('yara', self.obj.get_type(), tracker_name):
tracker = Tracker.Tracker(tracker_uuid) tracker = Tracker.Tracker(tracker_uuid)
@ -95,8 +143,9 @@ class Tracker_Yara(AbstractModule):
# Mails # Mails
if tracker.mail_export(): if tracker.mail_export():
# TODO add matches + custom subjects if not matches:
self.exporters['mail'].export(tracker, self.obj) matches = self.extract_matches(data)
self.exporters['mail'].export(tracker, self.obj, matches)
# Webhook # Webhook
if tracker.webhook_export(): if tracker.webhook_export():

View file

@ -158,8 +158,8 @@ publish = Importers,Tags
subscribe = Item subscribe = Item
publish = Tags publish = Tags
[Zerobins] #[Pasties]
subscribe = Url #subscribe = Url
#[Sync_module] #[Sync_module]
#publish = Sync #publish = Sync

View file

@ -68,7 +68,7 @@ pylibinjection>=0.2.4
phonenumbers>8.12.1 phonenumbers>8.12.1
# Web # Web
flask>=1.1.4 flask==2.3.3
flask-login flask-login
bcrypt>3.1.6 bcrypt>3.1.6

View file

@ -132,6 +132,10 @@
</div> </div>
</div> </div>
</div> </div>
<div class="custom-control custom-switch mt-1">
<input class="custom-control-input" type="checkbox" name="title_obj" id="title_obj" checked="">
<label class="custom-control-label" for="title_obj"><i class="fas fa-heading"></i>&nbsp;Title <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Title that has been extracted from an HTML page"></i></label>
</div>
{# <div class="custom-control custom-switch mt-1">#} {# <div class="custom-control custom-switch mt-1">#}
{# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#} {# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#}