diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 00c224e4..c15311e5 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -267,8 +267,8 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "LibInjection" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./LibInjection.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Zerobins" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Zerobins.py; read x"
-    sleep 0.1
+#    screen -S "Script_AIL" -X screen -t "Pasties" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Pasties.py; read x"
+#    sleep 0.1
     screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x"
     sleep 0.1
diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 3332299d..eb492207 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -22,6 +22,7 @@ from lib.objects.Domains import Domain
 from lib.objects.Items import Item
 from lib.objects import Screenshots
 from lib.objects import Titles
+from trackers.Tracker_Yara import Tracker_Yara
 
 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
 
@@ -35,6 +36,8 @@ class Crawler(AbstractModule):
         # Waiting time in seconds between to message processed
         self.pending_seconds = 1
 
+        self.tracker_yara = Tracker_Yara(queue=False)
+
         config_loader = ConfigLoader()
         self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@@ -284,6 +287,12 @@ class Crawler(AbstractModule):
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)
+                # Tracker
+                self.tracker_yara.compute_manual(title)
+                if not title.is_tags_safe():
+                    unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
+                    self.domain.add_tag(unsafe_tag)
+                    item.add_tag(unsafe_tag)
 
         # SCREENSHOT
         if self.screenshot:
diff --git a/bin/exporter/MailExporter.py b/bin/exporter/MailExporter.py
index c4d3f5b5..41074d7b 100755
--- a/bin/exporter/MailExporter.py
+++ b/bin/exporter/MailExporter.py
@@ -124,16 +124,27 @@ class MailExporterTracker(MailExporter):
     def __init__(self, host=None, port=None, password=None, user='', sender=''):
         super().__init__(host=host, port=port, password=password, user=user, sender=sender)
 
-    def export(self, tracker, obj):  # TODO match
+    def export(self, tracker, obj, matches=[]):
         tracker_type = tracker.get_type()
         tracker_name = tracker.get_tracked()
-        subject = f'AIL Framework Tracker: {tracker_name}'  # TODO custom subject
+        description = tracker.get_description()
+        if not description:
+            description = tracker_name
+
+        subject = f'AIL Framework Tracker: {description}'
         body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
         body += f'Item: {obj.id}\nurl:{obj.get_link()}'
 
-        # TODO match option
-        # if match:
-        #     body += f'Tracker Match:\n\n{escape(match)}'
+        if matches:
+            body += '\n'
+            nb = 1
+            for match in matches:
+                body += f'\nMatch {nb}: {match[0]}\nExtract:\n{match[1]}\n\n'
+                nb += 1
+        else:
+            body = f"AIL Framework, New occurrence for {tracker_type} tracker: {tracker_name}\n"
+            body += f'Item: {obj.id}\nurl:{obj.get_link()}'
+
         # print(body)
         for mail in tracker.get_mails():
             self._export(mail, subject, body)
diff --git a/bin/importer/feeders/Default.py b/bin/importer/feeders/Default.py
index f4313707..4200efa8 100755
--- a/bin/importer/feeders/Default.py
+++ b/bin/importer/feeders/Default.py
@@ -31,8 +31,12 @@ class DefaultFeeder:
         Return feeder name.
        first part of the item_id and display in the UI
         """
         if not self.name:
-            return self.get_source()
-        return self.name
+            name = self.get_source()
+        else:
+            name = self.name
+        if not name:
+            name = 'default'
+        return name
 
     def get_source(self):
         return self.json_data.get('source')
diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py
index 5be8f492..6ecd4b02 100755
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@@ -83,6 +83,7 @@ class ConfigLoader(object):
         else:
             return []
 
+
 # # # # Directory Config # # # #
 
 config_loader = ConfigLoader()
diff --git a/bin/lib/Tracker.py b/bin/lib/Tracker.py
index f1ea8905..9c4702ae 100755
--- a/bin/lib/Tracker.py
+++ b/bin/lib/Tracker.py
@@ -2,6 +2,8 @@
 # -*-coding:UTF-8 -*
 import json
 import os
+import logging
+import logging.config
 import re
 import sys
 import time
@@ -24,11 +26,16 @@ sys.path.append(os.environ['AIL_BIN'])
 ##################################
 from packages import Date
 from lib.ail_core import get_objects_tracked, get_object_all_subtypes, get_objects_retro_hunted
+from lib import ail_logger
 from lib import ConfigLoader
 from lib import item_basic
 from lib import Tag
 from lib.Users import User
 
+# LOGS
+logging.config.dictConfig(ail_logger.get_config(name='modules'))
+logger = logging.getLogger()
+
 config_loader = ConfigLoader.ConfigLoader()
 r_cache = config_loader.get_redis_conn("Redis_Cache")
@@ -248,7 +255,8 @@ class Tracker:
         return self._get_field('user_id')
 
     def webhook_export(self):
-        return r_tracker.hexists(f'tracker:{self.uuid}', 'webhook')
+        webhook = self.get_webhook()
+        return webhook is not None and webhook
 
     def get_webhook(self):
         return r_tracker.hget(f'tracker:{self.uuid}', 'webhook')
@@ -560,9 +568,7 @@ class Tracker:
             os.remove(filepath)
 
         # Filters
-        filters = self.get_filters()
-        if not filters:
-            filters = get_objects_tracked()
+        filters = get_objects_tracked()
         for obj_type in filters:
             r_tracker.srem(f'trackers:objs:{tracker_type}:{obj_type}', tracked)
             r_tracker.srem(f'trackers:uuid:{tracker_type}:{tracked}', f'{self.uuid}:{obj_type}')
@@ -923,7 +929,7 @@ def api_add_tracker(dict_input, user_id):
     # Filters # TODO MOVE ME
     filters = dict_input.get('filters', {})
     if filters:
-        if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
+        if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
             filters = {}
         for obj_type in filters:
             if obj_type not in get_objects_tracked():
@@ -998,7 +1004,7 @@ def api_edit_tracker(dict_input, user_id):
     # Filters # TODO MOVE ME
     filters = dict_input.get('filters', {})
     if filters:
-        if filters.keys() == {'decoded', 'item', 'pgp'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
+        if filters.keys() == {'decoded', 'item', 'pgp', 'title'} and set(filters['pgp'].get('subtypes', [])) == {'mail', 'name'}:
             if not filters['decoded'] and not filters['item']:
                 filters = {}
         for obj_type in filters:
@@ -1151,7 +1157,11 @@ def get_tracked_yara_rules():
     for obj_type in get_objects_tracked():
         rules = {}
         for tracked in _get_tracked_by_obj_type('yara', obj_type):
-            rules[tracked] = os.path.join(get_yara_rules_dir(), tracked)
+            rule = os.path.join(get_yara_rules_dir(), tracked)
+            if not os.path.exists(rule):
+                logger.critical(f"Yara rule doesn't exist {tracked} : {obj_type}")
+            else:
+                rules[tracked] = rule
         to_track[obj_type] = yara.compile(filepaths=rules)
     print(to_track)
     return to_track
diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py
index 9eaaca97..5e15eabf 100755
--- a/bin/lib/ail_core.py
+++ b/bin/lib/ail_core.py
@@ -52,7 +52,7 @@ def get_object_all_subtypes(obj_type):
     return []
 
 def get_objects_tracked():
-    return ['decoded', 'item', 'pgp']
+    return ['decoded', 'item', 'pgp', 'title']
 
 def get_objects_retro_hunted():
     return ['decoded', 'item']
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 6387c76f..3484afa0 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -234,7 +234,9 @@ def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
-        return str(title.string)
+        title = title.string
+        if title:
+            return str(title)
     return ''
 
 def extract_description_from_html(html):
@@ -1690,6 +1692,19 @@ def api_add_crawler_task(data, user_id=None):
             return {'error': 'The access to this cookiejar is restricted'}, 403
         cookiejar_uuid = cookiejar.uuid
 
+    cookies = data.get('cookies', None)
+    if not cookiejar_uuid and cookies:
+        # Create new cookiejar
+        cookiejar_uuid = create_cookiejar(user_id, "single-shot cookiejar", 1, None)
+        cookiejar = Cookiejar(cookiejar_uuid)
+        for cookie in cookies:
+            try:
+                name = cookie.get('name')
+                value = cookie.get('value')
+                cookiejar.add_cookie(name, value, None, None, None, None, None)
+            except KeyError:
+                return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
+
     frequency = data.get('frequency', None)
     if frequency:
         if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
@@ -2010,7 +2025,7 @@ def test_ail_crawlers():
     # TODO MOVE ME IN CRAWLER OR FLASK
     load_blacklist()
 
-if __name__ == '__main__':
+# if __name__ == '__main__':
     # delete_captures()
 
     # item_id = 'crawled/2023/02/20/data.gz'
@@ -2022,4 +2037,4 @@ if __name__ == '__main__':
     # _reprocess_all_hars_cookie_name()
     # _reprocess_all_hars_etag()
     # _gzip_all_hars()
-    _reprocess_all_hars_hhhashs()
+    # _reprocess_all_hars_hhhashs()
diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py
index 71fa5378..25420106 100755
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@@ -204,15 +204,22 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt
     if not l_sources_name:
         l_sources_name = set()
     if source_name:
-        l_dir = os.listdir(os.path.join(directory, source_name))
+        path = os.path.join(directory, source_name)
+        if os.path.isdir(path):
+            l_dir = os.listdir(os.path.join(directory, source_name))
+        else:
+            l_dir = []
     else:
         l_dir = os.listdir(directory)
     # empty directory
     if not l_dir:
-        return l_sources_name.add(source_name)
+        if source_name:
+            return l_sources_name.add(source_name)
+        else:
+            return l_sources_name
     else:
         for src_name in l_dir:
-            if len(src_name) == 4:
+            if len(src_name) == 4 and source_name:
                 # try: int(src_name)
                 to_add = os.path.join(source_name)
diff --git a/bin/lib/objects/CookiesNames.py b/bin/lib/objects/CookiesNames.py
index b064892b..df9e44ad 100755
--- a/bin/lib/objects/CookiesNames.py
+++ b/bin/lib/objects/CookiesNames.py
@@ -85,9 +85,6 @@ class CookieName(AbstractDaterangeObject):
         meta['content'] = self.get_content()
         return meta
 
-    def add(self, date, obj_id):  # date = HAR Date
-        self._add(date, 'domain', '', obj_id)
-
     def create(self, content, _first_seen=None, _last_seen=None):
         if not isinstance(content, str):
             content = content.decode()
diff --git a/bin/lib/objects/Cves.py b/bin/lib/objects/Cves.py
index 02361636..f8cb997b 100755
--- a/bin/lib/objects/Cves.py
+++ b/bin/lib/objects/Cves.py
@@ -79,9 +79,6 @@ class Cve(AbstractDaterangeObject):
         meta['tags'] = self.get_tags(r_list=True)
         return meta
 
-    def add(self, date, item_id):
-        self._add(date, 'item', '', item_id)
-
     def get_cve_search(self):
         try:
             response = requests.get(f'https://cvepremium.circl.lu/api/cve/{self.id}', timeout=10)
diff --git a/bin/lib/objects/Decodeds.py b/bin/lib/objects/Decodeds.py
index fb194be1..085568c3 100755
--- a/bin/lib/objects/Decodeds.py
+++ b/bin/lib/objects/Decodeds.py
@@ -239,8 +239,8 @@ class Decoded(AbstractDaterangeObject):
 
         return True
 
-    def add(self, algo_name, date, obj_id, mimetype=None):
-        self._add(date, 'item', '', obj_id)
+    def add(self, date, obj, algo_name, mimetype=None):
+        self._add(date, obj)
 
         if not mimetype:
             mimetype = self.get_mimetype()
@@ -460,7 +460,7 @@ def get_all_decodeds_objects(filters={}):
 ############################################################################
 
 def sanityze_decoder_names(decoder_name):
-    if decoder_name not in Decodeds.get_algos():
+    if decoder_name not in get_algos():
         return None
     else:
         return decoder_name
diff --git a/bin/lib/objects/Etags.py b/bin/lib/objects/Etags.py
index eb41f68c..16b90573 100755
--- a/bin/lib/objects/Etags.py
+++ b/bin/lib/objects/Etags.py
@@ -85,9 +85,6 @@ class Etag(AbstractDaterangeObject):
         meta['content'] = self.get_content()
         return meta
 
-    def add(self, date, obj_id):  # date = HAR Date
-        self._add(date, 'domain', '', obj_id)
-
     def create(self, content, _first_seen=None, _last_seen=None):
         if not isinstance(content, str):
             content = content.decode()
diff --git a/bin/lib/objects/Favicons.py b/bin/lib/objects/Favicons.py
index 68452b65..4acdec31 100755
--- a/bin/lib/objects/Favicons.py
+++ b/bin/lib/objects/Favicons.py
@@ -86,9 +86,6 @@ class Favicon(AbstractDaterangeObject):
     # def get_links(self):
     #     # TODO GET ALL URLS FROM CORRELATED ITEMS
 
-    def add(self, date, obj_id):  # TODO correlation base 64 -> calc md5
-        self._add(date, 'domain', '', obj_id)
-
     def create(self, content, _first_seen=None, _last_seen=None):
         if not isinstance(content, str):
             content = content.decode()
diff --git a/bin/lib/objects/HHHashs.py b/bin/lib/objects/HHHashs.py
index 021ac451..836b3e1e 100755
--- a/bin/lib/objects/HHHashs.py
+++ b/bin/lib/objects/HHHashs.py
@@ -86,9 +86,6 @@ class HHHash(AbstractDaterangeObject):
         meta['content'] = self.get_content()
         return meta
 
-    def add(self, date, obj_id):  # date = HAR Date
-        self._add(date, 'domain', '', obj_id)
-
     def create(self, hhhash_header, _first_seen=None, _last_seen=None):  # TODO CREATE ADD FUNCTION -> urls set
         self._set_field('content', hhhash_header)
         self._create()
diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py
index b724f854..2f1ef5de 100755
--- a/bin/lib/objects/Messages.py
+++ b/bin/lib/objects/Messages.py
@@ -175,7 +175,7 @@ class Message(AbstractObject):
         if options is None:
             options = set()
         meta = self.get_default_meta(tags=True)
-        meta['date'] = self.get_date()  # TODO replace me by timestamp ??????
+        meta['date'] = self.get_date()
         meta['source'] = self.get_source()
         # optional meta fields
         if 'content' in options:
diff --git a/bin/lib/objects/Titles.py b/bin/lib/objects/Titles.py
index 9f88426c..f9e0064b 100755
--- a/bin/lib/objects/Titles.py
+++ b/bin/lib/objects/Titles.py
@@ -45,6 +45,8 @@ class Title(AbstractDaterangeObject):
     def get_content(self, r_type='str'):
         if r_type == 'str':
             return self._get_field('content')
+        elif r_type == 'bytes':
+            return self._get_field('content').encode()
 
     def get_link(self, flask_context=False):
         if flask_context:
@@ -82,9 +84,6 @@ class Title(AbstractDaterangeObject):
         meta['content'] = self.get_content()
         return meta
 
-    def add(self, date, item_id):
-        self._add(date, 'item', '', item_id)
-
     def create(self, content, _first_seen=None, _last_seen=None):
         self._set_field('content', content)
         self._create()
@@ -122,4 +121,3 @@ class Titles(AbstractDaterangeObjects):
 #     # print(r)
 #     r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False)
 #     print(r)
-
diff --git a/bin/lib/objects/abstract_daterange_object.py b/bin/lib/objects/abstract_daterange_object.py
index 98aa49c2..f2e86c57 100755
--- a/bin/lib/objects/abstract_daterange_object.py
+++ b/bin/lib/objects/abstract_daterange_object.py
@@ -125,9 +125,7 @@ class AbstractDaterangeObject(AbstractObject, ABC):
     def _add_create(self):
         r_object.sadd(f'{self.type}:all', self.id)
 
-    # TODO don't increase nb if same hash in item with different encoding
-    # if hash already in item
-    def _add(self, date, obj_type, subtype, obj_id):
+    def _add(self, date, obj):
         if not self.exists():
             self._add_create()
             self.set_first_seen(date)
@@ -136,26 +134,22 @@ class AbstractDaterangeObject(AbstractObject, ABC):
             self.update_daterange(date)
         update_obj_date(date, self.type)
 
-        if obj_type == 'item':
-            # NB Object seen by day TODO
-            if not self.is_correlated(obj_type, subtype, obj_id):  # nb seen by day
-                r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
-
+        if obj:
             # Correlations
-            self.add_correlation(obj_type, subtype, obj_id)
+            self.add_correlation(obj.type, obj.get_subtype(r_str=True), obj.get_id())
 
-            if is_crawled(obj_id):  # Domain
-                domain = get_item_domain(obj_id)
-                self.add_correlation('domain', '', domain)
-        else:
-            # Correlations
-            self.add_correlation(obj_type, subtype, obj_id)
-
-        # TODO Don't increase on reprocess
+            # Stats NB by day: # TODO Don't increase on reprocess
             r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
-        # r_object.zincrby(f'{self.type}:obj:{obj_type}', 1, self.id)
-        # 1 Domain by day / 1 HAR by day
-        # Domain check / file created -> issue with scheduler
+
+            if obj.type == 'item':
+                item_id = obj.get_id()
+                # domain
+                if is_crawled(item_id):
+                    domain = get_item_domain(item_id)
+                    self.add_correlation('domain', '', domain)
+
+    def add(self, date, obj):
+        self._add(date, obj)
 
     # TODO:ADD objects + Stats
     def _create(self, first_seen=None, last_seen=None):
diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py
index 41ba4e98..6f877823 100755
--- a/bin/lib/regex_helper.py
+++ b/bin/lib/regex_helper.py
@@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
         proc.terminate()
         sys.exit(0)
 
+def _regex_match(r_key, regex, content):
+    if re.match(regex, content):
+        r_serv_cache.set(r_key, 1)
+        r_serv_cache.expire(r_key, 360)
+
+def regex_match(r_key, regex, item_id, content, max_time=30):
+    proc = Proc(target=_regex_match, args=(r_key, regex, content))
+    try:
+        proc.start()
+        proc.join(max_time)
+        if proc.is_alive():
+            proc.terminate()
+            # Statistics.incr_module_timeout_statistic(r_key)
+            err_mess = f"{r_key}: processing timeout: {item_id}"
+            logger.info(err_mess)
+            return False
+        else:
+            if r_serv_cache.exists(r_key):
+                r_serv_cache.delete(r_key)
+                return True
+            else:
+                r_serv_cache.delete(r_key)
+                return False
+    except KeyboardInterrupt:
+        print("Caught KeyboardInterrupt, terminating regex worker")
+        proc.terminate()
+        sys.exit(0)
+
 def _regex_search(r_key, regex, content):
     if re.search(regex, content):
         r_serv_cache.set(r_key, 1)
diff --git a/bin/modules/CveModule.py b/bin/modules/CveModule.py
index 55fa0c91..6904ee28 100755
--- a/bin/modules/CveModule.py
+++ b/bin/modules/CveModule.py
@@ -54,7 +54,7 @@ class CveModule(AbstractModule):
         date = item.get_date()
         for cve_id in cves:
             cve = Cves.Cve(cve_id)
-            cve.add(date, item_id)
+            cve.add(date, item)
 
         warning = f'{item_id} contains CVEs {cves}'
         print(warning)
diff --git a/bin/modules/Decoder.py b/bin/modules/Decoder.py
index a8ba07af..36875ca6 100755
--- a/bin/modules/Decoder.py
+++ b/bin/modules/Decoder.py
@@ -21,7 +21,6 @@ sys.path.append(os.environ['AIL_BIN'])
 ##################################
 from modules.abstract_module import AbstractModule
 from lib.ConfigLoader import ConfigLoader
-from lib.objects.Items import Item
 from lib.objects.Decodeds import Decoded
 from trackers.Tracker_Term import Tracker_Term
 from trackers.Tracker_Regex import Tracker_Regex
@@ -87,17 +86,16 @@ class Decoder(AbstractModule):
         self.logger.info(f'Module {self.module_name} initialized')
 
     def compute(self, message):
-        item = self.get_obj()
-        content = item.get_content()
-        date = item.get_date()
+        content = self.obj.get_content()
+        date = self.obj.get_date()
 
         new_decodeds = []
 
         for decoder in self.decoder_order:
             find = False
             dname = decoder['name']
 
-            encodeds = self.regex_findall(decoder['regex'], item.id, content)
-            # PERF remove encoded from item content
+            encodeds = self.regex_findall(decoder['regex'], self.obj.id, content)
+            # PERF remove encoded from obj content
             for encoded in encodeds:
                 content = content.replace(encoded, '', 1)
             encodeds = set(encodeds)
@@ -113,19 +111,19 @@ class Decoder(AbstractModule):
                 if not decoded.exists():
                     mimetype = decoded.guess_mimetype(decoded_file)
                     if not mimetype:
-                        print(sha1_string, item.id)
-                        raise Exception(f'Invalid mimetype: {decoded.id} {item.id}')
+                        print(sha1_string, self.obj.id)
+                        raise Exception(f'Invalid mimetype: {decoded.id} {self.obj.id}')
                     decoded.save_file(decoded_file, mimetype)
                     new_decodeds.append(decoded.id)
                 else:
                     mimetype = decoded.get_mimetype()
-                decoded.add(dname, date, item.id, mimetype=mimetype)
+                decoded.add(date, self.obj, dname, mimetype=mimetype)
                 #     new_decodeds.append(decoded.id)
-                self.logger.info(f'{item.id} : {dname} - {decoded.id} - {mimetype}')
+                self.logger.info(f'{self.obj.id} : {dname} - {decoded.id} - {mimetype}')
 
             if find:
-                self.logger.info(f'{item.id} - {dname}')
+                self.logger.info(f'{self.obj.id} - {dname}')
 
                 # Send to Tags
                 tag = f'infoleak:automatic-detection="{dname}"'
@@ -134,12 +132,13 @@ class Decoder(AbstractModule):
         ####################
         # TRACKERS DECODED
         for decoded_id in new_decodeds:
+            decoded = Decoded(decoded_id)
             try:
-                self.tracker_term.compute(decoded_id, obj_type='decoded')
-                self.tracker_regex.compute(decoded_id, obj_type='decoded')
+                self.tracker_term.compute_manual(decoded)
+                self.tracker_regex.compute_manual(decoded)
             except UnicodeDecodeError:
                 pass
-            self.tracker_yara.compute(decoded_id, obj_type='decoded')
+            self.tracker_yara.compute_manual(decoded)
 
 
 if __name__ == '__main__':
diff --git a/bin/modules/Pasties.py b/bin/modules/Pasties.py
new file mode 100755
index 00000000..3d420c84
--- /dev/null
+++ b/bin/modules/Pasties.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The Pasties Module
+======================
+This module spots domain-pasties services for further processing
+"""
+
+##################################
+# Import External packages
+##################################
+import os
+import sys
+import time
+
+from pyfaup.faup import Faup
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+from lib import crawlers
+
+# TODO add url validator
+
+pasties_blocklist_urls = set()
+pasties_domains = {}
+
+class Pasties(AbstractModule):
+    """
+    Pasties module for AIL framework
+    """
+
+    def __init__(self):
+        super(Pasties, self).__init__()
+        self.faup = Faup()
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.pasties = {}
+        self.urls_blocklist = set()
+        self.load_pasties_domains()
+
+        # Send module state to logs
+        self.logger.info(f'Module {self.module_name} initialized')
+
+    def load_pasties_domains(self):
+        self.pasties = {}
+        self.urls_blocklist = set()
+
+        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
+        if os.path.exists(domains_pasties):
+            with open(domains_pasties) as f:
+                for line in f:
+                    url = line.strip()
+                    if url:  # TODO validate line
+                        self.faup.decode(url)
+                        url_decoded = self.faup.get()
+                        host = url_decoded['host']
+                        # if url_decoded.get('port', ''):
+                        #     host = f'{host}:{url_decoded["port"]}'
+                        path = url_decoded.get('resource_path', '')
+                        # print(url_decoded)
+                        if path and path != '/':
+                            if path[-1] != '/':
+                                path = f'{path}/'
+                        else:
+                            path = None
+
+                        if host in self.pasties:
+                            if path:
+                                self.pasties[host].add(path)
+                        else:
+                            if path:
+                                self.pasties[host] = {path}
+                            else:
+                                self.pasties[host] = set()
+
+        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
+        if os.path.exists(url_blocklist):
+            with open(url_blocklist) as f:
+                for line in f:
+                    url = line.strip()
+                    self.faup.decode(url)
+                    url_decoded = self.faup.get()
+                    host = url_decoded['host']
+                    # if url_decoded.get('port', ''):
+                    #     host = f'{host}:{url_decoded["port"]}'
+                    path = url_decoded.get('resource_path', '')
+                    url = f'{host}{path}'
+                    if url_decoded['query_string']:
+                        url = url + url_decoded['query_string']
+                    self.urls_blocklist.add(url)
+
+    def send_to_crawler(self, url, obj_id):
+        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
+            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
+            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)
+            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)
+
+    def compute(self, message):
+        url = message.split()[0]
+
+        self.faup.decode(url)
+        url_decoded = self.faup.get()
+        # print(url_decoded)
+        url_host = url_decoded['host']
+        # if url_decoded.get('port', ''):
+        #     url_host = f'{url_host}:{url_decoded["port"]}'
+        path = url_decoded.get('resource_path', '')
+        if url_host in self.pasties:
+            if url.startswith('http://'):
+                if url[7:] in self.urls_blocklist:
+                    return None
+            elif url.startswith('https://'):
+                if url[8:] in self.urls_blocklist:
+                    return None
+            else:
+                if url in self.urls_blocklist:
+                    return None
+
+            if not self.pasties[url_host]:
+                if path and path != '/':
+                    print('send to crawler', url_host, url)
+                    self.send_to_crawler(url, self.obj.id)
+            else:
+                if path.endswith('/'):
+                    path_end = path[:-1]
+                else:
+                    path_end = f'{path}/'
+                for url_path in self.pasties[url_host]:
+                    if path.startswith(url_path):
+                        if url_path != path and url_path != path_end:
+                            print('send to crawler', url_path, url)
+                            self.send_to_crawler(url, self.obj.id)
+                        break
+
+
+if __name__ == '__main__':
+    module = Pasties()
+    module.run()
diff --git a/bin/modules/PgpDump.py b/bin/modules/PgpDump.py
index 82ec9f32..c309e729 100755
--- a/bin/modules/PgpDump.py
+++ b/bin/modules/PgpDump.py
@@ -24,7 +24,6 @@ sys.path.append(os.environ['AIL_BIN'])
 ##################################
 from modules.abstract_module import AbstractModule
 from lib.objects import Pgps
-from lib.objects.Items import Item
 from trackers.Tracker_Term import Tracker_Term
 from trackers.Tracker_Regex import Tracker_Regex
 from trackers.Tracker_Yara import Tracker_Yara
@@ -61,7 +60,6 @@ class PgpDump(AbstractModule):
         self.tracker_yara = Tracker_Yara(queue=False)
 
         # init
-        self.item_id = None
         self.keys = set()
         self.private_keys = set()
         self.names = set()
@@ -93,11 +91,11 @@ class PgpDump(AbstractModule):
             print()
         pgp_block = self.remove_html(pgp_block)
         # Remove Version
-        versions = self.regex_findall(self.reg_tool_version, self.item_id, pgp_block)
+        versions = self.regex_findall(self.reg_tool_version, self.obj.id, pgp_block)
         for version in versions:
             pgp_block = pgp_block.replace(version, '')
         # Remove Comment
-        comments = self.regex_findall(self.reg_block_comment, self.item_id, pgp_block)
+        comments = self.regex_findall(self.reg_block_comment, self.obj.id, pgp_block)
         for comment in comments:
             pgp_block = pgp_block.replace(comment, '')
         # Remove Empty Lines
@@ -130,7 +128,7 @@ class PgpDump(AbstractModule):
         try:
             output = output.decode()
         except UnicodeDecodeError:
-            self.logger.error(f'Error PgpDump UnicodeDecodeError: {self.item_id}')
+            self.logger.error(f'Error PgpDump UnicodeDecodeError: {self.obj.id}')
             output = ''
         return output
 
@@ -145,7 +143,7 @@ class PgpDump(AbstractModule):
             private = True
         else:
             private = False
-        users = self.regex_findall(self.reg_user_id, self.item_id, pgpdump_output)
+        users = self.regex_findall(self.reg_user_id, self.obj.id, pgpdump_output)
         for user in users:
             # avoid key injection in user_id:
             pgpdump_output.replace(user, '', 1)
@@ -159,7 +157,7 @@ class PgpDump(AbstractModule):
                 name = user
             self.names.add(name)
 
-        keys = self.regex_findall(self.reg_key_id, self.item_id, pgpdump_output)
+        keys = self.regex_findall(self.reg_key_id, self.obj.id, pgpdump_output)
         for key_id in keys:
             key_id = key_id.replace('Key ID - ', '', 1)
             if key_id != '0x0000000000000000':
@@ -171,28 +169,26 @@ class PgpDump(AbstractModule):
             print('symmetrically encrypted')
 
     def compute(self, message):
-        item = self.get_obj()
-        self.item_id = item.get_id()
-        content = item.get_content()
+        content = self.obj.get_content()
 
         pgp_blocks = []
         # Public Block
-        for pgp_block in self.regex_findall(self.reg_pgp_public_blocs, self.item_id, content):
+        for pgp_block in self.regex_findall(self.reg_pgp_public_blocs, self.obj.id, content):
             # content = content.replace(pgp_block, '')
             pgp_block = self.sanitize_pgp_block(pgp_block)
             pgp_blocks.append(pgp_block)
         # Private Block
-        for pgp_block in self.regex_findall(self.reg_pgp_private_blocs, self.item_id, content):
+        for pgp_block in self.regex_findall(self.reg_pgp_private_blocs, self.obj.id, content):
             # content = content.replace(pgp_block, '')
             pgp_block = self.sanitize_pgp_block(pgp_block)
             pgp_blocks.append(pgp_block)
         # Signature
-        for pgp_block in self.regex_findall(self.reg_pgp_signature, self.item_id, content):
+        for pgp_block in self.regex_findall(self.reg_pgp_signature, self.obj.id, content):
             # content = content.replace(pgp_block, '')
             pgp_block = self.sanitize_pgp_block(pgp_block)
             pgp_blocks.append(pgp_block)
         # Message
-        for pgp_block in self.regex_findall(self.reg_pgp_message, self.item_id, content):
+        for pgp_block in self.regex_findall(self.reg_pgp_message, self.obj.id, content):
             pgp_block = self.sanitize_pgp_block(pgp_block)
             pgp_blocks.append(pgp_block)
@@ -206,26 +202,26 @@ class PgpDump(AbstractModule):
             self.extract_id_from_pgpdump_output(pgpdump_output)
 
         if self.keys or self.names or self.mails:
-            print(self.item_id)
-            date = item.get_date()
+            print(self.obj.id)
+            date = self.obj.get_date()
             for key in self.keys:
                 pgp = Pgps.Pgp(key, 'key')
-                pgp.add(date, item)
+                pgp.add(date, self.obj)
                 print(f'    key: {key}')
             for name in self.names:
                 pgp = Pgps.Pgp(name, 'name')
-                pgp.add(date, item)
+                pgp.add(date, self.obj)
                 print(f'    name: {name}')
-                self.tracker_term.compute(name, obj_type='pgp', subtype='name')
-                self.tracker_regex.compute(name, obj_type='pgp', subtype='name')
-                self.tracker_yara.compute(name, obj_type='pgp', subtype='name')
+                self.tracker_term.compute_manual(pgp)
+                self.tracker_regex.compute_manual(pgp)
+                self.tracker_yara.compute_manual(pgp)
             for mail in self.mails:
                 pgp = Pgps.Pgp(mail, 'mail')
-                pgp.add(date, item)
+                pgp.add(date, self.obj)
                 print(f'    mail: {mail}')
-                self.tracker_term.compute(mail, obj_type='pgp', subtype='mail')
-                self.tracker_regex.compute(mail, obj_type='pgp', subtype='mail')
-                self.tracker_yara.compute(mail, obj_type='pgp', subtype='mail')
+                self.tracker_term.compute_manual(pgp)
+                self.tracker_regex.compute_manual(pgp)
+                self.tracker_yara.compute_manual(pgp)
 
         # Keys extracted from PGP PRIVATE KEY BLOCK
         for key in self.private_keys:
@@ -241,4 +237,3 @@ class PgpDump(AbstractModule):
 if __name__ == '__main__':
     module = PgpDump()
     module.run()
-
diff --git a/bin/modules/Zerobins.py b/bin/modules/Zerobins.py
deleted file mode 100755
index f81bf9f8..00000000
--- a/bin/modules/Zerobins.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-"""
-The Zerobins Module
-======================
-This module spots zerobins-like services for further processing
-"""
-
-##################################
-# Import External packages
-##################################
-import os
-import re
-import sys
-
-sys.path.append(os.environ['AIL_BIN'])
-##################################
-# Import Project packages
-##################################
-from modules.abstract_module import AbstractModule
-from lib import crawlers
-
-
-class Zerobins(AbstractModule):
-    """
-    Zerobins module for AIL framework
-    """
-
-    def __init__(self):
-        super(Zerobins, self).__init__()
-
-        binz = [
-            r'^https:\/\/(zerobin||privatebin)\..*$',  # historical ones
-        ]
-
-        self.regex = re.compile('|'.join(binz))
-
-        # Pending time between two computation (computeNone) in seconds
-        self.pending_seconds = 10
-
-        # Send module state to logs
-        self.logger.info(f'Module {self.module_name} initialized')
-
-    def computeNone(self):
-        """
-        Compute when no message in queue
-        """
-        self.logger.debug("No message in queue")
-
-    def compute(self, message):
-        """
-        Compute a message in queue
-        """
-        url = message
-        item = self.get_obj()
-
-        # Extract zerobins addresses
-        matching_binz = self.regex_findall(self.regex, item.get_id(), url)
-
-        if len(matching_binz) > 0:
-            for bin_url in matching_binz:
-                print(f'send {bin_url} to crawler')
-                # TODO Change priority ???
-                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
-                                     parent='manual', priority=60)
-
-        self.logger.debug("Compute message in queue")
-
-
-if __name__ == '__main__':
-    module = Zerobins()
-    module.run()
diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py
index ed2fe7d3..c22fa568 100644
--- a/bin/modules/abstract_module.py
+++ b/bin/modules/abstract_module.py
@@ -117,6 +117,9 @@ class AbstractModule(ABC):
     def get_available_queues(self):
         return self.queue.get_out_queues()
 
+    def regex_match(self, regex, obj_id, content):
+        return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
+
     def regex_search(self, regex, obj_id, content):
         return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
 
@@ -201,6 +204,10 @@ class AbstractModule(ABC):
         """
         pass
 
+    def compute_manual(self, obj, message=None):
+        self.obj = obj
+        return self.compute(message)
+
     def computeNone(self):
         """
         Method of the Module when there is no message
diff --git a/bin/trackers/Tracker_Regex.py b/bin/trackers/Tracker_Regex.py
index bf756f49..43fa5764 100755
--- a/bin/trackers/Tracker_Regex.py
+++ b/bin/trackers/Tracker_Regex.py
@@ -41,6 +41,8 @@ class Tracker_Regex(AbstractModule):
         self.tracked_regexs = Tracker.get_tracked_regexs()
         self.last_refresh = time.time()
 
+        self.obj = None
+
         # Exporter
         self.exporters = {'mail': MailExporterTracker(),
                           'webhook': WebHookExporterTracker()}
@@ -66,12 +68,46 @@ class Tracker_Regex(AbstractModule):
         content = obj.get_content()
 
         for dict_regex in self.tracked_regexs[obj_type]:
-            matched = self.regex_findall(dict_regex['regex'], obj_id, content)
-            if matched:
-                self.new_tracker_found(dict_regex['tracked'], 'regex', obj)
+            matches = self.regex_finditer(dict_regex['regex'], obj_id, content)
+            if matches:
+                self.new_tracker_found(dict_regex['tracked'], 'regex', obj, matches)
 
-    def new_tracker_found(self, tracker_name, tracker_type, obj):
+    def extract_matches(self, re_matches, limit=500, lines=5):
+        matches = []
+        content = self.obj.get_content()
+        l_content = len(content)
+        for match in re_matches:
+            start = match[0]
+            value = match[2]
+            end = match[1]
+
+            # Start
+            if start > limit:
+                i_start = start - limit
+            else:
+                i_start = 0
+            str_start = content[i_start:start].splitlines()
+            if len(str_start) > lines:
+                str_start = '\n'.join(str_start[-lines + 1:])
+            else:
+                str_start = content[i_start:start]
+
+            # End
+            if end + limit > l_content:
+                i_end = l_content
+            else:
+                i_end = end + limit
+            str_end = content[end:i_end].splitlines()
+            if len(str_end) > lines:
+                str_end = '\n'.join(str_end[:lines + 1])
+            else:
+                str_end = content[end:i_end]
+            matches.append((value, f'{str_start}{value}{str_end}'))
+        return matches
+
+    def new_tracker_found(self, tracker_name, tracker_type, obj, re_matches):
         obj_id = obj.get_id()
+        matches = None
         for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type(tracker_type, obj.get_type(), tracker_name):
             tracker = Tracker.Tracker(tracker_uuid)
@@ -92,8 +128,9 @@ class Tracker_Regex(AbstractModule):
                 obj.add_tag(tag)
 
             if tracker.mail_export():
-                # TODO add matches + custom subjects
-                self.exporters['mail'].export(tracker, obj)
+                if not matches:
+                    matches = self.extract_matches(re_matches)
+                self.exporters['mail'].export(tracker, obj, matches)
 
             if tracker.webhook_export():
                 self.exporters['webhook'].export(tracker, obj)
@@ -102,4 +139,3 @@ class Tracker_Regex(AbstractModule):
 if __name__ == "__main__":
     module = Tracker_Regex()
     module.run()
-    # module.compute('submitted/2023/05/02/submitted_b1e518f1-703b-40f6-8238-d1c22888197e.gz')
diff --git a/bin/trackers/Tracker_Yara.py b/bin/trackers/Tracker_Yara.py
index 7bf13dfd..344c77a7 100755
--- a/bin/trackers/Tracker_Yara.py
+++ b/bin/trackers/Tracker_Yara.py
@@ -73,8 +73,56 @@ class Tracker_Yara(AbstractModule):
             print(f'{self.obj.get_id()}: yara scanning timed out')
             self.redis_logger.info(f'{self.obj.get_id()}: yara scanning timed out')
 
+    def convert_byte_offset_to_string(self, b_content, offset):
+        byte_chunk = b_content[:offset + 1]
+        try:
+            string_chunk = byte_chunk.decode()
+            offset = len(string_chunk) - 1
+            return offset
+        except UnicodeDecodeError:
+            return self.convert_byte_offset_to_string(b_content, offset - 1)
+
+    def extract_matches(self, data, limit=500, lines=5):
+        matches = []
+        content = self.obj.get_content()
+        l_content = len(content)
+        b_content = content.encode()
+        for string_match in data.get('strings'):
+            for string_match_instance in string_match.instances:
+                start = string_match_instance.offset
+                value = string_match_instance.matched_data.decode()
+                end = start + string_match_instance.matched_length
+                # str
+                start = self.convert_byte_offset_to_string(b_content, start)
+                end = self.convert_byte_offset_to_string(b_content, end)
+
+                # Start
+                if start > limit:
+                    i_start = start - limit
+                else:
+                    i_start = 0
+                str_start = content[i_start:start].splitlines()
+                if len(str_start) > lines:
+                    str_start = '\n'.join(str_start[-lines + 1:])
+                else:
+                    str_start = content[i_start:start]
+
+                # End
+                if end + limit > l_content:
+                    i_end = l_content
+                else:
+                    i_end = end + limit
+                str_end = content[end:i_end].splitlines()
+                if len(str_end) > lines:
+                    str_end = '\n'.join(str_end[:lines + 1])
+                else:
+                    str_end = content[end:i_end]
+                matches.append((value, f'{str_start}{value}{str_end}'))
+        return matches
+
     def yara_rules_match(self, data):
         tracker_name = data['namespace']
+        matches = None
         obj_id = self.obj.get_id()
         for tracker_uuid in Tracker.get_trackers_by_tracked_obj_type('yara', self.obj.get_type(), tracker_name):
             tracker = Tracker.Tracker(tracker_uuid)
@@ -95,8 +143,9 @@ class Tracker_Yara(AbstractModule):
 
             # Mails
             if tracker.mail_export():
-                # TODO add matches + custom subjects
-                self.exporters['mail'].export(tracker, self.obj)
+                if not matches:
+                    matches = self.extract_matches(data)
+                self.exporters['mail'].export(tracker, self.obj, matches)
 
             # Webhook
             if tracker.webhook_export():
diff --git a/configs/modules.cfg b/configs/modules.cfg
index d64f6431..b14e5c8b 100644
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@@ -158,8 +158,8 @@ publish = Importers,Tags
 subscribe = Item
 publish = Tags
 
-[Zerobins]
-subscribe = Url
+#[Pasties]
+#subscribe = Url
 
 #[Sync_module]
 #publish = Sync
diff --git a/requirements.txt b/requirements.txt
index 30cd6c3f..0412c6bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,7 +68,7 @@ pylibinjection>=0.2.4
 phonenumbers>8.12.1
 
 # Web
-flask>=1.1.4
+flask==2.3.3
 flask-login
 bcrypt>3.1.6
diff --git a/var/www/templates/hunter/tracker_add.html b/var/www/templates/hunter/tracker_add.html
index 7cc690ba..4f8c6f3e 100644
--- a/var/www/templates/hunter/tracker_add.html
+++ b/var/www/templates/hunter/tracker_add.html
@@ -132,6 +132,10 @@
+ + +
{#
#} {# #}