From b988f46c90ec4b97c1aa99745910dc11424d244c Mon Sep 17 00:00:00 2001 From: terrtia Date: Thu, 17 Oct 2024 12:14:48 +0200 Subject: [PATCH] chg: [dom-hash] add dom-hash object compute dom-hash for domains and crawled items --- bin/crawlers/Crawler.py | 6 + bin/lib/ail_core.py | 8 +- bin/lib/ail_updates.py | 4 + bin/lib/correlations_engine.py | 5 +- bin/lib/objects/DomHashs.py | 134 ++++ bin/lib/objects/ail_objects.py | 3 + update/v5.9/Update.py | 24 + update/v5.9/Update.sh | 31 + update/v5.9/reprocess_dom_hash.py | 39 ++ var/www/Flask_server.py | 2 + var/www/blueprints/objects_dom_hash.py | 95 +++ .../correlation/metadata_card_dom_hash.html | 173 +++++ .../correlation/show_correlation.html | 6 + .../objects/dom-hash/DomHashDaterange.html | 611 ++++++++++++++++++ .../templates/sidebars/sidebar_objects.html | 6 + 15 files changed, 1141 insertions(+), 6 deletions(-) create mode 100755 bin/lib/objects/DomHashs.py create mode 100755 update/v5.9/Update.py create mode 100755 update/v5.9/Update.sh create mode 100755 update/v5.9/reprocess_dom_hash.py create mode 100644 var/www/blueprints/objects_dom_hash.py create mode 100644 var/www/templates/correlation/metadata_card_dom_hash.html create mode 100644 var/www/templates/objects/dom-hash/DomHashDaterange.html diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py index 61b94a71..b1eda5a1 100755 --- a/bin/crawlers/Crawler.py +++ b/bin/crawlers/Crawler.py @@ -21,6 +21,7 @@ from lib.Tag import get_domain_vanity_tags from lib.objects import CookiesNames from lib.objects import Etags from lib.objects.Domains import Domain +from lib.objects import DomHashs from lib.objects import Favicons from lib.objects.Items import Item from lib.objects import Screenshots @@ -348,6 +349,11 @@ class Crawler(AbstractModule): self.root_item = item_id parent_id = item_id + # DOM-HASH + dom_hash = DomHashs.create(entries['html']) + dom_hash.add(self.date.replace('/', ''), item) + dom_hash.add_correlation('domain', '', self.domain.id) + 
title_content = crawlers.extract_title_from_html(entries['html']) if title_content: title = Titles.create_title(title_content) diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 532a89f1..6e22f795 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -17,15 +17,15 @@ r_object = config_loader.get_db_conn("Kvrocks_Objects") config_loader = None AIL_OBJECTS = sorted({'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency', 'decoded', - 'domain', 'etag', 'favicon', 'file-name', 'hhhash','item', 'image', 'message', 'ocr', 'pgp', - 'qrcode', 'screenshot', 'title', 'user-account', 'username'}) + 'domain', 'dom-hash', 'etag', 'favicon', 'file-name', 'hhhash','item', 'image', 'message', 'ocr', + 'pgp', 'qrcode', 'screenshot', 'title', 'user-account', 'username'}) AIL_OBJECTS_WITH_SUBTYPES = {'chat', 'chat-subchannel', 'cryptocurrency', 'pgp', 'username', 'user-account'} # TODO by object TYPE ???? AIL_OBJECTS_CORRELATIONS_DEFAULT = sorted({'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency', 'decoded', - 'domain', 'favicon', 'file-name', 'item', 'image', 'message', 'ocr', 'pgp', - 'qrcode', 'screenshot', 'title', 'user-account', 'username'}) + 'domain', 'dom-hash', 'favicon', 'file-name', 'item', 'image', 'message', + 'ocr', 'pgp', 'qrcode', 'screenshot', 'title', 'user-account', 'username'}) def get_ail_uuid(): ail_uuid = r_serv_db.get('ail:uuid') diff --git a/bin/lib/ail_updates.py b/bin/lib/ail_updates.py index 07fd791a..968c7004 100755 --- a/bin/lib/ail_updates.py +++ b/bin/lib/ail_updates.py @@ -46,6 +46,10 @@ BACKGROUND_UPDATES = { 'message': 'Compress HAR', 'scripts': ['compress_har.py'] }, + 'v5.9': { + 'message': 'Compute Domain/Items Dom-Hash', + 'scripts': ['reprocess_dom_hash.py'] + } } class AILBackgroundUpdate: diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index 662ed3a0..9cf8b5d2 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -48,13 
+48,14 @@ CORRELATION_TYPES_BY_OBJ = { "cryptocurrency": ["domain", "item", "message", "ocr", "qrcode"], "cve": ["domain", "item", "message", "ocr", "qrcode"], "decoded": ["domain", "item", "message", "ocr", "qrcode"], - "domain": ["cve", "cookie-name", "cryptocurrency", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"], + "domain": ["cve", "cookie-name", "cryptocurrency", "dom-hash", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"], + "dom-hash": ["domain", "item"], "etag": ["domain"], "favicon": ["domain", "item"], # TODO Decoded "file-name": ["chat", "message"], "hhhash": ["domain"], "image": ["chat", "chat-subchannel", "chat-thread", "message", "ocr", "qrcode", "user-account"], # TODO subchannel + threads ???? - "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"], # chat ??? + "item": ["cve", "cryptocurrency", "decoded", "domain", "dom-hash", "favicon", "pgp", "screenshot", "title", "username"], # chat ??? 
"message": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"], "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"], "pgp": ["domain", "item", "message", "ocr"], diff --git a/bin/lib/objects/DomHashs.py b/bin/lib/objects/DomHashs.py new file mode 100755 index 00000000..996a30dd --- /dev/null +++ b/bin/lib/objects/DomHashs.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +from bs4 import BeautifulSoup +from hashlib import sha256 +from flask import url_for + +# import warnings +# warnings.filterwarnings("ignore", category=DeprecationWarning) +from pymisp import MISPObject + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader +from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects + +config_loader = ConfigLoader() +r_objects = config_loader.get_db_conn("Kvrocks_Objects") +baseurl = config_loader.get_config_str("Notifications", "ail_domain") +config_loader = None + + +class DomHash(AbstractDaterangeObject): + """ + AIL Title Object. 
+ """ + + def __init__(self, id): + super(DomHash, self).__init__('dom-hash', id) + + # def get_ail_2_ail_payload(self): + # payload = {'raw': self.get_gzip_content(b64=True), + # 'compress': 'gzip'} + # return payload + + # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ + def delete(self): + # # TODO: + pass + + # def get_content(self, r_type='str'): # TODO Get random item -> compute hash + # if r_type == 'str': + # return self._get_field('content') + # elif r_type == 'bytes': + # return self._get_field('content').encode() + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('correlation.show_correlation', type=self.type, id=self.id) + else: + url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' + return url + + def get_svg_icon(self): + return {'style': 'fas', 'icon': '\uf714', 'color': 'grey', 'radius': 5} + + def get_misp_object(self): + obj_attrs = [] + obj = MISPObject('dom-hash') + first_seen = self.get_first_seen() + last_seen = self.get_last_seen() + if first_seen: + obj.first_seen = first_seen + if last_seen: + obj.last_seen = last_seen + if not first_seen or not last_seen: + self.logger.warning( + f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}') + + obj_attrs.append(obj.add_attribute('dom-hash', value=self.get_id())) + # TODO ############################# URLS + for obj_attr in obj_attrs: + for tag in self.get_tags(): + obj_attr.add_tag(tag) + return obj + return None + + def get_nb_seen(self): + return self.get_nb_correlation('domain') + + def get_meta(self, options=set()): + meta = self._get_meta(options=options) + meta['id'] = self.id + meta['tags'] = self.get_tags(r_list=True) + return meta + + def create(self, _first_seen=None, _last_seen=None): + self._create() + + +def _compute_dom_hash(html_content): + soup = BeautifulSoup(html_content, "lxml") + to_hash = "|".join(t.name for t in soup.find_all()).encode() + return sha256(to_hash).hexdigest()[:32] + + +def 
create(content): + obj_id = _compute_dom_hash(content) + obj = DomHash(obj_id) + if not obj.exists(): + obj.create() + return obj + + +class DomHashs(AbstractDaterangeObjects): + """ + DomHashs Objects + """ + def __init__(self): + super().__init__('dom-hash', DomHash) + + def sanitize_id_to_search(self, name_to_search): + return name_to_search + + +# if __name__ == '__main__': +# # from lib import crawlers +# # from lib.objects import Items +# # for item in Items.get_all_items_objects(filters={'sources': ['crawled']}): +# # title_content = crawlers.extract_title_from_html(item.get_content()) +# # if title_content: +# # print(item.id, title_content) +# # title = create_title(title_content) +# # title.add(item.get_date(), item.id) +# titles = Titles() +# # for r in titles.get_ids_iterator(): +# # print(r) +# r = titles.search_by_id('f7d57B', r_pos=True, case_sensitive=False) +# print(r) diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index d590b0c4..9823e928 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -30,6 +30,7 @@ from lib.objects.Domains import Domain from lib.objects import Etags from lib.objects import Favicons from lib.objects import FilesNames +from lib.objects import DomHashs from lib.objects import HHHashs from lib.objects.Items import Item, get_all_items_objects, get_nb_items_objects from lib.objects import Images @@ -91,6 +92,8 @@ def get_object(obj_type, subtype, obj_id): return Favicons.Favicon(obj_id) elif obj_type == 'file-name': return FilesNames.FileName(obj_id) + elif obj_type == 'dom-hash': + return DomHashs.DomHash(obj_id) elif obj_type == 'hhhash': return HHHashs.HHHash(obj_id) elif obj_type == 'image': diff --git a/update/v5.9/Update.py b/update/v5.9/Update.py new file mode 100755 index 00000000..a60db2a1 --- /dev/null +++ b/update/v5.9/Update.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +sys.path.append(os.environ['AIL_HOME']) 
+################################## +# Import Project packages +################################## +from update.bin.ail_updater import AIL_Updater +from lib import ail_updates + +class Updater(AIL_Updater): + """default Updater.""" + + def __init__(self, version): + super(Updater, self).__init__(version) + + +if __name__ == '__main__': + updater = Updater('v5.9') + updater.run_update() + ail_updates.add_background_update('v5.9') diff --git a/update/v5.9/Update.sh b/update/v5.9/Update.sh new file mode 100755 index 00000000..03353f35 --- /dev/null +++ b/update/v5.9/Update.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_BIN. Run the script from the virtual environment." && exit 1; +[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment." 
&& exit 1; + +export PATH=$AIL_HOME:$PATH +export PATH=$AIL_REDIS:$PATH +export PATH=$AIL_BIN:$PATH +export PATH=$AIL_FLASK:$PATH + +GREEN="\\033[1;32m" +DEFAULT="\\033[0;39m" + +echo -e $GREEN"Shutting down AIL ..."$DEFAULT +bash ${AIL_BIN}/LAUNCH.sh -ks +wait + +# SUBMODULES # +git submodule update + +echo "" +echo -e $GREEN"Updating AIL VERSION ..."$DEFAULT +echo "" +python ${AIL_HOME}/update/v5.9/Update.py +wait +echo "" +echo "" + +exit 0 diff --git a/update/v5.9/reprocess_dom_hash.py b/update/v5.9/reprocess_dom_hash.py new file mode 100755 index 00000000..cf192ea8 --- /dev/null +++ b/update/v5.9/reprocess_dom_hash.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import gzip +import os +import sys + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib import ail_updates +from lib.objects import ail_objects +from lib.objects import DomHashs +from lib.objects.Domains import Domain + +if __name__ == '__main__': + update = ail_updates.AILBackgroundUpdate('v5.9') + n = 0 + nb_items = ail_objects.card_obj_iterator('item', filters={'sources': ['crawled']}) + update.set_nb_to_update(nb_items) + + for item in ail_objects.obj_iterator('item', filters={'sources': ['crawled']}): + dom = item.get_domain() + domain = Domain(dom) + i_content = item.get_content() + if domain.exists() and i_content: + date = item.get_date() + # DOM-HASH + dom_hash = DomHashs.create(i_content) + dom_hash.add(date, item) + dom_hash.add_correlation('domain', '', domain.id) + + print(domain.id, item.id, dom_hash.id) + + update.inc_nb_updated() + n += 1 + if n % 100 == 0: + update.update_progress() diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index c2fe5bdf..6890874f 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -54,6 +54,7 @@ from blueprints.objects_title import objects_title from blueprints.objects_cookie_name import objects_cookie_name from 
blueprints.objects_etag import objects_etag from blueprints.objects_hhhash import objects_hhhash +from blueprints.objects_dom_hash import objects_dom_hash from blueprints.chats_explorer import chats_explorer from blueprints.objects_image import objects_image from blueprints.objects_ocr import objects_ocr @@ -138,6 +139,7 @@ app.register_blueprint(objects_title, url_prefix=baseUrl) app.register_blueprint(objects_cookie_name, url_prefix=baseUrl) app.register_blueprint(objects_etag, url_prefix=baseUrl) app.register_blueprint(objects_hhhash, url_prefix=baseUrl) +app.register_blueprint(objects_dom_hash, url_prefix=baseUrl) app.register_blueprint(chats_explorer, url_prefix=baseUrl) app.register_blueprint(objects_image, url_prefix=baseUrl) app.register_blueprint(objects_ocr, url_prefix=baseUrl) diff --git a/var/www/blueprints/objects_dom_hash.py b/var/www/blueprints/objects_dom_hash.py new file mode 100644 index 00000000..e493b3ab --- /dev/null +++ b/var/www/blueprints/objects_dom_hash.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Blueprint Flask: dom-hash objects endpoints: daterange listing, daterange chart JSON ... 
+''' + +import os +import sys + +from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file +from flask_login import login_required + +# Import Role_Manager +from Role_Manager import login_admin, login_read_only + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.objects import DomHashs +from packages import Date + +# ============ BLUEPRINT ============ +objects_dom_hash = Blueprint('objects_dom_hash', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/dom-hash')) + +# ============ VARIABLES ============ +bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] + + +# ============ FUNCTIONS ============ +@objects_dom_hash.route("/objects/dom-hashs", methods=['GET']) +@login_required +@login_read_only +def objects_dom_hashs(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + show_objects = request.args.get('show_objects') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + + if show_objects: + dict_objects = DomHashs.DomHashs().api_get_meta_by_daterange(date_from, date_to) + else: + dict_objects = {} + + return render_template("DomHashDaterange.html", date_from=date_from, date_to=date_to, + dict_objects=dict_objects, show_objects=show_objects) + +@objects_dom_hash.route("/objects/dom-hash/post", methods=['POST']) +@login_required +@login_read_only +def objects_dom_hashs_post(): + date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + show_objects = request.form.get('show_objects') + return redirect(url_for('objects_dom_hash.objects_dom_hashs', date_from=date_from, date_to=date_to, show_objects=show_objects)) + +@objects_dom_hash.route("/objects/dom-hash/range/json", methods=['GET']) +@login_required +@login_read_only +def 
objects_dom_hash_range_json(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + return jsonify(DomHashs.DomHashs().api_get_chart_nb_by_daterange(date_from, date_to)) + +# @objects_dom_hash.route("/objects/dom-hash/search", methods=['POST']) +# @login_required +# @login_read_only +# def objects_dom_hash_search(): +# date_from = request.args.get('date_from') +# date_to = request.args.get('date_to') +# date = Date.sanitise_date_range(date_from, date_to) +# date_from = date['date_from'] +# date_to = date['date_to'] +# return jsonify(HHHashs.HHHashs().api_get_chart_nb_by_daterange(date_from, date_to)) +# +# search_by_id + +# @objects_dom_hash.route("/objects/dom-hash/graphline/json", methods=['GET']) +# @login_required +# @login_read_only +# def objects_dom_hash_graphline_json(): +# dom_hash_id = request.args.get('id') +# cve = Cves.Cve(cve_id) +# if not cve.exists(): +# abort(404) +# return jsonify(Cves.get_cve_graphline(cve_id)) + +# ============= ROUTES ============== + diff --git a/var/www/templates/correlation/metadata_card_dom_hash.html b/var/www/templates/correlation/metadata_card_dom_hash.html new file mode 100644 index 00000000..09d0e859 --- /dev/null +++ b/var/www/templates/correlation/metadata_card_dom_hash.html @@ -0,0 +1,173 @@ + + + +{% with modal_add_tags=dict_object['metadata_card']['add_tags_modal']%} + {% include 'modals/add_tags.html' %} +{% endwith %} + +{% include 'modals/edit_tag.html' %} + +
+
+

{{ dict_object["metadata"]["content"] }}

+
{{ dict_object["correlation_id"] }}
+
    +
  • +
    +
    + + + + + + + + + + + + + + + + + +
    Object typeFirst seenLast seenNb seen
    + + + + {{ dict_object["metadata_card"]["svg_icon"]["icon"] }} + + + {{ dict_object["object_type"] }} + {{ dict_object["metadata"]['first_seen'] }}{{ dict_object["metadata"]['last_seen'] }}{{ dict_object["metadata"]['nb_seen'] }}
    +
    +
    +
    +
    +
    +
  • + +
  • +
    +
    + Tags: + {% for tag in dict_object["metadata"]['tags'] %} + + {% endfor %} + +
    +
  • +
+ + {% with obj_type='dom-hash', obj_id=dict_object['correlation_id'], obj_subtype='' %} + {% include 'modals/investigations_register_obj.html' %} + {% endwith %} + + +
+
+ + + + + + diff --git a/var/www/templates/correlation/show_correlation.html b/var/www/templates/correlation/show_correlation.html index 8012f66c..30f66ab1 100644 --- a/var/www/templates/correlation/show_correlation.html +++ b/var/www/templates/correlation/show_correlation.html @@ -126,6 +126,8 @@ {% include 'correlation/metadata_card_cookie_name.html' %} {% elif dict_object["object_type"] == "etag" %} {% include 'correlation/metadata_card_etag.html' %} + {% elif dict_object["object_type"] == "dom-hash" %} + {% include 'correlation/metadata_card_dom_hash.html' %} {% elif dict_object["object_type"] == "hhhash" %} {% include 'correlation/metadata_card_hhhash.html' %} {% elif dict_object["object_type"] == "image" %} @@ -267,6 +269,10 @@ +
+ + +
diff --git a/var/www/templates/objects/dom-hash/DomHashDaterange.html b/var/www/templates/objects/dom-hash/DomHashDaterange.html new file mode 100644 index 00000000..3c72938b --- /dev/null +++ b/var/www/templates/objects/dom-hash/DomHashDaterange.html @@ -0,0 +1,611 @@ + + + + + Dom-Hashs - AIL + + + + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'sidebars/sidebar_objects.html' %} + +
+ +
+
+
+ +{#
#} +{#
#} +{#
Search Dom-Hash by name:
#} +{#
#} +{#
#} +{# #} +{# #} +{#
#} +{#
#} +{#
#} +{#
#} +
+ + +
+ +
+
+
Select a date range :
+
+
+
+ +
+
+
+ +
+
+ + +
+ +
+
+
+ +
+
+
+
+
+
+ + {% if dict_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }} Dom-Hash:

+ {% else %} +

{{ date_from }} to {{ date_to }} Dom-Hash:

+ {% endif %} + + + + + + + + + + + + {% for dom_hash_id in dict_objects %} + + + + + + + + {% endfor %} + +
Dom-Hash-IDFirst SeenLast SeenTotalLast days
{{ dom_hash_id }}{{ dict_objects[dom_hash_id]['first_seen'] }}{{ dict_objects[dom_hash_id]['last_seen'] }}{{ dict_objects[dom_hash_id]['nb_seen'] }}
+ + + {% else %} + {% if show_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }}, No Dom-Hash

+ {% else %} +

{{ date_from }} to {{ date_to }}, No Dom-Hash

+ {% endif %} + {% endif %} + {% endif %} +
+ +
+
+ + + + + + + + + + + + + + + + + diff --git a/var/www/templates/sidebars/sidebar_objects.html b/var/www/templates/sidebars/sidebar_objects.html index 2d9e2fb2..b28f1313 100644 --- a/var/www/templates/sidebars/sidebar_objects.html +++ b/var/www/templates/sidebars/sidebar_objects.html @@ -70,6 +70,12 @@ HHHash +