ail-framework/bin/lib/objects/DomHashs.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
from bs4 import BeautifulSoup
from hashlib import sha256
from flask import url_for
# import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)
from pymisp import MISPObject
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects
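
# load the objects DB connection and the instance base URL from the AIL config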
config_loader = ConfigLoader()
r_objects = config_loader.get_db_conn("Kvrocks_Objects")
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
config_loader = None
class DomHash(AbstractDaterangeObject):
"""
    AIL DomHash Object.
"""
def __init__(self, id):
super(DomHash, self).__init__('dom-hash', id)
# def get_ail_2_ail_payload(self):
# payload = {'raw': self.get_gzip_content(b64=True),
# 'compress': 'gzip'}
# return payload
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
def delete(self):
# # TODO:
pass
# def get_content(self, r_type='str'): # TODO Get random item -> compute hash
# if r_type == 'str':
# return self._get_field('content')
# elif r_type == 'bytes':
# return self._get_field('content').encode()
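    # link to the correlation page for this dom-hash on this AIL instance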
def get_link(self, flask_context=False):
if flask_context:
url = url_for('correlation.show_correlation', type=self.type, id=self.id)
else:
url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
return url
def get_svg_icon(self):
return {'style': 'fas', 'icon': '\uf714', 'color': 'grey', 'radius': 5}
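    # build a MISP 'dom-hash' object carrying first/last seen and the hash value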
def get_misp_object(self):
obj_attrs = []
obj = MISPObject('dom-hash')
first_seen = self.get_first_seen()
last_seen = self.get_last_seen()
if first_seen:
obj.first_seen = first_seen
if last_seen:
obj.last_seen = last_seen
if not first_seen or not last_seen:
self.logger.warning(
f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}')
obj_attrs.append(obj.add_attribute('dom-hash', value=self.get_id()))
# TODO ############################# URLS
for obj_attr in obj_attrs:
for tag in self.get_tags():
obj_attr.add_tag(tag)
return obj
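    # the seen counter is derived from the number of correlated domains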
def get_nb_seen(self):
return self.get_nb_correlation('domain')
def get_meta(self, options=set()):
meta = self._get_meta(options=options)
meta['id'] = self.id
meta['tags'] = self.get_tags(r_list=True)
return meta
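    # _first_seen/_last_seen are accepted for interface compatibility but are
    # currently unused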
def create(self, _first_seen=None, _last_seen=None):
self._create()
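
# dom-hash: sha256 of the "|"-joined sequence of HTML tag names, in document
# order, truncated to the first 32 hex characters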
def _compute_dom_hash(html_content):
soup = BeautifulSoup(html_content, "lxml")
    to_hash = "|".join(t.name for t in soup.find_all()).encode()
return sha256(to_hash).hexdigest()[:32]
def create(content):
obj_id = _compute_dom_hash(content)
obj = DomHash(obj_id)
if not obj.exists():
obj.create()
return obj
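
# Minimal usage sketch (hypothetical HTML content):
#   html = '<html><head><title>t</title></head><body><p>hi</p></body></html>'
#   dom_hash = create(html)  # computes the dom-hash, creates the object if new
#   print(dom_hash.get_id(), dom_hash.get_link())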
class DomHashs(AbstractDaterangeObjects):
"""
    DomHashs Objects
"""
def __init__(self):
super().__init__('dom-hash', DomHash)
def sanitize_id_to_search(self, name_to_search):
return name_to_search
# if __name__ == '__main__':
#     # from lib.objects import Items
#     # for item in Items.get_all_items_objects(filters={'sources': ['crawled']}):
#     #     content = item.get_content()
#     #     if content:
#     #         dom_hash = create(content)
#     #         dom_hash.add(item.get_date(), item.id)
#     dom_hashes = DomHashs()
#     # for r in dom_hashes.get_ids_iterator():
#     #     print(r)
#     r = dom_hashes.search_by_id('f7d57B', r_pos=True, case_sensitive=False)
#     print(r)