diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index a9128489..a4ad9f1d 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -15,7 +15,7 @@ config_loader = ConfigLoader() r_serv_db = config_loader.get_db_conn("Kvrocks_DB") config_loader = None -AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'item', 'pgp', 'screenshot', 'title', 'username'}) +AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'favicon', 'item', 'pgp', 'screenshot', 'title', 'username'}) def get_ail_uuid(): ail_uuid = r_serv_db.get('ail:uuid') diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index f09d5c5f..226131ab 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -44,8 +44,9 @@ CORRELATION_TYPES_BY_OBJ = { "cryptocurrency": ["domain", "item"], "cve": ["domain", "item"], "decoded": ["domain", "item"], - "domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "title", "screenshot", "username"], - "item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "screenshot", "title", "username"], + "domain": ["cve", "cryptocurrency", "decoded", "favicon", "item", "pgp", "title", "screenshot", "username"], + "favicon": ["domain", "item"], # TODO Decoded + "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"], "pgp": ["domain", "item"], "screenshot": ["domain", "item"], "title": ["domain", "item"], diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 7dbb18b2..32fc4d6c 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -141,9 +141,11 @@ def get_favicon_from_html(html, domain, url): return favicon_urls def extract_favicon_from_html(html, url): - favicon_urls = set() + favicons = set() + favicons_urls = set() + soup = BeautifulSoup(html, 'html.parser') - set_icons = set() + all_icons = set() # If there are multiple s, the browser uses their media, # type, and sizes attributes to select the most appropriate icon. # If several icons are equally appropriate, the last one is used. @@ -159,27 +161,65 @@ def extract_favicon_from_html(html, url): # - # - - # desktop browser 'shortcut icon' (older browser), 'icon' - for favicon_tag in ['icon', 'shortcut icon']: - if soup.head: - for icon in soup.head.find_all('link', attrs={'rel': lambda x : x and x.lower() == favicon_tag, 'href': True}): - set_icons.add(icon) - # # TODO: handle base64 favicon - for tag in set_icons: + # Root Favicon + f = get_faup() + f.decode(url) + url_decoded = f.get() + root_domain = f"{url_decoded['scheme']}://{url_decoded['domain']}" + default_icon = f'{root_domain}/favicon.ico' + favicons_urls.add(default_icon) + # print(default_icon) + + # shortcut + for shortcut in soup.find_all('link', rel='shortcut icon'): + all_icons.add(shortcut) + # icons + for icon in soup.find_all('link', rel='icon'): + all_icons.add(icon) + + for mask_icon in soup.find_all('link', rel='mask-icon'): + all_icons.add(mask_icon) + for apple_touche_icon in soup.find_all('link', rel='apple-touch-icon'): + all_icons.add(apple_touche_icon) + for msapplication in soup.find_all('meta', attrs={'name': 'msapplication-TileImage'}): # msapplication-TileColor + all_icons.add(msapplication) + + # msapplication-TileImage + + # print(all_icons) + for tag in all_icons: icon_url = tag.get('href') if icon_url: - if icon_url.startswith('//'): - icon_url = icon_url.replace('//', '/') if icon_url.startswith('data:'): - # # TODO: handle base64 favicon - pass + data = icon_url.split(',', 1) + if len(data) > 1: + data = ''.join(data[1].split()) + favicon = base64.b64decode(data) + if favicon: + favicons.add(favicon) else: - icon_url = urljoin(url, icon_url) - icon_url = urlparse(icon_url, scheme=urlparse(url).scheme).geturl() - favicon_urls.add(icon_url) - return favicon_urls + favicon_url = urljoin(url, icon_url) + favicons_urls.add(favicon_url) + elif tag.get('name') == 'msapplication-TileImage': + icon_url = tag.get('content') + if icon_url: + if icon_url.startswith('data:'): + data = icon_url.split(',', 1) + if len(data) > 1: + data = ''.join(data[1].split()) + favicon = base64.b64decode(data) + if favicon: + favicons.add(favicon) + else: + favicon_url = urljoin(url, icon_url) + favicons_urls.add(favicon_url) + print(favicon_url) + # print(favicons_urls) + return favicons_urls, favicons + +# mmh3.hash(favicon) # # # - - # # # @@ -1755,7 +1795,9 @@ def test_ail_crawlers(): load_blacklist() # if __name__ == '__main__': -# item = Item('crawled/2023/03/06/foo.bec50a87b5-0c21-4ed4-9cb2-2d717a7a6507') +# item_id = 'crawled/2023/02/20/data.gz' +# item = Item(item_id) # content = item.get_content() -# r = extract_author_from_html(content) +# temp_url = '' +# r = extract_favicon_from_html(content, temp_url) # print(r) diff --git a/bin/lib/objects/Favicons.py b/bin/lib/objects/Favicons.py new file mode 100755 index 00000000..8080effd --- /dev/null +++ b/bin/lib/objects/Favicons.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import mmh3 +import os +import sys + +from flask import url_for + +from pymisp import MISPObject + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader +from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects + +config_loader = ConfigLoader() +r_objects = config_loader.get_db_conn("Kvrocks_Objects") +baseurl = config_loader.get_config_str("Notifications", "ail_domain") +config_loader = None + + +class Favicon(AbstractDaterangeObject): + """ + AIL Favicon Object. + """ + + def __init__(self, id): + super(Favicon, self).__init__('favicon', id) + + # def get_ail_2_ail_payload(self): + # payload = {'raw': self.get_gzip_content(b64=True), + # 'compress': 'gzip'} + # return payload + + # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ + def delete(self): + # # TODO: + pass + + def get_content(self, r_type='str'): + if r_type == 'str': + return self._get_field('content') + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('correlation.show_correlation', type=self.type, id=self.id) + else: + url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' + return url + + # TODO # CHANGE COLOR + def get_svg_icon(self): + return {'style': 'fas', 'icon': '\uf20a', 'color': '#1E88E5', 'radius': 5} # f0c8 f45c + + def get_misp_object(self): + obj_attrs = [] + obj = MISPObject('favicon') + first_seen = self.get_first_seen() + last_seen = self.get_last_seen() + if first_seen: + obj.first_seen = first_seen + if last_seen: + obj.last_seen = last_seen + if not first_seen or not last_seen: + self.logger.warning( + f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}') + + obj_attrs.append(obj.add_attribute('favicon-mmh3', value=self.id)) + obj_attrs.append(obj.add_attribute('favicon', value=self.get_content(r_type='bytes'))) + for obj_attr in obj_attrs: + for tag in self.get_tags(): + obj_attr.add_tag(tag) + return obj + + def get_meta(self, options=set()): + meta = self._get_meta(options=options) + meta['id'] = self.id + meta['tags'] = self.get_tags(r_list=True) + if 'content' in options: + meta['content'] = self.get_content() + return meta + + # def get_links(self): + # # TODO GET ALL URLS FROM CORRELATED ITEMS + + def add(self, date, item_id): # TODO correlation base 64 -> calc md5 + self._add(date, item_id) + + def create(self, content, _first_seen=None, _last_seen=None): + if not isinstance(content, str): + content = content.decode() + self._set_field('content', content) + self._create() + + +def create_favicon(content, url=None): # TODO URL ???? + if isinstance(content, str): + content = content.encode() + favicon_id = mmh3.hash_bytes(content) + favicon = Favicon(favicon_id) + if not favicon.exists(): + favicon.create(content) + + +# TODO ADD SEARCH FUNCTION + +class Favicons(AbstractDaterangeObjects): + """ + Favicons Objects + """ + def __init__(self): + super().__init__('favicon') + + def get_metas(self, obj_ids, options=set()): + return self._get_metas(Favicon, obj_ids, options=options) + + def sanitize_name_to_search(self, name_to_search): + return name_to_search # TODO + + +# if __name__ == '__main__': +# name_to_search = '98' +# print(search_cves_by_name(name_to_search)) diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index 6886d6fc..a768cee8 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -18,6 +18,7 @@ from lib.objects import CryptoCurrencies from lib.objects.Cves import Cve from lib.objects.Decodeds import Decoded, get_all_decodeds_objects, get_nb_decodeds_objects from lib.objects.Domains import Domain +from lib.objects.Favicons import Favicon from lib.objects.Items import Item, get_all_items_objects, get_nb_items_objects from lib.objects import Pgps from lib.objects.Screenshots import Screenshot @@ -54,6 +55,8 @@ def get_object(obj_type, subtype, id): return Decoded(id) elif obj_type == 'cve': return Cve(id) + elif obj_type == 'favicon': + return Favicon(id) elif obj_type == 'screenshot': return Screenshot(id) elif obj_type == 'cryptocurrency': @@ -163,7 +166,7 @@ def get_object_card_meta(obj_type, subtype, id, related_btc=False): obj = get_object(obj_type, subtype, id) meta = obj.get_meta() meta['icon'] = obj.get_svg_icon() - if subtype or obj_type == 'cve' or obj_type == 'title': + if subtype or obj_type == 'cve' or obj_type == 'title' or obj_type == 'favicon': meta['sparkline'] = obj.get_sparkline() if obj_type == 'cve': meta['cve_search'] = obj.get_cve_search()