diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py
index a9128489..a4ad9f1d 100755
--- a/bin/lib/ail_core.py
+++ b/bin/lib/ail_core.py
@@ -15,7 +15,7 @@ config_loader = ConfigLoader()
r_serv_db = config_loader.get_db_conn("Kvrocks_DB")
config_loader = None
-AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'item', 'pgp', 'screenshot', 'title', 'username'})
+AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'favicon', 'item', 'pgp', 'screenshot', 'title', 'username'})
def get_ail_uuid():
ail_uuid = r_serv_db.get('ail:uuid')
diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py
index f09d5c5f..226131ab 100755
--- a/bin/lib/correlations_engine.py
+++ b/bin/lib/correlations_engine.py
@@ -44,8 +44,9 @@ CORRELATION_TYPES_BY_OBJ = {
"cryptocurrency": ["domain", "item"],
"cve": ["domain", "item"],
"decoded": ["domain", "item"],
- "domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "title", "screenshot", "username"],
- "item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "screenshot", "title", "username"],
+ "domain": ["cve", "cryptocurrency", "decoded", "favicon", "item", "pgp", "title", "screenshot", "username"],
+ "favicon": ["domain", "item"], # TODO Decoded
+ "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"],
"pgp": ["domain", "item"],
"screenshot": ["domain", "item"],
"title": ["domain", "item"],
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 7dbb18b2..32fc4d6c 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -141,9 +141,11 @@ def get_favicon_from_html(html, domain, url):
return favicon_urls
def extract_favicon_from_html(html, url):
- favicon_urls = set()
+ favicons = set()
+ favicons_urls = set()
+
soup = BeautifulSoup(html, 'html.parser')
- set_icons = set()
+ all_icons = set()
# If there are multiple <link rel="icon">s, the browser uses their media,
# type, and sizes attributes to select the most appropriate icon.
# If several icons are equally appropriate, the last one is used.
@@ -159,27 +161,65 @@ def extract_favicon_from_html(html, url):
# -
# -
- # desktop browser 'shortcut icon' (older browser), 'icon'
- for favicon_tag in ['icon', 'shortcut icon']:
- if soup.head:
- for icon in soup.head.find_all('link', attrs={'rel': lambda x : x and x.lower() == favicon_tag, 'href': True}):
- set_icons.add(icon)
- # # TODO: handle base64 favicon
- for tag in set_icons:
+ # Root Favicon
+ f = get_faup()
+ f.decode(url)
+ url_decoded = f.get()
+ root_domain = f"{url_decoded['scheme']}://{url_decoded['domain']}"
+ default_icon = f'{root_domain}/favicon.ico'
+ favicons_urls.add(default_icon)
+ # print(default_icon)
+
+ # shortcut
+ for shortcut in soup.find_all('link', rel='shortcut icon'):
+ all_icons.add(shortcut)
+ # icons
+ for icon in soup.find_all('link', rel='icon'):
+ all_icons.add(icon)
+
+ for mask_icon in soup.find_all('link', rel='mask-icon'):
+ all_icons.add(mask_icon)
+ for apple_touche_icon in soup.find_all('link', rel='apple-touch-icon'):
+ all_icons.add(apple_touche_icon)
+ for msapplication in soup.find_all('meta', attrs={'name': 'msapplication-TileImage'}): # msapplication-TileColor
+ all_icons.add(msapplication)
+
+ # msapplication-TileImage
+
+ # print(all_icons)
+ for tag in all_icons:
icon_url = tag.get('href')
if icon_url:
- if icon_url.startswith('//'):
- icon_url = icon_url.replace('//', '/')
if icon_url.startswith('data:'):
- # # TODO: handle base64 favicon
- pass
+ data = icon_url.split(',', 1)
+ if len(data) > 1:
+ data = ''.join(data[1].split())
+ favicon = base64.b64decode(data)
+ if favicon:
+ favicons.add(favicon)
else:
- icon_url = urljoin(url, icon_url)
- icon_url = urlparse(icon_url, scheme=urlparse(url).scheme).geturl()
- favicon_urls.add(icon_url)
- return favicon_urls
+ favicon_url = urljoin(url, icon_url)
+ favicons_urls.add(favicon_url)
+ elif tag.get('name') == 'msapplication-TileImage':
+ icon_url = tag.get('content')
+ if icon_url:
+ if icon_url.startswith('data:'):
+ data = icon_url.split(',', 1)
+ if len(data) > 1:
+ data = ''.join(data[1].split())
+ favicon = base64.b64decode(data)
+ if favicon:
+ favicons.add(favicon)
+ else:
+ favicon_url = urljoin(url, icon_url)
+ favicons_urls.add(favicon_url)
+ print(favicon_url)
+ # print(favicons_urls)
+ return favicons_urls, favicons
+
+# mmh3.hash(favicon)
# # # - - # # #
@@ -1755,7 +1795,9 @@ def test_ail_crawlers():
load_blacklist()
# if __name__ == '__main__':
-# item = Item('crawled/2023/03/06/foo.bec50a87b5-0c21-4ed4-9cb2-2d717a7a6507')
+# item_id = 'crawled/2023/02/20/data.gz'
+# item = Item(item_id)
# content = item.get_content()
-# r = extract_author_from_html(content)
+# temp_url = ''
+# r = extract_favicon_from_html(content, temp_url)
# print(r)
diff --git a/bin/lib/objects/Favicons.py b/bin/lib/objects/Favicons.py
new file mode 100755
index 00000000..8080effd
--- /dev/null
+++ b/bin/lib/objects/Favicons.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""AIL favicon object: content storage, correlation link and MISP export."""
+
+import base64
+import os
+import sys
+
+import mmh3
+from flask import url_for
+from pymisp import MISPObject
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib.ConfigLoader import ConfigLoader
+from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects
+
+config_loader = ConfigLoader()
+r_objects = config_loader.get_db_conn("Kvrocks_Objects")
+baseurl = config_loader.get_config_str("Notifications", "ail_domain")
+config_loader = None
+
+
+class Favicon(AbstractDaterangeObject):
+    """
+    AIL Favicon Object.
+
+    The raw image is kept base64-encoded in the 'content' field so that
+    binary data can be stored as text.
+    """
+
+    def __init__(self, id):
+        super().__init__('favicon', id)
+
+    # def get_ail_2_ail_payload(self):
+    #     payload = {'raw': self.get_gzip_content(b64=True),
+    #                'compress': 'gzip'}
+    #     return payload
+
+    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
+    def delete(self):
+        # # TODO:
+        pass
+
+    def get_content(self, r_type='str'):
+        """
+        Return the stored favicon content.
+
+        :param r_type: 'str' -> base64 text as stored, 'bytes' -> decoded raw image
+        :return: the content, or None if r_type is unsupported or nothing is stored
+        """
+        if r_type not in ('str', 'bytes'):
+            return None
+        content = self._get_field('content')
+        if r_type == 'bytes':
+            # the MISP export needs the raw image, not its base64 text
+            return base64.b64decode(content) if content else None
+        return content
+
+    def get_link(self, flask_context=False):
+        """Return the correlation page URL, built by Flask or from the configured base URL."""
+        if flask_context:
+            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
+        else:
+            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
+        return url
+
+    # TODO # CHANGE COLOR
+    def get_svg_icon(self):
+        """Return the icon description (style, glyph, color, radius) for this object."""
+        return {'style': 'fas', 'icon': '\uf20a', 'color': '#1E88E5', 'radius': 5}  # f0c8 f45c
+
+    def get_misp_object(self):
+        """
+        Build the MISP 'favicon' object: hash and raw content attributes,
+        each tagged with the tags of this object.
+        """
+        obj_attrs = []
+        obj = MISPObject('favicon')
+        first_seen = self.get_first_seen()
+        last_seen = self.get_last_seen()
+        if first_seen:
+            obj.first_seen = first_seen
+        if last_seen:
+            obj.last_seen = last_seen
+        if not first_seen or not last_seen:
+            self.logger.warning(
+                f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}')
+
+        obj_attrs.append(obj.add_attribute('favicon-mmh3', value=self.id))
+        obj_attrs.append(obj.add_attribute('favicon', value=self.get_content(r_type='bytes')))
+        for obj_attr in obj_attrs:
+            for tag in self.get_tags():
+                obj_attr.add_tag(tag)
+        return obj
+
+    def get_meta(self, options=None):
+        """
+        Return the object metadata.
+
+        :param options: optional set of extra fields; 'content' adds the stored content
+        """
+        # None instead of a shared mutable default set
+        options = set() if options is None else options
+        meta = self._get_meta(options=options)
+        meta['id'] = self.id
+        meta['tags'] = self.get_tags(r_list=True)
+        if 'content' in options:
+            meta['content'] = self.get_content()
+        return meta
+
+    # def get_links(self):
+    #     # TODO GET ALL URLS FROM CORRELATED ITEMS
+
+    def add(self, date, item_id):  # TODO correlation base 64 -> calc md5
+        self._add(date, item_id)
+
+    def create(self, content, _first_seen=None, _last_seen=None):
+        """
+        Store the favicon content and create the object.
+
+        :param content: raw favicon as bytes, or str (encoded to bytes first)
+        """
+        if isinstance(content, str):
+            content = content.encode()
+        # Favicons are binary images: a plain bytes.decode() raises
+        # UnicodeDecodeError on them, so store the base64 text instead.
+        self._set_field('content', base64.b64encode(content).decode())
+        self._create()
+
+
+def create_favicon(content, url=None):  # TODO URL ????
+    """
+    Create the Favicon object for this content if it does not exist yet.
+
+    The id is the decimal string of mmh3.hash() over the base64-encoded
+    content (base64.encodebytes layout).
+    NOTE(review): intended to match the Shodan-style hash expected by the
+    MISP 'favicon-mmh3' attribute - confirm.
+
+    :param content: raw favicon as bytes or str
+    :param url: currently unused
+    :return: the Favicon object
+    """
+    if isinstance(content, str):
+        content = content.encode()
+    # mmh3.hash_bytes() returns raw digest bytes, unusable as a key or URL id
+    favicon_id = str(mmh3.hash(base64.encodebytes(content)))
+    favicon = Favicon(favicon_id)
+    if not favicon.exists():
+        favicon.create(content)
+    return favicon
+
+
+# TODO ADD SEARCH FUNCTION
+
+class Favicons(AbstractDaterangeObjects):
+    """
+    Favicons Objects
+    """
+    def __init__(self):
+        super().__init__('favicon')
+
+    def get_metas(self, obj_ids, options=None):
+        """Delegate to _get_metas() for the given favicon ids."""
+        options = set() if options is None else options
+        return self._get_metas(Favicon, obj_ids, options=options)
+
+    def sanitize_name_to_search(self, name_to_search):
+        return name_to_search  # TODO
diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py
index 6886d6fc..a768cee8 100755
--- a/bin/lib/objects/ail_objects.py
+++ b/bin/lib/objects/ail_objects.py
@@ -18,6 +18,7 @@ from lib.objects import CryptoCurrencies
from lib.objects.Cves import Cve
from lib.objects.Decodeds import Decoded, get_all_decodeds_objects, get_nb_decodeds_objects
from lib.objects.Domains import Domain
+from lib.objects.Favicons import Favicon
from lib.objects.Items import Item, get_all_items_objects, get_nb_items_objects
from lib.objects import Pgps
from lib.objects.Screenshots import Screenshot
@@ -54,6 +55,8 @@ def get_object(obj_type, subtype, id):
return Decoded(id)
elif obj_type == 'cve':
return Cve(id)
+ elif obj_type == 'favicon':
+ return Favicon(id)
elif obj_type == 'screenshot':
return Screenshot(id)
elif obj_type == 'cryptocurrency':
@@ -163,7 +166,7 @@ def get_object_card_meta(obj_type, subtype, id, related_btc=False):
obj = get_object(obj_type, subtype, id)
meta = obj.get_meta()
meta['icon'] = obj.get_svg_icon()
- if subtype or obj_type == 'cve' or obj_type == 'title':
+ if subtype or obj_type == 'cve' or obj_type == 'title' or obj_type == 'favicon':
meta['sparkline'] = obj.get_sparkline()
if obj_type == 'cve':
meta['cve_search'] = obj.get_cve_search()