diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 73c17472..a2966d9a 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -16,6 +16,7 @@ from modules.abstract_module import AbstractModule
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.objects import CookiesNames
 from lib.objects.Domains import Domain
 from lib.objects.Items import Item
 from lib.objects import Screenshots
@@ -56,7 +57,7 @@ class Crawler(AbstractModule):
         self.har = None
         self.screenshot = None
         self.root_item = None
-        self.har_dir = None
+        self.date = None
         self.items_dir = None
         self.domain = None
@@ -191,15 +192,14 @@ class Crawler(AbstractModule):
         # DEBUG #
         # self.har = True
         # self.screenshot = True
-        str_date = crawlers.get_current_date(separator=True)
-        self.har_dir = crawlers.get_date_har_dir(str_date)
-        self.items_dir = crawlers.get_date_crawled_items_source(str_date)
+        self.date = crawlers.get_current_date(separator=True)
+        self.items_dir = crawlers.get_date_crawled_items_source(self.date)
         self.root_item = None

         # Save Capture
         self.save_capture_response(parent_id, entries)

-        self.domain.update_daterange(str_date.replace('/', ''))
+        self.domain.update_daterange(self.date.replace('/', ''))
         # Origin + History
         if self.root_item:
             self.domain.set_last_origin(parent_id)
@@ -279,7 +279,13 @@ class Crawler(AbstractModule):
         # HAR
         if self.har:
             if 'har' in entries and entries['har']:
-                crawlers.save_har(self.har_dir, item_id, entries['har'])
+                har_id = crawlers.create_har_id(self.date, item_id)
+                crawlers.save_har(har_id, entries['har'])
+                for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
+                    print(cookie_name)
+                    cookie = CookiesNames.create(cookie_name)
+                    cookie.add(self.date.replace('/', ''), self.domain.id)
+
         # Next Children
         entries_children = entries.get('children')
         if entries_children:
diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py
index a4ad9f1d..c52db274 100755
--- a/bin/lib/ail_core.py
+++ b/bin/lib/ail_core.py
@@ -15,7 +15,8 @@ config_loader = ConfigLoader()
 r_serv_db = config_loader.get_db_conn("Kvrocks_DB")
 config_loader = None

-AIL_OBJECTS = sorted({'cve', 'cryptocurrency', 'decoded', 'domain', 'favicon', 'item', 'pgp', 'screenshot', 'title', 'username'})
+AIL_OBJECTS = sorted({'cookie-name', 'cve', 'cryptocurrency', 'decoded', 'domain', 'favicon', 'item', 'pgp',
+                      'screenshot', 'title', 'username'})

 def get_ail_uuid():
     ail_uuid = r_serv_db.get('ail:uuid')
diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py
index 226131ab..d1d7e3b6 100755
--- a/bin/lib/correlations_engine.py
+++ b/bin/lib/correlations_engine.py
@@ -41,10 +41,11 @@ config_loader = None
 ##################################

 CORRELATION_TYPES_BY_OBJ = {
+    "cookie-name": ["domain"],
     "cryptocurrency": ["domain", "item"],
     "cve": ["domain", "item"],
     "decoded": ["domain", "item"],
-    "domain": ["cve", "cryptocurrency", "decoded", "favicon", "item", "pgp", "title", "screenshot", "username"],
+    "domain": ["cve", "cookie-name", "cryptocurrency", "decoded", "favicon", "item", "pgp", "title", "screenshot", "username"],
     "favicon": ["domain", "item"],  # TODO Decoded
     "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"],
     "pgp": ["domain", "item"],
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 32fc4d6c..1883490e 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -36,6 +36,7 @@ sys.path.append(os.environ['AIL_BIN'])
 # Import Project packages
 ##################################
 from packages import git_status
+from packages import Date
 from lib.ConfigLoader import ConfigLoader
 from lib.objects.Domains import Domain
 from lib.objects.Items import Item
@@ -74,8 +75,8 @@ def get_current_date(separator=False):
 def get_date_crawled_items_source(date):
     return os.path.join('crawled', date)

-def get_date_har_dir(date):
-    return os.path.join(HAR_DIR, date)
+def get_har_dir():
+    return HAR_DIR

 def is_valid_onion_domain(domain):
     if not domain.endswith('.onion'):
@@ -265,6 +266,88 @@ def extract_author_from_html(html):
     return ''

 # # # - - # # #

+
+# # # # # # # #
+#             #
+#     HAR     #
+#             #
+# # # # # # # #
+
+def create_har_id(date, item_id):
+    item_id = item_id.split('/')[-1]
+    return os.path.join(date, f'{item_id}.json')
+
+def save_har(har_id, har_content):
+    # create dir
+    har_dir = os.path.dirname(os.path.join(get_har_dir(), har_id))
+    if not os.path.exists(har_dir):
+        os.makedirs(har_dir)
+    # save HAR
+    filename = os.path.join(get_har_dir(), har_id)
+    with open(filename, 'w') as f:
+        f.write(json.dumps(har_content))
+
+def get_all_har_ids():
+    har_ids = []
+    today_root_dir = os.path.join(HAR_DIR, Date.get_today_date_str(separator=True))
+    # year directories: 4-digit names only
+    dirs_year = set()
+    for ydir in next(os.walk(HAR_DIR))[1]:
+        if len(ydir) == 4:
+            try:
+                int(ydir)
+                dirs_year.add(ydir)
+            except (TypeError, ValueError):
+                pass
+
+    for file in [f for f in os.listdir(today_root_dir) if os.path.isfile(os.path.join(today_root_dir, f))]:
+        har_id = os.path.relpath(os.path.join(today_root_dir, file), HAR_DIR)
+        har_ids.append(har_id)
+
+    for ydir in sorted(dirs_year, reverse=False):
+        search_dir = os.path.join(HAR_DIR, ydir)
+        for root, dirs, files in os.walk(search_dir):
+            for file in files:
+                if root != today_root_dir:
+                    har_id = os.path.relpath(os.path.join(root, file), HAR_DIR)
+                    har_ids.append(har_id)
+    return har_ids
+
+def extract_cookies_names_from_har_by_har_id(har_id):
+    har_path = os.path.join(HAR_DIR, har_id)
+    with open(har_path) as f:
+        try:
+            har_content = json.loads(f.read())
+        except json.decoder.JSONDecodeError:
+            har_content = {}
+    return extract_cookies_names_from_har(har_content)
+
+def extract_cookies_names_from_har(har):
+    cookies = set()
+    for entry in har.get('log', {}).get('entries', []):
+        for cookie in entry.get('request', {}).get('cookies', []):
+            name = cookie.get('name')
+            if name:
+                cookies.add(name)
+        for cookie in entry.get('response', {}).get('cookies', []):
+            name = cookie.get('name')
+            if name:
+                cookies.add(name)
+    return cookies
+
+def _reprocess_all_hars():
+    from lib.objects import CookiesNames
+    for har_id in get_all_har_ids():
+        domain = har_id.split('/')[-1]
+        domain = domain[:-41]  # strip the capture uuid4 (36 chars) + '.json'
+        date = har_id.split('/')
+        date = f'{date[-4]}{date[-3]}{date[-2]}'
+        for cookie_name in extract_cookies_names_from_har_by_har_id(har_id):
+            print(domain, date, cookie_name)
+            cookie = CookiesNames.create(cookie_name)
+            cookie.add(date, domain)
+
+# # # - - # # #
+
 ################################################################################

 # # TODO:
@@ -1555,14 +1638,6 @@ def create_item_id(item_dir, domain):
         UUID = domain+str(uuid.uuid4())
     return os.path.join(item_dir, UUID)

-def save_har(har_dir, item_id, har_content):
-    if not os.path.exists(har_dir):
-        os.makedirs(har_dir)
-    item_id = item_id.split('/')[-1]
-    filename = os.path.join(har_dir, item_id + '.json')
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
-
 # # # # # # # #
 # # # # # # # #
 # CRAWLER MANAGER #  # TODO REFACTOR ME
@@ -1801,3 +1876,5 @@ load_blacklist()
 # temp_url = ''
 # r = extract_favicon_from_html(content, temp_url)
 # print(r)
+# _reprocess_all_hars()
+
diff --git a/bin/lib/objects/CookiesNames.py b/bin/lib/objects/CookiesNames.py
new file mode 100755
index 00000000..b064892b
--- /dev/null
+++ b/bin/lib/objects/CookiesNames.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+
+from hashlib import sha256
+from flask import url_for
+
+from pymisp import MISPObject
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib.ConfigLoader import ConfigLoader
+from lib.objects.abstract_daterange_object import AbstractDaterangeObject, AbstractDaterangeObjects
+
+config_loader = ConfigLoader()
+r_objects = config_loader.get_db_conn("Kvrocks_Objects")
+baseurl = config_loader.get_config_str("Notifications", "ail_domain")
+config_loader = None
+
+# TODO NEW ABSTRACT OBJECT -> daterange for all objects ????
+
+class CookieName(AbstractDaterangeObject):
+    """
+    AIL CookieName Object.
+    """
+
+    def __init__(self, obj_id):
+        super(CookieName, self).__init__('cookie-name', obj_id)
+
+    # def get_ail_2_ail_payload(self):
+    #     payload = {'raw': self.get_gzip_content(b64=True),
+    #                'compress': 'gzip'}
+    #     return payload
+
+    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
+    def delete(self):
+        # # TODO:
+        pass
+
+    def get_content(self, r_type='str'):
+        if r_type == 'str':
+            return self._get_field('content')
+
+    def get_link(self, flask_context=False):
+        if flask_context:
+            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
+        else:
+            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
+        return url
+
+    # TODO # CHANGE COLOR
+    def get_svg_icon(self):
+        return {'style': 'fas', 'icon': '\uf564', 'color': '#BFD677', 'radius': 5}  # f563
+
+    def get_misp_object(self):
+        obj_attrs = []
+        obj = MISPObject('cookie')
+        first_seen = self.get_first_seen()
+        last_seen = self.get_last_seen()
+        if first_seen:
+            obj.first_seen = first_seen
+        if last_seen:
+            obj.last_seen = last_seen
+        if not first_seen or not last_seen:
+            self.logger.warning(
+                f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={first_seen}, last={last_seen}')
+
+        obj_attrs.append(obj.add_attribute('cookie-name', value=self.get_content()))
+        for obj_attr in obj_attrs:
+            for tag in self.get_tags():
+                obj_attr.add_tag(tag)
+        return obj
+
+    def get_nb_seen(self):
+        return self.get_nb_correlation('domain')
+
+    def get_meta(self, options=set()):
+        meta = self._get_meta(options=options)
+        meta['id'] = self.id
+        meta['tags'] = self.get_tags(r_list=True)
+        meta['content'] = self.get_content()
+        return meta
+
+    def add(self, date, obj_id):  # date = HAR Date
+        self._add(date, 'domain', '', obj_id)
+
+    def create(self, content, _first_seen=None, _last_seen=None):
+        if not isinstance(content, str):
+            content = content.decode()
+        self._set_field('content', content)
+        self._create()
+
+
+def create(content):
+    if isinstance(content, str):
+        content = content.encode()
+    obj_id = sha256(content).hexdigest()
+    cookie = CookieName(obj_id)
+    if not cookie.exists():
+        cookie.create(content)
+    return cookie
+
+
+class CookiesNames(AbstractDaterangeObjects):
+    """
+    CookieName Objects
+    """
+    def __init__(self):
+        super().__init__('cookie-name', CookieName)
+
+    def sanitize_id_to_search(self, name_to_search):
+        return name_to_search  # TODO
+
+
+# if __name__ == '__main__':
+#     name_to_search = '98'
+#     print(search_cves_by_name(name_to_search))
diff --git a/bin/lib/objects/Cves.py b/bin/lib/objects/Cves.py
index ed550822..02361636 100755
--- a/bin/lib/objects/Cves.py
+++ b/bin/lib/objects/Cves.py
@@ -80,7 +80,7 @@ class Cve(AbstractDaterangeObject):
         return meta

     def add(self, date, item_id):
-        self._add(date, item_id)
+        self._add(date, 'item', '', item_id)

     def get_cve_search(self):
         try:
diff --git a/bin/lib/objects/Decodeds.py b/bin/lib/objects/Decodeds.py
index abe45584..a936a1a9 100755
--- a/bin/lib/objects/Decodeds.py
+++ b/bin/lib/objects/Decodeds.py
@@ -228,7 +228,7 @@ class Decoded(AbstractDaterangeObject):
         return True

     def add(self, algo_name, date, obj_id, mimetype=None):
-        self._add(date, obj_id)
+        self._add(date, 'item', '', obj_id)

         if not mimetype:
             mimetype = self.get_mimetype()
diff --git a/bin/lib/objects/Favicons.py b/bin/lib/objects/Favicons.py
index 469e84ff..68452b65 100755
--- a/bin/lib/objects/Favicons.py
+++ b/bin/lib/objects/Favicons.py
@@ -86,8 +86,8 @@ class Favicon(AbstractDaterangeObject):
     # def get_links(self):
     #     # TODO GET ALL URLS FROM CORRELATED ITEMS

-    def add(self, date, item_id):  # TODO correlation base 64 -> calc md5
-        self._add(date, item_id)
+    def add(self, date, obj_id):  # TODO correlation base 64 -> calc md5
+        self._add(date, 'domain', '', obj_id)

     def create(self, content, _first_seen=None, _last_seen=None):
         if not isinstance(content, str):
diff --git a/bin/lib/objects/Titles.py b/bin/lib/objects/Titles.py
index f633a0cf..9f88426c 100755
--- a/bin/lib/objects/Titles.py
+++ b/bin/lib/objects/Titles.py
@@ -83,7 +83,7 @@ class Title(AbstractDaterangeObject):
         return meta

     def add(self, date, item_id):
-        self._add(date, item_id)
+        self._add(date, 'item', '', item_id)

     def create(self, content, _first_seen=None, _last_seen=None):
         self._set_field('content', content)
diff --git a/bin/lib/objects/abstract_daterange_object.py b/bin/lib/objects/abstract_daterange_object.py
index 674c6219..1465c371 100755
--- a/bin/lib/objects/abstract_daterange_object.py
+++ b/bin/lib/objects/abstract_daterange_object.py
@@ -126,7 +126,7 @@ class AbstractDaterangeObject(AbstractObject, ABC):

     # TODO don't increase nb if same hash in item with different encoding
     # if hash already in item
-    def _add(self, date, item_id):
+    def _add(self, date, obj_type, subtype, obj_id):
         if not self.exists():
             self._add_create()
             self.set_first_seen(date)
@@ -135,15 +135,22 @@
             self.update_daterange(date)
         update_obj_date(date, self.type)

-        # NB Object seen by day
-        if not self.is_correlated('item', '', item_id):  # if decoded not already in object
-            r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
-
-        # Correlations
-        self.add_correlation('item', '', item_id)
-        if is_crawled(item_id):  # Domain
-            domain = get_item_domain(item_id)
-            self.add_correlation('domain', '', domain)
+        self.add_correlation(obj_type, subtype, obj_id)
+
+        if obj_type == 'item':
+            # NB Object seen by day TODO
+            if not self.is_correlated(obj_type, subtype, obj_id):  # nb seen by day
+                r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
+            if is_crawled(obj_id):  # Domain
+                domain = get_item_domain(obj_id)
+                self.add_correlation('domain', '', domain)
+        else:
+            # TODO Don't increase on reprocess
+            r_object.zincrby(f'{self.type}:date:{date}', 1, self.id)
+            # r_object.zincrby(f'{self.type}:obj:{obj_type}', 1, self.id)
+            # 1 Domain by day / 1 HAR by day
+            # Domain check / file created -> issue with scheduler

     # TODO:ADD objects + Stats
     def _create(self, first_seen=None, last_seen=None):
diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py
index a768cee8..b17ce3e6 100755
--- a/bin/lib/objects/ail_objects.py
+++ b/bin/lib/objects/ail_objects.py
@@ -15,6 +15,7 @@ from lib import btc_ail
 from lib import Tag

 from lib.objects import CryptoCurrencies
+from lib.objects import CookiesNames
 from lib.objects.Cves import Cve
 from lib.objects.Decodeds import Decoded, get_all_decodeds_objects, get_nb_decodeds_objects
 from lib.objects.Domains import Domain
@@ -53,6 +54,8 @@ def get_object(obj_type, subtype, id):
         return Domain(id)
     elif obj_type == 'decoded':
         return Decoded(id)
+    elif obj_type == 'cookie-name':
+        return CookiesNames.CookieName(id)
     elif obj_type == 'cve':
         return Cve(id)
     elif obj_type == 'favicon':
diff --git a/bin/packages/Date.py b/bin/packages/Date.py
index 804f6973..49bf38eb 100644
--- a/bin/packages/Date.py
+++ b/bin/packages/Date.py
@@ -81,7 +81,7 @@ class Date(object):

 def get_today_date_str(separator=False):
     if separator:
-        datetime.date.today().strftime("%Y/%m/%d")
+        return datetime.date.today().strftime("%Y/%m/%d")
     else:
         return datetime.date.today().strftime("%Y%m%d")

diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py
index 63addc22..cc110c35 100755
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@@ -50,6 +50,7 @@ from blueprints.objects_cve import objects_cve
 from blueprints.objects_decoded import objects_decoded
 from blueprints.objects_subtypes import objects_subtypes
 from blueprints.objects_title import objects_title
+from blueprints.objects_cookie_name import objects_cookie_name

 Flask_dir = os.environ['AIL_FLASK']

@@ -104,6 +105,8 @@ app.register_blueprint(objects_cve, url_prefix=baseUrl)
 app.register_blueprint(objects_decoded, url_prefix=baseUrl)
 app.register_blueprint(objects_subtypes, url_prefix=baseUrl)
 app.register_blueprint(objects_title, url_prefix=baseUrl)
+app.register_blueprint(objects_cookie_name, url_prefix=baseUrl)
+
 # ========= =========#

 # ========= Cookie name ========
diff --git a/var/www/blueprints/correlation.py b/var/www/blueprints/correlation.py
index 38a2b73e..b43cfe1e 100644
--- a/var/www/blueprints/correlation.py
+++ b/var/www/blueprints/correlation.py
@@ -80,6 +80,9 @@ def show_correlation():

         ## get all selected correlations
         filter_types = []
+        correl_option = request.form.get('CookieNameCheck')
+        if correl_option:
+            filter_types.append('cookie-name')
         correl_option = request.form.get('CveCheck')
         if correl_option:
             filter_types.append('cve')
diff --git a/var/www/blueprints/objects_cookie_name.py b/var/www/blueprints/objects_cookie_name.py
new file mode 100644
index 00000000..ab111ff2
--- /dev/null
+++ b/var/www/blueprints/objects_cookie_name.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+'''
+    Blueprint Flask: cookie-name objects endpoints: daterange objects, chart JSON ...
+'''
+
+import os
+import sys
+
+from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort
+from flask_login import login_required, current_user
+
+# Import Role_Manager
+from Role_Manager import login_admin, login_analyst, login_read_only
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib.objects import CookiesNames
+from packages import Date
+
+# ============ BLUEPRINT ============
+objects_cookie_name = Blueprint('objects_cookie_name', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/cookie-name'))
+
+# ============ VARIABLES ============
+bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
+
+
+# ============ FUNCTIONS ============
+@objects_cookie_name.route("/objects/cookie-name", methods=['GET'])
+@login_required
+@login_read_only
+def objects_cookies_names():
+    date_from = request.args.get('date_from')
+    date_to = request.args.get('date_to')
+    show_objects = request.args.get('show_objects')
+    date = Date.sanitise_date_range(date_from, date_to)
+    date_from = date['date_from']
+    date_to = date['date_to']
+
+    if show_objects:
+        dict_objects = CookiesNames.CookiesNames().api_get_meta_by_daterange(date_from, date_to)
+    else:
+        dict_objects = {}
+
+    print(dict_objects)
+
+    return render_template("CookieNameDaterange.html", date_from=date_from, date_to=date_to,
+                           dict_objects=dict_objects, show_objects=show_objects)
+
+@objects_cookie_name.route("/objects/cookie-name/post", methods=['POST'])
+@login_required
+@login_read_only
+def objects_cookies_names_post():
+    date_from = request.form.get('date_from')
+    date_to = request.form.get('date_to')
+    show_objects = request.form.get('show_objects')
+    return redirect(url_for('objects_cookie_name.objects_cookies_names', date_from=date_from, date_to=date_to, show_objects=show_objects))
+
+@objects_cookie_name.route("/objects/cookie-name/range/json", methods=['GET'])
+@login_required
+@login_read_only
+def objects_cookie_name_range_json():
+    date_from = request.args.get('date_from')
+    date_to = request.args.get('date_to')
+    date = Date.sanitise_date_range(date_from, date_to)
+    date_from = date['date_from']
+    date_to = date['date_to']
+    return jsonify(CookiesNames.CookiesNames().api_get_chart_nb_by_daterange(date_from, date_to))
+
+# @objects_cookie_name.route("/objects/cookie-name/search", methods=['POST'])
+# @login_required
+# @login_read_only
+# def objects_cookies_names_search():
+#     to_search = request.form.get('object_id')
+#
+#     # TODO SANITIZE ID
+#     # TODO Search all
+#     cve = Cves.Cve(to_search)
+#     if not cve.exists():
+#         abort(404)
+#     else:
+#         return redirect(cve.get_link(flask_context=True))
+
+# ============= ROUTES ==============
+
diff --git a/var/www/templates/correlation/legend_graph_correlation.html b/var/www/templates/correlation/legend_graph_correlation.html
index ce5dc7ed..63e18eaa 100644
--- a/var/www/templates/correlation/legend_graph_correlation.html
+++ b/var/www/templates/correlation/legend_graph_correlation.html
@@ -111,6 +111,16 @@
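
Note: below is a minimal standalone sketch of the cookie-name flow this patch wires together, for review purposes. It mirrors extract_cookies_names_from_har() from bin/lib/crawlers.py and the sha256 object id computed inside CookiesNames.create(); the sample_har dict, the cookie_name_obj_id() helper, and the __main__ driver are illustrative only and not part of the patch.

#!/usr/bin/env python3
# Sketch: extract unique cookie names from a HAR capture and derive
# the object id the patch uses to key each CookieName object.

from hashlib import sha256

def extract_cookies_names_from_har(har):
    # Same logic as the patched bin/lib/crawlers.py: collect unique
    # cookie names from both request and response cookies of every entry.
    cookies = set()
    for entry in har.get('log', {}).get('entries', []):
        for cookie in entry.get('request', {}).get('cookies', []):
            name = cookie.get('name')
            if name:
                cookies.add(name)
        for cookie in entry.get('response', {}).get('cookies', []):
            name = cookie.get('name')
            if name:
                cookies.add(name)
    return cookies

def cookie_name_obj_id(name):
    # Mirrors CookiesNames.create(): the object id is the sha256 of the
    # cookie name, so the same name always maps to the same object.
    return sha256(name.encode()).hexdigest()

if __name__ == '__main__':
    # Illustrative HAR fragment (not from the patch)
    sample_har = {'log': {'entries': [
        {'request': {'cookies': [{'name': 'PHPSESSID', 'value': 'x'}]},
         'response': {'cookies': [{'name': 'cf_clearance', 'value': 'y'}]}},
    ]}}
    for name in extract_cookies_names_from_har(sample_har):
        print(name, '->', cookie_name_obj_id(name))

Keying objects by the hash of the name means one node per distinct cookie name, so a tracker cookie seen on many crawled domains correlates to a single cookie-name object, which is exactly what the new "cookie-name": ["domain"] entry in CORRELATION_TYPES_BY_OBJ exposes in the correlation graph.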