diff --git a/OVERVIEW.md b/OVERVIEW.md index 68efa81f..14aac71b 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -261,6 +261,9 @@ Redis and ARDB overview | set_pgpdump_name:*name* | *item_path* | | | | | set_pgpdump_mail:*mail* | *item_path* | +| | | +| | | +| set_domain_pgpdump_**pgp_type**:**key** | **domain** | ##### Hset date: | Key | Field | Value | @@ -288,11 +291,20 @@ Redis and ARDB overview | item_pgpdump_name:*item_path* | *name* | | | | | item_pgpdump_mail:*item_path* | *mail* | +| | | +| | | +| domain_pgpdump_**pgp_type**:**domain** | **key** | #### Cryptocurrency Supported cryptocurrency: - bitcoin +- bitcoin-cash +- dash +- etherum +- litecoin +- monero +- zcash ##### Hset: | Key | Field | Value | @@ -303,7 +315,8 @@ Supported cryptocurrency: ##### set: | Key | Value | | ------ | ------ | -| set_cryptocurrency_**cryptocurrency name**:**cryptocurrency address** | **item_path** | +| set_cryptocurrency_**cryptocurrency name**:**cryptocurrency address** | **item_path** | PASTE +| domain_cryptocurrency_**cryptocurrency name**:**cryptocurrency address** | **domain** | DOMAIN ##### Hset date: | Key | Field | Value | @@ -318,8 +331,14 @@ Supported cryptocurrency: ##### set: | Key | Value | | ------ | ------ | -| item_cryptocurrency_**cryptocurrency name**:**item_path** | **cryptocurrency address** | +| item_cryptocurrency_**cryptocurrency name**:**item_path** | **cryptocurrency address** | PASTE +| domain_cryptocurrency_**cryptocurrency name**:**item_path** | **cryptocurrency address** | DOMAIN +#### HASH +| Key | Value | +| ------ | ------ | +| hash_domain:**domain** | **hash** | +| domain_hash:**hash** | **domain** | ## DB9 - Crawler: @@ -362,6 +381,20 @@ Supported cryptocurrency: } ``` +##### CRAWLER QUEUES: +| SET - Key | Value | +| ------ | ------ | +| onion_crawler_queue | **url**;**item_id** | RE-CRAWL +| regular_crawler_queue | - | +| | | +| onion_crawler_priority_queue | **url**;**item_id** | USER +| regular_crawler_priority_queue | - | +| | | +| onion_crawler_discovery_queue | **url**;**item_id** | DISCOVER +| regular_crawler_discovery_queue | - | + +##### TO CHANGE: + ARDB overview ----------------------------------------- SENTIMENT ------------------------------------ diff --git a/bin/Decoder.py b/bin/Decoder.py index 76228dfb..82133de7 100755 --- a/bin/Decoder.py +++ b/bin/Decoder.py @@ -18,6 +18,7 @@ from pubsublogger import publisher from Helper import Process from packages import Paste +from packages import Item import re import signal @@ -120,6 +121,12 @@ def save_hash(decoder_name, message, date, decoded): serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - paste map serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this paste + # Domain Object + if Item.is_crawled(message): + domain = Item.get_item_domain(message) + serv_metadata.sadd('hash_domain:{}'.format(domain), hash) # domain - hash map + serv_metadata.sadd('domain_hash:{}'.format(hash), domain) # hash - domain map + def save_hash_on_disk(decode, type, hash, json_data): diff --git a/bin/PgpDump.py b/bin/PgpDump.py index 4b7ec629..a269734f 100755 --- a/bin/PgpDump.py +++ b/bin/PgpDump.py @@ -21,6 +21,8 @@ from bs4 import BeautifulSoup from Helper import Process from packages import Paste +from packages import Pgp + class TimeoutException(Exception): pass @@ -117,31 +119,6 @@ def extract_id_from_output(pgp_dump_outpout): key_id = key_id.replace(key_id_str, '', 1) set_key.add(key_id) -def save_pgp_data(type_pgp, date, item_path, data): - # create basic medata - if not serv_metadata.exists('pgpdump_metadata_{}:{}'.format(type_pgp, data)): - serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'first_seen', date) - serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen', date) - else: - last_seen = serv_metadata.hget('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen') - if not last_seen: - serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen', date) - else: - if int(last_seen) < int(date): - serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen', date) - - # global set - serv_metadata.sadd('set_pgpdump_{}:{}'.format(type_pgp, data), item_path) - - # daily - serv_metadata.hincrby('pgpdump:{}:{}'.format(type_pgp, date), data, 1) - - # all type - serv_metadata.zincrby('pgpdump_all:{}'.format(type_pgp), data, 1) - - # item_metadata - serv_metadata.sadd('item_pgpdump_{}:{}'.format(type_pgp, item_path), data) - if __name__ == '__main__': # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) @@ -236,12 +213,12 @@ if __name__ == '__main__': for key_id in set_key: print(key_id) - save_pgp_data('key', date, message, key_id) + Pgp.save_pgp_data('key', date, message, key_id) for name_id in set_name: print(name_id) - save_pgp_data('name', date, message, name_id) + Pgp.save_pgp_data('name', date, message, name_id) for mail_id in set_mail: print(mail_id) - save_pgp_data('mail', date, message, mail_id) + Pgp.save_pgp_data('mail', date, message, mail_id) diff --git a/bin/Tags.py b/bin/Tags.py index 2bf30d87..46c63d46 100755 --- a/bin/Tags.py +++ b/bin/Tags.py @@ -16,6 +16,8 @@ import datetime from pubsublogger import publisher from Helper import Process from packages import Paste +from packages import Item + def get_item_date(item_filename): l_directory = item_filename.split('/') @@ -84,6 +86,12 @@ if __name__ == '__main__': set_tag_metadata(tag, item_date) server_metadata.sadd('tag:{}'.format(path), tag) + # Domain Object + if Item.is_crawled(path): + domain = Item.get_item_domain(path) + server_metadata.sadd('tag:{}'.format(domain), tag) + server.sadd('domain:{}:{}'.format(tag, item_date), domain) + curr_date = datetime.date.today().strftime("%Y%m%d") server.hincrby('daily_tags:{}'.format(item_date), tag, 1) p.populate_set_out(message, 'MISP_The_Hive_feeder') diff --git a/bin/packages/Correlation.py b/bin/packages/Correlation.py index b769600b..cf923049 100755 --- a/bin/packages/Correlation.py +++ b/bin/packages/Correlation.py @@ -2,8 +2,10 @@ # -*-coding:UTF-8 -* import os +import sys import redis +sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules/')) import Flask_config r_serv_metadata = Flask_config.r_serv_metadata @@ -14,9 +16,11 @@ class Correlation(object): def __init__(self, correlation_name): self.correlation_name = correlation_name - def _exist_corelation_field(self, correlation_type, field_name): - return r_serv_metadata.exists('set_{}_{}:{}'.format(self.correlation_name, correlation_type, field_name)) - + def _exist_corelation_field(self, correlation_type, field_name, item_type='paste'): + if type=='paste': + return r_serv_metadata.exists('set_{}_{}:{}'.format(self.correlation_name, correlation_type, field_name)) + else: + return r_serv_metadata.exists('set_domain_{}_{}:{}'.format(self.correlation_name, correlation_type, field_name)) def _get_items(self, correlation_type, field_name): res = r_serv_metadata.smembers('set_{}_{}:{}'.format(self.correlation_name, correlation_type, field_name)) @@ -25,6 +29,12 @@ class Correlation(object): else: return [] + def _get_domains(self, correlation_type, field_name): + res = r_serv_metadata.smembers('set_domain_{}_{}:{}'.format(self.correlation_name, correlation_type, field_name)) + if res: + return list(res) + else: + return [] def _get_metadata(self, correlation_type, field_name): meta_dict = {} @@ -35,14 +45,14 @@ class Correlation(object): def _get_correlation_by_date(self, correlation_type, date): return r_serv_metadata.hkeys('{}:{}:{}'.format(self.correlation_name, correlation_type, date)) - def verify_correlation_field_request(self, request_dict, correlation_type): + def verify_correlation_field_request(self, request_dict, correlation_type, item_type='paste'): if not request_dict: - return Response({'status': 'error', 'reason': 'Malformed JSON'}, 400) + return ({'status': 'error', 'reason': 'Malformed JSON'}, 400) field_name = request_dict.get(correlation_type, None) if not field_name: return ( {'status': 'error', 'reason': 'Mandatory parameter(s) not provided'}, 400 ) - if not self._exist_corelation_field(correlation_type, field_name): + if not self._exist_corelation_field(correlation_type, field_name, item_type=item_type): return ( {'status': 'error', 'reason': 'Item not found'}, 404 ) def get_correlation(self, request_dict, correlation_type, field_name): @@ -58,7 +68,37 @@ class Correlation(object): return (dict_resp, 200) + def get_correlation_domain(self, request_dict, correlation_type, field_name): + dict_resp = {} + dict_resp['domain'] = self._get_domains(correlation_type, field_name) + #if request_dict.get('metadata'): + # dict_resp['metadata'] = self._get_metadata(correlation_type, field_name) -#cryptocurrency_all:cryptocurrency name cryptocurrency address nb seen + dict_resp[correlation_type] = field_name + + return (dict_resp, 200) + +######## INTERNAL ######## + +def _get_domain_correlation_obj(correlation_name, correlation_type, domain): + print('domain_{}_{}:{}'.format(correlation_name, correlation_type, domain)) + res = r_serv_metadata.smembers('domain_{}_{}:{}'.format(correlation_name, correlation_type, domain)) + if res: + return list(res) + else: + return [] + +######## ######## + +######## API EXPOSED ######## + +def get_domain_correlation_obj(request_dict, correlation_name, correlation_type, domain): + dict_resp = {} + dict_resp[correlation_type] = _get_domain_correlation_obj(correlation_name, correlation_type, domain) + dict_resp['domain'] = domain + + return (dict_resp, 200) + +######## ######## diff --git a/bin/packages/Cryptocurrency.py b/bin/packages/Cryptocurrency.py index 995ada9e..eb5c00e6 100755 --- a/bin/packages/Cryptocurrency.py +++ b/bin/packages/Cryptocurrency.py @@ -10,11 +10,13 @@ from hashlib import sha256 sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) import Flask_config from Correlation import Correlation +import Item r_serv_metadata = Flask_config.r_serv_metadata +all_cryptocurrency = ['bitcoin', 'etherum'] + digits58 = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' -#address_validation = {'bitcoin': 'base58', 'dash': 'base58'} cryptocurrency = Correlation('cryptocurrency') @@ -52,6 +54,21 @@ def get_cryptocurrency(request_dict, cryptocurrency_type): return cryptocurrency.get_correlation(request_dict, cryptocurrency_type, field_name) +# # TODO: add get all cryptocurrency option +def get_cryptocurrency_domain(request_dict, cryptocurrency_type): + res = cryptocurrency.verify_correlation_field_request(request_dict, cryptocurrency_type, item_type='domain') + if res: + return res + field_name = request_dict.get(cryptocurrency_type) + if not verify_cryptocurrency_address(cryptocurrency_type, field_name): + return ( {'status': 'error', 'reason': 'Invalid Cryptocurrency address'}, 400 ) + + return cryptocurrency.get_correlation_domain(request_dict, cryptocurrency_type, field_name) + +def get_domain_cryptocurrency(request_dict, cryptocurrency_type): + return cryptocurrency.get_domain_correlation_obj(self, request_dict, cryptocurrency_type, domain) + + def save_cryptocurrency_data(cryptocurrency_name, date, item_path, cryptocurrency_address): # create basic medata if not r_serv_metadata.exists('cryptocurrency_metadata_{}:{}'.format(cryptocurrency_name, cryptocurrency_address)): @@ -65,7 +82,8 @@ def save_cryptocurrency_data(cryptocurrency_name, date, item_path, cryptocurrenc if int(last_seen) < int(date): r_serv_metadata.hset('cryptocurrency_metadata_{}:{}'.format(cryptocurrency_name, cryptocurrency_address), 'last_seen', date) - # global set + ## global set + # item r_serv_metadata.sadd('set_cryptocurrency_{}:{}'.format(cryptocurrency_name, cryptocurrency_address), item_path) # daily @@ -74,5 +92,12 @@ def save_cryptocurrency_data(cryptocurrency_name, date, item_path, cryptocurrenc # all type r_serv_metadata.zincrby('cryptocurrency_all:{}'.format(cryptocurrency_name), cryptocurrency_address, 1) - # item_metadata + ## object_metadata + # item r_serv_metadata.sadd('item_cryptocurrency_{}:{}'.format(cryptocurrency_name, item_path), cryptocurrency_address) + + # domain + if Item.is_crawled(item_path): + domain = Item.get_item_domain(item_path) + r_serv_metadata.sadd('domain_cryptocurrency_{}:{}'.format(cryptocurrency_name, domain), cryptocurrency_address) + r_serv_metadata.sadd('set_domain_cryptocurrency_{}:{}'.format(cryptocurrency_name, cryptocurrency_address), domain) diff --git a/bin/packages/Domain.py b/bin/packages/Domain.py new file mode 100755 index 00000000..76f97735 --- /dev/null +++ b/bin/packages/Domain.py @@ -0,0 +1,85 @@ +#!/usr/bin/python3 + +""" +The ``Domain`` +=================== + + +""" + +import os +import sys +import time +import redis + +import Item + +sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules/')) +import Flask_config + +r_serv_onion = Flask_config.r_serv_onion + +def get_domain_type(domain): + if str(domain).endswith('.onion'): + return 'onion' + else: + return 'regular' + +def get_all_domain_up_by_type(domain_type): + if domain_type in domains: + list_domain = list(r_serv_onion.smembers('full_{}_up'.format(domain_type))) + return ({'type': domain_type, 'domains': list_domain}, 200) + else: + return ({"status": "error", "reason": "Invalid domain type"}, 400) + +def get_domain_items(domain, root_item_id): + dom_item = get_domain_item_children(domain, root_item_id) + dom_item.append(root_item_id) + return dom_item + +def get_domain_item_children(domain, root_item_id): + all_items = [] + for item_id in Item.get_item_children(root_item_id): + if Item.is_item_in_domain(domain, item_id): + all_items.append(item_id) + all_items.extend(get_domain_item_children(domain, item_id)) + return all_items + +def get_link_tree(): + pass + + +### +### correlation +### + +def _get_domain_correlation(domain, correlation_name=None, correlation_type=None): + res = r_serv_metadata.smembers('item_{}_{}:{}'.format(correlation_name, correlation_type, item_id)) + if res: + return list(res) + else: + return [] + +def get_item_bitcoin(item_id): + return _get_item_correlation('cryptocurrency', 'bitcoin', item_id) + +def get_item_pgp_key(item_id): + return _get_item_correlation('pgpdump', 'key', item_id) + +def get_item_pgp_name(item_id): + return _get_item_correlation('pgpdump', 'name', item_id) + +def get_item_pgp_mail(item_id): + return _get_item_correlation('pgpdump', 'mail', item_id) + +def get_item_pgp_correlation(item_id): + pass + + +class Domain(object): + """docstring for Domain.""" + + def __init__(self, domain, port=80): + self.domain = str(domain) + ## TODO: handle none port + self.type = get_domain_type(domain) diff --git a/bin/packages/Item.py b/bin/packages/Item.py index 4f7aa851..680b8f97 100755 --- a/bin/packages/Item.py +++ b/bin/packages/Item.py @@ -125,7 +125,6 @@ def get_item(request_dict): ### def _get_item_correlation(correlation_name, correlation_type, item_id): - print('item_{}_{}:{}'.format(correlation_name, correlation_type, item_id)) res = r_serv_metadata.smembers('item_{}_{}:{}'.format(correlation_name, correlation_type, item_id)) if res: return list(res) @@ -144,6 +143,8 @@ def get_item_pgp_name(item_id): def get_item_pgp_mail(item_id): return _get_item_correlation('pgpdump', 'mail', item_id) +def get_item_pgp_correlation(item_id): + pass ### ### GET Internal Module DESC @@ -153,3 +154,29 @@ def get_item_list_desc(list_item_id): for item_id in list_item_id: desc_list.append( {'id': item_id, 'date': get_item_date(item_id), 'tags': Tag.get_item_tags(item_id)} ) return desc_list + +# # TODO: add an option to check the tag +def is_crawled(item_id): + return item_id.startswith('crawled') + +def is_onion(item_id): + is_onion = False + if len(is_onion) > 62: + if is_crawled(item_id) and item_id[-42:-36] == '.onion': + is_onion = True + return is_onion + +def is_item_in_domain(domain, item_id): + is_in_domain = False + domain_lenght = len(domain) + if len(item_id) > (domain_lenght+48): + if item_id[-36-domain_lenght:-36] == domain: + is_in_domain = True + return is_in_domain + +def get_item_domain(item_id): + return item_id[19:-36] + + +def get_item_children(item_id): + return list(r_serv_metadata.smembers('paste_children:{}'.format(item_id))) diff --git a/bin/packages/Pgp.py b/bin/packages/Pgp.py index 9c7b0ec4..12ff34fa 100755 --- a/bin/packages/Pgp.py +++ b/bin/packages/Pgp.py @@ -2,14 +2,18 @@ # -*-coding:UTF-8 -* import os +import sys import redis from hashlib import sha256 +sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) import Flask_config -from Correlation import Correlation -r_serv_metadata = Flask_config.r_serv_metadata +from Correlation import Correlation +import Item + +serv_metadata = Flask_config.r_serv_metadata pgpdump = Correlation('pgpdump') @@ -23,3 +27,36 @@ def get_pgp(request_dict, pgp_type): field_name = request_dict.get(pgp_type) return pgpdump.get_correlation(request_dict, pgp_type, field_name) + +def save_pgp_data(type_pgp, date, item_path, data): + # create basic medata + if not serv_metadata.exists('pgpdump_metadata_{}:{}'.format(type_pgp, data)): + serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'first_seen', date) + serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen', date) + else: + last_seen = serv_metadata.hget('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen') + if not last_seen: + serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen', date) + else: + if int(last_seen) < int(date): + serv_metadata.hset('pgpdump_metadata_{}:{}'.format(type_pgp, data), 'last_seen', date) + + # global set + serv_metadata.sadd('set_pgpdump_{}:{}'.format(type_pgp, data), item_path) + + # daily + serv_metadata.hincrby('pgpdump:{}:{}'.format(type_pgp, date), data, 1) + + # all type + serv_metadata.zincrby('pgpdump_all:{}'.format(type_pgp), data, 1) + + ## object_metadata + # paste + serv_metadata.sadd('item_pgpdump_{}:{}'.format(type_pgp, item_path), data) + + + # domain object + if Item.is_crawled(item_path): + domain = Item.get_item_domain(item_path) + serv_metadata.sadd('domain_pgpdump_{}:{}'.format(type_pgp, domain), data) + serv_metadata.sadd('set_domain_pgpdump_{}:{}'.format(type_pgp, data), domain) diff --git a/bin/packages/Tag.py b/bin/packages/Tag.py index 70d7e72e..f1147715 100755 --- a/bin/packages/Tag.py +++ b/bin/packages/Tag.py @@ -121,6 +121,11 @@ def add_item_tag(tag, item_path): r_serv_metadata.sadd('tag:{}'.format(item_path), tag) r_serv_tags.sadd('{}:{}'.format(tag, item_date), item_path) + if Item.is_crawled(item_path): + domain = Item.get_item_domain(item_path) + r_serv_metadata.sadd('tag:{}'.format(domain), tag) + r_serv_tags.sadd('domain:{}:{}'.format(tag, item_date), domain) + r_serv_tags.hincrby('daily_tags:{}'.format(item_date), tag, 1) tag_first_seen = r_serv_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')