From aa6ba61050a70abaa9d691cbd087ab07185bba56 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 8 Sep 2022 10:31:57 +0200 Subject: [PATCH] chg: [statistics] ARDB migration --- bin/BankAccount.py | 21 ++-- bin/Crawler.py | 8 -- bin/DB_KVROCKS_MIGRATION.py | 109 ++++++++++++++++-- bin/LAUNCH.sh | 8 +- bin/Mail.py | 220 ------------------------------------ bin/ModuleStats.py | 156 ------------------------- bin/lib/Statistics.py | 128 ++++++++++++++++++++- bin/lib/crawlers.py | 6 + bin/modules/Credential.py | 10 +- bin/modules/CreditCards.py | 3 - bin/modules/Mail.py | 177 +++++++++++++++++++++++++++++ bin/modules/ModuleStats.py | 54 +++++++++ bin/packages/modules.cfg | 2 +- 13 files changed, 482 insertions(+), 420 deletions(-) delete mode 100755 bin/Mail.py delete mode 100755 bin/ModuleStats.py create mode 100755 bin/modules/Mail.py create mode 100755 bin/modules/ModuleStats.py diff --git a/bin/BankAccount.py b/bin/BankAccount.py index e02468f1..f6888084 100755 --- a/bin/BankAccount.py +++ b/bin/BankAccount.py @@ -16,6 +16,13 @@ import re import string from itertools import chain +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages # +################################## +from lib import Statistics + + from packages import Item from pubsublogger import publisher @@ -48,6 +55,7 @@ def is_valid_iban(iban): return True return False +# # TODO: SET def check_all_iban(l_iban, obj_id): nb_valid_iban = 0 for iban in l_iban: @@ -61,7 +69,8 @@ def check_all_iban(l_iban, obj_id): if is_valid_iban(iban): print('------') nb_valid_iban = nb_valid_iban + 1 - server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1) + Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1) + if(nb_valid_iban > 0): to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id)) @@ -70,9 +79,6 @@ def check_all_iban(l_iban, obj_id): msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id) p.populate_set_out(msg, 'Tags') - #Send to duplicate - p.populate_set_out(obj_id, 'Duplicate') - if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" @@ -82,13 +88,6 @@ if __name__ == "__main__": p = Process(config_section) max_execution_time = p.config.getint("BankAccount", "max_execution_time") - # ARDB # - server_statistics = redis.StrictRedis( - host=p.config.get("ARDB_Statistics", "host"), - port=p.config.getint("ARDB_Statistics", "port"), - db=p.config.getint("ARDB_Statistics", "db"), - decode_responses=True) - publisher.info("BankAccount started") #iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b') diff --git a/bin/Crawler.py b/bin/Crawler.py index f02a9929..3ec4922f 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -314,14 +314,6 @@ if __name__ == '__main__': print('splash url: {}'.format(splash_url)) - PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) - - r_serv_metadata = redis.StrictRedis( - host=p.config.get("ARDB_Metadata", "host"), - port=p.config.getint("ARDB_Metadata", "port"), - db=p.config.getint("ARDB_Metadata", "db"), - decode_responses=True) - r_cache = redis.StrictRedis( host=p.config.get("Redis_Cache", "host"), port=p.config.getint("Redis_Cache", "port"), diff --git a/bin/DB_KVROCKS_MIGRATION.py b/bin/DB_KVROCKS_MIGRATION.py index 529088fc..ee2b4d4c 100755 --- a/bin/DB_KVROCKS_MIGRATION.py +++ b/bin/DB_KVROCKS_MIGRATION.py @@ -15,6 +15,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project 
packages ################################## from lib.ConfigLoader import ConfigLoader +from lib import Statistics from lib import Tag from lib import Users from lib.objects import Decodeds @@ -35,6 +36,8 @@ r_serv_tracker = config_loader.get_redis_conn("ARDB_Tracker") r_serv_tags = config_loader.get_redis_conn("ARDB_Tags") r_crawler = config_loader.get_redis_conn("ARDB_Onion") r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") +r_serv_trend = config_loader.get_redis_conn("ARDB_Trending") +r_statistics = config_loader.get_redis_conn("ARDB_Statistics") config_loader = None # # - - CONFIGS - - # # @@ -358,9 +361,16 @@ def items_migration(): def get_last_crawled_domains(domain_type): return r_crawler.lrange(f'last_{domain_type}', 0 ,-1) +def get_domains_blacklist(domain_type): + return r_crawler.smembers(f'blacklist_{domain_type}') + def crawler_migration(): print('CRAWLER MIGRATION...') + for domain_type in ['onion', 'regular']: + for domain in get_domains_blacklist(domain_type): + crawlers.add_domain_blacklist(domain_type, domain) + # for domain_type in ['onion', 'regular']: # for row in get_last_crawled_domains(domain_type): # dom_row, epoch = row.rsplit(';', 1) @@ -368,19 +378,18 @@ def crawler_migration(): # print(domain, port, epoch) # #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch) - for cookiejar_uuid in old_crawlers.get_all_cookiejar(): - meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True) - #print(meta) - #crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid) - #crawlers._set_cookiejar_date(meta['date']) - - for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True): - print(cookie_uuid) - #crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid) + # for cookiejar_uuid in old_crawlers.get_all_cookiejar(): + # meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True) + # #print(meta) + # crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid) + # crawlers._set_cookiejar_date(meta['date']) + # + # for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True): + # print(cookie_uuid) + # crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid) # TODO: auto crawler -> to Fix / change - # TODO: crawlers queues ############################### @@ -689,12 +698,89 @@ def subtypes_obj_migration(): # # Credential: # HSET 'credential_by_tld:'+date, tld, 1 + +def get_all_provider(): + return r_serv_trend.smembers('all_provider_set') + +def get_item_source_stats_by_date(date, source): + stats = {} + stats['num'] = r_serv_trend.hget(f'{source}_num', date) + stats['size'] = r_serv_trend.hget(f'{source}_size', date) + stats['avg'] = r_serv_trend.hget(f'{source}_avg', date) + return stats + +def get_item_stats_size_avg_by_date(date): + return r_serv_trend.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True) + +def get_item_stats_nb_by_date(date): + return r_serv_trend.zrange(f'providers_set_{date}', 0, -1, withscores=True) + +def get_top_stats_module(module_name, date): + return r_serv_trend.zrange(f'top_{module_name}_set_{date}', 0, -1, withscores=True) + +def get_module_tld_stats_by_date(module, date): + return r_statistics.hgetall(f'{module}_by_tld:{date}') + def statistics_migration(): + # paste_by_modules_timeout + + # Date full 
history => lot of keys + + + # top_size_set_{date} + # top_avg_size_set_{date} + + # 'providers_set_{date} + + + + sources = get_all_provider() + for date in Date.get_date_range_today('20180101'): + + size_avg = get_item_stats_size_avg_by_date(date) + + nb_items = get_item_stats_nb_by_date(date) + + # top_size_set_{date} + # top_avg_size_set_{date} + + # 'providers_set_{date} + + # ITEM STATS + for source in sources: + source_stat = get_item_source_stats_by_date(date, source) + Statistics._create_item_stats_size_nb(date, source, source_stat['num'], source_stat['size'], source_stat['avg']) + + + + # # MODULE STATS + # for module in ['credential', 'mail', 'SQLInjection']: + # stats = get_module_tld_stats_by_date(module, date) + # for tld in stats: + # if tld: + # print(module, date, tld, stats[tld]) + # Statistics.add_module_tld_stats_by_date(module, date, tld, stats[tld]) + # for module in ['credential']: + # # TOP STATS + # top_module = get_top_stats_module(module, date) + # for keyword, total_sum in top_module: + # print(date, module, keyword, total_sum) + # #Statistics._add_module_stats(module, total_sum, keyword, date) + + + + + + + + + pass + if __name__ == '__main__': - core_migration() + #core_migration() # user_migration() # tags_migration() #items_migration() @@ -706,6 +792,7 @@ if __name__ == '__main__': # ail_2_ail_migration() # trackers_migration() # investigations_migration() + statistics_migration() diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index aa547a8f..502bf6bd 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -217,8 +217,12 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x" + sleep 0.1 # screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x" # sleep 0.1 + screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x" + sleep 0.1 screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x" sleep 0.1 @@ -267,8 +271,6 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x" sleep 0.1 - screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Mail.py; read x" - sleep 0.1 screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./PgpDump.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "Cryptocurrency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cryptocurrencies.py; read x" @@ -277,8 +279,6 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x" sleep 0.1 - screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./ModuleStats.py; read x" - sleep 0.1 screen -S "Script_AIL" -X screen -t "MISPtheHIVEfeeder" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./MISP_The_Hive_feeder.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x" diff --git a/bin/Mail.py b/bin/Mail.py deleted file mode 100755 index 3b39e71f..00000000 --- a/bin/Mail.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -""" -The Mails Module -====================== - -This module is consuming 
the Redis-list created by the Categ module. - -It apply mail regexes on item content and warn if above a threshold. - -""" - -import os -import re -import sys -import uuid -import redis -import time -import datetime - -import dns.resolver -import dns.exception - -from multiprocessing import Process as Proc - -from pubsublogger import publisher -from Helper import Process - -from pyfaup.faup import Faup - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) -import Item - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader - -## LOAD CONFIG ## -config_loader = ConfigLoader.ConfigLoader() -server_statistics = config_loader.get_redis_conn("ARDB_Statistics") -r_serv_cache = config_loader.get_redis_conn("Redis_Cache") - -dns_server = config_loader.get_config_str('Mail', 'dns') - -config_loader = None -## -- ## - -def is_mxdomain_in_cache(mxdomain): - return r_serv_cache.exists('mxdomain:{}'.format(mxdomain)) - -def save_mxdomain_in_cache(mxdomain): - r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1)) - -def check_mx_record(set_mxdomains, dns_server): - """Check if emails MX domains are responding. - - :param adress_set: -- (set) This is a set of emails domains - :return: (int) Number of adress with a responding and valid MX domains - - """ - resolver = dns.resolver.Resolver() - resolver.nameservers = [dns_server] - resolver.timeout = 5.0 - resolver.lifetime = 2.0 - - valid_mxdomain = [] - for mxdomain in set_mxdomains: - - # check if is in cache - # # TODO: - if is_mxdomain_in_cache(mxdomain): - valid_mxdomain.append(mxdomain) - else: - - # DNS resolution - try: - answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX) - if answers: - save_mxdomain_in_cache(mxdomain) - valid_mxdomain.append(mxdomain) - # DEBUG - # print('---') - # print(answers.response) - # print(answers.qname) - # print(answers.rdtype) - # print(answers.rdclass) - # print(answers.nameserver) - # print() - - except dns.resolver.NoNameservers: - publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.') - print('NoNameserver, No non-broken nameservers are available to answer the query.') - except dns.resolver.NoAnswer: - publisher.debug('NoAnswer, The response did not contain an answer to the question.') - print('NoAnswer, The response did not contain an answer to the question.') - except dns.name.EmptyLabel: - publisher.debug('SyntaxError: EmptyLabel') - print('SyntaxError: EmptyLabel') - except dns.resolver.NXDOMAIN: - #save_mxdomain_in_cache(mxdomain) - publisher.debug('The query name does not exist.') - print('The query name does not exist.') - except dns.name.LabelTooLong: - publisher.debug('The Label is too long') - print('The Label is too long') - except dns.exception.Timeout: - print('dns timeout') - #save_mxdomain_in_cache(mxdomain) - except Exception as e: - print(e) - return valid_mxdomain - -def extract_all_emails(redis_key, item_content): - all_emails = re.findall(email_regex, item_content) - if len(all_emails) > 1: - r_serv_cache.sadd(redis_key, *all_emails) - r_serv_cache.expire(redis_key, 360) - elif all_emails: - r_serv_cache.sadd(redis_key, all_emails[0]) - r_serv_cache.expire(redis_key, 360) - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'Mail' - - faup = Faup() - - p = Process(config_section) - - publisher.info("Mails module started") - - # Numbers of Mails needed to Tags - mail_threshold = 10 - - max_execution_time = 30 - - email_regex = 
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" - - redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4())) - - while True: - message = p.get_from_set() - - if message is not None: - item_id, score = message.split() - - print(item_id) - - item_content = Item.get_item_content(item_id) - - proc = Proc(target=extract_all_emails, args=(redis_key, item_content, )) - try: - proc.start() - proc.join(max_execution_time) - if proc.is_alive(): - proc.terminate() - p.incr_module_timeout_statistic() - err_mess = "Mails: processing timeout: {}".format(item_id) - print(err_mess) - publisher.info(err_mess) - continue - else: - all_emails = r_serv_cache.smembers(redis_key) - r_serv_cache.delete(redis_key) - proc.terminate() - except KeyboardInterrupt: - print("Caught KeyboardInterrupt, terminating workers") - proc.terminate() - sys.exit(0) - - # get MXdomains - set_mxdomains = set() - dict_mxdomains_email = {} - for email in all_emails: - mxdomain = email.split('@')[1].lower() - if not mxdomain in dict_mxdomains_email: - dict_mxdomains_email[mxdomain] = [] - set_mxdomains.add(mxdomain) - dict_mxdomains_email[mxdomain].append(email) - - ## TODO: add MAIL trackers - - valid_mx = check_mx_record(set_mxdomains, dns_server) - - item_date = Item.get_item_date(item_id) - - num_valid_email = 0 - for domain_mx in valid_mx: - num_valid_email += len(dict_mxdomains_email[domain_mx]) - - for email in dict_mxdomains_email[domain_mx]: - msg = 'mail;{};{};{}'.format(1, email, item_date) - p.populate_set_out(msg, 'ModuleStats') - - # Create country stats - faup.decode(email) - tld = faup.get()['tld'] - try: - tld = tld.decode() - except: - pass - server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1) - - msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date, Item.get_item_basename(item_id), num_valid_email, item_id) - - if num_valid_email > mail_threshold: - print('{} Checked {} e-mail(s)'.format(item_id, num_valid_email)) - publisher.warning(msg) - #Send to duplicate - p.populate_set_out(item_id, 'Duplicate') - #tags - msg = 'infoleak:automatic-detection="mail";{}'.format(item_id) - p.populate_set_out(msg, 'Tags') - else: - publisher.info(msg) - - else: - time.sleep(10) diff --git a/bin/ModuleStats.py b/bin/ModuleStats.py deleted file mode 100755 index 240c315b..00000000 --- a/bin/ModuleStats.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* -""" - This module makes statistics for some modules and providers - -""" - -################################## -# Import External packages # -################################## -import time -import datetime -import redis -import os -import sys - -sys.path.append(os.environ['AIL_BIN']) -################################## -# Import Project packages # -################################## -from modules.abstract_module import AbstractModule -from packages.Date import Date -from packages import Paste -import ConfigLoader - - -class ModuleStats(AbstractModule): - """ - Module Statistics module for AIL framework - """ - - # Config Var - MAX_SET_CARDINALITY = 8 - - - def __init__(self): - - super(ModuleStats, self).__init__() - - # Waiting time in secondes between to message proccessed - self.pending_seconds = 20 - - # Sent to the logging a description of the module - self.redis_logger.info("Makes statistics about valid URL") - - # REDIS # - self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending") - - def compute(self, message): - - if len(message.split(';')) > 1: - 
self.compute_most_posted(message) - else: - self.compute_provider_info(message) - - - def get_date_range(self, num_day): - curr_date = datetime.date.today() - date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2)) - date_list = [] - - for i in range(0, num_day+1): - date_list.append(date.substract_day(i)) - return date_list - - - def compute_most_posted(self, message): - module, num, keyword, paste_date = message.split(';') - - redis_progression_name_set = 'top_'+ module +'_set_' + paste_date - - # Add/Update in Redis - self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num)) - - # Compute Most Posted - date = self.get_date_range(0)[0] - # check if this keyword is eligible for progression - keyword_total_sum = 0 - - curr_value = self.r_serv_trend.hget(date, module+'-'+keyword) - keyword_total_sum += int(curr_value) if curr_value is not None else 0 - - if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY: - self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword) - - else: # not in set - member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1) - # Member set is a list of (value, score) pairs - if int(member_set[0][1]) < keyword_total_sum: - #remove min from set and add the new one - self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')') - self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0]) - self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword) - self.redis_logger.debug(redis_progression_name_set) - - - def compute_provider_info(self, message): - redis_all_provider = 'all_provider_set' - - paste = Paste.Paste(message) - - paste_baseName = paste.p_name.split('.')[0] - paste_size = paste._get_p_size() - paste_provider = paste.p_source - paste_date = str(paste._get_p_date()) - redis_sum_size_set = 'top_size_set_' + paste_date - redis_avg_size_name_set = 'top_avg_size_set_' + paste_date - redis_providers_name_set = 'providers_set_' + paste_date - - # Add/Update in Redis - self.r_serv_trend.sadd(redis_all_provider, paste_provider) - - num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1)) - sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size)) - new_avg = float(sum_size) / float(num_paste) - self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg) - - - # - # Compute Most Posted - # - - # Size - if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil": - self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider) - self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider) - else: #set full capacity - member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1) - # Member set is a list of (value, score) pairs - if float(member_set[0][1]) < new_avg: - #remove min from set and add the new one - self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')') - self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0]) - self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider) - self.r_serv_trend.zrem(redis_avg_size_name_set, 
member_set[0][0]) - self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider) - - - # Num - # if set not full or provider already present - if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil": - self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider) - else: #set at full capacity - member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1) - # Member set is a list of (value, score) pairs - if int(member_set[0][1]) < num_paste: - #remove min from set and add the new one - self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')') - self.r_serv_trend.zrem(member_set[0][0]) - self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider) - - -if __name__ == '__main__': - - module = ModuleStats() - module.run() diff --git a/bin/lib/Statistics.py b/bin/lib/Statistics.py index f0944754..fcad048a 100755 --- a/bin/lib/Statistics.py +++ b/bin/lib/Statistics.py @@ -10,9 +10,133 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) import ConfigLoader config_loader = ConfigLoader.ConfigLoader() -r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics") +r_statistics = config_loader.get_redis_conn("ARDB_Statistics") +#r_serv_trend = ConfigLoader().get_redis_conn("ARDB_Trending") config_loader = None +PIE_CHART_MAX_CARDINALITY = 8 + def incr_module_timeout_statistic(module_name): curr_date = datetime.date.today() - r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1) + r_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1) + +def create_item_statistics(item_id, source, size): + pass + +def get_item_sources(): + return r_statistics.smembers('all_provider_set') + +def get_nb_items_processed_by_day_and_source(date, source): + nb_items = r_statistics.hget(f'{source}_num', date) + if not nb_items: + nb_items = 0 + return int(nb_items) + +def get_items_total_size_by_day_and_source(date, source): + total_size = r_statistics.hget(f'{source}_size', date) + if not total_size: + total_size = 0 + return float(total_size) + +def get_items_av_size_by_day_and_source(date, source): + av_size = r_statistics.hget(f'{source}_avg', date) + if not av_size: + av_size = 0 + return float(av_size) + +def _create_item_stats_size_nb(date, source, num, size, avg): + r_statistics.hset(f'{source}_num', date, num) + r_statistics.hset(f'{source}_size', date, size) + r_statistics.hset(f'{source}_avg', date, avg) + +def get_item_stats_size_avg_by_date(): + return r_statistics.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True) + +def get_item_stats_nb_by_date(): + return r_statistics.zrange(f'providers_set_{date}', 0, -1, withscores=True) + +def _set_item_stats_nb_by_date(date, source): + return r_statistics.zrange(f'providers_set_{date}', ) + + +# # TODO: load ZSET IN CACHE => FAST UPDATE +def update_item_stats_size_nb(item_id, source, size, date): + # Add/Update in Redis + r_statistics.sadd('all_provider_set', source) + + nb_items = int(r_statistics.hincrby(f'{source}_num', date, 1)) + sum_size = float(r_statistics.hincrbyfloat(f'{source}_size', date, size)) + new_avg = sum_size / nb_items + r_statistics.hset(f'{source}_avg', date, new_avg) + + # TOP Items Size + if 
r_statistics.zcard(f'top_avg_size_set_{date}') < PIE_CHART_MAX_CARDINALITY:
+        r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
+    else:
+        member_set = r_statistics.zrangebyscore(f'top_avg_size_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
+        # Member set is a list of (value, score) pairs
+        if float(member_set[0][1]) < new_avg:
+            # remove min from set and add the new one
+            r_statistics.zrem(f'top_avg_size_set_{date}', member_set[0][0])
+            r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
+
+    # TOP Nb Items
+    if r_statistics.zcard(f'providers_set_{date}') < PIE_CHART_MAX_CARDINALITY or r_statistics.zscore(f'providers_set_{date}', source) is not None:
+        r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
+    else: # zset at full capacity
+        member_set = r_statistics.zrangebyscore(f'providers_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
+        # Member set is a list of (value, score) pairs
+        if int(member_set[0][1]) < nb_items:
+            # remove min from set and add the new one
+            r_statistics.zrem(f'providers_set_{date}', member_set[0][0])
+            r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
+
+# keyword num
+
+def _add_module_stats(module_name, total_sum, keyword, date):
+    r_statistics.zadd(f'top_{module_name}_set_{date}', float(total_sum), keyword)
+
+# # TODO: ONE HSET BY MODULE / CUSTOM STATS
+def update_module_stats(module_name, num, keyword, date):
+
+    # Add/Update in Redis
+    r_statistics.hincrby(date, f'{module_name}-{keyword}', int(num)) # # TODO: RENAME ME !!!!!!!!!!!!!!!!!!!!!!!!!
+
+    # Compute Most Posted
+    # check if this keyword is eligible for progression
+    keyword_total_sum = 0
+
+    curr_value = r_statistics.hget(date, f'{module_name}-{keyword}')
+    keyword_total_sum += int(curr_value) if curr_value is not None else 0
+
+    if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY:
+        r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
+    else: # zset at full capacity
+        member_set = r_statistics.zrangebyscore(f'top_{module_name}_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
+        # Member set is a list of (value, score) pairs
+        if int(member_set[0][1]) < keyword_total_sum:
+            # remove min from set and add the new one
+            r_statistics.zrem(f'top_{module_name}_set_{date}', member_set[0][0])
+            r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
+
+def get_module_tld_stats_by_tld_date(date, tld):
+    nb_tld = r_statistics.hget(f'credential_by_tld:{date}', tld)
+    if not nb_tld:
+        nb_tld = 0
+    return int(nb_tld)
+
+def get_module_tld_stats_by_date(module, date):
+    return r_statistics.hgetall(f'{module}_by_tld:{date}')
+
+def add_module_tld_stats_by_date(module, date, tld, nb):
+    r_statistics.hincrby(f'{module}_by_tld:{date}', tld, int(nb))
+
+
+def get_iban_country_stats_by_date(date):
+    return r_statistics.hgetall(f'iban_by_country:{date}')
+
+def add_iban_country_stats_by_date(date, tld, nb):
+    r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))
+
+# r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
+# r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 4e62b328..471e978d 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -616,6 +616,12 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
     else:
         return ({'error':'invalid input'}, 400)
 
+def get_domains_blacklist(domain_type):
+    return
r_serv_onion.smembers(f'blacklist_{domain_type}') + +def add_domain_blacklist(domain_type, domain): + r_serv_onion.sadd(f'blacklist_{domain_type}', domain) + ##-- CRAWLER GLOBAL --## #### AUTOMATIC CRAWLER #### diff --git a/bin/modules/Credential.py b/bin/modules/Credential.py index 5e14fa13..ea8956ec 100755 --- a/bin/modules/Credential.py +++ b/bin/modules/Credential.py @@ -42,6 +42,7 @@ from modules.abstract_module import AbstractModule from packages.Item import Item from lib import ConfigLoader from lib import regex_helper +from lib import Statistics class Credential(AbstractModule): @@ -96,6 +97,7 @@ class Credential(AbstractModule): item_content = item.get_content() + # TODO: USE SETS # Extract all credentials all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time) @@ -117,9 +119,6 @@ class Credential(AbstractModule): print(f"========> Found more than 10 credentials in this file : {item.get_id()}") self.redis_logger.warning(to_print) - # Send to duplicate - self.send_message_to_queue(item.get_id(), 'Duplicate') - msg = f'infoleak:automatic-detection="credential";{item.get_id()}' self.send_message_to_queue(msg, 'Tags') @@ -158,6 +157,7 @@ class Credential(AbstractModule): print(f"=======> Probably on : {discovered_sites}") date = datetime.now().strftime("%Y%m") + nb_tlds = {} for cred in all_credentials: maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0] self.faup.decode(maildomains) @@ -167,7 +167,9 @@ class Credential(AbstractModule): tld = tld.decode() except: pass - self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1) + nb_tlds[tld] = nb_tlds.get(tld, 0) + 1 + for tld in nb_tlds: + Statistics.add_module_tld_stats_by_date('credential', date, tld, nb_tlds[tld]) else: self.redis_logger.info(to_print) print(f'found {nb_cred} credentials') diff --git a/bin/modules/CreditCards.py b/bin/modules/CreditCards.py index 8aa1415e..70a119ae 100755 --- a/bin/modules/CreditCards.py +++ b/bin/modules/CreditCards.py @@ -75,9 +75,6 @@ class CreditCards(AbstractModule): if (len(creditcard_set) > 0): self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}') - #Send to duplicate - self.send_message_to_queue(item.get_id(), 'Duplicate') - msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}' self.send_message_to_queue(msg, 'Tags') diff --git a/bin/modules/Mail.py b/bin/modules/Mail.py new file mode 100755 index 00000000..78742774 --- /dev/null +++ b/bin/modules/Mail.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +""" +The Mails Module +====================== + +This module is consuming the Redis-list created by the Categ module. + +It apply mail regexes on item content and warn if above a threshold. 
+ +""" + +import os +import re +import redis +import sys +import time +import datetime + +import dns.resolver +import dns.exception + +from pyfaup.faup import Faup + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages # +################################## +from modules.abstract_module import AbstractModule +from lib.objects.Items import Item +from lib.ConfigLoader import ConfigLoader +from lib import Statistics + + +class Mail(AbstractModule): + """ + Module Mail module for AIL framework + """ + + def __init__(self): + super(Mail, self).__init__() + + config_loader = ConfigLoader() + self.r_cache = config_loader.get_redis_conn("Redis_Cache") + + self.dns_server = config_loader.get_config_str('Mail', 'dns') + + self.faup = Faup() + + # Numbers of Mails needed to Tags + self.mail_threshold = 10 + + self.regex_timeout = 30 + self.email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" + re.compile(self.email_regex) + + def is_mxdomain_in_cache(self, mxdomain): + return self.r_cache.exists(f'mxdomain:{mxdomain}') + + def save_mxdomain_in_cache(self, mxdomain): + self.r_cache.setex(f'mxdomain:{mxdomain}', 1, datetime.timedelta(days=1)) + + def check_mx_record(self, set_mxdomains): + """Check if emails MX domains are responding. + + :param adress_set: -- (set) This is a set of emails domains + :return: (int) Number of adress with a responding and valid MX domains + + """ + resolver = dns.resolver.Resolver() + resolver.nameservers = [self.dns_server] + resolver.timeout = 5.0 + resolver.lifetime = 2.0 + + valid_mxdomain = [] + for mxdomain in set_mxdomains: + + # check if is in cache + # # TODO: + if self.is_mxdomain_in_cache(mxdomain): + valid_mxdomain.append(mxdomain) + else: + + # DNS resolution + try: + answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX) + if answers: + self.save_mxdomain_in_cache(mxdomain) + valid_mxdomain.append(mxdomain) + # DEBUG + # print('---') + # print(answers.response) + # print(answers.qname) + # print(answers.rdtype) + # print(answers.rdclass) + # print(answers.nameserver) + # print() + + except dns.resolver.NoNameservers: + self.redis_logger.debug('NoNameserver, No non-broken nameservers are available to answer the query.') + print('NoNameserver, No non-broken nameservers are available to answer the query.') + except dns.resolver.NoAnswer: + self.redis_logger.debug('NoAnswer, The response did not contain an answer to the question.') + print('NoAnswer, The response did not contain an answer to the question.') + except dns.name.EmptyLabel: + self.redis_logger.debug('SyntaxError: EmptyLabel') + print('SyntaxError: EmptyLabel') + except dns.resolver.NXDOMAIN: + #save_mxdomain_in_cache(mxdomain) + self.redis_logger.debug('The query name does not exist.') + print('The query name does not exist.') + except dns.name.LabelTooLong: + self.redis_logger.debug('The Label is too long') + print('The Label is too long') + except dns.exception.Timeout: + print('dns timeout') + #save_mxdomain_in_cache(mxdomain) + except Exception as e: + print(e) + return valid_mxdomain + + # # TODO: sanityze mails + def compute(self, message): + item_id, score = message.split() + item = Item(item_id) + item_date = item.get_date() + + mails = self.regex_findall(self.email_regex, item_id, item.get_content()) + mxdomains_email = {} + for mail in mails: + mxdomain = mail.rsplit('@', 1)[1].lower() + if not mxdomain in mxdomains_email: + mxdomains_email[mxdomain] = set() + mxdomains_email[mxdomain].add(mail) + + ## TODO: add MAIL trackers 
+
+        valid_mx = self.check_mx_record(mxdomains_email.keys())
+        print(f'valid_mx: {valid_mx}')
+        mx_tlds = {}
+        num_valid_email = 0
+        for domain_mx in valid_mx:
+
+            nb_mails = len(mxdomains_email[domain_mx])
+            num_valid_email += nb_mails
+
+            # Create domain_mail stats
+            msg = f'mail;{nb_mails};{domain_mx};{item_date}'
+            self.send_message_to_queue(msg, 'ModuleStats')
+
+            # Create country stats
+            self.faup.decode(domain_mx)
+            tld = self.faup.get()['tld']
+            try:
+                tld = tld.decode()
+            except:
+                pass
+            mx_tlds[tld] = mx_tlds.get(tld, 0) + nb_mails
+        for tld in mx_tlds:
+            Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld])
+
+        msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}'
+        if num_valid_email > self.mail_threshold:
+            print(f'{item_id} Checked {num_valid_email} e-mail(s)')
+            self.redis_logger.warning(msg)
+            # Tags
+            msg = f'infoleak:automatic-detection="mail";{item_id}'
+            self.send_message_to_queue(msg, 'Tags')
+        else:
+            self.redis_logger.info(msg)
+
+
+
+if __name__ == '__main__':
+    module = Mail()
+    #module.compute('tests/2021/01/01/mails.gz 50')
+    module.run()
diff --git a/bin/modules/ModuleStats.py b/bin/modules/ModuleStats.py
new file mode 100755
index 00000000..16e18b51
--- /dev/null
+++ b/bin/modules/ModuleStats.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+    This module makes statistics for some modules and providers
+
+"""
+
+##################################
+# Import External packages       #
+##################################
+import os
+import sys
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages        #
+##################################
+from modules.abstract_module import AbstractModule
+from lib.objects.Items import Item
+from lib import Statistics
+
+
+class ModuleStats(AbstractModule):
+    """
+    Module Statistics module for AIL framework
+    """
+
+
+    def __init__(self):
+
+        super(ModuleStats, self).__init__()
+
+        # Waiting time in seconds between two processed messages
+        self.pending_seconds = 20
+
+    def compute(self, message):
+
+        # MODULE STATS
+        if len(message.split(';')) > 1:
+            module_name, num, keyword, date = message.split(';')
+            Statistics.update_module_stats(module_name, num, keyword, date)
+        # ITEM STATS
+        else:
+            item = Item(message)
+            source = item.get_source()
+            date = item.get_date()
+            size = item.get_size()
+            Statistics.update_item_stats_size_nb(item.get_id(), source, size, date)
+
+
+if __name__ == '__main__':
+
+    module = ModuleStats()
+    module.run()
diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg
index 5b2e672b..3370cb17 100644
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@@ -66,7 +66,7 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R
 
 [CreditCards]
 subscribe = Redis_CreditCards
-publish = Redis_ModuleStats,Redis_Tags
+publish = Redis_Tags
 
 [BankAccount]
 subscribe = Redis_Global
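
Note on the message contract this patch sets up (a minimal sketch, not part of the diff): producer modules such as Mail publish per-keyword counters to the ModuleStats queue as 'module;num;keyword;YYYYMMDD' strings, while item-level statistics are driven by a bare item id; bin/modules/ModuleStats.py distinguishes the two by the number of ';'-separated fields and forwards both to the lib/Statistics.py helpers introduced above. The item id, source, size and date below are made-up examples.

# Sketch only: assumes an AIL checkout with AIL_BIN set and the ARDB_Statistics
# backend (Kvrocks after migration) reachable through ConfigLoader.
import os
import sys

sys.path.append(os.environ['AIL_BIN'])
from lib import Statistics

date = '20220908'                                            # hypothetical date
item_id = 'archive/pastebin.com_pro/2022/09/08/example.gz'   # hypothetical item id

# What ModuleStats does with a 'mail;3;example.com;20220908' message:
Statistics.update_module_stats('mail', 3, 'example.com', date)

# What ModuleStats does with a bare item id message (per-source item stats):
Statistics.update_item_stats_size_nb(item_id, 'pastebin.com_pro', 2048.0, date)

# TLD counters written directly by the Mail / Credential modules:
Statistics.add_module_tld_stats_by_date('mail', date, 'com', 3)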
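
For spot-checking the ARDB to Kvrocks migration performed by statistics_migration(), the read-side helpers added in lib/Statistics.py can be used as in this sketch (date is a hypothetical example; the functions are the ones defined in this patch).

# Sketch only: read back migrated counters per source and per module.
import os
import sys

sys.path.append(os.environ['AIL_BIN'])
from lib import Statistics

date = '20220101'   # hypothetical date
for source in Statistics.get_item_sources():
    nb = Statistics.get_nb_items_processed_by_day_and_source(date, source)
    size = Statistics.get_items_total_size_by_day_and_source(date, source)
    avg = Statistics.get_items_av_size_by_day_and_source(date, source)
    print(source, nb, size, avg)

# Per-module TLD counters and IBAN country counters
print(Statistics.get_module_tld_stats_by_date('credential', date))
print(Statistics.get_iban_country_stats_by_date(date))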