chg: [statistics] ARDB migration

Terrtia 2022-09-08 10:31:57 +02:00
parent d27d47dc70
commit aa6ba61050
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
13 changed files with 482 additions and 420 deletions


@@ -16,6 +16,13 @@ import re
import string
from itertools import chain
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from lib import Statistics
from packages import Item
from pubsublogger import publisher
@@ -48,6 +55,7 @@ def is_valid_iban(iban):
return True
return False
# # TODO: SET
def check_all_iban(l_iban, obj_id):
nb_valid_iban = 0
for iban in l_iban:
@@ -61,7 +69,8 @@ def check_all_iban(l_iban, obj_id):
if is_valid_iban(iban):
print('------')
nb_valid_iban = nb_valid_iban + 1
server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)
Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1)
if(nb_valid_iban > 0):
to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id))
@@ -70,9 +79,6 @@ def check_all_iban(l_iban, obj_id):
msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
p.populate_set_out(msg, 'Tags')
#Send to duplicate
p.populate_set_out(obj_id, 'Duplicate')
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
@@ -82,13 +88,6 @@ if __name__ == "__main__":
p = Process(config_section)
max_execution_time = p.config.getint("BankAccount", "max_execution_time")
# ARDB #
server_statistics = redis.StrictRedis(
host=p.config.get("ARDB_Statistics", "host"),
port=p.config.getint("ARDB_Statistics", "port"),
db=p.config.getint("ARDB_Statistics", "db"),
decode_responses=True)
publisher.info("BankAccount started")
#iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
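The per-country IBAN counters now go through lib.Statistics instead of a direct ARDB hincrby. A minimal usage sketch, assuming an AIL environment (AIL_BIN set) and the Statistics helpers added later in this commit; the date and country code are illustrative:

    import os
    import sys
    sys.path.append(os.environ['AIL_BIN'])
    from lib import Statistics

    date = '20220908'                # YYYYMMDD, illustrative value
    # count one valid IBAN; the first two characters are the country code
    Statistics.add_iban_country_stats_by_date(date, 'DE', 1)
    # read back the per-country counters for that day (dict: country -> count)
    print(Statistics.get_iban_country_stats_by_date(date))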


@@ -314,14 +314,6 @@ if __name__ == '__main__':
print('splash url: {}'.format(splash_url))
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
r_serv_metadata = redis.StrictRedis(
host=p.config.get("ARDB_Metadata", "host"),
port=p.config.getint("ARDB_Metadata", "port"),
db=p.config.getint("ARDB_Metadata", "db"),
decode_responses=True)
r_cache = redis.StrictRedis(
host=p.config.get("Redis_Cache", "host"),
port=p.config.getint("Redis_Cache", "port"),


@@ -15,6 +15,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
from lib import Statistics
from lib import Tag
from lib import Users
from lib.objects import Decodeds
@@ -35,6 +36,8 @@ r_serv_tracker = config_loader.get_redis_conn("ARDB_Tracker")
r_serv_tags = config_loader.get_redis_conn("ARDB_Tags")
r_crawler = config_loader.get_redis_conn("ARDB_Onion")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
r_serv_trend = config_loader.get_redis_conn("ARDB_Trending")
r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
config_loader = None
# # - - CONFIGS - - # #
@@ -358,9 +361,16 @@ def items_migration():
def get_last_crawled_domains(domain_type):
return r_crawler.lrange(f'last_{domain_type}', 0 ,-1)
def get_domains_blacklist(domain_type):
return r_crawler.smembers(f'blacklist_{domain_type}')
def crawler_migration():
print('CRAWLER MIGRATION...')
for domain_type in ['onion', 'regular']:
for domain in get_domains_blacklist(domain_type):
crawlers.add_domain_blacklist(domain_type, domain)
# for domain_type in ['onion', 'regular']:
# for row in get_last_crawled_domains(domain_type):
# dom_row, epoch = row.rsplit(';', 1)
@@ -368,19 +378,18 @@ def crawler_migration():
# print(domain, port, epoch)
# #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch)
for cookiejar_uuid in old_crawlers.get_all_cookiejar():
# for cookiejar_uuid in old_crawlers.get_all_cookiejar():
meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
# meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
#print(meta)
# #print(meta)
# crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid)
# crawlers._set_cookiejar_date(meta['date'])
#
for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
# for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
print(cookie_uuid)
# print(cookie_uuid)
# crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid)
# TODO: auto crawler -> to Fix / change
# TODO: crawlers queues
###############################
@@ -689,12 +698,89 @@ def subtypes_obj_migration():
#
# Credential:
# HSET 'credential_by_tld:'+date, tld, 1
def get_all_provider():
return r_serv_trend.smembers('all_provider_set')
def get_item_source_stats_by_date(date, source):
stats = {}
stats['num'] = r_serv_trend.hget(f'{source}_num', date)
stats['size'] = r_serv_trend.hget(f'{source}_size', date)
stats['avg'] = r_serv_trend.hget(f'{source}_avg', date)
return stats
def get_item_stats_size_avg_by_date(date):
return r_serv_trend.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
def get_item_stats_nb_by_date(date):
return r_serv_trend.zrange(f'providers_set_{date}', 0, -1, withscores=True)
def get_top_stats_module(module_name, date):
return r_serv_trend.zrange(f'top_{module_name}_set_{date}', 0, -1, withscores=True)
def get_module_tld_stats_by_date(module, date):
return r_statistics.hgetall(f'{module}_by_tld:{date}')
def statistics_migration():
# paste_by_modules_timeout
# Date full history => lot of keys
# top_size_set_{date}
# top_avg_size_set_{date}
# 'providers_set_{date}
sources = get_all_provider()
for date in Date.get_date_range_today('20180101'):
size_avg = get_item_stats_size_avg_by_date(date)
nb_items = get_item_stats_nb_by_date(date)
# top_size_set_{date}
# top_avg_size_set_{date}
# 'providers_set_{date}
# ITEM STATS
for source in sources:
source_stat = get_item_source_stats_by_date(date, source)
Statistics._create_item_stats_size_nb(date, source, source_stat['num'], source_stat['size'], source_stat['avg'])
# # MODULE STATS
# for module in ['credential', 'mail', 'SQLInjection']:
# stats = get_module_tld_stats_by_date(module, date)
# for tld in stats:
# if tld:
# print(module, date, tld, stats[tld])
# Statistics.add_module_tld_stats_by_date(module, date, tld, stats[tld])
# for module in ['credential']:
# # TOP STATS
# top_module = get_top_stats_module(module, date)
# for keyword, total_sum in top_module:
# print(date, module, keyword, total_sum)
# #Statistics._add_module_stats(module, total_sum, keyword, date)
pass
if __name__ == '__main__':
core_migration()
#core_migration()
# user_migration()
# tags_migration()
#items_migration()
@@ -706,6 +792,7 @@ if __name__ == '__main__':
# ail_2_ail_migration()
# trackers_migration()
# investigations_migration()
statistics_migration()
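statistics_migration() copies the per-source item counters out of ARDB_Trending into the keys managed by lib.Statistics. A hedged spot-check sketch for after the migration has run; the source name and date are illustrative:

    from lib import Statistics

    date = '20200101'                # illustrative day inside the migrated range
    source = 'submitted'             # illustrative source / provider name
    print(Statistics.get_nb_items_processed_by_day_and_source(date, source))
    print(Statistics.get_items_total_size_by_day_and_source(date, source))
    print(Statistics.get_items_av_size_by_day_and_source(date, source))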


@@ -217,8 +217,12 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x"
sleep 0.1
# screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x"
# sleep 0.1
screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x"
sleep 0.1
@@ -267,8 +271,6 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Mail.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./PgpDump.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Cryptocurrency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cryptocurrencies.py; read x"
@@ -277,8 +279,6 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./ModuleStats.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "MISPtheHIVEfeeder" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./MISP_The_Hive_feeder.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x"


@@ -1,220 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Mails Module
======================
This module is consuming the Redis-list created by the Categ module.
It apply mail regexes on item content and warn if above a threshold.
"""
import os
import re
import sys
import uuid
import redis
import time
import datetime
import dns.resolver
import dns.exception
from multiprocessing import Process as Proc
from pubsublogger import publisher
from Helper import Process
from pyfaup.faup import Faup
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
import Item
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
## LOAD CONFIG ##
config_loader = ConfigLoader.ConfigLoader()
server_statistics = config_loader.get_redis_conn("ARDB_Statistics")
r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
dns_server = config_loader.get_config_str('Mail', 'dns')
config_loader = None
## -- ##
def is_mxdomain_in_cache(mxdomain):
return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
def save_mxdomain_in_cache(mxdomain):
r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1))
def check_mx_record(set_mxdomains, dns_server):
"""Check if emails MX domains are responding.
:param adress_set: -- (set) This is a set of emails domains
:return: (int) Number of adress with a responding and valid MX domains
"""
resolver = dns.resolver.Resolver()
resolver.nameservers = [dns_server]
resolver.timeout = 5.0
resolver.lifetime = 2.0
valid_mxdomain = []
for mxdomain in set_mxdomains:
# check if is in cache
# # TODO:
if is_mxdomain_in_cache(mxdomain):
valid_mxdomain.append(mxdomain)
else:
# DNS resolution
try:
answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
if answers:
save_mxdomain_in_cache(mxdomain)
valid_mxdomain.append(mxdomain)
# DEBUG
# print('---')
# print(answers.response)
# print(answers.qname)
# print(answers.rdtype)
# print(answers.rdclass)
# print(answers.nameserver)
# print()
except dns.resolver.NoNameservers:
publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
print('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
publisher.debug('NoAnswer, The response did not contain an answer to the question.')
print('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
publisher.debug('SyntaxError: EmptyLabel')
print('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
#save_mxdomain_in_cache(mxdomain)
publisher.debug('The query name does not exist.')
print('The query name does not exist.')
except dns.name.LabelTooLong:
publisher.debug('The Label is too long')
print('The Label is too long')
except dns.exception.Timeout:
print('dns timeout')
#save_mxdomain_in_cache(mxdomain)
except Exception as e:
print(e)
return valid_mxdomain
def extract_all_emails(redis_key, item_content):
all_emails = re.findall(email_regex, item_content)
if len(all_emails) > 1:
r_serv_cache.sadd(redis_key, *all_emails)
r_serv_cache.expire(redis_key, 360)
elif all_emails:
r_serv_cache.sadd(redis_key, all_emails[0])
r_serv_cache.expire(redis_key, 360)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Mail'
faup = Faup()
p = Process(config_section)
publisher.info("Mails module started")
# Numbers of Mails needed to Tags
mail_threshold = 10
max_execution_time = 30
email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4()))
while True:
message = p.get_from_set()
if message is not None:
item_id, score = message.split()
print(item_id)
item_content = Item.get_item_content(item_id)
proc = Proc(target=extract_all_emails, args=(redis_key, item_content, ))
try:
proc.start()
proc.join(max_execution_time)
if proc.is_alive():
proc.terminate()
p.incr_module_timeout_statistic()
err_mess = "Mails: processing timeout: {}".format(item_id)
print(err_mess)
publisher.info(err_mess)
continue
else:
all_emails = r_serv_cache.smembers(redis_key)
r_serv_cache.delete(redis_key)
proc.terminate()
except KeyboardInterrupt:
print("Caught KeyboardInterrupt, terminating workers")
proc.terminate()
sys.exit(0)
# get MXdomains
set_mxdomains = set()
dict_mxdomains_email = {}
for email in all_emails:
mxdomain = email.split('@')[1].lower()
if not mxdomain in dict_mxdomains_email:
dict_mxdomains_email[mxdomain] = []
set_mxdomains.add(mxdomain)
dict_mxdomains_email[mxdomain].append(email)
## TODO: add MAIL trackers
valid_mx = check_mx_record(set_mxdomains, dns_server)
item_date = Item.get_item_date(item_id)
num_valid_email = 0
for domain_mx in valid_mx:
num_valid_email += len(dict_mxdomains_email[domain_mx])
for email in dict_mxdomains_email[domain_mx]:
msg = 'mail;{};{};{}'.format(1, email, item_date)
p.populate_set_out(msg, 'ModuleStats')
# Create country stats
faup.decode(email)
tld = faup.get()['tld']
try:
tld = tld.decode()
except:
pass
server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1)
msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date, Item.get_item_basename(item_id), num_valid_email, item_id)
if num_valid_email > mail_threshold:
print('{} Checked {} e-mail(s)'.format(item_id, num_valid_email))
publisher.warning(msg)
#Send to duplicate
p.populate_set_out(item_id, 'Duplicate')
#tags
msg = 'infoleak:automatic-detection="mail";{}'.format(item_id)
p.populate_set_out(msg, 'Tags')
else:
publisher.info(msg)
else:
time.sleep(10)


@@ -1,156 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module makes statistics for some modules and providers
"""
##################################
# Import External packages #
##################################
import time
import datetime
import redis
import os
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from modules.abstract_module import AbstractModule
from packages.Date import Date
from packages import Paste
import ConfigLoader
class ModuleStats(AbstractModule):
"""
Module Statistics module for AIL framework
"""
# Config Var
MAX_SET_CARDINALITY = 8
def __init__(self):
super(ModuleStats, self).__init__()
# Waiting time in secondes between to message proccessed
self.pending_seconds = 20
# Sent to the logging a description of the module
self.redis_logger.info("Makes statistics about valid URL")
# REDIS #
self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending")
def compute(self, message):
if len(message.split(';')) > 1:
self.compute_most_posted(message)
else:
self.compute_provider_info(message)
def get_date_range(self, num_day):
curr_date = datetime.date.today()
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
date_list = []
for i in range(0, num_day+1):
date_list.append(date.substract_day(i))
return date_list
def compute_most_posted(self, message):
module, num, keyword, paste_date = message.split(';')
redis_progression_name_set = 'top_'+ module +'_set_' + paste_date
# Add/Update in Redis
self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num))
# Compute Most Posted
date = self.get_date_range(0)[0]
# check if this keyword is eligible for progression
keyword_total_sum = 0
curr_value = self.r_serv_trend.hget(date, module+'-'+keyword)
keyword_total_sum += int(curr_value) if curr_value is not None else 0
if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY:
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
else: # not in set
member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < keyword_total_sum:
#remove min from set and add the new one
self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0])
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
self.redis_logger.debug(redis_progression_name_set)
def compute_provider_info(self, message):
redis_all_provider = 'all_provider_set'
paste = Paste.Paste(message)
paste_baseName = paste.p_name.split('.')[0]
paste_size = paste._get_p_size()
paste_provider = paste.p_source
paste_date = str(paste._get_p_date())
redis_sum_size_set = 'top_size_set_' + paste_date
redis_avg_size_name_set = 'top_avg_size_set_' + paste_date
redis_providers_name_set = 'providers_set_' + paste_date
# Add/Update in Redis
self.r_serv_trend.sadd(redis_all_provider, paste_provider)
num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1))
sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size))
new_avg = float(sum_size) / float(num_paste)
self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg)
#
# Compute Most Posted
#
# Size
if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil":
self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider)
self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
else: #set full capacity
member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if float(member_set[0][1]) < new_avg:
#remove min from set and add the new one
self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0])
self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider)
self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0])
self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
# Num
# if set not full or provider already present
if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil":
self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
else: #set at full capacity
member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < num_paste:
#remove min from set and add the new one
self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
self.r_serv_trend.zrem(member_set[0][0])
self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
if __name__ == '__main__':
module = ModuleStats()
module.run()


@@ -10,9 +10,133 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
config_loader = ConfigLoader.ConfigLoader()
r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics")
r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
#r_serv_trend = ConfigLoader().get_redis_conn("ARDB_Trending")
config_loader = None
PIE_CHART_MAX_CARDINALITY = 8
def incr_module_timeout_statistic(module_name):
curr_date = datetime.date.today()
r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
r_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
def create_item_statistics(item_id, source, size):
pass
def get_item_sources():
return r_statistics.smembers('all_provider_set')
def get_nb_items_processed_by_day_and_source(date, source):
nb_items = r_statistics.hget(f'{source}_num', date)
if not nb_items:
nb_items = 0
return int(nb_items)
def get_items_total_size_by_day_and_source(date, source):
total_size = r_statistics.hget(f'{source}_size', date)
if not total_size:
total_size = 0
return float(total_size)
def get_items_av_size_by_day_and_source(date, source):
av_size = r_statistics.hget(f'{source}_avg', date)
if not av_size:
av_size = 0
return float(av_size)
def _create_item_stats_size_nb(date, source, num, size, avg):
r_statistics.hset(f'{source}_num', date, num)
r_statistics.hset(f'{source}_size', date, size)
r_statistics.hset(f'{source}_avg', date, avg)
def get_item_stats_size_avg_by_date(date):
return r_statistics.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
def get_item_stats_nb_by_date(date):
return r_statistics.zrange(f'providers_set_{date}', 0, -1, withscores=True)
def _set_item_stats_nb_by_date(date, source):
return r_statistics.zrange(f'providers_set_{date}', )
# # TODO: load ZSET IN CACHE => FAST UPDATE
def update_item_stats_size_nb(item_id, source, size, date):
# Add/Update in Redis
r_statistics.sadd('all_provider_set', source)
nb_items = int(r_statistics.hincrby(f'{source}_num', date, 1))
sum_size = float(r_statistics.hincrbyfloat(f'{source}_size', date, size))
new_avg = sum_size / nb_items
r_statistics.hset(f'{source}_avg', date, new_avg)
# TOP Items Size
if r_statistics.zcard(f'top_size_set_{date}') < PIE_CHART_MAX_CARDINALITY:
r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
else:
member_set = r_statistics.zrangebyscore(f'top_avg_size_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if float(member_set[0][1]) < new_avg:
# remove min from set and add the new one
r_statistics.zrem(f'top_avg_size_set_{date}', member_set[0][0])
r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
# TOP Nb Items
if r_statistics.zcard(f'providers_set_{date}') < PIE_CHART_MAX_CARDINALITY or r_statistics.zscore(f'providers_set_{date}', source) != None:
r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
else: # zset at full capacity
member_set = r_statistics.zrangebyscore(f'providers_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < nb_items:
# remove min from set and add the new one
r_statistics.zrem(f'providers_set_{date}', member_set[0][0])
r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
# keyword num
def _add_module_stats(module_name, total_sum, keyword, date):
r_statistics.zadd(f'top_{module_name}_set_{date}', float(total_sum), keyword)
# # TODO: ONE HSET BY MODULE / CUSTOM STATS
def update_module_stats(module_name, num, keyword, date):
# Add/Update in Redis
r_statistics.hincrby(date, f'{module_name}-{keyword}', int(num)) # # TODO: RENAME ME !!!!!!!!!!!!!!!!!!!!!!!!!
# Compute Most Posted
# check if this keyword is eligible for progression
keyword_total_sum = 0
curr_value = r_statistics.hget(date, f'{module_name}-{keyword}')
keyword_total_sum += int(curr_value) if curr_value is not None else 0
if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY:
r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
else: # zset at full capacity
member_set = r_statistics.zrangebyscore(f'top_{module_name}_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < keyword_total_sum:
#remove min from set and add the new one
r_statistics.zrem(f'top_{module_name}_set_{date}', member_set[0][0])
r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
def get_module_tld_stats_by_tld_date(date, tld):
nb_tld = r_statistics.hget(f'credential_by_tld:{date}', tld)
if not nb_tld:
nb_tld = 0
return int(nb_tld)
def get_module_tld_stats_by_date(module, date):
return r_statistics.hgetall(f'{module}_by_tld:{date}')
def add_module_tld_stats_by_date(module, date, tld, nb):
r_statistics.hincrby(f'{module}_by_tld:{date}', tld, int(nb))
def get_iban_country_stats_by_date(date):
return r_statistics.hgetall(f'iban_by_country:{date}')
def add_iban_country_stats_by_date(date, tld, nb):
r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))
# r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
# r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
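As a rough illustration of how a module is expected to feed these helpers (not taken from the commit itself; the item id, source, size and date are illustrative):

    from lib import Statistics

    # record one processed item: increments <source>_num / <source>_size,
    # recomputes <source>_avg and updates the top-N ZSETs for that day
    Statistics.update_item_stats_size_nb('submitted/2022/09/08/example.gz',
                                         'submitted', 12.5, '20220908')

    print(Statistics.get_item_sources())
    print(Statistics.get_nb_items_processed_by_day_and_source('20220908', 'submitted'))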


@@ -616,6 +616,12 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
else:
return ({'error':'invalid input'}, 400)
def get_domains_blacklist(domain_type):
return r_serv_onion.smembers(f'blacklist_{domain_type}')
def add_domain_blacklist(domain_type, domain):
r_serv_onion.sadd(f'blacklist_{domain_type}', domain)
##-- CRAWLER GLOBAL --##
#### AUTOMATIC CRAWLER ####
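A minimal sketch of the new blacklist helpers, assuming crawlers is imported from lib as in the migration script; the onion address is illustrative:

    from lib import crawlers

    crawlers.add_domain_blacklist('onion', 'exampleonionaddressxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.onion')
    print(crawlers.get_domains_blacklist('onion'))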


@@ -42,6 +42,7 @@ from modules.abstract_module import AbstractModule
from packages.Item import Item
from lib import ConfigLoader
from lib import regex_helper
from lib import Statistics
class Credential(AbstractModule):
@@ -96,6 +97,7 @@ class Credential(AbstractModule):
item_content = item.get_content()
# TODO: USE SETS
# Extract all credentials
all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time)
@@ -117,9 +119,6 @@ class Credential(AbstractModule):
print(f"========> Found more than 10 credentials in this file : {item.get_id()}")
self.redis_logger.warning(to_print)
# Send to duplicate
self.send_message_to_queue(item.get_id(), 'Duplicate')
msg = f'infoleak:automatic-detection="credential";{item.get_id()}'
self.send_message_to_queue(msg, 'Tags')
@@ -158,6 +157,7 @@ class Credential(AbstractModule):
print(f"=======> Probably on : {discovered_sites}")
date = datetime.now().strftime("%Y%m")
nb_tlds = {}
for cred in all_credentials:
maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
self.faup.decode(maildomains)
@@ -167,7 +167,9 @@ class Credential(AbstractModule):
tld = tld.decode()
except:
pass
self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
nb_tlds[tld] = nb_tlds.get(tld, 0) + 1
for tld in nb_tlds:
Statistics.add_module_tld_stats_by_date('credential', date, tld, nb_tlds[tld])
else:
self.redis_logger.info(to_print)
print(f'found {nb_cred} credentials')
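The module now aggregates TLD counts locally and pushes them once per TLD. A hedged sketch of the resulting Statistics calls; the counts and the YYYYMM date are illustrative:

    from lib import Statistics

    nb_tlds = {'com': 12, 'fr': 3}   # illustrative per-TLD counts for one item
    date = '202209'                  # the Credential module uses a YYYYMM date
    for tld, nb in nb_tlds.items():
        Statistics.add_module_tld_stats_by_date('credential', date, tld, nb)

    print(Statistics.get_module_tld_stats_by_date('credential', date))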


@@ -75,9 +75,6 @@ class CreditCards(AbstractModule):
if (len(creditcard_set) > 0):
self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}')
#Send to duplicate
self.send_message_to_queue(item.get_id(), 'Duplicate')
msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}'
self.send_message_to_queue(msg, 'Tags')

bin/modules/Mail.py (new executable file, 177 lines)

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Mails Module
======================
This module consumes the Redis list created by the Categ module.
It applies mail regexes to item content and warns if the count is above a threshold.
"""
import os
import re
import redis
import sys
import time
import datetime
import dns.resolver
import dns.exception
from pyfaup.faup import Faup
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from modules.abstract_module import AbstractModule
from lib.objects.Items import Item
from lib.ConfigLoader import ConfigLoader
from lib import Statistics
class Mail(AbstractModule):
"""
Module Mail module for AIL framework
"""
def __init__(self):
super(Mail, self).__init__()
config_loader = ConfigLoader()
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
self.dns_server = config_loader.get_config_str('Mail', 'dns')
self.faup = Faup()
# Numbers of Mails needed to Tags
self.mail_threshold = 10
self.regex_timeout = 30
self.email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
re.compile(self.email_regex)
def is_mxdomain_in_cache(self, mxdomain):
return self.r_cache.exists(f'mxdomain:{mxdomain}')
def save_mxdomain_in_cache(self, mxdomain):
self.r_cache.setex(f'mxdomain:{mxdomain}', 1, datetime.timedelta(days=1))
def check_mx_record(self, set_mxdomains):
"""Check if emails MX domains are responding.
:param adress_set: -- (set) This is a set of emails domains
:return: (int) Number of adress with a responding and valid MX domains
"""
resolver = dns.resolver.Resolver()
resolver.nameservers = [self.dns_server]
resolver.timeout = 5.0
resolver.lifetime = 2.0
valid_mxdomain = []
for mxdomain in set_mxdomains:
# check if is in cache
# # TODO:
if self.is_mxdomain_in_cache(mxdomain):
valid_mxdomain.append(mxdomain)
else:
# DNS resolution
try:
answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
if answers:
self.save_mxdomain_in_cache(mxdomain)
valid_mxdomain.append(mxdomain)
# DEBUG
# print('---')
# print(answers.response)
# print(answers.qname)
# print(answers.rdtype)
# print(answers.rdclass)
# print(answers.nameserver)
# print()
except dns.resolver.NoNameservers:
self.redis_logger.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
print('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
self.redis_logger.debug('NoAnswer, The response did not contain an answer to the question.')
print('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
self.redis_logger.debug('SyntaxError: EmptyLabel')
print('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
#save_mxdomain_in_cache(mxdomain)
self.redis_logger.debug('The query name does not exist.')
print('The query name does not exist.')
except dns.name.LabelTooLong:
self.redis_logger.debug('The Label is too long')
print('The Label is too long')
except dns.exception.Timeout:
print('dns timeout')
#save_mxdomain_in_cache(mxdomain)
except Exception as e:
print(e)
return valid_mxdomain
# # TODO: sanityze mails
def compute(self, message):
item_id, score = message.split()
item = Item(item_id)
item_date = item.get_date()
mails = self.regex_findall(self.email_regex, item_id, item.get_content())
mxdomains_email = {}
for mail in mails:
mxdomain = mail.rsplit('@', 1)[1].lower()
if not mxdomain in mxdomains_email:
mxdomains_email[mxdomain] = set()
mxdomains_email[mxdomain].add(mail)
## TODO: add MAIL trackers
valid_mx = self.check_mx_record(mxdomains_email.keys())
print(f'valid_mx: {valid_mx}')
mx_tlds = {}
num_valid_email = 0
for domain_mx in valid_mx:
nb_mails = len(mxdomains_email[domain_mx])
num_valid_email += nb_mails
# Create domain_mail stats
msg = f'mail;{nb_mails};{domain_mx};{item_date}'
self.send_message_to_queue(msg, 'ModuleStats')
# Create country stats
self.faup.decode(domain_mx)
tld = self.faup.get()['tld']
try:
tld = tld.decode()
except:
pass
mx_tlds[tld] = mx_tlds.get(tld, 0) + nb_mails
for tld in mx_tlds:
Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld])
msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}'
if num_valid_email > self.mail_threshold:
print(f'{item_id} Checked {num_valid_email} e-mail(s)')
self.redis_logger.warning(msg)
# Tags
msg = f'infoleak:automatic-detection="mail";{item_id}'
self.send_message_to_queue(msg, 'Tags')
else:
self.redis_logger.info(msg)
if __name__ == '__main__':
module = Mail()
#module.compute('tests/2021/01/01/mails.gz 50')
module.run()
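A hedged sketch of exercising the rewritten module outside its queue (run from the AIL bin/modules directory so Mail is importable) and reading back the per-TLD counters it writes; the item id, score and date are illustrative and assume the item exists on disk:

    from lib import Statistics

    module = Mail()
    # queue message format: '<item_id> <categ score>'
    module.compute('submitted/2022/09/08/mails.gz 50')

    # per-TLD mail counters for that day (YYYYMMDD)
    print(Statistics.get_module_tld_stats_by_date('mail', '20220908'))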

bin/modules/ModuleStats.py (new executable file, 54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module makes statistics for some modules and providers
"""
##################################
# Import External packages #
##################################
import os
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from modules.abstract_module import AbstractModule
from lib.objects.Items import Item
from lib import Statistics
class ModuleStats(AbstractModule):
"""
Module Statistics module for AIL framework
"""
def __init__(self):
super(ModuleStats, self).__init__()
# Waiting time in seconds between two processed messages
self.pending_seconds = 20
def compute(self, message):
# MODULE STATS
if len(message.split(';')) > 1:
module_name, num, keyword, date = message.split(';')
Statistics.update_module_stats(module_name, num, keyword, date)
# ITEM STATS
else:
item_id = message
item = Item(item_id)
source = item.get_source()
date = item.get_date()
size = item.get_size()
Statistics.update_item_stats_size_nb(item_id, source, size, date)
if __name__ == '__main__':
module = ModuleStats()
module.run()
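For reference, a hedged sketch of the two message shapes this module accepts; the keyword, counts, date and item id are illustrative, and the item branch assumes the item exists:

    module = ModuleStats()

    # keyword statistics, e.g. emitted by the Mail module:
    #   '<module>;<num>;<keyword>;<date>'
    module.compute('mail;3;example.com;20220908')

    # any other message is treated as an item id and feeds the
    # per-source size / number-of-items statistics
    module.compute('submitted/2022/09/08/example.gz')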


@@ -66,7 +66,7 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R
[CreditCards]
subscribe = Redis_CreditCards
publish = Redis_ModuleStats,Redis_Tags
publish = Redis_Tags
[BankAccount]
subscribe = Redis_Global