mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 08:38:28 +00:00
chg: [statistics] ARDB migration
This commit is contained in:
parent
d27d47dc70
commit
aa6ba61050
13 changed files with 482 additions and 420 deletions
|
@ -16,6 +16,13 @@ import re
|
|||
import string
|
||||
from itertools import chain
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages #
|
||||
##################################
|
||||
from lib import Statistics
|
||||
|
||||
|
||||
from packages import Item
|
||||
from pubsublogger import publisher
|
||||
|
||||
|
@ -48,6 +55,7 @@ def is_valid_iban(iban):
|
|||
return True
|
||||
return False
|
||||
|
||||
# # TODO: SET
|
||||
def check_all_iban(l_iban, obj_id):
|
||||
nb_valid_iban = 0
|
||||
for iban in l_iban:
|
||||
|
@ -61,7 +69,8 @@ def check_all_iban(l_iban, obj_id):
|
|||
if is_valid_iban(iban):
|
||||
print('------')
|
||||
nb_valid_iban = nb_valid_iban + 1
|
||||
server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)
|
||||
Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1)
|
||||
|
||||
|
||||
if(nb_valid_iban > 0):
|
||||
to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id))
|
||||
|
@ -70,9 +79,6 @@ def check_all_iban(l_iban, obj_id):
|
|||
msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
|
||||
p.populate_set_out(msg, 'Tags')
|
||||
|
||||
#Send to duplicate
|
||||
p.populate_set_out(obj_id, 'Duplicate')
|
||||
|
||||
if __name__ == "__main__":
|
||||
publisher.port = 6380
|
||||
publisher.channel = "Script"
|
||||
|
@ -82,13 +88,6 @@ if __name__ == "__main__":
|
|||
p = Process(config_section)
|
||||
max_execution_time = p.config.getint("BankAccount", "max_execution_time")
|
||||
|
||||
# ARDB #
|
||||
server_statistics = redis.StrictRedis(
|
||||
host=p.config.get("ARDB_Statistics", "host"),
|
||||
port=p.config.getint("ARDB_Statistics", "port"),
|
||||
db=p.config.getint("ARDB_Statistics", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
publisher.info("BankAccount started")
|
||||
|
||||
#iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
|
||||
|
|
|
@ -314,14 +314,6 @@ if __name__ == '__main__':
|
|||
|
||||
print('splash url: {}'.format(splash_url))
|
||||
|
||||
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
|
||||
|
||||
r_serv_metadata = redis.StrictRedis(
|
||||
host=p.config.get("ARDB_Metadata", "host"),
|
||||
port=p.config.getint("ARDB_Metadata", "port"),
|
||||
db=p.config.getint("ARDB_Metadata", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_cache = redis.StrictRedis(
|
||||
host=p.config.get("Redis_Cache", "host"),
|
||||
port=p.config.getint("Redis_Cache", "port"),
|
||||
|
|
|
@ -15,6 +15,7 @@ sys.path.append(os.environ['AIL_BIN'])
|
|||
# Import Project packages
|
||||
##################################
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
from lib import Statistics
|
||||
from lib import Tag
|
||||
from lib import Users
|
||||
from lib.objects import Decodeds
|
||||
|
@ -35,6 +36,8 @@ r_serv_tracker = config_loader.get_redis_conn("ARDB_Tracker")
|
|||
r_serv_tags = config_loader.get_redis_conn("ARDB_Tags")
|
||||
r_crawler = config_loader.get_redis_conn("ARDB_Onion")
|
||||
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
||||
r_serv_trend = config_loader.get_redis_conn("ARDB_Trending")
|
||||
r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
|
||||
config_loader = None
|
||||
# # - - CONFIGS - - # #
|
||||
|
||||
|
@ -358,9 +361,16 @@ def items_migration():
|
|||
def get_last_crawled_domains(domain_type):
|
||||
return r_crawler.lrange(f'last_{domain_type}', 0 ,-1)
|
||||
|
||||
def get_domains_blacklist(domain_type):
|
||||
return r_crawler.smembers(f'blacklist_{domain_type}')
|
||||
|
||||
def crawler_migration():
|
||||
print('CRAWLER MIGRATION...')
|
||||
|
||||
for domain_type in ['onion', 'regular']:
|
||||
for domain in get_domains_blacklist(domain_type):
|
||||
crawlers.add_domain_blacklist(domain_type, domain)
|
||||
|
||||
# for domain_type in ['onion', 'regular']:
|
||||
# for row in get_last_crawled_domains(domain_type):
|
||||
# dom_row, epoch = row.rsplit(';', 1)
|
||||
|
@ -368,19 +378,18 @@ def crawler_migration():
|
|||
# print(domain, port, epoch)
|
||||
# #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch)
|
||||
|
||||
for cookiejar_uuid in old_crawlers.get_all_cookiejar():
|
||||
meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
|
||||
#print(meta)
|
||||
#crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid)
|
||||
#crawlers._set_cookiejar_date(meta['date'])
|
||||
|
||||
for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
|
||||
print(cookie_uuid)
|
||||
#crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid)
|
||||
# for cookiejar_uuid in old_crawlers.get_all_cookiejar():
|
||||
# meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
|
||||
# #print(meta)
|
||||
# crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid)
|
||||
# crawlers._set_cookiejar_date(meta['date'])
|
||||
#
|
||||
# for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
|
||||
# print(cookie_uuid)
|
||||
# crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid)
|
||||
|
||||
# TODO: auto crawler -> to Fix / change
|
||||
|
||||
|
||||
# TODO: crawlers queues
|
||||
|
||||
###############################
|
||||
|
@ -689,12 +698,89 @@ def subtypes_obj_migration():
|
|||
#
|
||||
# Credential:
|
||||
# HSET 'credential_by_tld:'+date, tld, 1
|
||||
|
||||
def get_all_provider():
|
||||
return r_serv_trend.smembers('all_provider_set')
|
||||
|
||||
def get_item_source_stats_by_date(date, source):
|
||||
stats = {}
|
||||
stats['num'] = r_serv_trend.hget(f'{source}_num', date)
|
||||
stats['size'] = r_serv_trend.hget(f'{source}_size', date)
|
||||
stats['avg'] = r_serv_trend.hget(f'{source}_avg', date)
|
||||
return stats
|
||||
|
||||
def get_item_stats_size_avg_by_date(date):
|
||||
return r_serv_trend.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
|
||||
|
||||
def get_item_stats_nb_by_date(date):
|
||||
return r_serv_trend.zrange(f'providers_set_{date}', 0, -1, withscores=True)
|
||||
|
||||
def get_top_stats_module(module_name, date):
|
||||
return r_serv_trend.zrange(f'top_{module_name}_set_{date}', 0, -1, withscores=True)
|
||||
|
||||
def get_module_tld_stats_by_date(module, date):
|
||||
return r_statistics.hgetall(f'{module}_by_tld:{date}')
|
||||
|
||||
def statistics_migration():
|
||||
# paste_by_modules_timeout
|
||||
|
||||
# Date full history => lot of keys
|
||||
|
||||
|
||||
# top_size_set_{date}
|
||||
# top_avg_size_set_{date}
|
||||
|
||||
# 'providers_set_{date}
|
||||
|
||||
|
||||
|
||||
sources = get_all_provider()
|
||||
for date in Date.get_date_range_today('20180101'):
|
||||
|
||||
size_avg = get_item_stats_size_avg_by_date(date)
|
||||
|
||||
nb_items = get_item_stats_nb_by_date(date)
|
||||
|
||||
# top_size_set_{date}
|
||||
# top_avg_size_set_{date}
|
||||
|
||||
# 'providers_set_{date}
|
||||
|
||||
# ITEM STATS
|
||||
for source in sources:
|
||||
source_stat = get_item_source_stats_by_date(date, source)
|
||||
Statistics._create_item_stats_size_nb(date, source, source_stat['num'], source_stat['size'], source_stat['avg'])
|
||||
|
||||
|
||||
|
||||
# # MODULE STATS
|
||||
# for module in ['credential', 'mail', 'SQLInjection']:
|
||||
# stats = get_module_tld_stats_by_date(module, date)
|
||||
# for tld in stats:
|
||||
# if tld:
|
||||
# print(module, date, tld, stats[tld])
|
||||
# Statistics.add_module_tld_stats_by_date(module, date, tld, stats[tld])
|
||||
# for module in ['credential']:
|
||||
# # TOP STATS
|
||||
# top_module = get_top_stats_module(module, date)
|
||||
# for keyword, total_sum in top_module:
|
||||
# print(date, module, keyword, total_sum)
|
||||
# #Statistics._add_module_stats(module, total_sum, keyword, date)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
core_migration()
|
||||
#core_migration()
|
||||
# user_migration()
|
||||
# tags_migration()
|
||||
#items_migration()
|
||||
|
@ -706,6 +792,7 @@ if __name__ == '__main__':
|
|||
# ail_2_ail_migration()
|
||||
# trackers_migration()
|
||||
# investigations_migration()
|
||||
statistics_migration()
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -217,8 +217,12 @@ function launching_scripts {
|
|||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x"
|
||||
sleep 0.1
|
||||
# screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x"
|
||||
# sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x"
|
||||
sleep 0.1
|
||||
|
||||
|
@ -267,8 +271,6 @@ function launching_scripts {
|
|||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Mail.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./PgpDump.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Cryptocurrency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cryptocurrencies.py; read x"
|
||||
|
@ -277,8 +279,6 @@ function launching_scripts {
|
|||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./ModuleStats.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "MISPtheHIVEfeeder" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./MISP_The_Hive_feeder.py; read x"
|
||||
sleep 0.1
|
||||
screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x"
|
||||
|
|
220
bin/Mail.py
220
bin/Mail.py
|
@ -1,220 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
"""
|
||||
The Mails Module
|
||||
======================
|
||||
|
||||
This module is consuming the Redis-list created by the Categ module.
|
||||
|
||||
It apply mail regexes on item content and warn if above a threshold.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import uuid
|
||||
import redis
|
||||
import time
|
||||
import datetime
|
||||
|
||||
import dns.resolver
|
||||
import dns.exception
|
||||
|
||||
from multiprocessing import Process as Proc
|
||||
|
||||
from pubsublogger import publisher
|
||||
from Helper import Process
|
||||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
|
||||
import Item
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||
import ConfigLoader
|
||||
|
||||
## LOAD CONFIG ##
|
||||
config_loader = ConfigLoader.ConfigLoader()
|
||||
server_statistics = config_loader.get_redis_conn("ARDB_Statistics")
|
||||
r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||
|
||||
dns_server = config_loader.get_config_str('Mail', 'dns')
|
||||
|
||||
config_loader = None
|
||||
## -- ##
|
||||
|
||||
def is_mxdomain_in_cache(mxdomain):
|
||||
return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
|
||||
|
||||
def save_mxdomain_in_cache(mxdomain):
|
||||
r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1))
|
||||
|
||||
def check_mx_record(set_mxdomains, dns_server):
|
||||
"""Check if emails MX domains are responding.
|
||||
|
||||
:param adress_set: -- (set) This is a set of emails domains
|
||||
:return: (int) Number of adress with a responding and valid MX domains
|
||||
|
||||
"""
|
||||
resolver = dns.resolver.Resolver()
|
||||
resolver.nameservers = [dns_server]
|
||||
resolver.timeout = 5.0
|
||||
resolver.lifetime = 2.0
|
||||
|
||||
valid_mxdomain = []
|
||||
for mxdomain in set_mxdomains:
|
||||
|
||||
# check if is in cache
|
||||
# # TODO:
|
||||
if is_mxdomain_in_cache(mxdomain):
|
||||
valid_mxdomain.append(mxdomain)
|
||||
else:
|
||||
|
||||
# DNS resolution
|
||||
try:
|
||||
answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
|
||||
if answers:
|
||||
save_mxdomain_in_cache(mxdomain)
|
||||
valid_mxdomain.append(mxdomain)
|
||||
# DEBUG
|
||||
# print('---')
|
||||
# print(answers.response)
|
||||
# print(answers.qname)
|
||||
# print(answers.rdtype)
|
||||
# print(answers.rdclass)
|
||||
# print(answers.nameserver)
|
||||
# print()
|
||||
|
||||
except dns.resolver.NoNameservers:
|
||||
publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
|
||||
print('NoNameserver, No non-broken nameservers are available to answer the query.')
|
||||
except dns.resolver.NoAnswer:
|
||||
publisher.debug('NoAnswer, The response did not contain an answer to the question.')
|
||||
print('NoAnswer, The response did not contain an answer to the question.')
|
||||
except dns.name.EmptyLabel:
|
||||
publisher.debug('SyntaxError: EmptyLabel')
|
||||
print('SyntaxError: EmptyLabel')
|
||||
except dns.resolver.NXDOMAIN:
|
||||
#save_mxdomain_in_cache(mxdomain)
|
||||
publisher.debug('The query name does not exist.')
|
||||
print('The query name does not exist.')
|
||||
except dns.name.LabelTooLong:
|
||||
publisher.debug('The Label is too long')
|
||||
print('The Label is too long')
|
||||
except dns.exception.Timeout:
|
||||
print('dns timeout')
|
||||
#save_mxdomain_in_cache(mxdomain)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return valid_mxdomain
|
||||
|
||||
def extract_all_emails(redis_key, item_content):
|
||||
all_emails = re.findall(email_regex, item_content)
|
||||
if len(all_emails) > 1:
|
||||
r_serv_cache.sadd(redis_key, *all_emails)
|
||||
r_serv_cache.expire(redis_key, 360)
|
||||
elif all_emails:
|
||||
r_serv_cache.sadd(redis_key, all_emails[0])
|
||||
r_serv_cache.expire(redis_key, 360)
|
||||
|
||||
if __name__ == "__main__":
|
||||
publisher.port = 6380
|
||||
publisher.channel = "Script"
|
||||
|
||||
config_section = 'Mail'
|
||||
|
||||
faup = Faup()
|
||||
|
||||
p = Process(config_section)
|
||||
|
||||
publisher.info("Mails module started")
|
||||
|
||||
# Numbers of Mails needed to Tags
|
||||
mail_threshold = 10
|
||||
|
||||
max_execution_time = 30
|
||||
|
||||
email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
|
||||
|
||||
redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4()))
|
||||
|
||||
while True:
|
||||
message = p.get_from_set()
|
||||
|
||||
if message is not None:
|
||||
item_id, score = message.split()
|
||||
|
||||
print(item_id)
|
||||
|
||||
item_content = Item.get_item_content(item_id)
|
||||
|
||||
proc = Proc(target=extract_all_emails, args=(redis_key, item_content, ))
|
||||
try:
|
||||
proc.start()
|
||||
proc.join(max_execution_time)
|
||||
if proc.is_alive():
|
||||
proc.terminate()
|
||||
p.incr_module_timeout_statistic()
|
||||
err_mess = "Mails: processing timeout: {}".format(item_id)
|
||||
print(err_mess)
|
||||
publisher.info(err_mess)
|
||||
continue
|
||||
else:
|
||||
all_emails = r_serv_cache.smembers(redis_key)
|
||||
r_serv_cache.delete(redis_key)
|
||||
proc.terminate()
|
||||
except KeyboardInterrupt:
|
||||
print("Caught KeyboardInterrupt, terminating workers")
|
||||
proc.terminate()
|
||||
sys.exit(0)
|
||||
|
||||
# get MXdomains
|
||||
set_mxdomains = set()
|
||||
dict_mxdomains_email = {}
|
||||
for email in all_emails:
|
||||
mxdomain = email.split('@')[1].lower()
|
||||
if not mxdomain in dict_mxdomains_email:
|
||||
dict_mxdomains_email[mxdomain] = []
|
||||
set_mxdomains.add(mxdomain)
|
||||
dict_mxdomains_email[mxdomain].append(email)
|
||||
|
||||
## TODO: add MAIL trackers
|
||||
|
||||
valid_mx = check_mx_record(set_mxdomains, dns_server)
|
||||
|
||||
item_date = Item.get_item_date(item_id)
|
||||
|
||||
num_valid_email = 0
|
||||
for domain_mx in valid_mx:
|
||||
num_valid_email += len(dict_mxdomains_email[domain_mx])
|
||||
|
||||
for email in dict_mxdomains_email[domain_mx]:
|
||||
msg = 'mail;{};{};{}'.format(1, email, item_date)
|
||||
p.populate_set_out(msg, 'ModuleStats')
|
||||
|
||||
# Create country stats
|
||||
faup.decode(email)
|
||||
tld = faup.get()['tld']
|
||||
try:
|
||||
tld = tld.decode()
|
||||
except:
|
||||
pass
|
||||
server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1)
|
||||
|
||||
msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date, Item.get_item_basename(item_id), num_valid_email, item_id)
|
||||
|
||||
if num_valid_email > mail_threshold:
|
||||
print('{} Checked {} e-mail(s)'.format(item_id, num_valid_email))
|
||||
publisher.warning(msg)
|
||||
#Send to duplicate
|
||||
p.populate_set_out(item_id, 'Duplicate')
|
||||
#tags
|
||||
msg = 'infoleak:automatic-detection="mail";{}'.format(item_id)
|
||||
p.populate_set_out(msg, 'Tags')
|
||||
else:
|
||||
publisher.info(msg)
|
||||
|
||||
else:
|
||||
time.sleep(10)
|
|
@ -1,156 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
This module makes statistics for some modules and providers
|
||||
|
||||
"""
|
||||
|
||||
##################################
|
||||
# Import External packages #
|
||||
##################################
|
||||
import time
|
||||
import datetime
|
||||
import redis
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages #
|
||||
##################################
|
||||
from modules.abstract_module import AbstractModule
|
||||
from packages.Date import Date
|
||||
from packages import Paste
|
||||
import ConfigLoader
|
||||
|
||||
|
||||
class ModuleStats(AbstractModule):
|
||||
"""
|
||||
Module Statistics module for AIL framework
|
||||
"""
|
||||
|
||||
# Config Var
|
||||
MAX_SET_CARDINALITY = 8
|
||||
|
||||
|
||||
def __init__(self):
|
||||
|
||||
super(ModuleStats, self).__init__()
|
||||
|
||||
# Waiting time in secondes between to message proccessed
|
||||
self.pending_seconds = 20
|
||||
|
||||
# Sent to the logging a description of the module
|
||||
self.redis_logger.info("Makes statistics about valid URL")
|
||||
|
||||
# REDIS #
|
||||
self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending")
|
||||
|
||||
def compute(self, message):
|
||||
|
||||
if len(message.split(';')) > 1:
|
||||
self.compute_most_posted(message)
|
||||
else:
|
||||
self.compute_provider_info(message)
|
||||
|
||||
|
||||
def get_date_range(self, num_day):
|
||||
curr_date = datetime.date.today()
|
||||
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
|
||||
date_list = []
|
||||
|
||||
for i in range(0, num_day+1):
|
||||
date_list.append(date.substract_day(i))
|
||||
return date_list
|
||||
|
||||
|
||||
def compute_most_posted(self, message):
|
||||
module, num, keyword, paste_date = message.split(';')
|
||||
|
||||
redis_progression_name_set = 'top_'+ module +'_set_' + paste_date
|
||||
|
||||
# Add/Update in Redis
|
||||
self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num))
|
||||
|
||||
# Compute Most Posted
|
||||
date = self.get_date_range(0)[0]
|
||||
# check if this keyword is eligible for progression
|
||||
keyword_total_sum = 0
|
||||
|
||||
curr_value = self.r_serv_trend.hget(date, module+'-'+keyword)
|
||||
keyword_total_sum += int(curr_value) if curr_value is not None else 0
|
||||
|
||||
if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY:
|
||||
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
|
||||
|
||||
else: # not in set
|
||||
member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
|
||||
# Member set is a list of (value, score) pairs
|
||||
if int(member_set[0][1]) < keyword_total_sum:
|
||||
#remove min from set and add the new one
|
||||
self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
|
||||
self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0])
|
||||
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
|
||||
self.redis_logger.debug(redis_progression_name_set)
|
||||
|
||||
|
||||
def compute_provider_info(self, message):
|
||||
redis_all_provider = 'all_provider_set'
|
||||
|
||||
paste = Paste.Paste(message)
|
||||
|
||||
paste_baseName = paste.p_name.split('.')[0]
|
||||
paste_size = paste._get_p_size()
|
||||
paste_provider = paste.p_source
|
||||
paste_date = str(paste._get_p_date())
|
||||
redis_sum_size_set = 'top_size_set_' + paste_date
|
||||
redis_avg_size_name_set = 'top_avg_size_set_' + paste_date
|
||||
redis_providers_name_set = 'providers_set_' + paste_date
|
||||
|
||||
# Add/Update in Redis
|
||||
self.r_serv_trend.sadd(redis_all_provider, paste_provider)
|
||||
|
||||
num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1))
|
||||
sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size))
|
||||
new_avg = float(sum_size) / float(num_paste)
|
||||
self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg)
|
||||
|
||||
|
||||
#
|
||||
# Compute Most Posted
|
||||
#
|
||||
|
||||
# Size
|
||||
if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil":
|
||||
self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider)
|
||||
self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
|
||||
else: #set full capacity
|
||||
member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1)
|
||||
# Member set is a list of (value, score) pairs
|
||||
if float(member_set[0][1]) < new_avg:
|
||||
#remove min from set and add the new one
|
||||
self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
|
||||
self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0])
|
||||
self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider)
|
||||
self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0])
|
||||
self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
|
||||
|
||||
|
||||
# Num
|
||||
# if set not full or provider already present
|
||||
if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil":
|
||||
self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
|
||||
else: #set at full capacity
|
||||
member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
|
||||
# Member set is a list of (value, score) pairs
|
||||
if int(member_set[0][1]) < num_paste:
|
||||
#remove min from set and add the new one
|
||||
self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
|
||||
self.r_serv_trend.zrem(member_set[0][0])
|
||||
self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
module = ModuleStats()
|
||||
module.run()
|
|
@ -10,9 +10,133 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
|||
import ConfigLoader
|
||||
|
||||
config_loader = ConfigLoader.ConfigLoader()
|
||||
r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics")
|
||||
r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
|
||||
#r_serv_trend = ConfigLoader().get_redis_conn("ARDB_Trending")
|
||||
config_loader = None
|
||||
|
||||
PIE_CHART_MAX_CARDINALITY = 8
|
||||
|
||||
def incr_module_timeout_statistic(module_name):
|
||||
curr_date = datetime.date.today()
|
||||
r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
|
||||
r_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
|
||||
|
||||
def create_item_statistics(item_id, source, size):
|
||||
pass
|
||||
|
||||
def get_item_sources():
|
||||
return r_statistics.smembers('all_provider_set')
|
||||
|
||||
def get_nb_items_processed_by_day_and_source(date, source):
|
||||
nb_items = r_statistics.hget(f'{source}_num', date)
|
||||
if not nb_items:
|
||||
nb_items = 0
|
||||
return int(nb_items)
|
||||
|
||||
def get_items_total_size_by_day_and_source(date, source):
|
||||
total_size = r_statistics.hget(f'{source}_size', date)
|
||||
if not total_size:
|
||||
total_size = 0
|
||||
return float(total_size)
|
||||
|
||||
def get_items_av_size_by_day_and_source(date, source):
|
||||
av_size = r_statistics.hget(f'{source}_avg', date)
|
||||
if not av_size:
|
||||
av_size = 0
|
||||
return float(av_size)
|
||||
|
||||
def _create_item_stats_size_nb(date, source, num, size, avg):
|
||||
r_statistics.hset(f'{source}_num', date, num)
|
||||
r_statistics.hset(f'{source}_size', date, size)
|
||||
r_statistics.hset(f'{source}_avg', date, avg)
|
||||
|
||||
def get_item_stats_size_avg_by_date():
|
||||
return r_statistics.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
|
||||
|
||||
def get_item_stats_nb_by_date():
|
||||
return r_statistics.zrange(f'providers_set_{date}', 0, -1, withscores=True)
|
||||
|
||||
def _set_item_stats_nb_by_date(date, source):
|
||||
return r_statistics.zrange(f'providers_set_{date}', )
|
||||
|
||||
|
||||
# # TODO: load ZSET IN CACHE => FAST UPDATE
|
||||
def update_item_stats_size_nb(item_id, source, size, date):
|
||||
# Add/Update in Redis
|
||||
r_statistics.sadd('all_provider_set', source)
|
||||
|
||||
nb_items = int(r_statistics.hincrby(f'{source}_num', date, 1))
|
||||
sum_size = float(r_statistics.hincrbyfloat(f'{source}_size', date, size))
|
||||
new_avg = sum_size / nb_items
|
||||
r_statistics.hset(f'{source}_avg', date, new_avg)
|
||||
|
||||
# TOP Items Size
|
||||
if r_statistics.zcard(f'top_size_set_{date}') < PIE_CHART_MAX_CARDINALITY:
|
||||
r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
|
||||
else:
|
||||
member_set = r_statistics.zrangebyscore(f'top_avg_size_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
|
||||
# Member set is a list of (value, score) pairs
|
||||
if float(member_set[0][1]) < new_avg:
|
||||
# remove min from set and add the new one
|
||||
r_statistics.zrem(f'top_avg_size_set_{date}', member_set[0][0])
|
||||
r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
|
||||
|
||||
# TOP Nb Items
|
||||
if r_statistics.zcard(f'providers_set_{date}') < PIE_CHART_MAX_CARDINALITY or r_statistics.zscore(f'providers_set_{date}', source) != None:
|
||||
r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
|
||||
else: # zset at full capacity
|
||||
member_set = r_statistics.zrangebyscore(f'providers_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
|
||||
# Member set is a list of (value, score) pairs
|
||||
if int(member_set[0][1]) < nb_items:
|
||||
# remove min from set and add the new one
|
||||
r_statistics.zrem(member_set[0][0])
|
||||
r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
|
||||
|
||||
# keyword num
|
||||
|
||||
def _add_module_stats(module_name, total_sum, keyword, date):
|
||||
r_statistics.zadd(f'top_{module_name}_set_{date}', float(total_sum), keyword)
|
||||
|
||||
# # TODO: ONE HSET BY MODULE / CUSTOM STATS
|
||||
def update_module_stats(module_name, num, keyword, date):
|
||||
|
||||
# Add/Update in Redis
|
||||
r_statistics.hincrby(date, f'{module_name}-{keyword}', int(num)) # # TODO: RENAME ME !!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
# Compute Most Posted
|
||||
# check if this keyword is eligible for progression
|
||||
keyword_total_sum = 0
|
||||
|
||||
curr_value = r_statistics.hget(date, module+'-'+keyword)
|
||||
keyword_total_sum += int(curr_value) if curr_value is not None else 0
|
||||
|
||||
if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY:
|
||||
r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
|
||||
else: # zset at full capacity
|
||||
member_set = r_statistics.zrangebyscore(f'top_{module_name}_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
|
||||
# Member set is a list of (value, score) pairs
|
||||
if int(member_set[0][1]) < keyword_total_sum:
|
||||
#remove min from set and add the new one
|
||||
r_statistics.zrem(f'top_{module_name}_set_{date}', member_set[0][0])
|
||||
r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
|
||||
|
||||
def get_module_tld_stats_by_tld_date(date, tld):
|
||||
nb_tld = r_statistics.hget(f'credential_by_tld:{date}', tld)
|
||||
if not nb_tld:
|
||||
nb_tld = 0
|
||||
return int(nb_tld)
|
||||
|
||||
def get_module_tld_stats_by_date(module, date):
|
||||
return r_statistics.hgetall(f'{module}_by_tld:{date}')
|
||||
|
||||
def add_module_tld_stats_by_date(module, date, tld, nb):
|
||||
r_statistics.hincrby(f'{module}_by_tld:{date}', tld, int(nb))
|
||||
|
||||
|
||||
def get_iban_country_stats_by_date(date):
|
||||
return r_statistics.hgetall(f'iban_by_country:{date}')
|
||||
|
||||
def add_iban_country_stats_by_date(date, tld, nb):
|
||||
r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))
|
||||
|
||||
# r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
|
||||
# r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
|
||||
|
|
|
@ -616,6 +616,12 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
|
|||
else:
|
||||
return ({'error':'invalid input'}, 400)
|
||||
|
||||
def get_domains_blacklist(domain_type):
|
||||
return r_serv_onion.smembers(f'blacklist_{domain_type}')
|
||||
|
||||
def add_domain_blacklist(domain_type, domain):
|
||||
r_serv_onion.sadd(f'blacklist_{domain_type}', domain)
|
||||
|
||||
##-- CRAWLER GLOBAL --##
|
||||
|
||||
#### AUTOMATIC CRAWLER ####
|
||||
|
|
|
@ -42,6 +42,7 @@ from modules.abstract_module import AbstractModule
|
|||
from packages.Item import Item
|
||||
from lib import ConfigLoader
|
||||
from lib import regex_helper
|
||||
from lib import Statistics
|
||||
|
||||
|
||||
class Credential(AbstractModule):
|
||||
|
@ -96,6 +97,7 @@ class Credential(AbstractModule):
|
|||
|
||||
item_content = item.get_content()
|
||||
|
||||
# TODO: USE SETS
|
||||
# Extract all credentials
|
||||
all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time)
|
||||
|
||||
|
@ -117,9 +119,6 @@ class Credential(AbstractModule):
|
|||
print(f"========> Found more than 10 credentials in this file : {item.get_id()}")
|
||||
self.redis_logger.warning(to_print)
|
||||
|
||||
# Send to duplicate
|
||||
self.send_message_to_queue(item.get_id(), 'Duplicate')
|
||||
|
||||
msg = f'infoleak:automatic-detection="credential";{item.get_id()}'
|
||||
self.send_message_to_queue(msg, 'Tags')
|
||||
|
||||
|
@ -158,6 +157,7 @@ class Credential(AbstractModule):
|
|||
print(f"=======> Probably on : {discovered_sites}")
|
||||
|
||||
date = datetime.now().strftime("%Y%m")
|
||||
nb_tlds = {}
|
||||
for cred in all_credentials:
|
||||
maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
|
||||
self.faup.decode(maildomains)
|
||||
|
@ -167,7 +167,9 @@ class Credential(AbstractModule):
|
|||
tld = tld.decode()
|
||||
except:
|
||||
pass
|
||||
self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
|
||||
nb_tlds[tld] = nb_tlds.get(tld, 0) + 1
|
||||
for tld in nb_tlds:
|
||||
Statistics.add_module_tld_stats_by_date('credential', date, tld, nb_tlds[tld])
|
||||
else:
|
||||
self.redis_logger.info(to_print)
|
||||
print(f'found {nb_cred} credentials')
|
||||
|
|
|
@ -75,9 +75,6 @@ class CreditCards(AbstractModule):
|
|||
if (len(creditcard_set) > 0):
|
||||
self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}')
|
||||
|
||||
#Send to duplicate
|
||||
self.send_message_to_queue(item.get_id(), 'Duplicate')
|
||||
|
||||
msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}'
|
||||
self.send_message_to_queue(msg, 'Tags')
|
||||
|
||||
|
|
177
bin/modules/Mail.py
Executable file
177
bin/modules/Mail.py
Executable file
|
@ -0,0 +1,177 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
"""
|
||||
The Mails Module
|
||||
======================
|
||||
|
||||
This module is consuming the Redis-list created by the Categ module.
|
||||
|
||||
It apply mail regexes on item content and warn if above a threshold.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import redis
|
||||
import sys
|
||||
import time
|
||||
import datetime
|
||||
|
||||
import dns.resolver
|
||||
import dns.exception
|
||||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages #
|
||||
##################################
|
||||
from modules.abstract_module import AbstractModule
|
||||
from lib.objects.Items import Item
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
from lib import Statistics
|
||||
|
||||
|
||||
class Mail(AbstractModule):
    """
    Mail module for the AIL framework.

    Extracts e-mail addresses from item content, validates their domains by
    resolving MX records (with a one-day Redis cache), feeds per-domain and
    per-TLD statistics, and tags items that contain more than
    ``mail_threshold`` valid addresses.
    """

    def __init__(self):
        super(Mail, self).__init__()

        config_loader = ConfigLoader()
        # Short-lived cache remembering recently validated MX domains.
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        # DNS server used for the MX lookups.
        self.dns_server = config_loader.get_config_str('Mail', 'dns')

        self.faup = Faup()

        # Numbers of Mails needed to Tags
        self.mail_threshold = 10

        self.regex_timeout = 30
        # Raw string: identical pattern value, avoids the '\.' invalid-escape warning.
        self.email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
        re.compile(self.email_regex)

    def is_mxdomain_in_cache(self, mxdomain):
        """Return True if ``mxdomain`` was validated within the last day."""
        return self.r_cache.exists(f'mxdomain:{mxdomain}')

    def save_mxdomain_in_cache(self, mxdomain):
        """Cache ``mxdomain`` as valid for one day.

        Fix: redis-py ``setex`` takes (name, time, value); the previous call
        passed (name, 1, timedelta(days=1)), so entries expired after one
        second with the timedelta stored as the value.
        """
        self.r_cache.setex(f'mxdomain:{mxdomain}', datetime.timedelta(days=1), 1)

    def check_mx_record(self, set_mxdomains):
        """Check if emails MX domains are responding.

        :param set_mxdomains: -- (iterable) e-mail domains to check
        :return: (list) domains with a responding and valid MX record
        """
        resolver = dns.resolver.Resolver()
        resolver.nameservers = [self.dns_server]
        resolver.timeout = 5.0
        # NOTE(review): lifetime (2s) < timeout (5s) means the whole query is
        # aborted before a single try could finish waiting — confirm intended.
        resolver.lifetime = 2.0

        valid_mxdomain = []
        for mxdomain in set_mxdomains:
            # Cache hit: skip the DNS query entirely.
            if self.is_mxdomain_in_cache(mxdomain):
                valid_mxdomain.append(mxdomain)
            else:
                # DNS resolution; every failure mode is logged and the domain
                # simply left out of the result (best-effort by design).
                try:
                    answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
                    if answers:
                        self.save_mxdomain_in_cache(mxdomain)
                        valid_mxdomain.append(mxdomain)
                except dns.resolver.NoNameservers:
                    self.redis_logger.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
                    print('NoNameserver, No non-broken nameservers are available to answer the query.')
                except dns.resolver.NoAnswer:
                    self.redis_logger.debug('NoAnswer, The response did not contain an answer to the question.')
                    print('NoAnswer, The response did not contain an answer to the question.')
                except dns.name.EmptyLabel:
                    self.redis_logger.debug('SyntaxError: EmptyLabel')
                    print('SyntaxError: EmptyLabel')
                except dns.resolver.NXDOMAIN:
                    #save_mxdomain_in_cache(mxdomain)
                    self.redis_logger.debug('The query name does not exist.')
                    print('The query name does not exist.')
                except dns.name.LabelTooLong:
                    self.redis_logger.debug('The Label is too long')
                    print('The Label is too long')
                except dns.exception.Timeout:
                    print('dns timeout')
                    #save_mxdomain_in_cache(mxdomain)
                except Exception as e:
                    print(e)
        return valid_mxdomain

    # # TODO: sanityze mails
    def compute(self, message):
        """Extract mails from an item, validate MX domains, update stats and tags.

        ``message`` format: '<item_id> <score>'.
        """
        item_id, score = message.split()
        item = Item(item_id)
        item_date = item.get_date()

        # Group extracted addresses by their (lower-cased) domain.
        mails = self.regex_findall(self.email_regex, item_id, item.get_content())
        mxdomains_email = {}
        for mail in mails:
            mxdomain = mail.rsplit('@', 1)[1].lower()
            if not mxdomain in mxdomains_email:
                mxdomains_email[mxdomain] = set()
            mxdomains_email[mxdomain].add(mail)

        ## TODO: add MAIL trackers

        valid_mx = self.check_mx_record(mxdomains_email.keys())
        print(f'valid_mx: {valid_mx}')
        mx_tlds = {}
        num_valid_email = 0
        for domain_mx in valid_mx:
            nb_mails = len(mxdomains_email[domain_mx])
            num_valid_email += nb_mails

            # Create domain_mail stats
            msg = f'mail;{nb_mails};{domain_mx};{item_date}'
            self.send_message_to_queue(msg, 'ModuleStats')

            # Create country stats
            self.faup.decode(domain_mx)
            tld = self.faup.get()['tld']
            try:
                tld = tld.decode()
            except:
                pass
            mx_tlds[tld] = mx_tlds.get(tld, 0) + nb_mails
        for tld in mx_tlds:
            Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld])

        # Fix: build the summary once so the 'else' branch no longer logs an
        # undefined or stale 'msg' (previously it reused the last ModuleStats
        # queue message, or raised NameError when no valid mail was found).
        summary = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}'
        if num_valid_email > self.mail_threshold:
            print(f'{item_id} Checked {num_valid_email} e-mail(s)')
            self.redis_logger.warning(summary)
            # Tags
            msg = f'infoleak:automatic-detection="mail";{item_id}'
            self.send_message_to_queue(msg, 'Tags')
        else:
            self.redis_logger.info(summary)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Entry point: instantiate the module and start its consume loop.
    mail_module = Mail()
    # mail_module.compute('tests/2021/01/01/mails.gz 50')  # manual debug call
    mail_module.run()
|
54
bin/modules/ModuleStats.py
Executable file
54
bin/modules/ModuleStats.py
Executable file
|
@ -0,0 +1,54 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
This module makes statistics for some modules and providers
|
||||
|
||||
"""
|
||||
|
||||
##################################
|
||||
# Import External packages #
|
||||
##################################
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages #
|
||||
##################################
|
||||
from modules.abstract_module import AbstractModule
|
||||
from lib.objects.Items import Item
|
||||
from lib import Statistics
|
||||
|
||||
|
||||
class ModuleStats(AbstractModule):
    """
    Module Statistics module for AIL framework

    Consumes two message shapes:
      - 'module_name;num;keyword;date'  -> per-module keyword stats
      - '<item_id>'                     -> per-item size/provider stats
    """

    def __init__(self):

        super(ModuleStats, self).__init__()

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 20

    def compute(self, message):

        # MODULE STATS: 'module_name;num;keyword;date'
        if len(message.split(';')) > 1:
            module_name, num, keyword, date = message.split(';')
            # Fix: was 'Statisticsupdate_module_stats' (missing dot -> NameError).
            Statistics.update_module_stats(module_name, num, keyword, date)
        # ITEM STATS: the message is the item id itself
        else:
            # Fix: 'item_id' was used without being defined in this branch.
            item_id = message
            item = Item(item_id)
            source = item.get_source()
            date = item.get_date()
            size = item.get_size()
            Statistics.update_item_stats_size_nb(item_id, source, size, date)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Entry point: instantiate the module and start its consume loop.
    stats_module = ModuleStats()
    stats_module.run()
|
|
@ -66,7 +66,7 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R
|
|||
|
||||
[CreditCards]
|
||||
subscribe = Redis_CreditCards
|
||||
publish = Redis_ModuleStats,Redis_Tags
|
||||
publish = Redis_Tags
|
||||
|
||||
[BankAccount]
|
||||
subscribe = Redis_Global
|
||||
|
|
Loading…
Reference in a new issue