Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-27 00:07:16 +00:00)

commit 1008c7c4fe (parent d9bdfecef3)
chg: [Term Tracker] refactor term tracker word/set/regex modules + remove old modules

19 changed files with 304 additions and 981 deletions
OVERVIEW.md (37 changed lines)

@@ -138,12 +138,12 @@ Redis and ARDB overview

 | Set - Key | Value |
 | ------ | ------ |
-| all:tracked_term_uuid:**tracked term** | **uuid - tracked term uuid** |
+| all:tracked_term_uuid:**term type**:**tracked term** | **uuid - tracked term uuid** |

 ##### All Term Tracked items:
 | Set - Key | Value |
 | ------ | ------ |
-| tracked_term:item:**uuid** | **item_id** |
+| tracked_term:item:**uuid**:**date** | **item_id** |

 ##### All Term Tracked tags:

 | Set - Key | Value |

@@ -155,6 +155,29 @@ Redis and ARDB overview
 | ------ | ------ |
 | tracked_term:mail:**uuid** | **mail** |

+##### Refresh Tracked term:
+| Key | Value |
+| ------ | ------ |
+| tracked_term:refresh:word | **last refreshed epoch** |
+| tracked_term:refresh:set | - |
+| tracked_term:refresh:regex | - |
+
+##### Zset Stat Tracked term:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| tracked_term:stat:**uuid** | **date** | **nb_seen** |
+
+##### Stat token:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| stat_token_total_by_day:**date** | **word** | **nb_seen** |
+| stat_token_per_item_by_day:**date** | **word** | **nb_seen** |
+
+| Set - Key | Value |
+| ------ | ------ |
+| stat_token_history | **date** |
+
 ## DB2 - TermFreq:

 ##### Set:

@@ -167,16 +190,6 @@ Redis and ARDB overview
 | TrackedRegexSet | **tracked_regex** |
-| global:TrackedSetTermSet | **tracked_term** |
-| global:TrackedSetSet | **tracked_set** |
-| global:TrackedRegexSet | **tracked_regex** |
-| user:**user_id**:TrackedSetTermSet | **tracked_term** |
-| user:**user_id**:TrackedSetSet | **tracked_set** |
-| user:**user_id**:TrackedRegexSet | **tracked_regex** |
 | tracked_**tracked_term** | **item_path** |
 | set_**tracked_set** | **item_path** |
 | regex_**tracked_regex** | **item_path** |
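The schema change above moves the term type into the uuid-map key and buckets tracked items per day. A minimal consumer sketch of the new layout (key names come from the tables above; the connection settings and helper names are placeholders, not AIL API):

import redis

r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder connection

def get_tracker_uuids(term, term_type):
    # new layout: all:tracked_term_uuid:<term type>:<tracked term> -> {uuid, ...}
    return r.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))

def get_tracked_items(term_uuid, date):
    # items are now bucketed per day: tracked_term:item:<uuid>:<date> -> {item_id, ...}
    return r.smembers('tracked_term:item:{}:{}'.format(term_uuid, date))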
bin/Curve.py (184 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.

This modules update a .csv file used to draw curves representing selected
words and their occurency per day.

..note:: The channel will have the name of the file created.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.


This Module is also used for term frequency.

/!\ Top set management is done in the module Curve_manage_top_set


Requirements
------------

*Need running Redis instances. (Redis)
*Categories files of words in /files/ need to be created
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar

from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def check_if_tracked_term(term, path):
    if term in server_term.smembers(TrackedTermsSet_Name):
        #add_paste to tracked_word_set
        set_name = "tracked_" + term
        server_term.sadd(set_name, path)
        print(term, 'addded', set_name, '->', path)
        p.populate_set_out("New Term added", 'CurveManageTopSets')

        # Send a notification only when the member is in the set
        if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):

            # create mail body
            mail_body = ("AIL Framework,\n"
                         "New occurrence for term: " + term + "\n"
                         ''+full_paste_url + path)

            # Send to every associated email adress
            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
                sendEmailNotification(email, 'Term', mail_body)

            # tag paste
            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + term):
                msg = '{};{}'.format(tag, path)
                p.populate_set_out(msg, 'Tags')


def getValueOverRange(word, startDate, num_day):
    to_return = 0
    for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
        value = server_term.hget(timestamp, word)
        to_return += int(value) if value is not None else 0
    return to_return


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Curve'
    p = Process(config_section)

    # REDIS #
    r_serv1 = redis.StrictRedis(
        host=p.config.get("ARDB_Curve", "host"),
        port=p.config.get("ARDB_Curve", "port"),
        db=p.config.get("ARDB_Curve", "db"),
        decode_responses=True)

    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script Curve started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # FILE CURVE SECTION #
    csv_path = os.path.join(os.environ['AIL_HOME'],
                            p.config.get("Directories", "wordtrending_csv"))
    wordfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "wordsfile"))

    message = p.get_from_set()
    prec_filename = None
    generate_new_graph = False

    # Term Frequency
    top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
    top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
    top_termFreq_setName_month = ["TopTermFreq_set_month", 31]

    while True:
        if message is not None:
            generate_new_graph = True

            filename, word, score = message.split()
            temp = filename.split('/')
            date = temp[-4] + temp[-3] + temp[-2]
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            low_word = word.lower()
            #Old curve with words in file
            r_serv1.hincrby(low_word, date, int(score))

            # Update redis
            #consider the num of occurence of this term
            curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score)))
            #1 term per paste
            curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1)))

            # Add in set only if term is not in the blacklist
            if low_word not in server_term.smembers(BlackListTermsSet_Name):
                #consider the num of occurence of this term
                server_term.zincrby(curr_set, low_word, float(score))
                #1 term per paste
                server_term.zincrby("per_paste_" + curr_set, low_word, float(1))

                #Add more info for tracked terms
                check_if_tracked_term(low_word, filename)

                #send to RegexForTermsFrequency
                to_send = "{} {} {}".format(filename, timestamp, word)
                p.populate_set_out(to_send, 'RegexForTermsFrequency')

        else:
            if generate_new_graph:
                generate_new_graph = False
                print('Building graph')
                today = datetime.date.today()
                year = today.year
                month = today.month

                lib_words.create_curve_with_word_file(r_serv1, csv_path,
                                                      wordfile_path, year,
                                                      month)

            publisher.debug("Script Curve is Idling")
            print("sleeping")
            time.sleep(10)
        message = p.get_from_set()
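For reference, the counting pattern this deleted module implemented: one hash per day (keyed by the day's epoch) holding raw occurrence counts, plus a per-day sorted set for ranking. A minimal sketch, assuming redis-py 3.x argument order (the removed code used the pre-3.0 zincrby order) and a placeholder connection:

import calendar
import redis

r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder

def count_term(term, year, month, day, score=1):
    day_epoch = calendar.timegm((year, month, day, 0, 0, 0))
    # raw occurrences: hash <day epoch> { term: total }
    r.hincrby(day_epoch, term, score)
    # ranking: sorted set TopTermFreq_set_day_<day epoch> { term: score }
    r.zincrby('TopTermFreq_set_day_{}'.format(day_epoch), score, term)

count_term('bitcoin', 2019, 8, 2, score=3)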
bin/CurveManageTopSets.py (166 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""

This module manage top sets for terms frequency.
Every 'refresh_rate' update the weekly and monthly set

"""

import redis
import time
import datetime
import copy
from pubsublogger import publisher
from packages import lib_words
import datetime
import calendar
import os
import configparser

# Config Variables
Refresh_rate = 60*5 #sec
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
num_day_month = 31
num_day_week = 7

top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]


def manage_top_set():
    startDate = datetime.datetime.now()
    startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0)
    startDate = calendar.timegm(startDate.timetuple())
    blacklist_size = int(server_term.scard(BlackListTermsSet_Name))

    dico = {}
    dico_per_paste = {}

    # Retreive top data (max_card + blacklist_size) from days sets
    for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay):
        curr_set = top_termFreq_setName_day[0] + str(timestamp)
        array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)
        array_top_day_per_paste = server_term.zrevrangebyscore("per_paste_" + curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)

        for word, value in array_top_day:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico.keys():
                    dico[word] += value
                else:
                    dico[word] = value

        for word, value in array_top_day_per_paste:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico_per_paste.keys():
                    dico_per_paste[word] += value
                else:
                    dico_per_paste[word] = value

        if timestamp == startDate - num_day_week*oneDay:
            dico_week = copy.deepcopy(dico)
            dico_week_per_paste = copy.deepcopy(dico_per_paste)

    # convert dico into sorted array
    array_month = []
    for w, v in dico.items():
        array_month.append((w, v))
    array_month.sort(key=lambda tup: -tup[1])
    array_month = array_month[0:20]

    array_week = []
    for w, v in dico_week.items():
        array_week.append((w, v))
    array_week.sort(key=lambda tup: -tup[1])
    array_week = array_week[0:20]

    # convert dico_per_paste into sorted array
    array_month_per_paste = []
    for w, v in dico_per_paste.items():
        array_month_per_paste.append((w, v))
    array_month_per_paste.sort(key=lambda tup: -tup[1])
    array_month_per_paste = array_month_per_paste[0:20]

    array_week_per_paste = []
    for w, v in dico_week_per_paste.items():
        array_week_per_paste.append((w, v))
    array_week_per_paste.sort(key=lambda tup: -tup[1])
    array_week_per_paste = array_week_per_paste[0:20]


    # suppress every terms in top sets
    for curr_set, curr_num_day in top_termFreq_set_array[1:3]:
        for w in server_term.zrange(curr_set, 0, -1):
            server_term.zrem(curr_set, w)
        for w in server_term.zrange("per_paste_" + curr_set, 0, -1):
            server_term.zrem("per_paste_" + curr_set, w)

    # Add top term from sorted array in their respective sorted sets
    for elem in array_week:
        server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0])
    for elem in array_week_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_week[0], float(elem[1]), elem[0])

    for elem in array_month:
        server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0])
    for elem in array_month_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_month[0], float(elem[1]), elem[0])

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
    print("refreshed module")


if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # For Module Manager
    r_temp = redis.StrictRedis(
        host=cfg.get('RedisPubSub', 'host'),
        port=cfg.getint('RedisPubSub', 'port'),
        db=cfg.getint('RedisPubSub', 'db'),
        decode_responses=True)

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
    r_temp.sadd("MODULE_TYPE_"+ "CurveManageTopSets" , str(os.getpid()))

    server_term = redis.StrictRedis(
        host=cfg.get("ARDB_TermFreq", "host"),
        port=cfg.getint("ARDB_TermFreq", "port"),
        db=cfg.getint("ARDB_TermFreq", "db"),
        decode_responses=True)

    publisher.info("Script Curve_manage_top_set started")

    # Sent to the logging a description of the module
    publisher.info("Manage the top sets with the data created by the module curve.")

    manage_top_set()

    while True:
        # Get one message from the input queue (module only work if linked with a queue)
        time.sleep(Refresh_rate) # sleep a long time then manage the set
        manage_top_set()
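The removed manager's core job was merging 31 per-day sorted sets into weekly and monthly top-20 sets. A standalone sketch of that window aggregation (key convention from the module above; the connection is a placeholder):

import redis

ONE_DAY = 60 * 60 * 24
r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder

def top_terms_over_window(start_epoch, num_days, top_n=20):
    totals = {}
    # walk backwards one day at a time, summing each day's scores
    for ts in range(start_epoch, start_epoch - num_days * ONE_DAY, -ONE_DAY):
        day_key = 'TopTermFreq_set_day_{}'.format(ts)
        for word, score in r.zrevrangebyscore(day_key, '+inf', '-inf', withscores=True):
            totals[word] = totals.get(word, 0) + score
    return sorted(totals.items(), key=lambda kv: -kv[1])[:top_n]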
bin/DbCleaner.py (59 lines, new executable file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The TermTracker Module
===================

"""
import os
import sys
import time
import datetime

from pubsublogger import publisher

import NotificationHelper

from packages import Date
from packages import Item
from packages import Term

def clean_term_db_stat_token():
    all_stat_date = Term.get_all_token_stat_history()

    list_date_to_keep = Date.get_date_range(31)
    for date in all_stat_date:
        if date not in list_date_to_keep:
            # remove history
            Term.delete_token_statistics_by_date(date)

    print('Term Stats Cleaned')


if __name__ == "__main__":

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("DbCleaner started")

    config_section = 'TermTrackerMod'

    # low priority
    time.sleep(180)

    daily_cleaner = True
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    while True:

        if daily_cleaner:
            clean_term_db_stat_token()
            daily_cleaner = False
        else:
            sys.exit(0)
            time.sleep(600)

        new_date = datetime.datetime.now().strftime("%Y%m%d")
        if new_date != current_date:
            current_date = new_date
            daily_cleaner = True
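The cleaner's retention rule: keep token statistics for the last 31 days and delete everything older. The same window computation with plain datetime, and a hypothetical delete callback standing in for Term.delete_token_statistics_by_date:

import datetime

def get_date_range(num_day):
    # YYYYMMDD strings covering today and the previous num_day days
    today = datetime.date.today()
    return [(today - datetime.timedelta(days=i)).strftime('%Y%m%d')
            for i in range(num_day, -1, -1)]

def clean_token_stats(all_stat_dates, delete_by_date):
    keep = set(get_date_range(31))
    for date in all_stat_dates:
        if date not in keep:
            delete_by_date(date)  # e.g. Term.delete_token_statistics_by_date(date)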
bin/Dir.py (48 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import argparse
import redis
from pubsublogger import publisher
from packages.lib_words import create_dirfile
import configparser


def main():
    """Main Function"""

    # CONFIG #
    cfg = configparser.ConfigParser()
    cfg.read('./packages/config.cfg')

    parser = argparse.ArgumentParser(
        description='''This script is a part of the Analysis Information Leak
        framework. It create a redis list called "listfile" which contain
        the absolute filename of all the files from the directory given in
        the argument "directory".''',
        epilog='Example: ./Dir.py /home/2013/03/')

    parser.add_argument('directory', type=str,
                        help='The directory to run inside', action='store')

    parser.add_argument('-db', type=int, default=0,
                        help='The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4], action='store')

    parser.add_argument('-ow', help='trigger the overwritting mode',
                        action='store_true')

    args = parser.parse_args()

    r_serv = redis.StrictRedis(host=cfg.get("Redis_Queues", "host"),
                               port=cfg.getint("Redis_Queues", "port"),
                               db=cfg.getint("Redis_Queues", "db"),
                               decode_responses=True)

    publisher.port = 6380
    publisher.channel = "Script"

    create_dirfile(r_serv, args.directory, args.ow)

if __name__ == "__main__":
    main()
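The deleted script's only job was filling a Redis list named "listfile" with absolute file paths. A hypothetical reimplementation of that step (not the lib_words.create_dirfile original, whose exact behavior is not shown here):

import os
import redis

def push_dir_listing(r, directory, overwrite=False):
    if overwrite:
        r.delete('listfile')  # overwriting mode: start from an empty list
    for root, _dirs, files in os.walk(directory):
        for name in files:
            r.rpush('listfile', os.path.abspath(os.path.join(root, name)))

r = redis.StrictRedis(host='localhost', port=6381, db=0, decode_responses=True)  # placeholder
push_dir_listing(r, '/home/2013/03/')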
bin/LAUNCH.sh (modified)

@@ -153,14 +153,10 @@ function launching_scripts
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Duplicates" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Duplicates.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Lines" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Lines.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DomClassifier.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Categ" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Categ.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Tokenize" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tokenize.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "CreditCards" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CreditCards.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"

@@ -175,13 +171,9 @@ function launching_scripts
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Credential" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Credential.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Curve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Curve.py; read x"
+    screen -S "Script_AIL" -X screen -t "TermTrackerMod" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./TermTrackerMod.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "CurveManageTopSets" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CurveManageTopSets.py; read x"
+    screen -S "Script_AIL" -X screen -t "RegexTracker" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexTracker.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "RegexForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexForTermsFrequency.py; read x"
-    sleep 0.1
-    screen -S "Script_AIL" -X screen -t "SetForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SetForTermsFrequency.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "Indexer" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Indexer.py; read x"
     sleep 0.1

@@ -213,6 +205,8 @@ function launching_scripts
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
     sleep 0.1
+    screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"
+    sleep 0.1
     screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./submit_paste.py; read x"
bin/Lines.py (85 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The ZMQ_PubSub_Lines Module
============================

This module is consuming the Redis-list created by the ZMQ_PubSub_Line_Q
Module.

It perform a sorting on the line's length and publish/forward them to
differents channels:

*Channel 1 if max length(line) < max
*Channel 2 if max length(line) > max

The collected informations about the processed pastes
(number of lines and maximum length line) are stored in Redis.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (LevelDB & Redis)
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.

"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Lines'
    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='This script is a part of the Analysis Information \
                    Leak framework.')

    parser.add_argument(
        '-max', type=int, default=500,
        help='The limit between "short lines" and "long lines"',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
        on channel Longlines, Shortlines"
    publisher.info(tmp_string)

    while True:
        try:
            message = p.get_from_set()
            print(message)
            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Tokeniser is idling 10s")
                time.sleep(10)
                continue

            # FIXME do it in the paste class
            lines_infos = PST.get_lines_info()
            PST.save_attribute_redis("p_nb_lines", lines_infos[0])
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
            print(PST.p_rel_path)
            if lines_infos[1] < args.max:
                p.populate_set_out( PST.p_rel_path , 'LinesShort')
            else:
                p.populate_set_out( PST.p_rel_path , 'LinesLong')
        except IOError:
            print("CRC Checksum Error on : ", PST.p_rel_path)
@@ -9,7 +9,6 @@ import time
 import datetime
 import redis
 import os
-from packages import lib_words
 from packages.Date import Date
 from pubsublogger import publisher
 from Helper import Process
bin/RegexForTermsFrequency.py (157 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the regexs
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import Paste
import calendar
import re
import signal
import time
from Helper import Process
# Email notifications
from NotificationHelper import *


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

# Config Variables
DICO_REFRESH_TIME = 60  # s

BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="


def refresh_dicos():
    dico_regex = {}
    dico_regexname_to_redis = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
        dico_regexname_to_redis[regex_str[1:-1]] = regex_str

    return dico_regex, dico_regexname_to_redis

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # compile the regex
    dico_refresh_cooldown = time.time()
    dico_regex, dico_regexname_to_redis = refresh_dicos()

    message = p.get_from_set()

    # Regex Frequency
    while True:
        if message is not None:
            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
                dico_refresh_cooldown = time.time()
                dico_regex, dico_regexname_to_redis = refresh_dicos()
                print('dico got refreshed')

            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

            curr_set = top_termFreq_setName_day[0] + str(timestamp)
            paste = Paste.Paste(filename)
            content = paste.get_p_content()

            # iterate the word with the regex
            for regex_str, compiled_regex in dico_regex.items():

                signal.alarm(max_execution_time)
                try:
                    matched = compiled_regex.search(content)
                except TimeoutException:
                    print ("{0} processing timeout".format(paste.p_rel_path))
                    continue
                else:
                    signal.alarm(0)

                if matched is not None:  # there is a match
                    print('regex matched {}'.format(regex_str))
                    matched = matched.group(0)
                    regex_str_complete = "/" + regex_str + "/"
                    # Add in Regex track set only if term is not in the blacklist
                    if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name):
                        # Send a notification only when the member is in the set
                        if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                            # create mail body
                            mail_body = ("AIL Framework,\n"
                                         "New occurrence for regex: " + regex_str + "\n"
                                         ''+full_paste_url + filename)

                            # Send to every associated email adress
                            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
                                sendEmailNotification(email, 'Term', mail_body)

                            # tag paste
                            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete):
                                msg = '{};{}'.format(tag, filename)
                                p.populate_set_out(msg, 'Tags')

                        set_name = 'regex_' + dico_regexname_to_redis[regex_str]
                        new_to_the_set = server_term.sadd(set_name, filename)
                        new_to_the_set = True if new_to_the_set == 1 else False

                        # consider the num of occurence of this term
                        regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
                        # 1 term per paste
                        if new_to_the_set:
                            regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
                            server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
                            server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
                        else:
                            pass

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
        message = p.get_from_set()
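Both this removed module and its replacements wrap every regex search in a SIGALRM timer so one pathological pattern cannot stall the queue. The same pattern reduced to a self-contained sketch (the 2-second limit stands in for the max_execution_time config value; Unix only):

import re
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

def bounded_search(compiled_regex, content, max_execution_time=2):
    signal.alarm(max_execution_time)  # arm: SIGALRM fires after N seconds
    try:
        return compiled_regex.search(content)
    except TimeoutException:
        return None  # treat a timed-out pattern as "no match"
    finally:
        signal.alarm(0)  # disarm any pending alarm

print(bounded_search(re.compile(r'ab+c'), 'xxabbbcxx'))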
bin/RegexTracker.py (96 lines, new executable file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for regex tracking.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.

"""
import os
import re
import sys
import time
import signal

from Helper import Process
from pubsublogger import publisher

import NotificationHelper

from packages import Item
from packages import Term

full_item_url = "/showsavedpaste/?paste="
mail_body_template = "AIL Framework,\nNew occurrence for term tracked regex: {}\nitem id: {}\nurl: {}{}"

dict_regex_tracked = Term.get_regex_tracked_words_dict()
last_refresh = time.time()

class TimeoutException(Exception):
    pass
def timeout_handler(signum, frame):
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)

def new_term_found(term, term_type, item_id, item_date):
    uuid_list = Term.get_term_uuid_list(term, 'regex')
    print('new tracked term found: {} in {}'.format(term, item_id))

    for term_uuid in uuid_list:
        Term.add_tracked_item(term_uuid, item_id, item_date)

        tags_to_add = Term.get_term_tags(term_uuid)
        for tag in tags_to_add:
            msg = '{};{}'.format(tag, item_id)
            p.populate_set_out(msg, 'Tags')

        mail_to_notify = Term.get_term_mails(term_uuid)
        if mail_to_notify:
            mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
            for mail in mail_to_notify:
                NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script RegexTracker started")

    config_section = 'RegexTracker'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    # Regex Frequency
    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            for regex in dict_regex_tracked:

                signal.alarm(max_execution_time)
                try:
                    matched = dict_regex_tracked[regex].search(item_content)
                except TimeoutException:
                    print("{0} processing timeout".format(item_id))
                    continue
                else:
                    signal.alarm(0)

                if matched:
                    new_term_found(regex, 'regex', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
            dict_regex_tracked = Term.get_regex_tracked_words_dict()
            last_refresh = time.time()
            print('Tracked set refreshed')
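Instead of re-reading trackers on every item, RegexTracker keeps a local epoch and compares it against the tracked_term:refresh:regex value that Term.add_tracked_term bumps (see the OVERVIEW.md table above). That handshake, sketched with a placeholder connection:

import time
import redis

r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder

def maybe_reload(last_refresh, trackers, load_trackers):
    # writer side runs: r.set('tracked_term:refresh:regex', time.time())
    last_updated = float(r.get('tracked_term:refresh:regex') or 0)
    if last_refresh < last_updated:
        trackers = load_trackers()   # stale: re-pull the compiled regex dict
        last_refresh = time.time()
    return trackers, last_refresh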
bin/SetForTermsFrequency.py (151 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the sets
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"

top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.lstrip().strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2] #remove trailing ,
    quoted_tab += "]"
    return str(quoted_tab)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    #get the dico and matching percent
    dico_percent = {}
    dico_set_tab = {}
    dico_setname_to_redis = {}
    for set_str in server_term.smembers(TrackedSetSet_Name):
        tab_set = set_str[1:-1]
        tab_set = add_quote_inside_tab(tab_set)
        perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
        if perc_finder is not None:
            match_percent = perc_finder.group(0)[1:-1]
            dico_percent[tab_set] = float(match_percent)
            dico_set_tab[tab_set] = ast.literal_eval(tab_set)
            dico_setname_to_redis[tab_set] = set_str
        else:
            continue

    message = p.get_from_set()

    while True:
        if message is not None:
            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            content = Paste.Paste(filename).get_p_content()

            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            #iterate over the words of the file
            match_dico = {}
            for word in content.split():
                for cur_set, array_set in dico_set_tab.items():
                    for w_set in array_set[:-1]: #avoid the percent matching
                        if word == w_set:
                            try:
                                match_dico[str(array_set)] += 1
                            except KeyError:
                                match_dico[str(array_set)] = 1

            #compute matching %
            for the_set, matchingNum in match_dico.items():
                eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc of the percent matching
                if eff_percent >= dico_percent[the_set]:
                    # Send a notification only when the member is in the set
                    if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                        # create mail body
                        mail_body = ("AIL Framework,\n"
                                     "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n"
                                     ''+full_paste_url + filename)

                        # Send to every associated email adress
                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            sendEmailNotification(email, 'Term', mail_body)

                        # tag paste
                        for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            msg = '{};{}'.format(tag, filename)
                            p.populate_set_out(msg, 'Tags')

                    print(the_set, "matched in", filename)
                    set_name = 'set_' + dico_setname_to_redis[the_set]
                    new_to_the_set = server_term.sadd(set_name, filename)
                    new_to_the_set = True if new_to_the_set == 1 else False

                    #consider the num of occurence of this set
                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))

                    # FIXME - avoid using per paste as a set is checked over the entire paste
                    #1 term per paste
                    if new_to_the_set:
                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
                        server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
        message = p.get_from_set()
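The removed module fires a word-set tracker once a configured percentage of its words appears in one item (the eff_percent computation above). The same check in isolation:

def set_matches(item_words, word_set, match_percent):
    # word_set e.g. ['ransom', 'bitcoin', 'decrypt'] with a 50% threshold
    present = sum(1 for w in word_set if w in item_words)
    eff_percent = float(present) / len(word_set) * 100
    return eff_percent >= match_percent

print(set_matches({'pay', 'bitcoin', 'decrypt'}, ['ransom', 'bitcoin', 'decrypt'], 50))  # True: 2/3 = 66%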
bin/TermTrackerMod.py (modified)

@@ -8,13 +8,14 @@ The TermTracker Module
 import os
 import sys
 import time
+import signal

 from Helper import Process
 from pubsublogger import publisher

 import NotificationHelper

-from packages import Paste
+from packages import Item
 from packages import Term

 sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))

@@ -26,13 +27,22 @@ mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\n
 # loads tracked words
 list_tracked_words = Term.get_tracked_words_list()
+last_refresh_word = time.time()
 set_tracked_words_list = Term.get_set_tracked_words_list()
+last_refresh_set = time.time()

-def new_term_found(term, term_type, item_id):
-    uuid_list = Term.get_term_uuid_list(term)
+class TimeoutException(Exception):
+    pass
+def timeout_handler(signum, frame):
+    raise TimeoutException
+signal.signal(signal.SIGALRM, timeout_handler)
+
+def new_term_found(term, term_type, item_id, item_date):
+    uuid_list = Term.get_term_uuid_list(term, term_type)
+    print('new tracked term found: {} in {}'.format(term, item_id))

     for term_uuid in uuid_list:
-        Term.add_tracked_item(term_uuid, item_id)
+        Term.add_tracked_item(term_uuid, item_id, item_date)

         tags_to_add = Term.get_term_tags(term_uuid)
         for tag in tags_to_add:

@@ -52,28 +62,38 @@ if __name__ == "__main__":
     publisher.channel = "Script"
     publisher.info("Script TermTrackerMod started")

-    #config_section = 'TermTrackerMod'
     config_section = 'TermTrackerMod'
     p = Process(config_section)
+    max_execution_time = p.config.getint(config_section, "max_execution_time")

     full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

     while True:

         item_id = p.get_from_set()
-        item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz'
-        #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz'

-        if message is not None:
+        if item_id is not None:

-            paste = Paste.Paste(item_id)
+            item_date = Item.get_item_date(item_id)
+            item_content = Item.get_item_content(item_id)

-            dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())
+            signal.alarm(max_execution_time)
+            try:
+                dict_words_freq = Term.get_text_word_frequency(item_content)
+            except TimeoutException:
+                print("{0} processing timeout".format(item_id))
+                continue
+            else:
+                signal.alarm(0)
+
+            # create token statistics
+            for word in dict_words_freq:
+                Term.create_token_statistics(item_date, word, dict_words_freq[word])

             # check solo words
             for word in list_tracked_words:
                 if word in dict_words_freq:
-                    new_term_found(word, 'word', item_id)
+                    new_term_found(word, 'word', item_id, item_date)

             # check words set
             for elem in set_tracked_words_list:

@@ -86,7 +106,19 @@ if __name__ == "__main__":
                 if word in dict_words_freq:
                     nb_uniq_word += 1
             if nb_uniq_word >= nb_words_threshold:
-                new_term_found(word_set, 'set', item_id)
+                new_term_found(word_set, 'set', item_id, item_date)

         else:
             time.sleep(5)
+
+        # refresh Tracked term
+        if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
+            list_tracked_words = Term.get_tracked_words_list()
+            last_refresh_word = time.time()
+            print('Tracked word refreshed')
+
+        if last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
+            set_tracked_words_list = Term.get_set_tracked_words_list()
+            last_refresh_set = time.time()
+            print('Tracked set refreshed')
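The word and set checks above consume a per-item word-frequency table from Term.get_text_word_frequency. A rough stand-in with collections.Counter shows the shape of that table (hypothetical tokenizer, not AIL's helper):

from collections import Counter

def get_text_word_frequency(content):
    # naive lowercase whitespace tokenization; the real helper is more involved
    return Counter(content.lower().split())

dict_words_freq = get_text_word_frequency('Bitcoin ransom paid in bitcoin')
for word in ['bitcoin', 'monero']:
    if word in dict_words_freq:
        print('tracked word found:', word, dict_words_freq[word])  # bitcoin 2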
bin/Tokenize.py (71 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Tokenize Module
===================

This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
Module.

It tokenize the content of the paste and publish the result in the following
format:
channel_name+' '+/path/of/the/paste.gz+' '+tokenized_word+' '+scoring

..seealso:: Paste method (_get_top_words)

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            signal.alarm(5)
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
            except TimeoutException:
                p.incr_module_timeout_statistic()
                print ("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)
        else:
            publisher.debug("Tokeniser is idling 10s")
            time.sleep(10)
            print("Sleeping")
packages/Date.py (modified)

@@ -1,5 +1,7 @@
 #!/usr/bin/python3
+
+import datetime

 class Date(object):
     """docstring for Date"""
     def __init__(self, *args):

@@ -34,7 +36,6 @@ class Date(object):
         self.day = day

     def substract_day(self, numDay):
-        import datetime
         computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay)
         comp_year = str(computed_date.year)
         comp_month = str(computed_date.month).zfill(2)

@@ -50,3 +51,12 @@ def date_substract_day(date, num_day=1):
     new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) - datetime.timedelta(num_day)
     new_date = str(new_date).replace('-', '')
     return new_date
+
+def get_date_range(num_day):
+    curr_date = datetime.date.today()
+    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
+    date_list = []
+
+    for i in range(0, num_day+1):
+        date_list.append(date.substract_day(i))
+    return list(reversed(date_list))
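The new get_date_range returns num_day+1 YYYYMMDD strings ordered oldest to newest, today included; DbCleaner only tests membership against it. An equivalent with plain datetime for illustration:

import datetime

def get_date_range(num_day):
    today = datetime.date.today()
    dates = [(today - datetime.timedelta(days=i)).strftime('%Y%m%d')
             for i in range(num_day + 1)]
    return list(reversed(dates))  # oldest first, today last

print(get_date_range(2))  # e.g. ['20190731', '20190801', '20190802']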
@@ -2,10 +2,13 @@
 # -*-coding:UTF-8 -*

 import os
+import sys
 import gzip
 import redis

+sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
 import Flask_config
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
 import Date
 import Tag
@ -4,6 +4,7 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
import redis
|
import redis
|
||||||
import datetime
|
import datetime
|
||||||
|
@@ -72,14 +73,30 @@ def get_set_tracked_words_list():
         all_set_list.append((ter_set, num_words, elem))
     return all_set_list
 
-def is_term_tracked_in_global_level(term):
-    res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
+def get_regex_tracked_words_dict():
+    regex_list = r_serv_term.smembers('all:tracked_term:regex')
+    dict_tracked_regex = {}
+    for regex in regex_list:
+        dict_tracked_regex[regex] = re.compile(regex)
+    return dict_tracked_regex
+
+def is_term_tracked_in_global_level(term, term_type):
+    res = r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))
     if res:
         for elem_uuid in res:
             if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1':
                 return True
     return False
 
+def is_term_tracked_in_user_level(term, term_type, user_id):
+    res = r_serv_term.smembers('user:tracked_term:{}'.format(user_id))
+    if res:
+        for elem_uuid in res:
+            if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'tracked')== term:
+                if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'type')== term_type:
+                    return True
+    return False
+
 def parse_json_term_to_add(dict_input, user_id):
     term = dict_input.get('term', None)
     if not term:

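Since the uuid map is now namespaced by term type, the same string tracked as a word and as a regex no longer collide under one key. A minimal sketch of the new layout, assuming a redis-py connection to the term database (host, port and db below are illustrative; AIL builds r_serv_term from its config file):

    import redis

    # illustrative connection parameters
    r_serv_term = redis.StrictRedis(host='localhost', port=6382, db=2, decode_responses=True)

    # 'foo' tracked as a word and 'foo' tracked as a regex live under distinct keys
    word_uuids = r_serv_term.smembers('all:tracked_term_uuid:word:foo')
    regex_uuids = r_serv_term.smembers('all:tracked_term_uuid:regex:foo')
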
@@ -112,7 +129,10 @@ def parse_json_term_to_add(dict_input, user_id):
 
     # check if term already tracked in global
     if level==1:
-        if is_term_tracked_in_global_level(term):
+        if is_term_tracked_in_global_level(term, term_type):
+            return ({"status": "error", "reason": "Term already tracked"}, 409)
+    else:
+        if is_term_tracked_in_user_level(term, term_type, user_id):
             return ({"status": "error", "reason": "Term already tracked"}, 409)
 
     term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)

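For reference, an input of the shape this parser handles might look like the sketch below. Only `term` is confirmed by the hunk above; the remaining field names follow the `type`/`level`/`tags`/`mails` values the module stores and are assumptions here, and every value is made up:

    # hypothetical input for parse_json_term_to_add
    dict_input = {
        "term": "bitcoin mixer",  # the word/set/regex to track
        "type": "word",           # word, set or regex
        "level": 1,               # 1 = tracked globally, 0 = this user only
        "tags": ["infoleak:analyst-detection"],
        "mails": ["analyst@example.com"],
    }
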
@@ -174,7 +194,7 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
     r_serv_term.sadd('all:tracked_term:{}'.format(term_type), term)
 
     # create term - uuid map
-    r_serv_term.sadd('all:tracked_term_uuid:{}'.format(term), term_uuid)
+    r_serv_term.sadd('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
 
     # add display level set
     if level == 0: # user only

@@ -190,15 +210,22 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
     for mail in mails:
         r_serv_term.sadd('tracked_term:mail:{}'.format(term_uuid), mail)
 
+    # toggle refresh module tracker list/set
+    r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())
+
     return term_uuid
 
 def delete_term(term_uuid):
     term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked')
     term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type')
     term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level')
-    r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid)
-    r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid)
+    r_serv_term.srem('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
+    # Term not tracked by other users
+    if not r_serv_term.exists('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
+        r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)
+
+        # toggle refresh module tracker list/set
+        r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())
 
     if level == 0: # user only
         user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id')

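The `tracked_term:refresh:<type>` epoch gives the word/set/regex tracker modules a cheap way to notice changes: they keep their tracked list in memory and reload it only when the stored epoch is newer than the one they loaded at. A sketch of the consumer side, using only functions added in this commit (the caching pattern itself is an assumption):

    import time
    import Term

    # load once and remember when
    last_refresh = time.time()
    dict_regex = Term.get_regex_tracked_words_dict()

    # in the module's main loop: reload only if add_tracked_term() or
    # delete_term() bumped tracked_term:refresh:regex since our last load
    if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
        dict_regex = Term.get_regex_tracked_words_dict()
        last_refresh = time.time()
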
@@ -218,8 +245,8 @@ def delete_term(term_uuid):
     # remove item set
     r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))
 
-def get_term_uuid_list(term):
-    return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)))
+def get_term_uuid_list(term, term_type):
+    return list(r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)))
 
 def get_term_tags(term_uuid):
     return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid)))

@@ -227,10 +254,30 @@ def get_term_tags(term_uuid):
 def get_term_mails(term_uuid):
     return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid)))
 
-def add_tracked_item(term_uuid, item_id):
-    r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id)
+def add_tracked_item(term_uuid, item_id, item_date):
+    # track item
+    r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
+    # track nb item by date
+    r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)
+
+def create_token_statistics(item_date, word, nb):
+    r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1)
+    r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb)
+    r_serv_term.sadd('stat_token_history', item_date)
+
+def delete_token_statistics_by_date(item_date):
+    r_serv_term.delete('stat_token_per_item_by_day:{}'.format(item_date))
+    r_serv_term.delete('stat_token_total_by_day:{}'.format(item_date))
+    r_serv_term.srem('stat_token_history', item_date)
+
+def get_all_token_stat_history():
+    return r_serv_term.smembers('stat_token_history')
+
+def get_tracked_term_last_updated_by_type(term_type):
+    epoch_update = r_serv_term.get('tracked_term:refresh:{}'.format(term_type))
+    if not epoch_update:
+        epoch_update = 0
+    return float(epoch_update)

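Because `tracked_term:stat:<uuid>` and the two `stat_token_*_by_day` keys are sorted sets scored by count, the daily numbers can be read back directly with zscore/zrevrange. An illustrative read-back, reusing the r_serv_term connection sketched earlier (uuid and date are placeholders):

    # placeholders for illustration
    term_uuid = '00000000-0000-0000-0000-000000000000'
    item_date = '20190521'

    # number of items matching this tracked term on that day
    nb_seen = r_serv_term.zscore('tracked_term:stat:{}'.format(term_uuid), item_date)

    # ten most frequent tokens of the day, with their counts
    top_words = r_serv_term.zrevrange('stat_token_total_by_day:{}'.format(item_date), 0, 9, withscores=True)
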
@@ -107,7 +107,10 @@ operation_mode = 3
 ttl_duplicate = 86400
 default_unnamed_feed_name = unnamed_feeder
 
-[RegexForTermsFrequency]
+[TermTrackerMod]
+max_execution_time = 120
+
+[RegexTracker]
 max_execution_time = 60
 
 ##### Redis #####

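The two new sections give each tracker its own timeout budget. A minimal sketch of reading them back (the config path is illustrative; AIL resolves the real one from its environment):

    import configparser

    # RawConfigParser avoids interpolating '%' characters in other values
    cfg = configparser.RawConfigParser()
    cfg.read('bin/packages/config.cfg')
    term_tracker_timeout = cfg.getint('TermTrackerMod', 'max_execution_time')  # 120
    regex_tracker_timeout = cfg.getint('RegexTracker', 'max_execution_time')   # 60
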
@@ -11,62 +11,10 @@ from dateutil.rrule import rrule, DAILY
 import csv
 
 
-def listdirectory(path):
-    """Path Traversing Function.
-
-    :param path: -- The absolute pathname to a directory.
-
-    This function is returning all the absolute path of the files contained in
-    the argument directory.
-
-    """
-    fichier = []
-    for root, dirs, files in os.walk(path):
-
-        for i in files:
-
-            fichier.append(os.path.join(root, i))
-
-    return fichier
-
 clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
 """It filters out non-printable characters from the string it receives."""
 
 
-def create_dirfile(r_serv, directory, overwrite):
-    """Create a file of path.
-
-    :param r_serv: -- connexion to redis database
-    :param directory: -- The folder where to launch the listing of the .gz files
-
-    This function create a list in redis with inside the absolute path
-    of all the pastes needed to be proceeded by function using parallel
-    (like redis_words_ranking)
-
-    """
-    if overwrite:
-        r_serv.delete("filelist")
-
-        for x in listdirectory(directory):
-            r_serv.lpush("filelist", x)
-
-        publisher.info("The list was overwritten")
-
-    else:
-        if r_serv.llen("filelist") == 0:
-
-            for x in listdirectory(directory):
-                r_serv.lpush("filelist", x)
-
-            publisher.info("New list created")
-        else:
-
-            for x in listdirectory(directory):
-                r_serv.lpush("filelist", x)
-
-            publisher.info("The list was updated with new elements")
-
-
 def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
     """Create a csv file used with dygraph.
 

@@ -19,36 +19,17 @@ subscribe = Redis_Global
 [Attributes]
 subscribe = Redis_Global
 
-[Lines]
-subscribe = Redis_Global
-publish = Redis_LinesShort,Redis_LinesLong
-
 [DomClassifier]
 subscribe = Redis_Global
 
-[Tokenize]
-subscribe = Redis_LinesShort
-publish = Redis_Words
-
-[Curve]
-subscribe = Redis_Words
-publish = Redis_CurveManageTopSets,Redis_Tags
-
 [TermTrackerMod]
 subscribe = Redis_Global
 publish = Redis_Tags
 
-[RegexForTermsFrequency]
+[RegexTracker]
 subscribe = Redis_Global
 publish = Redis_Tags
 
-[SetForTermsFrequency]
-subscribe = Redis_Global
-publish = Redis_Tags
-
-[CurveManageTopSets]
-subscribe = Redis_CurveManageTopSets
-
 [Categ]
 subscribe = Redis_Global
 publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey