Mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-26 15:57:16 +00:00

commit 1008c7c4fe (parent d9bdfecef3)
chg: [Term Tracker] refactor term tracker word/set/regex modules + remove old modules

19 changed files with 304 additions and 981 deletions
OVERVIEW.md (37 changes)

@@ -138,12 +138,12 @@ Redis and ARDB overview

| Set - Key | Value |
| ------ | ------ |
| all:tracked_term_uuid:**tracked term** | **uuid - tracked term uuid** |
| all:tracked_term_uuid:**term type**:**tracked term** | **uuid - tracked term uuid** |

##### All Term Tracked items:
| Set - Key | Value |
| ------ | ------ |
| tracked_term:item:**uuid** | **item_id** |
| tracked_term:item:**uuid**:**date** | **item_id** |

##### All Term Tracked tags:
| Set - Key | Value |
| ------ | ------ |

@@ -155,6 +155,29 @@ Redis and ARDB overview

| ------ | ------ |
| tracked_term:mail:**uuid** | **mail** |

##### Refresh Tracked term:
| Key | Value |
| ------ | ------ |
| tracked_term:refresh:word | **last refreshed epoch** |
| tracked_term:refresh:set | - |
| tracked_term:refresh:regex | - |

##### Zset Stat Tracked term:
| Key | Field | Value |
| ------ | ------ | ------ |
| tracked_term:stat:**uuid** | **date** | **nb_seen** |

##### Stat token:
| Key | Field | Value |
| ------ | ------ | ------ |
| stat_token_total_by_day:**date** | **word** | **nb_seen** |
| stat_token_per_item_by_day:**date** | **word** | **nb_seen** |

| Set - Key | Value |
| ------ | ------ |
| stat_token_history | **date** |

## DB2 - TermFreq:

##### Set:

@@ -167,16 +190,6 @@ Redis and ARDB overview

| TrackedRegexSet | **tracked_regex** |
| | |
| | |
| global:TrackedSetTermSet | **tracked_term** |
| global:TrackedSetSet | **tracked_set** |
| global:TrackedRegexSet | **tracked_regex** |
| | |
| | |
| user:**user_id**:TrackedSetTermSet | **tracked_term** |
| user:**user_id**:TrackedSetSet | **tracked_set** |
| user:**user_id**:TrackedRegexSet | **tracked_regex** |
| | |
| | |
| tracked_**tracked_term** | **item_path** |
| set_**tracked_set** | **item_path** |
| regex_**tracked_regex** | **item_path** |
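To make the key layout above concrete, here is a minimal sketch (not part of the commit) of reading one tracker's data with redis-py. The connection values, the example term, and the example date are assumptions; the key names come straight from the tables above:

import redis

# Hypothetical connection values; the real ones come from packages/config.cfg
r = redis.StrictRedis(host='localhost', port=6382, db=2, decode_responses=True)

term_type = 'word'
term = 'bitcoin'

# one tracked term can map to several uuids (one per user or global tracker)
for term_uuid in r.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
    # items matched on a given day for this tracker
    items = r.smembers('tracked_term:item:{}:{}'.format(term_uuid, '20190802'))
    # number of items seen per day (zset member = date, score = nb_seen)
    stats = r.zrange('tracked_term:stat:{}'.format(term_uuid), 0, -1, withscores=True)
    print(term_uuid, len(items), stats)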
bin/Curve.py (184 changes, deleted)

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.

This module updates a .csv file used to draw curves representing selected
words and their occurrence per day.

..note:: The channel will have the name of the file created.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.


This Module is also used for term frequency.

/!\ Top set management is done in the module Curve_manage_top_set


Requirements
------------

*Need running Redis instances. (Redis)
*Categories files of words in /files/ need to be created
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar

from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def check_if_tracked_term(term, path):
    if term in server_term.smembers(TrackedTermsSet_Name):
        # add_paste to tracked_word_set
        set_name = "tracked_" + term
        server_term.sadd(set_name, path)
        print(term, 'added', set_name, '->', path)
        p.populate_set_out("New Term added", 'CurveManageTopSets')

        # Send a notification only when the member is in the set
        if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):

            # create mail body
            mail_body = ("AIL Framework,\n"
                         "New occurrence for term: " + term + "\n"
                         '' + full_paste_url + path)

            # Send to every associated email address
            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
                sendEmailNotification(email, 'Term', mail_body)

            # tag paste
            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + term):
                msg = '{};{}'.format(tag, path)
                p.populate_set_out(msg, 'Tags')


def getValueOverRange(word, startDate, num_day):
    to_return = 0
    for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
        value = server_term.hget(timestamp, word)
        to_return += int(value) if value is not None else 0
    return to_return


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Curve'
    p = Process(config_section)

    # REDIS #
    r_serv1 = redis.StrictRedis(
        host=p.config.get("ARDB_Curve", "host"),
        port=p.config.get("ARDB_Curve", "port"),
        db=p.config.get("ARDB_Curve", "db"),
        decode_responses=True)

    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script Curve started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # FILE CURVE SECTION #
    csv_path = os.path.join(os.environ['AIL_HOME'],
                            p.config.get("Directories", "wordtrending_csv"))
    wordfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "wordsfile"))

    message = p.get_from_set()
    prec_filename = None
    generate_new_graph = False

    # Term Frequency
    top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
    top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
    top_termFreq_setName_month = ["TopTermFreq_set_month", 31]

    while True:

        if message is not None:
            generate_new_graph = True

            filename, word, score = message.split()
            temp = filename.split('/')
            date = temp[-4] + temp[-3] + temp[-2]
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            low_word = word.lower()
            # Old curve with words in file
            r_serv1.hincrby(low_word, date, int(score))

            # Update redis
            # consider the num of occurrences of this term
            curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score)))
            # 1 term per paste
            curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1)))

            # Add in set only if term is not in the blacklist
            if low_word not in server_term.smembers(BlackListTermsSet_Name):
                # consider the num of occurrences of this term
                server_term.zincrby(curr_set, low_word, float(score))
                # 1 term per paste
                server_term.zincrby("per_paste_" + curr_set, low_word, float(1))

                # Add more info for tracked terms
                check_if_tracked_term(low_word, filename)

                # send to RegexForTermsFrequency
                to_send = "{} {} {}".format(filename, timestamp, word)
                p.populate_set_out(to_send, 'RegexForTermsFrequency')

        else:

            if generate_new_graph:
                generate_new_graph = False
                print('Building graph')
                today = datetime.date.today()
                year = today.year
                month = today.month

                lib_words.create_curve_with_word_file(r_serv1, csv_path,
                                                      wordfile_path, year,
                                                      month)

            publisher.debug("Script Curve is Idling")
            print("sleeping")
            time.sleep(10)

        message = p.get_from_set()
bin/CurveManageTopSets.py (166 changes, deleted)

@@ -1,166 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""

This module manages top sets for terms frequency.
Every 'refresh_rate' it updates the weekly and monthly sets.

"""

import redis
import time
import datetime
import copy
from pubsublogger import publisher
from packages import lib_words
import datetime
import calendar
import os
import configparser

# Config Variables
Refresh_rate = 60*5  # sec
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
num_day_month = 31
num_day_week = 7

top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]


def manage_top_set():
    startDate = datetime.datetime.now()
    startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0)
    startDate = calendar.timegm(startDate.timetuple())
    blacklist_size = int(server_term.scard(BlackListTermsSet_Name))

    dico = {}
    dico_per_paste = {}

    # Retrieve top data (max_card + blacklist_size) from days sets
    for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay):
        curr_set = top_termFreq_setName_day[0] + str(timestamp)
        array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)
        array_top_day_per_paste = server_term.zrevrangebyscore("per_paste_" + curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)

        for word, value in array_top_day:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico.keys():
                    dico[word] += value
                else:
                    dico[word] = value

        for word, value in array_top_day_per_paste:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico_per_paste.keys():
                    dico_per_paste[word] += value
                else:
                    dico_per_paste[word] = value

        if timestamp == startDate - num_day_week*oneDay:
            dico_week = copy.deepcopy(dico)
            dico_week_per_paste = copy.deepcopy(dico_per_paste)

    # convert dico into sorted array
    array_month = []
    for w, v in dico.items():
        array_month.append((w, v))
    array_month.sort(key=lambda tup: -tup[1])
    array_month = array_month[0:20]

    array_week = []
    for w, v in dico_week.items():
        array_week.append((w, v))
    array_week.sort(key=lambda tup: -tup[1])
    array_week = array_week[0:20]

    # convert dico_per_paste into sorted array
    array_month_per_paste = []
    for w, v in dico_per_paste.items():
        array_month_per_paste.append((w, v))
    array_month_per_paste.sort(key=lambda tup: -tup[1])
    array_month_per_paste = array_month_per_paste[0:20]

    array_week_per_paste = []
    for w, v in dico_week_per_paste.items():
        array_week_per_paste.append((w, v))
    array_week_per_paste.sort(key=lambda tup: -tup[1])
    array_week_per_paste = array_week_per_paste[0:20]


    # suppress every term in top sets
    for curr_set, curr_num_day in top_termFreq_set_array[1:3]:
        for w in server_term.zrange(curr_set, 0, -1):
            server_term.zrem(curr_set, w)
        for w in server_term.zrange("per_paste_" + curr_set, 0, -1):
            server_term.zrem("per_paste_" + curr_set, w)

    # Add top terms from the sorted arrays in their respective sorted sets
    for elem in array_week:
        server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0])
    for elem in array_week_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_week[0], float(elem[1]), elem[0])

    for elem in array_month:
        server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0])
    for elem in array_month_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_month[0], float(elem[1]), elem[0])

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_" + "CurveManageTopSets" + "_" + str(os.getpid()), value)
    print("refreshed module")


if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # For Module Manager
    r_temp = redis.StrictRedis(
        host=cfg.get('RedisPubSub', 'host'),
        port=cfg.getint('RedisPubSub', 'port'),
        db=cfg.getint('RedisPubSub', 'db'),
        decode_responses=True)

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_" + "CurveManageTopSets" + "_" + str(os.getpid()), value)
    r_temp.sadd("MODULE_TYPE_" + "CurveManageTopSets", str(os.getpid()))

    server_term = redis.StrictRedis(
        host=cfg.get("ARDB_TermFreq", "host"),
        port=cfg.getint("ARDB_TermFreq", "port"),
        db=cfg.getint("ARDB_TermFreq", "db"),
        decode_responses=True)

    publisher.info("Script Curve_manage_top_set started")

    # Sent to the logging a description of the module
    publisher.info("Manage the top sets with the data created by the module curve.")

    manage_top_set()

    while True:
        # Get one message from the input queue (module only work if linked with a queue)
        time.sleep(Refresh_rate)  # sleep a long time then manage the set
        manage_top_set()
bin/DbCleaner.py (59 changes, new executable file)

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The TermTracker Module
===================

"""
import os
import sys
import time
import datetime

from pubsublogger import publisher

import NotificationHelper

from packages import Date
from packages import Item
from packages import Term

def clean_term_db_stat_token():
    all_stat_date = Term.get_all_token_stat_history()

    list_date_to_keep = Date.get_date_range(31)
    for date in all_stat_date:
        if date not in list_date_to_keep:
            # remove history
            Term.delete_token_statistics_by_date(date)

    print('Term Stats Cleaned')


if __name__ == "__main__":

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("DbCleaner started")

    config_section = 'TermTrackerMod'

    # low priority
    time.sleep(180)

    daily_cleaner = True
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    while True:

        if daily_cleaner:
            clean_term_db_stat_token()
            daily_cleaner = False
        else:
            sys.exit(0)
            time.sleep(600)

        new_date = datetime.datetime.now().strftime("%Y%m%d")
        if new_date != current_date:
            current_date = new_date
            daily_cleaner = True
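The retention rule is easiest to see with concrete dates. A small illustrative sketch (the dates are made up; the real list comes from Date.get_date_range(31), added in bin/packages/Date.py further down):

# Illustrative only: if today were 20190802, Date.get_date_range(31) would
# return the 32 dates from 20190702 through 20190802 as YYYYMMDD strings.
list_date_to_keep = ['20190702', '20190703', '20190802']   # abbreviated
all_stat_date = ['20190615', '20190702', '20190802']       # from stat_token_history

for date in all_stat_date:
    if date not in list_date_to_keep:
        # only 20190615 falls outside the window; its keys
        # stat_token_per_item_by_day:20190615, stat_token_total_by_day:20190615
        # and its stat_token_history member would be deleted
        print('would delete token statistics for', date)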
bin/Dir.py (48 changes, deleted)

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import argparse
import redis
from pubsublogger import publisher
from packages.lib_words import create_dirfile
import configparser


def main():
    """Main Function"""

    # CONFIG #
    cfg = configparser.ConfigParser()
    cfg.read('./packages/config.cfg')

    parser = argparse.ArgumentParser(
        description='''This script is a part of the Analysis Information Leak
        framework. It creates a redis list called "listfile" which contains
        the absolute filename of all the files from the directory given in
        the argument "directory".''',
        epilog='Example: ./Dir.py /home/2013/03/')

    parser.add_argument('directory', type=str,
                        help='The directory to run inside', action='store')

    parser.add_argument('-db', type=int, default=0,
                        help='The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4], action='store')

    parser.add_argument('-ow', help='trigger the overwriting mode',
                        action='store_true')

    args = parser.parse_args()

    r_serv = redis.StrictRedis(host=cfg.get("Redis_Queues", "host"),
                               port=cfg.getint("Redis_Queues", "port"),
                               db=cfg.getint("Redis_Queues", "db"),
                               decode_responses=True)

    publisher.port = 6380
    publisher.channel = "Script"

    create_dirfile(r_serv, args.directory, args.ow)

if __name__ == "__main__":
    main()
bin/LAUNCH.sh

@@ -153,14 +153,10 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Duplicates" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Duplicates.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Lines" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Lines.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DomClassifier.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Categ" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Categ.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Tokenize" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tokenize.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "CreditCards" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CreditCards.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"

@@ -175,13 +171,9 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Credential" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Credential.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Curve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Curve.py; read x"
    screen -S "Script_AIL" -X screen -t "TermTrackerMod" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./TermTrackerMod.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "CurveManageTopSets" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CurveManageTopSets.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "RegexForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexForTermsFrequency.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "SetForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SetForTermsFrequency.py; read x"
    screen -S "Script_AIL" -X screen -t "RegexTracker" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexTracker.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Indexer" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Indexer.py; read x"
    sleep 0.1

@@ -213,6 +205,8 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./submit_paste.py; read x"
bin/Lines.py (85 changes, deleted)

@@ -1,85 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The ZMQ_PubSub_Lines Module
============================

This module is consuming the Redis-list created by the ZMQ_PubSub_Line_Q
Module.

It performs a sorting on the line's length and publishes/forwards them to
different channels:

*Channel 1 if max length(line) < max
*Channel 2 if max length(line) > max

The collected information about the processed pastes
(number of lines and maximum length line) is stored in Redis.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (LevelDB & Redis)
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.

"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Lines'
    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='This script is a part of the Analysis Information \
                    Leak framework.')

    parser.add_argument(
        '-max', type=int, default=500,
        help='The limit between "short lines" and "long lines"',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
                 on channel Longlines, Shortlines"
    publisher.info(tmp_string)

    while True:
        try:
            message = p.get_from_set()
            print(message)
            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Tokeniser is idling 10s")
                time.sleep(10)
                continue

            # FIXME do it in the paste class
            lines_infos = PST.get_lines_info()
            PST.save_attribute_redis("p_nb_lines", lines_infos[0])
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
            print(PST.p_rel_path)
            if lines_infos[1] < args.max:
                p.populate_set_out(PST.p_rel_path, 'LinesShort')
            else:
                p.populate_set_out(PST.p_rel_path, 'LinesLong')
        except IOError:
            print("CRC Checksum Error on : ", PST.p_rel_path)
@@ -9,7 +9,6 @@ import time
import datetime
import redis
import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process
bin/RegexForTermsFrequency.py (157 changes, deleted)

@@ -1,157 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import Paste
import calendar
import re
import signal
import time
from Helper import Process
# Email notifications
from NotificationHelper import *


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

# Config Variables
DICO_REFRESH_TIME = 60  # s

BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="


def refresh_dicos():
    dico_regex = {}
    dico_regexname_to_redis = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
        dico_regexname_to_redis[regex_str[1:-1]] = regex_str

    return dico_regex, dico_regexname_to_redis

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # compile the regex
    dico_refresh_cooldown = time.time()
    dico_regex, dico_regexname_to_redis = refresh_dicos()

    message = p.get_from_set()

    # Regex Frequency
    while True:

        if message is not None:
            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
                dico_refresh_cooldown = time.time()
                dico_regex, dico_regexname_to_redis = refresh_dicos()
                print('dico got refreshed')

            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

            curr_set = top_termFreq_setName_day[0] + str(timestamp)
            paste = Paste.Paste(filename)
            content = paste.get_p_content()

            # iterate the word with the regex
            for regex_str, compiled_regex in dico_regex.items():

                signal.alarm(max_execution_time)
                try:
                    matched = compiled_regex.search(content)
                except TimeoutException:
                    print("{0} processing timeout".format(paste.p_rel_path))
                    continue
                else:
                    signal.alarm(0)

                if matched is not None:  # there is a match
                    print('regex matched {}'.format(regex_str))
                    matched = matched.group(0)
                    regex_str_complete = "/" + regex_str + "/"
                    # Add in Regex track set only if term is not in the blacklist
                    if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name):
                        # Send a notification only when the member is in the set
                        if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                            # create mail body
                            mail_body = ("AIL Framework,\n"
                                         "New occurrence for regex: " + regex_str + "\n"
                                         '' + full_paste_url + filename)

                            # Send to every associated email address
                            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
                                sendEmailNotification(email, 'Term', mail_body)

                        # tag paste
                        for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete):
                            msg = '{};{}'.format(tag, filename)
                            p.populate_set_out(msg, 'Tags')

                        set_name = 'regex_' + dico_regexname_to_redis[regex_str]
                        new_to_the_set = server_term.sadd(set_name, filename)
                        new_to_the_set = True if new_to_the_set == 1 else False

                        # consider the num of occurrences of this term
                        regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
                        # 1 term per paste
                        if new_to_the_set:
                            regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
                            server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
                            server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
                        else:
                            pass

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)

        message = p.get_from_set()
bin/RegexTracker.py (96 changes, new executable file)

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for regex tracking.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.

"""
import os
import re
import sys
import time
import signal

from Helper import Process
from pubsublogger import publisher

import NotificationHelper

from packages import Item
from packages import Term

full_item_url = "/showsavedpaste/?paste="
mail_body_template = "AIL Framework,\nNew occurrence for term tracked regex: {}\nitem id: {}\nurl: {}{}"

dict_regex_tracked = Term.get_regex_tracked_words_dict()
last_refresh = time.time()

class TimeoutException(Exception):
    pass
def timeout_handler(signum, frame):
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)

def new_term_found(term, term_type, item_id, item_date):
    uuid_list = Term.get_term_uuid_list(term, 'regex')
    print('new tracked term found: {} in {}'.format(term, item_id))

    for term_uuid in uuid_list:
        Term.add_tracked_item(term_uuid, item_id, item_date)

        tags_to_add = Term.get_term_tags(term_uuid)
        for tag in tags_to_add:
            msg = '{};{}'.format(tag, item_id)
            p.populate_set_out(msg, 'Tags')

        mail_to_notify = Term.get_term_mails(term_uuid)
        if mail_to_notify:
            mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
            for mail in mail_to_notify:
                NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script RegexTracker started")

    config_section = 'RegexTracker'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    # Regex Frequency
    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            for regex in dict_regex_tracked:

                signal.alarm(max_execution_time)
                try:
                    matched = dict_regex_tracked[regex].search(item_content)
                except TimeoutException:
                    print("{0} processing timeout".format(item_id))
                    continue
                else:
                    signal.alarm(0)

                if matched:
                    new_term_found(regex, 'regex', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
            dict_regex_tracked = Term.get_regex_tracked_words_dict()
            last_refresh = time.time()
            print('Tracked set refreshed')
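A tracker like this is fed through the Term package API changed below. A minimal sketch (not part of the commit) of registering a regex so RegexTracker picks it up on its next refresh cycle; the user id, the regex, and the tag value are illustrative assumptions, while the function and its signature come from the Term.py diff below:

from packages import Term

# level=1 registers the tracker globally; level=0 keeps it per user.
# add_tracked_term() also bumps tracked_term:refresh:regex, which the
# RegexTracker loop polls via get_tracked_term_last_updated_by_type('regex').
term_uuid = Term.add_tracked_term('amazon\.fr', 'regex', 'user@example.org', 1,
                                  ['infoleak:automatic-detection="tracked-regex"'], [])
print('tracker created:', term_uuid)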
bin/SetForTermsFrequency.py (151 changes, deleted)

@@ -1,151 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and tests the sets
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"

top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.lstrip().strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2]  # remove trailing ,
    quoted_tab += "]"
    return str(quoted_tab)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # get the dico and matching percent
    dico_percent = {}
    dico_set_tab = {}
    dico_setname_to_redis = {}
    for set_str in server_term.smembers(TrackedSetSet_Name):
        tab_set = set_str[1:-1]
        tab_set = add_quote_inside_tab(tab_set)
        perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
        if perc_finder is not None:
            match_percent = perc_finder.group(0)[1:-1]
            dico_percent[tab_set] = float(match_percent)
            dico_set_tab[tab_set] = ast.literal_eval(tab_set)
            dico_setname_to_redis[tab_set] = set_str
        else:
            continue

    message = p.get_from_set()

    while True:

        if message is not None:
            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            content = Paste.Paste(filename).get_p_content()

            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            # iterate over the words of the file
            match_dico = {}
            for word in content.split():
                for cur_set, array_set in dico_set_tab.items():
                    for w_set in array_set[:-1]:  # avoid the percent matching
                        if word == w_set:
                            try:
                                match_dico[str(array_set)] += 1
                            except KeyError:
                                match_dico[str(array_set)] = 1

            # compute matching %
            for the_set, matchingNum in match_dico.items():
                eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100  # -1 bc of the percent matching
                if eff_percent >= dico_percent[the_set]:
                    # Send a notification only when the member is in the set
                    if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                        # create mail body
                        mail_body = ("AIL Framework,\n"
                                     "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n"
                                     '' + full_paste_url + filename)

                        # Send to every associated email address
                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            sendEmailNotification(email, 'Term', mail_body)

                    # tag paste
                    for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                        msg = '{};{}'.format(tag, filename)
                        p.populate_set_out(msg, 'Tags')

                    print(the_set, "matched in", filename)
                    set_name = 'set_' + dico_setname_to_redis[the_set]
                    new_to_the_set = server_term.sadd(set_name, filename)
                    new_to_the_set = True if new_to_the_set == 1 else False

                    # consider the num of occurrences of this set
                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))

                    # FIXME - avoid using per paste as a set is checked over the entire paste
                    # 1 term per paste
                    if new_to_the_set:
                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
                        server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)

        message = p.get_from_set()
bin/TermTrackerMod.py

@@ -8,13 +8,14 @@ The TermTracker Module
import os
import sys
import time
import signal

from Helper import Process
from pubsublogger import publisher

import NotificationHelper

from packages import Paste
from packages import Item
from packages import Term

sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))

@@ -26,13 +27,22 @@ mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\n

# loads tracked words
list_tracked_words = Term.get_tracked_words_list()
last_refresh_word = time.time()
set_tracked_words_list = Term.get_set_tracked_words_list()
last_refresh_set = time.time()

def new_term_found(term, term_type, item_id):
    uuid_list = Term.get_term_uuid_list(term)
class TimeoutException(Exception):
    pass
def timeout_handler(signum, frame):
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)

def new_term_found(term, term_type, item_id, item_date):
    uuid_list = Term.get_term_uuid_list(term, term_type)
    print('new tracked term found: {} in {}'.format(term, item_id))

    for term_uuid in uuid_list:
        Term.add_tracked_item(term_uuid, item_id)
        Term.add_tracked_item(term_uuid, item_id, item_date)

        tags_to_add = Term.get_term_tags(term_uuid)
        for tag in tags_to_add:

@@ -52,28 +62,38 @@ if __name__ == "__main__":
    publisher.channel = "Script"
    publisher.info("Script TermTrackerMod started")

    #config_section = 'TermTrackerMod'
    config_section = 'TermTrackerMod'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:

        item_id = p.get_from_set()
        item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz'
        #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz'

        if message is not None:
        if item_id is not None:

            paste = Paste.Paste(item_id)
            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())
            signal.alarm(max_execution_time)
            try:
                dict_words_freq = Term.get_text_word_frequency(item_content)
            except TimeoutException:
                print("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)

            # create token statistics
            for word in dict_words_freq:
                Term.create_token_statistics(item_date, word, dict_words_freq[word])

            # check solo words
            for word in list_tracked_words:
                if word in dict_words_freq:
                    new_term_found(word, 'word', item_id)
                    new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in set_tracked_words_list:

@@ -86,7 +106,19 @@ if __name__ == "__main__":
                if word in dict_words_freq:
                    nb_uniq_word += 1
            if nb_uniq_word >= nb_words_threshold:
                new_term_found(word_set, 'set', item_id)
                new_term_found(word_set, 'set', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
            list_tracked_words = Term.get_tracked_words_list()
            last_refresh_word = time.time()
            print('Tracked word refreshed')

        if last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
            set_tracked_words_list = Term.get_set_tracked_words_list()
            last_refresh_set = time.time()
            print('Tracked set refreshed')
bin/Tokenize.py (71 changes, deleted)

@@ -1,71 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Tokenize Module
===================

This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
Module.

It tokenizes the content of the paste and publishes the result in the following
format:
channel_name+' '+/path/of/the/paste.gz+' '+tokenized_word+' '+scoring

..seealso:: Paste method (_get_top_words)

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            signal.alarm(5)
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
            except TimeoutException:
                p.incr_module_timeout_statistic()
                print("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)
        else:
            publisher.debug("Tokeniser is idling 10s")
            time.sleep(10)
            print("Sleeping")
bin/packages/Date.py

@@ -1,5 +1,7 @@
#!/usr/bin/python3

import datetime

class Date(object):
    """docstring for Date"""
    def __init__(self, *args):

@@ -34,7 +36,6 @@ class Date(object):
        self.day = day

    def substract_day(self, numDay):
        import datetime
        computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay)
        comp_year = str(computed_date.year)
        comp_month = str(computed_date.month).zfill(2)

@@ -50,3 +51,12 @@ def date_substract_day(date, num_day=1):
    new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) - datetime.timedelta(num_day)
    new_date = str(new_date).replace('-', '')
    return new_date

def get_date_range(num_day):
    curr_date = datetime.date.today()
    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
    date_list = []

    for i in range(0, num_day+1):
        date_list.append(date.substract_day(i))
    return list(reversed(date_list))
bin/packages/Item.py

@@ -2,10 +2,13 @@
# -*-coding:UTF-8 -*

import os
import sys
import gzip
import redis

sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Date
import Tag
bin/packages/Term.py

@@ -4,6 +4,7 @@
import os
import re
import sys
import time
import uuid
import redis
import datetime

@@ -72,14 +73,30 @@ def get_set_tracked_words_list():
        all_set_list.append((ter_set, num_words, elem))
    return all_set_list

def is_term_tracked_in_global_level(term):
    res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
def get_regex_tracked_words_dict():
    regex_list = r_serv_term.smembers('all:tracked_term:regex')
    dict_tracked_regex = {}
    for regex in regex_list:
        dict_tracked_regex[regex] = re.compile(regex)
    return dict_tracked_regex

def is_term_tracked_in_global_level(term, term_type):
    res = r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))
    if res:
        for elem_uuid in res:
            if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1':
                return True
    return False

def is_term_tracked_in_user_level(term, term_type, user_id):
    res = r_serv_term.smembers('user:tracked_term:{}'.format(user_id))
    if res:
        for elem_uuid in res:
            if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'tracked')== term:
                if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'type')== term_type:
                    return True
    return False

def parse_json_term_to_add(dict_input, user_id):
    term = dict_input.get('term', None)
    if not term:

@@ -112,7 +129,10 @@ def parse_json_term_to_add(dict_input, user_id):

    # check if term already tracked in global
    if level==1:
        if is_term_tracked_in_global_level(term):
        if is_term_tracked_in_global_level(term, term_type):
            return ({"status": "error", "reason": "Term already tracked"}, 409)
    else:
        if is_term_tracked_in_user_level(term, term_type, user_id):
            return ({"status": "error", "reason": "Term already tracked"}, 409)

    term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)

@@ -174,7 +194,7 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
    r_serv_term.sadd('all:tracked_term:{}'.format(term_type), term)

    # create term - uuid map
    r_serv_term.sadd('all:tracked_term_uuid:{}'.format(term), term_uuid)
    r_serv_term.sadd('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)

    # add display level set
    if level == 0:  # user only

@@ -190,15 +210,22 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
    for mail in mails:
        r_serv_term.sadd('tracked_term:mail:{}'.format(term_uuid), mail)

    # toggle refresh module tracker list/set
    r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())

    return term_uuid

def delete_term(term_uuid):
    term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked')
    term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type')
    term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level')
    r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid)
    r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid)
    r_serv_term.srem('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
    # Term not tracked by other users
    if not r_serv_term.exists('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
        r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)

        # toggle refresh module tracker list/set
        r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())

    if level == 0:  # user only
        user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id')

@@ -218,8 +245,8 @@ def delete_term(term_uuid):
    # remove item set
    r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))

def get_term_uuid_list(term):
    return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)))
def get_term_uuid_list(term, term_type):
    return list(r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)))

def get_term_tags(term_uuid):
    return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid)))

@@ -227,10 +254,30 @@ def get_term_tags(term_uuid):
def get_term_mails(term_uuid):
    return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid)))

def add_tracked_item(term_uuid, item_id):
    r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id)
def add_tracked_item(term_uuid, item_id, item_date):
    # track item
    r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
    # track nb item by date
    r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)

def create_token_statistics(item_date, word, nb):
    r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1)
    r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb)
    r_serv_term.sadd('stat_token_history', item_date)

def delete_token_statistics_by_date(item_date):
    r_serv_term.delete('stat_token_per_item_by_day:{}'.format(item_date))
    r_serv_term.delete('stat_token_total_by_day:{}'.format(item_date))
    r_serv_term.srem('stat_token_history', item_date)

def get_all_token_stat_history():
    return r_serv_term.smembers('stat_token_history')

def get_tracked_term_last_updated_by_type(term_type):
    epoch_update = r_serv_term.get('tracked_term:refresh:{}'.format(term_type))
    if not epoch_update:
        epoch_update = 0
    return float(epoch_update)
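The refresh epoch added here is the contract between the web/API side and the tracker modules: every add or delete bumps tracked_term:refresh:<type>, and a module reloads its cached tracker list only when that epoch passes its own last refresh. A minimal sketch of the consumer side, mirroring the loops in TermTrackerMod and RegexTracker above (the processing step is elided):

import time
from packages import Term

tracked_words = Term.get_tracked_words_list()
last_refresh = time.time()

while True:
    # ... process one item against tracked_words ...

    # cheap check: one GET per loop instead of re-reading every tracker
    if last_refresh < Term.get_tracked_term_last_updated_by_type('word'):
        tracked_words = Term.get_tracked_words_list()
        last_refresh = time.time()
        print('Tracked word list refreshed')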
bin/packages/config.cfg

@@ -107,7 +107,10 @@ operation_mode = 3
ttl_duplicate = 86400
default_unnamed_feed_name = unnamed_feeder

[RegexForTermsFrequency]
[TermTrackerMod]
max_execution_time = 120

[RegexTracker]
max_execution_time = 60

##### Redis #####
bin/packages/lib_words.py

@@ -11,62 +11,10 @@ from dateutil.rrule import rrule, DAILY
import csv


def listdirectory(path):
    """Path Traversing Function.

    :param path: -- The absolute pathname to a directory.

    This function returns the absolute paths of all the files contained in
    the argument directory.

    """
    fichier = []
    for root, dirs, files in os.walk(path):

        for i in files:

            fichier.append(os.path.join(root, i))

    return fichier

clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
"""It filters out non-printable characters from the string it receives."""


def create_dirfile(r_serv, directory, overwrite):
    """Create a file of path.

    :param r_serv: -- connection to redis database
    :param directory: -- The folder where to launch the listing of the .gz files

    This function creates a list in redis containing the absolute path
    of all the pastes that need to be processed by functions using parallel
    (like redis_words_ranking)

    """
    if overwrite:
        r_serv.delete("filelist")

        for x in listdirectory(directory):
            r_serv.lpush("filelist", x)

        publisher.info("The list was overwritten")

    else:
        if r_serv.llen("filelist") == 0:

            for x in listdirectory(directory):
                r_serv.lpush("filelist", x)

            publisher.info("New list created")
        else:

            for x in listdirectory(directory):
                r_serv.lpush("filelist", x)

            publisher.info("The list was updated with new elements")


def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.
bin/packages/modules.cfg

@@ -19,36 +19,17 @@ subscribe = Redis_Global
[Attributes]
subscribe = Redis_Global

[Lines]
subscribe = Redis_Global
publish = Redis_LinesShort,Redis_LinesLong

[DomClassifier]
subscribe = Redis_Global

[Tokenize]
subscribe = Redis_LinesShort
publish = Redis_Words

[Curve]
subscribe = Redis_Words
publish = Redis_CurveManageTopSets,Redis_Tags

[TermTrackerMod]
subscribe = Redis_Global
publish = Redis_Tags

[RegexForTermsFrequency]
[RegexTracker]
subscribe = Redis_Global
publish = Redis_Tags

[SetForTermsFrequency]
subscribe = Redis_Global
publish = Redis_Tags

[CurveManageTopSets]
subscribe = Redis_CurveManageTopSets

[Categ]
subscribe = Redis_Global
publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey