diff --git a/OVERVIEW.md b/OVERVIEW.md
index 77339321..a3425155 100644
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@@ -138,12 +138,12 @@ Redis and ARDB overview
 | Set - Key | Value |
 | ------ | ------ |
-| all:tracked_term_uuid:**tracked term** | **uuid - tracked term uuid** |
+| all:tracked_term_uuid:**term type**:**tracked term** | **uuid - tracked term uuid** |
 
 ##### All Term Tracked items:
 | Set - Key | Value |
 | ------ | ------ |
-| tracked_term:item:**uuid** | **item_id** |
+| tracked_term:item:**uuid**:**date** | **item_id** |
 
 ##### All Term Tracked tags:
 | Set - Key | Value |
 | ------ | ------ |
@@ -155,6 +155,29 @@ Redis and ARDB overview
 | ------ | ------ |
 | tracked_term:mail:**uuid** | **mail** |
 
+##### Refresh Tracked term:
+| Key | Value |
+| ------ | ------ |
+| tracked_term:refresh:word | **last refreshed epoch** |
+| tracked_term:refresh:set | - |
+| tracked_term:refresh:regex | - |
+
+##### Zset Stat Tracked term:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| tracked_term:stat:**uuid** | **date** | **nb_seen** |
+
+##### Stat token:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| stat_token_total_by_day:**date** | **word** | **nb_seen** |
+| | | |
+| stat_token_per_item_by_day:**date** | **word** | **nb_seen** |
+
+| Set - Key | Value |
+| ------ | ------ |
+| stat_token_history | **date** |
+
 ## DB2 - TermFreq:
 
 ##### Set:
@@ -167,16 +190,6 @@ Redis and ARDB overview
 | TrackedRegexSet | **tracked_regex** |
 | | |
 | | |
-| global:TrackedSetTermSet | **tracked_term** |
-| global:TrackedSetSet | **tracked_set** |
-| global:TrackedRegexSet | **tracked_regex** |
-| | |
-| | |
-| user:**user_id**:TrackedSetTermSet | **tracked_term** |
-| user:**user_id**:TrackedSetSet | **tracked_set** |
-| user:**user_id**:TrackedRegexSet | **tracked_regex** |
-| | |
-| | |
 | tracked_**tracked_term** | **item_path** |
 | set_**tracked_set** | **item_path** |
 | regex_**tracked_regex** | **item_path** |
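The revised layout above namespaces the term-to-UUID lookup by term type and buckets tracked items and per-day counters by date. A minimal sketch of how a module could resolve the tracker UUIDs for a matched term and record a hit against these keys, assuming a `redis.StrictRedis` connection to the tracked-term database (connection parameters and the example item id are illustrative; the call shapes mirror the `Term.py` helpers later in this change):

```python
import redis

# illustrative connection parameters; real values come from the AIL configuration
r_serv_term = redis.StrictRedis(host='localhost', port=6382, db=2, decode_responses=True)

def record_tracked_hit(term, term_type, item_id, item_date):
    # all:tracked_term_uuid:<term type>:<tracked term> -> set of tracker uuids
    for term_uuid in r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
        # tracked_term:item:<uuid>:<date> -> set of item ids matched that day
        r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
        # tracked_term:stat:<uuid> -> zset of nb_seen per date
        # (member-then-amount argument order mirrors the Term.py helpers in this change)
        r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)

# hypothetical item id and date, for illustration only
record_tracked_hit('password', 'word', 'archive/2019/08/02/example.gz', '20190802')
```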
diff --git a/bin/Curve.py b/bin/Curve.py
deleted file mode 100755
index c7083c54..00000000
--- a/bin/Curve.py
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-"""
-This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.
-
-This modules update a .csv file used to draw curves representing selected
-words and their occurency per day.
-
-..note:: The channel will have the name of the file created.
-
-..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
-the same Subscriber name in both of them.
-
-
-This Module is also used for term frequency.
-
-/!\ Top set management is done in the module Curve_manage_top_set
-
-
-Requirements
-------------
-
-*Need running Redis instances. (Redis)
-*Categories files of words in /files/ need to be created
-*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
-
-"""
-import redis
-import time
-from pubsublogger import publisher
-from packages import lib_words
-import os
-import datetime
-import calendar
-
-from Helper import Process
-
-# Email notifications
-from NotificationHelper import *
-
-# Config Variables
-BlackListTermsSet_Name = "BlackListSetTermSet"
-TrackedTermsSet_Name = "TrackedSetTermSet"
-top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
-oneDay = 60*60*24
-top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
-top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
-top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
-top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
-
-TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"
-
-# create direct link in mail
-full_paste_url = "/showsavedpaste/?paste="
-
-def check_if_tracked_term(term, path):
-    if term in server_term.smembers(TrackedTermsSet_Name):
-        #add_paste to tracked_word_set
-        set_name = "tracked_" + term
-        server_term.sadd(set_name, path)
-        print(term, 'addded', set_name, '->', path)
-        p.populate_set_out("New Term added", 'CurveManageTopSets')
-
-        # Send a notification only when the member is in the set
-        if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):
-
-            # create mail body
-            mail_body = ("AIL Framework,\n"
-                         "New occurrence for term: " + term + "\n"
-                         ''+full_paste_url + path)
-
-            # Send to every associated email adress
-            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
-                sendEmailNotification(email, 'Term', mail_body)
-
-            # tag paste
-            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + term):
-                msg = '{};{}'.format(tag, path)
-                p.populate_set_out(msg, 'Tags')
-
-
-def getValueOverRange(word, startDate, num_day):
-    to_return = 0
-    for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
-        value = server_term.hget(timestamp, word)
-        to_return += int(value) if value is not None else 0
-    return to_return
-
-
-
-if __name__ == "__main__":
-    publisher.port = 6380
-    publisher.channel = "Script"
-
-    config_section = 'Curve'
-    p = Process(config_section)
-
-    # REDIS #
-    r_serv1 = redis.StrictRedis(
-        host=p.config.get("ARDB_Curve", "host"),
-        port=p.config.get("ARDB_Curve", "port"),
-        db=p.config.get("ARDB_Curve", "db"),
-        decode_responses=True)
-
-    server_term = redis.StrictRedis(
-        host=p.config.get("ARDB_TermFreq", "host"),
-        port=p.config.get("ARDB_TermFreq", "port"),
-        db=p.config.get("ARDB_TermFreq", "db"),
-        decode_responses=True)
-
-    # FUNCTIONS #
-    publisher.info("Script Curve started")
-
-    # create direct link in mail
-    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
-
-    # FILE CURVE SECTION #
-    csv_path = os.path.join(os.environ['AIL_HOME'],
-                            p.config.get("Directories", "wordtrending_csv"))
-    wordfile_path = os.path.join(os.environ['AIL_HOME'],
-                                 p.config.get("Directories", "wordsfile"))
-
-    message = p.get_from_set()
-    prec_filename = None
-    generate_new_graph = False
-
-    # Term Frequency
-    top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
-    top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
-    top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
-
-    while True:
-
-        if message is not None:
-            generate_new_graph = True
-
-            filename, word, score = message.split()
-            temp = filename.split('/')
-            date = temp[-4] + temp[-3] + temp[-2]
-            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
-
curr_set = top_termFreq_setName_day[0] + str(timestamp) - - - low_word = word.lower() - #Old curve with words in file - r_serv1.hincrby(low_word, date, int(score)) - - # Update redis - #consider the num of occurence of this term - curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score))) - #1 term per paste - curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1))) - - # Add in set only if term is not in the blacklist - if low_word not in server_term.smembers(BlackListTermsSet_Name): - #consider the num of occurence of this term - server_term.zincrby(curr_set, low_word, float(score)) - #1 term per paste - server_term.zincrby("per_paste_" + curr_set, low_word, float(1)) - - #Add more info for tracked terms - check_if_tracked_term(low_word, filename) - - #send to RegexForTermsFrequency - to_send = "{} {} {}".format(filename, timestamp, word) - p.populate_set_out(to_send, 'RegexForTermsFrequency') - - else: - - if generate_new_graph: - generate_new_graph = False - print('Building graph') - today = datetime.date.today() - year = today.year - month = today.month - - lib_words.create_curve_with_word_file(r_serv1, csv_path, - wordfile_path, year, - month) - - publisher.debug("Script Curve is Idling") - print("sleeping") - time.sleep(10) - message = p.get_from_set() diff --git a/bin/CurveManageTopSets.py b/bin/CurveManageTopSets.py deleted file mode 100755 index 4eaf9c3f..00000000 --- a/bin/CurveManageTopSets.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* -""" - -This module manage top sets for terms frequency. -Every 'refresh_rate' update the weekly and monthly set - -""" - -import redis -import time -import datetime -import copy -from pubsublogger import publisher -from packages import lib_words -import datetime -import calendar -import os -import configparser - -# Config Variables -Refresh_rate = 60*5 #sec -BlackListTermsSet_Name = "BlackListSetTermSet" -TrackedTermsSet_Name = "TrackedSetTermSet" -top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set -oneDay = 60*60*24 -num_day_month = 31 -num_day_week = 7 - -top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] -top_termFreq_setName_week = ["TopTermFreq_set_week", 7] -top_termFreq_setName_month = ["TopTermFreq_set_month", 31] -top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] - - -def manage_top_set(): - startDate = datetime.datetime.now() - startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0) - startDate = calendar.timegm(startDate.timetuple()) - blacklist_size = int(server_term.scard(BlackListTermsSet_Name)) - - dico = {} - dico_per_paste = {} - - # Retreive top data (max_card + blacklist_size) from days sets - for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay): - curr_set = top_termFreq_setName_day[0] + str(timestamp) - array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size) - array_top_day_per_paste = server_term.zrevrangebyscore("per_paste_" + curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size) - - for word, value in array_top_day: - if word not in server_term.smembers(BlackListTermsSet_Name): - if word in dico.keys(): - dico[word] += value - else: - dico[word] = value - - for word, value in array_top_day_per_paste: - if word not in 
server_term.smembers(BlackListTermsSet_Name): - if word in dico_per_paste.keys(): - dico_per_paste[word] += value - else: - dico_per_paste[word] = value - - if timestamp == startDate - num_day_week*oneDay: - dico_week = copy.deepcopy(dico) - dico_week_per_paste = copy.deepcopy(dico_per_paste) - - # convert dico into sorted array - array_month = [] - for w, v in dico.items(): - array_month.append((w, v)) - array_month.sort(key=lambda tup: -tup[1]) - array_month = array_month[0:20] - - array_week = [] - for w, v in dico_week.items(): - array_week.append((w, v)) - array_week.sort(key=lambda tup: -tup[1]) - array_week = array_week[0:20] - - # convert dico_per_paste into sorted array - array_month_per_paste = [] - for w, v in dico_per_paste.items(): - array_month_per_paste.append((w, v)) - array_month_per_paste.sort(key=lambda tup: -tup[1]) - array_month_per_paste = array_month_per_paste[0:20] - - array_week_per_paste = [] - for w, v in dico_week_per_paste.items(): - array_week_per_paste.append((w, v)) - array_week_per_paste.sort(key=lambda tup: -tup[1]) - array_week_per_paste = array_week_per_paste[0:20] - - - # suppress every terms in top sets - for curr_set, curr_num_day in top_termFreq_set_array[1:3]: - for w in server_term.zrange(curr_set, 0, -1): - server_term.zrem(curr_set, w) - for w in server_term.zrange("per_paste_" + curr_set, 0, -1): - server_term.zrem("per_paste_" + curr_set, w) - - # Add top term from sorted array in their respective sorted sets - for elem in array_week: - server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0]) - for elem in array_week_per_paste: - server_term.zadd("per_paste_" + top_termFreq_setName_week[0], float(elem[1]), elem[0]) - - for elem in array_month: - server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0]) - for elem in array_month_per_paste: - server_term.zadd("per_paste_" + top_termFreq_setName_month[0], float(elem[1]), elem[0]) - - timestamp = int(time.mktime(datetime.datetime.now().timetuple())) - value = str(timestamp) + ", " + "-" - r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value) - print("refreshed module") - - - -if __name__ == '__main__': - # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) - # Port of the redis instance used by pubsublogger - publisher.port = 6380 - # Script is the default channel used for the modules. - publisher.channel = 'Script' - - configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') - if not os.path.exists(configfile): - raise Exception('Unable to find the configuration file. \ - Did you set environment variables? 
\
-                        Or activate the virtualenv.')
-
-    cfg = configparser.ConfigParser()
-    cfg.read(configfile)
-
-
-    # For Module Manager
-    r_temp = redis.StrictRedis(
-        host=cfg.get('RedisPubSub', 'host'),
-        port=cfg.getint('RedisPubSub', 'port'),
-        db=cfg.getint('RedisPubSub', 'db'),
-        decode_responses=True)
-
-    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
-    value = str(timestamp) + ", " + "-"
-    r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
-    r_temp.sadd("MODULE_TYPE_"+ "CurveManageTopSets" , str(os.getpid()))
-
-    server_term = redis.StrictRedis(
-        host=cfg.get("ARDB_TermFreq", "host"),
-        port=cfg.getint("ARDB_TermFreq", "port"),
-        db=cfg.getint("ARDB_TermFreq", "db"),
-        decode_responses=True)
-
-    publisher.info("Script Curve_manage_top_set started")
-
-    # Sent to the logging a description of the module
-    publisher.info("Manage the top sets with the data created by the module curve.")
-
-    manage_top_set()
-
-    while True:
-        # Get one message from the input queue (module only work if linked with a queue)
-        time.sleep(Refresh_rate) # sleep a long time then manage the set
-        manage_top_set()
diff --git a/bin/DbCleaner.py b/bin/DbCleaner.py
new file mode 100755
index 00000000..ed2bb752
--- /dev/null
+++ b/bin/DbCleaner.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The DbCleaner Module
+===================
+
+"""
+import os
+import sys
+import time
+import datetime
+
+from pubsublogger import publisher
+
+import NotificationHelper
+
+from packages import Date
+from packages import Item
+from packages import Term
+
+def clean_term_db_stat_token():
+    all_stat_date = Term.get_all_token_stat_history()
+
+    list_date_to_keep = Date.get_date_range(31)
+    for date in all_stat_date:
+        if date not in list_date_to_keep:
+            # remove history
+            Term.delete_token_statistics_by_date(date)
+
+    print('Term Stats Cleaned')
+
+
+if __name__ == "__main__":
+
+    publisher.port = 6380
+    publisher.channel = "Script"
+    publisher.info("DbCleaner started")
+
+    config_section = 'TermTrackerMod'
+
+    # low priority
+    time.sleep(180)
+
+    daily_cleaner = True
+    current_date = datetime.datetime.now().strftime("%Y%m%d")
+
+    while True:
+
+        if daily_cleaner:
+            clean_term_db_stat_token()
+            daily_cleaner = False
+        else:
+            sys.exit(0)
+            time.sleep(600)
+
+        new_date = datetime.datetime.now().strftime("%Y%m%d")
+        if new_date != current_date:
+            current_date = new_date
+            daily_cleaner = True
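`DbCleaner.py` above keeps roughly one month of token statistics: it compares every date recorded in `stat_token_history` against a sliding window built with the `Date.get_date_range()` helper added later in this change. A small sketch of what that helper returns (the concrete dates depend on the current day and are illustrative):

```python
from packages import Date

# today plus the previous 31 days as YYYYMMDD strings, oldest first,
# e.g. ['20190702', '20190703', ..., '20190802'] if run on 2019-08-02
list_date_to_keep = Date.get_date_range(31)

# any date present in stat_token_history but absent from this window is
# handed to Term.delete_token_statistics_by_date() by clean_term_db_stat_token()
print(len(list_date_to_keep))  # 32 entries: the current day plus the 31 previous days
```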
diff --git a/bin/Dir.py b/bin/Dir.py
deleted file mode 100755
index d76a7ad5..00000000
--- a/bin/Dir.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-
-import argparse
-import redis
-from pubsublogger import publisher
-from packages.lib_words import create_dirfile
-import configparser
-
-
-def main():
-    """Main Function"""
-
-    # CONFIG #
-    cfg = configparser.ConfigParser()
-    cfg.read('./packages/config.cfg')
-
-    parser = argparse.ArgumentParser(
-        description='''This script is a part of the Analysis Information Leak
-        framework. It create a redis list called "listfile" which contain
-        the absolute filename of all the files from the directory given in
-        the argument "directory".''',
-        epilog='Example: ./Dir.py /home/2013/03/')
-
-    parser.add_argument('directory', type=str,
-                        help='The directory to run inside', action='store')
-
-    parser.add_argument('-db', type=int, default=0,
-                        help='The name of the Redis DB (default 0)',
-                        choices=[0, 1, 2, 3, 4], action='store')
-
-    parser.add_argument('-ow', help='trigger the overwritting mode',
-                        action='store_true')
-
-    args = parser.parse_args()
-
-    r_serv = redis.StrictRedis(host=cfg.get("Redis_Queues", "host"),
-                               port=cfg.getint("Redis_Queues", "port"),
-                               db=cfg.getint("Redis_Queues", "db"),
-                               decode_responses=True)
-
-    publisher.port = 6380
-    publisher.channel = "Script"
-
-    create_dirfile(r_serv, args.directory, args.ow)
-
-if __name__ == "__main__":
-    main()
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 98645165..4d6619c8 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -153,14 +153,10 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Duplicates" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Duplicates.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Lines" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Lines.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DomClassifier.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Categ" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Categ.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Tokenize" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tokenize.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "CreditCards" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CreditCards.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"
@@ -175,13 +171,9 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Credential" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Credential.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Curve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Curve.py; read x"
+    screen -S "Script_AIL" -X screen -t "TermTrackerMod" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./TermTrackerMod.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "CurveManageTopSets" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CurveManageTopSets.py; read x"
-    sleep 0.1
-    screen -S "Script_AIL" -X screen -t "RegexForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexForTermsFrequency.py; read x"
-    sleep 0.1
-    screen -S "Script_AIL" -X screen -t "SetForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SetForTermsFrequency.py; read x"
+    screen -S "Script_AIL" -X screen -t "RegexTracker" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexTracker.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Indexer" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Indexer.py; read x"
     sleep 0.1
@@ -213,6 +205,8 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
     sleep 0.1
+    screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"
+    sleep 0.1
     screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./submit_paste.py; read x"
diff --git a/bin/Lines.py b/bin/Lines.py
deleted file mode 100755
index e4187dc7..00000000 --- a/bin/Lines.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -""" -The ZMQ_PubSub_Lines Module -============================ - -This module is consuming the Redis-list created by the ZMQ_PubSub_Line_Q -Module. - -It perform a sorting on the line's length and publish/forward them to -differents channels: - -*Channel 1 if max length(line) < max -*Channel 2 if max length(line) > max - -The collected informations about the processed pastes -(number of lines and maximum length line) are stored in Redis. - -..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put -the same Subscriber name in both of them. - -Requirements ------------- - -*Need running Redis instances. (LevelDB & Redis) -*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly. - -""" -import argparse -import time -from packages import Paste -from pubsublogger import publisher - -from Helper import Process - -if __name__ == '__main__': - publisher.port = 6380 - publisher.channel = 'Script' - - config_section = 'Lines' - p = Process(config_section) - - # SCRIPT PARSER # - parser = argparse.ArgumentParser( - description='This script is a part of the Analysis Information \ - Leak framework.') - - parser.add_argument( - '-max', type=int, default=500, - help='The limit between "short lines" and "long lines"', - action='store') - - args = parser.parse_args() - - # FUNCTIONS # - tmp_string = "Lines script Subscribed to channel {} and Start to publish \ - on channel Longlines, Shortlines" - publisher.info(tmp_string) - - while True: - try: - message = p.get_from_set() - print(message) - if message is not None: - PST = Paste.Paste(message) - else: - publisher.debug("Tokeniser is idling 10s") - time.sleep(10) - continue - - # FIXME do it in the paste class - lines_infos = PST.get_lines_info() - PST.save_attribute_redis("p_nb_lines", lines_infos[0]) - PST.save_attribute_redis("p_max_length_line", lines_infos[1]) - - # FIXME Not used. - PST.store.sadd("Pastes_Objects", PST.p_rel_path) - print(PST.p_rel_path) - if lines_infos[1] < args.max: - p.populate_set_out( PST.p_rel_path , 'LinesShort') - else: - p.populate_set_out( PST.p_rel_path , 'LinesLong') - except IOError: - print("CRC Checksum Error on : ", PST.p_rel_path) diff --git a/bin/ModuleStats.py b/bin/ModuleStats.py index 6743cdca..cfdb82f7 100755 --- a/bin/ModuleStats.py +++ b/bin/ModuleStats.py @@ -9,7 +9,6 @@ import time import datetime import redis import os -from packages import lib_words from packages.Date import Date from pubsublogger import publisher from Helper import Process diff --git a/bin/RegexForTermsFrequency.py b/bin/RegexForTermsFrequency.py deleted file mode 100755 index cd8102c1..00000000 --- a/bin/RegexForTermsFrequency.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* -""" -This Module is used for term frequency. -It processes every paste coming from the global module and test the regexs -supplied in the term webpage. 
- -""" -import redis -import time -from pubsublogger import publisher -from packages import Paste -import calendar -import re -import signal -import time -from Helper import Process -# Email notifications -from NotificationHelper import * - - -class TimeoutException(Exception): - pass - - -def timeout_handler(signum, frame): - raise TimeoutException - -signal.signal(signal.SIGALRM, timeout_handler) - -# Config Variables -DICO_REFRESH_TIME = 60 # s - -BlackListTermsSet_Name = "BlackListSetTermSet" -TrackedTermsSet_Name = "TrackedSetTermSet" -TrackedRegexSet_Name = "TrackedRegexSet" - -top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set -oneDay = 60*60*24 -top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] -top_termFreq_setName_week = ["TopTermFreq_set_week", 7] -top_termFreq_setName_month = ["TopTermFreq_set_month", 31] -top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month] - -TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_" - -# create direct link in mail -full_paste_url = "/showsavedpaste/?paste=" - - -def refresh_dicos(): - dico_regex = {} - dico_regexname_to_redis = {} - for regex_str in server_term.smembers(TrackedRegexSet_Name): - dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1]) - dico_regexname_to_redis[regex_str[1:-1]] = regex_str - - return dico_regex, dico_regexname_to_redis - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'RegexForTermsFrequency' - p = Process(config_section) - max_execution_time = p.config.getint(config_section, "max_execution_time") - - # REDIS # - server_term = redis.StrictRedis( - host=p.config.get("ARDB_TermFreq", "host"), - port=p.config.get("ARDB_TermFreq", "port"), - db=p.config.get("ARDB_TermFreq", "db"), - decode_responses=True) - - # FUNCTIONS # - publisher.info("RegexForTermsFrequency script started") - - # create direct link in mail - full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url - - # compile the regex - dico_refresh_cooldown = time.time() - dico_regex, dico_regexname_to_redis = refresh_dicos() - - message = p.get_from_set() - - # Regex Frequency - while True: - - if message is not None: - if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME: - dico_refresh_cooldown = time.time() - dico_regex, dico_regexname_to_redis = refresh_dicos() - print('dico got refreshed') - - filename = message - temp = filename.split('/') - timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) - - curr_set = top_termFreq_setName_day[0] + str(timestamp) - paste = Paste.Paste(filename) - content = paste.get_p_content() - - # iterate the word with the regex - for regex_str, compiled_regex in dico_regex.items(): - - signal.alarm(max_execution_time) - try: - matched = compiled_regex.search(content) - except TimeoutException: - print ("{0} processing timeout".format(paste.p_rel_path)) - continue - else: - signal.alarm(0) - - if matched is not None: # there is a match - print('regex matched {}'.format(regex_str)) - matched = matched.group(0) - regex_str_complete = "/" + regex_str + "/" - # Add in Regex track set only if term is not in the blacklist - if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name): - # Send a notification only when the member is in the set - if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name): - - # create mail body - mail_body = ("AIL Framework,\n" - "New occurrence 
for regex: " + regex_str + "\n" - ''+full_paste_url + filename) - - # Send to every associated email adress - for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete): - sendEmailNotification(email, 'Term', mail_body) - - # tag paste - for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete): - msg = '{};{}'.format(tag, filename) - p.populate_set_out(msg, 'Tags') - - set_name = 'regex_' + dico_regexname_to_redis[regex_str] - new_to_the_set = server_term.sadd(set_name, filename) - new_to_the_set = True if new_to_the_set == 1 else False - - # consider the num of occurence of this term - regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1))) - # 1 term per paste - if new_to_the_set: - regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1))) - server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1)) - server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1)) - else: - pass - - else: - publisher.debug("Script RegexForTermsFrequency is Idling") - print("sleeping") - time.sleep(5) - message = p.get_from_set() diff --git a/bin/RegexTracker.py b/bin/RegexTracker.py new file mode 100755 index 00000000..260db3c9 --- /dev/null +++ b/bin/RegexTracker.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* +""" +This Module is used for regex tracking. +It processes every paste coming from the global module and test the regexs +supplied in the term webpage. + +""" +import os +import re +import sys +import time +import signal + +from Helper import Process +from pubsublogger import publisher + +import NotificationHelper + +from packages import Item +from packages import Term + +full_item_url = "/showsavedpaste/?paste=" +mail_body_template = "AIL Framework,\nNew occurrence for term tracked regex: {}\nitem id: {}\nurl: {}{}" + +dict_regex_tracked = Term.get_regex_tracked_words_dict() +last_refresh = time.time() + +class TimeoutException(Exception): + pass +def timeout_handler(signum, frame): + raise TimeoutException +signal.signal(signal.SIGALRM, timeout_handler) + +def new_term_found(term, term_type, item_id, item_date): + uuid_list = Term.get_term_uuid_list(term, 'regex') + print('new tracked term found: {} in {}'.format(term, item_id)) + + for term_uuid in uuid_list: + Term.add_tracked_item(term_uuid, item_id, item_date) + + tags_to_add = Term.get_term_tags(term_uuid) + for tag in tags_to_add: + msg = '{};{}'.format(tag, item_id) + p.populate_set_out(msg, 'Tags') + + mail_to_notify = Term.get_term_mails(term_uuid) + if mail_to_notify: + mail_body = mail_body_template.format(term, item_id, full_item_url, item_id) + for mail in mail_to_notify: + NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body) + +if __name__ == "__main__": + publisher.port = 6380 + publisher.channel = "Script" + publisher.info("Script RegexTracker started") + + config_section = 'RegexTracker' + p = Process(config_section) + max_execution_time = p.config.getint(config_section, "max_execution_time") + + ull_item_url = p.config.get("Notifications", "ail_domain") + full_item_url + + # Regex Frequency + while True: + + item_id = p.get_from_set() + + if item_id is not None: + + item_date = Item.get_item_date(item_id) + item_content = Item.get_item_content(item_id) + + for regex in dict_regex_tracked: + + signal.alarm(max_execution_time) + try: + matched = 
dict_regex_tracked[regex].search(item_content) + except TimeoutException: + print ("{0} processing timeout".format(paste.p_rel_path)) + continue + else: + signal.alarm(0) + + if matched: + new_term_found(regex, 'regex', item_id, item_date) + + + else: + time.sleep(5) + + # refresh Tracked term + if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'): + dict_regex_tracked = Term.get_regex_tracked_words_dict() + last_refresh = time.time() + print('Tracked set refreshed') diff --git a/bin/SetForTermsFrequency.py b/bin/SetForTermsFrequency.py deleted file mode 100755 index 19ed7210..00000000 --- a/bin/SetForTermsFrequency.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* -""" -This Module is used for term frequency. -It processes every paste coming from the global module and test the sets -supplied in the term webpage. - -""" -import redis -import time -from pubsublogger import publisher -from packages import lib_words -from packages import Paste -import os -import datetime -import calendar -import re -import ast -from Helper import Process - -# Email notifications -from NotificationHelper import * - -# Config Variables -BlackListTermsSet_Name = "BlackListSetTermSet" -TrackedTermsSet_Name = "TrackedSetTermSet" -TrackedRegexSet_Name = "TrackedRegexSet" -TrackedSetSet_Name = "TrackedSetSet" - -top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set -oneDay = 60*60*24 -top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] -top_termFreq_setName_week = ["TopTermFreq_set_week", 7] -top_termFreq_setName_month = ["TopTermFreq_set_month", 31] -top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] - -TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_" - -# create direct link in mail -full_paste_url = "/showsavedpaste/?paste=" - -def add_quote_inside_tab(tab): - quoted_tab = "[" - for elem in tab[1:-1].split(','): - elem = elem.lstrip().strip() - quoted_tab += "\'{}\', ".format(elem) - quoted_tab = quoted_tab[:-2] #remove trailing , - quoted_tab += "]" - return str(quoted_tab) - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'SetForTermsFrequency' - p = Process(config_section) - - # REDIS # - server_term = redis.StrictRedis( - host=p.config.get("ARDB_TermFreq", "host"), - port=p.config.get("ARDB_TermFreq", "port"), - db=p.config.get("ARDB_TermFreq", "db"), - decode_responses=True) - - # FUNCTIONS # - publisher.info("RegexForTermsFrequency script started") - - # create direct link in mail - full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url - - #get the dico and matching percent - dico_percent = {} - dico_set_tab = {} - dico_setname_to_redis = {} - for set_str in server_term.smembers(TrackedSetSet_Name): - tab_set = set_str[1:-1] - tab_set = add_quote_inside_tab(tab_set) - perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set) - if perc_finder is not None: - match_percent = perc_finder.group(0)[1:-1] - dico_percent[tab_set] = float(match_percent) - dico_set_tab[tab_set] = ast.literal_eval(tab_set) - dico_setname_to_redis[tab_set] = set_str - else: - continue - - message = p.get_from_set() - - while True: - - if message is not None: - filename = message - temp = filename.split('/') - timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) - content = Paste.Paste(filename).get_p_content() - - curr_set = top_termFreq_setName_day[0] + str(timestamp) - - 
#iterate over the words of the file - match_dico = {} - for word in content.split(): - for cur_set, array_set in dico_set_tab.items(): - for w_set in array_set[:-1]: #avoid the percent matching - if word == w_set: - try: - match_dico[str(array_set)] += 1 - except KeyError: - match_dico[str(array_set)] = 1 - - #compute matching % - for the_set, matchingNum in match_dico.items(): - eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching - if eff_percent >= dico_percent[the_set]: - # Send a notification only when the member is in the set - if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name): - - # create mail body - mail_body = ("AIL Framework,\n" - "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n" - ''+full_paste_url + filename) - - # Send to every associated email adress - for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]): - sendEmailNotification(email, 'Term', mail_body) - - # tag paste - for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]): - msg = '{};{}'.format(tag, filename) - p.populate_set_out(msg, 'Tags') - - print(the_set, "matched in", filename) - set_name = 'set_' + dico_setname_to_redis[the_set] - new_to_the_set = server_term.sadd(set_name, filename) - new_to_the_set = True if new_to_the_set == 1 else False - - #consider the num of occurence of this set - set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1))) - - # FIXME - avoid using per paste as a set is checked over the entire paste - #1 term per paste - if new_to_the_set: - set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1))) - server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1)) - server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1)) - - - else: - publisher.debug("Script RegexForTermsFrequency is Idling") - print("sleeping") - time.sleep(5) - message = p.get_from_set() diff --git a/bin/TermTrackerMod.py b/bin/TermTrackerMod.py index 08eb7247..fca0439f 100755 --- a/bin/TermTrackerMod.py +++ b/bin/TermTrackerMod.py @@ -8,13 +8,14 @@ The TermTracker Module import os import sys import time +import signal from Helper import Process from pubsublogger import publisher import NotificationHelper -from packages import Paste +from packages import Item from packages import Term sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) @@ -26,13 +27,22 @@ mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\n # loads tracked words list_tracked_words = Term.get_tracked_words_list() +last_refresh_word = time.time() set_tracked_words_list = Term.get_set_tracked_words_list() +last_refresh_set = time.time() -def new_term_found(term, term_type, item_id): - uuid_list = Term.get_term_uuid_list(term) +class TimeoutException(Exception): + pass +def timeout_handler(signum, frame): + raise TimeoutException +signal.signal(signal.SIGALRM, timeout_handler) + +def new_term_found(term, term_type, item_id, item_date): + uuid_list = Term.get_term_uuid_list(term, term_type) + print('new tracked term found: {} in {}'.format(term, item_id)) for term_uuid in uuid_list: - Term.add_tracked_item(term_uuid, item_id) + Term.add_tracked_item(term_uuid, item_id, item_date) tags_to_add = Term.get_term_tags(term_uuid) for tag in 
tags_to_add: @@ -52,28 +62,38 @@ if __name__ == "__main__": publisher.channel = "Script" publisher.info("Script TermTrackerMod started") - #config_section = 'TermTrackerMod' config_section = 'TermTrackerMod' p = Process(config_section) + max_execution_time = p.config.getint(config_section, "max_execution_time") full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url while True: item_id = p.get_from_set() - item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz' - #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz' - if message is not None: + if item_id is not None: - paste = Paste.Paste(item_id) + item_date = Item.get_item_date(item_id) + item_content = Item.get_item_content(item_id) - dict_words_freq = Term.get_text_word_frequency(paste.get_p_content()) + signal.alarm(max_execution_time) + try: + dict_words_freq = Term.get_text_word_frequency(item_content) + except TimeoutException: + print ("{0} processing timeout".format(paste.p_rel_path)) + continue + else: + signal.alarm(0) + + # create token statistics + for word in dict_words_freq: + Term.create_token_statistics(item_date, word, dict_words_freq[word]) # check solo words for word in list_tracked_words: if word in dict_words_freq: - new_term_found(word, 'word', item_id) + new_term_found(word, 'word', item_id, item_date) # check words set for elem in set_tracked_words_list: @@ -86,7 +106,19 @@ if __name__ == "__main__": if word in dict_words_freq: nb_uniq_word += 1 if nb_uniq_word >= nb_words_threshold: - new_term_found(word_set, 'set', item_id) + new_term_found(word_set, 'set', item_id, item_date) else: time.sleep(5) + + + # refresh Tracked term + if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'): + list_tracked_words = Term.get_tracked_words_list() + last_refresh_word = time.time() + print('Tracked word refreshed') + + if last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'): + set_tracked_words_list = Term.get_set_tracked_words_list() + last_refresh_set = time.time() + print('Tracked set refreshed') diff --git a/bin/Tokenize.py b/bin/Tokenize.py deleted file mode 100755 index 4e13b9ff..00000000 --- a/bin/Tokenize.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* -""" -The Tokenize Module -=================== - -This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q -Module. - -It tokenize the content of the paste and publish the result in the following -format: - channel_name+' '+/path/of/the/paste.gz+' '+tokenized_word+' '+scoring - - ..seealso:: Paste method (_get_top_words) - -..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put -the same Subscriber name in both of them. - -Requirements ------------- - -*Need running Redis instances. (Redis) -*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly. 
- -""" -import time -from packages import Paste -from pubsublogger import publisher - -from Helper import Process -import signal - -class TimeoutException(Exception): - pass - -def timeout_handler(signum, frame): - raise TimeoutException - -signal.signal(signal.SIGALRM, timeout_handler) - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'Tokenize' - p = Process(config_section) - - # LOGGING # - publisher.info("Tokeniser started") - - while True: - message = p.get_from_set() - print(message) - if message is not None: - paste = Paste.Paste(message) - signal.alarm(5) - try: - for word, score in paste._get_top_words().items(): - if len(word) >= 4: - msg = '{} {} {}'.format(paste.p_rel_path, word, score) - p.populate_set_out(msg) - except TimeoutException: - p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(paste.p_rel_path)) - continue - else: - signal.alarm(0) - else: - publisher.debug("Tokeniser is idling 10s") - time.sleep(10) - print("Sleeping") diff --git a/bin/packages/Date.py b/bin/packages/Date.py index 85edb0be..ccf59c54 100644 --- a/bin/packages/Date.py +++ b/bin/packages/Date.py @@ -1,5 +1,7 @@ #!/usr/bin/python3 +import datetime + class Date(object): """docstring for Date""" def __init__(self, *args): @@ -34,7 +36,6 @@ class Date(object): self.day = day def substract_day(self, numDay): - import datetime computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay) comp_year = str(computed_date.year) comp_month = str(computed_date.month).zfill(2) @@ -50,3 +51,12 @@ def date_substract_day(date, num_day=1): new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) - datetime.timedelta(num_day) new_date = str(new_date).replace('-', '') return new_date + +def get_date_range(num_day): + curr_date = datetime.date.today() + date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2)) + date_list = [] + + for i in range(0, num_day+1): + date_list.append(date.substract_day(i)) + return list(reversed(date_list)) diff --git a/bin/packages/Item.py b/bin/packages/Item.py index 2c10cb85..4dcdde85 100755 --- a/bin/packages/Item.py +++ b/bin/packages/Item.py @@ -2,10 +2,13 @@ # -*-coding:UTF-8 -* import os +import sys import gzip import redis +sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) import Flask_config +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) import Date import Tag diff --git a/bin/packages/Term.py b/bin/packages/Term.py index 312ed7bd..2f45c677 100755 --- a/bin/packages/Term.py +++ b/bin/packages/Term.py @@ -4,6 +4,7 @@ import os import re import sys +import time import uuid import redis import datetime @@ -72,14 +73,30 @@ def get_set_tracked_words_list(): all_set_list.append((ter_set, num_words, elem)) return all_set_list -def is_term_tracked_in_global_level(term): - res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)) +def get_regex_tracked_words_dict(): + regex_list = r_serv_term.smembers('all:tracked_term:regex') + dict_tracked_regex = {} + for regex in regex_list: + dict_tracked_regex[regex] = re.compile(regex) + return dict_tracked_regex + +def is_term_tracked_in_global_level(term, term_type): + res = r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)) if res: for elem_uuid in res: if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1': return True return False +def is_term_tracked_in_user_level(term, term_type, 
user_id): + res = r_serv_term.smembers('user:tracked_term:{}'.format(user_id)) + if res: + for elem_uuid in res: + if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'tracked')== term: + if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'type')== term_type: + return True + return False + def parse_json_term_to_add(dict_input, user_id): term = dict_input.get('term', None) if not term: @@ -112,7 +129,10 @@ def parse_json_term_to_add(dict_input, user_id): # check if term already tracked in global if level==1: - if is_term_tracked_in_global_level(term): + if is_term_tracked_in_global_level(term, term_type): + return ({"status": "error", "reason": "Term already tracked"}, 409) + else: + if is_term_tracked_in_user_level(term, term_type, user_id): return ({"status": "error", "reason": "Term already tracked"}, 409) term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails) @@ -174,7 +194,7 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0) r_serv_term.sadd('all:tracked_term:{}'.format(term_type), term) # create term - uuid map - r_serv_term.sadd('all:tracked_term_uuid:{}'.format(term), term_uuid) + r_serv_term.sadd('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid) # add display level set if level == 0: # user only @@ -190,15 +210,22 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0) for mail in mails: r_serv_term.sadd('tracked_term:mail:{}'.format(term_uuid), mail) + # toggle refresh module tracker list/set + r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time()) + return term_uuid def delete_term(term_uuid): term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked') term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type') term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level') - r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid) - r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid) + r_serv_term.srem('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid) + # Term not tracked by other users + if not r_serv_term.exists('all:tracked_term_uuid:{}:{}'.format(term_type, term)): + r_serv_term.srem('all:tracked_term:{}'.format(term_type), term) + # toggle refresh module tracker list/set + r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time()) if level == 0: # user only user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id') @@ -218,8 +245,8 @@ def delete_term(term_uuid): # remove item set r_serv_term.delete('tracked_term:item:{}'.format(term_uuid)) -def get_term_uuid_list(term): - return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))) +def get_term_uuid_list(term, term_type): + return list(r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))) def get_term_tags(term_uuid): return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid))) @@ -227,10 +254,30 @@ def get_term_tags(term_uuid): def get_term_mails(term_uuid): return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid))) -def add_tracked_item(term_uuid, item_id): - r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id) +def add_tracked_item(term_uuid, item_id, item_date): + # track item + r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id) + # track nb item by date + r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1) +def 
create_token_statistics(item_date, word, nb): + r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1) + r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb) + r_serv_term.sadd('stat_token_history', item_date) +def delete_token_statistics_by_date(item_date): + r_serv_term.delete('stat_token_per_item_by_day:{}'.format(item_date)) + r_serv_term.delete('stat_token_total_by_day:{}'.format(item_date)) + r_serv_term.srem('stat_token_history', item_date) + +def get_all_token_stat_history(): + return r_serv_term.smembers('stat_token_history') + +def get_tracked_term_last_updated_by_type(term_type): + epoch_update = r_serv_term.get('tracked_term:refresh:{}'.format(term_type)) + if not epoch_update: + epoch_update = 0 + return float(epoch_update) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 09e05ddf..52388ed5 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -107,7 +107,10 @@ operation_mode = 3 ttl_duplicate = 86400 default_unnamed_feed_name = unnamed_feeder -[RegexForTermsFrequency] +[TermTrackerMod] +max_execution_time = 120 + +[RegexTracker] max_execution_time = 60 ##### Redis ##### diff --git a/bin/packages/lib_words.py b/bin/packages/lib_words.py index 54581403..e44a922c 100644 --- a/bin/packages/lib_words.py +++ b/bin/packages/lib_words.py @@ -11,62 +11,10 @@ from dateutil.rrule import rrule, DAILY import csv -def listdirectory(path): - """Path Traversing Function. - - :param path: -- The absolute pathname to a directory. - - This function is returning all the absolute path of the files contained in - the argument directory. - - """ - fichier = [] - for root, dirs, files in os.walk(path): - - for i in files: - - fichier.append(os.path.join(root, i)) - - return fichier - clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty)) """It filters out non-printable characters from the string it receives.""" -def create_dirfile(r_serv, directory, overwrite): - """Create a file of path. - - :param r_serv: -- connexion to redis database - :param directory: -- The folder where to launch the listing of the .gz files - - This function create a list in redis with inside the absolute path - of all the pastes needed to be proceeded by function using parallel - (like redis_words_ranking) - - """ - if overwrite: - r_serv.delete("filelist") - - for x in listdirectory(directory): - r_serv.lpush("filelist", x) - - publisher.info("The list was overwritten") - - else: - if r_serv.llen("filelist") == 0: - - for x in listdirectory(directory): - r_serv.lpush("filelist", x) - - publisher.info("New list created") - else: - - for x in listdirectory(directory): - r_serv.lpush("filelist", x) - - publisher.info("The list was updated with new elements") - - def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month): """Create a csv file used with dygraph. 
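The `tracked_term:refresh:<type>` epoch written by `add_tracked_term()` and `delete_term()` in `Term.py` above is what lets the long-running tracker modules pick up new or deleted trackers without a restart. The polling pattern used by `TermTrackerMod.py` and `RegexTracker.py` reduces to the following sketch (the per-item work is elided; the sleep stands in for it):

```python
import time
from packages import Term

list_tracked_words = Term.get_tracked_words_list()
last_refresh_word = time.time()

while True:
    # ... pop an item from the queue and match it against list_tracked_words ...
    time.sleep(5)

    # reload the in-memory list only if a 'word' tracker was added or deleted
    # since the last reload (epoch stored under tracked_term:refresh:word)
    if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
        list_tracked_words = Term.get_tracked_words_list()
        last_refresh_word = time.time()
```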
diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg
index 4526d978..7c8e3138 100644
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@@ -19,36 +19,17 @@ subscribe = Redis_Global
 [Attributes]
 subscribe = Redis_Global
 
-[Lines]
-subscribe = Redis_Global
-publish = Redis_LinesShort,Redis_LinesLong
-
 [DomClassifier]
 subscribe = Redis_Global
 
-[Tokenize]
-subscribe = Redis_LinesShort
-publish = Redis_Words
-
-[Curve]
-subscribe = Redis_Words
-publish = Redis_CurveManageTopSets,Redis_Tags
-
 [TermTrackerMod]
 subscribe = Redis_Global
 publish = Redis_Tags
 
-[RegexForTermsFrequency]
+[RegexTracker]
 subscribe = Redis_Global
 publish = Redis_Tags
 
-[SetForTermsFrequency]
-subscribe = Redis_Global
-publish = Redis_Tags
-
-[CurveManageTopSets]
-subscribe = Redis_CurveManageTopSets
-
 [Categ]
 subscribe = Redis_Global
 publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey
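With the queue wiring above, both new tracker modules subscribe to Redis_Global and publish tag messages to Redis_Tags. The consume/publish cycle they share through `Helper.Process` looks roughly like this (a minimal sketch; the tag value is a placeholder, since real tags come from each tracked term's configuration):

```python
import time

from Helper import Process
from pubsublogger import publisher

publisher.port = 6380
publisher.channel = 'Script'

p = Process('TermTrackerMod')          # section name from modules.cfg / config.cfg

while True:
    item_id = p.get_from_set()         # one item id from the Redis_Global queue
    if item_id is None:
        time.sleep(5)
        continue

    # ... match tracked words / sets / regexes against the item content ...

    for tag in ['example-tag']:        # placeholder: tags come from tracked_term:tags:<uuid>
        p.populate_set_out('{};{}'.format(tag, item_id), 'Tags')
```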