chg: [Term Tracker] refactor term tracker word/set/regex modules + remove old modules

Terrtia 2019-08-09 14:20:13 +02:00
parent d9bdfecef3
commit 1008c7c4fe
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
19 changed files with 304 additions and 981 deletions

View file

@ -138,12 +138,12 @@ Redis and ARDB overview
| Set - Key | Value |
| ------ | ------ |
| all:tracked_term_uuid:**tracked term** | **uuid - tracked term uuid** |
| all:tracked_term_uuid:**term type**:**tracked term** | **uuid - tracked term uuid** |
##### All Term Tracked items:
| Set - Key | Value |
| ------ | ------ |
| tracked_term:item:**uuid** | **item_id** |
| tracked_term:item:**uuid**:**date** | **item_id** |
##### All Term Tracked tags:
| Set - Key | Value |
@ -155,6 +155,29 @@ Redis and ARDB overview
| ------ | ------ |
| tracked_term:mail:**uuid** | **mail** |
##### Refresh Tracked term:
| Key | Value |
| ------ | ------ |
| tracked_term:refresh:word | **last refreshed epoch** |
| tracked_term:refresh:set | - |
| tracked_term:refresh:regex | - |
##### Zset Stat Tracked term:
| Key | Field | Value |
| ------ | ------ | ------ |
| tracked_term:stat:**uuid** | **date** | **nb_seen** |
##### Stat token:
| Key | Field | Value |
| ------ | ------ | ------ |
| stat_token_total_by_day:**date** | **word** | **nb_seen** |
| | | |
| stat_token_per_item_by_day:**date** | **word** | **nb_seen** |
| Set - Key | Value |
| ------ | ------ |
| stat_token_history | **date** |
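The sketch below exercises this new key layout end to end, mirroring what the tracker modules and DbCleaner do with it. It is illustrative only: the connection parameters are placeholders (the real modules build the connection from the AIL config), and it assumes the redis-py 2.x `zincrby(key, member, amount)` argument order used elsewhere in this commit.
```python
import redis

# Placeholder connection; the real modules read host/port/db from the AIL config.
r_serv_term = redis.StrictRedis(host='localhost', port=6382, db=2, decode_responses=True)

def record_tracked_term_hit(term_uuid, item_id, item_date):
    # item ids matched by the tracked term, stored per day
    r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
    # number of items seen per day (zset field = date, value = nb_seen)
    r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)

def record_token_statistics(item_date, word, nb_seen):
    # per-item and total counters, plus the history set pruned by DbCleaner
    r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1)
    r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb_seen)
    r_serv_term.sadd('stat_token_history', item_date)
```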
## DB2 - TermFreq:
##### Set:
@ -167,16 +190,6 @@ Redis and ARDB overview
| TrackedRegexSet | **tracked_regex** |
| | |
| | |
| global:TrackedSetTermSet | **tracked_term** |
| global:TrackedSetSet | **tracked_set** |
| global:TrackedRegexSet | **tracked_regex** |
| | |
| | |
| user:**user_id**:TrackedSetTermSet | **tracked_term** |
| user:**user_id**:TrackedSetSet | **tracked_set** |
| user:**user_id**:TrackedRegexSet | **tracked_regex** |
| | |
| | |
| tracked_**tracked_term** | **item_path** |
| set_**tracked_set** | **item_path** |
| regex_**tracked_regex** | **item_path** |

View file

@ -1,184 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.
This module updates a .csv file used to draw curves representing selected
words and their occurrences per day.
..note:: The channel will have the name of the file created.
..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.
This Module is also used for term frequency.
/!\ Top set management is done in the module Curve_manage_top_set
Requirements
------------
*Need running Redis instances. (Redis)
*Categories files of words in /files/ need to be created
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar
from Helper import Process
# Email notifications
from NotificationHelper import *
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the term frequency sets
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"
# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="
def check_if_tracked_term(term, path):
if term in server_term.smembers(TrackedTermsSet_Name):
#add_paste to tracked_word_set
set_name = "tracked_" + term
server_term.sadd(set_name, path)
print(term, 'added', set_name, '->', path)
p.populate_set_out("New Term added", 'CurveManageTopSets')
# Send a notification only when the member is in the set
if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):
# create mail body
mail_body = ("AIL Framework,\n"
"New occurrence for term: " + term + "\n"
''+full_paste_url + path)
# Send to every associated email address
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
sendEmailNotification(email, 'Term', mail_body)
# tag paste
for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + term):
msg = '{};{}'.format(tag, path)
p.populate_set_out(msg, 'Tags')
def getValueOverRange(word, startDate, num_day):
to_return = 0
for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
value = server_term.hget(timestamp, word)
to_return += int(value) if value is not None else 0
return to_return
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Curve'
p = Process(config_section)
# REDIS #
r_serv1 = redis.StrictRedis(
host=p.config.get("ARDB_Curve", "host"),
port=p.config.get("ARDB_Curve", "port"),
db=p.config.get("ARDB_Curve", "db"),
decode_responses=True)
server_term = redis.StrictRedis(
host=p.config.get("ARDB_TermFreq", "host"),
port=p.config.get("ARDB_TermFreq", "port"),
db=p.config.get("ARDB_TermFreq", "db"),
decode_responses=True)
# FUNCTIONS #
publisher.info("Script Curve started")
# create direct link in mail
full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
# FILE CURVE SECTION #
csv_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "wordtrending_csv"))
wordfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "wordsfile"))
message = p.get_from_set()
prec_filename = None
generate_new_graph = False
# Term Frequency
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
while True:
if message is not None:
generate_new_graph = True
filename, word, score = message.split()
temp = filename.split('/')
date = temp[-4] + temp[-3] + temp[-2]
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp)
low_word = word.lower()
#Old curve with words in file
r_serv1.hincrby(low_word, date, int(score))
# Update redis
# consider the number of occurrences of this term
curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score)))
#1 term per paste
curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1)))
# Add in set only if term is not in the blacklist
if low_word not in server_term.smembers(BlackListTermsSet_Name):
# consider the number of occurrences of this term
server_term.zincrby(curr_set, low_word, float(score))
#1 term per paste
server_term.zincrby("per_paste_" + curr_set, low_word, float(1))
#Add more info for tracked terms
check_if_tracked_term(low_word, filename)
#send to RegexForTermsFrequency
to_send = "{} {} {}".format(filename, timestamp, word)
p.populate_set_out(to_send, 'RegexForTermsFrequency')
else:
if generate_new_graph:
generate_new_graph = False
print('Building graph')
today = datetime.date.today()
year = today.year
month = today.month
lib_words.create_curve_with_word_file(r_serv1, csv_path,
wordfile_path, year,
month)
publisher.debug("Script Curve is Idling")
print("sleeping")
time.sleep(10)
message = p.get_from_set()

View file

@ -1,166 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module manages the top sets for term frequency.
Every 'refresh_rate' seconds it updates the weekly and monthly sets.
"""
import redis
import time
import datetime
import copy
from pubsublogger import publisher
from packages import lib_words
import datetime
import calendar
import os
import configparser
# Config Variables
Refresh_rate = 60*5 #sec
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the term frequency sets
oneDay = 60*60*24
num_day_month = 31
num_day_week = 7
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def manage_top_set():
startDate = datetime.datetime.now()
startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0)
startDate = calendar.timegm(startDate.timetuple())
blacklist_size = int(server_term.scard(BlackListTermsSet_Name))
dico = {}
dico_per_paste = {}
# Retrieve top data (max_card + blacklist_size) from the daily sets
for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay):
curr_set = top_termFreq_setName_day[0] + str(timestamp)
array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)
array_top_day_per_paste = server_term.zrevrangebyscore("per_paste_" + curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)
for word, value in array_top_day:
if word not in server_term.smembers(BlackListTermsSet_Name):
if word in dico.keys():
dico[word] += value
else:
dico[word] = value
for word, value in array_top_day_per_paste:
if word not in server_term.smembers(BlackListTermsSet_Name):
if word in dico_per_paste.keys():
dico_per_paste[word] += value
else:
dico_per_paste[word] = value
if timestamp == startDate - num_day_week*oneDay:
dico_week = copy.deepcopy(dico)
dico_week_per_paste = copy.deepcopy(dico_per_paste)
# convert dico into sorted array
array_month = []
for w, v in dico.items():
array_month.append((w, v))
array_month.sort(key=lambda tup: -tup[1])
array_month = array_month[0:20]
array_week = []
for w, v in dico_week.items():
array_week.append((w, v))
array_week.sort(key=lambda tup: -tup[1])
array_week = array_week[0:20]
# convert dico_per_paste into sorted array
array_month_per_paste = []
for w, v in dico_per_paste.items():
array_month_per_paste.append((w, v))
array_month_per_paste.sort(key=lambda tup: -tup[1])
array_month_per_paste = array_month_per_paste[0:20]
array_week_per_paste = []
for w, v in dico_week_per_paste.items():
array_week_per_paste.append((w, v))
array_week_per_paste.sort(key=lambda tup: -tup[1])
array_week_per_paste = array_week_per_paste[0:20]
# remove every term from the top sets
for curr_set, curr_num_day in top_termFreq_set_array[1:3]:
for w in server_term.zrange(curr_set, 0, -1):
server_term.zrem(curr_set, w)
for w in server_term.zrange("per_paste_" + curr_set, 0, -1):
server_term.zrem("per_paste_" + curr_set, w)
# Add top term from sorted array in their respective sorted sets
for elem in array_week:
server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0])
for elem in array_week_per_paste:
server_term.zadd("per_paste_" + top_termFreq_setName_week[0], float(elem[1]), elem[0])
for elem in array_month:
server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0])
for elem in array_month_per_paste:
server_term.zadd("per_paste_" + top_termFreq_setName_month[0], float(elem[1]), elem[0])
timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
value = str(timestamp) + ", " + "-"
r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
print("refreshed module")
if __name__ == '__main__':
# If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
# Port of the redis instance used by pubsublogger
publisher.port = 6380
# Script is the default channel used for the modules.
publisher.channel = 'Script'
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
raise Exception('Unable to find the configuration file. \
Did you set environment variables? \
Or activate the virtualenv.')
cfg = configparser.ConfigParser()
cfg.read(configfile)
# For Module Manager
r_temp = redis.StrictRedis(
host=cfg.get('RedisPubSub', 'host'),
port=cfg.getint('RedisPubSub', 'port'),
db=cfg.getint('RedisPubSub', 'db'),
decode_responses=True)
timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
value = str(timestamp) + ", " + "-"
r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
r_temp.sadd("MODULE_TYPE_"+ "CurveManageTopSets" , str(os.getpid()))
server_term = redis.StrictRedis(
host=cfg.get("ARDB_TermFreq", "host"),
port=cfg.getint("ARDB_TermFreq", "port"),
db=cfg.getint("ARDB_TermFreq", "db"),
decode_responses=True)
publisher.info("Script Curve_manage_top_set started")
# Send a description of the module to the logger
publisher.info("Manage the top sets with the data created by the module curve.")
manage_top_set()
while True:
# Get one message from the input queue (module only work if linked with a queue)
time.sleep(Refresh_rate) # sleep a long time then manage the set
manage_top_set()

bin/DbCleaner.py Executable file (59 lines added)
View file

@ -0,0 +1,59 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The DbCleaner Module
====================
"""
import os
import sys
import time
import datetime
from pubsublogger import publisher
import NotificationHelper
from packages import Date
from packages import Item
from packages import Term
def clean_term_db_stat_token():
all_stat_date = Term.get_all_token_stat_history()
list_date_to_keep = Date.get_date_range(31)
for date in all_stat_date:
if date not in list_date_to_keep:
# remove history
Term.delete_token_statistics_by_date(date)
print('Term Stats Cleaned')
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
publisher.info("DbCleaner started")
config_section = 'TermTrackerMod'
# low priority
time.sleep(180)
daily_cleaner = True
current_date = datetime.datetime.now().strftime("%Y%m%d")
while True:
if daily_cleaner:
clean_term_db_stat_token()
daily_cleaner = False
else:
sys.exit(0)
time.sleep(600)
new_date = datetime.datetime.now().strftime("%Y%m%d")
if new_date != current_date:
current_date = new_date
daily_cleaner = True

View file

@ -1,48 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import argparse
import redis
from pubsublogger import publisher
from packages.lib_words import create_dirfile
import configparser
def main():
"""Main Function"""
# CONFIG #
cfg = configparser.ConfigParser()
cfg.read('./packages/config.cfg')
parser = argparse.ArgumentParser(
description='''This script is a part of the Analysis Information Leak
framework. It creates a redis list called "listfile" which contains
the absolute filenames of all the files from the directory given in
the argument "directory".''',
epilog='Example: ./Dir.py /home/2013/03/')
parser.add_argument('directory', type=str,
help='The directory to run inside', action='store')
parser.add_argument('-db', type=int, default=0,
help='The name of the Redis DB (default 0)',
choices=[0, 1, 2, 3, 4], action='store')
parser.add_argument('-ow', help='trigger the overwriting mode',
action='store_true')
args = parser.parse_args()
r_serv = redis.StrictRedis(host=cfg.get("Redis_Queues", "host"),
port=cfg.getint("Redis_Queues", "port"),
db=cfg.getint("Redis_Queues", "db"),
decode_responses=True)
publisher.port = 6380
publisher.channel = "Script"
create_dirfile(r_serv, args.directory, args.ow)
if __name__ == "__main__":
main()

View file

@ -153,14 +153,10 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Duplicates" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Duplicates.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Lines" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Lines.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DomClassifier.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Categ" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Categ.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Tokenize" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tokenize.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "CreditCards" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CreditCards.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"
@ -175,13 +171,9 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Credential" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Credential.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Curve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Curve.py; read x"
screen -S "Script_AIL" -X screen -t "TermTrackerMod" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./TermTrackerMod.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "CurveManageTopSets" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CurveManageTopSets.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "RegexForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexForTermsFrequency.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "SetForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SetForTermsFrequency.py; read x"
screen -S "Script_AIL" -X screen -t "RegexTracker" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexTracker.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Indexer" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Indexer.py; read x"
sleep 0.1
@ -213,6 +205,8 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./submit_paste.py; read x"

View file

@ -1,85 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The ZMQ_PubSub_Lines Module
============================
This module is consuming the Redis-list created by the ZMQ_PubSub_Line_Q
Module.
It performs a sorting on line length and publishes/forwards them to
different channels:
*Channel 1 if max length(line) < max
*Channel 2 if max length(line) > max
The collected information about the processed pastes
(number of lines and maximum line length) is stored in Redis.
..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.
Requirements
------------
*Need running Redis instances. (LevelDB & Redis)
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.
"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher
from Helper import Process
if __name__ == '__main__':
publisher.port = 6380
publisher.channel = 'Script'
config_section = 'Lines'
p = Process(config_section)
# SCRIPT PARSER #
parser = argparse.ArgumentParser(
description='This script is a part of the Analysis Information \
Leak framework.')
parser.add_argument(
'-max', type=int, default=500,
help='The limit between "short lines" and "long lines"',
action='store')
args = parser.parse_args()
# FUNCTIONS #
tmp_string = "Lines script Subscribed to channel {} and Start to publish \
on channel Longlines, Shortlines"
publisher.info(tmp_string)
while True:
try:
message = p.get_from_set()
print(message)
if message is not None:
PST = Paste.Paste(message)
else:
publisher.debug("Tokeniser is idling 10s")
time.sleep(10)
continue
# FIXME do it in the paste class
lines_infos = PST.get_lines_info()
PST.save_attribute_redis("p_nb_lines", lines_infos[0])
PST.save_attribute_redis("p_max_length_line", lines_infos[1])
# FIXME Not used.
PST.store.sadd("Pastes_Objects", PST.p_rel_path)
print(PST.p_rel_path)
if lines_infos[1] < args.max:
p.populate_set_out( PST.p_rel_path , 'LinesShort')
else:
p.populate_set_out( PST.p_rel_path , 'LinesLong')
except IOError:
print("CRC Checksum Error on : ", PST.p_rel_path)

View file

@ -9,7 +9,6 @@ import time
import datetime
import redis
import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process

View file

@ -1,157 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.
"""
import redis
import time
from pubsublogger import publisher
from packages import Paste
import calendar
import re
import signal
import time
from Helper import Process
# Email notifications
from NotificationHelper import *
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
# Config Variables
DICO_REFRESH_TIME = 60 # s
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the term frequency sets
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]
TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"
# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="
def refresh_dicos():
dico_regex = {}
dico_regexname_to_redis = {}
for regex_str in server_term.smembers(TrackedRegexSet_Name):
dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
dico_regexname_to_redis[regex_str[1:-1]] = regex_str
return dico_regex, dico_regexname_to_redis
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'RegexForTermsFrequency'
p = Process(config_section)
max_execution_time = p.config.getint(config_section, "max_execution_time")
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("ARDB_TermFreq", "host"),
port=p.config.get("ARDB_TermFreq", "port"),
db=p.config.get("ARDB_TermFreq", "db"),
decode_responses=True)
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
# create direct link in mail
full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
# compile the regex
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
message = p.get_from_set()
# Regex Frequency
while True:
if message is not None:
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
print('dico got refreshed')
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp)
paste = Paste.Paste(filename)
content = paste.get_p_content()
# iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items():
signal.alarm(max_execution_time)
try:
matched = compiled_regex.search(content)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_rel_path))
continue
else:
signal.alarm(0)
if matched is not None: # there is a match
print('regex matched {}'.format(regex_str))
matched = matched.group(0)
regex_str_complete = "/" + regex_str + "/"
# Add in Regex track set only if term is not in the blacklist
if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name):
# Send a notification only when the member is in the set
if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name):
# create mail body
mail_body = ("AIL Framework,\n"
"New occurrence for regex: " + regex_str + "\n"
''+full_paste_url + filename)
# Send to every associated email address
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
sendEmailNotification(email, 'Term', mail_body)
# tag paste
for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete):
msg = '{};{}'.format(tag, filename)
p.populate_set_out(msg, 'Tags')
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
# consider the number of occurrences of this term
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
# 1 term per paste
if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
else:
pass
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print("sleeping")
time.sleep(5)
message = p.get_from_set()

bin/RegexTracker.py Executable file (96 lines added)
View file

@ -0,0 +1,96 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for regex tracking.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.
"""
import os
import re
import sys
import time
import signal
from Helper import Process
from pubsublogger import publisher
import NotificationHelper
from packages import Item
from packages import Term
full_item_url = "/showsavedpaste/?paste="
mail_body_template = "AIL Framework,\nNew occurrence for term tracked regex: {}\nitem id: {}\nurl: {}{}"
dict_regex_tracked = Term.get_regex_tracked_words_dict()
last_refresh = time.time()
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
def new_term_found(term, term_type, item_id, item_date):
uuid_list = Term.get_term_uuid_list(term, 'regex')
print('new tracked term found: {} in {}'.format(term, item_id))
for term_uuid in uuid_list:
Term.add_tracked_item(term_uuid, item_id, item_date)
tags_to_add = Term.get_term_tags(term_uuid)
for tag in tags_to_add:
msg = '{};{}'.format(tag, item_id)
p.populate_set_out(msg, 'Tags')
mail_to_notify = Term.get_term_mails(term_uuid)
if mail_to_notify:
mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
for mail in mail_to_notify:
NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
publisher.info("Script RegexTracker started")
config_section = 'RegexTracker'
p = Process(config_section)
max_execution_time = p.config.getint(config_section, "max_execution_time")
full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url
# Regex Frequency
while True:
item_id = p.get_from_set()
if item_id is not None:
item_date = Item.get_item_date(item_id)
item_content = Item.get_item_content(item_id)
for regex in dict_regex_tracked:
signal.alarm(max_execution_time)
try:
matched = dict_regex_tracked[regex].search(item_content)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_rel_path))
continue
else:
signal.alarm(0)
if matched:
new_term_found(regex, 'regex', item_id, item_date)
else:
time.sleep(5)
# refresh Tracked term
if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
dict_regex_tracked = Term.get_regex_tracked_words_dict()
last_refresh = time.time()
print('Tracked set refreshed')

View file

@ -1,151 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and tests the sets
supplied in the term webpage.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process
# Email notifications
from NotificationHelper import *
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the term frequency sets
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"
# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="
def add_quote_inside_tab(tab):
quoted_tab = "["
for elem in tab[1:-1].split(','):
elem = elem.lstrip().strip()
quoted_tab += "\'{}\', ".format(elem)
quoted_tab = quoted_tab[:-2] #remove trailing ,
quoted_tab += "]"
return str(quoted_tab)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'SetForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("ARDB_TermFreq", "host"),
port=p.config.get("ARDB_TermFreq", "port"),
db=p.config.get("ARDB_TermFreq", "db"),
decode_responses=True)
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
# create direct link in mail
full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
#get the dico and matching percent
dico_percent = {}
dico_set_tab = {}
dico_setname_to_redis = {}
for set_str in server_term.smembers(TrackedSetSet_Name):
tab_set = set_str[1:-1]
tab_set = add_quote_inside_tab(tab_set)
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
dico_percent[tab_set] = float(match_percent)
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
dico_setname_to_redis[tab_set] = set_str
else:
continue
message = p.get_from_set()
while True:
if message is not None:
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
content = Paste.Paste(filename).get_p_content()
curr_set = top_termFreq_setName_day[0] + str(timestamp)
#iterate over the words of the file
match_dico = {}
for word in content.split():
for cur_set, array_set in dico_set_tab.items():
for w_set in array_set[:-1]: #avoid the percent matching
if word == w_set:
try:
match_dico[str(array_set)] += 1
except KeyError:
match_dico[str(array_set)] = 1
#compute matching %
for the_set, matchingNum in match_dico.items():
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 # -1 because of the percent-matching element
if eff_percent >= dico_percent[the_set]:
# Send a notification only when the member is in the set
if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name):
# create mail body
mail_body = ("AIL Framework,\n"
"New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n"
''+full_paste_url + filename)
# Send to every associated email address
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]):
sendEmailNotification(email, 'Term', mail_body)
# tag paste
for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]):
msg = '{};{}'.format(tag, filename)
p.populate_set_out(msg, 'Tags')
print(the_set, "matched in", filename)
set_name = 'set_' + dico_setname_to_redis[the_set]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
# consider the number of occurrences of this set
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
# FIXME - avoid using per paste as a set is checked over the entire paste
#1 term per paste
if new_to_the_set:
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print("sleeping")
time.sleep(5)
message = p.get_from_set()

View file

@ -8,13 +8,14 @@ The TermTracker Module
import os
import sys
import time
import signal
from Helper import Process
from pubsublogger import publisher
import NotificationHelper
from packages import Paste
from packages import Item
from packages import Term
sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
@ -26,13 +27,22 @@ mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\n
# loads tracked words
list_tracked_words = Term.get_tracked_words_list()
last_refresh_word = time.time()
set_tracked_words_list = Term.get_set_tracked_words_list()
last_refresh_set = time.time()
def new_term_found(term, term_type, item_id):
uuid_list = Term.get_term_uuid_list(term)
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
def new_term_found(term, term_type, item_id, item_date):
uuid_list = Term.get_term_uuid_list(term, term_type)
print('new tracked term found: {} in {}'.format(term, item_id))
for term_uuid in uuid_list:
Term.add_tracked_item(term_uuid, item_id)
Term.add_tracked_item(term_uuid, item_id, item_date)
tags_to_add = Term.get_term_tags(term_uuid)
for tag in tags_to_add:
@ -52,28 +62,38 @@ if __name__ == "__main__":
publisher.channel = "Script"
publisher.info("Script TermTrackerMod started")
#config_section = 'TermTrackerMod'
config_section = 'TermTrackerMod'
p = Process(config_section)
max_execution_time = p.config.getint(config_section, "max_execution_time")
full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url
while True:
item_id = p.get_from_set()
item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz'
#item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz'
if message is not None:
if item_id is not None:
paste = Paste.Paste(item_id)
item_date = Item.get_item_date(item_id)
item_content = Item.get_item_content(item_id)
dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())
signal.alarm(max_execution_time)
try:
dict_words_freq = Term.get_text_word_frequency(item_content)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_rel_path))
continue
else:
signal.alarm(0)
# create token statistics
for word in dict_words_freq:
Term.create_token_statistics(item_date, word, dict_words_freq[word])
# check solo words
for word in list_tracked_words:
if word in dict_words_freq:
new_term_found(word, 'word', item_id)
new_term_found(word, 'word', item_id, item_date)
# check words set
for elem in set_tracked_words_list:
@ -86,7 +106,19 @@ if __name__ == "__main__":
if word in dict_words_freq:
nb_uniq_word += 1
if nb_uniq_word >= nb_words_threshold:
new_term_found(word_set, 'set', item_id)
new_term_found(word_set, 'set', item_id, item_date)
else:
time.sleep(5)
# refresh Tracked term
if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
list_tracked_words = Term.get_tracked_words_list()
last_refresh_word = time.time()
print('Tracked word refreshed')
if last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
set_tracked_words_list = Term.get_set_tracked_words_list()
last_refresh_set = time.time()
print('Tracked set refreshed')
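The hunks above only show fragments of the word-set matching step. A minimal self-contained sketch of that logic follows, assuming each entry of `set_tracked_words_list` is a `(words, nb_words_threshold, word_set)` tuple as built by `Term.get_set_tracked_words_list()`, and that `dict_words_freq` maps each token of the item to its frequency (as returned by `Term.get_text_word_frequency()`).
```python
def check_tracked_sets(set_tracked_words_list, dict_words_freq):
    """Return the tracked word-sets matched by one item (illustrative helper)."""
    matched = []
    for words, nb_words_threshold, word_set in set_tracked_words_list:
        # count how many distinct words of the tracked set appear in the item
        nb_uniq_word = 0
        for word in words:
            if word in dict_words_freq:
                nb_uniq_word += 1
        # the set matches once the threshold of distinct words is reached
        if nb_uniq_word >= nb_words_threshold:
            matched.append(word_set)
    return matched
```
Each matched `word_set` would then be passed to `new_term_found(word_set, 'set', item_id, item_date)` as in the module above.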

View file

@ -1,71 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Tokenize Module
===================
This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
Module.
It tokenizes the content of the paste and publishes the result in the following
format:
channel_name+' '+/path/of/the/paste.gz+' '+tokenized_word+' '+scoring
..seealso:: Paste method (_get_top_words)
..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.
Requirements
------------
*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
"""
import time
from packages import Paste
from pubsublogger import publisher
from Helper import Process
import signal
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Tokenize'
p = Process(config_section)
# LOGGING #
publisher.info("Tokeniser started")
while True:
message = p.get_from_set()
print(message)
if message is not None:
paste = Paste.Paste(message)
signal.alarm(5)
try:
for word, score in paste._get_top_words().items():
if len(word) >= 4:
msg = '{} {} {}'.format(paste.p_rel_path, word, score)
p.populate_set_out(msg)
except TimeoutException:
p.incr_module_timeout_statistic()
print ("{0} processing timeout".format(paste.p_rel_path))
continue
else:
signal.alarm(0)
else:
publisher.debug("Tokeniser is idling 10s")
time.sleep(10)
print("Sleeping")

View file

@ -1,5 +1,7 @@
#!/usr/bin/python3
import datetime
class Date(object):
"""docstring for Date"""
def __init__(self, *args):
@ -34,7 +36,6 @@ class Date(object):
self.day = day
def substract_day(self, numDay):
import datetime
computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay)
comp_year = str(computed_date.year)
comp_month = str(computed_date.month).zfill(2)
@ -50,3 +51,12 @@ def date_substract_day(date, num_day=1):
new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) - datetime.timedelta(num_day)
new_date = str(new_date).replace('-', '')
return new_date
def get_date_range(num_day):
curr_date = datetime.date.today()
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
date_list = []
for i in range(0, num_day+1):
date_list.append(date.substract_day(i))
return list(reversed(date_list))

View file

@ -2,10 +2,13 @@
# -*-coding:UTF-8 -*
import os
import sys
import gzip
import redis
sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Date
import Tag

View file

@ -4,6 +4,7 @@
import os
import re
import sys
import time
import uuid
import redis
import datetime
@ -72,14 +73,30 @@ def get_set_tracked_words_list():
all_set_list.append((ter_set, num_words, elem))
return all_set_list
def is_term_tracked_in_global_level(term):
res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
def get_regex_tracked_words_dict():
regex_list = r_serv_term.smembers('all:tracked_term:regex')
dict_tracked_regex = {}
for regex in regex_list:
dict_tracked_regex[regex] = re.compile(regex)
return dict_tracked_regex
def is_term_tracked_in_global_level(term, term_type):
res = r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))
if res:
for elem_uuid in res:
if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1':
return True
return False
def is_term_tracked_in_user_level(term, term_type, user_id):
res = r_serv_term.smembers('user:tracked_term:{}'.format(user_id))
if res:
for elem_uuid in res:
if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'tracked')== term:
if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'type')== term_type:
return True
return False
def parse_json_term_to_add(dict_input, user_id):
term = dict_input.get('term', None)
if not term:
@ -112,7 +129,10 @@ def parse_json_term_to_add(dict_input, user_id):
# check if term already tracked in global
if level==1:
if is_term_tracked_in_global_level(term):
if is_term_tracked_in_global_level(term, term_type):
return ({"status": "error", "reason": "Term already tracked"}, 409)
else:
if is_term_tracked_in_user_level(term, term_type, user_id):
return ({"status": "error", "reason": "Term already tracked"}, 409)
term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)
@ -174,7 +194,7 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
r_serv_term.sadd('all:tracked_term:{}'.format(term_type), term)
# create term - uuid map
r_serv_term.sadd('all:tracked_term_uuid:{}'.format(term), term_uuid)
r_serv_term.sadd('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
# add display level set
if level == 0: # user only
@ -190,15 +210,22 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
for mail in mails:
r_serv_term.sadd('tracked_term:mail:{}'.format(term_uuid), mail)
# toggle refresh module tracker list/set
r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())
return term_uuid
def delete_term(term_uuid):
term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked')
term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type')
term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level')
r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid)
r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid)
r_serv_term.srem('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
# Term not tracked by other users
if not r_serv_term.exists('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)
# toggle refresh module tracker list/set
r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())
if level == 0: # user only
user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id')
@ -218,8 +245,8 @@ def delete_term(term_uuid):
# remove item set
r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))
def get_term_uuid_list(term):
return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)))
def get_term_uuid_list(term, term_type):
return list(r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)))
def get_term_tags(term_uuid):
return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid)))
@ -227,10 +254,30 @@ def get_term_tags(term_uuid):
def get_term_mails(term_uuid):
return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid)))
def add_tracked_item(term_uuid, item_id):
r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id)
def add_tracked_item(term_uuid, item_id, item_date):
# track item
r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
# track nb item by date
r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)
def create_token_statistics(item_date, word, nb):
r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1)
r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb)
r_serv_term.sadd('stat_token_history', item_date)
def delete_token_statistics_by_date(item_date):
r_serv_term.delete('stat_token_per_item_by_day:{}'.format(item_date))
r_serv_term.delete('stat_token_total_by_day:{}'.format(item_date))
r_serv_term.srem('stat_token_history', item_date)
def get_all_token_stat_history():
return r_serv_term.smembers('stat_token_history')
def get_tracked_term_last_updated_by_type(term_type):
epoch_update = r_serv_term.get('tracked_term:refresh:{}'.format(term_type))
if not epoch_update:
epoch_update = 0
return float(epoch_update)
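The refresh handshake introduced here spans two sides: `add_tracked_term()` and `delete_term()` write an epoch to `tracked_term:refresh:<type>`, and each tracker module compares its own `last_refresh` timestamp against `get_tracked_term_last_updated_by_type()` to decide whether to reload its in-memory list. A condensed sketch of the consumer side, using only names from this commit (the 5-second sleep is arbitrary):
```python
import time
from packages import Term

last_refresh = time.time()
tracked_words = Term.get_tracked_words_list()

while True:
    # ... fetch one item and match it against tracked_words ...

    # reload only if a word tracker was added or deleted since our last load
    if last_refresh < Term.get_tracked_term_last_updated_by_type('word'):
        tracked_words = Term.get_tracked_words_list()
        last_refresh = time.time()
        print('Tracked word list refreshed')
    time.sleep(5)
```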

View file

@ -107,7 +107,10 @@ operation_mode = 3
ttl_duplicate = 86400
default_unnamed_feed_name = unnamed_feeder
[RegexForTermsFrequency]
[TermTrackerMod]
max_execution_time = 120
[RegexTracker]
max_execution_time = 60
##### Redis #####

View file

@ -11,62 +11,10 @@ from dateutil.rrule import rrule, DAILY
import csv
def listdirectory(path):
"""Path Traversing Function.
:param path: -- The absolute pathname to a directory.
This function returns the absolute paths of all the files contained in
the given directory.
"""
fichier = []
for root, dirs, files in os.walk(path):
for i in files:
fichier.append(os.path.join(root, i))
return fichier
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
"""It filters out non-printable characters from the string it receives."""
def create_dirfile(r_serv, directory, overwrite):
"""Create a file of path.
:param r_serv: -- connexion to redis database
:param directory: -- The folder where to launch the listing of the .gz files
This function create a list in redis with inside the absolute path
of all the pastes needed to be proceeded by function using parallel
(like redis_words_ranking)
"""
if overwrite:
r_serv.delete("filelist")
for x in listdirectory(directory):
r_serv.lpush("filelist", x)
publisher.info("The list was overwritten")
else:
if r_serv.llen("filelist") == 0:
for x in listdirectory(directory):
r_serv.lpush("filelist", x)
publisher.info("New list created")
else:
for x in listdirectory(directory):
r_serv.lpush("filelist", x)
publisher.info("The list was updated with new elements")
def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
"""Create a csv file used with dygraph.

View file

@ -19,36 +19,17 @@ subscribe = Redis_Global
[Attributes]
subscribe = Redis_Global
[Lines]
subscribe = Redis_Global
publish = Redis_LinesShort,Redis_LinesLong
[DomClassifier]
subscribe = Redis_Global
[Tokenize]
subscribe = Redis_LinesShort
publish = Redis_Words
[Curve]
subscribe = Redis_Words
publish = Redis_CurveManageTopSets,Redis_Tags
[TermTrackerMod]
subscribe = Redis_Global
publish = Redis_Tags
[RegexForTermsFrequency]
[RegexTracker]
subscribe = Redis_Global
publish = Redis_Tags
[SetForTermsFrequency]
subscribe = Redis_Global
publish = Redis_Tags
[CurveManageTopSets]
subscribe = Redis_CurveManageTopSets
[Categ]
subscribe = Redis_Global
publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey