Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-27 00:07:16 +00:00)

commit 1008c7c4fe (parent d9bdfecef3)
chg: [Term Tracker] refactor term tracker word/set/regex modules + remove old modules

19 changed files with 304 additions and 981 deletions
OVERVIEW.md (37 changed lines)

@@ -138,12 +138,12 @@ Redis and ARDB overview

 | Set - Key | Value |
 | ------ | ------ |
-| all:tracked_term_uuid:**tracked term** | **uuid - tracked term uuid** |
+| all:tracked_term_uuid:**term type**:**tracked term** | **uuid - tracked term uuid** |

 ##### All Term Tracked items:
 | Set - Key | Value |
 | ------ | ------ |
-| tracked_term:item:**uuid** | **item_id** |
+| tracked_term:item:**uuid**:**date** | **item_id** |

 ##### All Term Tracked tags:

 | Set - Key | Value |

@@ -155,6 +155,29 @@ Redis and ARDB overview
 | ------ | ------ |
 | tracked_term:mail:**uuid** | **mail** |

+##### Refresh Tracked term:
+| Key | Value |
+| ------ | ------ |
+| tracked_term:refresh:word | **last refreshed epoch** |
+| tracked_term:refresh:set | - |
+| tracked_term:refresh:regex | - |
+
+##### Zset Stat Tracked term:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| tracked_term:stat:**uuid** | **date** | **nb_seen** |
+
+##### Stat token:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| stat_token_total_by_day:**date** | **word** | **nb_seen** |
+| stat_token_per_item_by_day:**date** | **word** | **nb_seen** |
+
+| Set - Key | Value |
+| ------ | ------ |
+| stat_token_history | **date** |
+
 ## DB2 - TermFreq:

 ##### Set:

@@ -167,16 +190,6 @@ Redis and ARDB overview
 | TrackedRegexSet | **tracked_regex** |
-| global:TrackedSetTermSet | **tracked_term** |
-| global:TrackedSetSet | **tracked_set** |
-| global:TrackedRegexSet | **tracked_regex** |
-| user:**user_id**:TrackedSetTermSet | **tracked_term** |
-| user:**user_id**:TrackedSetSet | **tracked_set** |
-| user:**user_id**:TrackedRegexSet | **tracked_regex** |
 | tracked_**tracked_term** | **item_path** |
 | set_**tracked_set** | **item_path** |
 | regex_**tracked_regex** | **item_path** |
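The schema change above moves the term type into the uuid-map key and buckets tracked items per day. A minimal consumer sketch of the new layout (key names come from the tables above; the connection settings and helper names are placeholders, not AIL API):

import redis

r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder connection

def get_tracker_uuids(term, term_type):
    # new layout: all:tracked_term_uuid:<term type>:<tracked term> -> {uuid, ...}
    return r.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))

def get_tracked_items(term_uuid, date):
    # items are now bucketed per day: tracked_term:item:<uuid>:<date> -> {item_id, ...}
    return r.smembers('tracked_term:item:{}:{}'.format(term_uuid, date))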
bin/Curve.py (184 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.

This modules update a .csv file used to draw curves representing selected
words and their occurency per day.

..note:: The channel will have the name of the file created.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.


This Module is also used for term frequency.

/!\ Top set management is done in the module Curve_manage_top_set


Requirements
------------

*Need running Redis instances. (Redis)
*Categories files of words in /files/ need to be created
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar

from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def check_if_tracked_term(term, path):
    if term in server_term.smembers(TrackedTermsSet_Name):
        #add_paste to tracked_word_set
        set_name = "tracked_" + term
        server_term.sadd(set_name, path)
        print(term, 'addded', set_name, '->', path)
        p.populate_set_out("New Term added", 'CurveManageTopSets')

        # Send a notification only when the member is in the set
        if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):

            # create mail body
            mail_body = ("AIL Framework,\n"
                         "New occurrence for term: " + term + "\n"
                         ''+full_paste_url + path)

            # Send to every associated email adress
            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
                sendEmailNotification(email, 'Term', mail_body)

            # tag paste
            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + term):
                msg = '{};{}'.format(tag, path)
                p.populate_set_out(msg, 'Tags')


def getValueOverRange(word, startDate, num_day):
    to_return = 0
    for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
        value = server_term.hget(timestamp, word)
        to_return += int(value) if value is not None else 0
    return to_return


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Curve'
    p = Process(config_section)

    # REDIS #
    r_serv1 = redis.StrictRedis(
        host=p.config.get("ARDB_Curve", "host"),
        port=p.config.get("ARDB_Curve", "port"),
        db=p.config.get("ARDB_Curve", "db"),
        decode_responses=True)

    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script Curve started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # FILE CURVE SECTION #
    csv_path = os.path.join(os.environ['AIL_HOME'],
                            p.config.get("Directories", "wordtrending_csv"))
    wordfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "wordsfile"))

    message = p.get_from_set()
    prec_filename = None
    generate_new_graph = False

    # Term Frequency
    top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
    top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
    top_termFreq_setName_month = ["TopTermFreq_set_month", 31]

    while True:
        if message is not None:
            generate_new_graph = True

            filename, word, score = message.split()
            temp = filename.split('/')
            date = temp[-4] + temp[-3] + temp[-2]
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            low_word = word.lower()
            #Old curve with words in file
            r_serv1.hincrby(low_word, date, int(score))

            # Update redis
            #consider the num of occurence of this term
            curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score)))
            #1 term per paste
            curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1)))

            # Add in set only if term is not in the blacklist
            if low_word not in server_term.smembers(BlackListTermsSet_Name):
                #consider the num of occurence of this term
                server_term.zincrby(curr_set, low_word, float(score))
                #1 term per paste
                server_term.zincrby("per_paste_" + curr_set, low_word, float(1))

                #Add more info for tracked terms
                check_if_tracked_term(low_word, filename)

                #send to RegexForTermsFrequency
                to_send = "{} {} {}".format(filename, timestamp, word)
                p.populate_set_out(to_send, 'RegexForTermsFrequency')

        else:
            if generate_new_graph:
                generate_new_graph = False
                print('Building graph')
                today = datetime.date.today()
                year = today.year
                month = today.month

                lib_words.create_curve_with_word_file(r_serv1, csv_path,
                                                      wordfile_path, year,
                                                      month)

            publisher.debug("Script Curve is Idling")
            print("sleeping")
            time.sleep(10)
        message = p.get_from_set()
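For reference, the counting pattern this deleted module implemented: one hash per day (keyed by the day's epoch) holding raw occurrence counts, plus a per-day sorted set for ranking. A minimal sketch, assuming redis-py 3.x argument order (the removed code used the pre-3.0 zincrby order) and a placeholder connection:

import calendar
import redis

r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder

def count_term(term, year, month, day, score=1):
    day_epoch = calendar.timegm((year, month, day, 0, 0, 0))
    # raw occurrences: hash <day epoch> { term: total }
    r.hincrby(day_epoch, term, score)
    # ranking: sorted set TopTermFreq_set_day_<day epoch> { term: score }
    r.zincrby('TopTermFreq_set_day_{}'.format(day_epoch), score, term)

count_term('bitcoin', 2019, 8, 2, score=3)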
bin/CurveManageTopSets.py (166 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""

This module manage top sets for terms frequency.
Every 'refresh_rate' update the weekly and monthly set

"""

import redis
import time
import datetime
import copy
from pubsublogger import publisher
from packages import lib_words
import datetime
import calendar
import os
import configparser

# Config Variables
Refresh_rate = 60*5 #sec
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
num_day_month = 31
num_day_week = 7

top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]


def manage_top_set():
    startDate = datetime.datetime.now()
    startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0)
    startDate = calendar.timegm(startDate.timetuple())
    blacklist_size = int(server_term.scard(BlackListTermsSet_Name))

    dico = {}
    dico_per_paste = {}

    # Retreive top data (max_card + blacklist_size) from days sets
    for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay):
        curr_set = top_termFreq_setName_day[0] + str(timestamp)
        array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)
        array_top_day_per_paste = server_term.zrevrangebyscore("per_paste_" + curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)

        for word, value in array_top_day:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico.keys():
                    dico[word] += value
                else:
                    dico[word] = value

        for word, value in array_top_day_per_paste:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico_per_paste.keys():
                    dico_per_paste[word] += value
                else:
                    dico_per_paste[word] = value

        if timestamp == startDate - num_day_week*oneDay:
            dico_week = copy.deepcopy(dico)
            dico_week_per_paste = copy.deepcopy(dico_per_paste)

    # convert dico into sorted array
    array_month = []
    for w, v in dico.items():
        array_month.append((w, v))
    array_month.sort(key=lambda tup: -tup[1])
    array_month = array_month[0:20]

    array_week = []
    for w, v in dico_week.items():
        array_week.append((w, v))
    array_week.sort(key=lambda tup: -tup[1])
    array_week = array_week[0:20]

    # convert dico_per_paste into sorted array
    array_month_per_paste = []
    for w, v in dico_per_paste.items():
        array_month_per_paste.append((w, v))
    array_month_per_paste.sort(key=lambda tup: -tup[1])
    array_month_per_paste = array_month_per_paste[0:20]

    array_week_per_paste = []
    for w, v in dico_week_per_paste.items():
        array_week_per_paste.append((w, v))
    array_week_per_paste.sort(key=lambda tup: -tup[1])
    array_week_per_paste = array_week_per_paste[0:20]


    # suppress every terms in top sets
    for curr_set, curr_num_day in top_termFreq_set_array[1:3]:
        for w in server_term.zrange(curr_set, 0, -1):
            server_term.zrem(curr_set, w)
        for w in server_term.zrange("per_paste_" + curr_set, 0, -1):
            server_term.zrem("per_paste_" + curr_set, w)

    # Add top term from sorted array in their respective sorted sets
    for elem in array_week:
        server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0])
    for elem in array_week_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_week[0], float(elem[1]), elem[0])

    for elem in array_month:
        server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0])
    for elem in array_month_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_month[0], float(elem[1]), elem[0])

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
    print("refreshed module")


if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # For Module Manager
    r_temp = redis.StrictRedis(
        host=cfg.get('RedisPubSub', 'host'),
        port=cfg.getint('RedisPubSub', 'port'),
        db=cfg.getint('RedisPubSub', 'db'),
        decode_responses=True)

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_"+ "CurveManageTopSets" + "_" + str(os.getpid()), value)
    r_temp.sadd("MODULE_TYPE_"+ "CurveManageTopSets" , str(os.getpid()))

    server_term = redis.StrictRedis(
        host=cfg.get("ARDB_TermFreq", "host"),
        port=cfg.getint("ARDB_TermFreq", "port"),
        db=cfg.getint("ARDB_TermFreq", "db"),
        decode_responses=True)

    publisher.info("Script Curve_manage_top_set started")

    # Sent to the logging a description of the module
    publisher.info("Manage the top sets with the data created by the module curve.")

    manage_top_set()

    while True:
        # Get one message from the input queue (module only work if linked with a queue)
        time.sleep(Refresh_rate) # sleep a long time then manage the set
        manage_top_set()
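The removed manager's core job was merging 31 per-day sorted sets into weekly and monthly top-20 sets. A standalone sketch of that window aggregation (key convention from the module above; the connection is a placeholder):

import redis

ONE_DAY = 60 * 60 * 24
r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder

def top_terms_over_window(start_epoch, num_days, top_n=20):
    totals = {}
    # walk backwards one day at a time, summing each day's scores
    for ts in range(start_epoch, start_epoch - num_days * ONE_DAY, -ONE_DAY):
        day_key = 'TopTermFreq_set_day_{}'.format(ts)
        for word, score in r.zrevrangebyscore(day_key, '+inf', '-inf', withscores=True):
            totals[word] = totals.get(word, 0) + score
    return sorted(totals.items(), key=lambda kv: -kv[1])[:top_n]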
bin/DbCleaner.py (59 lines, new executable file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The TermTracker Module
===================

"""
import os
import sys
import time
import datetime

from pubsublogger import publisher

import NotificationHelper

from packages import Date
from packages import Item
from packages import Term

def clean_term_db_stat_token():
    all_stat_date = Term.get_all_token_stat_history()

    list_date_to_keep = Date.get_date_range(31)
    for date in all_stat_date:
        if date not in list_date_to_keep:
            # remove history
            Term.delete_token_statistics_by_date(date)

    print('Term Stats Cleaned')


if __name__ == "__main__":

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("DbCleaner started")

    config_section = 'TermTrackerMod'

    # low priority
    time.sleep(180)

    daily_cleaner = True
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    while True:

        if daily_cleaner:
            clean_term_db_stat_token()
            daily_cleaner = False
        else:
            sys.exit(0)
            time.sleep(600)

        new_date = datetime.datetime.now().strftime("%Y%m%d")
        if new_date != current_date:
            current_date = new_date
            daily_cleaner = True
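The cleaner's retention rule: keep token statistics for the last 31 days and delete everything older. The same window computation with plain datetime, and a hypothetical delete callback standing in for Term.delete_token_statistics_by_date:

import datetime

def get_date_range(num_day):
    # YYYYMMDD strings covering today and the previous num_day days
    today = datetime.date.today()
    return [(today - datetime.timedelta(days=i)).strftime('%Y%m%d')
            for i in range(num_day, -1, -1)]

def clean_token_stats(all_stat_dates, delete_by_date):
    keep = set(get_date_range(31))
    for date in all_stat_dates:
        if date not in keep:
            delete_by_date(date)  # e.g. Term.delete_token_statistics_by_date(date)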
bin/Dir.py (48 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import argparse
import redis
from pubsublogger import publisher
from packages.lib_words import create_dirfile
import configparser


def main():
    """Main Function"""

    # CONFIG #
    cfg = configparser.ConfigParser()
    cfg.read('./packages/config.cfg')

    parser = argparse.ArgumentParser(
        description='''This script is a part of the Analysis Information Leak
        framework. It create a redis list called "listfile" which contain
        the absolute filename of all the files from the directory given in
        the argument "directory".''',
        epilog='Example: ./Dir.py /home/2013/03/')

    parser.add_argument('directory', type=str,
                        help='The directory to run inside', action='store')

    parser.add_argument('-db', type=int, default=0,
                        help='The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4], action='store')

    parser.add_argument('-ow', help='trigger the overwritting mode',
                        action='store_true')

    args = parser.parse_args()

    r_serv = redis.StrictRedis(host=cfg.get("Redis_Queues", "host"),
                               port=cfg.getint("Redis_Queues", "port"),
                               db=cfg.getint("Redis_Queues", "db"),
                               decode_responses=True)

    publisher.port = 6380
    publisher.channel = "Script"

    create_dirfile(r_serv, args.directory, args.ow)

if __name__ == "__main__":
    main()
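The deleted script's only job was filling a Redis list named "listfile" with absolute file paths. A hypothetical reimplementation of that step (not the lib_words.create_dirfile original, whose exact behavior is not shown here):

import os
import redis

def push_dir_listing(r, directory, overwrite=False):
    if overwrite:
        r.delete('listfile')  # overwriting mode: start from an empty list
    for root, _dirs, files in os.walk(directory):
        for name in files:
            r.rpush('listfile', os.path.abspath(os.path.join(root, name)))

r = redis.StrictRedis(host='localhost', port=6381, db=0, decode_responses=True)  # placeholder
push_dir_listing(r, '/home/2013/03/')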
bin/LAUNCH.sh (modified)

@@ -153,14 +153,10 @@ function launching_scripts
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Duplicates" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Duplicates.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Lines" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Lines.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DomClassifier.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Categ" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Categ.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Tokenize" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tokenize.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "CreditCards" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CreditCards.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"

@@ -175,13 +171,9 @@ function launching_scripts
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Credential" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Credential.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Curve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Curve.py; read x"
+    screen -S "Script_AIL" -X screen -t "TermTrackerMod" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./TermTrackerMod.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "CurveManageTopSets" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CurveManageTopSets.py; read x"
+    screen -S "Script_AIL" -X screen -t "RegexTracker" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexTracker.py; read x"
     sleep 0.1
-    screen -S "Script_AIL" -X screen -t "RegexForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexForTermsFrequency.py; read x"
-    sleep 0.1
-    screen -S "Script_AIL" -X screen -t "SetForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SetForTermsFrequency.py; read x"
-    sleep 0.1
     screen -S "Script_AIL" -X screen -t "Indexer" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Indexer.py; read x"
     sleep 0.1

@@ -213,6 +205,8 @@ function launching_scripts
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
     sleep 0.1
+    screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"
+    sleep 0.1
     screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x"
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./submit_paste.py; read x"
bin/Lines.py (85 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The ZMQ_PubSub_Lines Module
============================

This module is consuming the Redis-list created by the ZMQ_PubSub_Line_Q
Module.

It perform a sorting on the line's length and publish/forward them to
differents channels:

*Channel 1 if max length(line) < max
*Channel 2 if max length(line) > max

The collected informations about the processed pastes
(number of lines and maximum length line) are stored in Redis.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (LevelDB & Redis)
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.

"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Lines'
    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='This script is a part of the Analysis Information \
                    Leak framework.')

    parser.add_argument(
        '-max', type=int, default=500,
        help='The limit between "short lines" and "long lines"',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
        on channel Longlines, Shortlines"
    publisher.info(tmp_string)

    while True:
        try:
            message = p.get_from_set()
            print(message)
            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Tokeniser is idling 10s")
                time.sleep(10)
                continue

            # FIXME do it in the paste class
            lines_infos = PST.get_lines_info()
            PST.save_attribute_redis("p_nb_lines", lines_infos[0])
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
            print(PST.p_rel_path)
            if lines_infos[1] < args.max:
                p.populate_set_out( PST.p_rel_path , 'LinesShort')
            else:
                p.populate_set_out( PST.p_rel_path , 'LinesLong')
        except IOError:
            print("CRC Checksum Error on : ", PST.p_rel_path)
@@ -9,7 +9,6 @@ import time
 import datetime
 import redis
 import os
-from packages import lib_words
 from packages.Date import Date
 from pubsublogger import publisher
 from Helper import Process
bin/RegexForTermsFrequency.py (157 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the regexs
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import Paste
import calendar
import re
import signal
import time
from Helper import Process
# Email notifications
from NotificationHelper import *


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

# Config Variables
DICO_REFRESH_TIME = 60  # s

BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="


def refresh_dicos():
    dico_regex = {}
    dico_regexname_to_redis = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
        dico_regexname_to_redis[regex_str[1:-1]] = regex_str

    return dico_regex, dico_regexname_to_redis

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # compile the regex
    dico_refresh_cooldown = time.time()
    dico_regex, dico_regexname_to_redis = refresh_dicos()

    message = p.get_from_set()

    # Regex Frequency
    while True:
        if message is not None:
            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
                dico_refresh_cooldown = time.time()
                dico_regex, dico_regexname_to_redis = refresh_dicos()
                print('dico got refreshed')

            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

            curr_set = top_termFreq_setName_day[0] + str(timestamp)
            paste = Paste.Paste(filename)
            content = paste.get_p_content()

            # iterate the word with the regex
            for regex_str, compiled_regex in dico_regex.items():

                signal.alarm(max_execution_time)
                try:
                    matched = compiled_regex.search(content)
                except TimeoutException:
                    print ("{0} processing timeout".format(paste.p_rel_path))
                    continue
                else:
                    signal.alarm(0)

                if matched is not None:  # there is a match
                    print('regex matched {}'.format(regex_str))
                    matched = matched.group(0)
                    regex_str_complete = "/" + regex_str + "/"
                    # Add in Regex track set only if term is not in the blacklist
                    if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name):
                        # Send a notification only when the member is in the set
                        if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                            # create mail body
                            mail_body = ("AIL Framework,\n"
                                         "New occurrence for regex: " + regex_str + "\n"
                                         ''+full_paste_url + filename)

                            # Send to every associated email adress
                            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
                                sendEmailNotification(email, 'Term', mail_body)

                            # tag paste
                            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete):
                                msg = '{};{}'.format(tag, filename)
                                p.populate_set_out(msg, 'Tags')

                        set_name = 'regex_' + dico_regexname_to_redis[regex_str]
                        new_to_the_set = server_term.sadd(set_name, filename)
                        new_to_the_set = True if new_to_the_set == 1 else False

                        # consider the num of occurence of this term
                        regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
                        # 1 term per paste
                        if new_to_the_set:
                            regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
                            server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
                            server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
                        else:
                            pass

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
        message = p.get_from_set()
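Both this removed module and its replacements wrap every regex search in a SIGALRM timer so one pathological pattern cannot stall the queue. The same pattern reduced to a self-contained sketch (the 2-second limit stands in for the max_execution_time config value; Unix only):

import re
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

def bounded_search(compiled_regex, content, max_execution_time=2):
    signal.alarm(max_execution_time)  # arm: SIGALRM fires after N seconds
    try:
        return compiled_regex.search(content)
    except TimeoutException:
        return None  # treat a timed-out pattern as "no match"
    finally:
        signal.alarm(0)  # disarm any pending alarm

print(bounded_search(re.compile(r'ab+c'), 'xxabbbcxx'))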
bin/RegexTracker.py (96 lines, new executable file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for regex tracking.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.

"""
import os
import re
import sys
import time
import signal

from Helper import Process
from pubsublogger import publisher

import NotificationHelper

from packages import Item
from packages import Term

full_item_url = "/showsavedpaste/?paste="
mail_body_template = "AIL Framework,\nNew occurrence for term tracked regex: {}\nitem id: {}\nurl: {}{}"

dict_regex_tracked = Term.get_regex_tracked_words_dict()
last_refresh = time.time()

class TimeoutException(Exception):
    pass
def timeout_handler(signum, frame):
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)

def new_term_found(term, term_type, item_id, item_date):
    uuid_list = Term.get_term_uuid_list(term, 'regex')
    print('new tracked term found: {} in {}'.format(term, item_id))

    for term_uuid in uuid_list:
        Term.add_tracked_item(term_uuid, item_id, item_date)

        tags_to_add = Term.get_term_tags(term_uuid)
        for tag in tags_to_add:
            msg = '{};{}'.format(tag, item_id)
            p.populate_set_out(msg, 'Tags')

        mail_to_notify = Term.get_term_mails(term_uuid)
        if mail_to_notify:
            mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
            for mail in mail_to_notify:
                NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script RegexTracker started")

    config_section = 'RegexTracker'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    # Regex Frequency
    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            for regex in dict_regex_tracked:

                signal.alarm(max_execution_time)
                try:
                    matched = dict_regex_tracked[regex].search(item_content)
                except TimeoutException:
                    print("{0} processing timeout".format(item_id))
                    continue
                else:
                    signal.alarm(0)

                if matched:
                    new_term_found(regex, 'regex', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
            dict_regex_tracked = Term.get_regex_tracked_words_dict()
            last_refresh = time.time()
            print('Tracked set refreshed')
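Instead of re-reading trackers on every item, RegexTracker keeps a local epoch and compares it against the tracked_term:refresh:regex value that Term.add_tracked_term bumps (see the OVERVIEW.md table above). That handshake, sketched with a placeholder connection:

import time
import redis

r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)  # placeholder

def maybe_reload(last_refresh, trackers, load_trackers):
    # writer side runs: r.set('tracked_term:refresh:regex', time.time())
    last_updated = float(r.get('tracked_term:refresh:regex') or 0)
    if last_refresh < last_updated:
        trackers = load_trackers()   # stale: re-pull the compiled regex dict
        last_refresh = time.time()
    return trackers, last_refresh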
bin/SetForTermsFrequency.py (151 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the sets
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"

top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.lstrip().strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2] #remove trailing ,
    quoted_tab += "]"
    return str(quoted_tab)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    #get the dico and matching percent
    dico_percent = {}
    dico_set_tab = {}
    dico_setname_to_redis = {}
    for set_str in server_term.smembers(TrackedSetSet_Name):
        tab_set = set_str[1:-1]
        tab_set = add_quote_inside_tab(tab_set)
        perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
        if perc_finder is not None:
            match_percent = perc_finder.group(0)[1:-1]
            dico_percent[tab_set] = float(match_percent)
            dico_set_tab[tab_set] = ast.literal_eval(tab_set)
            dico_setname_to_redis[tab_set] = set_str
        else:
            continue

    message = p.get_from_set()

    while True:
        if message is not None:
            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            content = Paste.Paste(filename).get_p_content()

            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            #iterate over the words of the file
            match_dico = {}
            for word in content.split():
                for cur_set, array_set in dico_set_tab.items():
                    for w_set in array_set[:-1]: #avoid the percent matching
                        if word == w_set:
                            try:
                                match_dico[str(array_set)] += 1
                            except KeyError:
                                match_dico[str(array_set)] = 1

            #compute matching %
            for the_set, matchingNum in match_dico.items():
                eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc of the percent matching
                if eff_percent >= dico_percent[the_set]:
                    # Send a notification only when the member is in the set
                    if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                        # create mail body
                        mail_body = ("AIL Framework,\n"
                                     "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n"
                                     ''+full_paste_url + filename)

                        # Send to every associated email adress
                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            sendEmailNotification(email, 'Term', mail_body)

                        # tag paste
                        for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            msg = '{};{}'.format(tag, filename)
                            p.populate_set_out(msg, 'Tags')

                    print(the_set, "matched in", filename)
                    set_name = 'set_' + dico_setname_to_redis[the_set]
                    new_to_the_set = server_term.sadd(set_name, filename)
                    new_to_the_set = True if new_to_the_set == 1 else False

                    #consider the num of occurence of this set
                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))

                    # FIXME - avoid using per paste as a set is checked over the entire paste
                    #1 term per paste
                    if new_to_the_set:
                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
                        server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
        message = p.get_from_set()
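The removed module fires a word-set tracker once a configured percentage of its words appears in one item (the eff_percent computation above). The same check in isolation:

def set_matches(item_words, word_set, match_percent):
    # word_set e.g. ['ransom', 'bitcoin', 'decrypt'] with a 50% threshold
    present = sum(1 for w in word_set if w in item_words)
    eff_percent = float(present) / len(word_set) * 100
    return eff_percent >= match_percent

print(set_matches({'pay', 'bitcoin', 'decrypt'}, ['ransom', 'bitcoin', 'decrypt'], 50))  # True: 2/3 = 66%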
bin/TermTrackerMod.py (modified)

@@ -8,13 +8,14 @@ The TermTracker Module
 import os
 import sys
 import time
+import signal

 from Helper import Process
 from pubsublogger import publisher

 import NotificationHelper

-from packages import Paste
+from packages import Item
 from packages import Term

 sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))

@@ -26,13 +27,22 @@ mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\n
 # loads tracked words
 list_tracked_words = Term.get_tracked_words_list()
+last_refresh_word = time.time()
 set_tracked_words_list = Term.get_set_tracked_words_list()
+last_refresh_set = time.time()

-def new_term_found(term, term_type, item_id):
-    uuid_list = Term.get_term_uuid_list(term)
+class TimeoutException(Exception):
+    pass
+def timeout_handler(signum, frame):
+    raise TimeoutException
+signal.signal(signal.SIGALRM, timeout_handler)
+
+def new_term_found(term, term_type, item_id, item_date):
+    uuid_list = Term.get_term_uuid_list(term, term_type)
+    print('new tracked term found: {} in {}'.format(term, item_id))

     for term_uuid in uuid_list:
-        Term.add_tracked_item(term_uuid, item_id)
+        Term.add_tracked_item(term_uuid, item_id, item_date)

         tags_to_add = Term.get_term_tags(term_uuid)
         for tag in tags_to_add:

@@ -52,28 +62,38 @@ if __name__ == "__main__":
     publisher.channel = "Script"
     publisher.info("Script TermTrackerMod started")

-    #config_section = 'TermTrackerMod'
     config_section = 'TermTrackerMod'
     p = Process(config_section)
+    max_execution_time = p.config.getint(config_section, "max_execution_time")

     full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

     while True:

         item_id = p.get_from_set()
-        item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz'
-        #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz'

-        if message is not None:
+        if item_id is not None:

-            paste = Paste.Paste(item_id)
+            item_date = Item.get_item_date(item_id)
+            item_content = Item.get_item_content(item_id)

-            dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())
+            signal.alarm(max_execution_time)
+            try:
+                dict_words_freq = Term.get_text_word_frequency(item_content)
+            except TimeoutException:
+                print("{0} processing timeout".format(item_id))
+                continue
+            else:
+                signal.alarm(0)
+
+            # create token statistics
+            for word in dict_words_freq:
+                Term.create_token_statistics(item_date, word, dict_words_freq[word])

             # check solo words
             for word in list_tracked_words:
                 if word in dict_words_freq:
-                    new_term_found(word, 'word', item_id)
+                    new_term_found(word, 'word', item_id, item_date)

             # check words set
             for elem in set_tracked_words_list:

@@ -86,7 +106,19 @@ if __name__ == "__main__":
                 if word in dict_words_freq:
                     nb_uniq_word += 1
             if nb_uniq_word >= nb_words_threshold:
-                new_term_found(word_set, 'set', item_id)
+                new_term_found(word_set, 'set', item_id, item_date)

         else:
             time.sleep(5)
+
+        # refresh Tracked term
+        if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
+            list_tracked_words = Term.get_tracked_words_list()
+            last_refresh_word = time.time()
+            print('Tracked word refreshed')
+
+        if last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
+            set_tracked_words_list = Term.get_set_tracked_words_list()
+            last_refresh_set = time.time()
+            print('Tracked set refreshed')
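The word and set checks above consume a per-item word-frequency table from Term.get_text_word_frequency. A rough stand-in with collections.Counter shows the shape of that table (hypothetical tokenizer, not AIL's helper):

from collections import Counter

def get_text_word_frequency(content):
    # naive lowercase whitespace tokenization; the real helper is more involved
    return Counter(content.lower().split())

dict_words_freq = get_text_word_frequency('Bitcoin ransom paid in bitcoin')
for word in ['bitcoin', 'monero']:
    if word in dict_words_freq:
        print('tracked word found:', word, dict_words_freq[word])  # bitcoin 2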
bin/Tokenize.py (71 lines, deleted file):

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Tokenize Module
===================

This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
Module.

It tokenize the content of the paste and publish the result in the following
format:
channel_name+' '+/path/of/the/paste.gz+' '+tokenized_word+' '+scoring

..seealso:: Paste method (_get_top_words)

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            signal.alarm(5)
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
            except TimeoutException:
                p.incr_module_timeout_statistic()
                print ("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)
        else:
            publisher.debug("Tokeniser is idling 10s")
            time.sleep(10)
            print("Sleeping")
packages/Date.py (modified)

@@ -1,5 +1,7 @@
 #!/usr/bin/python3
+
+import datetime

 class Date(object):
     """docstring for Date"""
     def __init__(self, *args):

@@ -34,7 +36,6 @@ class Date(object):
         self.day = day

     def substract_day(self, numDay):
-        import datetime
         computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay)
         comp_year = str(computed_date.year)
         comp_month = str(computed_date.month).zfill(2)

@@ -50,3 +51,12 @@ def date_substract_day(date, num_day=1):
     new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) - datetime.timedelta(num_day)
     new_date = str(new_date).replace('-', '')
     return new_date
+
+def get_date_range(num_day):
+    curr_date = datetime.date.today()
+    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
+    date_list = []
+
+    for i in range(0, num_day+1):
+        date_list.append(date.substract_day(i))
+    return list(reversed(date_list))
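The new get_date_range returns num_day+1 YYYYMMDD strings ordered oldest to newest, today included; DbCleaner only tests membership against it. An equivalent with plain datetime for illustration:

import datetime

def get_date_range(num_day):
    today = datetime.date.today()
    dates = [(today - datetime.timedelta(days=i)).strftime('%Y%m%d')
             for i in range(num_day + 1)]
    return list(reversed(dates))  # oldest first, today last

print(get_date_range(2))  # e.g. ['20190731', '20190801', '20190802']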
@@ -2,10 +2,13 @@
 # -*-coding:UTF-8 -*

 import os
+import sys
 import gzip
 import redis

+sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
 import Flask_config
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
 import Date
 import Tag
@ -4,6 +4,7 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
import redis
|
import redis
|
||||||
import datetime
|
import datetime
|
||||||
|
@@ -72,14 +73,30 @@ def get_set_tracked_words_list():
         all_set_list.append((ter_set, num_words, elem))
     return all_set_list
 
-def is_term_tracked_in_global_level(term):
-    res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
+def get_regex_tracked_words_dict():
+    regex_list = r_serv_term.smembers('all:tracked_term:regex')
+    dict_tracked_regex = {}
+    for regex in regex_list:
+        dict_tracked_regex[regex] = re.compile(regex)
+    return dict_tracked_regex
+
+def is_term_tracked_in_global_level(term, term_type):
+    res = r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))
     if res:
         for elem_uuid in res:
             if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1':
                 return True
     return False
 
+def is_term_tracked_in_user_level(term, term_type, user_id):
+    res = r_serv_term.smembers('user:tracked_term:{}'.format(user_id))
+    if res:
+        for elem_uuid in res:
+            if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'tracked')== term:
+                if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'type')== term_type:
+                    return True
+    return False
+
 def parse_json_term_to_add(dict_input, user_id):
     term = dict_input.get('term', None)
     if not term:

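Since the uuid map is now namespaced by term type, the same string tracked as a word and as a regex no longer collide under one key. A minimal sketch of the new layout, assuming a redis-py connection to the term database (host, port and db below are illustrative; AIL builds r_serv_term from its config file):

    import redis

    # illustrative connection parameters
    r_serv_term = redis.StrictRedis(host='localhost', port=6382, db=2, decode_responses=True)

    # 'foo' tracked as a word and 'foo' tracked as a regex live under distinct keys
    word_uuids = r_serv_term.smembers('all:tracked_term_uuid:word:foo')
    regex_uuids = r_serv_term.smembers('all:tracked_term_uuid:regex:foo')
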
@@ -112,7 +129,10 @@ def parse_json_term_to_add(dict_input, user_id):
 
     # check if term already tracked in global
     if level==1:
-        if is_term_tracked_in_global_level(term):
+        if is_term_tracked_in_global_level(term, term_type):
+            return ({"status": "error", "reason": "Term already tracked"}, 409)
+    else:
+        if is_term_tracked_in_user_level(term, term_type, user_id):
             return ({"status": "error", "reason": "Term already tracked"}, 409)
 
     term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)

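For reference, an input of the shape this parser handles might look like the sketch below. Only `term` is confirmed by the hunk above; the remaining field names follow the `type`/`level`/`tags`/`mails` values the module stores and are assumptions here, and every value is made up:

    # hypothetical input for parse_json_term_to_add
    dict_input = {
        "term": "bitcoin mixer",  # the word/set/regex to track
        "type": "word",           # word, set or regex
        "level": 1,               # 1 = tracked globally, 0 = this user only
        "tags": ["infoleak:analyst-detection"],
        "mails": ["analyst@example.com"],
    }
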
@@ -174,7 +194,7 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
     r_serv_term.sadd('all:tracked_term:{}'.format(term_type), term)
 
     # create term - uuid map
-    r_serv_term.sadd('all:tracked_term_uuid:{}'.format(term), term_uuid)
+    r_serv_term.sadd('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
 
     # add display level set
     if level == 0: # user only

@@ -190,15 +210,22 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
     for mail in mails:
         r_serv_term.sadd('tracked_term:mail:{}'.format(term_uuid), mail)
 
+    # toggle refresh module tracker list/set
+    r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())
+
     return term_uuid
 
 def delete_term(term_uuid):
     term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked')
     term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type')
     term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level')
-    r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid)
-    r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid)
+    r_serv_term.srem('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
+    # Term not tracked by other users
+    if not r_serv_term.exists('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
+        r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)
+
+        # toggle refresh module tracker list/set
+        r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())
 
     if level == 0: # user only
         user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id')

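The `tracked_term:refresh:<type>` epoch gives the word/set/regex tracker modules a cheap way to notice changes: they keep their tracked list in memory and reload it only when the stored epoch is newer than the one they loaded at. A sketch of the consumer side, using only functions added in this commit (the caching pattern itself is an assumption):

    import time
    import Term

    # load once and remember when
    last_refresh = time.time()
    dict_regex = Term.get_regex_tracked_words_dict()

    # in the module's main loop: reload only if add_tracked_term() or
    # delete_term() bumped tracked_term:refresh:regex since our last load
    if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
        dict_regex = Term.get_regex_tracked_words_dict()
        last_refresh = time.time()
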
@@ -218,8 +245,8 @@ def delete_term(term_uuid):
     # remove item set
     r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))
 
-def get_term_uuid_list(term):
-    return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)))
+def get_term_uuid_list(term, term_type):
+    return list(r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)))
 
 def get_term_tags(term_uuid):
     return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid)))

@@ -227,10 +254,30 @@ def get_term_tags(term_uuid):
 def get_term_mails(term_uuid):
     return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid)))
 
-def add_tracked_item(term_uuid, item_id):
-    r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id)
+def add_tracked_item(term_uuid, item_id, item_date):
+    # track item
+    r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
+    # track nb item by date
+    r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)
+
+def create_token_statistics(item_date, word, nb):
+    r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1)
+    r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb)
+    r_serv_term.sadd('stat_token_history', item_date)
+
+def delete_token_statistics_by_date(item_date):
+    r_serv_term.delete('stat_token_per_item_by_day:{}'.format(item_date))
+    r_serv_term.delete('stat_token_total_by_day:{}'.format(item_date))
+    r_serv_term.srem('stat_token_history', item_date)
+
+def get_all_token_stat_history():
+    return r_serv_term.smembers('stat_token_history')
+
+def get_tracked_term_last_updated_by_type(term_type):
+    epoch_update = r_serv_term.get('tracked_term:refresh:{}'.format(term_type))
+    if not epoch_update:
+        epoch_update = 0
+    return float(epoch_update)

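Because `tracked_term:stat:<uuid>` and the two `stat_token_*_by_day` keys are sorted sets scored by count, the daily numbers can be read back directly with zscore/zrevrange. An illustrative read-back, reusing the r_serv_term connection sketched earlier (uuid and date are placeholders):

    # placeholders for illustration
    term_uuid = '00000000-0000-0000-0000-000000000000'
    item_date = '20190521'

    # number of items matching this tracked term on that day
    nb_seen = r_serv_term.zscore('tracked_term:stat:{}'.format(term_uuid), item_date)

    # ten most frequent tokens of the day, with their counts
    top_words = r_serv_term.zrevrange('stat_token_total_by_day:{}'.format(item_date), 0, 9, withscores=True)
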
@@ -107,7 +107,10 @@ operation_mode = 3
 ttl_duplicate = 86400
 default_unnamed_feed_name = unnamed_feeder
 
-[RegexForTermsFrequency]
+[TermTrackerMod]
+max_execution_time = 120
+
+[RegexTracker]
 max_execution_time = 60
 
 ##### Redis #####

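The two new sections give each tracker its own timeout budget. A minimal sketch of reading them back (the config path is illustrative; AIL resolves the real one from its environment):

    import configparser

    # RawConfigParser avoids interpolating '%' characters in other values
    cfg = configparser.RawConfigParser()
    cfg.read('bin/packages/config.cfg')
    term_tracker_timeout = cfg.getint('TermTrackerMod', 'max_execution_time')  # 120
    regex_tracker_timeout = cfg.getint('RegexTracker', 'max_execution_time')   # 60
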
@@ -11,62 +11,10 @@ from dateutil.rrule import rrule, DAILY
 import csv
 
 
-def listdirectory(path):
-    """Path Traversing Function.
-
-    :param path: -- The absolute pathname to a directory.
-
-    This function is returning all the absolute path of the files contained in
-    the argument directory.
-
-    """
-    fichier = []
-    for root, dirs, files in os.walk(path):
-
-        for i in files:
-
-            fichier.append(os.path.join(root, i))
-
-    return fichier
-
 clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
 """It filters out non-printable characters from the string it receives."""
 
 
-def create_dirfile(r_serv, directory, overwrite):
-    """Create a file of path.
-
-    :param r_serv: -- connexion to redis database
-    :param directory: -- The folder where to launch the listing of the .gz files
-
-    This function create a list in redis with inside the absolute path
-    of all the pastes needed to be proceeded by function using parallel
-    (like redis_words_ranking)
-
-    """
-    if overwrite:
-        r_serv.delete("filelist")
-
-        for x in listdirectory(directory):
-            r_serv.lpush("filelist", x)
-
-        publisher.info("The list was overwritten")
-
-    else:
-        if r_serv.llen("filelist") == 0:
-
-            for x in listdirectory(directory):
-                r_serv.lpush("filelist", x)
-
-            publisher.info("New list created")
-        else:
-
-            for x in listdirectory(directory):
-                r_serv.lpush("filelist", x)
-
-            publisher.info("The list was updated with new elements")
-
-
 def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
     """Create a csv file used with dygraph.
 

@@ -19,36 +19,17 @@ subscribe = Redis_Global
 [Attributes]
 subscribe = Redis_Global
 
-[Lines]
-subscribe = Redis_Global
-publish = Redis_LinesShort,Redis_LinesLong
-
 [DomClassifier]
 subscribe = Redis_Global
 
-[Tokenize]
-subscribe = Redis_LinesShort
-publish = Redis_Words
-
-[Curve]
-subscribe = Redis_Words
-publish = Redis_CurveManageTopSets,Redis_Tags
-
 [TermTrackerMod]
 subscribe = Redis_Global
 publish = Redis_Tags
 
-[RegexForTermsFrequency]
+[RegexTracker]
 subscribe = Redis_Global
 publish = Redis_Tags
 
-[SetForTermsFrequency]
-subscribe = Redis_Global
-publish = Redis_Tags
-
-[CurveManageTopSets]
-subscribe = Redis_CurveManageTopSets
-
 [Categ]
 subscribe = Redis_Global
 publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey