Mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-26 15:57:16 +00:00

commit 1008c7c4fe (parent d9bdfecef3)
chg: [Term Tracker] refactor term tracker word/set/regex modules + remove old modules

19 changed files with 304 additions and 981 deletions
OVERVIEW.md (37 changes)

@@ -138,12 +138,12 @@ Redis and ARDB overview

| Set - Key | Value |
| ------ | ------ |
| all:tracked_term_uuid:**tracked term** | **uuid - tracked term uuid** |
| all:tracked_term_uuid:**term type**:**tracked term** | **uuid - tracked term uuid** |

##### All Term Tracked items:
| Set - Key | Value |
| ------ | ------ |
| tracked_term:item:**uuid** | **item_id** |
| tracked_term:item:**uuid**:**date** | **item_id** |

##### All Term Tracked tags:
| Set - Key | Value |
| ------ | ------ |

@@ -155,6 +155,29 @@ Redis and ARDB overview

| ------ | ------ |
| tracked_term:mail:**uuid** | **mail** |

##### Refresh Tracked term:
| Key | Value |
| ------ | ------ |
| tracked_term:refresh:word | **last refreshed epoch** |
| tracked_term:refresh:set | - |
| tracked_term:refresh:regex | - |

##### Zset Stat Tracked term:
| Key | Field | Value |
| ------ | ------ | ------ |
| tracked_term:stat:**uuid** | **date** | **nb_seen** |

##### Stat token:
| Key | Field | Value |
| ------ | ------ | ------ |
| stat_token_total_by_day:**date** | **word** | **nb_seen** |
| stat_token_per_item_by_day:**date** | **word** | **nb_seen** |

| Set - Key | Value |
| ------ | ------ |
| stat_token_history | **date** |

## DB2 - TermFreq:

##### Set:

@@ -167,16 +190,6 @@ Redis and ARDB overview

| TrackedRegexSet | **tracked_regex** |
| | |
| | |
| global:TrackedSetTermSet | **tracked_term** |
| global:TrackedSetSet | **tracked_set** |
| global:TrackedRegexSet | **tracked_regex** |
| | |
| | |
| user:**user_id**:TrackedSetTermSet | **tracked_term** |
| user:**user_id**:TrackedSetSet | **tracked_set** |
| user:**user_id**:TrackedRegexSet | **tracked_regex** |
| | |
| | |
| tracked_**tracked_term** | **item_path** |
| set_**tracked_set** | **item_path** |
| regex_**tracked_regex** | **item_path** |
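To make the key layout above concrete, here is a minimal sketch (not part of the commit) of reading one tracker's data with redis-py. The connection values, the example term, and the example date are assumptions; the key names come straight from the tables above:

import redis

# Hypothetical connection values; the real ones come from packages/config.cfg
r = redis.StrictRedis(host='localhost', port=6382, db=2, decode_responses=True)

term_type = 'word'
term = 'bitcoin'

# one tracked term can map to several uuids (one per user or global tracker)
for term_uuid in r.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
    # items matched on a given day for this tracker
    items = r.smembers('tracked_term:item:{}:{}'.format(term_uuid, '20190802'))
    # number of items seen per day (zset member = date, score = nb_seen)
    stats = r.zrange('tracked_term:stat:{}'.format(term_uuid), 0, -1, withscores=True)
    print(term_uuid, len(items), stats)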
bin/Curve.py (184 changes, deleted)

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.

This module updates a .csv file used to draw curves representing selected
words and their occurrence per day.

..note:: The channel will have the name of the file created.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.


This Module is also used for term frequency.

/!\ Top set management is done in the module Curve_manage_top_set


Requirements
------------

*Need running Redis instances. (Redis)
*Categories files of words in /files/ need to be created
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar

from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def check_if_tracked_term(term, path):
    if term in server_term.smembers(TrackedTermsSet_Name):
        # add_paste to tracked_word_set
        set_name = "tracked_" + term
        server_term.sadd(set_name, path)
        print(term, 'added', set_name, '->', path)
        p.populate_set_out("New Term added", 'CurveManageTopSets')

        # Send a notification only when the member is in the set
        if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):

            # create mail body
            mail_body = ("AIL Framework,\n"
                         "New occurrence for term: " + term + "\n"
                         '' + full_paste_url + path)

            # Send to every associated email address
            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
                sendEmailNotification(email, 'Term', mail_body)

            # tag paste
            for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + term):
                msg = '{};{}'.format(tag, path)
                p.populate_set_out(msg, 'Tags')


def getValueOverRange(word, startDate, num_day):
    to_return = 0
    for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
        value = server_term.hget(timestamp, word)
        to_return += int(value) if value is not None else 0
    return to_return


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Curve'
    p = Process(config_section)

    # REDIS #
    r_serv1 = redis.StrictRedis(
        host=p.config.get("ARDB_Curve", "host"),
        port=p.config.get("ARDB_Curve", "port"),
        db=p.config.get("ARDB_Curve", "db"),
        decode_responses=True)

    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script Curve started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # FILE CURVE SECTION #
    csv_path = os.path.join(os.environ['AIL_HOME'],
                            p.config.get("Directories", "wordtrending_csv"))
    wordfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "wordsfile"))

    message = p.get_from_set()
    prec_filename = None
    generate_new_graph = False

    # Term Frequency
    top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
    top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
    top_termFreq_setName_month = ["TopTermFreq_set_month", 31]

    while True:

        if message is not None:
            generate_new_graph = True

            filename, word, score = message.split()
            temp = filename.split('/')
            date = temp[-4] + temp[-3] + temp[-2]
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            low_word = word.lower()
            # Old curve with words in file
            r_serv1.hincrby(low_word, date, int(score))

            # Update redis
            # consider the num of occurrences of this term
            curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score)))
            # 1 term per paste
            curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1)))

            # Add in set only if term is not in the blacklist
            if low_word not in server_term.smembers(BlackListTermsSet_Name):
                # consider the num of occurrences of this term
                server_term.zincrby(curr_set, low_word, float(score))
                # 1 term per paste
                server_term.zincrby("per_paste_" + curr_set, low_word, float(1))

                # Add more info for tracked terms
                check_if_tracked_term(low_word, filename)

                # send to RegexForTermsFrequency
                to_send = "{} {} {}".format(filename, timestamp, word)
                p.populate_set_out(to_send, 'RegexForTermsFrequency')

        else:

            if generate_new_graph:
                generate_new_graph = False
                print('Building graph')
                today = datetime.date.today()
                year = today.year
                month = today.month

                lib_words.create_curve_with_word_file(r_serv1, csv_path,
                                                      wordfile_path, year,
                                                      month)

            publisher.debug("Script Curve is Idling")
            print("sleeping")
            time.sleep(10)

        message = p.get_from_set()
bin/CurveManageTopSets.py (166 changes, deleted)

@@ -1,166 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""

This module manages top sets for terms frequency.
Every 'refresh_rate' it updates the weekly and monthly sets.

"""

import redis
import time
import datetime
import copy
from pubsublogger import publisher
from packages import lib_words
import datetime
import calendar
import os
import configparser

# Config Variables
Refresh_rate = 60*5  # sec
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
num_day_month = 31
num_day_week = 7

top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]


def manage_top_set():
    startDate = datetime.datetime.now()
    startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0)
    startDate = calendar.timegm(startDate.timetuple())
    blacklist_size = int(server_term.scard(BlackListTermsSet_Name))

    dico = {}
    dico_per_paste = {}

    # Retrieve top data (max_card + blacklist_size) from days sets
    for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay):
        curr_set = top_termFreq_setName_day[0] + str(timestamp)
        array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)
        array_top_day_per_paste = server_term.zrevrangebyscore("per_paste_" + curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size)

        for word, value in array_top_day:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico.keys():
                    dico[word] += value
                else:
                    dico[word] = value

        for word, value in array_top_day_per_paste:
            if word not in server_term.smembers(BlackListTermsSet_Name):
                if word in dico_per_paste.keys():
                    dico_per_paste[word] += value
                else:
                    dico_per_paste[word] = value

        if timestamp == startDate - num_day_week*oneDay:
            dico_week = copy.deepcopy(dico)
            dico_week_per_paste = copy.deepcopy(dico_per_paste)

    # convert dico into sorted array
    array_month = []
    for w, v in dico.items():
        array_month.append((w, v))
    array_month.sort(key=lambda tup: -tup[1])
    array_month = array_month[0:20]

    array_week = []
    for w, v in dico_week.items():
        array_week.append((w, v))
    array_week.sort(key=lambda tup: -tup[1])
    array_week = array_week[0:20]

    # convert dico_per_paste into sorted array
    array_month_per_paste = []
    for w, v in dico_per_paste.items():
        array_month_per_paste.append((w, v))
    array_month_per_paste.sort(key=lambda tup: -tup[1])
    array_month_per_paste = array_month_per_paste[0:20]

    array_week_per_paste = []
    for w, v in dico_week_per_paste.items():
        array_week_per_paste.append((w, v))
    array_week_per_paste.sort(key=lambda tup: -tup[1])
    array_week_per_paste = array_week_per_paste[0:20]


    # suppress every term in top sets
    for curr_set, curr_num_day in top_termFreq_set_array[1:3]:
        for w in server_term.zrange(curr_set, 0, -1):
            server_term.zrem(curr_set, w)
        for w in server_term.zrange("per_paste_" + curr_set, 0, -1):
            server_term.zrem("per_paste_" + curr_set, w)

    # Add top terms from the sorted arrays in their respective sorted sets
    for elem in array_week:
        server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0])
    for elem in array_week_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_week[0], float(elem[1]), elem[0])

    for elem in array_month:
        server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0])
    for elem in array_month_per_paste:
        server_term.zadd("per_paste_" + top_termFreq_setName_month[0], float(elem[1]), elem[0])

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_" + "CurveManageTopSets" + "_" + str(os.getpid()), value)
    print("refreshed module")


if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # For Module Manager
    r_temp = redis.StrictRedis(
        host=cfg.get('RedisPubSub', 'host'),
        port=cfg.getint('RedisPubSub', 'port'),
        db=cfg.getint('RedisPubSub', 'db'),
        decode_responses=True)

    timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
    value = str(timestamp) + ", " + "-"
    r_temp.set("MODULE_" + "CurveManageTopSets" + "_" + str(os.getpid()), value)
    r_temp.sadd("MODULE_TYPE_" + "CurveManageTopSets", str(os.getpid()))

    server_term = redis.StrictRedis(
        host=cfg.get("ARDB_TermFreq", "host"),
        port=cfg.getint("ARDB_TermFreq", "port"),
        db=cfg.getint("ARDB_TermFreq", "db"),
        decode_responses=True)

    publisher.info("Script Curve_manage_top_set started")

    # Sent to the logging a description of the module
    publisher.info("Manage the top sets with the data created by the module curve.")

    manage_top_set()

    while True:
        # Get one message from the input queue (module only work if linked with a queue)
        time.sleep(Refresh_rate)  # sleep a long time then manage the set
        manage_top_set()
bin/DbCleaner.py (59 changes, new executable file)

@@ -0,0 +1,59 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The TermTracker Module
===================

"""
import os
import sys
import time
import datetime

from pubsublogger import publisher

import NotificationHelper

from packages import Date
from packages import Item
from packages import Term

def clean_term_db_stat_token():
    all_stat_date = Term.get_all_token_stat_history()

    list_date_to_keep = Date.get_date_range(31)
    for date in all_stat_date:
        if date not in list_date_to_keep:
            # remove history
            Term.delete_token_statistics_by_date(date)

    print('Term Stats Cleaned')


if __name__ == "__main__":

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("DbCleaner started")

    config_section = 'TermTrackerMod'

    # low priority
    time.sleep(180)

    daily_cleaner = True
    current_date = datetime.datetime.now().strftime("%Y%m%d")

    while True:

        if daily_cleaner:
            clean_term_db_stat_token()
            daily_cleaner = False
        else:
            sys.exit(0)
            time.sleep(600)

        new_date = datetime.datetime.now().strftime("%Y%m%d")
        if new_date != current_date:
            current_date = new_date
            daily_cleaner = True
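The retention rule is easiest to see with concrete dates. A small illustrative sketch (the dates are made up; the real list comes from Date.get_date_range(31), added in bin/packages/Date.py further down):

# Illustrative only: if today were 20190802, Date.get_date_range(31) would
# return the 32 dates from 20190702 through 20190802 as YYYYMMDD strings.
list_date_to_keep = ['20190702', '20190703', '20190802']   # abbreviated
all_stat_date = ['20190615', '20190702', '20190802']       # from stat_token_history

for date in all_stat_date:
    if date not in list_date_to_keep:
        # only 20190615 falls outside the window; its keys
        # stat_token_per_item_by_day:20190615, stat_token_total_by_day:20190615
        # and its stat_token_history member would be deleted
        print('would delete token statistics for', date)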
bin/Dir.py (48 changes, deleted)

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import argparse
import redis
from pubsublogger import publisher
from packages.lib_words import create_dirfile
import configparser


def main():
    """Main Function"""

    # CONFIG #
    cfg = configparser.ConfigParser()
    cfg.read('./packages/config.cfg')

    parser = argparse.ArgumentParser(
        description='''This script is a part of the Analysis Information Leak
        framework. It creates a redis list called "listfile" which contains
        the absolute filename of all the files from the directory given in
        the argument "directory".''',
        epilog='Example: ./Dir.py /home/2013/03/')

    parser.add_argument('directory', type=str,
                        help='The directory to run inside', action='store')

    parser.add_argument('-db', type=int, default=0,
                        help='The name of the Redis DB (default 0)',
                        choices=[0, 1, 2, 3, 4], action='store')

    parser.add_argument('-ow', help='trigger the overwriting mode',
                        action='store_true')

    args = parser.parse_args()

    r_serv = redis.StrictRedis(host=cfg.get("Redis_Queues", "host"),
                               port=cfg.getint("Redis_Queues", "port"),
                               db=cfg.getint("Redis_Queues", "db"),
                               decode_responses=True)

    publisher.port = 6380
    publisher.channel = "Script"

    create_dirfile(r_serv, args.directory, args.ow)

if __name__ == "__main__":
    main()
bin/LAUNCH.sh

@@ -153,14 +153,10 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Duplicates" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Duplicates.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Lines" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Lines.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DomClassifier.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Categ" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Categ.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Tokenize" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tokenize.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "CreditCards" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CreditCards.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"

@@ -175,13 +171,9 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Credential" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Credential.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Curve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Curve.py; read x"
    screen -S "Script_AIL" -X screen -t "TermTrackerMod" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./TermTrackerMod.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "CurveManageTopSets" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./CurveManageTopSets.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "RegexForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexForTermsFrequency.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "SetForTermsFrequency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SetForTermsFrequency.py; read x"
    screen -S "Script_AIL" -X screen -t "RegexTracker" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./RegexTracker.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Indexer" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Indexer.py; read x"
    sleep 0.1

@@ -213,6 +205,8 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "UpdateBackground" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./update-background.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./submit_paste.py; read x"
bin/Lines.py (85 changes, deleted)

@@ -1,85 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The ZMQ_PubSub_Lines Module
============================

This module is consuming the Redis-list created by the ZMQ_PubSub_Line_Q
Module.

It performs a sorting on the line's length and publishes/forwards them to
different channels:

*Channel 1 if max length(line) < max
*Channel 2 if max length(line) > max

The collected information about the processed pastes
(number of lines and maximum length line) is stored in Redis.

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (LevelDB & Redis)
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.

"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Lines'
    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='This script is a part of the Analysis Information \
                    Leak framework.')

    parser.add_argument(
        '-max', type=int, default=500,
        help='The limit between "short lines" and "long lines"',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
                 on channel Longlines, Shortlines"
    publisher.info(tmp_string)

    while True:
        try:
            message = p.get_from_set()
            print(message)
            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Tokeniser is idling 10s")
                time.sleep(10)
                continue

            # FIXME do it in the paste class
            lines_infos = PST.get_lines_info()
            PST.save_attribute_redis("p_nb_lines", lines_infos[0])
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
            print(PST.p_rel_path)
            if lines_infos[1] < args.max:
                p.populate_set_out(PST.p_rel_path, 'LinesShort')
            else:
                p.populate_set_out(PST.p_rel_path, 'LinesLong')
        except IOError:
            print("CRC Checksum Error on : ", PST.p_rel_path)
@@ -9,7 +9,6 @@ import time
import datetime
import redis
import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process
bin/RegexForTermsFrequency.py (157 changes, deleted)

@@ -1,157 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import Paste
import calendar
import re
import signal
import time
from Helper import Process
# Email notifications
from NotificationHelper import *


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

# Config Variables
DICO_REFRESH_TIME = 60  # s

BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="


def refresh_dicos():
    dico_regex = {}
    dico_regexname_to_redis = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
        dico_regexname_to_redis[regex_str[1:-1]] = regex_str

    return dico_regex, dico_regexname_to_redis

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # compile the regex
    dico_refresh_cooldown = time.time()
    dico_regex, dico_regexname_to_redis = refresh_dicos()

    message = p.get_from_set()

    # Regex Frequency
    while True:

        if message is not None:
            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
                dico_refresh_cooldown = time.time()
                dico_regex, dico_regexname_to_redis = refresh_dicos()
                print('dico got refreshed')

            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

            curr_set = top_termFreq_setName_day[0] + str(timestamp)
            paste = Paste.Paste(filename)
            content = paste.get_p_content()

            # iterate the word with the regex
            for regex_str, compiled_regex in dico_regex.items():

                signal.alarm(max_execution_time)
                try:
                    matched = compiled_regex.search(content)
                except TimeoutException:
                    print("{0} processing timeout".format(paste.p_rel_path))
                    continue
                else:
                    signal.alarm(0)

                if matched is not None:  # there is a match
                    print('regex matched {}'.format(regex_str))
                    matched = matched.group(0)
                    regex_str_complete = "/" + regex_str + "/"
                    # Add in Regex track set only if term is not in the blacklist
                    if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name):
                        # Send a notification only when the member is in the set
                        if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                            # create mail body
                            mail_body = ("AIL Framework,\n"
                                         "New occurrence for regex: " + regex_str + "\n"
                                         '' + full_paste_url + filename)

                            # Send to every associated email address
                            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
                                sendEmailNotification(email, 'Term', mail_body)

                        # tag paste
                        for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete):
                            msg = '{};{}'.format(tag, filename)
                            p.populate_set_out(msg, 'Tags')

                        set_name = 'regex_' + dico_regexname_to_redis[regex_str]
                        new_to_the_set = server_term.sadd(set_name, filename)
                        new_to_the_set = True if new_to_the_set == 1 else False

                        # consider the num of occurrences of this term
                        regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
                        # 1 term per paste
                        if new_to_the_set:
                            regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
                            server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
                            server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
                        else:
                            pass

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)

        message = p.get_from_set()
bin/RegexTracker.py (96 changes, new executable file)

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for regex tracking.
It processes every paste coming from the global module and tests the regexes
supplied in the term webpage.

"""
import os
import re
import sys
import time
import signal

from Helper import Process
from pubsublogger import publisher

import NotificationHelper

from packages import Item
from packages import Term

full_item_url = "/showsavedpaste/?paste="
mail_body_template = "AIL Framework,\nNew occurrence for term tracked regex: {}\nitem id: {}\nurl: {}{}"

dict_regex_tracked = Term.get_regex_tracked_words_dict()
last_refresh = time.time()

class TimeoutException(Exception):
    pass
def timeout_handler(signum, frame):
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)

def new_term_found(term, term_type, item_id, item_date):
    uuid_list = Term.get_term_uuid_list(term, 'regex')
    print('new tracked term found: {} in {}'.format(term, item_id))

    for term_uuid in uuid_list:
        Term.add_tracked_item(term_uuid, item_id, item_date)

        tags_to_add = Term.get_term_tags(term_uuid)
        for tag in tags_to_add:
            msg = '{};{}'.format(tag, item_id)
            p.populate_set_out(msg, 'Tags')

        mail_to_notify = Term.get_term_mails(term_uuid)
        if mail_to_notify:
            mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
            for mail in mail_to_notify:
                NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script RegexTracker started")

    config_section = 'RegexTracker'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    # Regex Frequency
    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            for regex in dict_regex_tracked:

                signal.alarm(max_execution_time)
                try:
                    matched = dict_regex_tracked[regex].search(item_content)
                except TimeoutException:
                    print("{0} processing timeout".format(item_id))
                    continue
                else:
                    signal.alarm(0)

                if matched:
                    new_term_found(regex, 'regex', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh < Term.get_tracked_term_last_updated_by_type('regex'):
            dict_regex_tracked = Term.get_regex_tracked_words_dict()
            last_refresh = time.time()
            print('Tracked set refreshed')
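A tracker like this is fed through the Term package API changed below. A minimal sketch (not part of the commit) of registering a regex so RegexTracker picks it up on its next refresh cycle; the user id, the regex, and the tag value are illustrative assumptions, while the function and its signature come from the Term.py diff below:

from packages import Term

# level=1 registers the tracker globally; level=0 keeps it per user.
# add_tracked_term() also bumps tracked_term:refresh:regex, which the
# RegexTracker loop polls via get_tracked_term_last_updated_by_type('regex').
term_uuid = Term.add_tracked_term('amazon\.fr', 'regex', 'user@example.org', 1,
                                  ['infoleak:automatic-detection="tracked-regex"'], [])
print('tracker created:', term_uuid)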
bin/SetForTermsFrequency.py (151 changes, deleted)

@@ -1,151 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and tests the sets
supplied in the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process

# Email notifications
from NotificationHelper import *

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"

top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# create direct link in mail
full_paste_url = "/showsavedpaste/?paste="

def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.lstrip().strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2]  # remove trailing ,
    quoted_tab += "]"
    return str(quoted_tab)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # get the dico and matching percent
    dico_percent = {}
    dico_set_tab = {}
    dico_setname_to_redis = {}
    for set_str in server_term.smembers(TrackedSetSet_Name):
        tab_set = set_str[1:-1]
        tab_set = add_quote_inside_tab(tab_set)
        perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
        if perc_finder is not None:
            match_percent = perc_finder.group(0)[1:-1]
            dico_percent[tab_set] = float(match_percent)
            dico_set_tab[tab_set] = ast.literal_eval(tab_set)
            dico_setname_to_redis[tab_set] = set_str
        else:
            continue

    message = p.get_from_set()

    while True:

        if message is not None:
            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            content = Paste.Paste(filename).get_p_content()

            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            # iterate over the words of the file
            match_dico = {}
            for word in content.split():
                for cur_set, array_set in dico_set_tab.items():
                    for w_set in array_set[:-1]:  # avoid the percent matching
                        if word == w_set:
                            try:
                                match_dico[str(array_set)] += 1
                            except KeyError:
                                match_dico[str(array_set)] = 1

            # compute matching %
            for the_set, matchingNum in match_dico.items():
                eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100  # -1 bc of the percent matching
                if eff_percent >= dico_percent[the_set]:
                    # Send a notification only when the member is in the set
                    if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                        # create mail body
                        mail_body = ("AIL Framework,\n"
                                     "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n"
                                     '' + full_paste_url + filename)

                        # Send to every associated email address
                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            sendEmailNotification(email, 'Term', mail_body)

                    # tag paste
                    for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                        msg = '{};{}'.format(tag, filename)
                        p.populate_set_out(msg, 'Tags')

                    print(the_set, "matched in", filename)
                    set_name = 'set_' + dico_setname_to_redis[the_set]
                    new_to_the_set = server_term.sadd(set_name, filename)
                    new_to_the_set = True if new_to_the_set == 1 else False

                    # consider the num of occurrences of this set
                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))

                    # FIXME - avoid using per paste as a set is checked over the entire paste
                    # 1 term per paste
                    if new_to_the_set:
                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
                        server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)

        message = p.get_from_set()
bin/TermTrackerMod.py

@@ -8,13 +8,14 @@ The TermTracker Module
import os
import sys
import time
import signal

from Helper import Process
from pubsublogger import publisher

import NotificationHelper

from packages import Paste
from packages import Item
from packages import Term

sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))

@@ -26,13 +27,22 @@ mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\n

# loads tracked words
list_tracked_words = Term.get_tracked_words_list()
last_refresh_word = time.time()
set_tracked_words_list = Term.get_set_tracked_words_list()
last_refresh_set = time.time()

def new_term_found(term, term_type, item_id):
    uuid_list = Term.get_term_uuid_list(term)
class TimeoutException(Exception):
    pass
def timeout_handler(signum, frame):
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)

def new_term_found(term, term_type, item_id, item_date):
    uuid_list = Term.get_term_uuid_list(term, term_type)
    print('new tracked term found: {} in {}'.format(term, item_id))

    for term_uuid in uuid_list:
        Term.add_tracked_item(term_uuid, item_id)
        Term.add_tracked_item(term_uuid, item_id, item_date)

        tags_to_add = Term.get_term_tags(term_uuid)
        for tag in tags_to_add:

@@ -52,28 +62,38 @@ if __name__ == "__main__":
    publisher.channel = "Script"
    publisher.info("Script TermTrackerMod started")

    #config_section = 'TermTrackerMod'
    config_section = 'TermTrackerMod'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:

        item_id = p.get_from_set()
        item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz'
        #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz'

        if message is not None:
        if item_id is not None:

            paste = Paste.Paste(item_id)
            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())
            signal.alarm(max_execution_time)
            try:
                dict_words_freq = Term.get_text_word_frequency(item_content)
            except TimeoutException:
                print("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)

            # create token statistics
            for word in dict_words_freq:
                Term.create_token_statistics(item_date, word, dict_words_freq[word])

            # check solo words
            for word in list_tracked_words:
                if word in dict_words_freq:
                    new_term_found(word, 'word', item_id)
                    new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in set_tracked_words_list:

@@ -86,7 +106,19 @@ if __name__ == "__main__":
                if word in dict_words_freq:
                    nb_uniq_word += 1
            if nb_uniq_word >= nb_words_threshold:
                new_term_found(word_set, 'set', item_id)
                new_term_found(word_set, 'set', item_id, item_date)

        else:
            time.sleep(5)

        # refresh Tracked term
        if last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
            list_tracked_words = Term.get_tracked_words_list()
            last_refresh_word = time.time()
            print('Tracked word refreshed')

        if last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
            set_tracked_words_list = Term.get_set_tracked_words_list()
            last_refresh_set = time.time()
            print('Tracked set refreshed')
bin/Tokenize.py (71 changes, deleted)

@@ -1,71 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Tokenize Module
===================

This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
Module.

It tokenizes the content of the paste and publishes the result in the following
format:
channel_name+' '+/path/of/the/paste.gz+' '+tokenized_word+' '+scoring

..seealso:: Paste method (_get_top_words)

..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            signal.alarm(5)
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
            except TimeoutException:
                p.incr_module_timeout_statistic()
                print("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)
        else:
            publisher.debug("Tokeniser is idling 10s")
            time.sleep(10)
            print("Sleeping")
bin/packages/Date.py

@@ -1,5 +1,7 @@
#!/usr/bin/python3

import datetime

class Date(object):
    """docstring for Date"""
    def __init__(self, *args):

@@ -34,7 +36,6 @@ class Date(object):
        self.day = day

    def substract_day(self, numDay):
        import datetime
        computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay)
        comp_year = str(computed_date.year)
        comp_month = str(computed_date.month).zfill(2)

@@ -50,3 +51,12 @@ def date_substract_day(date, num_day=1):
    new_date = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:8])) - datetime.timedelta(num_day)
    new_date = str(new_date).replace('-', '')
    return new_date

def get_date_range(num_day):
    curr_date = datetime.date.today()
    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
    date_list = []

    for i in range(0, num_day+1):
        date_list.append(date.substract_day(i))
    return list(reversed(date_list))
bin/packages/Item.py

@@ -2,10 +2,13 @@
# -*-coding:UTF-8 -*

import os
import sys
import gzip
import redis

sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Date
import Tag
bin/packages/Term.py

@@ -4,6 +4,7 @@
import os
import re
import sys
import time
import uuid
import redis
import datetime

@@ -72,14 +73,30 @@ def get_set_tracked_words_list():
        all_set_list.append((ter_set, num_words, elem))
    return all_set_list

def is_term_tracked_in_global_level(term):
    res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
def get_regex_tracked_words_dict():
    regex_list = r_serv_term.smembers('all:tracked_term:regex')
    dict_tracked_regex = {}
    for regex in regex_list:
        dict_tracked_regex[regex] = re.compile(regex)
    return dict_tracked_regex

def is_term_tracked_in_global_level(term, term_type):
    res = r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term))
    if res:
        for elem_uuid in res:
            if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1':
                return True
    return False

def is_term_tracked_in_user_level(term, term_type, user_id):
    res = r_serv_term.smembers('user:tracked_term:{}'.format(user_id))
    if res:
        for elem_uuid in res:
            if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'tracked')== term:
                if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'type')== term_type:
                    return True
    return False

def parse_json_term_to_add(dict_input, user_id):
    term = dict_input.get('term', None)
    if not term:

@@ -112,7 +129,10 @@ def parse_json_term_to_add(dict_input, user_id):

    # check if term already tracked in global
    if level==1:
        if is_term_tracked_in_global_level(term):
        if is_term_tracked_in_global_level(term, term_type):
            return ({"status": "error", "reason": "Term already tracked"}, 409)
    else:
        if is_term_tracked_in_user_level(term, term_type, user_id):
            return ({"status": "error", "reason": "Term already tracked"}, 409)

    term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)

@@ -174,7 +194,7 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
    r_serv_term.sadd('all:tracked_term:{}'.format(term_type), term)

    # create term - uuid map
    r_serv_term.sadd('all:tracked_term_uuid:{}'.format(term), term_uuid)
    r_serv_term.sadd('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)

    # add display level set
    if level == 0:  # user only

@@ -190,15 +210,22 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
    for mail in mails:
        r_serv_term.sadd('tracked_term:mail:{}'.format(term_uuid), mail)

    # toggle refresh module tracker list/set
    r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())

    return term_uuid

def delete_term(term_uuid):
    term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked')
    term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type')
    term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level')
    r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid)
    r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid)
    r_serv_term.srem('all:tracked_term_uuid:{}:{}'.format(term_type, term), term_uuid)
    # Term not tracked by other users
    if not r_serv_term.exists('all:tracked_term_uuid:{}:{}'.format(term_type, term)):
        r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)

        # toggle refresh module tracker list/set
        r_serv_term.set('tracked_term:refresh:{}'.format(term_type), time.time())

    if level == 0:  # user only
        user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id')

@@ -218,8 +245,8 @@ def delete_term(term_uuid):
    # remove item set
    r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))

def get_term_uuid_list(term):
    return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)))
def get_term_uuid_list(term, term_type):
    return list(r_serv_term.smembers('all:tracked_term_uuid:{}:{}'.format(term_type, term)))

def get_term_tags(term_uuid):
    return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid)))

@@ -227,10 +254,30 @@ def get_term_tags(term_uuid):
def get_term_mails(term_uuid):
    return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid)))

def add_tracked_item(term_uuid, item_id):
    r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id)
def add_tracked_item(term_uuid, item_id, item_date):
    # track item
    r_serv_term.sadd('tracked_term:item:{}:{}'.format(term_uuid, item_date), item_id)
    # track nb item by date
    r_serv_term.zincrby('tracked_term:stat:{}'.format(term_uuid), item_date, 1)

def create_token_statistics(item_date, word, nb):
    r_serv_term.zincrby('stat_token_per_item_by_day:{}'.format(item_date), word, 1)
    r_serv_term.zincrby('stat_token_total_by_day:{}'.format(item_date), word, nb)
    r_serv_term.sadd('stat_token_history', item_date)

def delete_token_statistics_by_date(item_date):
    r_serv_term.delete('stat_token_per_item_by_day:{}'.format(item_date))
    r_serv_term.delete('stat_token_total_by_day:{}'.format(item_date))
    r_serv_term.srem('stat_token_history', item_date)

def get_all_token_stat_history():
    return r_serv_term.smembers('stat_token_history')

def get_tracked_term_last_updated_by_type(term_type):
    epoch_update = r_serv_term.get('tracked_term:refresh:{}'.format(term_type))
    if not epoch_update:
        epoch_update = 0
    return float(epoch_update)
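The refresh epoch added here is the contract between the web/API side and the tracker modules: every add or delete bumps tracked_term:refresh:<type>, and a module reloads its cached tracker list only when that epoch passes its own last refresh. A minimal sketch of the consumer side, mirroring the loops in TermTrackerMod and RegexTracker above (the processing step is elided):

import time
from packages import Term

tracked_words = Term.get_tracked_words_list()
last_refresh = time.time()

while True:
    # ... process one item against tracked_words ...

    # cheap check: one GET per loop instead of re-reading every tracker
    if last_refresh < Term.get_tracked_term_last_updated_by_type('word'):
        tracked_words = Term.get_tracked_words_list()
        last_refresh = time.time()
        print('Tracked word list refreshed')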
bin/packages/config.cfg

@@ -107,7 +107,10 @@ operation_mode = 3
ttl_duplicate = 86400
default_unnamed_feed_name = unnamed_feeder

[RegexForTermsFrequency]
[TermTrackerMod]
max_execution_time = 120

[RegexTracker]
max_execution_time = 60

##### Redis #####
bin/packages/lib_words.py

@@ -11,62 +11,10 @@ from dateutil.rrule import rrule, DAILY
import csv


def listdirectory(path):
    """Path Traversing Function.

    :param path: -- The absolute pathname to a directory.

    This function returns the absolute paths of all the files contained in
    the argument directory.

    """
    fichier = []
    for root, dirs, files in os.walk(path):

        for i in files:

            fichier.append(os.path.join(root, i))

    return fichier

clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
"""It filters out non-printable characters from the string it receives."""


def create_dirfile(r_serv, directory, overwrite):
    """Create a file of path.

    :param r_serv: -- connection to redis database
    :param directory: -- The folder where to launch the listing of the .gz files

    This function creates a list in redis containing the absolute path
    of all the pastes that need to be processed by functions using parallel
    (like redis_words_ranking)

    """
    if overwrite:
        r_serv.delete("filelist")

        for x in listdirectory(directory):
            r_serv.lpush("filelist", x)

        publisher.info("The list was overwritten")

    else:
        if r_serv.llen("filelist") == 0:

            for x in listdirectory(directory):
                r_serv.lpush("filelist", x)

            publisher.info("New list created")
        else:

            for x in listdirectory(directory):
                r_serv.lpush("filelist", x)

            publisher.info("The list was updated with new elements")


def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.
bin/packages/modules.cfg

@@ -19,36 +19,17 @@ subscribe = Redis_Global
[Attributes]
subscribe = Redis_Global

[Lines]
subscribe = Redis_Global
publish = Redis_LinesShort,Redis_LinesLong

[DomClassifier]
subscribe = Redis_Global

[Tokenize]
subscribe = Redis_LinesShort
publish = Redis_Words

[Curve]
subscribe = Redis_Words
publish = Redis_CurveManageTopSets,Redis_Tags

[TermTrackerMod]
subscribe = Redis_Global
publish = Redis_Tags

[RegexForTermsFrequency]
[RegexTracker]
subscribe = Redis_Global
publish = Redis_Tags

[SetForTermsFrequency]
subscribe = Redis_Global
publish = Redis_Tags

[CurveManageTopSets]
subscribe = Redis_CurveManageTopSets

[Categ]
subscribe = Redis_Global
publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey