New feature in Terms: regex and set of words support (draft)

This commit is contained in:
Mokaddem 2017-03-28 17:42:44 +02:00
parent be93af6f5a
commit 2da4c572c7
8 changed files with 366 additions and 22 deletions

View file

@ -132,6 +132,10 @@ if __name__ == "__main__":
#Add more info for tracked terms
check_if_tracked_term(low_word, filename)
#send to RegexForTermsFrequency
to_send = "{} {} {}".format(filename, timestamp, word)
p.populate_set_out(to_send, 'RegexForTermsFrequency')
else:
if generate_new_graph:

View file

@ -149,6 +149,10 @@ function launching_scripts {
sleep 0.1
screen -S "Script" -X screen -t "CurveManageTopSets" bash -c './CurveManageTopSets.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "RegexForTermsFrequency" bash -c './RegexForTermsFrequency.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "SetForTermsFrequency" bash -c './SetForTermsFrequency.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "Indexer" bash -c './Indexer.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "Keys" bash -c './Keys.py; read x'

89
bin/RegexForTermsFrequency.py Executable file
View file

@ -0,0 +1,89 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar
import re
from Helper import Process
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'RegexForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#compile the regex
dico_regex = {}
for regex_str in server_term.smembers(TrackedRegexSet_Name):
dico_regex[regex_str] = re.compile(regex_str)
message = p.get_from_set()
# Regex Frequency
while True:
if message is not None:
filename, timestamp, word = message.split()
curr_set = top_termFreq_setName_day[0] + str(timestamp)
#iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items():
matched = compiled_regex.match(word)
if word == "amzinggg":
print("matched")
server_term.incr("thisistest")
if matched is not None: #there is a match
matched = matched.group(0)
# Add in Regex track set only if term is not in the blacklist
if matched not in server_term.smembers(BlackListTermsSet_Name):
set_name = 'regex_' + regex_str
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the num of occurence of this term
regex_value = int(server_term.hincrby(timestamp, regex_str, int(1)))
#1 term per paste
if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), regex_str, int(1)))
server_term.zincrby("per_paste_" + curr_set, regex_str, float(1))
server_term.zincrby(curr_set, regex_str, float(1))
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()

118
bin/SetForTermsFrequency.py Executable file
View file

@ -0,0 +1,118 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def add_quote_inside_tab(tab):
quoted_tab = "["
for elem in tab[1:-1].split(','):
elem = elem.lstrip().strip()
quoted_tab += "\"{}\", ".format(elem)
quoted_tab = quoted_tab[:-2] #remove trailing ,
quoted_tab += "]"
return quoted_tab
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'SetForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#get the dico and matching percent
dico_percent = {}
dico_set_tab = {}
for set_str in server_term.smembers(TrackedSetSet_Name):
tab_set = set_str[1:-1]
tab_set = add_quote_inside_tab(tab_set)
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
dico_percent[str(set_str)] = match_percent
tab_set = '["IoT", "mirai", "botnet", [50]]'
dico_set_tab[str(set_str)] = ast.literal_eval(tab_set)[:-1]
else:
continue
message = p.get_from_set()
while True:
if message is not None:
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
content = Paste.Paste(filename).get_p_content()
curr_set = top_termFreq_setName_day[0] + str(timestamp)
#iterate over the words of the file
match_dico = {}
for word in content:
for cur_set, array_set in dico_set_tab.items():
for w_set in array_set:
if word == w_set:
try:
match_dico[curr_set] += 1
except KeyError:
match_dico[curr_set] = 1
#compute matching %
for the_set, matchingNum in match_dico.items():
eff_percent = matchingNum / len(dico_set_tab[str(the_set)])
if eff_percent >= dico_percent[str(set_str)]:
print(the_set, "matched in", filename)
set_name = 'set_' + the_set
server_term.sadd(set_name, filename)
#consider the num of occurence of this set
set_value = int(server_term.hincrby(timestamp, the_set, int(1)))
# FIXME - avoid using per paste as a set is checked over the entire paste
#1 term per paste
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), the_set, int(1)))
server_term.zincrby("per_paste_" + curr_set, the_set, float(1))
server_term.zincrby(curr_set, the_set, float(1))
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()

View file

@ -32,7 +32,13 @@ publish = Redis_Words
[Curve]
subscribe = Redis_Words
publish = Redis_CurveManageTopSets
publish = Redis_CurveManageTopSets,Redis_RegexForTermsFrequency
[RegexForTermsFrequency]
subscribe = Redis_RegexForTermsFrequency
[SetForTermsFrequency]
subscribe = Redis_Global
[CurveManageTopSets]
subscribe = Redis_CurveManageTopSets

Binary file not shown.

Before

Width:  |  Height:  |  Size: 182 KiB

After

Width:  |  Height:  |  Size: 178 KiB

View file

@ -9,7 +9,7 @@ import datetime
import calendar
import flask
from flask import Flask, render_template, jsonify, request
import re
import Paste
# ============ VARIABLES ============
@ -18,6 +18,22 @@ import Flask_config
app = Flask_config.app
cfg = Flask_config.cfg
r_serv_term = Flask_config.r_serv_term
DEFAULT_MATCH_PERCENT = 50
#tracked
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedTermsDate_Name = "TrackedTermDate"
#black
BlackListTermsDate_Name = "BlackListTermDate"
BlackListTermsSet_Name = "BlackListSetTermSet"
#regex
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedRegexDate_Name = "TrackedRegexDate"
#set
TrackedSetSet_Name = "TrackedSetSet"
TrackedSetDate_Name = "TrackedSetDate"
# ============ FUNCTIONS ============
def Term_getValueOverRange(word, startDate, num_day, per_paste=""):
@ -47,15 +63,43 @@ def terms_management():
per_paste_text = ""
per_paste = 0
TrackedTermsSet_Name = "TrackedSetTermSet"
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsDate_Name = "TrackedTermDate"
BlackListTermsDate_Name = "BlackListTermDate"
today = datetime.datetime.now()
today = today.replace(hour=0, minute=0, second=0, microsecond=0)
today_timestamp = calendar.timegm(today.timetuple())
#Regex
trackReg_list = []
trackReg_list_values = []
trackReg_list_num_of_paste = []
for tracked_regex in r_serv_term.smembers(TrackedRegexSet_Name):
trackReg_list.append(tracked_regex)
value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
term_date = r_serv_term.hget(TrackedRegexDate_Name, tracked_regex)
set_paste_name = "regex_" + tracked_regex
trackReg_list_num_of_paste.append(r_serv_term.scard(set_paste_name))
term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded"
value_range.append(term_date)
trackReg_list_values.append(value_range)
#Set
trackSet_list = []
trackSet_list_values = []
trackSet_list_num_of_paste = []
for tracked_set in r_serv_term.smembers(TrackedSetSet_Name):
trackSet_list.append(tracked_set)
value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set)
set_paste_name = "set_" + tracked_set
trackSet_list_num_of_paste.append(r_serv_term.scard(set_paste_name))
term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded"
value_range.append(term_date)
trackSet_list_values.append(value_range)
#Tracked terms
track_list = []
track_list_values = []
track_list_num_of_paste = []
@ -72,21 +116,34 @@ def terms_management():
track_list_values.append(value_range)
#blacklist terms
black_list = []
for blacked_term in r_serv_term.smembers(BlackListTermsSet_Name):
term_date = r_serv_term.hget(BlackListTermsDate_Name, blacked_term)
term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded"
black_list.append([blacked_term, term_date])
return render_template("terms_management.html", black_list=black_list, track_list=track_list, track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste, per_paste=per_paste)
return render_template("terms_management.html",
black_list=black_list, track_list=track_list, trackReg_list=trackReg_list, trackSet_list=trackSet_list,
track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste,
trackReg_list_values=trackReg_list_values, trackReg_list_num_of_paste=trackReg_list_num_of_paste,
trackSet_list_values=trackSet_list_values, trackSet_list_num_of_paste=trackSet_list_num_of_paste,
per_paste=per_paste)
@app.route("/terms_management_query_paste/")
def terms_management_query_paste():
term = request.args.get('term')
TrackedTermsSet_Name = "TrackedSetTermSet"
paste_info = []
# check if regex or not
if term.startswith('/') and term.endswith('/'):
set_paste_name = "regex_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
elif term.startswith('\\') and term.endswith('\\'):
set_paste_name = "set_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
else:
set_paste_name = "tracked_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
@ -131,11 +188,6 @@ def terms_management_query():
@app.route("/terms_management_action/", methods=['GET'])
def terms_management_action():
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedTermsDate_Name = "TrackedTermDate"
BlackListTermsDate_Name = "BlackListTermDate"
BlackListTermsSet_Name = "BlackListSetTermSet"
today = datetime.datetime.now()
today = today.replace(microsecond=0)
today_timestamp = calendar.timegm(today.timetuple())
@ -149,10 +201,42 @@ def terms_management_action():
else:
if section == "followTerm":
if action == "add":
# check if regex/set or simple term
#regex
if term.startswith('/') and term.endswith('/'):
r_serv_term.sadd(TrackedRegexSet_Name, term)
r_serv_term.hset(TrackedRegexDate_Name, term, today_timestamp)
#set
elif term.startswith('\\') and term.endswith('\\'):
tab_term = term[1:-1]
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_term)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
set_to_add = term
else:
match_percent = DEFAULT_MATCH_PERCENT
set_to_add = "\\" + tab_term[:-1] + ", [{}]]\\".format(match_percent)
r_serv_term.sadd(TrackedSetSet_Name, set_to_add)
r_serv_term.hset(TrackedSetDate_Name, set_to_add, today_timestamp)
#simple term
else:
r_serv_term.sadd(TrackedTermsSet_Name, term.lower())
r_serv_term.hset(TrackedTermsDate_Name, term, today_timestamp)
r_serv_term.hset(TrackedTermsDate_Name, term.lower(), today_timestamp)
#del action
else:
if term.startswith('/') and term.endswith('/'):
r_serv_term.srem(TrackedRegexSet_Name, term)
r_serv_term.hdel(TrackedRegexDate_Name, term)
elif term.startswith('\\') and term.endswith('\\'):
r_serv_term.srem(TrackedSetSet_Name, term)
print(term)
r_serv_term.hdel(TrackedSetDate_Name, term)
else:
r_serv_term.srem(TrackedTermsSet_Name, term.lower())
r_serv_term.hdel(TrackedTermsDate_Name, term.lower())
elif section == "blacklistTerm":
if action == "add":
r_serv_term.sadd(BlackListTermsSet_Name, term.lower())

View file

@ -105,9 +105,16 @@
</div>
<div class="panel-body">
<div style="margin-bottom: 10px;">
<table>
<tr><td><b>Regex</b>: surround the term by '<b>/</b>'. </td> <td><b style="margin-left: 20px;">/([a-z])\w+([a-z])\n/</b></td></tr>
<tr><td><b>Set of terms</b>: surround the list by '<b>\</b>'. </td> <td><b style="margin-left: 20px;">\[term1, term2, ...]\</b></td></tr>
<tr><td> - To set a custom matching <b>threshold</b> (defaut=50), append it at the end as a inner list '<b>[thresh]</b>'. </td> <td><b style="margin-left: 20px;">\[term1, term2, ..., [75]]\</b></td></tr>
</table>
</div>
<div class="form-group input-group" style="margin-bottom: 30px;">
<span class="input-group-addon"><span class="fa fa-eye"></span></span>
<input id="followTermInput" class="form-control" placeholder="Term to track" type="text" style="max-width: 400px;">
<input id="followTermInput" class="form-control" placeholder="Term to track." type="text" style="max-width: 400px;">
<button id="followTermBtn" class="btn btn-success btn-interaction" style="margin-left: 10px;" data-section="followTerm" data-action="add"> Add term</button>
</div>
@ -124,6 +131,37 @@
</tr>
</thead>
<tbody>
<!-- SET -->
{% for set in trackSet_list %}
<tr style="background-color: #cdffca;">
<td>{{ set }}</td>
<td>{{ trackSet_list_values[loop.index0][3] }}</td>
<td>{{ trackSet_list_values[loop.index0][0] }}</td>
<td>{{ trackSet_list_values[loop.index0][1] }}</td>
<td>{{ trackSet_list_values[loop.index0][2] }}</td>
<td>{{ trackSet_list_num_of_paste[loop.index0] }}</td>
<td><p style="margin: 0px;">
<span data-toggle="modal" data-target="#mymodal" data-term="{{ set }}" ><button class="btn-link" data-toggle="tooltip" data-placement="right" title="Show concerned paste(s)"><span class="glyphicon glyphicon-info-sign"></span></button></span>
<button class="btn-link btn-interaction" data-toggle="tooltip" data-placement="left" title="Remove this term" data-content="{{ set }}" data-section="followTerm" data-action="delete"><span class="glyphicon glyphicon-trash"></span></button>
</p></td>
</tr>
{% endfor %}
<!-- REGEX -->
{% for regex in trackReg_list %}
<tr style="background-color: #fffdca;">
<td>{{ regex }}</td>
<td>{{ trackReg_list_values[loop.index0][3] }}</td>
<td>{{ trackReg_list_values[loop.index0][0] }}</td>
<td>{{ trackReg_list_values[loop.index0][1] }}</td>
<td>{{ trackReg_list_values[loop.index0][2] }}</td>
<td>{{ trackReg_list_num_of_paste[loop.index0] }}</td>
<td><p style="margin: 0px;">
<span data-toggle="modal" data-target="#mymodal" data-term="{{ regex }}" ><button class="btn-link" data-toggle="tooltip" data-placement="right" title="Show concerned paste(s)"><span class="glyphicon glyphicon-info-sign"></span></button></span>
<button class="btn-link btn-interaction" data-toggle="tooltip" data-placement="left" title="Remove this term" data-content="{{ regex }}" data-section="followTerm" data-action="delete"><span class="glyphicon glyphicon-trash"></span></button>
</p></td>
</tr>
{% endfor %}
<!-- Normal term -->
{% for term in track_list %}
<tr>
<td>{{ term }}</td>
@ -318,9 +356,9 @@ function perform_operation(){
var curr_section = $(this).attr('data-section');
var curr_action = $(this).attr('data-action');
if (curr_action == "add") {
var curr_term = $('#'+curr_section+'Input').val().toLowerCase();;
var curr_term = $('#'+curr_section+'Input').val();
} else {
var curr_term = $(this).attr('data-content').toLowerCase();;
var curr_term = $(this).attr('data-content');
}
var data_to_send = { section: curr_section, action:curr_action, term: curr_term};
@ -341,6 +379,7 @@ function perform_operation(){
} else if (json.action == "delete") {
// Find indexes of row which have the term in the first column
var index = table_track.rows().eq( 0 ).filter( function (rowIdx) {
console.log(table_track.cell( rowIdx, 0 ).data())
return table_track.cell( rowIdx, 0 ).data() === json.term;
} );
table_track.rows(index).remove().draw( false );