Merge pull request #115 from mokaddem/regexInTerms

New features for terms frequency
This commit is contained in:
Alexandre Dulaunoy 2017-06-08 07:04:17 +02:00 committed by GitHub
commit 446a97c47a
9 changed files with 388 additions and 48 deletions

View file

@ -1,26 +0,0 @@
Global
Duplicates
Indexer
Attributes
Lines
DomClassifier
Tokenize
Curve
CurveManageTopSets
Categ
CreditCards
Mail
Onion
DumpValidOnion
Web
WebStats
SQLInjectionDetection
ModuleStats
Browse_warning_paste
SentimentAnalysis
Release
Credential
Cve
Phone
SourceCode
Keys

View file

@ -132,6 +132,10 @@ if __name__ == "__main__":
#Add more info for tracked terms
check_if_tracked_term(low_word, filename)
#send to RegexForTermsFrequency
to_send = "{} {} {}".format(filename, timestamp, word)
p.populate_set_out(to_send, 'RegexForTermsFrequency')
else:
if generate_new_graph:

View file

@ -157,6 +157,10 @@ function launching_scripts {
sleep 0.1
screen -S "Script" -X screen -t "CurveManageTopSets" bash -c './CurveManageTopSets.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "RegexForTermsFrequency" bash -c './RegexForTermsFrequency.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "SetForTermsFrequency" bash -c './SetForTermsFrequency.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "Indexer" bash -c './Indexer.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "Keys" bash -c './Keys.py; read x'

107
bin/RegexForTermsFrequency.py Executable file
View file

@ -0,0 +1,107 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
from Helper import Process
# Config Variables
DICO_REFRESH_TIME = 60 #s
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def refresh_dicos():
dico_regex = {}
dico_regexname_to_redis = {}
for regex_str in server_term.smembers(TrackedRegexSet_Name):
dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
dico_regexname_to_redis[regex_str[1:-1]] = regex_str
return dico_regex, dico_regexname_to_redis
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'RegexForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#compile the regex
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
message = p.get_from_set()
# Regex Frequency
while True:
if message is not None:
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
print('dico got refreshed')
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp)
content = Paste.Paste(filename).get_p_content()
#iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items():
matched = compiled_regex.search(content)
if matched is not None: #there is a match
print('regex matched {}'.format(regex_str))
matched = matched.group(0)
# Add in Regex track set only if term is not in the blacklist
if matched not in server_term.smembers(BlackListTermsSet_Name):
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the num of occurence of this term
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
#1 term per paste
if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
else:
pass
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()

122
bin/SetForTermsFrequency.py Executable file
View file

@ -0,0 +1,122 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def add_quote_inside_tab(tab):
quoted_tab = "["
for elem in tab[1:-1].split(','):
elem = elem.lstrip().strip()
quoted_tab += "\'{}\', ".format(elem)
quoted_tab = quoted_tab[:-2] #remove trailing ,
quoted_tab += "]"
return str(quoted_tab)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'SetForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#get the dico and matching percent
dico_percent = {}
dico_set_tab = {}
dico_setname_to_redis = {}
for set_str in server_term.smembers(TrackedSetSet_Name):
tab_set = set_str[1:-1]
tab_set = add_quote_inside_tab(tab_set)
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
dico_percent[tab_set] = float(match_percent)
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
dico_setname_to_redis[tab_set] = set_str
else:
continue
message = p.get_from_set()
while True:
if message is not None:
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
content = Paste.Paste(filename).get_p_content()
curr_set = top_termFreq_setName_day[0] + str(timestamp)
#iterate over the words of the file
match_dico = {}
for word in content.split():
for cur_set, array_set in dico_set_tab.items():
for w_set in array_set[:-1]: #avoid the percent matching
if word == w_set:
try:
match_dico[str(array_set)] += 1
except KeyError:
match_dico[str(array_set)] = 1
#compute matching %
for the_set, matchingNum in match_dico.items():
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
if eff_percent >= dico_percent[the_set]:
print(the_set, "matched in", filename)
set_name = 'set_' + dico_setname_to_redis[the_set]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the num of occurence of this set
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
# FIXME - avoid using per paste as a set is checked over the entire paste
#1 term per paste
if new_to_the_set:
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()

View file

@ -34,6 +34,12 @@ publish = Redis_Words
subscribe = Redis_Words
publish = Redis_CurveManageTopSets
[RegexForTermsFrequency]
subscribe = Redis_Global
[SetForTermsFrequency]
subscribe = Redis_Global
[CurveManageTopSets]
subscribe = Redis_CurveManageTopSets

Binary file not shown.

Before

Width:  |  Height:  |  Size: 182 KiB

After

Width:  |  Height:  |  Size: 188 KiB

View file

@ -9,7 +9,7 @@ import datetime
import calendar
import flask
from flask import Flask, render_template, jsonify, request
import re
import Paste
# ============ VARIABLES ============
@ -18,6 +18,22 @@ import Flask_config
app = Flask_config.app
cfg = Flask_config.cfg
r_serv_term = Flask_config.r_serv_term
DEFAULT_MATCH_PERCENT = 50
#tracked
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedTermsDate_Name = "TrackedTermDate"
#black
BlackListTermsDate_Name = "BlackListTermDate"
BlackListTermsSet_Name = "BlackListSetTermSet"
#regex
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedRegexDate_Name = "TrackedRegexDate"
#set
TrackedSetSet_Name = "TrackedSetSet"
TrackedSetDate_Name = "TrackedSetDate"
# ============ FUNCTIONS ============
def Term_getValueOverRange(word, startDate, num_day, per_paste=""):
@ -47,15 +63,43 @@ def terms_management():
per_paste_text = ""
per_paste = 0
TrackedTermsSet_Name = "TrackedSetTermSet"
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsDate_Name = "TrackedTermDate"
BlackListTermsDate_Name = "BlackListTermDate"
today = datetime.datetime.now()
today = today.replace(hour=0, minute=0, second=0, microsecond=0)
today_timestamp = calendar.timegm(today.timetuple())
#Regex
trackReg_list = []
trackReg_list_values = []
trackReg_list_num_of_paste = []
for tracked_regex in r_serv_term.smembers(TrackedRegexSet_Name):
trackReg_list.append(tracked_regex)
value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
term_date = r_serv_term.hget(TrackedRegexDate_Name, tracked_regex)
set_paste_name = "regex_" + tracked_regex
trackReg_list_num_of_paste.append(r_serv_term.scard(set_paste_name))
term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded"
value_range.append(term_date)
trackReg_list_values.append(value_range)
#Set
trackSet_list = []
trackSet_list_values = []
trackSet_list_num_of_paste = []
for tracked_set in r_serv_term.smembers(TrackedSetSet_Name):
trackSet_list.append(tracked_set)
value_range = Term_getValueOverRange(tracked_set, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set)
set_paste_name = "set_" + tracked_set
trackSet_list_num_of_paste.append(r_serv_term.scard(set_paste_name))
term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded"
value_range.append(term_date)
trackSet_list_values.append(value_range)
#Tracked terms
track_list = []
track_list_values = []
track_list_num_of_paste = []
@ -72,23 +116,36 @@ def terms_management():
track_list_values.append(value_range)
#blacklist terms
black_list = []
for blacked_term in r_serv_term.smembers(BlackListTermsSet_Name):
term_date = r_serv_term.hget(BlackListTermsDate_Name, blacked_term)
term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded"
black_list.append([blacked_term, term_date])
return render_template("terms_management.html", black_list=black_list, track_list=track_list, track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste, per_paste=per_paste)
return render_template("terms_management.html",
black_list=black_list, track_list=track_list, trackReg_list=trackReg_list, trackSet_list=trackSet_list,
track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste,
trackReg_list_values=trackReg_list_values, trackReg_list_num_of_paste=trackReg_list_num_of_paste,
trackSet_list_values=trackSet_list_values, trackSet_list_num_of_paste=trackSet_list_num_of_paste,
per_paste=per_paste)
@app.route("/terms_management_query_paste/")
def terms_management_query_paste():
term = request.args.get('term')
TrackedTermsSet_Name = "TrackedSetTermSet"
paste_info = []
set_paste_name = "tracked_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
# check if regex or not
if term.startswith('/') and term.endswith('/'):
set_paste_name = "regex_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
elif term.startswith('\\') and term.endswith('\\'):
set_paste_name = "set_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
else:
set_paste_name = "tracked_" + term
track_list_path = r_serv_term.smembers(set_paste_name)
for path in track_list_path:
paste = Paste.Paste(path)
@ -131,11 +188,6 @@ def terms_management_query():
@app.route("/terms_management_action/", methods=['GET'])
def terms_management_action():
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedTermsDate_Name = "TrackedTermDate"
BlackListTermsDate_Name = "BlackListTermDate"
BlackListTermsSet_Name = "BlackListSetTermSet"
today = datetime.datetime.now()
today = today.replace(microsecond=0)
today_timestamp = calendar.timegm(today.timetuple())
@ -149,10 +201,42 @@ def terms_management_action():
else:
if section == "followTerm":
if action == "add":
r_serv_term.sadd(TrackedTermsSet_Name, term.lower())
r_serv_term.hset(TrackedTermsDate_Name, term, today_timestamp)
# check if regex/set or simple term
#regex
if term.startswith('/') and term.endswith('/'):
r_serv_term.sadd(TrackedRegexSet_Name, term)
r_serv_term.hset(TrackedRegexDate_Name, term, today_timestamp)
#set
elif term.startswith('\\') and term.endswith('\\'):
tab_term = term[1:-1]
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_term)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
set_to_add = term
else:
match_percent = DEFAULT_MATCH_PERCENT
set_to_add = "\\" + tab_term[:-1] + ", [{}]]\\".format(match_percent)
r_serv_term.sadd(TrackedSetSet_Name, set_to_add)
r_serv_term.hset(TrackedSetDate_Name, set_to_add, today_timestamp)
#simple term
else:
r_serv_term.sadd(TrackedTermsSet_Name, term.lower())
r_serv_term.hset(TrackedTermsDate_Name, term.lower(), today_timestamp)
#del action
else:
r_serv_term.srem(TrackedTermsSet_Name, term.lower())
if term.startswith('/') and term.endswith('/'):
r_serv_term.srem(TrackedRegexSet_Name, term)
r_serv_term.hdel(TrackedRegexDate_Name, term)
elif term.startswith('\\') and term.endswith('\\'):
r_serv_term.srem(TrackedSetSet_Name, term)
print(term)
r_serv_term.hdel(TrackedSetDate_Name, term)
else:
r_serv_term.srem(TrackedTermsSet_Name, term.lower())
r_serv_term.hdel(TrackedTermsDate_Name, term.lower())
elif section == "blacklistTerm":
if action == "add":
r_serv_term.sadd(BlackListTermsSet_Name, term.lower())

View file

@ -105,9 +105,16 @@
</div>
<div class="panel-body">
<div style="margin-bottom: 10px;">
<table>
<tr><td><b>Regex</b>: surround the term by '<b>/</b>'. </td> <td><b style="margin-left: 20px;">/([a-z])\w+([a-z])\n/</b></td></tr>
<tr><td><b>Set of terms</b>: surround the list by '<b>\</b>'. </td> <td><b style="margin-left: 20px;">\[term1, term2, ...]\</b></td></tr>
<tr><td> - To set a custom matching <b>threshold</b> (defaut=50), append it at the end as a inner list '<b>[thresh]</b>'. </td> <td><b style="margin-left: 20px;">\[term1, term2, ..., [75]]\</b></td></tr>
</table>
</div>
<div class="form-group input-group" style="margin-bottom: 30px;">
<span class="input-group-addon"><span class="fa fa-eye"></span></span>
<input id="followTermInput" class="form-control" placeholder="Term to track" type="text" style="max-width: 400px;">
<input id="followTermInput" class="form-control" placeholder="Term to track." type="text" style="max-width: 400px;">
<button id="followTermBtn" class="btn btn-success btn-interaction" style="margin-left: 10px;" data-section="followTerm" data-action="add"> Add term</button>
</div>
@ -124,6 +131,37 @@
</tr>
</thead>
<tbody>
<!-- SET -->
{% for set in trackSet_list %}
<tr style="background-color: #cdffca;">
<td>{{ set }}</td>
<td>{{ trackSet_list_values[loop.index0][3] }}</td>
<td>{{ trackSet_list_values[loop.index0][0] }}</td>
<td>{{ trackSet_list_values[loop.index0][1] }}</td>
<td>{{ trackSet_list_values[loop.index0][2] }}</td>
<td>{{ trackSet_list_num_of_paste[loop.index0] }}</td>
<td><p style="margin: 0px;">
<span data-toggle="modal" data-target="#mymodal" data-term="{{ set }}" ><button class="btn-link" data-toggle="tooltip" data-placement="right" title="Show concerned paste(s)"><span class="glyphicon glyphicon-info-sign"></span></button></span>
<button class="btn-link btn-interaction" data-toggle="tooltip" data-placement="left" title="Remove this term" data-content="{{ set }}" data-section="followTerm" data-action="delete"><span class="glyphicon glyphicon-trash"></span></button>
</p></td>
</tr>
{% endfor %}
<!-- REGEX -->
{% for regex in trackReg_list %}
<tr style="background-color: #fffdca;">
<td>{{ regex }}</td>
<td>{{ trackReg_list_values[loop.index0][3] }}</td>
<td>{{ trackReg_list_values[loop.index0][0] }}</td>
<td>{{ trackReg_list_values[loop.index0][1] }}</td>
<td>{{ trackReg_list_values[loop.index0][2] }}</td>
<td>{{ trackReg_list_num_of_paste[loop.index0] }}</td>
<td><p style="margin: 0px;">
<span data-toggle="modal" data-target="#mymodal" data-term="{{ regex }}" ><button class="btn-link" data-toggle="tooltip" data-placement="right" title="Show concerned paste(s)"><span class="glyphicon glyphicon-info-sign"></span></button></span>
<button class="btn-link btn-interaction" data-toggle="tooltip" data-placement="left" title="Remove this term" data-content="{{ regex }}" data-section="followTerm" data-action="delete"><span class="glyphicon glyphicon-trash"></span></button>
</p></td>
</tr>
{% endfor %}
<!-- Normal term -->
{% for term in track_list %}
<tr>
<td>{{ term }}</td>
@ -251,7 +289,7 @@
//console.log(data);
event.preventDefault();
var the_modal=$(this);
var url = "{{ url_for('terms_management_query_paste') }}?term=" + $(this).attr('data-term');
var url = "{{ url_for('terms_management_query_paste') }}?term=" + encodeURIComponent($(this).attr('data-term'));
$.getJSON(url, function (data) {
if (data.length != 0) {
var html_to_add = "";
@ -318,9 +356,9 @@ function perform_operation(){
var curr_section = $(this).attr('data-section');
var curr_action = $(this).attr('data-action');
if (curr_action == "add") {
var curr_term = $('#'+curr_section+'Input').val().toLowerCase();;
var curr_term = $('#'+curr_section+'Input').val();
} else {
var curr_term = $(this).attr('data-content').toLowerCase();;
var curr_term = $(this).attr('data-content');
}
var data_to_send = { section: curr_section, action:curr_action, term: curr_term};
@ -341,6 +379,7 @@ function perform_operation(){
} else if (json.action == "delete") {
// Find indexes of row which have the term in the first column
var index = table_track.rows().eq( 0 ).filter( function (rowIdx) {
console.log(table_track.cell( rowIdx, 0 ).data())
return table_track.cell( rowIdx, 0 ).data() === json.term;
} );
table_track.rows(index).remove().draw( false );