Updated TermsFrequency-related modules and fixed bugs (encoding, behaviors, ...)

This commit is contained in:
Mokaddem 2017-04-18 15:28:21 +02:00
parent 2da4c572c7
commit c8baabd882
7 changed files with 58 additions and 62 deletions

View file

@ -1,26 +0,0 @@
Global
Duplicates
Indexer
Attributes
Lines
DomClassifier
Tokenize
Curve
CurveManageTopSets
Categ
CreditCards
Mail
Onion
DumpValidOnion
Web
WebStats
SQLInjectionDetection
ModuleStats
Browse_warning_paste
SentimentAnalysis
Release
Credential
Cve
Phone
SourceCode
Keys

View file

@ -8,6 +8,7 @@ import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
@ -16,6 +17,8 @@ import re
from Helper import Process
# Config Variables
DICO_REFRESH_TIME = 60 #s
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
@ -27,6 +30,15 @@ top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def refresh_dicos():
    """Rebuild the regex lookup tables from the tracked-regex Redis set.

    Every member of the ``TrackedRegexSet_Name`` set is read from
    ``server_term``. The first and last characters of each member are
    dropped (presumably the delimiters wrapped around the pattern when it
    was stored -- confirm against the code that fills the set) and the
    remaining text is compiled as a regular expression.

    Returns:
        A ``(dico_regex, dico_regexname_to_redis)`` tuple where
        ``dico_regex`` maps each stripped pattern string to its compiled
        regex and ``dico_regexname_to_redis`` maps the same stripped string
        back to the unmodified set member.
    """
    compiled_by_pattern = {}
    redis_name_by_pattern = {}
    for redis_member in server_term.smembers(TrackedRegexSet_Name):
        # Strip the surrounding characters once and reuse the result as the
        # key of both dictionaries.
        pattern = redis_member[1:-1]
        compiled_by_pattern[pattern] = re.compile(pattern)
        # Keep the original member so Redis keys can be built from the
        # exact value stored in the set.
        redis_name_by_pattern[pattern] = redis_member
    return compiled_by_pattern, redis_name_by_pattern
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
@ -44,10 +56,8 @@ if __name__ == "__main__":
publisher.info("RegexForTermsFrequency script started")
#compile the regex
dico_regex = {}
for regex_str in server_term.smembers(TrackedRegexSet_Name):
dico_regex[regex_str] = re.compile(regex_str)
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
message = p.get_from_set()
@ -55,32 +65,40 @@ if __name__ == "__main__":
while True:
if message is not None:
filename, timestamp, word = message.split()
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
print('dico got refreshed')
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp)
content = Paste.Paste(filename).get_p_content()
#iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items():
matched = compiled_regex.match(word)
if word == "amzinggg":
print("matched")
server_term.incr("thisistest")
matched = compiled_regex.search(content)
if matched is not None: #there is a match
print('regex matched {}'.format(regex_str))
matched = matched.group(0)
# Add in Regex track set only if term is not in the blacklist
if matched not in server_term.smembers(BlackListTermsSet_Name):
set_name = 'regex_' + regex_str
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the number of occurrences of this term
regex_value = int(server_term.hincrby(timestamp, regex_str, int(1)))
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
#1 term per paste
if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), regex_str, int(1)))
server_term.zincrby("per_paste_" + curr_set, regex_str, float(1))
server_term.zincrby(curr_set, regex_str, float(1))
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
else:
pass
else:
publisher.debug("Script RegexForTermsFrequency is Idling")

View file

@ -33,10 +33,10 @@ def add_quote_inside_tab(tab):
quoted_tab = "["
for elem in tab[1:-1].split(','):
elem = elem.lstrip().strip()
quoted_tab += "\"{}\", ".format(elem)
quoted_tab += "\'{}\', ".format(elem)
quoted_tab = quoted_tab[:-2] #remove trailing ,
quoted_tab += "]"
return quoted_tab
return str(quoted_tab)
if __name__ == "__main__":
publisher.port = 6380
@ -57,15 +57,16 @@ if __name__ == "__main__":
#get the dico and matching percent
dico_percent = {}
dico_set_tab = {}
dico_setname_to_redis = {}
for set_str in server_term.smembers(TrackedSetSet_Name):
tab_set = set_str[1:-1]
tab_set = add_quote_inside_tab(tab_set)
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
dico_percent[str(set_str)] = match_percent
tab_set = '["IoT", "mirai", "botnet", [50]]'
dico_set_tab[str(set_str)] = ast.literal_eval(tab_set)[:-1]
dico_percent[tab_set] = float(match_percent)
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
dico_setname_to_redis[tab_set] = set_str
else:
continue
@ -84,31 +85,34 @@ if __name__ == "__main__":
#iterate over the words of the file
match_dico = {}
for word in content:
for word in content.split():
for cur_set, array_set in dico_set_tab.items():
for w_set in array_set:
for w_set in array_set[:-1]: #avoid the percent matching
if word == w_set:
try:
match_dico[curr_set] += 1
match_dico[str(array_set)] += 1
except KeyError:
match_dico[curr_set] = 1
match_dico[str(array_set)] = 1
#compute matching %
for the_set, matchingNum in match_dico.items():
eff_percent = matchingNum / len(dico_set_tab[str(the_set)])
if eff_percent >= dico_percent[str(set_str)]:
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
if eff_percent >= dico_percent[the_set]:
print(the_set, "matched in", filename)
set_name = 'set_' + the_set
server_term.sadd(set_name, filename)
set_name = 'set_' + dico_setname_to_redis[the_set]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the number of occurrences of this set
set_value = int(server_term.hincrby(timestamp, the_set, int(1)))
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
# FIXME - avoid using per paste as a set is checked over the entire paste
#1 term per paste
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), the_set, int(1)))
server_term.zincrby("per_paste_" + curr_set, the_set, float(1))
server_term.zincrby(curr_set, the_set, float(1))
if new_to_the_set:
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
else:

View file

@ -32,10 +32,10 @@ publish = Redis_Words
[Curve]
subscribe = Redis_Words
publish = Redis_CurveManageTopSets,Redis_RegexForTermsFrequency
publish = Redis_CurveManageTopSets
[RegexForTermsFrequency]
subscribe = Redis_RegexForTermsFrequency
subscribe = Redis_Global
[SetForTermsFrequency]
subscribe = Redis_Global

Binary file not shown.

Before

Width:  |  Height:  |  Size: 178 KiB

After

Width:  |  Height:  |  Size: 188 KiB

View file

@ -89,7 +89,7 @@ def terms_management():
trackSet_list_num_of_paste = []
for tracked_set in r_serv_term.smembers(TrackedSetSet_Name):
trackSet_list.append(tracked_set)
value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
value_range = Term_getValueOverRange(tracked_set, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set)

View file

@ -289,7 +289,7 @@
//console.log(data);
event.preventDefault();
var the_modal=$(this);
var url = "{{ url_for('terms_management_query_paste') }}?term=" + $(this).attr('data-term');
var url = "{{ url_for('terms_management_query_paste') }}?term=" + encodeURIComponent($(this).attr('data-term'));
$.getJSON(url, function (data) {
if (data.length != 0) {
var html_to_add = "";