diff --git a/all_modules.txt b/all_modules.txt deleted file mode 100644 index 010429b4..00000000 --- a/all_modules.txt +++ /dev/null @@ -1,26 +0,0 @@ -Global -Duplicates -Indexer -Attributes -Lines -DomClassifier -Tokenize -Curve -CurveManageTopSets -Categ -CreditCards -Mail -Onion -DumpValidOnion -Web -WebStats -SQLInjectionDetection -ModuleStats -Browse_warning_paste -SentimentAnalysis -Release -Credential -Cve -Phone -SourceCode -Keys diff --git a/bin/RegexForTermsFrequency.py b/bin/RegexForTermsFrequency.py index b9710a2c..023710c4 100755 --- a/bin/RegexForTermsFrequency.py +++ b/bin/RegexForTermsFrequency.py @@ -8,6 +8,7 @@ import redis import time from pubsublogger import publisher from packages import lib_words +from packages import Paste import os import datetime import calendar @@ -16,6 +17,8 @@ import re from Helper import Process # Config Variables +DICO_REFRESH_TIME = 60 #s + BlackListTermsSet_Name = "BlackListSetTermSet" TrackedTermsSet_Name = "TrackedSetTermSet" TrackedRegexSet_Name = "TrackedRegexSet" @@ -27,6 +30,15 @@ top_termFreq_setName_month = ["TopTermFreq_set_month", 31] top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] +def refresh_dicos(): + dico_regex = {} + dico_regexname_to_redis = {} + for regex_str in server_term.smembers(TrackedRegexSet_Name): + dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1]) + dico_regexname_to_redis[regex_str[1:-1]] = regex_str + + return dico_regex, dico_regexname_to_redis + if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" @@ -44,10 +56,8 @@ if __name__ == "__main__": publisher.info("RegexForTermsFrequency script started") #compile the regex - dico_regex = {} - for regex_str in server_term.smembers(TrackedRegexSet_Name): - dico_regex[regex_str] = re.compile(regex_str) - + dico_refresh_cooldown = time.time() + dico_regex, dico_regexname_to_redis = refresh_dicos() message = p.get_from_set() @@ -55,32 +65,40 @@ if __name__ == "__main__": while True: if message is not None: - filename, timestamp, word = message.split() + if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME: + dico_refresh_cooldown = time.time() + dico_regex, dico_regexname_to_redis = refresh_dicos() + print('dico got refreshed') + + filename = message + temp = filename.split('/') + timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) + curr_set = top_termFreq_setName_day[0] + str(timestamp) + content = Paste.Paste(filename).get_p_content() #iterate the word with the regex for regex_str, compiled_regex in dico_regex.items(): - matched = compiled_regex.match(word) - if word == "amzinggg": - print("matched") - server_term.incr("thisistest") + matched = compiled_regex.search(content) if matched is not None: #there is a match + print('regex matched {}'.format(regex_str)) matched = matched.group(0) # Add in Regex track set only if term is not in the blacklist if matched not in server_term.smembers(BlackListTermsSet_Name): - set_name = 'regex_' + regex_str + set_name = 'regex_' + dico_regexname_to_redis[regex_str] new_to_the_set = server_term.sadd(set_name, filename) new_to_the_set = True if new_to_the_set == 1 else False #consider the num of occurence of this term - regex_value = int(server_term.hincrby(timestamp, regex_str, int(1))) + regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1))) #1 term per paste if new_to_the_set: - regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), regex_str, int(1))) - server_term.zincrby("per_paste_" + curr_set, regex_str, float(1)) - server_term.zincrby(curr_set, regex_str, float(1)) - + regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1))) + server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1)) + server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1)) + else: + pass else: publisher.debug("Script RegexForTermsFrequency is Idling") diff --git a/bin/SetForTermsFrequency.py b/bin/SetForTermsFrequency.py index 626a79a9..b3100073 100755 --- a/bin/SetForTermsFrequency.py +++ b/bin/SetForTermsFrequency.py @@ -33,10 +33,10 @@ def add_quote_inside_tab(tab): quoted_tab = "[" for elem in tab[1:-1].split(','): elem = elem.lstrip().strip() - quoted_tab += "\"{}\", ".format(elem) + quoted_tab += "\'{}\', ".format(elem) quoted_tab = quoted_tab[:-2] #remove trailing , quoted_tab += "]" - return quoted_tab + return str(quoted_tab) if __name__ == "__main__": publisher.port = 6380 @@ -57,15 +57,16 @@ if __name__ == "__main__": #get the dico and matching percent dico_percent = {} dico_set_tab = {} + dico_setname_to_redis = {} for set_str in server_term.smembers(TrackedSetSet_Name): tab_set = set_str[1:-1] tab_set = add_quote_inside_tab(tab_set) perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set) if perc_finder is not None: match_percent = perc_finder.group(0)[1:-1] - dico_percent[str(set_str)] = match_percent - tab_set = '["IoT", "mirai", "botnet", [50]]' - dico_set_tab[str(set_str)] = ast.literal_eval(tab_set)[:-1] + dico_percent[tab_set] = float(match_percent) + dico_set_tab[tab_set] = ast.literal_eval(tab_set) + dico_setname_to_redis[tab_set] = set_str else: continue @@ -84,31 +85,34 @@ if __name__ == "__main__": #iterate over the words of the file match_dico = {} - for word in content: + for word in content.split(): for cur_set, array_set in dico_set_tab.items(): - for w_set in array_set: + for w_set in array_set[:-1]: #avoid the percent matching if word == w_set: try: - match_dico[curr_set] += 1 + match_dico[str(array_set)] += 1 except KeyError: - match_dico[curr_set] = 1 + match_dico[str(array_set)] = 1 #compute matching % for the_set, matchingNum in match_dico.items(): - eff_percent = matchingNum / len(dico_set_tab[str(the_set)]) - if eff_percent >= dico_percent[str(set_str)]: + eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching + if eff_percent >= dico_percent[the_set]: print(the_set, "matched in", filename) - set_name = 'set_' + the_set - server_term.sadd(set_name, filename) + set_name = 'set_' + dico_setname_to_redis[the_set] + new_to_the_set = server_term.sadd(set_name, filename) + new_to_the_set = True if new_to_the_set == 1 else False + #consider the num of occurence of this set - set_value = int(server_term.hincrby(timestamp, the_set, int(1))) + set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1))) # FIXME - avoid using per paste as a set is checked over the entire paste #1 term per paste - regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), the_set, int(1))) - server_term.zincrby("per_paste_" + curr_set, the_set, float(1)) - server_term.zincrby(curr_set, the_set, float(1)) + if new_to_the_set: + set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1))) + server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1)) + server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1)) else: diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 73c51a24..c7e0063f 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -32,10 +32,10 @@ publish = Redis_Words [Curve] subscribe = Redis_Words -publish = Redis_CurveManageTopSets,Redis_RegexForTermsFrequency +publish = Redis_CurveManageTopSets [RegexForTermsFrequency] -subscribe = Redis_RegexForTermsFrequency +subscribe = Redis_Global [SetForTermsFrequency] subscribe = Redis_Global diff --git a/doc/module-data-flow.png b/doc/module-data-flow.png index 65c58bce..5eb1b9d3 100644 Binary files a/doc/module-data-flow.png and b/doc/module-data-flow.png differ diff --git a/var/www/Flasks/Flask_terms.py b/var/www/Flasks/Flask_terms.py index 75be5c58..5a11f903 100644 --- a/var/www/Flasks/Flask_terms.py +++ b/var/www/Flasks/Flask_terms.py @@ -89,7 +89,7 @@ def terms_management(): trackSet_list_num_of_paste = [] for tracked_set in r_serv_term.smembers(TrackedSetSet_Name): trackSet_list.append(tracked_set) - value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text) + value_range = Term_getValueOverRange(tracked_set, today_timestamp, [1, 7, 31], per_paste=per_paste_text) term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set) diff --git a/var/www/templates/terms_management.html b/var/www/templates/terms_management.html index 9f25d68d..a805c628 100644 --- a/var/www/templates/terms_management.html +++ b/var/www/templates/terms_management.html @@ -289,7 +289,7 @@ //console.log(data); event.preventDefault(); var the_modal=$(this); - var url = "{{ url_for('terms_management_query_paste') }}?term=" + $(this).attr('data-term'); + var url = "{{ url_for('terms_management_query_paste') }}?term=" + encodeURIComponent($(this).attr('data-term')); $.getJSON(url, function (data) { if (data.length != 0) { var html_to_add = "";