#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.

Each message returned by ``Process.get_from_set()`` is used as the path of a
paste. The words of that paste are compared with every tracked set of terms
loaded from Redis; when the share of terms of a set found in the paste
reaches the percentage attached to that set, the paste is recorded and the
per-day counters of the set are incremented.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20  # Max cardinality of the terms frequencies set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]

# Finds the "[NN]" element carrying the matching percentage of a tracked set.
# Compiled once: it is applied to every tracked set at start-up.
_percent_regex = re.compile(r"\[[0-9]{1,3}\]")


def add_quote_inside_tab(tab):
    """Turn a bracketed, comma-separated string into a list literal.

    The first and last characters of ``tab`` are dropped, the remainder is
    split on ',' and each element is stripped and wrapped in single quotes,
    e.g. "[a, b, [50]]" becomes "['a', 'b', '[50]']".

    Backslashes and single quotes inside an element are escaped so that the
    returned string can always be parsed by ``ast.literal_eval``.

    :param tab: string of the form "[elem1, elem2, ...]"
    :return: string holding the corresponding list-of-strings literal
    """
    quoted_elems = []
    for elem in tab[1:-1].split(','):
        # Escape the backslash first so the escapes added for quotes survive.
        elem = elem.strip().replace("\\", "\\\\").replace("'", "\\'")
        quoted_elems.append("'{}'".format(elem))
    return "[" + ", ".join(quoted_elems) + "]"


def _load_tracked_sets(server_term):
    """Read the tracked sets from Redis and index them by their list literal.

    Members of ``TrackedSetSet_Name`` that carry no "[NN]" percentage element
    are ignored.

    :param server_term: Redis client holding the tracked sets
    :return: tuple ``(dico_percent, dico_set_tab, dico_setname_to_redis)``;
        all three dicts share the same keys (the quoted list literal) and map
        to the required percentage, the parsed list and the raw Redis member.
    """
    dico_percent = {}
    dico_set_tab = {}
    dico_setname_to_redis = {}
    for set_str in server_term.smembers(TrackedSetSet_Name):
        # The stored member is wrapped in one extra leading and trailing
        # character around the bracketed list; strip it before quoting.
        tab_set = add_quote_inside_tab(set_str[1:-1])
        perc_finder = _percent_regex.search(tab_set)
        if perc_finder is None:
            continue
        dico_percent[tab_set] = float(perc_finder.group(0)[1:-1])
        dico_set_tab[tab_set] = ast.literal_eval(tab_set)
        dico_setname_to_redis[tab_set] = set_str
    return dico_percent, dico_set_tab, dico_setname_to_redis


def _count_matched_terms(words, terms):
    """Return how many entries of ``terms`` are members of the set ``words``."""
    return sum(1 for term in terms if term in words)


def _process_paste(server_term, filename, dico_percent, dico_set_tab,
                   dico_setname_to_redis):
    """Check one paste against every tracked set and update the counters."""
    # The three path components before the file name are read as
    # year / month / day and turned into the timestamp of that day.
    temp = filename.split('/')
    timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
    content = Paste.Paste(filename).get_p_content()
    curr_set = top_termFreq_setName_day[0] + str(timestamp)

    # A term counts once however often it appears in the paste, so the
    # computed percentage can never exceed 100.
    words = set(content.split())

    for tab_set, array_set in dico_set_tab.items():
        terms = array_set[:-1]  # the last element is the percent, not a term
        matched = _count_matched_terms(words, terms)
        if not matched:
            continue

        eff_percent = float(matched) / float(len(terms)) * 100
        if eff_percent < dico_percent[tab_set]:
            continue

        redis_name = dico_setname_to_redis[tab_set]
        print("{} matched in {}".format(tab_set, filename))
        new_to_the_set = server_term.sadd('set_' + redis_name, filename) == 1

        # consider the number of occurrences of this set
        server_term.hincrby(timestamp, redis_name, int(1))

        # FIXME - avoid using per paste as a set is checked over the entire paste
        # 1 term per paste
        if new_to_the_set:
            server_term.hincrby("per_paste_" + str(timestamp), redis_name, int(1))
            server_term.zincrby("per_paste_" + curr_set, redis_name, float(1))
        # NOTE(review): the nesting of this call could not be recovered from
        # the flattened source; it is kept next to the matching hincrby above
        # so both counters move together - confirm against history.
        server_term.zincrby(curr_set, redis_name, float(1))


def main():
    """Run the module: load the tracked sets, then process pastes forever."""
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
        db=p.config.get("Redis_Level_DB_TermFreq", "db"))

    publisher.info("SetForTermsFrequency script started")

    # get the dico and matching percent
    dico_percent, dico_set_tab, dico_setname_to_redis = _load_tracked_sets(server_term)

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("Script SetForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
            continue

        _process_paste(server_term, message, dico_percent, dico_set_tab,
                       dico_setname_to_redis)


if __name__ == "__main__":
    main()