diff --git a/bin/Curve.py b/bin/Curve.py index e7435eb6..b43fcb1d 100755 --- a/bin/Curve.py +++ b/bin/Curve.py @@ -41,11 +41,25 @@ import calendar from Helper import Process # Config Variables +BlackListTermsSet_Name = "BlackListSetTermSet" +TrackedTermsSet_Name = "TrackedSetTermSet" top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set +oneDay = 60*60*24 +top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] +top_termFreq_setName_week = ["TopTermFreq_set_week", 7] +top_termFreq_setName_month = ["TopTermFreq_set_month", 31] +top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] + + +def check_if_tracked_term(term, path): + if term in TrackedTermsSet_Name: + #add_paste to tracked_word_set + set_name = "tracked_" + term + server.sadd(set_name, path) + p.populate_set_out("New Term added", 'CurveManageTopSets') def getValueOverRange(word, startDate, num_day): - oneDay = 60*60*24 to_return = 0 for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay): value = server_term.hget(timestamp, word) @@ -86,10 +100,9 @@ if __name__ == "__main__": generate_new_graph = False # Term Frequency - top_termFreq_setName_day = ["TopTermFreq_set_day", 1] + top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] top_termFreq_setName_week = ["TopTermFreq_set_week", 7] top_termFreq_setName_month = ["TopTermFreq_set_month", 31] - top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] while True: @@ -100,60 +113,51 @@ if __name__ == "__main__": temp = filename.split('/') date = temp[-4] + temp[-3] + temp[-2] timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) - - # If set size is greater then the one authorized - # suppress smaller elements - for curr_set, curr_num_day in top_termFreq_set_array: - diffCard = server_term.scard(curr_set) - top_term_freq_max_set_cardinality - if diffCard > 0: - top_termFreq = server_term.smembers(curr_set) - sorted_top_termFreq_set = [] - for word in top_termFreq: - word_value = getValueOverRange(word, timestamp, curr_num_day) - sorted_top_termFreq_set.append((word, word_value)) - - sorted_top_termFreq_set.sort(key=lambda tup: tup[1]) - for i in range(0, diffCard): - print 'set oversized, dropping', sorted_top_termFreq_set[i][0] - server_term.srem(curr_set, sorted_top_termFreq_set[i][0]) + top_termFreq_setName_day[0] += timestamp - #timer = time.clock() low_word = word.lower() - #print 'wordlower', time.clock() - timer + #Old curve r_serv1.hincrby(low_word, date, int(score)) # Update redis curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score))) + if low_word not in server.smembers(BlackListTermsSet_Name): + server.zincrby(top_termFreq_setName_day[0], int(score), low_word) - # Manage Top set - for curr_set, curr_num_day in top_termFreq_set_array: + #Add more info for tracked terms + check_if_tracked_term(low_word, filename) - if server_term.scard(curr_set) < top_term_freq_max_set_cardinality: + # Manage Top set is done in module Curve_manage_top_sets + + ''' + if server_term.scard(curr_set) < top_term_freq_max_set_cardinality: + server_term.sadd(curr_set, low_word) + elif server_term.sismember(curr_set, low_word): + continue + + else: + + + #timer = time.clock() + curr_word_value = getValueOverRange(low_word, timestamp, curr_num_day) + #print 'curr_range', time.clock() - timer + top_termFreq = server_term.smembers(curr_set) + sorted_top_termFreq_set = [] + #timer = time.clock() + for word in top_termFreq: + word_value = getValueOverRange(word, timestamp, curr_num_day) + sorted_top_termFreq_set.append((word, word_value)) + + sorted_top_termFreq_set.sort(key=lambda tup: tup[1]) + #print 'whole_range', time.clock() - timer + + if curr_word_value > int(sorted_top_termFreq_set[0][1]): + print str(curr_num_day)+':', low_word, curr_word_value, '\t', sorted_top_termFreq_set[0][0], sorted_top_termFreq_set[0][1], '\t', curr_word_value > sorted_top_termFreq_set[0][1] + server_term.srem(curr_set, sorted_top_termFreq_set[0][0]) server_term.sadd(curr_set, low_word) - elif server_term.sismember(curr_set, low_word): - continue - - else: - #timer = time.clock() - curr_word_value = getValueOverRange(low_word, timestamp, curr_num_day) - #print 'curr_range', time.clock() - timer - top_termFreq = server_term.smembers(curr_set) - sorted_top_termFreq_set = [] - #timer = time.clock() - for word in top_termFreq: - word_value = getValueOverRange(word, timestamp, curr_num_day) - sorted_top_termFreq_set.append((word, word_value)) - - sorted_top_termFreq_set.sort(key=lambda tup: tup[1]) - #print 'whole_range', time.clock() - timer - - if curr_word_value > int(sorted_top_termFreq_set[0][1]): - print str(curr_num_day)+':', low_word, curr_word_value, '\t', sorted_top_termFreq_set[0][0], sorted_top_termFreq_set[0][1], '\t', curr_word_value > sorted_top_termFreq_set[0][1] - server_term.srem(curr_set, sorted_top_termFreq_set[0][0]) - server_term.sadd(curr_set, low_word) - + ''' else: if generate_new_graph: generate_new_graph = False diff --git a/bin/Curve_manage_top_sets.py b/bin/Curve_manage_top_sets.py new file mode 100755 index 00000000..c3494c16 --- /dev/null +++ b/bin/Curve_manage_top_sets.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python2 +# -*-coding:UTF-8 -* +""" + + + +zrank for each day +week -> top zrank for each day + + +Requirements +------------ + +*Need running Redis instances. (Redis) +*Categories files of words in /files/ need to be created +*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly. + +""" + +import redis +import time +import copy +from pubsublogger import publisher +from packages import lib_words +import os +import datetime +import calendar + +from Helper import Process + +# Config Variables +BlackListTermsSet_Name = "BlackListSetTermSet" +TrackedTermsSet_Name = "TrackedSetTermSet" +top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set +oneDay = 60*60*24 +num_day_month = 31 +num_day_week = 7 + +top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] +top_termFreq_setName_week = ["TopTermFreq_set_week", 7] +top_termFreq_setName_month = ["TopTermFreq_set_month", 31] +top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] + + +def manage_top_set(): + startDate = datetime.datetime.now() + startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0) + startDate = calendar.timegm(startDate.timetuple()) + + dico = {} + + # Retreive top data (2*max_card) from days sets + for timestamp in range(startDate, startDate - num_day_month*oneDay, -oneDay): + top_termFreq_setName_day[0] += str(timestamp) + array_top_day = server_term.zrangebyscore(top_termFreq_setName_day[0], '-inf', '+inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality*2) + + for word, value in array_top_day: + if word in dico.keys(): + dico[word] += value + else: + dico[word] = value + + if timestamp == startDate - num_day_week*oneDay: + dico_week = copy.deepcopy(dico) + + # convert dico into sorted array + array_month = [] + for w, v in dico.iteritems(): + array_month.append((w, v)) + array_month.sort(key=lambda tup: -tup[1]) + + array_week = [] + for w, v in dico_week.iteritems(): + array_week.append((w, v)) + array_week.sort(key=lambda tup: -tup[1]) + + + # suppress every terms in top sets + for curr_set, curr_num_day in top_termFreq_set_array[1:3]: + for w in server_term.zrange(curr_set, 0, -1): + server_term.zrem(curr_set, w) + + # Add top term from sorted array in their respective sorted sets + for elem in array_week: + server_term.zadd(top_termFreq_setName_week[0], float(elem[1]), elem[0]) + + for elem in array_month: + server_term.zadd(top_termFreq_setName_month[0], float(elem[1]), elem[0]) + + + + +if __name__ == '__main__': + # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) + # Port of the redis instance used by pubsublogger + publisher.port = 6380 + # Script is the default channel used for the modules. + publisher.channel = 'Script' + + config_section = 'CurveManageTopSets' + p = Process(config_section) + + server_term = redis.StrictRedis( + host=p.config.get("Redis_Level_DB_TermFreq", "host"), + port=p.config.get("Redis_Level_DB_TermFreq", "port"), + db=p.config.get("Redis_Level_DB_TermFreq", "db")) + + # FUNCTIONS # + publisher.info("Script Curve_manage_top_set started") + + # Sent to the logging a description of the module + publisher.info("Manage the top sets with the data created by the module curve.") + + manage_top_set() + + while True: + # Get one message from the input queue + message = p.get_from_set() + if message is None: + publisher.debug("{} queue is empty, waiting".format(config_section)) + print 'sleeping' + time.sleep(60) # sleep a long time then manage the set + manage_top_set() + continue + + # Do something with the message from the queue + #manage_top_set() + diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 62700d1f..fef9c3ce 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -136,7 +136,9 @@ function launching_scripts { sleep 0.1 screen -S "Script" -X screen -t "Credential" bash -c './Credential.py; read x' sleep 0.1 - #screen -S "Script" -X screen -t "Curve" bash -c './Curve.py; read x' + screen -S "Script" -X screen -t "Curve" bash -c './Curve.py; read x' + sleep 0.1 + screen -S "Script" -X screen -t "Curve_topsets_manager" bash -c './Curve_manage_top_sets.py; read x' sleep 0.1 screen -S "Script" -X screen -t "Indexer" bash -c './Indexer.py; read x' sleep 0.1 @@ -158,12 +160,6 @@ function launching_scripts { sleep 0.1 screen -S "Script" -X screen -t "SentimentAnalyser" bash -c './SentimentAnalyser.py; read x' - sleep 0.1 - screen -S "Script" -X screen -t "Curve" bash -c './Curve.py; read x' - sleep 0.1 - screen -S "Script" -X screen -t "Curve" bash -c './Curve.py; read x' - sleep 0.1 - screen -S "Script" -X screen -t "Curve" bash -c './Curve.py; read x' } #If no params, display the help diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 0f3ceb38..7434def8 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -24,6 +24,10 @@ publish = Redis_Words [Curve] subscribe = Redis_Words +publish = Redis_CurveManageTopSets + +[CurveManageTopSets] +subscribe = Redis_CurveManageTopSets [Categ] subscribe = Redis_Global diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 3f434067..56f43907 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -227,8 +227,8 @@ def Term_getValueOverRange(word, startDate, num_day): curr_to_return = 0 for timestamp in range(startDate, startDate - max(num_day)*oneDay, -oneDay): value = r_serv_term.hget(timestamp, word) - print timestamp, word - print value + #print timestamp, word + #print value curr_to_return += int(value) if value is not None else 0 for i in num_day: if passed_days == i-1: @@ -595,11 +595,15 @@ def terms_management(): track_list = [] track_list_values = [] + track_list_num_of_paste = [] for tracked_term in r_serv_term.smembers(TrackedTermsSet_Name): track_list.append(tracked_term) value_range = Term_getValueOverRange(tracked_term, today_timestamp, [1, 7, 31]) term_date = r_serv_term.hget(TrackedTermsDate_Name, tracked_term) + + set_paste_name = "tracked_" + tracked_term + track_list_num_of_paste.append(r_serv_term.scard(set_paste_name)) term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded" value_range.append(term_date) track_list_values.append(value_range) @@ -609,10 +613,38 @@ def terms_management(): for blacked_term in r_serv_term.smembers(BlackListTermsSet_Name): term_date = r_serv_term.hget(BlackListTermsDate_Name, blacked_term) term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded" - print term_date black_list.append([blacked_term, term_date]) - return render_template("terms_management.html", black_list=black_list, track_list=track_list, track_list_values=track_list_values) + return render_template("terms_management.html", black_list=black_list, track_list=track_list, track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste) + + +@app.route("/terms_management_query_paste/") +def terms_management_query_paste(): + term = request.args.get('term') + TrackedTermsSet_Name = "TrackedSetTermSet" + paste_info = [] + + set_paste_name = "tracked_" + term + track_list_path = r_serv_term.smembers(set_paste_name) + print set_paste_name + print track_list_path + + for path in track_list_path: + paste = Paste.Paste(path) + p_date = str(paste._get_p_date()) + p_date = p_date[6:]+'/'+p_date[4:6]+'/'+p_date[0:4] + p_source = paste.p_source + p_encoding = paste._get_p_encoding() + p_language = paste._get_p_language() + p_size = paste.p_size + p_mime = paste.p_mime + p_lineinfo = paste.get_lines_info() + p_content = paste.get_p_content().decode('utf-8', 'ignore') + if p_content != 0: + p_content = p_content[0:400] + paste_info.append({"path": path, "date": p_date, "source": p_source, "encoding": p_encoding, "language": p_language, "size": p_size, "mime": p_mime, "lineinfo": p_lineinfo, "content": p_content}) + + return jsonify(paste_info) @app.route("/terms_management_query/") @@ -680,7 +712,11 @@ def terms_management_action(): @app.route("/terms_plot_tool/") def terms_plot_tool(): - return render_template("terms_plot_tool.html") + term = request.args.get('term') + if term is not None: + return render_template("terms_plot_tool.html", term=term) + else: + return render_template("terms_plot_tool.html", term="") @app.route("/terms_plot_tool_data/") @@ -699,6 +735,7 @@ def terms_plot_tool_data(): else: value_range = [] for timestamp in range(range_start, range_end+oneDay, oneDay): + print timestamp, term value = r_serv_term.hget(timestamp, term) curr_value_range = int(value) if value is not None else 0 value_range.append([timestamp, curr_value_range]) @@ -712,6 +749,7 @@ def terms_plot_top(): @app.route("/terms_plot_top_data/") def terms_plot_top_data(): + oneDay = 60*60*24 today = datetime.datetime.now() today = today.replace(hour=0, minute=0, second=0, microsecond=0) today_timestamp = calendar.timegm(today.timetuple()) @@ -721,7 +759,22 @@ def terms_plot_top_data(): if the_set is None: return "None" else: - oneDay = 60*60*24 + to_return = [] + if the_set == "TopTermFreq_set_day": + the_set += "_" + str(today_timestamp) + + for term, tot_value in r_serv_term.zrangebyscore(the_set, '-inf', '+inf', withscores=True, start=0, num=20): + value_range = [] + for timestamp in range(today_timestamp, today_timestamp - num_day*oneDay, -oneDay): + value = r_serv_term.hget(timestamp, term) + curr_value_range = int(value) if value is not None else 0 + value_range.append([timestamp, curr_value_range]) + + to_return.append([term, value_range, tot_value]) + + return jsonify(to_return) + + ''' to_return = [] for term in r_serv_term.smembers(the_set): value_range = [] @@ -735,6 +788,7 @@ def terms_plot_top_data(): to_return.append([term, value_range, tot_sum]) return jsonify(to_return) + ''' @app.route("/test/") #completely shows the paste in a new tab diff --git a/var/www/templates/terms_management.html b/var/www/templates/terms_management.html index 61561fc7..4dc5a05e 100644 --- a/var/www/templates/terms_management.html +++ b/var/www/templates/terms_management.html @@ -33,6 +33,24 @@
+ +