From 84dcd5586f8caa86ed8f41f747b748462067ebd9 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Fri, 25 Nov 2016 16:42:22 +0100 Subject: [PATCH 1/4] Started plotting same data with different dates --- var/www/Flask_server.py | 1 + var/www/templates/terms_plot_tool.html | 36 +++++++++++++++++++++----- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index fcd67a21..ec7bd671 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -771,6 +771,7 @@ def terms_plot_tool_data(): value = r_serv_term.hget(timestamp, term) curr_value_range = int(value) if value is not None else 0 value_range.append([timestamp, curr_value_range]) + value_range.insert(0,term) return jsonify(value_range) diff --git a/var/www/templates/terms_plot_tool.html b/var/www/templates/terms_plot_tool.html index 2d5799f0..fc5ab017 100644 --- a/var/www/templates/terms_plot_tool.html +++ b/var/www/templates/terms_plot_tool.html @@ -72,7 +72,7 @@
- Date: + Date:
@@ -168,6 +168,7 @@ From 1abba4dcf98ad26b776d4d515d2c00c3dc46907c Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 8 Dec 2016 08:44:10 +0100 Subject: [PATCH 2/4] Added support of re-plotting the plotted terms --- var/www/templates/terms_plot_tool.html | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/var/www/templates/terms_plot_tool.html b/var/www/templates/terms_plot_tool.html index fc5ab017..0205a89f 100644 --- a/var/www/templates/terms_plot_tool.html +++ b/var/www/templates/terms_plot_tool.html @@ -72,7 +72,7 @@
- Date: + Date:
@@ -252,26 +252,28 @@ function addData() { } -function replot(duration) { - console.log(plotted_terms); +function replot() { graph_data = []; + promises = []; for(i=0; i From 570324060e7e02e49027a670e44ef58d89a104dc Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 8 Dec 2016 09:13:31 +0100 Subject: [PATCH 3/4] terms top_sets correctly supports blacklisted terms --- bin/CurveManageTopSets.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/CurveManageTopSets.py b/bin/CurveManageTopSets.py index 8f316333..979df7ca 100755 --- a/bin/CurveManageTopSets.py +++ b/bin/CurveManageTopSets.py @@ -44,13 +44,14 @@ def manage_top_set(): startDate = datetime.datetime.now() startDate = startDate.replace(hour=0, minute=0, second=0, microsecond=0) startDate = calendar.timegm(startDate.timetuple()) + blacklist_size = int(server_term.scard(BlackListTermsSet_Name)) dico = {} - # Retreive top data (2*max_card) from days sets + # Retreive top data (max_card + blacklist_size) from days sets for timestamp in range(startDate, startDate - top_termFreq_setName_month[1]*oneDay, -oneDay): curr_set = top_termFreq_setName_day[0] + str(timestamp) - array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality*2) + array_top_day = server_term.zrevrangebyscore(curr_set, '+inf', '-inf', withscores=True, start=0, num=top_term_freq_max_set_cardinality+blacklist_size) for word, value in array_top_day: if word not in server_term.smembers(BlackListTermsSet_Name): From 73d4f9e082ba000893a183cc546e9c74496f3d9a Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 8 Dec 2016 10:05:07 +0100 Subject: [PATCH 4/4] Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. 
Will be tested later) --- bin/WebStats.py | 62 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/bin/WebStats.py b/bin/WebStats.py index d8ff0876..1c41b64d 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -38,35 +38,55 @@ def get_date_range(num_day): date_list.append(date.substract_day(i)) return date_list +# Compute the progression for one keyword +def compute_progression_word(keyword): + date_range = get_date_range(num_day) + # check if this keyword is eligible for progression + keyword_total_sum = 0 + value_list = [] + for date in date_range: # get value up to date_range + curr_value = server.hget(keyword, date) + value_list.append(int(curr_value if curr_value is not None else 0)) + keyword_total_sum += int(curr_value) if curr_value is not None else 0 + oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division + + # The progression is based on the ratio: value[i] / value[i-1] + keyword_increase = 0 + value_list_reversed = value_list[:] + value_list_reversed.reverse() + for i in range(1, len(value_list_reversed)): + divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1 + keyword_increase += value_list_reversed[i] / divisor + + return (keyword_increase, keyword_total_sum) + + +''' + recompute the set top_progression zset + - Compute the current field progression + - re-compute the current progression for each first 2*max_set_cardinality fields in the top_progression_zset +''' def compute_progression(server, field_name, num_day, url_parsed): - redis_progression_name = 'top_progression_'+field_name - redis_progression_name_set = 'top_progression_'+field_name+'_set' + redis_progression_name_set = "z_top_progression_"+field_name keyword = url_parsed[field_name] if keyword is not None: - date_range = get_date_range(num_day) - # check if this keyword is eligible for progression - keyword_total_sum = 0 - value_list = [] - for date in date_range: # get value up 
to date_range - curr_value = server.hget(keyword, date) - value_list.append(int(curr_value if curr_value is not None else 0)) - keyword_total_sum += int(curr_value) if curr_value is not None else 0 - oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division + #compute the progression of the current word + keyword_increase, keyword_total_sum = compute_progression_word(keyword) - # The progression is based on the ratio: value[i] / value[i-1] - keyword_increase = 0 - value_list_reversed = value_list[:] - value_list_reversed.reverse() - for i in range(1, len(value_list_reversed)): - divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1 - keyword_increase += value_list_reversed[i] / divisor + #re-compute the progression of 2*max_set_cardinality + current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality) + for word, value in array_top_day: + word_inc, word_tot_sum = compute_progression_word(word) + server.zrem(redis_progression_name_set, word) + if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase): + server.zadd(redis_progression_name_set, float(word_inc), word) - # filter + # filter before adding if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase): - - server.zadd("z_top_progression_"+field_name, float(keyword_increase), keyword) + server.zadd(redis_progression_name_set, float(keyword_increase), keyword) + if __name__ == '__main__':