diff --git a/bin/WebStats.py b/bin/WebStats.py index 71bbd944..2bb94fc3 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -17,13 +17,12 @@ from Helper import Process from pyfaup.faup import Faup # Config Var -threshold_need_to_look = 50 -range_to_look = 10 -threshold_to_plot = 1 #500% -to_plot = set() -clean_frequency = 10 #minutes +threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression +threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look +max_set_cardinality = 10 # The cardinality of the progression set +num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past -def analyse(server, field_name): +def analyse(server, field_name, date, url_parsed): field = url_parsed[field_name] if field is not None: prev_score = server.hget(field, date) @@ -31,51 +30,57 @@ def analyse(server, field_name): server.hset(field, date, int(prev_score) + 1) else: server.hset(field, date, 1) + if field_name == "domain": #save domain in a set for the monthly plot + domain_set_name = "domain_set_" + date[0:6] + server.sadd(domain_set_name, field) + print "added in " + domain_set_name +": "+ field -def analyse_and_progression(server, field_name): - field = url_parsed[field_name] - if field is not None: - prev_score = server.hget(field, date) - if prev_score is not None: - print field + ' prev_score:' + prev_score - server.hset(field, date, int(prev_score) + 1) - if int(prev_score) + 1 > threshold_need_to_look: #threshold for false possitive - if(check_for_progression(server, field, date)): - to_plot.add(field) - else: - server.hset(field, date, 1) - -def check_for_progression(server, field, date): - previous_data = set() - tot_sum = 0 - for i in range(0, range_to_look): - curr_value = server.hget(field, Date(date).substract_day(i)) - if curr_value is None: #no further data - break - else: - curr_value = int(curr_value) - previous_data.add(curr_value) - tot_sum += curr_value - if i 
== 0:
-                today_val = curr_value
-
-
-    print 'totsum='+str(tot_sum)
-    print 'div='+str(tot_sum/today_val)
-    if tot_sum/today_val >= threshold_to_plot:
-        return True
-    else:
-        return False
-
-def clean_to_plot():
-    temp_to_plot = set()
+def get_date_range(num_day):
     curr_date = datetime.date.today()
-    date = Date(str(curr_date.year)+str(curr_date.month)+str(curr_date.day))
+    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
+    date_list = []
+
+    for i in range(0, num_day+1):
+        date_list.append(date.substract_day(i))
+    return date_list
+
+def compute_progression(server, field_name, num_day, url_parsed):
+    redis_progression_name = 'top_progression_'+field_name
+    redis_progression_name_set = 'top_progression_'+field_name+'_set'
+
+    keyword = url_parsed[field_name]
+    if keyword is not None:
+        date_range = get_date_range(num_day)
+        # check if this keyword is eligible for progression
+        keyword_total_sum = 0
+        value_list = []
+        for date in date_range:
+            curr_value = server.hget(keyword, date)
+            value_list.append(int(curr_value if curr_value is not None else 0))
+            keyword_total_sum += int(curr_value) if curr_value is not None else 0
+        oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division
+        keyword_increase = value_list[0] / oldest_value # NOTE(review): integer division floors the ratio; readers parse the stored value with int(), so keep it integral
+
+        # filter
+        if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase):
+
+            if server.sismember(redis_progression_name_set, keyword): #if keyword is in the set
+                server.hset(redis_progression_name, keyword, keyword_increase) #update its value
+
+            elif (server.scard(redis_progression_name_set) < max_set_cardinality):
+                server.sadd(redis_progression_name_set, keyword) #value is hset on the keyword's next occurrence; a missing value is read back as 0 below
+
+            else: #not in the set
+                #Collect (keyword, value) pairs for all members
+                member_set = []
+                for keyw in server.smembers(redis_progression_name_set):
+                    member_set.append((keyw, int(server.hget(redis_progression_name, keyw) or 0))) #append the pair: += would flatten it into two list elements; 'or 0' guards a member never hset
+                member_set.sort(key=lambda tup: tup[1])
+                if member_set[0][1] < keyword_increase: #compare against the smallest stored value, not the pair itself
+                    #remove min from set and add the new one
+                    server.srem(redis_progression_name_set, member_set[0][0])
+                    server.sadd(redis_progression_name_set, keyword)
 
-    for elem in to_plot:
-        if(check_for_progression(field, date)):
-            temp_to_plot.add(elem)
-    to_plot = temp_to_plot
 
 if __name__ == '__main__':
     # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
@@ -99,10 +104,10 @@ if __name__ == '__main__':
         port=p.config.get("Redis_Level_DB", "port"),
         db=p.config.get("Redis_Level_DB", "db"))
 
-    r_serv2 = redis.StrictRedis(
-        host=p.config.get("Redis_Level_DB_Domain", "host"),
-        port=p.config.get("Redis_Level_DB_Domain", "port"),
-        db=p.config.get("Redis_Level_DB_Domain", "db"))
+    r_serv_trend = redis.StrictRedis(
+        host=p.config.get("Redis_Level_DB_Trending", "host"),
+        port=p.config.get("Redis_Level_DB_Trending", "port"),
+        db=p.config.get("Redis_Level_DB_Trending", "db"))
 
     # FILE CURVE SECTION #
     csv_path_proto = os.path.join(os.environ['AIL_HOME'],
@@ -129,27 +134,29 @@ if __name__ == '__main__':
         if message is None:
             if generate_new_graph:
                 generate_new_graph = False
-                print 'Building graph'
                 today = datetime.date.today()
                 year = today.year
                 month = today.month
-                lib_words.create_curve_with_word_file(r_serv1, csv_path_proto,
+                print 'Building protocol graph'
+                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
                                                       protocolsfile_path, year,
                                                       month)
 
-                lib_words.create_curve_with_word_file(r_serv1, csv_path_tld,
+                print 'Building tld graph'
+                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
                                                       tldsfile_path, year,
                                                       month)
 
-                lib_words.create_curve_with_list(r_serv2, csv_path_domain,
-                                                 to_plot, year,
+                print 'Building domain graph'
+                lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
+                                                 "domain", year,
                                                  month)
                 print 'end building'
 
             publisher.debug("{} queue is empty, waiting".format(config_section))
             print 'sleeping'
-            time.sleep(5)
+            time.sleep(5*60)
             continue
 
         else:
@@ -159,6 +166,9 @@ if 
__name__ == '__main__': faup.decode(url) url_parsed = faup.get() - analyse(r_serv1, 'scheme') #Scheme analysis - analyse(r_serv1, 'tld') #Tld analysis - analyse_and_progression(r_serv2, 'domain') #Domain analysis + analyse(r_serv_trend, 'scheme', date, url_parsed) #Scheme analysis + analyse(r_serv_trend, 'tld', date, url_parsed) #Tld analysis + analyse(r_serv_trend, 'domain', date, url_parsed) #Domain analysis + compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed) + compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed) + compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 853edd2a..31d9b147 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -49,10 +49,10 @@ host = localhost port = 2013 db = 0 -[Redis_Level_DB_Domain] +[Redis_Level_DB_Trending] host = localhost port = 2016 -db = 3 +db = 0 [Redis_Level_DB_Hashs] host = localhost diff --git a/bin/packages/lib_words.py b/bin/packages/lib_words.py index b2cf418b..e98609d7 100644 --- a/bin/packages/lib_words.py +++ b/bin/packages/lib_words.py @@ -88,7 +88,7 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month with open(feederfilename, 'rb') as f: # words of the files - words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' ]) + words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' and word.strip()!='' ]) headers = ['Date'] + words with open(csvfilename+'.csv', 'wb') as f: @@ -112,7 +112,7 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month row.append(value) writer.writerow(row) -def create_curve_with_list(server, csvfilename, to_plot, year, month): +def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month): """Create a csv file used with dygraph. 
:param r_serv: -- connexion to redis database @@ -122,15 +122,17 @@ def create_curve_with_list(server, csvfilename, to_plot, year, month): :param month: -- (integer) The month to process This function create a .csv file using datas in redis. - It's checking if the words contained in to_plot and + It's checking if the words contained in set_to_plot and their respectives values by days exists. """ first_day = date(year, month, 01) last_day = date(year, month, calendar.monthrange(year, month)[1]) - words = sorted(to_plot) - + + redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2) + words = list(server.smembers(redis_set_name)) + headers = ['Date'] + words with open(csvfilename+'.csv', 'wb') as f: writer = csv.writer(f) diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 018608f1..d6913999 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -4,12 +4,14 @@ import redis import ConfigParser import json +import datetime from flask import Flask, render_template, jsonify, request import flask import os import sys sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) import Paste +from Date import Date # CONFIG # configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') @@ -35,6 +37,11 @@ r_serv_log = redis.StrictRedis( port=cfg.getint("Redis_Log", "port"), db=cfg.getint("Redis_Log", "db")) +r_serv_charts = redis.StrictRedis( + host=cfg.get("Redis_Level_DB_Trending", "host"), + port=cfg.getint("Redis_Level_DB_Trending", "port"), + db=cfg.getint("Redis_Level_DB_Trending", "db")) + app = Flask(__name__, static_url_path='/static/') @@ -76,6 +83,20 @@ def showpaste(content_range): return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content)) +def get_date_range(num_day): + curr_date = datetime.date.today() + date = 
Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2)) + date_list = [] + + for i in range(0, num_day+1): + date_list.append(date.substract_day(i)) + return date_list + + + + + +# ============ ROUTES ============ @app.route("/_logs") def logs(): @@ -86,6 +107,38 @@ def logs(): def stuff(): return jsonify(row1=get_queues(r_serv)) +@app.route("/_progressionCharts", methods=['GET']) +def progressionCharts(): + #To be used later + attribute_name = request.args.get('attributeName') + trending_name = request.args.get('trendingName') + bar_requested = True if request.args.get('bar') == "true" else False + + if (bar_requested): + num_day = int(request.args.get('days')) + bar_values = [] + + date_range = get_date_range(num_day) + # Retreive all data from the last num_day + for date in date_range: + curr_value = r_serv_charts.hget(attribute_name, date) + bar_values.append([date[0:4]+'/'+date[4:6]+'/'+date[6:8], int(curr_value if curr_value is not None else 0)]) + return jsonify(bar_values) + + else: + redis_progression_name = 'top_progression_'+trending_name + redis_progression_name_set = 'top_progression_'+trending_name+'_set' + + member_set = [] + for keyw in r_serv_charts.smembers(redis_progression_name_set): + keyw_value = r_serv_charts.hget(redis_progression_name, keyw) + keyw_value = keyw_value if keyw_value is not None else 0 + member_set.append((keyw, int(keyw_value))) + member_set.sort(key=lambda tup: tup[1], reverse=True) + if len(member_set) == 0: + member_set.append(("No relevant data", int(100))) + return jsonify(member_set) + @app.route("/search", methods=['POST']) def search(): diff --git a/var/www/templates/Trending.html b/var/www/templates/Trending.html index 8928f948..9031076c 100644 --- a/var/www/templates/Trending.html +++ b/var/www/templates/Trending.html @@ -16,6 +16,9 @@ + + + + + +