From 7ff9b9a583261ab0a1c470120545e4c0643efc69 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Tue, 5 Jul 2016 16:53:03 +0200 Subject: [PATCH] Added DomainTrending seems working. Started search features with related html pages, not finish yet. --- bin/WebStats.py | 90 ++++++++++++++++++++++--- bin/packages/Date.py | 9 +++ bin/packages/Paste.py | 4 +- bin/packages/lib_words.py | 44 +++++++++++- var/www/Flask_server.py | 19 +++++- var/www/templates/Tldstrending.html | 2 +- var/www/templates/search.html | 71 ++++++++++++++++++- var/www/templates/show_saved_paste.html | 48 +++++++++++++ 8 files changed, 269 insertions(+), 18 deletions(-) create mode 100644 var/www/templates/show_saved_paste.html diff --git a/bin/WebStats.py b/bin/WebStats.py index 5da443a8..cac352af 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -10,19 +10,72 @@ import re import redis import os from packages import lib_words +from packages.Date import Date from pubsublogger import publisher from packages import Paste from Helper import Process from pyfaup.faup import Faup -def analyse(field_name): +# Config Var +threshold_need_to_look = 50 +range_to_look = 10 +threshold_to_plot = 1 #500% +to_plot = set() +clean_frequency = 10 #minutes + +def analyse(server, field_name): field = url_parsed[field_name] if field is not None: - prev_score = r_serv1.hget(field, date) + prev_score = server.hget(field, date) if prev_score is not None: - r_serv1.hset(field, date, int(prev_score) + 1) + server.hset(field, date, int(prev_score) + 1) else: - r_serv1.hset(field, date, 1) + server.hset(field, date, 1) + +def analyse_and_progression(server, field_name): + field = url_parsed[field_name] + if field is not None: + prev_score = server.hget(field, date) + if prev_score is not None: + print field + ' prev_score:' + prev_score + server.hset(field, date, int(prev_score) + 1) + if int(prev_score) + 1 > threshold_need_to_look: #threshold for false possitive + if(check_for_progression(server, field, date)): + to_plot.add(field) + else: + server.hset(field, date, 1) + +def check_for_progression(server, field, date): + previous_data = set() + tot_sum = 0 + for i in range(0, range_to_look): + curr_value = server.hget(field, Date(date).substract_day(i)) + if curr_value is None: #no further data + break + else: + curr_value = int(curr_value) + previous_data.add(curr_value) + tot_sum += curr_value + if i == 0: + today_val = curr_value + + + print 'totsum='+str(tot_sum) + print 'div='+str(tot_sum/today_val) + if tot_sum/today_val >= threshold_to_plot: + return True + else: + return False + +def clean_to_plot(): + temp_to_plot = set() + curr_date = datetime.date.today() + date = Date(str(curr_date.year)+str(curr_date.month)+str(curr_date.day)) + + for elem in to_plot: + if(check_for_progression(field, date)): + temp_to_plot.add(elem) + to_plot = temp_to_plot if __name__ == '__main__': # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) @@ -45,6 +98,11 @@ if __name__ == '__main__': host=p.config.get("Redis_Level_DB", "host"), port=p.config.get("Redis_Level_DB", "port"), db=p.config.get("Redis_Level_DB", "db")) + + r_serv2 = redis.StrictRedis( + host=p.config.get("Redis_Level_DB_Domain", "host"), + port=p.config.get("Redis_Level_DB_Domain", "port"), + db=p.config.get("Redis_Level_DB_Domain", "db")) # FILE CURVE SECTION # csv_path_proto = os.path.join(os.environ['AIL_HOME'], @@ -57,6 +115,10 @@ if __name__ == '__main__': tldsfile_path = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "tldsfile")) + csv_path_domain = os.path.join(os.environ['AIL_HOME'], + p.config.get("Directories", "domainstrending_csv")) + + faup = Faup() generate_new_graph = False # Endless loop getting messages from the input queue @@ -71,17 +133,22 @@ if __name__ == '__main__': today = datetime.date.today() year = today.year month = today.month - + print 'b1' lib_words.create_curve_with_word_file(r_serv1, csv_path_proto, protocolsfile_path, year, month) - + print 'b2' lib_words.create_curve_with_word_file(r_serv1, csv_path_tld, tldsfile_path, year, month) - + print 'b3' + lib_words.create_curve_with_list(r_serv2, csv_path_domain, + to_plot, year, + month) + print 'end building' publisher.debug("{} queue is empty, waiting".format(config_section)) - time.sleep(1) + print 'sleeping' + time.sleep(5) continue else: @@ -91,5 +158,8 @@ if __name__ == '__main__': faup.decode(url) url_parsed = faup.get() - analyse('scheme') #Scheme analysis - analyse('tld') #Tld analysis + analyse(r_serv1, 'scheme') #Scheme analysis + analyse(r_serv1, 'tld') #Tld analysis + analyse_and_progression(r_serv2, 'domain') #Domain analysis + print "to_plot:" + print to_plot diff --git a/bin/packages/Date.py b/bin/packages/Date.py index 4abb0910..ce02636a 100644 --- a/bin/packages/Date.py +++ b/bin/packages/Date.py @@ -30,3 +30,12 @@ class Date(object): def _set_day(self, day): self.day = day + + def substract_day(self, numDay): + import datetime + computed_date = datetime.date(int(self.year), int(self.month), int(self.day)) - datetime.timedelta(numDay) + comp_year = str(computed_date.year) + comp_month = str(computed_date.month).zfill(2) + comp_day = str(computed_date.day).zfill(2) + return comp_year + comp_month + comp_day + diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 3b81c7fe..cc5c41ad 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -186,7 +186,9 @@ class Paste(object): if the paste doesn't contain any human dictionnary words ..seealso: git@github.com:saffsd/langid.py.git - """ + FIXME: This procedure is using more than 20% of CPU + + """ identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) return identifier.classify(self.get_p_content()) diff --git a/bin/packages/lib_words.py b/bin/packages/lib_words.py index 3c065ed0..b2cf418b 100644 --- a/bin/packages/lib_words.py +++ b/bin/packages/lib_words.py @@ -81,13 +81,14 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month to keep the timeline of the curve correct. """ + threshold = 50 first_day = date(year, month, 01) last_day = date(year, month, calendar.monthrange(year, month)[1]) words = [] with open(feederfilename, 'rb') as f: # words of the files - words = sorted([word.strip() for word in f]) + words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' ]) headers = ['Date'] + words with open(csvfilename+'.csv', 'wb') as f: @@ -102,6 +103,47 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month # from the 1srt day to the last of the list for word in words: value = r_serv.hget(word, curdate) + if value is None: + row.append(0) + else: + # if the word have a value for the day + # FIXME Due to performance issues (too many tlds, leads to more than 7s to perform this procedure), I added a threshold + if value >= threshold: + row.append(value) + writer.writerow(row) + +def create_curve_with_list(server, csvfilename, to_plot, year, month): + """Create a csv file used with dygraph. + + :param r_serv: -- connexion to redis database + :param csvfilename: -- the path to the .csv file created + :param to_plot: -- the list which contain a words to plot. + :param year: -- (integer) The year to process + :param month: -- (integer) The month to process + + This function create a .csv file using datas in redis. + It's checking if the words contained in to_plot and + their respectives values by days exists. + + """ + + first_day = date(year, month, 01) + last_day = date(year, month, calendar.monthrange(year, month)[1]) + words = sorted(to_plot) + + headers = ['Date'] + words + with open(csvfilename+'.csv', 'wb') as f: + writer = csv.writer(f) + writer.writerow(headers) + + # for each days + for dt in rrule(DAILY, dtstart=first_day, until=last_day): + row = [] + curdate = dt.strftime("%Y%m%d") + row.append(curdate) + # from the 1srt day to the last of the list + for word in words: + value = server.hget(word, curdate) if value is None: row.append(0) else: diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 9c152cab..a11b1920 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -7,7 +7,9 @@ import json from flask import Flask, render_template, jsonify, request import flask import os - +import sys +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) +import Paste # CONFIG # configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') @@ -18,6 +20,7 @@ if not os.path.exists(configfile): cfg = ConfigParser.ConfigParser() cfg.read(configfile) +max_preview_char = 500 # REDIS # r_serv = redis.StrictRedis( @@ -49,6 +52,10 @@ def get_queues(r): r.hgetall("queues").iteritems()] +def list_len(s): + return len(s) +app.jinja_env.filters['list_len'] = list_len + @app.route("/_logs") def logs(): return flask.Response(event_stream(), mimetype="text/event-stream") @@ -65,6 +72,7 @@ def search(): q = [] q.append(query) r = [] + c = [] # Search from whoosh import index from whoosh.fields import Schema, TEXT, ID @@ -78,7 +86,10 @@ def search(): results = searcher.search(query, limit=None) for x in results: r.append(x.items()[0][1]) - return render_template("search.html", r=r) + content = Paste.Paste(x.items()[0][1]).get_p_content() + content_range = max_preview_char if len(content)>max_preview_char else len(content)-1 + c.append(content[0:content_range]) + return render_template("search.html", r=r, c=c) @app.route("/") def index(): @@ -104,6 +115,10 @@ def protocolstrending(): def tldstrending(): return render_template("Tldstrending.html") +@app.route("/showsavedpaste/") +def showsavedpaste(): + return render_template("show_saved_paste.html") + if __name__ == "__main__": app.run(host='0.0.0.0', port=7000, threaded=True) diff --git a/var/www/templates/Tldstrending.html b/var/www/templates/Tldstrending.html index 6bc96cba..fe86001c 100644 --- a/var/www/templates/Tldstrending.html +++ b/var/www/templates/Tldstrending.html @@ -130,7 +130,7 @@ diff --git a/var/www/templates/search.html b/var/www/templates/search.html index feb323dd..b9b7caeb 100644 --- a/var/www/templates/search.html +++ b/var/www/templates/search.html @@ -16,6 +16,16 @@ + @@ -39,6 +49,26 @@ + + + + + +
@@ -53,10 +83,26 @@
- - {% for result in r %} - +
{{ result }}
+ + + + + + + + + {% set i = 0 %} + {% for path in r %} + {% set prev_content = c[i] %} + + + + + + {% set i = i + 1 %} {% endfor %} +
#PathAction
{{ i + 1 }} {{ path }}

@@ -69,4 +115,23 @@ + + + + + diff --git a/var/www/templates/show_saved_paste.html b/var/www/templates/show_saved_paste.html new file mode 100644 index 00000000..0468ca2a --- /dev/null +++ b/var/www/templates/show_saved_paste.html @@ -0,0 +1,48 @@ + + + + Paste information + + + + + +

Paste:

+

{{ request.args.get('paste') }}

+ +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DateSourceEncodingLanguageSizeMimeNumber of line
JohnDoejohn@example.com
MaryMoemary@example.com
JulyDooleyjuly@example.com
+ + + +