From 1a5158b0812fa163ef7066f8551404b94215f740 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Sat, 13 Aug 2016 15:24:57 +0200 Subject: [PATCH] Added sentiment analyser module (draft) --- bin/LAUNCH.sh | 2 + bin/SentimentAnalyser.py | 89 +++++- bin/WebStats.py | 1 + bin/packages/modules.cfg | 3 + var/www/Flask_server.py | 37 +++ var/www/static/js/sentiment_trending.js | 162 +++++++--- .../sentiment_analysis_trending.html | 280 ++++++++++++++++-- 7 files changed, 486 insertions(+), 88 deletions(-) mode change 100644 => 100755 bin/SentimentAnalyser.py diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 7f1220c9..b2f4029a 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -151,6 +151,8 @@ function launching_scripts { screen -S "Script" -X screen -t "SQLInjectionDetection" bash -c './SQLInjectionDetection.py; read x' sleep 0.1 screen -S "Script" -X screen -t "Browse_warning_paste" bash -c './Browse_warning_paste.py; read x' + sleep 0.1 + screen -S "Script" -X screen -t "SentimentAnalyser" bash -c './SentimentAnalyser.py; read x' } #If no params, display the help diff --git a/bin/SentimentAnalyser.py b/bin/SentimentAnalyser.py old mode 100644 new mode 100755 index efb2ede6..eb89ddc3 --- a/bin/SentimentAnalyser.py +++ b/bin/SentimentAnalyser.py @@ -1,7 +1,10 @@ #!/usr/bin/env python2 # -*-coding:UTF-8 -* """ - Template for new modules + Sentiment analyser module. + It takes its inputs from 'shortLine' and 'longLine'. + Source code is taken into account (in case of comments). If it is only source code, + it will be treated with a neutral value anyway. nltk.sentiment.vader module: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. @@ -9,27 +12,87 @@ nltk.sentiment.vader module: """ import time +import datetime +import calendar +import redis from pubsublogger import publisher from Helper import Process +from packages import Paste from nltk.sentiment.vader import SentimentIntensityAnalyzer from nltk import tokenize -def Analyse(message): +def Analyse(message, server): + #print 'analyzing' path = message - paste = Paste.paste(path) - content = paste.p_get_content() + paste = Paste.Paste(path) + + content = paste.get_p_content() + provider = paste.p_source + p_date = str(paste._get_p_date()) + #print provider, date + + the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8])) + #print 'pastedate: ', the_date + the_time = datetime.datetime.now() + the_time = datetime.time(getattr(the_time, 'hour'), 0, 0) + #print 'now: ', the_time + combined_datetime = datetime.datetime.combine(the_date, the_time) + #print 'combined: ', combined_datetime + timestamp = calendar.timegm(combined_datetime.timetuple()) + #print 'timestamp: ', timestamp sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore')) - + #print len(sentences) + + avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0} + neg_line = 0 + pos_line = 0 sid = SentimentIntensityAnalyzer() for sentence in sentences: - print(sentence) ss = sid.polarity_scores(sentence) for k in sorted(ss): - print('{0}: {1}, '.format(k, ss[k])) - print '' + if k == 'compound': + if ss['neg'] > ss['pos']: + avg_score['compoundNeg'] += ss[k] + neg_line += 1 + else: + avg_score['compoundPos'] += ss[k] + pos_line += 1 + else: + avg_score[k] += ss[k] + + #print('{0}: {1}, '.format(k, ss[k])) + + for k in avg_score: + if k == 'compoundPos': + avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1) + elif k == 'compoundNeg': + avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1) + else: + avg_score[k] = avg_score[k] / len(sentences) + + + # In redis-levelDB: {} = set, () = K-V + # {Provider_set -> provider_i} + # {Provider_TimestampInHour_i -> UniqID_i}_j + # (UniqID_i -> PasteValue_i) + + server.sadd('Provider_set', provider) + #print 'Provider_set', provider + + provider_timestamp = provider + '_' + str(timestamp) + #print provider_timestamp + server.incr('UniqID') + UniqID = server.get('UniqID') + print provider_timestamp, '->', UniqID + server.sadd(provider_timestamp, UniqID) + server.set(UniqID, avg_score) + #print UniqID, '->', avg_score + + #print '(', provider, timestamp, str(avg_score) , ')' + #server.hset(provider, timestamp, str(avg_score)) if __name__ == '__main__': # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) @@ -39,7 +102,7 @@ if __name__ == '__main__': publisher.channel = 'Script' # Section name in bin/packages/modules.cfg - config_section = '
' + config_section = 'SentimentAnalyser' # Setup the I/O queues p = Process(config_section) @@ -47,6 +110,12 @@ if __name__ == '__main__': # Sent to the logging a description of the module publisher.info("") + # REDIS_LEVEL_DB # + server = redis.StrictRedis( + host=p.config.get("Redis_Level_DB_Sentiment", "host"), + port=p.config.get("Redis_Level_DB_Sentiment", "port"), + db=p.config.get("Redis_Level_DB_Sentiment", "db")) + # Endless loop getting messages from the input queue while True: # Get one message from the input queue @@ -57,4 +126,4 @@ if __name__ == '__main__': continue # Do something with the message from the queue - Analyse(message) + Analyse(message, server) diff --git a/bin/WebStats.py b/bin/WebStats.py index 837fa2fe..333e7c35 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -82,6 +82,7 @@ def compute_progression(server, field_name, num_day, url_parsed): member_set = [] for keyw in server.smembers(redis_progression_name_set): member_set += (keyw, int(server.hget(redis_progression_name, keyw))) + print member_set member_set.sort(key=lambda tup: tup[1]) if member_set[0] < keyword_increase: #remove min from set and add the new one diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 84567dfd..57a85e73 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -66,6 +66,9 @@ subscribe = Redis_BrowseWarningPaste #subscribe = Redis_Cve #publish = Redis_BrowseWarningPaste +[SentimentAnalyser] +subscribe = Redis_LinesLong + [Release] subscribe = Redis_Global diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 165bb7b6..94abfbcd 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -5,6 +5,7 @@ import redis import ConfigParser import json import datetime +import calendar from flask import Flask, render_template, jsonify, request import flask import os @@ -49,6 +50,12 @@ r_serv_db = redis.StrictRedis( port=cfg.getint("Redis_Level_DB", "port"), db=cfg.getint("Redis_Level_DB", "db")) +r_serv_sentiment = redis.StrictRedis( + host=cfg.get("Redis_Level_DB_Sentiment", "host"), + port=cfg.getint("Redis_Level_DB_Sentiment", "port"), + db=cfg.getint("Redis_Level_DB_Sentiment", "db")) + + app = Flask(__name__, static_url_path='/static/') @@ -433,6 +440,36 @@ def moduletrending(): def sentiment_analysis_trending(): return render_template("sentiment_analysis_trending.html") + +@app.route("/sentiment_analysis_getplotdata/") +def sentiment_analysis_getplotdata(): + # Get the top providers based on number of pastes + oneHour = 60*60 + sevenDays = oneHour*24*7 + dateStart = datetime.datetime.now() + dateStart = dateStart.replace(minute=0, second=0, microsecond=0) + dateStart_timestamp = calendar.timegm(dateStart.timetuple()) + + to_return = {} + for cur_provider in r_serv_charts.smembers('providers_set'): + cur_provider_name = cur_provider + '_' + list_date = {} + for cur_timestamp in range(int(dateStart_timestamp), int(dateStart_timestamp)-sevenDays-oneHour, -oneHour): + cur_set_name = cur_provider_name + str(cur_timestamp) + + list_value = [] + for cur_id in r_serv_sentiment.smembers(cur_set_name): + cur_value = r_serv_sentiment.get(cur_id) + list_value.append(cur_value) + list_date[cur_timestamp] = list_value + to_return[cur_provider] = list_date + + return jsonify(to_return) + + + + + @app.route("/sentiment_analysis_plot_tool/") def sentiment_analysis_plot_tool(): return render_template("sentiment_analysis_plot_tool.html") diff --git a/var/www/static/js/sentiment_trending.js b/var/www/static/js/sentiment_trending.js index 53f7da5a..df4581e4 100644 --- a/var/www/static/js/sentiment_trending.js +++ b/var/www/static/js/sentiment_trending.js @@ -1,4 +1,117 @@ +/* ---------- Sparkline Charts ---------- */ +//generate random number for charts +randNum = function(){ + var num = Math.random(); + if(num > 0.5) + num = -1+num; + //console.log(Math.floor(num*101)); + return Math.floor(num*101); + //return (Math.floor( Math.random()* (1+40-20) ) ) + 20; +} + + + + var sparklineOptions = { + height: 80,//Height of the chart - Defaults to 'auto' (line height of the containing tag) + + chartRangeMin: -1, + chartRangeMax: 1, + + type: 'bar', + barSpacing: 0, + barWidth: 2, + barColor: '#00bf5f', + negBarColor: '#f22929', + zeroColor: '#ffff00' +}; + + +$.getJSON("/sentiment_analysis_getplotdata/", + function(data) { + //console.log(data); + var all_plot_data = []; + var plot_data = []; + var array_provider = Object.keys(data); + var dates_providers = Object.keys(data[array_provider[0]]); + var dateStart = parseInt(dates_providers[0]); + var oneHour = 60*60; + var oneWeek = oneHour*24*7; + + for (graphNum=0; graphNum<8; graphNum++) { + var graph_data = []; + var spark_data = []; + var curr_provider = array_provider[graphNum]; + + for(curr_date=dateStart; curr_date 0.5) - num = -1+num; - console.log(Math.floor(num*101)); - return Math.floor(num*101); - //return (Math.floor( Math.random()* (1+40-20) ) ) + 20; -} - - - - var sparklineOptions = { - width: 250,//Width of the chart - Defaults to 'auto' - May be any valid css width - 1.5em, 20px, etc (using a number without a unit specifier won't do what you want) - This option does nothing for bar and tristate chars (see barWidth) - height: 80,//Height of the chart - Defaults to 'auto' (line height of the containing tag) - type: 'bar', - barSpacing: 0, - barWidth: 10, - barColor: '#00bf5f', - negBarColor: '#f22929', - zeroColor: '#ffff00' - }; - - -//sparklines (making loop with random data for all 10 sparkline) -i=1; -for (i=1; i<10; i++) { - var data = [3+randNum(), 5+randNum(), 8+randNum(), 11+randNum(),14+randNum(),17+randNum(),20+randNum(),15+randNum(),18+randNum(),22+randNum()]; - placeholder = '.sparkLineStatsToday' + i; - - $(placeholder).sparkline(data, sparklineOptions); - -} - -//sparklines (making loop with random data for all 10 sparkline) -i=1; -for (i=1; i<10; i++) { - var data = [3+randNum(), 5+randNum(), 8+randNum(), 11+randNum(),14+randNum(),17+randNum(),20+randNum(),15+randNum(),18+randNum(),22+randNum()]; - placeholder = '.sparkLineStatsWeek' + i; - - $(placeholder).sparkline(data, sparklineOptions); - -} - - /* ----------- CanvasJS ------------ */ var options_canvasJS = { diff --git a/var/www/templates/sentiment_analysis_trending.html b/var/www/templates/sentiment_analysis_trending.html index 7cc730ef..98a89e07 100644 --- a/var/www/templates/sentiment_analysis_trending.html +++ b/var/www/templates/sentiment_analysis_trending.html @@ -21,12 +21,16 @@ strong { font-size: 16px; } + + .table { + margin-bottom: 0px; + } .sparkLineStats ul { padding-left:0; list-style:none } - + .sparkLineStats { position: relative; margin-bottom: -4px; @@ -46,9 +50,8 @@ margin-right: 5px; } - .wellInside { - background-color: #dedede; - padding: 12px; + .panelInside { + padding: 5px; } .fg-dial-label { @@ -107,26 +110,134 @@
-
+
-
    -
  • Graph 1
  • -
  • Graph 2
  • -
  • Graph 3
  • -
  • Graph 4
  • -
+
+
+ Graph 1 +
+
+ + + + + + + +
+
+
+
+
+ Graph 2 +
+
+ + + + + + + +
+
+
+
+
+ Graph 3 +
+
+ + + + + + + +
+
+
+
+
+ Graph 4 +
+
+ + + + + + + +
+
+
-
    -
  • Graph 5
  • -
  • Graph 6
  • -
  • Graph 7
  • -
  • Graph 8
  • -
+
+
+ Graph 5 +
+
+ + + + + + + +
+
+
+
+
+ Graph 6 +
+
+ + + + + + + +
+
+
+
+
+ Graph 7 +
+
+ + + + + + + +
+
+
+
+
+ Graph 8 +
+
+ + + + + + + +
+
+
@@ -167,26 +278,134 @@
-
+
-
    -
  • Graph 1
  • -
  • Graph 2
  • -
  • Graph 3
  • -
  • Graph 4
  • -
+
+
+ Graph 1 +
+
+ + + + + + + +
+
+
+
+
+ Graph 2 +
+
+ + + + + + + +
+
+
+
+
+ Graph 3 +
+
+ + + + + + + +
+
+
+
+
+ Graph 4 +
+
+ + + + + + + +
+
+
-
    -
  • Graph 5
  • -
  • Graph 6
  • -
  • Graph 7
  • -
  • Graph 8
  • -
+
+
+ Graph 5 +
+
+ + + + + + + +
+
+
+
+
+ Graph 6 +
+
+ + + + + + + +
+
+
+
+
+ Graph 7 +
+
+ + + + + + + +
+
+
+
+
+ Graph 8 +
+
+ + + + + + + +
+
+
@@ -252,6 +471,7 @@