From 894b9efda9fa522cfd8c0d8c5da6f6c0ffea6fb1 Mon Sep 17 00:00:00 2001
From: Mokaddem
Date: Wed, 17 Aug 2016 09:46:25 +0200
Subject: [PATCH] Drop overly long lines in the sentiment-analysis module and
 add a description of the module. Also fix a bug in the sentiment-trending
 web page concerning the average and the date range.

---
 bin/SentimentAnalyser.py                | 101 +++++++++++++-----------
 bin/packages/Paste.py                   |  20 ++++-
 var/www/static/js/sentiment_trending.js |  19 +++--
 3 files changed, 86 insertions(+), 54 deletions(-)

diff --git a/bin/SentimentAnalyser.py b/bin/SentimentAnalyser.py
index 8b48610f..63f6c095 100755
--- a/bin/SentimentAnalyser.py
+++ b/bin/SentimentAnalyser.py
@@ -2,12 +2,16 @@
 # -*-coding:UTF-8 -*
 """
     Sentiment analyser module.
-    It takes its inputs from 'shortLine' and 'longLine'.
-    Source code is taken into account (in case of comments). If it is only source code,
-    it will be treated with a neutral value anyway.
+    It takes its inputs from 'global'.
 
-nltk.sentiment.vader module:
-    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+    The content analysed comes from the paste, with any line whose length is
+    above a defined threshold removed (get_p_content_with_removed_lines).
+    This is done because the NLTK sentence tokenizer (sent_tokenize) seems to
+    crash on long lines (function _slices_from_text, line#1276).
+
+
+    nltk.sentiment.vader module credit:
+    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
 
 """
@@ -25,23 +29,27 @@ from nltk import tokenize
 
 # Config Variables
 accepted_Mime_type = ['text/plain']
+size_threshold = 250
+line_max_length_threshold = 1000
 
 def Analyse(message, server):
     #print 'analyzing'
     path = message
     paste = Paste.Paste(path)
-    content = paste.get_p_content()
+    # get the content with overly long lines removed, plus how many were removed
+    num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)
     provider = paste.p_source
     p_date = str(paste._get_p_date())
     p_MimeType = paste._get_p_encoding()
 
     # Perform further analysis
     if p_MimeType == "text/plain":
-        if isJSON(content):
+        if isJSON(p_content):
             p_MimeType = "JSON"
 
     if p_MimeType in accepted_Mime_type:
+        print 'Processing', path
         the_date = datetime.date(int(p_date[0:4]),
                                  int(p_date[4:6]),
                                  int(p_date[6:8]))
         #print 'pastedate: ', the_date
@@ -53,54 +61,54 @@ def Analyse(message, server):
         timestamp = calendar.timegm(combined_datetime.timetuple())
         #print 'timestamp: ', timestamp
 
-        sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))
+        sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))
         #print len(sentences)
 
-        avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
-        neg_line = 0
-        pos_line = 0
-        sid = SentimentIntensityAnalyzer()
-        for sentence in sentences:
-            ss = sid.polarity_scores(sentence)
-            for k in sorted(ss):
-                if k == 'compound':
-                    if ss['neg'] > ss['pos']:
-                        avg_score['compoundNeg'] += ss[k]
-                        neg_line += 1
+        if len(sentences) > 0:
+            avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
+            neg_line = 0
+            pos_line = 0
+            sid = SentimentIntensityAnalyzer()
+            for sentence in sentences:
+                ss = sid.polarity_scores(sentence)
+                for k in sorted(ss):
+                    if k == 'compound':
+                        if ss['neg'] > ss['pos']:
+                            avg_score['compoundNeg'] += ss[k]
+                            neg_line += 1
+                        else:
+                            avg_score['compoundPos'] += ss[k]
+                            pos_line += 1
                     else:
-                        avg_score['compoundPos'] += ss[k]
-                        pos_line += 1
-                else:
-                    avg_score[k] += ss[k]
+                        avg_score[k] += ss[k]
 
-                #print('{0}: {1}, '.format(k, ss[k]))
+                    #print('{0}: {1}, '.format(k, ss[k]))
 
-        for k in avg_score:
-            if k == 'compoundPos':
-                avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
-            elif k == 'compoundNeg':
-                avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
-            else:
-                avg_score[k] = avg_score[k] / len(sentences)
+            for k in avg_score:
+                if k == 'compoundPos':
+                    avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
+                elif k == 'compoundNeg':
+                    avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
+                else:
+                    avg_score[k] = avg_score[k] / len(sentences)
 
-        # In redis-levelDB: {} = set, () = K-V
-        # {Provider_set -> provider_i}
-        # {Provider_TimestampInHour_i -> UniqID_i}_j
-        # (UniqID_i -> PasteValue_i)
+            # In redis-levelDB: {} = set, () = K-V
+            # {Provider_set -> provider_i}
+            # {Provider_TimestampInHour_i -> UniqID_i}_j
+            # (UniqID_i -> PasteValue_i)
 
-        server.sadd('Provider_set', provider)
-        #print 'Provider_set', provider
+            server.sadd('Provider_set', provider)
+            #print 'Provider_set', provider
 
-        provider_timestamp = provider + '_' + str(timestamp)
-        #print provider_timestamp
-        server.incr('UniqID')
-        UniqID = server.get('UniqID')
-        print provider_timestamp, '->', UniqID
-        server.sadd(provider_timestamp, UniqID)
-        server.set(UniqID, avg_score)
-        print avg_score
-        #print UniqID, '->', avg_score
+            provider_timestamp = provider + '_' + str(timestamp)
+            #print provider_timestamp
+            server.incr('UniqID')
+            UniqID = server.get('UniqID')
+            print provider_timestamp, '->', UniqID, 'dropped', num_line_removed, 'lines'
+            server.sadd(provider_timestamp, UniqID)
+            server.set(UniqID, avg_score)
+            #print UniqID, '->', avg_score
 
     else:
         print 'Dropped:', p_MimeType
@@ -146,3 +154,4 @@ if __name__ == '__main__':
 
         # Do something with the message from the queue
         Analyse(message, server)
+
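For readers skimming the patch, the scoring scheme above reduces to the following standalone sketch. It assumes nltk is installed together with its 'punkt' and 'vader_lexicon' resources; the function name average_scores is illustrative and not part of the module.

    # Minimal sketch of the averaging done in Analyse(), assuming the NLTK
    # 'punkt' and 'vader_lexicon' data have been downloaded.
    from nltk import tokenize
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    def average_scores(text):
        sentences = tokenize.sent_tokenize(text)
        if not sentences:
            return None
        avg = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0,
               'compoundPos': 0.0, 'compoundNeg': 0.0}
        pos_line = 0
        neg_line = 0
        sid = SentimentIntensityAnalyzer()
        for sentence in sentences:
            ss = sid.polarity_scores(sentence)  # keys: 'neg', 'neu', 'pos', 'compound'
            # the compound score is credited to whichever side dominates
            if ss['neg'] > ss['pos']:
                avg['compoundNeg'] += ss['compound']
                neg_line += 1
            else:
                avg['compoundPos'] += ss['compound']
                pos_line += 1
            for k in ('neg', 'neu', 'pos'):
                avg[k] += ss[k]
        # compound averages are per dominant-side sentence, the rest per sentence
        avg['compoundPos'] /= (pos_line if pos_line > 0 else 1)
        avg['compoundNeg'] /= (neg_line if neg_line > 0 else 1)
        for k in ('neg', 'neu', 'pos'):
            avg[k] /= len(sentences)
        return avg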
diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py
index f03114f1..eb5dd320 100755
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@@ -91,6 +91,7 @@ class Paste(object):
         self.p_langage = None
         self.p_nb_lines = None
         self.p_max_length_line = None
+        self.array_line_above_threshold = None
         self.p_duplicate = None
 
     def get_p_content(self):
@@ -118,6 +119,21 @@ class Paste(object):
     def get_p_content_as_file(self):
         return cStringIO.StringIO(self.get_p_content())
 
+    def get_p_content_with_removed_lines(self, threshold):
+        num_line_removed = 0
+        line_length_threshold = threshold
+        string_content = ""
+        f = self.get_p_content_as_file()
+        line_id = 0
+        for line_id, line in enumerate(f):
+            length = len(line)
+            if length < line_length_threshold:
+                string_content += line
+            else:
+                num_line_removed += 1
+
+        return (num_line_removed, string_content)
+
     def get_lines_info(self):
         """
         Returning and setting the number of lines and the maximum lenght of the
@@ -136,10 +152,12 @@ class Paste(object):
             length = len(line)
             if length >= max_length_line:
                 max_length_line = length
+        f.close()
 
         self.p_nb_lines = line_id
         self.p_max_length_line = max_length_line
-        return (self.p_nb_lines, self.p_max_length_line)
+
+        return (self.p_nb_lines, self.p_max_length_line, self.array_line_above_threshold)
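The new get_p_content_with_removed_lines() does nothing Paste-specific, so the same filter can be restated on a plain string. A sketch under that assumption (the name remove_long_lines is illustrative, not part of the codebase; the default threshold matches line_max_length_threshold above):

    def remove_long_lines(text, threshold=1000):
        """Return (number of dropped lines, text without lines >= threshold)."""
        kept = []
        num_removed = 0
        for line in text.splitlines(True):  # keepends=True, like iterating a file
            if len(line) < threshold:
                kept.append(line)
            else:
                num_removed += 1
        return (num_removed, ''.join(kept))

    # raw_paste_content is an assumed variable holding the paste body; feeding
    # the filtered text to sent_tokenize avoids the _slices_from_text crash
    # mentioned in the module docstring.
    num_dropped, safe_text = remove_long_lines(raw_paste_content)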
diff --git a/var/www/static/js/sentiment_trending.js b/var/www/static/js/sentiment_trending.js
index ca0eafdd..be85fbdb 100644
--- a/var/www/static/js/sentiment_trending.js
+++ b/var/www/static/js/sentiment_trending.js
@@ -7,13 +7,14 @@
 };
 
 function generate_offset_to_date(day){
+    day = day-1;
     var now = new Date();
     var to_ret = {};
-    for(i=0; i<day; i++){
+    for(i=day; i>=0; i--){
         for(j=0; j<24; j++){
             var t1 =now.getDate()-i + ":";
             var t2 =now.getHours()-(23-j)+"h";
-            to_ret[j+24*i] = t1+t2;
+            to_ret[j+24*(day-i)] = t1+t2;
         }
     }
     return to_ret;
@@ -53,6 +54,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
 
       var all_graph_day_sum = 0.0;
       var all_graph_hour_sum = 0.0;
+      var all_day_avg = 0.0;
 
       for (graphNum=0; graphNum<8; graphNum++) {
           var max_value = 0.0;
@@ -65,7 +67,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
           var day_sum_elem = 0.0;
          var hour_sum = 0.0;
 
-          for(curr_date=dateStart; curr_date<dateStart+oneWeek; curr_date+=oneHour){
+          for(curr_date=dateStart; curr_date<=dateStart+oneWeek; curr_date+=oneHour){
              max_value = Math.abs(pos-neg) > max_value ? Math.abs(pos-neg) : max_value;
 
-             if(curr_date >= dateStart+oneWeek-24*oneHour){
+             if(curr_date >= dateStart+oneWeek-23*oneHour){
                  day_sum += (pos-neg);
                  day_sum_elem++;
              }
@@ -150,11 +152,13 @@ $.getJSON("/sentiment_analysis_getplotdata/",
             sparklineOptions.barWidth = 18;
             sparklineOptions.tooltipFormat = '<b>Avg: {{value}} </b>'
             //var day_avg = day_sum/24;
-            var day_avg = day_sum/day_sum_elem;
+            var day_avg = isNaN(day_sum/day_sum_elem) ? 0 : day_sum/day_sum_elem;
+            var day_avg_text = isNaN(day_sum/day_sum_elem) ? 'No data' : (day_avg).toFixed(5);
+            all_day_avg += day_avg;
             $(placeholder+'b').sparkline([day_avg], sparklineOptions);
             sparklineOptions.tooltipFormat = '<b>{{offset:names}}, {{value}} </b>'
             sparklineOptions.barWidth = 2;
-            $(placeholder+'s').text((day_avg).toFixed(5));
+            $(placeholder+'s').text(day_avg_text);
 
         }//for loop
@@ -197,7 +201,8 @@ $.getJSON("/sentiment_analysis_getplotdata/",
     gaugeOptions2.appendTo = '#gauge_today_last_days';
     gaugeOptions2.dialLabel = 'Today';
     gaugeOptions2.elementId = 'gauge2';
-    piePercent = (all_graph_day_sum / (8*24)) / max_value;
+    //piePercent = (all_graph_day_sum / (8*24)) / max_value;
+    piePercent = (all_day_avg / 8) / max_value;
     gaugeOptions2.inc = piePercent;
     var gauge_today_last_days = new FlexGauge(gaugeOptions2);
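The generate_offset_to_date() fix is easiest to sanity-check numerically: after day = day-1, i runs from day down to 0, so slot 0 gets the oldest hour and slot 24*day+23 the current hour, oldest first. A Python restatement of the same arithmetic (illustrative only; the real code uses JavaScript Date methods, and the parameter names below are assumptions):

    def offset_to_date(day, date_now, hour_now):
        day = day - 1
        labels = {}
        for i in range(day, -1, -1):      # i == day -> oldest day, i == 0 -> today
            for j in range(24):
                labels[j + 24*(day - i)] = '%d:%dh' % (date_now - i, hour_now - (23 - j))
        return labels

    # offset_to_date(7, 17, 12) labels slots 0..167 from oldest to newest,
    # matching the order of the plotted hourly values.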