mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 08:38:28 +00:00
Added drop of really long lines in the sentiment-analysis module + added a description of the sentiment module. Also fixed a bug in the sentiment-trending webpage concerning avg and date range.
This commit is contained in:
parent
1084e45f1b
commit
894b9efda9
3 changed files with 86 additions and 54 deletions
|
@ -2,12 +2,16 @@
|
|||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
Sentiment analyser module.
|
||||
It takes its inputs from 'shortLine' and 'longLine'.
|
||||
Source code is taken into account (in case of comments). If it is only source code,
|
||||
it will be treated with a neutral value anyway.
|
||||
It takes its inputs from 'global'.
|
||||
|
||||
nltk.sentiment.vader module:
|
||||
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||
The content analysed comes from the pastes with length of the line
|
||||
above a defined threshold removed (get_p_content_with_removed_lines).
|
||||
This is done because the NLTK sentence tokenizer (sent_tokenize) seems to crash
|
||||
for long lines (function _slices_from_text line#1276).
|
||||
|
||||
|
||||
nltk.sentiment.vader module credit:
|
||||
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||
|
||||
"""
|
||||
|
||||
|
@ -25,23 +29,27 @@ from nltk import tokenize
|
|||
|
||||
# Config Variables
|
||||
accepted_Mime_type = ['text/plain']
|
||||
size_threshold = 250
|
||||
line_max_length_threshold = 1000
|
||||
|
||||
def Analyse(message, server):
|
||||
#print 'analyzing'
|
||||
path = message
|
||||
paste = Paste.Paste(path)
|
||||
|
||||
content = paste.get_p_content()
|
||||
# get content with removed line + number of them
|
||||
num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)
|
||||
provider = paste.p_source
|
||||
p_date = str(paste._get_p_date())
|
||||
p_MimeType = paste._get_p_encoding()
|
||||
|
||||
# Perform further analysis
|
||||
if p_MimeType == "text/plain":
|
||||
if isJSON(content):
|
||||
if isJSON(p_content):
|
||||
p_MimeType = "JSON"
|
||||
|
||||
if p_MimeType in accepted_Mime_type:
|
||||
|
||||
print 'Processing', path
|
||||
the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
|
||||
#print 'pastedate: ', the_date
|
||||
|
@ -53,54 +61,54 @@ def Analyse(message, server):
|
|||
timestamp = calendar.timegm(combined_datetime.timetuple())
|
||||
#print 'timestamp: ', timestamp
|
||||
|
||||
sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))
|
||||
sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))
|
||||
#print len(sentences)
|
||||
|
||||
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
|
||||
neg_line = 0
|
||||
pos_line = 0
|
||||
sid = SentimentIntensityAnalyzer()
|
||||
for sentence in sentences:
|
||||
ss = sid.polarity_scores(sentence)
|
||||
for k in sorted(ss):
|
||||
if k == 'compound':
|
||||
if ss['neg'] > ss['pos']:
|
||||
avg_score['compoundNeg'] += ss[k]
|
||||
neg_line += 1
|
||||
if len(sentences) > 0:
|
||||
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
|
||||
neg_line = 0
|
||||
pos_line = 0
|
||||
sid = SentimentIntensityAnalyzer()
|
||||
for sentence in sentences:
|
||||
ss = sid.polarity_scores(sentence)
|
||||
for k in sorted(ss):
|
||||
if k == 'compound':
|
||||
if ss['neg'] > ss['pos']:
|
||||
avg_score['compoundNeg'] += ss[k]
|
||||
neg_line += 1
|
||||
else:
|
||||
avg_score['compoundPos'] += ss[k]
|
||||
pos_line += 1
|
||||
else:
|
||||
avg_score['compoundPos'] += ss[k]
|
||||
pos_line += 1
|
||||
else:
|
||||
avg_score[k] += ss[k]
|
||||
avg_score[k] += ss[k]
|
||||
|
||||
#print('{0}: {1}, '.format(k, ss[k]))
|
||||
#print('{0}: {1}, '.format(k, ss[k]))
|
||||
|
||||
for k in avg_score:
|
||||
if k == 'compoundPos':
|
||||
avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
|
||||
elif k == 'compoundNeg':
|
||||
avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
|
||||
else:
|
||||
avg_score[k] = avg_score[k] / len(sentences)
|
||||
for k in avg_score:
|
||||
if k == 'compoundPos':
|
||||
avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
|
||||
elif k == 'compoundNeg':
|
||||
avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
|
||||
else:
|
||||
avg_score[k] = avg_score[k] / len(sentences)
|
||||
|
||||
|
||||
# In redis-levelDB: {} = set, () = K-V
|
||||
# {Provider_set -> provider_i}
|
||||
# {Provider_TimestampInHour_i -> UniqID_i}_j
|
||||
# (UniqID_i -> PasteValue_i)
|
||||
# In redis-levelDB: {} = set, () = K-V
|
||||
# {Provider_set -> provider_i}
|
||||
# {Provider_TimestampInHour_i -> UniqID_i}_j
|
||||
# (UniqID_i -> PasteValue_i)
|
||||
|
||||
server.sadd('Provider_set', provider)
|
||||
#print 'Provider_set', provider
|
||||
server.sadd('Provider_set', provider)
|
||||
#print 'Provider_set', provider
|
||||
|
||||
provider_timestamp = provider + '_' + str(timestamp)
|
||||
#print provider_timestamp
|
||||
server.incr('UniqID')
|
||||
UniqID = server.get('UniqID')
|
||||
print provider_timestamp, '->', UniqID
|
||||
server.sadd(provider_timestamp, UniqID)
|
||||
server.set(UniqID, avg_score)
|
||||
print avg_score
|
||||
#print UniqID, '->', avg_score
|
||||
provider_timestamp = provider + '_' + str(timestamp)
|
||||
#print provider_timestamp
|
||||
server.incr('UniqID')
|
||||
UniqID = server.get('UniqID')
|
||||
print provider_timestamp, '->', UniqID, 'dropped', num_line_removed, 'lines'
|
||||
server.sadd(provider_timestamp, UniqID)
|
||||
server.set(UniqID, avg_score)
|
||||
#print UniqID, '->', avg_score
|
||||
else:
|
||||
print 'Dropped:', p_MimeType
|
||||
|
||||
|
@ -146,3 +154,4 @@ if __name__ == '__main__':
|
|||
|
||||
# Do something with the message from the queue
|
||||
Analyse(message, server)
|
||||
|
||||
|
|
|
@ -91,6 +91,7 @@ class Paste(object):
|
|||
self.p_langage = None
|
||||
self.p_nb_lines = None
|
||||
self.p_max_length_line = None
|
||||
self.array_line_above_threshold = None
|
||||
self.p_duplicate = None
|
||||
|
||||
def get_p_content(self):
|
||||
|
@ -118,6 +119,21 @@ class Paste(object):
|
|||
def get_p_content_as_file(self):
|
||||
return cStringIO.StringIO(self.get_p_content())
|
||||
|
||||
def get_p_content_with_removed_lines(self, threshold):
    """Return the paste content with over-long lines dropped.

    Lines whose length is >= ``threshold`` characters are removed from
    the returned content; shorter lines are kept verbatim (their trailing
    newline included). This works around NLTK's sentence tokenizer
    (sent_tokenize) crashing on very long lines.

    :param threshold: maximum accepted line length; any line at least
                      this long is dropped.
    :return: tuple ``(num_line_removed, string_content)`` — the number of
             dropped lines and the concatenated remaining content.
    """
    num_line_removed = 0
    kept_lines = []
    f = self.get_p_content_as_file()
    try:
        for line in f:
            if len(line) < threshold:
                kept_lines.append(line)
            else:
                num_line_removed += 1
    finally:
        # the original leaked the file object; always release it
        f.close()

    # join once instead of quadratic string concatenation
    return (num_line_removed, "".join(kept_lines))
|
||||
|
||||
def get_lines_info(self):
|
||||
"""
|
||||
Returning and setting the number of lines and the maximum length of the
|
||||
|
@ -136,10 +152,12 @@ class Paste(object):
|
|||
length = len(line)
|
||||
if length >= max_length_line:
|
||||
max_length_line = length
|
||||
|
||||
f.close()
|
||||
self.p_nb_lines = line_id
|
||||
self.p_max_length_line = max_length_line
|
||||
return (self.p_nb_lines, self.p_max_length_line)
|
||||
|
||||
return (self.p_nb_lines, self.p_max_length_line, array_line_above_threshold)
|
||||
|
||||
def _get_p_encoding(self):
|
||||
"""
|
||||
|
|
|
@ -7,13 +7,14 @@
|
|||
};
|
||||
|
||||
function generate_offset_to_date(day){
|
||||
day = day-1;
|
||||
var now = new Date();
|
||||
var to_ret = {};
|
||||
for(i=0; i<day; i++){
|
||||
for(i=day; i>=0; i--){
|
||||
for(j=0; j<24; j++){
|
||||
var t1 =now.getDate()-i + ":";
|
||||
var t2 =now.getHours()-(23-j)+"h";
|
||||
to_ret[j+24*i] = t1+t2;
|
||||
to_ret[j+24*(day-i)] = t1+t2;
|
||||
}
|
||||
}
|
||||
return to_ret;
|
||||
|
@ -53,6 +54,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
|
|||
|
||||
var all_graph_day_sum = 0.0;
|
||||
var all_graph_hour_sum = 0.0;
|
||||
var all_day_avg = 0.0;
|
||||
|
||||
for (graphNum=0; graphNum<8; graphNum++) {
|
||||
var max_value = 0.0;
|
||||
|
@ -65,7 +67,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
|
|||
var day_sum_elem = 0.0;
|
||||
var hour_sum = 0.0;
|
||||
|
||||
for(curr_date=dateStart; curr_date<dateStart+oneWeek; curr_date+=oneHour){
|
||||
for(curr_date=dateStart+oneHour; curr_date<=dateStart+oneWeek; curr_date+=oneHour){
|
||||
var data_array = data[curr_provider][curr_date];
|
||||
|
||||
if (data_array.length == 0){
|
||||
|
@ -99,7 +101,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
|
|||
curr_sum_elem++;
|
||||
max_value = Math.abs(pos-neg) > max_value ? Math.abs(pos-neg) : max_value;
|
||||
|
||||
if(curr_date >= dateStart+oneWeek-24*oneHour){
|
||||
if(curr_date >= dateStart+oneWeek-23*oneHour){
|
||||
day_sum += (pos-neg);
|
||||
day_sum_elem++;
|
||||
}
|
||||
|
@ -150,11 +152,13 @@ $.getJSON("/sentiment_analysis_getplotdata/",
|
|||
sparklineOptions.barWidth = 18;
|
||||
sparklineOptions.tooltipFormat = '<span style="color: {{color}}">●</span> Avg: {{value}} </span>'
|
||||
//var day_avg = day_sum/24;
|
||||
var day_avg = day_sum/day_sum_elem;
|
||||
var day_avg = isNaN(day_sum/day_sum_elem) ? 0 : day_sum/day_sum_elem;
|
||||
var day_avg_text = isNaN(day_sum/day_sum_elem) ? 'No data' : (day_avg).toFixed(5);
|
||||
all_day_avg += day_avg;
|
||||
$(placeholder+'b').sparkline([day_avg], sparklineOptions);
|
||||
sparklineOptions.tooltipFormat = '<span style="color: {{color}}">●</span> {{offset:names}}, {{value}} </span>'
|
||||
sparklineOptions.barWidth = 2;
|
||||
$(placeholder+'s').text((day_avg).toFixed(5));
|
||||
$(placeholder+'s').text(day_avg_text);
|
||||
|
||||
}//for loop
|
||||
|
||||
|
@ -197,7 +201,8 @@ $.getJSON("/sentiment_analysis_getplotdata/",
|
|||
gaugeOptions2.appendTo = '#gauge_today_last_days';
|
||||
gaugeOptions2.dialLabel = 'Today';
|
||||
gaugeOptions2.elementId = 'gauge2';
|
||||
piePercent = (all_graph_day_sum / (8*24)) / max_value;
|
||||
//piePercent = (all_graph_day_sum / (8*24)) / max_value;
|
||||
piePercent = (all_day_avg / 8) / max_value;
|
||||
gaugeOptions2.inc = piePercent;
|
||||
var gauge_today_last_days = new FlexGauge(gaugeOptions2);
|
||||
|
||||
|
|
Loading…
Reference in a new issue