Added dropping of really long lines in the sentiment-analysis module + added a description of the sentiment module. Also fixed a bug in the webpage sentiment-trending concerning the average and the date range.

Mokaddem 2016-08-17 09:46:25 +02:00
parent 1084e45f1b
commit 894b9efda9
3 changed files with 86 additions and 54 deletions

View file

@@ -2,12 +2,16 @@
 # -*-coding:UTF-8 -*
 """
 Sentiment analyser module.
-It takes its inputs from 'shortLine' and 'longLine'.
-
-nltk.sentiment.vader module:
-Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+It takes its inputs from 'global'.
+Source code is taken into account (in case of comments). If it is only source code,
+it will be treated with a neutral value anyway.
+
+The content analysed comes from the pastes with length of the line
+above a defined threshold removed (get_p_content_with_removed_lines).
+This is done because NLTK sentences tokemnizer (sent_tokenize) seems to crash
+for long lines (function _slices_from_text line#1276).
+
+nltk.sentiment.vader module credit:
+Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
 """
@@ -25,23 +29,27 @@ from nltk import tokenize
 # Config Variables
 accepted_Mime_type = ['text/plain']
+size_threshold = 250
+line_max_length_threshold = 1000

 def Analyse(message, server):
     #print 'analyzing'
     path = message
     paste = Paste.Paste(path)
-    content = paste.get_p_content()
+    # get content with removed line + number of them
+    num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)
     provider = paste.p_source
     p_date = str(paste._get_p_date())
     p_MimeType = paste._get_p_encoding()

     # Perform further analysis
     if p_MimeType == "text/plain":
-        if isJSON(content):
+        if isJSON(p_content):
             p_MimeType = "JSON"

     if p_MimeType in accepted_Mime_type:
         print 'Processing', path
         the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
         #print 'pastedate: ', the_date
@@ -53,54 +61,54 @@ def Analyse(message, server):
         timestamp = calendar.timegm(combined_datetime.timetuple())
         #print 'timestamp: ', timestamp

-        sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))
+        sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))
         #print len(sentences)

-        avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
-        neg_line = 0
-        pos_line = 0
-        sid = SentimentIntensityAnalyzer()
-        for sentence in sentences:
-            ss = sid.polarity_scores(sentence)
-            for k in sorted(ss):
-                if k == 'compound':
-                    if ss['neg'] > ss['pos']:
-                        avg_score['compoundNeg'] += ss[k]
-                        neg_line += 1
-                    else:
-                        avg_score['compoundPos'] += ss[k]
-                        pos_line += 1
-                else:
-                    avg_score[k] += ss[k]
-                #print('{0}: {1}, '.format(k, ss[k]))
-
-        for k in avg_score:
-            if k == 'compoundPos':
-                avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
-            elif k == 'compoundNeg':
-                avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
-            else:
-                avg_score[k] = avg_score[k] / len(sentences)
+        if len(sentences) > 0:
+            avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
+            neg_line = 0
+            pos_line = 0
+            sid = SentimentIntensityAnalyzer()
+            for sentence in sentences:
+                ss = sid.polarity_scores(sentence)
+                for k in sorted(ss):
+                    if k == 'compound':
+                        if ss['neg'] > ss['pos']:
+                            avg_score['compoundNeg'] += ss[k]
+                            neg_line += 1
+                        else:
+                            avg_score['compoundPos'] += ss[k]
+                            pos_line += 1
+                    else:
+                        avg_score[k] += ss[k]
+                    #print('{0}: {1}, '.format(k, ss[k]))
+
+            for k in avg_score:
+                if k == 'compoundPos':
+                    avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
+                elif k == 'compoundNeg':
+                    avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
+                else:
+                    avg_score[k] = avg_score[k] / len(sentences)

         # In redis-levelDB: {} = set, () = K-V
         # {Provider_set -> provider_i}
         # {Provider_TimestampInHour_i -> UniqID_i}_j
         # (UniqID_i -> PasteValue_i)
         server.sadd('Provider_set', provider)
         #print 'Provider_set', provider

         provider_timestamp = provider + '_' + str(timestamp)
         #print provider_timestamp
         server.incr('UniqID')
         UniqID = server.get('UniqID')
-        print provider_timestamp, '->', UniqID
+        print provider_timestamp, '->', UniqID, 'dropped', num_line_removed, 'lines'

         server.sadd(provider_timestamp, UniqID)
         server.set(UniqID, avg_score)
-        print avg_score
         #print UniqID, '->', avg_score
     else:
         print 'Dropped:', p_MimeType
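
For readers unfamiliar with VADER, here is a standalone sketch of the averaging performed in the hunk above, with invented sample sentences; it assumes nltk with the vader_lexicon data. Each sentence yields one score dict; 'neg'/'neu'/'pos' are averaged over all sentences, while 'compound' is averaged separately over negative-leaning and positive-leaning sentences.

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentences = ["I love this paste.", "This leak is terrible."]
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
pos_line = 0
neg_line = 0
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    ss = sid.polarity_scores(sentence)  # e.g. {'neg': 0.0, 'neu': 0.3, 'pos': 0.7, 'compound': 0.6}
    if ss['neg'] > ss['pos']:
        avg_score['compoundNeg'] += ss['compound']
        neg_line += 1
    else:
        avg_score['compoundPos'] += ss['compound']
        pos_line += 1
    for k in ('neg', 'neu', 'pos'):
        avg_score[k] += ss[k]

for k in avg_score:
    if k == 'compoundPos':
        avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
    elif k == 'compoundNeg':
        avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
    else:
        avg_score[k] = avg_score[k] / len(sentences)
print(avg_score)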
@@ -146,3 +154,4 @@ if __name__ == '__main__':
         # Do something with the message from the queue
         Analyse(message, server)

View file

@@ -91,6 +91,7 @@ class Paste(object):
         self.p_langage = None
         self.p_nb_lines = None
         self.p_max_length_line = None
+        self.array_line_above_threshold = None
         self.p_duplicate = None

     def get_p_content(self):
@@ -118,6 +119,21 @@
     def get_p_content_as_file(self):
         return cStringIO.StringIO(self.get_p_content())

+    def get_p_content_with_removed_lines(self, threshold):
+        num_line_removed = 0
+        line_length_threshold = threshold
+        string_content = ""
+        f = self.get_p_content_as_file()
+        line_id = 0
+        for line_id, line in enumerate(f):
+            length = len(line)
+            if length < line_length_threshold:
+                string_content += line
+            else:
+                num_line_removed += 1
+        return (num_line_removed, string_content)
+
     def get_lines_info(self):
         """
         Returning and setting the number of lines and the maximum lenght of the
@@ -136,10 +152,12 @@ class Paste(object):
             length = len(line)
             if length >= max_length_line:
                 max_length_line = length
         f.close()
         self.p_nb_lines = line_id
         self.p_max_length_line = max_length_line
-        return (self.p_nb_lines, self.p_max_length_line)
+        return (self.p_nb_lines, self.p_max_length_line, array_line_above_threshold)

     def _get_p_encoding(self):
         """

View file

@@ -7,13 +7,14 @@
     };

     function generate_offset_to_date(day){
+        day = day-1;
         var now = new Date();
         var to_ret = {};
-        for(i=0; i<day; i++){
+        for(i=day; i>=0; i--){
             for(j=0; j<24; j++){
                 var t1 =now.getDate()-i + ":";
                 var t2 =now.getHours()-(23-j)+"h";
-                to_ret[j+24*i] = t1+t2;
+                to_ret[j+24*(day-i)] = t1+t2;
             }
         }
         return to_ret;
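
To make the date-range fix concrete: the rewritten loop fills index 0 with the oldest hour of the window and the last index with the current hour, whereas the old loop labelled the first indices with today's date. A hypothetical Python rendering of the same mapping, using timedelta instead of the raw getDate()/getHours() arithmetic:

import datetime

def generate_offset_to_date(day):
    # index 0 -> oldest hour of the range, index 24*day-1 -> the current hour
    day = day - 1
    now = datetime.datetime.now()
    to_ret = {}
    for i in range(day, -1, -1):        # i days back, oldest day first
        for j in range(24):
            d = now - datetime.timedelta(days=i, hours=23 - j)
            to_ret[j + 24 * (day - i)] = '%d:%dh' % (d.day, d.hour)
    return to_ret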
@@ -53,6 +54,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
         var all_graph_day_sum = 0.0;
         var all_graph_hour_sum = 0.0;
+        var all_day_avg = 0.0;

         for (graphNum=0; graphNum<8; graphNum++) {
             var max_value = 0.0;
@@ -65,7 +67,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
             var day_sum_elem = 0.0;
             var hour_sum = 0.0;

-            for(curr_date=dateStart; curr_date<dateStart+oneWeek; curr_date+=oneHour){
+            for(curr_date=dateStart+oneHour; curr_date<=dateStart+oneWeek; curr_date+=oneHour){
                 var data_array = data[curr_provider][curr_date];

                 if (data_array.length == 0){
@@ -99,7 +101,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
                     curr_sum_elem++;
                     max_value = Math.abs(pos-neg) > max_value ? Math.abs(pos-neg) : max_value;

-                    if(curr_date >= dateStart+oneWeek-24*oneHour){
+                    if(curr_date >= dateStart+oneWeek-23*oneHour){
                         day_sum += (pos-neg);
                         day_sum_elem++;
                     }
@@ -150,11 +152,13 @@ $.getJSON("/sentiment_analysis_getplotdata/",
             sparklineOptions.barWidth = 18;
             sparklineOptions.tooltipFormat = '<span style="color: {{color}}">&#9679;</span> Avg: {{value}} </span>'
             //var day_avg = day_sum/24;
-            var day_avg = day_sum/day_sum_elem;
+            var day_avg = isNaN(day_sum/day_sum_elem) ? 0 : day_sum/day_sum_elem;
+            var day_avg_text = isNaN(day_sum/day_sum_elem) ? 'No data' : (day_avg).toFixed(5);
+            all_day_avg += day_avg;
             $(placeholder+'b').sparkline([day_avg], sparklineOptions);

             sparklineOptions.tooltipFormat = '<span style="color: {{color}}">&#9679;</span> {{offset:names}}, {{value}} </span>'
             sparklineOptions.barWidth = 2;
-            $(placeholder+'s').text((day_avg).toFixed(5));
+            $(placeholder+'s').text(day_avg_text);

         }//for loop
@@ -197,7 +201,8 @@ $.getJSON("/sentiment_analysis_getplotdata/",
         gaugeOptions2.appendTo = '#gauge_today_last_days';
         gaugeOptions2.dialLabel = 'Today';
         gaugeOptions2.elementId = 'gauge2';
-        piePercent = (all_graph_day_sum / (8*24)) / max_value;
+        //piePercent = (all_graph_day_sum / (8*24)) / max_value;
+        piePercent = (all_day_avg / 8) / max_value;
         gaugeOptions2.inc = piePercent;

         var gauge_today_last_days = new FlexGauge(gaugeOptions2);