mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 08:38:28 +00:00
Added dropping of really long lines in the sentiment-analysis module and added a description of the module. Also fixed a bug in the sentiment-trending webpage concerning the average and the date range.
This commit is contained in:
parent
1084e45f1b
commit
894b9efda9
3 changed files with 86 additions and 54 deletions
Sentiment-analysis module (Python):

@@ -2,12 +2,16 @@
 # -*-coding:UTF-8 -*
 """
     Sentiment analyser module.
-    It takes its inputs from 'shortLine' and 'longLine'.
-    Source code is taken into account (in case of comments). If it is only source code,
-    it will be treated with a neutral value anyway.
+    It takes its inputs from 'global'.
 
-    nltk.sentiment.vader module:
-    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+    The content analysed comes from the pastes with length of the line
+    above a defined threshold removed (get_p_content_with_removed_lines).
+    This is done because NLTK sentences tokemnizer (sent_tokenize) seems to crash
+    for long lines (function _slices_from_text line#1276).
+
+    nltk.sentiment.vader module credit:
+    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
 
 """
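Note (illustration, not part of the patch): the docstring above leans on two NLTK pieces, sentence tokenization and the VADER polarity scorer it credits. A minimal standalone sketch of how they are used, assuming nltk is installed and the 'punkt' and 'vader_lexicon' data have been downloaded:

# Illustrative sketch of the NLTK pieces this module relies on (not the module's code).
# Requires: pip install nltk, then nltk.download('punkt') and nltk.download('vader_lexicon').
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

text = "This paste looks perfectly fine. This other part is really terrible."
sentences = tokenize.sent_tokenize(text)

sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    ss = sid.polarity_scores(sentence)
    # ss is a dict with 'neg', 'neu', 'pos' in [0, 1] and 'compound' in [-1, 1]
    print(sentence, ss)

polarity_scores() returns the 'neg'/'neu'/'pos'/'compound' dictionary that the Analyse() loop further down iterates over.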
@@ -25,23 +29,27 @@ from nltk import tokenize
 
 # Config Variables
 accepted_Mime_type = ['text/plain']
+size_threshold = 250
+line_max_length_threshold = 1000
 
 def Analyse(message, server):
     #print 'analyzing'
     path = message
     paste = Paste.Paste(path)
 
-    content = paste.get_p_content()
+    # get content with removed line + number of them
+    num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)
     provider = paste.p_source
     p_date = str(paste._get_p_date())
     p_MimeType = paste._get_p_encoding()
 
     # Perform further analysis
     if p_MimeType == "text/plain":
-        if isJSON(content):
+        if isJSON(p_content):
            p_MimeType = "JSON"
 
     if p_MimeType in accepted_Mime_type:
 
        print 'Processing', path
        the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
        #print 'pastedate: ', the_date
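Note (illustration, not part of the patch): the hunk above parses the paste date out of a 'YYYYMMDD' string, and the next hunk turns a combined date-and-time into a UTC timestamp with calendar.timegm. A small sketch of that conversion; the hour value and the way combined_datetime is actually built are outside this diff, so they are assumed here:

# Illustrative sketch: paste date string to hourly UTC timestamp.
import calendar
import datetime

p_date = "20161110"                      # paste date as 'YYYYMMDD'
the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))

the_time = datetime.time(14, 0, 0)       # hypothetical hour bucket, for illustration only
combined_datetime = datetime.datetime.combine(the_date, the_time)
timestamp = calendar.timegm(combined_datetime.timetuple())
print(the_date, timestamp)               # 2016-11-10 and seconds since the epoch, UTC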
@@ -53,54 +61,54 @@ def Analyse(message, server):
         timestamp = calendar.timegm(combined_datetime.timetuple())
         #print 'timestamp: ', timestamp
 
-        sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))
+        sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))
         #print len(sentences)
 
+        if len(sentences) > 0:
             avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
             neg_line = 0
             pos_line = 0
             sid = SentimentIntensityAnalyzer()
             for sentence in sentences:
                 ss = sid.polarity_scores(sentence)
                 for k in sorted(ss):
                     if k == 'compound':
                         if ss['neg'] > ss['pos']:
                             avg_score['compoundNeg'] += ss[k]
                             neg_line += 1
                         else:
                             avg_score['compoundPos'] += ss[k]
                             pos_line += 1
                     else:
                         avg_score[k] += ss[k]
 
                     #print('{0}: {1}, '.format(k, ss[k]))
 
             for k in avg_score:
                 if k == 'compoundPos':
                     avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
                 elif k == 'compoundNeg':
                     avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
                 else:
                     avg_score[k] = avg_score[k] / len(sentences)
 
             # In redis-levelDB: {} = set, () = K-V
             # {Provider_set -> provider_i}
             # {Provider_TimestampInHour_i -> UniqID_i}_j
             # (UniqID_i -> PasteValue_i)
 
             server.sadd('Provider_set', provider)
             #print 'Provider_set', provider
 
             provider_timestamp = provider + '_' + str(timestamp)
             #print provider_timestamp
             server.incr('UniqID')
             UniqID = server.get('UniqID')
-        print provider_timestamp, '->', UniqID
+            print provider_timestamp, '->', UniqID, 'dropped', num_line_removed, 'lines'
             server.sadd(provider_timestamp, UniqID)
             server.set(UniqID, avg_score)
-        print avg_score
-        #print UniqID, '->', avg_score
+            #print UniqID, '->', avg_score
 
     else:
         print 'Dropped:', p_MimeType
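Note (illustration, not part of the patch): the new 'if len(sentences) > 0:' guard keeps the averaging from running on an empty paste, and the 'compound' score is still split into positive and negative buckets that are each averaged over their own line count. The same logic, condensed into a standalone Python 3 function over a list of VADER score dicts (the function name is illustrative):

def average_scores(score_dicts):
    """Average VADER scores the way the module does: 'compound' is split into
    positive and negative buckets, each averaged over its own line count."""
    avg = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
    if not score_dicts:                 # mirrors the new 'if len(sentences) > 0' guard
        return avg
    neg_line = pos_line = 0
    for ss in score_dicts:
        for k in sorted(ss):
            if k == 'compound':
                if ss['neg'] > ss['pos']:
                    avg['compoundNeg'] += ss[k]
                    neg_line += 1
                else:
                    avg['compoundPos'] += ss[k]
                    pos_line += 1
            else:
                avg[k] += ss[k]
    for k in avg:
        if k == 'compoundPos':
            avg[k] /= (pos_line if pos_line > 0 else 1)
        elif k == 'compoundNeg':
            avg[k] /= (neg_line if neg_line > 0 else 1)
        else:
            avg[k] /= len(score_dicts)
    return avg

print(average_scores([
    {'neg': 0.0, 'neu': 0.3, 'pos': 0.7, 'compound': 0.6},
    {'neg': 0.8, 'neu': 0.2, 'pos': 0.0, 'compound': -0.7},
]))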
@@ -146,3 +154,4 @@ if __name__ == '__main__':
 
         # Do something with the message from the queue
         Analyse(message, server)
+
Paste class (Python):

@@ -91,6 +91,7 @@ class Paste(object):
         self.p_langage = None
         self.p_nb_lines = None
         self.p_max_length_line = None
+        self.array_line_above_threshold = None
         self.p_duplicate = None
 
     def get_p_content(self):
@@ -118,6 +119,21 @@ class Paste(object):
     def get_p_content_as_file(self):
         return cStringIO.StringIO(self.get_p_content())
 
+    def get_p_content_with_removed_lines(self, threshold):
+        num_line_removed = 0
+        line_length_threshold = threshold
+        string_content = ""
+        f = self.get_p_content_as_file()
+        line_id = 0
+        for line_id, line in enumerate(f):
+            length = len(line)
+            if length < line_length_threshold:
+                string_content += line
+            else:
+                num_line_removed+=1
+
+        return (num_line_removed, string_content)
+
     def get_lines_info(self):
         """
         Returning and setting the number of lines and the maximum lenght of the
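Note (illustration, not part of the patch): a standalone Python 3 rendering of the helper added above, plus a tiny usage example. The real method reads the paste through get_p_content_as_file(); this sketch takes any iterable of lines, and the default threshold of 1000 matches the module's line_max_length_threshold:

def content_with_removed_lines(lines, threshold=1000):
    """Return (num_line_removed, content), keeping only lines shorter than threshold."""
    num_line_removed = 0
    kept = []
    for line in lines:
        if len(line) < threshold:
            kept.append(line)
        else:
            num_line_removed += 1
    return num_line_removed, "".join(kept)

# Example: the 5000-character line is dropped, the two short ones are kept.
sample = ["short line\n", "x" * 5000 + "\n", "another short line\n"]
print(content_with_removed_lines(sample))   # -> (1, 'short line\nanother short line\n')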
@@ -136,10 +152,12 @@ class Paste(object):
             length = len(line)
             if length >= max_length_line:
                 max_length_line = length
 
         f.close()
         self.p_nb_lines = line_id
         self.p_max_length_line = max_length_line
-        return (self.p_nb_lines, self.p_max_length_line)
+
+        return (self.p_nb_lines, self.p_max_length_line, array_line_above_threshold)
 
     def _get_p_encoding(self):
         """
Sentiment-trending webpage script (JavaScript):

@@ -7,13 +7,14 @@
 };
 
 function generate_offset_to_date(day){
+    day = day-1;
     var now = new Date();
     var to_ret = {};
-    for(i=0; i<day; i++){
+    for(i=day; i>=0; i--){
         for(j=0; j<24; j++){
             var t1 =now.getDate()-i + ":";
             var t2 =now.getHours()-(23-j)+"h";
-            to_ret[j+24*i] = t1+t2;
+            to_ret[j+24*(day-i)] = t1+t2;
         }
     }
     return to_ret;
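Note (illustration, not part of the patch): the loop change above is the date-range fix for the hour-slot labels. Re-stated in Python for comparison (the page script itself is JavaScript), assuming the function is called with day=7 as the weekly graphs suggest:

# Python re-statement of the hour-slot label indexing (illustrative only).
# Old indexing: day stays 7, i counts up from today.
day_old = 7
slots_old = {}
for i in range(0, day_old):              # for(i=0; i<day; i++)
    for j in range(24):
        slots_old[j + 24 * i] = i        # slots 0..23 labelled with today (i == 0)

# New indexing: day = day - 1 first, i counts down to today.
day_new = 7 - 1
slots_new = {}
for i in range(day_new, -1, -1):         # for(i=day; i>=0; i--)
    for j in range(24):
        slots_new[j + 24 * (day_new - i)] = i   # slots 0..23 labelled with the oldest day (i == 6)

print(slots_old[0], slots_new[0])        # 0 vs 6: labels now run from the oldest day to today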
@@ -53,6 +54,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
 
         var all_graph_day_sum = 0.0;
         var all_graph_hour_sum = 0.0;
+        var all_day_avg = 0.0;
 
         for (graphNum=0; graphNum<8; graphNum++) {
             var max_value = 0.0;
@@ -65,7 +67,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
             var day_sum_elem = 0.0;
             var hour_sum = 0.0;
 
-            for(curr_date=dateStart; curr_date<dateStart+oneWeek; curr_date+=oneHour){
+            for(curr_date=dateStart+oneHour; curr_date<=dateStart+oneWeek; curr_date+=oneHour){
                 var data_array = data[curr_provider][curr_date];
 
                 if (data_array.length == 0){
@@ -99,7 +101,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
                     curr_sum_elem++;
                     max_value = Math.abs(pos-neg) > max_value ? Math.abs(pos-neg) : max_value;
 
-                    if(curr_date >= dateStart+oneWeek-24*oneHour){
+                    if(curr_date >= dateStart+oneWeek-23*oneHour){
                        day_sum += (pos-neg);
                        day_sum_elem++;
                    }
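Note (illustration, not part of the patch): the two boundary changes above belong together. The hourly loop now runs from dateStart+oneHour up to and including dateStart+oneWeek, so the "current day" test has to start 23 hours (not 24) before the endpoint in order to pick out exactly the last 24 samples. A quick Python check of that arithmetic, assuming oneHour and oneWeek hold the usual second counts:

# Checking the shifted window arithmetic from the two hunks above (illustrative).
oneHour = 3600
oneWeek = oneHour * 24 * 7
dateStart = 1_600_000_000            # any starting timestamp will do for the check

# new loop: for(curr_date=dateStart+oneHour; curr_date<=dateStart+oneWeek; curr_date+=oneHour)
samples = list(range(dateStart + oneHour, dateStart + oneWeek + 1, oneHour))
assert len(samples) == 24 * 7        # still exactly one week of hourly slots

# new "today" test: curr_date >= dateStart + oneWeek - 23*oneHour
today = [t for t in samples if t >= dateStart + oneWeek - 23 * oneHour]
assert len(today) == 24              # the last 24 sampled hours, i.e. the current day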
@@ -150,11 +152,13 @@ $.getJSON("/sentiment_analysis_getplotdata/",
             sparklineOptions.barWidth = 18;
             sparklineOptions.tooltipFormat = '<span style="color: {{color}}">●</span> Avg: {{value}} </span>'
             //var day_avg = day_sum/24;
-            var day_avg = day_sum/day_sum_elem;
+            var day_avg = isNaN(day_sum/day_sum_elem) ? 0 : day_sum/day_sum_elem;
+            var day_avg_text = isNaN(day_sum/day_sum_elem) ? 'No data' : (day_avg).toFixed(5);
+            all_day_avg += day_avg;
             $(placeholder+'b').sparkline([day_avg], sparklineOptions);
             sparklineOptions.tooltipFormat = '<span style="color: {{color}}">●</span> {{offset:names}}, {{value}} </span>'
             sparklineOptions.barWidth = 2;
-            $(placeholder+'s').text((day_avg).toFixed(5));
+            $(placeholder+'s').text(day_avg_text);
 
         }//for loop
 
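Note (illustration, not part of the patch): when a provider has no data for the day, day_sum_elem is 0 and day_sum/day_sum_elem is NaN in JavaScript; the fix above then feeds 0 to the sparkline and shows 'No data' instead of a NaN label. The same guard sketched in Python, where the zero division has to be handled explicitly:

import math

def day_average(day_sum, day_sum_elem):
    """Mirror of the guarded average: 0 and 'No data' when there were no samples."""
    raw = day_sum / day_sum_elem if day_sum_elem else float('nan')  # JS would give NaN directly
    day_avg = 0 if math.isnan(raw) else raw
    day_avg_text = 'No data' if math.isnan(raw) else '%.5f' % day_avg
    return day_avg, day_avg_text

print(day_average(3.2, 24))   # (0.13333..., '0.13333')
print(day_average(0.0, 0))    # (0, 'No data')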
@@ -197,7 +201,8 @@ $.getJSON("/sentiment_analysis_getplotdata/",
         gaugeOptions2.appendTo = '#gauge_today_last_days';
         gaugeOptions2.dialLabel = 'Today';
         gaugeOptions2.elementId = 'gauge2';
-        piePercent = (all_graph_day_sum / (8*24)) / max_value;
+        //piePercent = (all_graph_day_sum / (8*24)) / max_value;
+        piePercent = (all_day_avg / 8) / max_value;
         gaugeOptions2.inc = piePercent;
         var gauge_today_last_days = new FlexGauge(gaugeOptions2);
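Note (illustration, not part of the patch): the 'Today' gauge now takes the mean of the eight per-graph day averages (all_day_avg / 8) instead of the old all_graph_day_sum / (8*24) formula, still normalised by max_value. The before/after computation, with made-up numbers:

# Illustrative before/after of the gauge fill computation (values are made up).
all_graph_day_sum = 12.0          # old accumulator used by the commented-out formula
all_day_avg = 1.6                 # new accumulator: sum of the 8 per-graph day averages
max_value = 0.4                   # largest |pos - neg| seen, used for normalisation

pie_percent_old = (all_graph_day_sum / (8 * 24)) / max_value   # commented out in the new code
pie_percent_new = (all_day_avg / 8) / max_value                # what gaugeOptions2.inc now receives
print(pie_percent_old, pie_percent_new)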