2016-08-11 07:40:42 +00:00
|
|
|
#!/usr/bin/env python2
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
Sentiment analyser module.

It takes its inputs from 'shortLine' and 'longLine'.
Source code is taken into account (in case of comments). If it is only source code,
it will be treated with a neutral value anyway.

nltk.sentiment.vader module:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
|
|
|
|
|
|
|
|
import time
|
2016-08-13 13:24:57 +00:00
|
|
|
import datetime
|
|
|
|
import calendar
|
|
|
|
import redis
|
2016-08-11 07:40:42 +00:00
|
|
|
from pubsublogger import publisher
|
|
|
|
from Helper import Process
|
2016-08-13 13:24:57 +00:00
|
|
|
from packages import Paste
|
2016-08-11 07:40:42 +00:00
|
|
|
|
|
|
|
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
|
|
|
from nltk import tokenize
|
|
|
|
|
|
|
|
|
2016-08-13 13:24:57 +00:00
|
|
|
def _hourly_timestamp(p_date, hour):
    """Return the epoch timestamp for a paste date at the given hour.

    `p_date` is a string whose first eight characters are YYYYMMDD; `hour`
    is an integer hour of the day. Minutes and seconds are set to zero.
    """
    the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
    combined_datetime = datetime.datetime.combine(the_date, datetime.time(hour, 0, 0))
    # calendar.timegm interprets the time tuple as UTC.
    return calendar.timegm(combined_datetime.timetuple())


def _average_scores(sentence_scores):
    """Average a list of per-sentence polarity dicts.

    Each dict must hold the keys 'neg', 'neu', 'pos' and 'compound'.
    The compound score is averaged separately over the sentences that lean
    negative ('neg' > 'pos') and over all the others. Every divisor falls
    back to 1, so an empty list yields an all-zero result instead of
    raising ZeroDivisionError.
    """
    avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
    neg_line = 0
    pos_line = 0

    for ss in sentence_scores:
        for k in ('neg', 'neu', 'pos'):
            avg_score[k] += ss[k]
        if ss['neg'] > ss['pos']:
            avg_score['compoundNeg'] += ss['compound']
            neg_line += 1
        else:
            avg_score['compoundPos'] += ss['compound']
            pos_line += 1

    divisors = {
        'compoundPos': pos_line if pos_line > 0 else 1,
        'compoundNeg': neg_line if neg_line > 0 else 1,
    }
    # Same guard as for the compound scores: never divide by zero.
    total = len(sentence_scores) if len(sentence_scores) > 0 else 1
    for k in avg_score:
        avg_score[k] = avg_score[k] / divisors.get(k, total)
    return avg_score


def Analyse(message, server):
    """Compute the average sentiment of a paste and store it in redis.

    message -- path of the paste, passed to Paste.Paste
    server  -- redis client; must provide sadd, incr and set

    Layout written to the redis-levelDB ({} = set, () = K-V):
        {Provider_set -> provider_i}
        {Provider_TimestampInHour_i -> UniqID_i}_j
        (UniqID_i -> PasteValue_i)
    """
    path = message
    paste = Paste.Paste(path)

    content = paste.get_p_content()
    provider = paste.p_source
    p_date = str(paste._get_p_date())

    # The bucket is the paste's date combined with the current hour.
    # NOTE(review): the hour comes from the local clock (datetime.now())
    # while the timestamp is computed as UTC -- confirm this is intended.
    current_hour = datetime.datetime.now().hour
    timestamp = _hourly_timestamp(p_date, current_hour)

    sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))

    sid = SentimentIntensityAnalyzer()
    avg_score = _average_scores([sid.polarity_scores(sentence) for sentence in sentences])

    server.sadd('Provider_set', provider)

    provider_timestamp = provider + '_' + str(timestamp)

    # Use the value returned by INCR: reading the key back with a separate
    # GET is not atomic and could hand the same id to two workers.
    UniqID = server.incr('UniqID')
    print('%s -> %s' % (provider_timestamp, UniqID))
    server.sadd(provider_timestamp, UniqID)
    # Stored as the string representation of the scores dict.
    server.set(UniqID, str(avg_score))
|
2016-08-11 07:40:42 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a
    # subscriber accordingly (see launch_logs.sh).
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'SentimentAnalyser'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logging
    publisher.info("Sentiment analysis of pastes (NLTK VADER), stored per provider and hour")

    # REDIS_LEVEL_DB #
    server = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_Sentiment", "host"),
        port=p.config.get("Redis_Level_DB_Sentiment", "port"),
        db=p.config.get("Redis_Level_DB_Sentiment", "db"))

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            # Nothing to process yet: log it and poll again in a second.
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Analyse the paste referenced by the message and store its scores
        Analyse(message, server)
|