ail-framework/bin/SentimentAnalyser.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
    Sentiment analyser module.
    It takes its inputs from 'shortLine' and 'longLine'.
    Source code is taken into account (in case of comments). If it is only source code,
    it will be treated with a neutral value anyway.

nltk.sentiment.vader module:
    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

"""

import time
import datetime
import calendar
import redis
from pubsublogger import publisher
from Helper import Process
from packages import Paste

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize


def _hour_bucket_timestamp(p_date):
    """Return the epoch timestamp (seconds) of the hour bucket of a paste.

    p_date -- paste date as a string; its first 8 characters are read as
              YYYYMMDD.

    The bucket combines the date of the paste with the hour of the current
    local time (minutes and seconds set to 0); the result is converted with
    calendar.timegm, i.e. the combined value is interpreted as UTC.
    """
    the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
    the_time = datetime.time(datetime.datetime.now().hour, 0, 0)
    combined_datetime = datetime.datetime.combine(the_date, the_time)
    return calendar.timegm(combined_datetime.timetuple())


def _average_scores(sentences, analyser):
    """Return the averaged polarity scores of a non-empty list of sentences.

    sentences -- non-empty list of sentences to score.
    analyser  -- object exposing polarity_scores(sentence) and returning a
                 dict with the keys 'neg', 'neu', 'pos' and 'compound'.

    'neg', 'neu' and 'pos' are averaged over all the sentences. The compound
    score of a sentence goes to 'compoundNeg' when its 'neg' score is
    strictly greater than its 'pos' score, to 'compoundPos' otherwise; each
    of the two is averaged over the sentences that fed it.
    """
    avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
    neg_line = 0
    pos_line = 0
    for sentence in sentences:
        ss = analyser.polarity_scores(sentence)
        for k in ('neg', 'neu', 'pos'):
            avg_score[k] += ss[k]
        if ss['neg'] > ss['pos']:
            avg_score['compoundNeg'] += ss['compound']
            neg_line += 1
        else:
            avg_score['compoundPos'] += ss['compound']
            pos_line += 1

    for k in ('neg', 'neu', 'pos'):
        avg_score[k] = avg_score[k] / len(sentences)
    # A bucket that received no sentence keeps its initial 0.0 (divided by 1).
    avg_score['compoundPos'] = avg_score['compoundPos'] / (pos_line if pos_line > 0 else 1)
    avg_score['compoundNeg'] = avg_score['compoundNeg'] / (neg_line if neg_line > 0 else 1)
    return avg_score


def Analyse(message, server):
    """Score the sentiment of one paste and store the averages in redis.

    message -- path of the paste, as received from the input queue; it is
               handed to Paste.Paste().
    server  -- redis client the results are written to.

    The paste content is split into sentences, every sentence is scored with
    the VADER analyser and the scores are averaged. Nothing is written to
    redis when the content yields no sentence.

    Returns None.
    """
    paste = Paste.Paste(message)

    content = paste.get_p_content()
    provider = paste.p_source
    timestamp = _hour_bucket_timestamp(str(paste._get_p_date()))

    sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))
    if not sentences:
        # No sentence (e.g. empty paste): averaging would divide by zero and
        # there is no score worth storing.
        print('{0} -> no sentence found, nothing stored'.format(message))
        return

    avg_score = _average_scores(sentences, SentimentIntensityAnalyzer())

    # In redis-levelDB: {} = set, () = K-V
    # {Provider_set -> provider_i}
    # {Provider_TimestampInHour_i -> UniqID_i}_j
    # (UniqID_i -> PasteValue_i)

    server.sadd('Provider_set', provider)

    provider_timestamp = provider + '_' + str(timestamp)
    # Use the value returned by INCR itself: reading the counter back with a
    # separate GET lets two concurrent workers end up with the same id.
    uniq_id = str(server.incr('UniqID'))
    print('{0} -> {1}'.format(provider_timestamp, uniq_id))
    server.sadd(provider_timestamp, uniq_id)
    # The scores are stored as the string representation of the dict; the
    # conversion is explicit so it does not depend on the client doing it.
    server.set(uniq_id, str(avg_score))

if __name__ == '__main__':
    # Entry point of the module: configure logging, connect to the redis
    # instance holding the sentiment results, then process the input queue
    # forever.

    # If you wish to use another port or channel, do not forget to run a
    # subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'SentimentAnalyser'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logger
    publisher.info("Sentiment analyser: stores the average VADER sentiment scores of each paste")

    # REDIS_LEVEL_DB #
    server = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_Sentiment", "host"),
        port=p.config.get("Redis_Level_DB_Sentiment", "port"),
        db=p.config.get("Redis_Level_DB_Sentiment", "db"))

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            # Nothing to process yet: wait a second before polling again
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Score the paste referenced by the message and store the result
        Analyse(message, server)
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00			`#!/usr/bin/env python2`
			`# --coding:UTF-8 -`
			`"""`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`Sentiment analyser module.`
			`It takes its inputs from 'shortLine' and 'longLine'.`
			`Source code is taken into account (in case of comments). If it is only source code,`
			`it will be treated with a neutral value anyway.`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00
			`nltk.sentiment.vader module:`
			`Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.`

			`"""`

			`import time`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`import datetime`
			`import calendar`
			`import redis`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00			`from pubsublogger import publisher`
			`from Helper import Process`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`from packages import Paste`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00
			`from nltk.sentiment.vader import SentimentIntensityAnalyzer`
			`from nltk import tokenize`


Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`def Analyse(message, server):`
			`#print 'analyzing'`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00			`path = message`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`paste = Paste.Paste(path)`

			`content = paste.get_p_content()`
			`provider = paste.p_source`
			`p_date = str(paste._get_p_date())`
			`#print provider, date`

			`the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))`
			`#print 'pastedate: ', the_date`
			`the_time = datetime.datetime.now()`
			`the_time = datetime.time(getattr(the_time, 'hour'), 0, 0)`
			`#print 'now: ', the_time`
			`combined_datetime = datetime.datetime.combine(the_date, the_time)`
			`#print 'combined: ', combined_datetime`
			`timestamp = calendar.timegm(combined_datetime.timetuple())`
			`#print 'timestamp: ', timestamp`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00
			`sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`#print len(sentences)`

			`avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}`
			`neg_line = 0`
			`pos_line = 0`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00			`sid = SentimentIntensityAnalyzer()`
			`for sentence in sentences:`
			`ss = sid.polarity_scores(sentence)`
			`for k in sorted(ss):`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`if k == 'compound':`
			`if ss['neg'] > ss['pos']:`
			`avg_score['compoundNeg'] += ss[k]`
			`neg_line += 1`
			`else:`
			`avg_score['compoundPos'] += ss[k]`
			`pos_line += 1`
			`else:`
			`avg_score[k] += ss[k]`

			`#print('{0}: {1}, '.format(k, ss[k]))`

			`for k in avg_score:`
			`if k == 'compoundPos':`
			`avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)`
			`elif k == 'compoundNeg':`
			`avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)`
			`else:`
			`avg_score[k] = avg_score[k] / len(sentences)`


			`# In redis-levelDB: {} = set, () = K-V`
			`# {Provider_set -> provider_i}`
			`# {Provider_TimestampInHour_i -> UniqID_i}_j`
			`# (UniqID_i -> PasteValue_i)`

			`server.sadd('Provider_set', provider)`
			`#print 'Provider_set', provider`

			`provider_timestamp = provider + '_' + str(timestamp)`
			`#print provider_timestamp`
			`server.incr('UniqID')`
			`UniqID = server.get('UniqID')`
			`print provider_timestamp, '->', UniqID`
			`server.sadd(provider_timestamp, UniqID)`
			`server.set(UniqID, avg_score)`
			`#print UniqID, '->', avg_score`

			`#print '(', provider, timestamp, str(avg_score) , ')'`
			`#server.hset(provider, timestamp, str(avg_score))`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00
			`if __name__ == '__main__':`
			`# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)`
			`# Port of the redis instance used by pubsublogger`
			`publisher.port = 6380`
			`# Script is the default channel used for the modules.`
			`publisher.channel = 'Script'`

			`# Section name in bin/packages/modules.cfg`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`config_section = 'SentimentAnalyser'`
Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00
			`# Setup the I/O queues`
			`p = Process(config_section)`

			`# Sent to the logging a description of the module`
			`publisher.info("<description of the module>")`

Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`# REDIS_LEVEL_DB #`
			`server = redis.StrictRedis(`
			`host=p.config.get("Redis_Level_DB_Sentiment", "host"),`
			`port=p.config.get("Redis_Level_DB_Sentiment", "port"),`
			`db=p.config.get("Redis_Level_DB_Sentiment", "db"))`

Added module sentimentAnalyser 2016-08-11 07:40:42 +00:00			`# Endless loop getting messages from the input queue`
			`while True:`
			`# Get one message from the input queue`
			`message = p.get_from_set()`
			`if message is None:`
			`publisher.debug("{} queue is empty, waiting".format(config_section))`
			`time.sleep(1)`
			`continue`

			`# Do something with the message from the queue`
Added sentiment analyser module (draft) 2016-08-13 13:24:57 +00:00			`Analyse(message, server)`