#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
Sentiment analyser module.
It takes its inputs from 'global'.
2016-08-11 07:40:42 +00:00
The content is analysed if the length of the line is
above a defined threshold (get_p_content_with_removed_lines).
This is done because NLTK sentences tokemnizer (sent_tokenize) seems to crash
for long lines (function _slices_from_text line#1276).
nltk.sentiment.vader module credit:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
2016-08-11 07:40:42 +00:00
"""
##################################
# Import External packages
##################################
import os
import sys
import time
import datetime
import calendar
import redis
import json
import signal
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize, download
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.objects.Items import Item
from lib import ConfigLoader


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException
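

# timeout_handler is registered as the SIGALRM handler so that compute() can
# bound per-message processing time via signal.alarm().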
signal.signal(signal.SIGALRM, timeout_handler)
## TODO: REFACTOR MODULE + CLEAN HISTORY


class SentimentAnalysis(AbstractModule):
    """
    SentimentAnalysis module for AIL framework
    """

    # Config Variables
    accepted_Mime_type = ['text/plain']
    line_max_length_threshold = 1000

    def __init__(self):
        super().__init__()

        config_loader = ConfigLoader.ConfigLoader()
        self.sentiment_lexicon_file = config_loader.get_config_str("Directories", "sentiment_lexicon_file")

        # REDIS_LEVEL_DB #
        self.db = config_loader.get_redis_conn("_Sentiment")

        self.time1 = time.time()

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        # Send module state to logs
        self.logger.info(f"Module {self.module_name} initialized")

    def compute(self, message):
        # Max time to compute one entry
        signal.alarm(60)
        try:
            self.analyse(message)
        except TimeoutException:
            self.logger.debug(f"{message} processing timeout")
        finally:
            signal.alarm(0)

    def get_p_content_with_removed_lines(self, threshold, item_content):
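        """
        Return (num_line_removed, string_content) where lines longer than
        `threshold` characters have been removed, to avoid the NLTK
        sent_tokenize issue described in the module docstring.
        """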
        num_line_removed = 0
        line_length_threshold = threshold
        string_content = ""
        # iterate over the content line by line (keeping line endings),
        # not character by character
        for line in item_content.splitlines(keepends=True):
            if len(line) < line_length_threshold:
                string_content += line
            else:
                num_line_removed += 1
        return num_line_removed, string_content

    def analyse(self, message):
        item = Item(message)

        # get content with long lines removed + number of removed lines
        num_line_removed, p_content = self.get_p_content_with_removed_lines(SentimentAnalysis.line_max_length_threshold,
                                                                            item.get_content())
        provider = item.get_source()
        p_date = item.get_date()
        p_MimeType = item.get_mimetype()

        # Perform further analysis
        if p_MimeType == "text/plain":
            if self.isJSON(p_content):
                p_MimeType = "JSON"

        if p_MimeType in SentimentAnalysis.accepted_Mime_type:
            self.logger.debug(f'Accepted :{p_MimeType}')

            the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
            the_time = datetime.datetime.now()
            the_time = datetime.time(the_time.hour, 0, 0)
            combined_datetime = datetime.datetime.combine(the_date, the_time)
            timestamp = calendar.timegm(combined_datetime.timetuple())
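            # The timestamp is truncated to the current hour, so scores are
            # bucketed per provider and per hour (see the key layout below).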

            try:
                sentences = tokenize.sent_tokenize(p_content)
            except LookupError:
                # use the NLTK Downloader to obtain the missing resource
                download('punkt')
                sentences = tokenize.sent_tokenize(p_content)

            if len(sentences) > 0:
                avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
                neg_line = 0
                pos_line = 0
                sid = SentimentIntensityAnalyzer(self.sentiment_lexicon_file)
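
                # polarity_scores() returns 'neg', 'neu', 'pos' and 'compound'
                # values per sentence; the compound score is accumulated into
                # 'compoundPos' or 'compoundNeg' depending on whether the
                # sentence leans negative or positive.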
                for sentence in sentences:
                    ss = sid.polarity_scores(sentence)
                    for k in sorted(ss):
                        if k == 'compound':
                            if ss['neg'] > ss['pos']:
                                avg_score['compoundNeg'] += ss[k]
                                neg_line += 1
                            else:
                                avg_score['compoundPos'] += ss[k]
                                pos_line += 1
                        else:
                            avg_score[k] += ss[k]
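
                # Average: compound scores are normalised by the number of
                # negative / positive sentences, the other scores by the total
                # number of sentences.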
                for k in avg_score:
                    if k == 'compoundPos':
                        avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
                    elif k == 'compoundNeg':
                        avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
                    else:
                        avg_score[k] = avg_score[k] / len(sentences)

                # In redis-levelDB: {} = set, () = K-V
                # {Provider_set -> provider_i}
                # {Provider_TimestampInHour_i -> UniqID_i}_j
                # (UniqID_i -> PasteValue_i)
                self.db.sadd('Provider_set', provider)

                provider_timestamp = provider + '_' + str(timestamp)
                self.db.incr('UniqID')
                UniqID = self.db.get('UniqID')
                self.redis_logger.debug(f'{provider_timestamp} -> {UniqID}, dropped {num_line_removed} lines')
                self.db.sadd(provider_timestamp, UniqID)
                self.db.set(UniqID, avg_score)
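
                # Illustrative example of the resulting layout (values made up):
                #   Provider_set               -> {'pastebin.com', ...}
                #   'pastebin.com_1623330000'  -> {42, ...}
                #   42 -> {'neg': 0.1, 'neu': 0.8, 'pos': 0.1,
                #          'compoundPos': 0.25, 'compoundNeg': -0.3}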
        else:
            self.redis_logger.debug(f'Dropped:{p_MimeType}')

    def isJSON(self, content):
        try:
            json.loads(content)
            return True
        except Exception:
            return False


if __name__ == '__main__':
    module = SentimentAnalysis()
    module.run()