chg: [Term tracker] add term tracker module (word + set) + API: add new term to track (word + set + regex)

This commit is contained in:
Terrtia 2019-08-07 12:08:24 +02:00
parent 28320a32a6
commit bb6d3a6a26
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
6 changed files with 170 additions and 62 deletions

View file

@ -20,13 +20,6 @@ configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
publisher.port = 6380
publisher.channel = "Script"
# notifications enabled/disabled
TrackedTermsNotificationEnabled_Name = "TrackedNotifications"
# associated notification email addresses for a specific term
# Keys will be e.g. TrackedNotificationEmails_<TERMNAME>
TrackedTermsNotificationEmailsPrefix_Name = "TrackedNotificationEmails_"
def sendEmailNotification(recipient, alert_name, content):
if not os.path.exists(configfile):

View file

@ -9,50 +9,84 @@ import os
import sys
import time
from Helper import Process
from pubsublogger import publisher
import NotificationHelper
from packages import Paste
from packages import Term
sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config
r_serv_term = Flask_config.r_serv_term
# Relative URL of the page displaying an item; the item id is appended to it.
full_item_url = "/showsavedpaste/?paste="
# Body of the notification mail. Placeholders, in order: tracked term,
# item id, item URL prefix, item id (appended to the prefix to form the link).
mail_body_template = "AIL Framework,\nNew occurrence for tracked term: {}\nitem id: {}\nurl: {}{}"
# Load the tracked terms once, at module import time: the single tracked
# words and the tracked word sets. Both lists are read by the main loop
# below, which does not reload them.
list_tracked_words = Term.get_tracked_words_list()
set_tracked_words_list = Term.get_set_tracked_words_list()
def new_term_found(term, term_type, item_id):
    """Record a match of a tracked term in an item and fire its actions.

    For every tracker uuid registered for *term*:
      * the item is added to the tracker's item set,
      * each tag attached to the tracker is sent to the 'Tags' queue as
        ``"<tag>;<item_id>"``,
      * each mail address attached to the tracker receives a notification.

    :param term: the tracked term that matched (a word, or the serialized
        word set as stored by the Term module)
    :param term_type: kind of term ('word' or 'set' at the call sites);
        currently not used by this function
    :param item_id: id of the item in which the term was found
    """
    for term_uuid in Term.get_term_uuid_list(term):
        Term.add_tracked_item(term_uuid, item_id)

        # Forward every tag of this tracker to the tagging queue.
        for tag in Term.get_term_tags(term_uuid):
            msg = '{};{}'.format(tag, item_id)
            p.populate_set_out(msg, 'Tags')

        # Build the mail body only when there is somebody to notify.
        mail_to_notify = Term.get_term_mails(term_uuid)
        if mail_to_notify:
            mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
            for mail in mail_to_notify:
                NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)
if __name__ == "__main__":
    # Script entry point: consume item ids from the module queue and check
    # each item against the tracked words and tracked word sets.

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script TermTrackerMod started")

    # NOTE(review): the queue/config section used is 'Curve'; the original
    # code kept 'TermTrackerMod' commented out next to it - confirm which
    # section this module is meant to read from.
    config_section = 'Curve'
    p = Process(config_section)

    # Turn the relative item URL into an absolute one for notification mails.
    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:
        item_id = p.get_from_set()

        # Nothing queued: wait before polling again.
        if item_id is None:
            time.sleep(5)
            continue

        paste = Paste.Paste(item_id)
        dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())

        # check solo words
        for word in list_tracked_words:
            if word in dict_words_freq:
                new_term_found(word, 'word', item_id)

        # check words set: each element is (words, threshold, serialized set)
        for elem in set_tracked_words_list:
            list_words = elem[0]
            nb_words_threshold = elem[1]
            word_set = elem[2]

            # Number of distinct words of the set present in the item.
            nb_uniq_word = sum(1 for word in list_words if word in dict_words_freq)

            if nb_uniq_word >= nb_words_threshold:
                new_term_found(word_set, 'set', item_id)

View file

@ -2,6 +2,7 @@
# -*-coding:UTF-8 -*
import os
import re
import sys
import uuid
import redis
@ -16,6 +17,7 @@ sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config
r_serv_term = Flask_config.r_serv_term
email_regex = Flask_config.email_regex
special_characters = set('[<>~!?@#$%^&*|()_-+={}":;,.\'\n\r\t]/\\')
special_characters.add('\\s')
@ -24,6 +26,26 @@ special_characters.add('\\s')
tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
gaps=True, discard_empty=True)
def is_valid_mail(email):
    """Tell whether *email* is accepted by the module-level ``email_regex``.

    :param email: address to check
    :return: ``True`` when ``email_regex.match(email)`` finds a match,
        ``False`` otherwise
    """
    return bool(email_regex.match(email))
def verify_mail_list(mail_list):
    """Validate every address of *mail_list* with ``is_valid_mail``.

    :param mail_list: iterable of mail addresses
    :return: ``None`` when all addresses are valid; otherwise an
        ``(error_dict, 400)`` tuple describing the first invalid address
    """
    for candidate in mail_list:
        if is_valid_mail(candidate):
            continue
        # Stop at the first address that fails validation.
        error = {'status': 'error', 'reason': 'Invalid email', 'value': candidate}
        return (error, 400)
    return None
def is_valid_regex(term_regex):
    """Return ``True`` when *term_regex* can be compiled by ``re.compile``.

    Only failures that mean "this is not a usable pattern" are reported as
    ``False``: a syntax error in the pattern (``re.error``), a value that is
    not a string/bytes pattern (``TypeError``), or a pattern that is too
    large or too deeply nested for the regex compiler. Unlike a bare
    ``except:``, this lets ``KeyboardInterrupt`` and ``SystemExit`` propagate.

    :param term_regex: candidate regular expression
    :return: ``True`` if the pattern compiles, ``False`` otherwise
    """
    try:
        re.compile(term_regex)
    except (re.error, TypeError, ValueError, OverflowError, RecursionError):
        return False
    return True
def get_text_word_frequency(item_content, filtering=True):
item_content = item_content.lower()
words_dict = defaultdict(int)
@ -34,7 +56,6 @@ def get_text_word_frequency(item_content, filtering=True):
blob = TextBlob(item_content)
for word in blob.tokens:
words_dict[word] += 1
print(words_dict)
return words_dict
# # TODO: create all tracked words
@ -45,28 +66,40 @@ def get_set_tracked_words_list():
set_list = r_serv_term.smembers('all:tracked_term:set')
all_set_list = []
for elem in set_list:
elem = elem.split(';')
num_words = int(elem[1])
ter_set = elem[0].split(',')
all_set_list.append((ter_set, num_words))
res = elem.split(';')
num_words = int(res[1])
ter_set = res[0].split(',')
all_set_list.append((ter_set, num_words, elem))
return all_set_list
def parse_json_term_to_add(dict_input):
def is_term_tracked_in_global_level(term):
    """Tell whether *term* already has a tracker whose level is ``'1'``.

    Reads the uuids stored in the Redis set ``all:tracked_term_uuid:<term>``
    and checks the ``level`` field of each ``tracked_term:<uuid>`` hash.

    :param term: tracked term to look up
    :return: ``True`` as soon as one tracker has level ``'1'``, else ``False``
    """
    tracker_uuids = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
    return any(
        r_serv_term.hget('tracked_term:{}'.format(tracker_uuid), 'level') == '1'
        for tracker_uuid in (tracker_uuids or ())
    )
def parse_json_term_to_add(dict_input, user_id):
term = dict_input.get('term', None)
if not term:
return ({"status": "error", "reason": "Term not provided"}, 400)
term_type = dict_input.get('term', None)
term_type = dict_input.get('type', None)
if not term_type:
return ({"status": "error", "reason": "Term type not provided"}, 400)
nb_words = dict_input.get('nb_words', 1)
res = parse_tracked_term_to_add(term , term_type, nb_words=nb_words)
if res['status']=='error':
if res[1]!=200:
return res
term = res[0]['term']
term_type = res[0]['type']
# get user_id
tags = dict_input.get('tags', [])
mails = dict_input.get('mails', [])
## TODO: verify mail integrity
res = verify_mail_list(mails)
if res:
return res
## TODO: add dashboard key
level = dict_input.get('level', 1)
@ -77,17 +110,20 @@ def parse_json_term_to_add(dict_input):
except:
level = 1
# check if term already tracked in global
if level==1:
if is_term_tracked_in_global_level(term):
return ({"status": "error", "reason": "Term already tracked"}, 409)
term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)
return ({'term': term, 'uuid': term_uuid}, 200)
return ({'term': term, 'type': term_type, 'uuid': term_uuid}, 200)
def parse_tracked_term_to_add(term , term_type, nb_words=1):
# todo verify regex format
if term_type=='regex':
# TODO: verify regex integrity
pass
if not is_valid_regex(term):
return ({"status": "error", "reason": "Invalid regex"}, 400)
elif term_type=='word' or term_type=='set':
# force lowercase
term = term.lower()
@ -97,7 +133,7 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1):
return ({"status": "error", "reason": "special character not allowed", "message": "Please use a regex or remove all special characters"}, 400)
words = term.split()
# not a word
if term_type=='word' and words:
if term_type=='word' and len(words)>1:
term_type = 'set'
# ouput format: term1,term2,term3;2
@ -106,19 +142,21 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1):
nb_words = int(nb_words)
except:
nb_words = 1
if nb_words==0:
nb_words = 1
words_set = set(words)
words_set = sorted(words_set)
term = ",".join(words_set)
term = "{};{}".format(term, nb_words)
print(term)
print(term_type)
return ({"status": "success", "term": term, "type": term_type}, 200)
if nb_words > len(words_set):
nb_words = len(words_set)
else:
return ({"status": "error", "reason": "Incorrect type"}, 400)
return ({"status": "success", "term": term, "type": term_type}, 200)
def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
@ -154,9 +192,44 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
return term_uuid
def delete_term(term_uuid):
    """Delete a tracker and every Redis key/set entry that references it.

    Reads the tracker metadata from the ``tracked_term:<uuid>`` hash, then
    removes the uuid from the term->uuid map and from the user or global
    set, and finally deletes the metadata hash plus the tags, mails and
    items sets of the tracker.

    :param term_uuid: uuid of the tracker to delete
    """
    meta_key = 'tracked_term:{}'.format(term_uuid)
    # Metadata must be read before the hash is deleted further down.
    term = r_serv_term.hget(meta_key, 'tracked')
    term_type = r_serv_term.hget(meta_key, 'type')
    term_level = r_serv_term.hget(meta_key, 'level')

    uuid_map_key = 'all:tracked_term_uuid:{}'.format(term)
    r_serv_term.srem(uuid_map_key, term_uuid)
    # 'all:tracked_term:<type>' stores the terms themselves (not uuids), so
    # the term is dropped from it only once no other tracker uses it.
    if not r_serv_term.exists(uuid_map_key):
        r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)

    # The level comes back from Redis as a string; normalise it before
    # comparing. A missing or malformed level matches neither branch.
    try:
        level = int(term_level)
    except (TypeError, ValueError):
        level = None

    if level == 0: # user only
        user_id = r_serv_term.hget(meta_key, 'user_id')
        r_serv_term.srem('user:tracked_term:{}'.format(user_id), term_uuid)
    elif level == 1: # global
        # NOTE(review): key is spelled 'gobal' - confirm it matches the key
        # written by add_tracked_term before renaming it.
        r_serv_term.srem('gobal:tracked_term', term_uuid)

    # delete metadata
    r_serv_term.delete(meta_key)
    # remove tags
    r_serv_term.delete('tracked_term:tags:{}'.format(term_uuid))
    # remove mails
    r_serv_term.delete('tracked_term:mail:{}'.format(term_uuid))
    # remove item set
    r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))
def get_term_uuid_list(term):
    """Return, as a list, the members of the Redis set
    ``all:tracked_term_uuid:<term>``.

    :param term: tracked term
    """
    uuid_set_key = 'all:tracked_term_uuid:{}'.format(term)
    tracker_uuids = r_serv_term.smembers(uuid_set_key)
    return list(tracker_uuids)
def get_term_tags(term_uuid):
    """Return, as a list, the members of the Redis set
    ``tracked_term:tags:<term_uuid>``.

    :param term_uuid: uuid of the tracker
    """
    tags_key = 'tracked_term:tags:{}'.format(term_uuid)
    tracker_tags = r_serv_term.smembers(tags_key)
    return list(tracker_tags)
def get_term_mails(term_uuid):
    """Return, as a list, the members of the Redis set
    ``tracked_term:mail:<term_uuid>``.

    :param term_uuid: uuid of the tracker
    """
    mails_key = 'tracked_term:mail:{}'.format(term_uuid)
    tracker_mails = r_serv_term.smembers(mails_key)
    return list(tracker_mails)
def add_tracked_item(term_uuid, item_id):
    """Add *item_id* to the Redis set ``tracked_term:item:<term_uuid>``.

    :param term_uuid: uuid of the tracker that matched
    :param item_id: id of the item to record
    """
    items_key = 'tracked_term:item:{}'.format(term_uuid)
    r_serv_term.sadd(items_key, item_id)

View file

@ -23,7 +23,7 @@ sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon
##### Notifications ######
[Notifications]
ail_domain = http://localhost:7000
ail_domain = https://localhost:7000
sender = sender@example.com
sender_host = smtp.example.com
sender_port = 1337

View file

@ -600,7 +600,7 @@ Add term tracker
- `term`
- term to add
- *str - word(s)*
- default: `text`
- mandatory
- `nb_words`
- number of words in set
- *int*

View file

@ -17,6 +17,7 @@ import Import_helper
import Item
import Paste
import Tag
import Term
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response
from flask_login import login_required
@ -55,8 +56,11 @@ def verify_token(token):
else:
return False
def get_user_from_token(token):
    """Return the value stored for *token* in the ``user:tokens`` Redis hash.

    :param token: API token to look up
    :return: whatever ``hget`` returns for that field
    """
    user_id = r_serv_db.hget('user:tokens', token)
    return user_id
def verify_user_role(role, token):
user_id = r_serv_db.hget('user:tokens', token)
user_id = get_user_from_token(token)
if user_id:
if is_in_role(user_id, role):
return True
@ -308,13 +312,17 @@ def get_all_tags():
return Response(json.dumps(res, indent=2, sort_keys=True), mimetype='application/json'), 200
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # TAGS # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # TRACKER # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@restApi.route("api/v1/add/tracker/term", methods=['POST'])
@token_required('analyst')
def add_tracker_term():
    """REST endpoint: add a new term tracker for the authenticated user.

    Expects a JSON object in the request body and hands it, together with
    the user resolved from the request's auth token, to
    ``Term.parse_json_term_to_add``. The ``(payload, status_code)`` tuple
    returned by that helper is serialized as the JSON response.

    A missing or non-object JSON body is answered with a 400 error instead
    of being passed on to the parser.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        res = ({"status": "error", "reason": "Malformed JSON"}, 400)
    else:
        user_token = get_auth_from_header()
        user_id = get_user_from_token(user_token)
        res = Term.parse_json_term_to_add(data, user_id)
    return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # IMPORT # # # # # # # # # # # # # # # # # #