chg: [Term tracker] add term tracker module (word + set) + API: add new term to track (word + set + regex)

This commit is contained in:
Terrtia 2019-08-07 12:08:24 +02:00
parent 28320a32a6
commit bb6d3a6a26
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
6 changed files with 170 additions and 62 deletions

View file

@ -20,13 +20,6 @@ configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
publisher.port = 6380 publisher.port = 6380
publisher.channel = "Script" publisher.channel = "Script"
# notifications enabled/disabled
TrackedTermsNotificationEnabled_Name = "TrackedNotifications"
# associated notification email addresses for a specific term
# Keys will be e.g. TrackedNotificationEmails_<TERMNAME>
TrackedTermsNotificationEmailsPrefix_Name = "TrackedNotificationEmails_"
def sendEmailNotification(recipient, alert_name, content): def sendEmailNotification(recipient, alert_name, content):
if not os.path.exists(configfile): if not os.path.exists(configfile):

View file

@ -9,50 +9,84 @@ import os
import sys import sys
import time import time
from Helper import Process
from pubsublogger import publisher
import NotificationHelper
from packages import Paste from packages import Paste
from packages import Term from packages import Term
sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config import Flask_config
r_serv_term = Flask_config.r_serv_term full_item_url = "/showsavedpaste/?paste="
mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\nitem id: {}\nurl: {}{}"
# loads tracked words # loads tracked words
list_tracked_words = Term.get_tracked_words_list() list_tracked_words = Term.get_tracked_words_list()
set_tracked_words_list = Term.get_set_tracked_words_list() set_tracked_words_list = Term.get_set_tracked_words_list()
def new_term_found(term, term_type, item_id):
    """Register a tracked-term hit on an item, then emit tags and mail alerts.

    For every tracker uuid returned by ``Term.get_term_uuid_list(term)``:
    the item is added to that tracker's item set, one ``'<tag>;<item_id>'``
    message per tracker tag is pushed to the 'Tags' queue, and one mail is
    sent to each address registered on the tracker.

    :param term: the tracked term that matched (for a set, the raw set entry)
    :param term_type: 'word' or 'set' as passed by the caller; not used here
    :param item_id: id of the item in which the term was found
    """
    for term_uuid in Term.get_term_uuid_list(term):
        Term.add_tracked_item(term_uuid, item_id)

        # one 'Tags' queue message per tag attached to this tracker
        for tag in Term.get_term_tags(term_uuid):
            p.populate_set_out('{};{}'.format(tag, item_id), 'Tags')

        mail_to_notify = Term.get_term_mails(term_uuid)
        if mail_to_notify:
            # the body is identical for every recipient: build it once
            mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
            for mail in mail_to_notify:
                NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body)
if __name__ == "__main__":
    # Module entry point: pop item ids from the module queue and look for
    # tracked words / word sets in each item's content.
    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script TermTrackerMod started")

    #config_section = 'TermTrackerMod'
    config_section = 'Curve'
    p = Process(config_section)

    # prefix the relative item path with the configured AIL domain
    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:
        item_id = p.get_from_set()

        # NOTE(review): assumes get_from_set() returns None when the queue is
        # empty, as the original None check implies - confirm against Helper.
        if item_id is not None:
            paste = Paste.Paste(item_id)
            dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())

            # check solo words
            for word in list_tracked_words:
                if word in dict_words_freq:
                    new_term_found(word, 'word', item_id)

            # check words set: each entry is (list_words, nb_words_threshold, raw_set_entry)
            for elem in set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]

                nb_uniq_word = sum(1 for word in list_words if word in dict_words_freq)
                if nb_uniq_word >= nb_words_threshold:
                    new_term_found(word_set, 'set', item_id)
        else:
            # nothing queued: wait before polling again
            time.sleep(5)

View file

@ -2,6 +2,7 @@
# -*-coding:UTF-8 -* # -*-coding:UTF-8 -*
import os import os
import re
import sys import sys
import uuid import uuid
import redis import redis
@ -16,6 +17,7 @@ sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
import Flask_config import Flask_config
r_serv_term = Flask_config.r_serv_term r_serv_term = Flask_config.r_serv_term
email_regex = Flask_config.email_regex
special_characters = set('[<>~!?@#$%^&*|()_-+={}":;,.\'\n\r\t]/\\') special_characters = set('[<>~!?@#$%^&*|()_-+={}":;,.\'\n\r\t]/\\')
special_characters.add('\\s') special_characters.add('\\s')
@ -24,6 +26,26 @@ special_characters.add('\\s')
tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
gaps=True, discard_empty=True) gaps=True, discard_empty=True)
def is_valid_mail(email):
    """Return True when ``email`` matches the module-level ``email_regex``."""
    return email_regex.match(email) is not None
def verify_mail_list(mail_list):
    """Validate every address in ``mail_list``.

    Returns ``None`` when all addresses pass ``is_valid_mail``; otherwise
    returns an ``(error_dict, 400)`` tuple describing the first invalid one.
    """
    for address in mail_list:
        if is_valid_mail(address):
            continue
        error = {'status': 'error', 'reason': 'Invalid email', 'value': address}
        return (error, 400)
    return None
def is_valid_regex(term_regex):
    """Return True when ``term_regex`` compiles as a regular expression.

    :param term_regex: candidate pattern (expected to be a string)
    :return: True if ``re.compile`` accepts it, False otherwise
    """
    try:
        re.compile(term_regex)
    except (re.error, TypeError, ValueError, OverflowError, RecursionError):
        # Only failures of the compile itself mean "invalid pattern"; a bare
        # except would also swallow KeyboardInterrupt / SystemExit.
        return False
    return True
def get_text_word_frequency(item_content, filtering=True): def get_text_word_frequency(item_content, filtering=True):
item_content = item_content.lower() item_content = item_content.lower()
words_dict = defaultdict(int) words_dict = defaultdict(int)
@ -34,7 +56,6 @@ def get_text_word_frequency(item_content, filtering=True):
blob = TextBlob(item_content) blob = TextBlob(item_content)
for word in blob.tokens: for word in blob.tokens:
words_dict[word] += 1 words_dict[word] += 1
print(words_dict)
return words_dict return words_dict
# # TODO: create all tracked words # # TODO: create all tracked words
@ -45,28 +66,40 @@ def get_set_tracked_words_list():
set_list = r_serv_term.smembers('all:tracked_term:set') set_list = r_serv_term.smembers('all:tracked_term:set')
all_set_list = [] all_set_list = []
for elem in set_list: for elem in set_list:
elem = elem.split(';') res = elem.split(';')
num_words = int(elem[1]) num_words = int(res[1])
ter_set = elem[0].split(',') ter_set = res[0].split(',')
all_set_list.append((ter_set, num_words)) all_set_list.append((ter_set, num_words, elem))
return all_set_list
def is_term_tracked_in_global_level(term):
    """Return True when at least one tracker of ``term`` has level '1'.

    Reads the uuids stored under ``all:tracked_term_uuid:<term>`` and checks
    the 'level' field of each ``tracked_term:<uuid>`` hash.
    """
    tracker_uuids = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))
    return any(
        r_serv_term.hget('tracked_term:{}'.format(tracker_uuid), 'level') == '1'
        for tracker_uuid in (tracker_uuids or ())
    )
def parse_json_term_to_add(dict_input, user_id):
term = dict_input.get('term', None) term = dict_input.get('term', None)
if not term: if not term:
return ({"status": "error", "reason": "Term not provided"}, 400) return ({"status": "error", "reason": "Term not provided"}, 400)
term_type = dict_input.get('term', None) term_type = dict_input.get('type', None)
if not term_type: if not term_type:
return ({"status": "error", "reason": "Term type not provided"}, 400) return ({"status": "error", "reason": "Term type not provided"}, 400)
nb_words = dict_input.get('nb_words', 1) nb_words = dict_input.get('nb_words', 1)
res = parse_tracked_term_to_add(term , term_type, nb_words=nb_words) res = parse_tracked_term_to_add(term , term_type, nb_words=nb_words)
if res['status']=='error': if res[1]!=200:
return res return res
term = res[0]['term']
term_type = res[0]['type']
# get user_id
tags = dict_input.get('tags', []) tags = dict_input.get('tags', [])
mails = dict_input.get('mails', []) mails = dict_input.get('mails', [])
## TODO: verify mail integrity res = verify_mail_list(mails)
if res:
return res
## TODO: add dashboard key ## TODO: add dashboard key
level = dict_input.get('level', 1) level = dict_input.get('level', 1)
@ -77,17 +110,20 @@ def parse_json_term_to_add(dict_input):
except: except:
level = 1 level = 1
# check if term already tracked in global
if level==1:
if is_term_tracked_in_global_level(term):
return ({"status": "error", "reason": "Term already tracked"}, 409)
term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails) term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)
return ({'term': term, 'uuid': term_uuid}, 200) return ({'term': term, 'type': term_type, 'uuid': term_uuid}, 200)
def parse_tracked_term_to_add(term , term_type, nb_words=1): def parse_tracked_term_to_add(term , term_type, nb_words=1):
# todo verify regex format
if term_type=='regex': if term_type=='regex':
# TODO: verify regex integrity if not is_valid_regex(term):
pass return ({"status": "error", "reason": "Invalid regex"}, 400)
elif term_type=='word' or term_type=='set': elif term_type=='word' or term_type=='set':
# force lowercase # force lowercase
term = term.lower() term = term.lower()
@ -97,7 +133,7 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1):
return ({"status": "error", "reason": "special character not allowed", "message": "Please use a regex or remove all special characters"}, 400) return ({"status": "error", "reason": "special character not allowed", "message": "Please use a regex or remove all special characters"}, 400)
words = term.split() words = term.split()
# not a word # not a word
if term_type=='word' and words: if term_type=='word' and len(words)>1:
term_type = 'set' term_type = 'set'
# output format: term1,term2,term3;2 # output format: term1,term2,term3;2
@ -106,19 +142,21 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1):
nb_words = int(nb_words) nb_words = int(nb_words)
except: except:
nb_words = 1 nb_words = 1
if nb_words==0:
nb_words = 1
words_set = set(words) words_set = set(words)
words_set = sorted(words_set) words_set = sorted(words_set)
term = ",".join(words_set) term = ",".join(words_set)
term = "{};{}".format(term, nb_words) term = "{};{}".format(term, nb_words)
print(term) if nb_words > len(words_set):
print(term_type) nb_words = len(words_set)
return ({"status": "success", "term": term, "type": term_type}, 200)
else: else:
return ({"status": "error", "reason": "Incorrect type"}, 400) return ({"status": "error", "reason": "Incorrect type"}, 400)
return ({"status": "success", "term": term, "type": term_type}, 200)
def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0): def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
@ -154,9 +192,44 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
return term_uuid return term_uuid
def delete_term(term_uuid):
    """Delete a tracked term and every redis key attached to its uuid.

    Removes the uuid from the term -> uuid map, from the per-type set and
    from the user/global display set, then deletes the tracker's metadata
    hash, tags set, mails set and matched-items set.

    :param term_uuid: uuid of the tracker to delete
    """
    meta_key = 'tracked_term:{}'.format(term_uuid)
    term = r_serv_term.hget(meta_key, 'tracked')
    term_type = r_serv_term.hget(meta_key, 'type')
    term_level = r_serv_term.hget(meta_key, 'level')

    r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid)
    # NOTE(review): elsewhere 'all:tracked_term:set' is read as a set of terms,
    # not uuids - confirm which member add_tracked_term stores here.
    r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid)

    # The level comes back from redis as text (or None when the hash is
    # missing); normalise it so the comparisons below can match.
    try:
        level = int(term_level)
    except (TypeError, ValueError):
        level = None

    if level == 0: # user only
        user_id = r_serv_term.hget(meta_key, 'user_id')
        r_serv_term.srem('user:tracked_term:{}'.format(user_id), term_uuid)
    elif level == 1: # global
        # NOTE(review): 'gobal' looks like a typo for 'global'; key left
        # unchanged - verify against the key written by add_tracked_term.
        r_serv_term.srem('gobal:tracked_term', term_uuid)

    # delete metadata
    r_serv_term.delete(meta_key)
    # remove tags
    r_serv_term.delete('tracked_term:tags:{}'.format(term_uuid))
    # remove mails
    r_serv_term.delete('tracked_term:mail:{}'.format(term_uuid))
    # remove item set
    r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))
def get_term_uuid_list(term):
    """Return, as a list, the tracker uuids stored for ``term``."""
    redis_key = 'all:tracked_term_uuid:{}'.format(term)
    tracker_uuids = r_serv_term.smembers(redis_key)
    return list(tracker_uuids)
def get_term_tags(term_uuid):
    """Return, as a list, the tags stored for the tracker ``term_uuid``."""
    redis_key = 'tracked_term:tags:{}'.format(term_uuid)
    tracker_tags = r_serv_term.smembers(redis_key)
    return list(tracker_tags)
def get_term_mails(term_uuid):
    """Return, as a list, the mail addresses stored for the tracker ``term_uuid``."""
    redis_key = 'tracked_term:mail:{}'.format(term_uuid)
    tracker_mails = r_serv_term.smembers(redis_key)
    return list(tracker_mails)
def add_tracked_item(term_uuid, item_id):
    """Add ``item_id`` to the item set of the tracker ``term_uuid``."""
    redis_key = 'tracked_term:item:{}'.format(term_uuid)
    r_serv_term.sadd(redis_key, item_id)

View file

@ -23,7 +23,7 @@ sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon
##### Notifications ###### ##### Notifications ######
[Notifications] [Notifications]
ail_domain = http://localhost:7000 ail_domain = https://localhost:7000
sender = sender@example.com sender = sender@example.com
sender_host = smtp.example.com sender_host = smtp.example.com
sender_port = 1337 sender_port = 1337

View file

@ -600,7 +600,7 @@ Add term tracker
- `term` - `term`
- term to add - term to add
- *str - word(s)* - *str - word(s)*
- default: `text` - mandatory
- `nb_words` - `nb_words`
- number of words in set - number of words in set
- *int* - *int*

View file

@ -17,6 +17,7 @@ import Import_helper
import Item import Item
import Paste import Paste
import Tag import Tag
import Term
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response
from flask_login import login_required from flask_login import login_required
@ -55,8 +56,11 @@ def verify_token(token):
else: else:
return False return False
def get_user_from_token(token):
    """Look up the value stored for ``token`` in the 'user:tokens' hash."""
    user_id = r_serv_db.hget('user:tokens', token)
    return user_id
def verify_user_role(role, token): def verify_user_role(role, token):
user_id = r_serv_db.hget('user:tokens', token) user_id = get_user_from_token(token)
if user_id: if user_id:
if is_in_role(user_id, role): if is_in_role(user_id, role):
return True return True
@ -308,13 +312,17 @@ def get_all_tags():
return Response(json.dumps(res, indent=2, sort_keys=True), mimetype='application/json'), 200 return Response(json.dumps(res, indent=2, sort_keys=True), mimetype='application/json'), 200
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # TAGS # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # TRACKER # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@restApi.route("api/v1/add/tracker/term", methods=['POST'])
@token_required('analyst')
def add_tracker_term():
    """API endpoint: add a new tracked term for the authenticated user.

    Reads the JSON request body, resolves the caller from the auth token and
    delegates to ``Term.parse_json_term_to_add``, which returns a
    ``(payload_dict, http_status)`` pair that is serialised as the response.
    """
    data = request.get_json()
    # parse_json_term_to_add calls .get() on the payload: reject a missing or
    # non-object body here instead of failing with an AttributeError.
    if not isinstance(data, dict):
        res = ({'status': 'error', 'reason': 'Malformed JSON'}, 400)
    else:
        user_token = get_auth_from_header()
        user_id = get_user_from_token(user_token)
        res = Term.parse_json_term_to_add(data, user_id)
    return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # IMPORT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # IMPORT # # # # # # # # # # # # # # # # # #