Merge pull request #266 from kovacsbalu/fix-regexp-timeout

Stop regexp processing after timeout (60sec)
This commit is contained in:
Sami Mokaddem 2018-10-09 14:48:08 +02:00 committed by GitHub
commit d9bc2d6f44
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 17 deletions

View file

@ -9,35 +9,43 @@ supplied in the term webpage.
import redis import redis
import time import time
from pubsublogger import publisher from pubsublogger import publisher
from packages import lib_words
from packages import Paste from packages import Paste
import os
from os import environ
import datetime
import calendar import calendar
import re import re
import signal
import time
from Helper import Process from Helper import Process
# Email notifications # Email notifications
from NotificationHelper import * from NotificationHelper import *
class TimeoutException(Exception):
    """Signal that an operation ran longer than its allotted time.

    Raised from the SIGALRM handler so that the code interrupted by the
    alarm can catch it and move on to the next item.
    """
def timeout_handler(signum, frame):
    """Abort the interrupted code by raising TimeoutException.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was executing when the signal arrived
            (unused).

    Raises:
        TimeoutException: Always.
    """
    raise TimeoutException


# Install the handler at import time so that an expired signal.alarm()
# surfaces as a TimeoutException in the running code. SIGALRM exists only on
# Unix, and signal.signal() must be called from the main thread.
signal.signal(signal.SIGALRM, timeout_handler)
# Config Variables # Config Variables
DICO_REFRESH_TIME = 60 #s DICO_REFRESH_TIME = 60 # s
BlackListTermsSet_Name = "BlackListSetTermSet" BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet" TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet" TrackedRegexSet_Name = "TrackedRegexSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24 oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7] top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31] top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]
# create direct link in mail # create direct link in mail
full_paste_url = "/showsavedpaste/?paste=" full_paste_url = "/showsavedpaste/?paste="
def refresh_dicos(): def refresh_dicos():
dico_regex = {} dico_regex = {}
dico_regexname_to_redis = {} dico_regexname_to_redis = {}
@ -53,6 +61,7 @@ if __name__ == "__main__":
config_section = 'RegexForTermsFrequency' config_section = 'RegexForTermsFrequency'
p = Process(config_section) p = Process(config_section)
max_execution_time = p.config.getint(config_section, "max_execution_time")
# REDIS # # REDIS #
server_term = redis.StrictRedis( server_term = redis.StrictRedis(
@ -67,7 +76,7 @@ if __name__ == "__main__":
# create direct link in mail # create direct link in mail
full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
#compile the regex # compile the regex
dico_refresh_cooldown = time.time() dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos() dico_regex, dico_regexname_to_redis = refresh_dicos()
@ -87,13 +96,22 @@ if __name__ == "__main__":
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp) curr_set = top_termFreq_setName_day[0] + str(timestamp)
content = Paste.Paste(filename).get_p_content() paste = Paste.Paste(filename)
content = paste.get_p_content()
#iterate the word with the regex # iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items(): for regex_str, compiled_regex in dico_regex.items():
matched = compiled_regex.search(content)
if matched is not None: #there is a match signal.alarm(max_execution_time)
try:
matched = compiled_regex.search(content)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_path))
continue
else:
signal.alarm(0)
if matched is not None: # there is a match
print('regex matched {}'.format(regex_str)) print('regex matched {}'.format(regex_str))
matched = matched.group(0) matched = matched.group(0)
regex_str_complete = "/" + regex_str + "/" regex_str_complete = "/" + regex_str + "/"
@ -104,8 +122,8 @@ if __name__ == "__main__":
# create mail body # create mail body
mail_body = ("AIL Framework,\n" mail_body = ("AIL Framework,\n"
"New occurrence for regex: " + regex_str + "\n" "New occurrence for regex: " + regex_str + "\n"
''+full_paste_url + filename) ''+full_paste_url + filename)
# Send to every associated email address # Send to every associated email address
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete): for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
@ -115,9 +133,9 @@ if __name__ == "__main__":
new_to_the_set = server_term.sadd(set_name, filename) new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False new_to_the_set = True if new_to_the_set == 1 else False
#consider the number of occurrences of this term # consider the number of occurrences of this term
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1))) regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
#1 term per paste # 1 term per paste
if new_to_the_set: if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1))) regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1)) server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))

View file

@ -99,6 +99,9 @@ operation_mode = 3
#Define the time during which a paste will be considered a duplicate, in seconds (1 day = 86400) #Define the time during which a paste will be considered a duplicate, in seconds (1 day = 86400)
ttl_duplicate = 86400 ttl_duplicate = 86400
[RegexForTermsFrequency]
max_execution_time = 60
##### Redis ##### ##### Redis #####
[Redis_Cache] [Redis_Cache]
host = localhost host = localhost