From 3b101ea8f5fcbfa4d219826645f3e05edc5519b6 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Thu, 12 Jan 2017 07:32:55 +0000 Subject: [PATCH] (partially) Fix #91 using a simple Alarm (SIGNAL) when exec-timeout Introduce a timer (in this case 5 seconds) to ensure that the tokenizer's execution takes less than 5 seconds. This is a simple and standard POSIX signal handler. This approach fixes the specific issues we currently have with some inputs where the tokenization takes too much time. This fix should be improved and made more generic: - Introduce statistics on content that times out. - Keep a list/queue to further process those files using a different tokenizer approach. Maybe a set of "dirty" processes to handle the edge cases so that they do not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). --- bin/Tokenize.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/bin/Tokenize.py b/bin/Tokenize.py index b0adf895..5e5c9b17 100755 --- a/bin/Tokenize.py +++ b/bin/Tokenize.py @@ -28,6 +28,15 @@ from packages import Paste from pubsublogger import publisher from Helper import Process +import signal + +class TimeoutException(Exception): + pass + +def timeout_handler(signum, frame): + raise TimeoutException + +signal.signal(signal.SIGALRM, timeout_handler) if __name__ == "__main__": publisher.port = 6380 @@ -44,10 +53,17 @@ if __name__ == "__main__": print message if message is not None: paste = Paste.Paste(message) - for word, score in paste._get_top_words().items(): - if len(word) >= 4: - msg = '{} {} {}'.format(paste.p_path, word, score) - p.populate_set_out(msg) + signal.alarm(5) + try: + for word, score in paste._get_top_words().items(): + if len(word) >= 4: + msg = '{} {} {}'.format(paste.p_path, word, score) + p.populate_set_out(msg) + except TimeoutException: + print ("{0} processing timeout".format(paste.p_path)) + continue + else: + 
signal.alarm(0) else: publisher.debug("Tokeniser is idling 10s") time.sleep(10)