From 4eafca29366194be5110064a525d4164dff4caa8 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 13 Oct 2016 17:13:08 +0200 Subject: [PATCH 01/14] Added pystemon instruction section --- README.md | 26 ++++++++++++++++++++++++++ bin/feeder/pystemon-feeder.py | 0 2 files changed, 26 insertions(+) mode change 100644 => 100755 bin/feeder/pystemon-feeder.py diff --git a/README.md b/README.md index f5cdb7b4..7483a57f 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,28 @@ Eventually you can browse the status of the AIL framework website at the followi ``http://localhost:7000/`` +How to +====== + +How to feed the AIL framework +----------------------------- + +For now, there are two different way to feed AIL with data: +1. Be a collaborator of CIRCL and ask to access our feed. Then, it will be sent to the static IP your are using for AIL. +2. You can setup [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL + +#Feeding AIL with pystemon +AIL is a analysis tool, not a collector! +However, if you want to collect some paste and feed them to AIL, here is the procedure but moderate your queries quantity!! + +Here are the steps to setup your pystemon and feed data to AIL: +1. Clone the [pystemon's git repository](https://github.com/CIRCL/pystemon) +2. Install its python dependencies inside your virtual environment +3. Launch pystemon ``` ./pystemon ``` +4. Edit the file bin/feeder/pystemon-feeder.py and modify the pystemonpath path accordingly +5. Launch pystemon-feeder ``` ./pystemon-feeder.py ``` + + How to create a new module -------------------------- @@ -117,6 +139,10 @@ Feel free to fork the code, play with it, make some patches or add additional an To contribute your module, feel free to pull your contribution. +Overview and License +==================== + + Redis and LevelDB overview -------------------------- diff --git a/bin/feeder/pystemon-feeder.py b/bin/feeder/pystemon-feeder.py old mode 100644 new mode 100755 From f615333aae4866e72ba0d0c9a847823dd0ba9871 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 13 Oct 2016 17:17:09 +0200 Subject: [PATCH 02/14] fixed typos --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7483a57f..6cb4462b 100644 --- a/README.md +++ b/README.md @@ -109,17 +109,23 @@ How to feed the AIL framework For now, there are two different way to feed AIL with data: 1. Be a collaborator of CIRCL and ask to access our feed. Then, it will be sent to the static IP your are using for AIL. + 2. You can setup [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL #Feeding AIL with pystemon -AIL is a analysis tool, not a collector! -However, if you want to collect some paste and feed them to AIL, here is the procedure but moderate your queries quantity!! +AIL is an analysis tool, not a collector! +However, if you want to collect some paste and feed them to AIL, the procedure is described below. +Nevertheless, moderate your queries! Here are the steps to setup your pystemon and feed data to AIL: 1. Clone the [pystemon's git repository](https://github.com/CIRCL/pystemon) + 2. Install its python dependencies inside your virtual environment + 3. Launch pystemon ``` ./pystemon ``` + 4. Edit the file bin/feeder/pystemon-feeder.py and modify the pystemonpath path accordingly + 5. Launch pystemon-feeder ``` ./pystemon-feeder.py ``` From c88b77c37240e22acc1b735931fd6c4a512885af Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 13 Oct 2016 17:20:25 +0200 Subject: [PATCH 03/14] fixed typos 2 --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6cb4462b..faa65fe9 100644 --- a/README.md +++ b/README.md @@ -115,16 +115,18 @@ For now, there are two different way to feed AIL with data: #Feeding AIL with pystemon AIL is an analysis tool, not a collector! However, if you want to collect some paste and feed them to AIL, the procedure is described below. + Nevertheless, moderate your queries! Here are the steps to setup your pystemon and feed data to AIL: + 1. Clone the [pystemon's git repository](https://github.com/CIRCL/pystemon) 2. Install its python dependencies inside your virtual environment 3. Launch pystemon ``` ./pystemon ``` -4. Edit the file bin/feeder/pystemon-feeder.py and modify the pystemonpath path accordingly +4. Edit the file ```bin/feeder/pystemon-feeder.py``` and modify the pystemonpath path accordingly 5. Launch pystemon-feeder ``` ./pystemon-feeder.py ``` From 343f79886f4b8223f52160ce8b422c8055dff680 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 13 Oct 2016 17:21:31 +0200 Subject: [PATCH 04/14] fixed typo 3 --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index faa65fe9..77bf8a90 100644 --- a/README.md +++ b/README.md @@ -108,11 +108,12 @@ How to feed the AIL framework ----------------------------- For now, there are two different way to feed AIL with data: + 1. Be a collaborator of CIRCL and ask to access our feed. Then, it will be sent to the static IP your are using for AIL. 2. You can setup [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL -#Feeding AIL with pystemon +###Feeding AIL with pystemon AIL is an analysis tool, not a collector! However, if you want to collect some paste and feed them to AIL, the procedure is described below. From c805fee61d5209dbfec9aac95b086594f24dd5ba Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 13 Oct 2016 17:26:20 +0200 Subject: [PATCH 05/14] fixed typo 4 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 77bf8a90..cf638e05 100644 --- a/README.md +++ b/README.md @@ -107,15 +107,15 @@ How to How to feed the AIL framework ----------------------------- -For now, there are two different way to feed AIL with data: +For the moment, there are two different ways to feed AIL with data: -1. Be a collaborator of CIRCL and ask to access our feed. Then, it will be sent to the static IP your are using for AIL. +1. Be a collaborator of CIRCL and ask to access our feed. It will be sent to the static IP your are using for AIL. -2. You can setup [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL +2. You can setup [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL (see below). ###Feeding AIL with pystemon AIL is an analysis tool, not a collector! -However, if you want to collect some paste and feed them to AIL, the procedure is described below. +However, if you want to collect some pastes and feed them to AIL, the procedure is described below. Nevertheless, moderate your queries! From 39ec7a2839abef2bce09524521d10ea4302aaeeb Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 13 Oct 2016 17:26:55 +0200 Subject: [PATCH 06/14] fixed typo 5 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cf638e05..df1b8218 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ However, if you want to collect some pastes and feed them to AIL, the procedure Nevertheless, moderate your queries! -Here are the steps to setup your pystemon and feed data to AIL: +Here are the steps to setup pystemon and feed data to AIL: 1. Clone the [pystemon's git repository](https://github.com/CIRCL/pystemon) From 68ca20db8f44751c367c2e30513ec9d6342ec83a Mon Sep 17 00:00:00 2001 From: Olivier MEDOC Date: Tue, 10 Jan 2017 16:45:32 +0100 Subject: [PATCH 07/14] fix to use the new pycountry API --- bin/Web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Web.py b/bin/Web.py index 49790185..0fae546d 100755 --- a/bin/Web.py +++ b/bin/Web.py @@ -113,7 +113,7 @@ if __name__ == "__main__": # IP allocation) if cc is not None and cc != "EU": print hostl, asn, cc, \ - pycountry.countries.get(alpha2=cc).name + pycountry.countries.get(alpha_2=cc).name if cc == cc_critical: to_print = 'Url;{};{};{};Detected {} {}'.format( PST.p_source, PST.p_date, PST.p_name, From 3dc014dad949b79633e3d6513ee6fea1eb63350d Mon Sep 17 00:00:00 2001 From: Olivier MEDOC Date: Tue, 10 Jan 2017 16:46:46 +0100 Subject: [PATCH 08/14] fix invalid variable propagation --- bin/WebStats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/WebStats.py b/bin/WebStats.py index 1c41b64d..4cc05b48 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -39,7 +39,7 @@ def get_date_range(num_day): return date_list # Compute the progression for one keyword -def compute_progression_word(keyword): +def compute_progression_word(server, num_day, keyword): date_range = get_date_range(num_day) # check if this keyword is eligible for progression keyword_total_sum = 0 @@ -73,12 +73,12 @@ def compute_progression(server, field_name, num_day, url_parsed): if keyword is not None: #compute the progression of the current word - keyword_increase, keyword_total_sum = compute_progression_word(keyword) + keyword_increase, keyword_total_sum = compute_progression_word(server, num_day, keyword) #re-compute the progression of 2*max_set_cardinality current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality) - for word, value in array_top_day: - word_inc, word_tot_sum = compute_progression_word(word) + for word, value in current_top: + word_inc, word_tot_sum = compute_progression_word(server, num_day, word) server.zrem(redis_progression_name_set, word) if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase): server.zadd(redis_progression_name_set, float(word_inc), word) From 8102ff009dfbc1c6a3133107f0b381ece7dfc064 Mon Sep 17 00:00:00 2001 From: Olivier MEDOC Date: Tue, 10 Jan 2017 16:48:05 +0100 Subject: [PATCH 09/14] add all_modules.txt file so that the WebGUI can do proper cleanup --- doc/all_modules.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 doc/all_modules.txt diff --git a/doc/all_modules.txt b/doc/all_modules.txt new file mode 100644 index 00000000..fabdf4e9 --- /dev/null +++ b/doc/all_modules.txt @@ -0,0 +1,31 @@ +Attributes +BrowseWarningPaste +Categ +Credential +CreditCards +Curve +CurveManageTopSets +Cve +DomClassifier +Duplicates +Global +Indexer +Keys +Lines +Mail +Mixer +ModuleInformation +Keys +Lines +Mail +Mixer +ModuleInformation +ModuleStats +Onion +Phone +Release +SentimentAnalysis +SQLInjectionDetection +Tokenize +Web +WebStats From 83db40104c05e4f3466cfd48a3da3ede627f47ec Mon Sep 17 00:00:00 2001 From: Olivier MEDOC Date: Tue, 10 Jan 2017 18:18:55 +0100 Subject: [PATCH 10/14] import pystemon-feeder configuration from the central configuration file --- bin/feeder/pystemon-feeder.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/bin/feeder/pystemon-feeder.py b/bin/feeder/pystemon-feeder.py index 1a9088b3..d36ed66f 100755 --- a/bin/feeder/pystemon-feeder.py +++ b/bin/feeder/pystemon-feeder.py @@ -24,13 +24,24 @@ import sys import time import redis import base64 +import os +import ConfigParser -port = "5556" -pystemonpath = "/home/pystemon/pystemon/" +configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') +if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? \ + Or activate the virtualenv.') + +cfg = ConfigParser.ConfigParser() +cfg.read(configfile) + +zmq_url = cfg.get("ZMQ_Global", "address") +pystemonpath = cfg.get("Directories", "pystemonpath") context = zmq.Context() socket = context.socket(zmq.PUB) -socket.bind("tcp://*:%s" % port) +socket.bind(zmq_url) # check https://github.com/cvandeplas/pystemon/blob/master/pystemon.yaml#L16 r = redis.StrictRedis(host='localhost', db=10) From c0fac820319a9d5e337251794feefff487468f1f Mon Sep 17 00:00:00 2001 From: ptitdoc Date: Tue, 10 Jan 2017 18:33:46 +0100 Subject: [PATCH 11/14] Add pystemon path in configuration sample --- bin/packages/config.cfg.sample | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 4f2899a0..5ce73e3f 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -14,6 +14,8 @@ tldsfile = faup/src/data/mozilla.tlds domainstrending_csv = var/www/static/csv/domainstrendingdata +pystemonpath = /home/pystemon/pystemon/ + ##### Flask ##### [Flask] #Maximum number of character to display in the toolip From 9f9c265cb01f0d3ff202a1f9554169de4c53e1ae Mon Sep 17 00:00:00 2001 From: Olivier MEDOC Date: Wed, 11 Jan 2017 11:00:36 +0100 Subject: [PATCH 12/14] SentimentAnalysis: use lexicon file path from the ail configuration file --- bin/SentimentAnalysis.py | 16 +++++++++++++++- bin/packages/config.cfg.sample | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/bin/SentimentAnalysis.py b/bin/SentimentAnalysis.py index 09f59e40..e16890e5 100755 --- a/bin/SentimentAnalysis.py +++ b/bin/SentimentAnalysis.py @@ -32,6 +32,20 @@ accepted_Mime_type = ['text/plain'] size_threshold = 250 line_max_length_threshold = 1000 +import os +import ConfigParser + +configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') +if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? \ + Or activate the virtualenv.') + +cfg = ConfigParser.ConfigParser() +cfg.read(configfile) + +sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file") + def Analyse(message, server): path = message paste = Paste.Paste(path) @@ -61,7 +75,7 @@ def Analyse(message, server): avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0} neg_line = 0 pos_line = 0 - sid = SentimentIntensityAnalyzer() + sid = SentimentIntensityAnalyzer(sentiment_lexicon_file) for sentence in sentences: ss = sid.polarity_scores(sentence) for k in sorted(ss): diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 5ce73e3f..79bd402b 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -16,6 +16,8 @@ domainstrending_csv = var/www/static/csv/domainstrendingdata pystemonpath = /home/pystemon/pystemon/ +sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt + ##### Flask ##### [Flask] #Maximum number of character to display in the toolip From 3b101ea8f5fcbfa4d219826645f3e05edc5519b6 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Thu, 12 Jan 2017 07:32:55 +0000 Subject: [PATCH 13/14] (partially) Fix #91 using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 5 seconds) to ensure that the execution time of the tokenizer takes less than 5 seconds. This is a simple and standard POSIX signal handler. This approach fixes the specific issues we have currently with some inputs where the tokenization takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different tokenizer approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). --- bin/Tokenize.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/bin/Tokenize.py b/bin/Tokenize.py index b0adf895..5e5c9b17 100755 --- a/bin/Tokenize.py +++ b/bin/Tokenize.py @@ -28,6 +28,15 @@ from packages import Paste from pubsublogger import publisher from Helper import Process +import signal + +class TimeoutException(Exception): + pass + +def timeout_handler(signum, frame): + raise TimeoutException + +signal.signal(signal.SIGALRM, timeout_handler) if __name__ == "__main__": publisher.port = 6380 @@ -44,10 +53,17 @@ if __name__ == "__main__": print message if message is not None: paste = Paste.Paste(message) - for word, score in paste._get_top_words().items(): - if len(word) >= 4: - msg = '{} {} {}'.format(paste.p_path, word, score) - p.populate_set_out(msg) + signal.alarm(5) + try: + for word, score in paste._get_top_words().items(): + if len(word) >= 4: + msg = '{} {} {}'.format(paste.p_path, word, score) + p.populate_set_out(msg) + except TimeoutException: + print ("{0} processing timeout".format(paste.p_path)) + continue + else: + signal.alarm(0) else: publisher.debug("Tokeniser is idling 10s") time.sleep(10) From 16044d4d369ae8056e206c4306c6b2c05ac9baab Mon Sep 17 00:00:00 2001 From: Olivier MEDOC Date: Fri, 13 Jan 2017 14:54:43 +0100 Subject: [PATCH 14/14] pystemon-feeder: add the option bind in configuration file instead of address This is essentially because multiple feeds can be used in the address configuration variable. --- bin/feeder/pystemon-feeder.py | 6 +++++- bin/packages/config.cfg.sample | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/feeder/pystemon-feeder.py b/bin/feeder/pystemon-feeder.py index d36ed66f..770655bc 100755 --- a/bin/feeder/pystemon-feeder.py +++ b/bin/feeder/pystemon-feeder.py @@ -36,7 +36,11 @@ if not os.path.exists(configfile): cfg = ConfigParser.ConfigParser() cfg.read(configfile) -zmq_url = cfg.get("ZMQ_Global", "address") +if cfg.has_option("ZMQ_Global", "bind"): + zmq_url = cfg.get("ZMQ_Global", "bind") +else: + zmq_url = "tcp://127.0.0.1:5556" + pystemonpath = cfg.get("Directories", "pystemonpath") context = zmq.Context() diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 79bd402b..f2e8285d 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -132,6 +132,7 @@ path = indexdir #address = tcp://crf.circl.lu:5556 address = tcp://127.0.0.1:5556 channel = 102 +bind = tcp://127.0.0.1:5556 [ZMQ_Url] address = tcp://127.0.0.1:5004