Merge remote-tracking branch 'upstream/master' into production

This commit is contained in:
Mokaddem 2017-01-17 16:38:48 +01:00
commit c422db6e6d
8 changed files with 129 additions and 13 deletions

View file

@ -101,6 +101,37 @@ Eventually you can browse the status of the AIL framework website at the followi
``http://localhost:7000/``
How to
======
How to feed the AIL framework
-----------------------------
For the moment, there are two different ways to feed AIL with data:
1. Be a collaborator of CIRCL and ask to access our feed. It will be sent to the static IP you are using for AIL.
2. You can set up [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL (see below).
### Feeding AIL with pystemon
AIL is an analysis tool, not a collector!
However, if you want to collect some pastes and feed them to AIL, the procedure is described below.
Nevertheless, moderate your queries!
Here are the steps to set up pystemon and feed data to AIL:
1. Clone the [pystemon's git repository](https://github.com/CIRCL/pystemon)
2. Install its python dependencies inside your virtual environment
3. Launch pystemon ``` ./pystemon ```
4. Edit the configuration file ```packages/config.cfg``` (located under ```$AIL_BIN```) and set ```pystemonpath``` in the ```[Directories]``` section to your pystemon directory
5. Launch pystemon-feeder ``` ./pystemon-feeder.py ```
How to create a new module
--------------------------
@ -117,6 +148,10 @@ Feel free to fork the code, play with it, make some patches or add additional an
To contribute your module, feel free to open a pull request with your contribution.
Overview and License
====================
Redis and LevelDB overview
--------------------------

View file

@ -32,6 +32,20 @@ accepted_Mime_type = ['text/plain']
size_threshold = 250
line_max_length_threshold = 1000
import os
import ConfigParser
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
raise Exception('Unable to find the configuration file. \
Did you set environment variables? \
Or activate the virtualenv.')
cfg = ConfigParser.ConfigParser()
cfg.read(configfile)
sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file")
def Analyse(message, server):
path = message
paste = Paste.Paste(path)
@ -61,7 +75,7 @@ def Analyse(message, server):
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
neg_line = 0
pos_line = 0
sid = SentimentIntensityAnalyzer()
sid = SentimentIntensityAnalyzer(sentiment_lexicon_file)
for sentence in sentences:
ss = sid.polarity_scores(sentence)
for k in sorted(ss):

View file

@ -28,6 +28,15 @@ from packages import Paste
from pubsublogger import publisher
from Helper import Process
import signal
class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm fires."""
def timeout_handler(signum, frame):
    """Signal handler that interrupts the running code with TimeoutException.

    ``signum`` and ``frame`` are the two arguments the ``signal`` module
    passes to every handler; neither is used here.

    Raises:
        TimeoutException: always, so the surrounding ``try`` block can
            abandon the work in progress.
    """
    raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
if __name__ == "__main__":
publisher.port = 6380
@ -44,10 +53,17 @@ if __name__ == "__main__":
print message
if message is not None:
paste = Paste.Paste(message)
for word, score in paste._get_top_words().items():
if len(word) >= 4:
msg = '{} {} {}'.format(paste.p_path, word, score)
p.populate_set_out(msg)
signal.alarm(5)
try:
for word, score in paste._get_top_words().items():
if len(word) >= 4:
msg = '{} {} {}'.format(paste.p_path, word, score)
p.populate_set_out(msg)
except TimeoutException:
print ("{0} processing timeout".format(paste.p_path))
continue
else:
signal.alarm(0)
else:
publisher.debug("Tokeniser is idling 10s")
time.sleep(10)

View file

@ -113,7 +113,7 @@ if __name__ == "__main__":
# IP allocation)
if cc is not None and cc != "EU":
print hostl, asn, cc, \
pycountry.countries.get(alpha2=cc).name
pycountry.countries.get(alpha_2=cc).name
if cc == cc_critical:
to_print = 'Url;{};{};{};Detected {} {}'.format(
PST.p_source, PST.p_date, PST.p_name,

View file

@ -39,7 +39,7 @@ def get_date_range(num_day):
return date_list
# Compute the progression for one keyword
def compute_progression_word(keyword):
def compute_progression_word(server, num_day, keyword):
date_range = get_date_range(num_day)
# check if this keyword is eligible for progression
keyword_total_sum = 0
@ -73,12 +73,12 @@ def compute_progression(server, field_name, num_day, url_parsed):
if keyword is not None:
#compute the progression of the current word
keyword_increase, keyword_total_sum = compute_progression_word(keyword)
keyword_increase, keyword_total_sum = compute_progression_word(server, num_day, keyword)
#re-compute the progression of 2*max_set_cardinality
current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality)
for word, value in array_top_day:
word_inc, word_tot_sum = compute_progression_word(word)
for word, value in current_top:
word_inc, word_tot_sum = compute_progression_word(server, num_day, word)
server.zrem(redis_progression_name_set, word)
if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase):
server.zadd(redis_progression_name_set, float(word_inc), word)

21
bin/feeder/pystemon-feeder.py Normal file → Executable file
View file

@ -24,13 +24,28 @@ import sys
import time
import redis
import base64
import os
import ConfigParser
port = "5556"
pystemonpath = "/home/pystemon/pystemon/"
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
raise Exception('Unable to find the configuration file. \
Did you set environment variables? \
Or activate the virtualenv.')
cfg = ConfigParser.ConfigParser()
cfg.read(configfile)
if cfg.has_option("ZMQ_Global", "bind"):
zmq_url = cfg.get("ZMQ_Global", "bind")
else:
zmq_url = "tcp://127.0.0.1:5556"
pystemonpath = cfg.get("Directories", "pystemonpath")
context = zmq.Context()
socket = context.socket(zmq.PUB)
socket.bind("tcp://*:%s" % port)
socket.bind(zmq_url)
# check https://github.com/cvandeplas/pystemon/blob/master/pystemon.yaml#L16
r = redis.StrictRedis(host='localhost', db=10)

View file

@ -14,6 +14,10 @@ tldsfile = faup/src/data/mozilla.tlds
domainstrending_csv = var/www/static/csv/domainstrendingdata
pystemonpath = /home/pystemon/pystemon/
sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt
##### Flask #####
[Flask]
#Maximum number of characters to display in the tooltip
@ -128,6 +132,7 @@ path = indexdir
#address = tcp://crf.circl.lu:5556
address = tcp://127.0.0.1:5556
channel = 102
bind = tcp://127.0.0.1:5556
[ZMQ_Url]
address = tcp://127.0.0.1:5004

31
doc/all_modules.txt Normal file
View file

@ -0,0 +1,31 @@
Attributes
BrowseWarningPaste
Categ
Credential
CreditCards
Curve
CurveManageTopSets
Cve
DomClassifier
Duplicates
Global
Indexer
Keys
Lines
Mail
Mixer
ModuleInformation
Keys
Lines
Mail
Mixer
ModuleInformation
ModuleStats
Onion
Phone
Release
SentimentAnalysis
SQLInjectionDetection
Tokenize
Web
WebStats