mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-01-19 08:46:14 +00:00
Merge remote-tracking branch 'upstream/master' into production
This commit is contained in:
commit
c422db6e6d
8 changed files with 129 additions and 13 deletions
35
README.md
35
README.md
|
@ -101,6 +101,37 @@ Eventually you can browse the status of the AIL framework website at the followi
|
||||||
|
|
||||||
``http://localhost:7000/``
|
``http://localhost:7000/``
|
||||||
|
|
||||||
|
How to
|
||||||
|
======
|
||||||
|
|
||||||
|
How to feed the AIL framework
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
For the moment, there are two different ways to feed AIL with data:
|
||||||
|
|
||||||
|
1. Be a collaborator of CIRCL and ask to access our feed. It will be sent to the static IP you are using for AIL.
|
||||||
|
|
||||||
|
2. You can set up [pystemon](https://github.com/CIRCL/pystemon) and use the custom feeder provided by AIL (see below).
|
||||||
|
|
||||||
|
### Feeding AIL with pystemon
|
||||||
|
AIL is an analysis tool, not a collector!
|
||||||
|
However, if you want to collect some pastes and feed them to AIL, the procedure is described below.
|
||||||
|
|
||||||
|
Nevertheless, moderate your queries!
|
||||||
|
|
||||||
|
Here are the steps to set up pystemon and feed data to AIL:
|
||||||
|
|
||||||
|
1. Clone the [pystemon's git repository](https://github.com/CIRCL/pystemon)
|
||||||
|
|
||||||
|
2. Install its python dependencies inside your virtual environment
|
||||||
|
|
||||||
|
3. Launch pystemon ``` ./pystemon ```
|
||||||
|
|
||||||
|
4. Set the ``pystemonpath`` option (read by ```bin/feeder/pystemon-feeder.py``` from the ``[Directories]`` section of ``packages/config.cfg``) to the location of your pystemon installation
|
||||||
|
|
||||||
|
5. Launch pystemon-feeder ``` ./pystemon-feeder.py ```
|
||||||
|
|
||||||
|
|
||||||
How to create a new module
|
How to create a new module
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
@ -117,6 +148,10 @@ Feel free to fork the code, play with it, make some patches or add additional an
|
||||||
|
|
||||||
To contribute your module, feel free to submit a pull request with your contribution.
|
To contribute your module, feel free to submit a pull request with your contribution.
|
||||||
|
|
||||||
|
Overview and License
|
||||||
|
====================
|
||||||
|
|
||||||
|
|
||||||
Redis and LevelDB overview
|
Redis and LevelDB overview
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,20 @@ accepted_Mime_type = ['text/plain']
|
||||||
size_threshold = 250
|
size_threshold = 250
|
||||||
line_max_length_threshold = 1000
|
line_max_length_threshold = 1000
|
||||||
|
|
||||||
|
import os
|
||||||
|
import ConfigParser
|
||||||
|
|
||||||
|
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
|
||||||
|
if not os.path.exists(configfile):
|
||||||
|
raise Exception('Unable to find the configuration file. \
|
||||||
|
Did you set environment variables? \
|
||||||
|
Or activate the virtualenv.')
|
||||||
|
|
||||||
|
cfg = ConfigParser.ConfigParser()
|
||||||
|
cfg.read(configfile)
|
||||||
|
|
||||||
|
sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file")
|
||||||
|
|
||||||
def Analyse(message, server):
|
def Analyse(message, server):
|
||||||
path = message
|
path = message
|
||||||
paste = Paste.Paste(path)
|
paste = Paste.Paste(path)
|
||||||
|
@ -61,7 +75,7 @@ def Analyse(message, server):
|
||||||
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
|
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
|
||||||
neg_line = 0
|
neg_line = 0
|
||||||
pos_line = 0
|
pos_line = 0
|
||||||
sid = SentimentIntensityAnalyzer()
|
sid = SentimentIntensityAnalyzer(sentiment_lexicon_file)
|
||||||
for sentence in sentences:
|
for sentence in sentences:
|
||||||
ss = sid.polarity_scores(sentence)
|
ss = sid.polarity_scores(sentence)
|
||||||
for k in sorted(ss):
|
for k in sorted(ss):
|
||||||
|
|
|
@ -28,6 +28,15 @@ from packages import Paste
|
||||||
from pubsublogger import publisher
|
from pubsublogger import publisher
|
||||||
|
|
||||||
from Helper import Process
|
from Helper import Process
|
||||||
|
import signal
|
||||||
|
|
||||||
|
class TimeoutException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def timeout_handler(signum, frame):
|
||||||
|
raise TimeoutException
|
||||||
|
|
||||||
|
signal.signal(signal.SIGALRM, timeout_handler)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
|
@ -44,10 +53,17 @@ if __name__ == "__main__":
|
||||||
print message
|
print message
|
||||||
if message is not None:
|
if message is not None:
|
||||||
paste = Paste.Paste(message)
|
paste = Paste.Paste(message)
|
||||||
|
signal.alarm(5)
|
||||||
|
try:
|
||||||
for word, score in paste._get_top_words().items():
|
for word, score in paste._get_top_words().items():
|
||||||
if len(word) >= 4:
|
if len(word) >= 4:
|
||||||
msg = '{} {} {}'.format(paste.p_path, word, score)
|
msg = '{} {} {}'.format(paste.p_path, word, score)
|
||||||
p.populate_set_out(msg)
|
p.populate_set_out(msg)
|
||||||
|
except TimeoutException:
|
||||||
|
print ("{0} processing timeout".format(paste.p_path))
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
signal.alarm(0)
|
||||||
else:
|
else:
|
||||||
publisher.debug("Tokeniser is idling 10s")
|
publisher.debug("Tokeniser is idling 10s")
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
|
|
@ -113,7 +113,7 @@ if __name__ == "__main__":
|
||||||
# IP allocation)
|
# IP allocation)
|
||||||
if cc is not None and cc != "EU":
|
if cc is not None and cc != "EU":
|
||||||
print hostl, asn, cc, \
|
print hostl, asn, cc, \
|
||||||
pycountry.countries.get(alpha2=cc).name
|
pycountry.countries.get(alpha_2=cc).name
|
||||||
if cc == cc_critical:
|
if cc == cc_critical:
|
||||||
to_print = 'Url;{};{};{};Detected {} {}'.format(
|
to_print = 'Url;{};{};{};Detected {} {}'.format(
|
||||||
PST.p_source, PST.p_date, PST.p_name,
|
PST.p_source, PST.p_date, PST.p_name,
|
||||||
|
|
|
@ -39,7 +39,7 @@ def get_date_range(num_day):
|
||||||
return date_list
|
return date_list
|
||||||
|
|
||||||
# Compute the progression for one keyword
|
# Compute the progression for one keyword
|
||||||
def compute_progression_word(keyword):
|
def compute_progression_word(server, num_day, keyword):
|
||||||
date_range = get_date_range(num_day)
|
date_range = get_date_range(num_day)
|
||||||
# check if this keyword is eligible for progression
|
# check if this keyword is eligible for progression
|
||||||
keyword_total_sum = 0
|
keyword_total_sum = 0
|
||||||
|
@ -73,12 +73,12 @@ def compute_progression(server, field_name, num_day, url_parsed):
|
||||||
if keyword is not None:
|
if keyword is not None:
|
||||||
|
|
||||||
#compute the progression of the current word
|
#compute the progression of the current word
|
||||||
keyword_increase, keyword_total_sum = compute_progression_word(keyword)
|
keyword_increase, keyword_total_sum = compute_progression_word(server, num_day, keyword)
|
||||||
|
|
||||||
#re-compute the progression of 2*max_set_cardinality
|
#re-compute the progression of 2*max_set_cardinality
|
||||||
current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality)
|
current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality)
|
||||||
for word, value in array_top_day:
|
for word, value in current_top:
|
||||||
word_inc, word_tot_sum = compute_progression_word(word)
|
word_inc, word_tot_sum = compute_progression_word(server, num_day, word)
|
||||||
server.zrem(redis_progression_name_set, word)
|
server.zrem(redis_progression_name_set, word)
|
||||||
if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase):
|
if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase):
|
||||||
server.zadd(redis_progression_name_set, float(word_inc), word)
|
server.zadd(redis_progression_name_set, float(word_inc), word)
|
||||||
|
|
21
bin/feeder/pystemon-feeder.py
Normal file → Executable file
21
bin/feeder/pystemon-feeder.py
Normal file → Executable file
|
@ -24,13 +24,28 @@ import sys
|
||||||
import time
|
import time
|
||||||
import redis
|
import redis
|
||||||
import base64
|
import base64
|
||||||
|
import os
|
||||||
|
import ConfigParser
|
||||||
|
|
||||||
port = "5556"
|
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
|
||||||
pystemonpath = "/home/pystemon/pystemon/"
|
if not os.path.exists(configfile):
|
||||||
|
raise Exception('Unable to find the configuration file. \
|
||||||
|
Did you set environment variables? \
|
||||||
|
Or activate the virtualenv.')
|
||||||
|
|
||||||
|
cfg = ConfigParser.ConfigParser()
|
||||||
|
cfg.read(configfile)
|
||||||
|
|
||||||
|
if cfg.has_option("ZMQ_Global", "bind"):
|
||||||
|
zmq_url = cfg.get("ZMQ_Global", "bind")
|
||||||
|
else:
|
||||||
|
zmq_url = "tcp://127.0.0.1:5556"
|
||||||
|
|
||||||
|
pystemonpath = cfg.get("Directories", "pystemonpath")
|
||||||
|
|
||||||
context = zmq.Context()
|
context = zmq.Context()
|
||||||
socket = context.socket(zmq.PUB)
|
socket = context.socket(zmq.PUB)
|
||||||
socket.bind("tcp://*:%s" % port)
|
socket.bind(zmq_url)
|
||||||
|
|
||||||
# check https://github.com/cvandeplas/pystemon/blob/master/pystemon.yaml#L16
|
# check https://github.com/cvandeplas/pystemon/blob/master/pystemon.yaml#L16
|
||||||
r = redis.StrictRedis(host='localhost', db=10)
|
r = redis.StrictRedis(host='localhost', db=10)
|
||||||
|
|
|
@ -14,6 +14,10 @@ tldsfile = faup/src/data/mozilla.tlds
|
||||||
|
|
||||||
domainstrending_csv = var/www/static/csv/domainstrendingdata
|
domainstrending_csv = var/www/static/csv/domainstrendingdata
|
||||||
|
|
||||||
|
pystemonpath = /home/pystemon/pystemon/
|
||||||
|
|
||||||
|
sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt
|
||||||
|
|
||||||
##### Flask #####
|
##### Flask #####
|
||||||
[Flask]
|
[Flask]
|
||||||
#Maximum number of characters to display in the tooltip
|
#Maximum number of characters to display in the tooltip
|
||||||
|
@ -128,6 +132,7 @@ path = indexdir
|
||||||
#address = tcp://crf.circl.lu:5556
|
#address = tcp://crf.circl.lu:5556
|
||||||
address = tcp://127.0.0.1:5556
|
address = tcp://127.0.0.1:5556
|
||||||
channel = 102
|
channel = 102
|
||||||
|
bind = tcp://127.0.0.1:5556
|
||||||
|
|
||||||
[ZMQ_Url]
|
[ZMQ_Url]
|
||||||
address = tcp://127.0.0.1:5004
|
address = tcp://127.0.0.1:5004
|
||||||
|
|
31
doc/all_modules.txt
Normal file
31
doc/all_modules.txt
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
Attributes
|
||||||
|
BrowseWarningPaste
|
||||||
|
Categ
|
||||||
|
Credential
|
||||||
|
CreditCards
|
||||||
|
Curve
|
||||||
|
CurveManageTopSets
|
||||||
|
Cve
|
||||||
|
DomClassifier
|
||||||
|
Duplicates
|
||||||
|
Global
|
||||||
|
Indexer
|
||||||
|
Keys
|
||||||
|
Lines
|
||||||
|
Mail
|
||||||
|
Mixer
|
||||||
|
ModuleInformation
|
||||||
|
Keys
|
||||||
|
Lines
|
||||||
|
Mail
|
||||||
|
Mixer
|
||||||
|
ModuleInformation
|
||||||
|
ModuleStats
|
||||||
|
Onion
|
||||||
|
Phone
|
||||||
|
Release
|
||||||
|
SentimentAnalysis
|
||||||
|
SQLInjectionDetection
|
||||||
|
Tokenize
|
||||||
|
Web
|
||||||
|
WebStats
|
Loading…
Add table
Reference in a new issue