mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-30 09:47:17 +00:00)
1379ef705a
AIL is a modular framework to analyse potential information leaks from unstructured data sources like pastes from Pastebin or similar services. The AIL framework is flexible and can be extended to support other functionalities to mine sensitive information.
614 lines
20 KiB
Python
import redis, gzip
import sys

import numpy as np
import matplotlib.pyplot as plt
from pylab import *

from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from lib_redis_insert import clean, listdirectory
from lib_jobs import *

from pubsublogger import publisher

import calendar as cal
from datetime import date, timedelta
from dateutil.rrule import rrule, DAILY

from packages import *

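# A minimal wiring sketch (host, port and db below are assumptions, not
# part of the original module): the helpers in this file expect an open
# redis connection and, for redis_words_ranking, a pipeline to batch
# the ZINCRBY calls:
#
#   r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
#   pipe = r_serv.pipeline(transaction=False)
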
def redis_words_ranking(pipe, r_serv, nb, minlength, maxlength):
    """Looping function

    :param pipe: -- Redis pipe.
    :param r_serv: -- Redis connection database
    :param nb: -- (int) Number of pastes processed by the function
    :param minlength: -- (int) passed to the next function
    :param maxlength: -- (int) passed to the next function

    """
    try:
        for n in xrange(0, nb):
            path = r_serv.lpop("filelist")

            if path != None:
                set_listof_pid(r_serv, path, sys.argv[0])

                redis_zincr_words(pipe, path, minlength, maxlength)

                update_listof_pid(r_serv)

                r_serv.lpush("processed", path)

                publisher.debug(path)
            else:
                publisher.debug("Empty list")
                break
    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")


def redis_zincr_words(pipe, filename, minlength, maxlength):
    """Create new sorted sets in redis.

    :param minlength: -- (int) Minimum length of the words inserted
    :param maxlength: -- (int) Maximum length of the words inserted
    :param filename: -- The absolute path to the file.gz to process.

    Representation of the set in redis:

    +------------+------------+-----------+
    | Keys       | Members    | Scores    |
    +============+============+===========+
    | 20131001   | word1      | 142       |
    +------------+------------+-----------+
    | ...        | word2      | 120       |
    +------------+------------+-----------+
    | 20131002   | ...        | ...       |
    +------------+------------+-----------+

    This function stores all words between minlength and maxlength
    in redis.
    Redis also counts how many times each word appears per day:
    the cardinality.

    """
    tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps=True, discard_empty=True)

    with gzip.open(filename, 'rb') as F:
        blob = TextBlob(clean(F.read()), tokenizer=tokenizer)

        for word in blob.tokens:
            if (len(word) >= minlength) and (len(word) <= maxlength):
                # the key is the date (YYYYMMDD) taken from the paste path
                pipe.zincrby(filename[-22:-12].replace('/', ''), word, 1)

            if (len(word) >= maxlength):
                publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
                publisher.info(word)

    pipe.execute()


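# A usage sketch (the connection is an assumption; the key name comes
# from the table above): once redis_zincr_words has filled a day's
# sorted set, the top ten words for 2013-10-01 can be read back with:
#
#   r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
#   for word, score in r_serv.zrevrange("20131001", 0, 9, withscores=True):
#       print word, score
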
def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
    """Tokenizing on word category

    :param r_serv: -- Redis connection database
    :param listname: -- (str) path to the file containing the list of paths of category files
    :param choicedatastruct: -- (bool) Changing the index of the data structure
    :param nb: -- (int) Number of pastes processed by the function
    :param r_set: -- (str) Name of the redis list of pastes to process

    The Redis data structure can be chosen as follows:

    +---------------+------------+-----------+
    | Keys          | Members    | Scores    |
    +===============+============+===========+
    | mails_categ   | filename   | 25000     |
    +---------------+------------+-----------+
    | ...           | filename2  | 2400      |
    +---------------+------------+-----------+
    | web_categ     | ...        | ...       |
    +---------------+------------+-----------+

    Or

    +--------------+-------------+-----------+
    | Keys         | Members     | Scores    |
    +==============+=============+===========+
    | filename     | mails_categ | 100000    |
    +--------------+-------------+-----------+
    | ...          | web_categ   | 24050     |
    +--------------+-------------+-----------+
    | filename2    | ...         | ...       |
    +--------------+-------------+-----------+

    This function tokenizes on all special characters like: @^\|[{#~}]!:;$^=
    and inserts data in redis if a token matches a keyword in a
    previously created list.
    These lists of keywords can contain anything you want, but it is
    better to create "categories" of keywords (see the example layout
    after this function).

    """
    try:
        for n in xrange(0, nb):
            filename = r_serv.lpop(r_set)

            if filename != None:
                tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps=True, discard_empty=True)

                set_listof_pid(r_serv, filename, sys.argv[0])

                with open(listname, 'rb') as L:
                    # for each "categ" listed in the file
                    for num, fname in enumerate(L):
                        # contains the keywords of one categ
                        tmp_list = []

                        # for each keyword
                        with open(fname[:-1], 'rb') as LS:
                            for num, kword in enumerate(LS):
                                tmp_list.append(kword[:-1])

                        # for each paste
                        with gzip.open(filename, 'rb') as F:
                            blob = TextBlob(clean(F.read()),
                                            tokenizer=tokenizer)

                            # for each paste token
                            for word in blob.tokens.lower():
                                if word in tmp_list:
                                    # choosing between the two data structures.
                                    if choicedatastruct:
                                        r_serv.zincrby(filename,
                                                       fname.split('/')[-1][:-1],
                                                       1)
                                    else:
                                        r_serv.zincrby(fname.split('/')[-1][:-1],
                                                       filename,
                                                       1)

                update_listof_pid(r_serv)
            else:
                publisher.debug("Empty list")
                #r_serv.save()
                break

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")


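# An illustration of the expected file layout (all paths and contents
# here are invented for the example): listname holds one category-file
# path per line, and each category file holds one keyword per line:
#
#   /opt/ail/wordlists/categs.txt:
#       /opt/ail/wordlists/mails_categ
#       /opt/ail/wordlists/web_categ
#
#   /opt/ail/wordlists/mails_categ:
#       gmail.com
#       hotmail.com
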
def dectect_longlines(r_serv, r_key, store=False, maxlength=500):
    """Store the line numbers of long lines in redis

    :param r_serv: -- The redis connection database
    :param r_key: -- (str) The key name in redis
    :param store: -- (bool) Store the line numbers or not.
    :param maxlength: -- The limit between "short lines" and "long lines"

    This function pops filenames from a redis list of paste filenames,
    opens each paste and checks whether any line inside has a
    length >= maxlength.
    If so, the paste is "tagged" as containing long lines in another
    redis structure, and the line numbers (of the long lines) can be
    stored in addition if the store argument is True.

    """
    try:
        while True:
            #r_key_list (categ)
            filename = r_serv.lpop(r_key)

            if filename != None:
                set_listof_pid(r_serv, filename, sys.argv[0])

                # for each paste
                with gzip.open(filename, 'rb') as F:
                    var = True
                    for num, line in enumerate(F):
                        if len(line) >= maxlength:
                            #publisher.debug("Longline:{0}".format(line))
                            if var:
                                r_serv.rpush("longlines", filename)
                                var = False

                            if store:
                                r_serv.sadd(filename, num)
                            else:
                                publisher.debug("Line numbers of longlines not stored")

                update_listof_pid(r_serv)
            else:
                publisher.debug("Empty list")
                return False

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")


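# A usage sketch (the connection is an assumption; "filelist" and
# "longlines" are the key names used in this module): tag every queued
# paste whose lines exceed 500 characters, keeping the line numbers:
#
#   r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
#   dectect_longlines(r_serv, "filelist", store=True, maxlength=500)
#   print r_serv.lrange("longlines", 0, -1)
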
# NOT USED RIGHT NOW #
def recovering_longlines(r_serv, nb):
    """Get long lines with their line numbers

    :param r_serv: -- The redis connection database
    :param nb: -- (int) Number of pastes processed by the function

    """
    try:
        for n in xrange(0, nb):
            filename = r_serv.lpop("longlines")

            if filename != None:
                # For each value in redis (longline's line number)
                for numline in r_serv.smembers(filename):

                    with gzip.open(filename, 'rb') as F:

                        for num, line in enumerate(F):
                            # When corresponding.
                            if int(num) == int(numline):
                                pass
                                # TREATMENT
            else:
                publisher.debug("Empty list")
                r_serv.save()
                break

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")


def remove_longline_from_categ(r_serv, r_key, delete, store, maxlength):
    """Remove files with long lines from a sorted set.

    :param r_serv: -- The redis connection database
    :param r_key: -- (str) The key name in redis
    :param store: -- (bool) Store the line numbers or not.
    :param delete: -- (bool) If true, delete the used key from redis.
    :param maxlength: -- The limit between "short lines" and "long lines"

    """
    publisher.info("Number of file before:{0}".format(r_serv.zcard(r_key)))

    # Create a list of files to process (1)
    for filename in r_serv.zrange(r_key, 0, -1):
        r_serv.rpush(r_key+"_list", filename)

    # detecting long lines in pastes
    dectect_longlines(r_serv, r_key+"_list", store, maxlength)

    # remove false positive members
    while True:
        fp_filename = r_serv.lpop("longlines")

        if fp_filename == None:
            break
        else:
            if delete:
                # if wanted, delete in addition the set with line numbers (created with store)
                r_serv.zrem(r_key, fp_filename)
                r_serv.delete(fp_filename)
            else:
                # remove the file with long lines from the r_key zset.
                r_serv.zrem(r_key, fp_filename)

    publisher.info("Longline file removed from {0}, {1} Files remaining".format(r_key, r_serv.zcard(r_key)))


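# A usage sketch (the key name "mails_categ" is borrowed from the
# docstring tables above; the 500-character limit is an assumption):
# drop pastes containing long lines from the mails_categ ranking and
# also erase their stored line-number sets:
#
#   remove_longline_from_categ(r_serv, "mails_categ", True, True, 500)
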
def detect_longline_from_list(r_serv, nb):
    """Run dectect_longlines against the "filelist" queue, storing line numbers."""
    try:
        for n in xrange(0, nb):
            if not dectect_longlines(r_serv, "filelist", True):
                break

    except (KeyboardInterrupt, SystemExit) as e:
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")


def create_dirfile(r_serv, directory, overwrite):
    """Create a listing of paths.

    :param r_serv: -- connection to redis database
    :param directory: -- The folder in which to list the .gz files
    :param overwrite: -- (bool) trigger the overwrite mode

    This function creates a list in redis containing the absolute paths
    of all the pastes that need to be processed by functions running in
    parallel (like redis_words_ranking).

    """
    if overwrite:
        r_serv.delete("filelist")

        for x in listdirectory(directory):
            r_serv.rpush("filelist", x)

        publisher.info("The list was overwritten")

    else:
        if r_serv.llen("filelist") == 0:
            for x in listdirectory(directory):
                r_serv.rpush("filelist", x)

            publisher.info("New list created")
        else:
            for x in listdirectory(directory):
                r_serv.rpush("filelist", x)

            publisher.info("The list was updated with new elements")


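# An end-to-end sketch (the paste directory and the 4/200 word-length
# bounds are assumptions): queue a month of pastes, then rank their
# words by day:
#
#   r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
#   pipe = r_serv.pipeline(transaction=False)
#   create_dirfile(r_serv, "/opt/ail/PASTES/2013/10", True)
#   redis_words_ranking(pipe, r_serv, r_serv.llen("filelist"), 4, 200)
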
def redis_interbargraph_set(r_serv, year, month, overwrite):
    """Create a Redis sorted set.

    :param r_serv: -- connection to redis database
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param overwrite: -- (bool) trigger the overwrite mode

    This function creates, inside redis, the intersections of all days
    in a month, two by two.
    Example:
    For a 31-day month it creates 30 sorted sets, between each day and
    the next, until the last day.
    The overwrite mode deletes the intersection sets and re-creates them.

    """
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    if overwrite:
        r_serv.delete("InterSet")

        for dt in rrule(DAILY, dtstart=a, until=b - timedelta(1)):
            dayafter = dt + timedelta(1)

            r_serv.delete(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))

            r_serv.zinterstore(
                str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")),
                {str(dt.strftime("%Y%m%d")): 1,
                 str(dayafter.strftime("%Y%m%d")): -1})

            r_serv.zadd(
                "InterSet",
                1,
                str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))
    else:
        for dt in rrule(DAILY, dtstart=a, until=b - timedelta(1)):
            dayafter = dt + timedelta(1)

            if r_serv.zcard(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) == 0:

                r_serv.zinterstore(
                    str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")),
                    {str(dt.strftime("%Y%m%d")): 1,
                     str(dayafter.strftime("%Y%m%d")): -1})

                r_serv.zadd(
                    "InterSet",
                    1,
                    str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))

                publisher.info(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))+" Intersection Created")
            else:
                publisher.warning("Data already exist, operation aborted.")


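# A worked example of the weighted ZINTERSTORE above (the counts are
# invented for illustration): with weights {day: 1, dayafter: -1}, the
# destination set keeps only words present on *both* days, scored by
# the difference of their daily counts:
#
#   20131001: {"password": 142}    20131002: {"password": 120}
#   => "2013100120131002" contains {"password": 142*1 + 120*(-1) = 22}
#
# word_bar_graph below only reads the cardinality of these sets, i.e.
# the number of words the two days share.
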
def word_bar_graph(r_serv, year, month, filename):
    """Create a histogram.

    :param r_serv: -- connection to redis database
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param filename: -- The absolute path where to save the figure.png

    This function uses matplotlib to create a histogram.
    The redis database obviously needs to be populated first
    with the functions redis_words_ranking and redis_interbargraph_set.

    """
    lw = []
    adate = []
    inter = [0]
    rcParams['figure.figsize'] = 15, 10

    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    for dt in rrule(DAILY, dtstart=a, until=b):
        lw.append(r_serv.zcard(dt.strftime("%Y%m%d")))
        adate.append(dt.strftime("%d"))

    for x in r_serv.zrange("InterSet", 0, 31):
        inter.append(r_serv.zcard(x))

    n_groups = len(lw)
    card_words = tuple(lw)
    card_interword = tuple(inter)

    index = np.arange(n_groups)
    bar_width = 0.5
    opacity = 0.6

    words = plt.bar(index, card_words, bar_width,
                    alpha=opacity,
                    color='g',
                    label='Words/day')

    lwords = plt.bar(index - 0.5, card_interword, bar_width,
                     alpha=opacity,
                     color='r',
                     label='Intersection')

    plt.plot(tuple(inter), 'b--')
    plt.xlabel(str(year)+'/'+str(month)+' Days')
    plt.ylabel('Words')
    plt.title('Words Cardinality & Intersection Histogram')
    plt.xticks(index + bar_width/2, tuple(adate))

    plt.legend()
    plt.grid()

    plt.tight_layout()

    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', papertype=None, format="png",
                transparent=False, bbox_inches=None, pad_inches=0.1,
                frameon=True)

    publisher.info(filename+".png"+" saved!")


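# A usage sketch (the output path is an assumption): once the daily
# sorted sets and "InterSet" exist, render the October 2013 histogram:
#
#   word_bar_graph(r_serv, 2013, 10, "/tmp/wordstats_201310")
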
def create_data_words_curve(r_serv, r_serv2, year, month, filename):
    """Create Redis hashes.

    :param r_serv: -- connection to redis database (read)
    :param r_serv2: -- connection to redis database (write)
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param filename: -- the path to the file which contains a list of words.

    The redis hashes are created as follows:

    +------------+------------+-----------+
    | Keys       | Field      | Values    |
    +============+============+===========+
    | word1      | 20131001   | 150       |
    +------------+------------+-----------+
    | ...        | 20131002   | 145       |
    +------------+------------+-----------+
    | word2      | ...        | ...       |
    +------------+------------+-----------+

    The file needs to contain one word per line, with an empty line
    at the end.
    This function creates the data used by the function
    create_curve_with_word_file, which creates a csv file.

    """
    stop = stopwords.words('english')
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    with open(filename, 'rb') as F:
        for line in F:
            for dt in rrule(DAILY, dtstart=a, until=b):
                if r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1]) is not None:
                    # optionally, test whether the field already exists and log a WARNING
                    r_serv2.hmset(line[:-1], {str(dt.strftime("%Y%m%d")): r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1])})


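# An example of the expected word-file format (the path and words are
# invented): one word per line, each line newline-terminated so that
# line[:-1] strips cleanly:
#
#   /opt/ail/wordlists/tracked.txt:
#       password
#       bitcoin
#       gmail.com
#
#   create_data_words_curve(r_serv, r_serv2, 2013, 10,
#                           "/opt/ail/wordlists/tracked.txt")
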
def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.

    :param r_serv: -- connection to redis database
    :param csvfilename: -- the path to the .csv file created
    :param feederfilename: -- the path to the file which contains a list of words.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    This function creates a .csv file using data from redis.
    It checks whether the words contained in feederfilename have a
    value for each day. If a value is missing (word not present during
    a day), a 0 is automatically inserted to keep the timeline of the
    curve correct.

    """
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])
    days = {}
    words = []

    with open(feederfilename, 'rb') as F:
        for word in F:  # words of the file
            words.append(word[:-1])  # list of words (sorted as in the file)

    for dt in rrule(DAILY, dtstart=a, until=b):  # for each day

        mot = []
        mot1 = []
        mot2 = []

        days[dt.strftime("%Y%m%d")] = ''
        for word in sorted(words):  # from the first word to the last of the sorted list
            if r_serv.hexists(word, dt.strftime("%Y%m%d")):  # if the word has a value for the day
                mot1.append(str(word))
                mot2.append(r_serv.hget(word, dt.strftime("%Y%m%d")))

                mot = zip(mot1, mot2)

                days[dt.strftime("%Y%m%d")] = mot
            else:
                mot1.append(str(word))
                mot2.append(0)

                mot = zip(mot1, mot2)

                days[dt.strftime("%Y%m%d")] = mot

    with open(csvfilename+".csv", 'wb') as F:
        F.write("Date," + ",".join(sorted(words)) + '\n')

        # iterate in date order so the timeline stays chronological
        for x, s in sorted(days.items()):
            val = []
            for y in s:
                val.append(y[1])

            F.write(x + ',' + str(val) + '\n')

    # strip the list syntax left by str(val) to produce plain csv values
    with open(csvfilename+".csv", 'rb') as F:
        h = F.read()
        h = h.replace("[", "")
        h = h.replace("]", "")
        h = h.replace('\'', "")

    with open(csvfilename+".csv", 'wb') as F:
        F.write(h)
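
# For reference (all counts invented), the resulting csv, as consumed
# by dygraph, looks like this; note the columns follow sorted(words):
#
#   Date,bitcoin,gmail.com,password
#   20131001,3, 0, 142
#   20131002,5, 7, 120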