Merge pull request #97 from osagit/abstract-module

feat: module factorization
Commit 77a708845c, authored by Thirion Aurélien on 2021-04-02 16:32:06 +02:00 and committed by GitHub.
11 changed files with 1076 additions and 841 deletions
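Every module in the diffs below is rewritten as a subclass of a new AbstractModule base class (bin/module/abstract_module.py is among the 11 changed files, but it is not shown in this excerpt). The sketch below is an assumption about the shape of that base class, inferred only from how the subclasses use it (self.process, self.redis_logger, self.pending_seconds, compute(), computeNone(), run()); the actual implementation may differ.

    # Assumed shape of module/abstract_module.py, inferred from the subclasses in this diff.
    # Names taken from the diff: Process, publisher, pending_seconds, compute, computeNone, run.
    # Everything else (defaults, constructor arguments) is illustrative.
    import time

    from pubsublogger import publisher
    from Helper import Process


    class AbstractModule(object):
        """Generic AIL module: wires the queues, the logger and the polling loop."""

        def __init__(self, module_name=None, queue_name=None):
            # Module name taken from the class by default (e.g. 'Categ', 'Indexer')
            self.module_name = module_name if module_name else self.__class__.__name__
            # Redis-backed logger shared by all modules
            self.redis_logger = publisher
            self.redis_logger.port = 6380
            self.redis_logger.channel = 'Script'
            # I/O queues (config section assumed to match the module name)
            self.process = Process(queue_name if queue_name else self.module_name)
            # Seconds to sleep when the input queue is empty
            self.pending_seconds = 10

        def _module_name(self):
            # Some modules (e.g. TermTrackerMod) call this helper instead of the attribute
            return self.module_name

        def compute(self, message):
            """Process one message; must be overridden by each module."""
            raise NotImplementedError

        def computeNone(self):
            """Optional hook called when the queue is empty (e.g. WebStats graph building)."""
            pass

        def run(self):
            """Endless polling loop replacing the per-module while True of the old code."""
            while True:
                message = self.process.get_from_set()
                if message:
                    self.compute(message)
                else:
                    self.computeNone()
                    self.redis_logger.debug('{} is idling {}s'.format(self.module_name, self.pending_seconds))
                    time.sleep(self.pending_seconds)

With that loop in the base class, each module below shrinks to an __init__ plus a compute() method.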

bin/Categ.py

@@ -36,23 +36,36 @@ Requirements

*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
"""

##################################
# Import External packages
##################################
import os
import argparse
import time
import re

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from pubsublogger import publisher
from packages import Paste
from Helper import Process


class Categ(AbstractModule):
    """
    Categ module for AIL framework
    """

    def __init__(self):
        """
        Init Categ
        """
        super(Categ, self).__init__()

        self.matchingThreshold = self.process.config.getint("Categ", "matchingThreshold")

        # SCRIPT PARSER #
        parser = argparse.ArgumentParser(description='Start Categ module on files.')

@@ -64,8 +77,7 @@ if __name__ == "__main__":

        args = parser.parse_args()

        self.redis_logger.info("Script Categ started")

        categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey']
        tmp_dict = {}

@@ -76,28 +88,44 @@ if __name__ == "__main__":

            patterns = [r'%s' % ( re.escape(s.strip()) ) for s in f]
            tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)

        self.categ_items = tmp_dict.items()

        prec_filename = None

    def compute(self, message):
        # Cast message as paste
        paste = Paste.Paste(message)
        # Get paste content
        content = paste.get_p_content()

        # init categories found
        is_categ_found = False
        # Search for pattern categories in paste content
        for categ, pattern in self.categ_items:
            found = set(re.findall(pattern, content))
            lenfound = len(found)
            if lenfound >= self.matchingThreshold:
                is_categ_found = True
                msg = '{} {}'.format(paste.p_rel_path, lenfound)
                self.redis_logger.debug('%s;%s %s'%(self.module_name, msg, categ))

                # Export message to categ queue
                self.process.populate_set_out(msg, categ)

                self.redis_logger.info(
                    'Categ;{};{};{};Detected {} as {};{}'.format(
                        paste.p_source, paste.p_date, paste.p_name,
                        lenfound, categ, paste.p_rel_path))

        if not is_categ_found:
            self.redis_logger.debug('No %s found in this paste: %s'%(self.module_name, paste.p_name))


if __name__ == '__main__':

    module = Categ()
    module.run()

bin/Indexer.py

@@ -9,131 +9,148 @@ The ZMQ_Sub_Indexer modules is fetching the list of files to be processed

and index each file with a full-text indexer (Whoosh until now).
"""

##################################
# Import External packages
##################################
import time
import shutil
import os
from os.path import join, getsize
from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from Helper import Process


class Indexer(AbstractModule):
    """
    Indexer module for AIL framework
    """

    # Time to wait in seconds between two index's size variable compute
    TIME_WAIT = 60*15  # sec

    def __init__(self):
        """
        Init Instance
        """
        super(Indexer, self).__init__()

        # Indexer configuration - index dir and schema setup
        self.baseindexpath = join(os.environ['AIL_HOME'],
                                  self.process.config.get("Indexer", "path"))
        self.indexRegister_path = join(os.environ['AIL_HOME'],
                                       self.process.config.get("Indexer", "register"))
        self.indexertype = self.process.config.get("Indexer", "type")
        self.INDEX_SIZE_THRESHOLD = self.process.config.getint(
            "Indexer", "index_max_size")

        self.indexname = None
        self.schema = None
        self.ix = None

        if self.indexertype == "whoosh":
            self.schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
                                                                  unique=True),
                                 content=TEXT)

            if not os.path.exists(self.baseindexpath):
                os.mkdir(self.baseindexpath)

            # create the index register if not present
            time_now = int(time.time())
            if not os.path.isfile(self.indexRegister_path):  # index are not organised
                self.redis_logger.debug("Indexes are not organized")
                self.redis_logger.debug(
                    "moving all files in folder 'old_index' ")
                # move all files to old_index folder
                self.move_index_into_old_index_folder()
                self.redis_logger.debug("Creating new index")
                # create all_index.txt
                with open(self.indexRegister_path, 'w') as f:
                    f.write(str(time_now))
                # create dir
                os.mkdir(join(self.baseindexpath, str(time_now)))

            with open(self.indexRegister_path, "r") as f:
                allIndex = f.read()
                allIndex = allIndex.split()  # format [time1\ntime2]
                allIndex.sort()

                try:
                    self.indexname = allIndex[-1].strip('\n\r')
                except IndexError as e:
                    self.indexname = time_now

                self.indexpath = join(self.baseindexpath, str(self.indexname))
                if not exists_in(self.indexpath):
                    self.ix = create_in(self.indexpath, self.schema)
                else:
                    self.ix = open_dir(self.indexpath)

                self.last_refresh = time_now

    def compute(self, message):
        try:
            PST = Paste.Paste(message)

            docpath = message.split(" ", -1)[-1]
            paste = PST.get_p_content()

            self.redis_logger.debug("Indexing - " + self.indexname + " :", docpath)

            # Avoid calculating the index's size at each message
            if(time.time() - self.last_refresh > self.TIME_WAIT):
                self.last_refresh = time.time()
                if self.check_index_size() >= self.INDEX_SIZE_THRESHOLD*(1000*1000):
                    timestamp = int(time.time())
                    self.redis_logger.debug("Creating new index", timestamp)
                    self.indexpath = join(self.baseindexpath, str(timestamp))
                    self.indexname = str(timestamp)
                    # update all_index
                    with open(self.indexRegister_path, "a") as f:
                        f.write('\n'+str(timestamp))
                    # create new dir
                    os.mkdir(self.indexpath)
                    self.ix = create_in(self.indexpath, self.schema)

            if self.indexertype == "whoosh":
                indexwriter = self.ix.writer()
                indexwriter.update_document(
                    title=docpath,
                    path=docpath,
                    content=paste)
                indexwriter.commit()

        except IOError:
            self.redis_logger.debug("CRC Checksum Failed on :", PST.p_path)
            self.redis_logger.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))

    def check_index_size(self):
        """
        return in bytes
        """
        the_index_name = join(self.baseindexpath, self.indexname)
        cur_sum = 0
        for root, dirs, files in os.walk(the_index_name):
            cur_sum += sum(getsize(join(root, name)) for name in files)

        return cur_sum

    def move_index_into_old_index_folder(self):
        for cur_file in os.listdir(self.baseindexpath):
            if not cur_file == "old_index":
                shutil.move(join(self.baseindexpath, cur_file), join(
                    join(self.baseindexpath, "old_index"), cur_file))


if __name__ == '__main__':

    module = Indexer()
    module.run()
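For context on what the title/path/content schema above enables (not part of this commit): a minimal sketch of querying such a Whoosh index, assuming an index directory produced by the Indexer module. The index path here is an assumption; it would come from AIL_HOME and the "Indexer" config section in practice.

    # Minimal sketch: full-text search against an index built by Indexer.
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    # One of the timestamped index folders created by Indexer (path is illustrative)
    ix = open_dir('indexdir/1617370000')

    with ix.searcher() as searcher:
        query = QueryParser('content', ix.schema).parse('bitcoin')
        for hit in searcher.search(query, limit=10):
            # 'path' was stored by Indexer.compute(), so it can be read back here
            print(hit['path'])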

bin/Keys.py

@@ -12,160 +12,170 @@ RSA private key, certificate messages

"""

##################################
# Import External packages
##################################
import time
from enum import Enum
from pubsublogger import publisher
#from bin.packages import Paste
#from bin.Helper import Process

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from Helper import Process


class KeyEnum(Enum):
    PGP_MESSAGE = '-----BEGIN PGP MESSAGE-----'
    PGP_PUBLIC_KEY_BLOCK = '-----BEGIN PGP PUBLIC KEY BLOCK-----'
    PGP_PRIVATE_KEY_BLOCK = '-----BEGIN PGP PRIVATE KEY BLOCK-----'
    PGP_SIGNATURE = '-----BEGIN PGP SIGNATURE-----'
    CERTIFICATE = '-----BEGIN CERTIFICATE-----'
    PUBLIC_KEY = '-----BEGIN PUBLIC KEY-----'
    PRIVATE_KEY = '-----BEGIN PRIVATE KEY-----'
    ENCRYPTED_PRIVATE_KEY = '-----BEGIN ENCRYPTED PRIVATE KEY-----'
    OPENSSH_PRIVATE_KEY = '-----BEGIN OPENSSH PRIVATE KEY-----'
    SSH2_ENCRYPTED_PRIVATE_KEY = '---- BEGIN SSH2 ENCRYPTED PRIVATE KEY ----'
    OPENVPN_STATIC_KEY_V1 = '-----BEGIN OpenVPN Static key V1-----'
    RSA_PRIVATE_KEY = '-----BEGIN RSA PRIVATE KEY-----'
    DSA_PRIVATE_KEY = '-----BEGIN DSA PRIVATE KEY-----'
    EC_PRIVATE_KEY = '-----BEGIN EC PRIVATE KEY-----'


class Keys(AbstractModule):
    """
    Keys module for AIL framework
    """

    def __init__(self):
        super(Keys, self).__init__()

        # Waiting time in secondes between to message proccessed
        self.pending_seconds = 1

    def compute(self, message):
        paste = Paste.Paste(message)
        content = paste.get_p_content()

        find = False
        get_pgp_content = False

        if KeyEnum.PGP_MESSAGE.value in content:
            self.redis_logger.warning('{} has a PGP enc message'.format(paste.p_name))
            msg = 'infoleak:automatic-detection="pgp-message";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            get_pgp_content = True
            find = True

        if KeyEnum.PGP_PUBLIC_KEY_BLOCK.value in content:
            msg = 'infoleak:automatic-detection="pgp-public-key-block";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            get_pgp_content = True

        if KeyEnum.PGP_SIGNATURE.value in content:
            msg = 'infoleak:automatic-detection="pgp-signature";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            get_pgp_content = True

        if KeyEnum.CERTIFICATE.value in content:
            self.redis_logger.warning('{} has a certificate message'.format(paste.p_name))
            msg = 'infoleak:automatic-detection="certificate";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.RSA_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has a RSA private key message'.format(paste.p_name))
            print('rsa private key message found')
            msg = 'infoleak:automatic-detection="rsa-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has a private key message'.format(paste.p_name))
            print('private key message found')
            msg = 'infoleak:automatic-detection="private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.ENCRYPTED_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an encrypted private key message'.format(paste.p_name))
            print('encrypted private key message found')
            msg = 'infoleak:automatic-detection="encrypted-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.OPENSSH_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an openssh private key message'.format(paste.p_name))
            print('openssh private key message found')
            msg = 'infoleak:automatic-detection="private-ssh-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.SSH2_ENCRYPTED_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an ssh2 private key message'.format(paste.p_name))
            print('SSH2 private key message found')
            msg = 'infoleak:automatic-detection="private-ssh-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.OPENVPN_STATIC_KEY_V1.value in content:
            self.redis_logger.warning('{} has an openssh private key message'.format(paste.p_name))
            print('OpenVPN Static key message found')
            msg = 'infoleak:automatic-detection="vpn-static-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.DSA_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has a dsa private key message'.format(paste.p_name))
            msg = 'infoleak:automatic-detection="dsa-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.EC_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an ec private key message'.format(paste.p_name))
            msg = 'infoleak:automatic-detection="ec-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.PGP_PRIVATE_KEY_BLOCK.value in content:
            self.redis_logger.warning('{} has a pgp private key block message'.format(paste.p_name))
            msg = 'infoleak:automatic-detection="pgp-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.PUBLIC_KEY.value in content:
            self.redis_logger.warning('{} has a public key message'.format(paste.p_name))
            msg = 'infoleak:automatic-detection="public-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        # pgp content
        if get_pgp_content:
            self.process.populate_set_out(message, 'PgpDump')

        if find :
            # Send to duplicate
            self.process.populate_set_out(message, 'Duplicate')
            self.redis_logger.debug(message)


if __name__ == '__main__':

    module = Keys()
    module.run()
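The KeyEnum introduced above is only used for membership tests, so the long chain of near-identical if blocks could later be collapsed into a data-driven loop. The sketch below is not part of this commit; it only illustrates how the enum might be exploited, with a trimmed-down KeyEnum and tag names reused from the diff.

    # Hypothetical follow-up refactor (not in this commit): map each marker string
    # to its infoleak tag and loop, instead of one if-block per KeyEnum member.
    from enum import Enum

    class KeyEnum(Enum):
        RSA_PRIVATE_KEY = '-----BEGIN RSA PRIVATE KEY-----'
        PRIVATE_KEY = '-----BEGIN PRIVATE KEY-----'
        CERTIFICATE = '-----BEGIN CERTIFICATE-----'

    KEY_TAGS = {
        KeyEnum.RSA_PRIVATE_KEY: 'rsa-private-key',
        KeyEnum.PRIVATE_KEY: 'private-key',
        KeyEnum.CERTIFICATE: 'certificate',
    }

    def detect_key_tags(content):
        """Return the infoleak tags whose marker appears in the paste content."""
        return [tag for key, tag in KEY_TAGS.items() if key.value in content]

    print(detect_key_tags('junk -----BEGIN RSA PRIVATE KEY----- junk'))
    # ['rsa-private-key']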

bin/ModuleStats.py

@@ -5,19 +5,60 @@

"""

##################################
# Import External packages
##################################
import time
import datetime
import redis
import os

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process
from packages import Paste
import ConfigLoader


class ModuleStats(AbstractModule):
    """
    Module Statistics module for AIL framework
    """

    # Config Var
    MAX_SET_CARDINALITY = 8

    def __init__(self):

        super(ModuleStats, self).__init__()

        # Waiting time in secondes between to message proccessed
        self.pending_seconds = 20

        # Sent to the logging a description of the module
        self.redis_logger.info("Makes statistics about valid URL")

        # REDIS #
        self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending")
        self.r_serv_pasteName = ConfigLoader.ConfigLoader().get_redis_conn("Redis_Paste_Name")

    def compute(self, message):

        if len(message.split(';')) > 1:
            self.compute_most_posted(message)
        else:
            self.compute_provider_info(message)

    def get_date_range(self, num_day):
        curr_date = datetime.date.today()
        date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
        date_list = []

@@ -27,40 +68,40 @@ def get_date_range(num_day):

        return date_list

    def compute_most_posted(self, message):
        module, num, keyword, paste_date = message.split(';')

        redis_progression_name_set = 'top_'+ module +'_set_' + paste_date

        # Add/Update in Redis
        self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num))

        # Compute Most Posted
        date = self.get_date_range(0)[0]
        # check if this keyword is eligible for progression
        keyword_total_sum = 0

        curr_value = self.r_serv_trend.hget(date, module+'-'+keyword)
        keyword_total_sum += int(curr_value) if curr_value is not None else 0

        if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY:
            self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)

        else: # not in set
            member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
            # Member set is a list of (value, score) pairs
            if int(member_set[0][1]) < keyword_total_sum:
                #remove min from set and add the new one
                self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
                self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0])
                self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
                self.redis_logger.debug(redis_progression_name_set)

    def compute_provider_info(self, message):
        redis_all_provider = 'all_provider_set'

        paste = Paste.Paste(message)

        paste_baseName = paste.p_name.split('.')[0]
        paste_size = paste._get_p_size()

@@ -71,12 +112,13 @@ def compute_provider_info(server_trend, path):

        redis_providers_name_set = 'providers_set_' + paste_date

        # Add/Update in Redis
        self.r_serv_pasteName.sadd(paste_baseName, message)
        self.r_serv_trend.sadd(redis_all_provider, paste_provider)

        num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1))
        sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size))
        new_avg = float(sum_size) / float(num_paste)
        self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg)

        #

@@ -84,72 +126,36 @@ def compute_provider_info(server_trend, path):

        #

        # Size
        if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil":
            self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider)
            self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
        else: #set full capacity
            member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1)
            # Member set is a list of (value, score) pairs
            if float(member_set[0][1]) < new_avg:
                #remove min from set and add the new one
                self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
                self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0])
                self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider)
                self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0])
                self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)

        # Num
        # if set not full or provider already present
        if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil":
            self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
        else: #set at full capacity
            member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
            # Member set is a list of (value, score) pairs
            if int(member_set[0][1]) < num_paste:
                #remove min from set and add the new one
                self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
                self.r_serv_trend.zrem(member_set[0][0])
                self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)


if __name__ == '__main__':

    module = ModuleStats()
    module.run()
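One caveat worth noting about the Redis calls kept above (an observation, not something changed by this commit): zadd(name, score, member) is the redis-py 2.x calling convention; redis-py 3.x and later expect a {member: score} mapping. Below is a sketch of the same bounded top-N update used by compute_most_posted, written against the newer signature; the client setup is an assumption.

    # Sketch of the bounded top-N pattern with the redis-py >= 3 zadd signature.
    # Connection parameters are illustrative; AIL reads them from its config.
    import redis

    r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

    MAX_SET_CARDINALITY = 8

    def update_top_set(zset_name, keyword, score):
        """Keep at most MAX_SET_CARDINALITY members, evicting the lowest score."""
        if r.zcard(zset_name) < MAX_SET_CARDINALITY:
            r.zadd(zset_name, {keyword: float(score)})
            return
        # Lowest-scored member, as a [(member, score)] list
        lowest = r.zrangebyscore(zset_name, '-inf', '+inf', withscores=True, start=0, num=1)
        if lowest and lowest[0][1] < score:
            r.zrem(zset_name, lowest[0][0])
            r.zadd(zset_name, {keyword: float(score)})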

bin/Phone.py

@@ -11,33 +11,57 @@ It apply phone number regexes on paste content and warn if above a threshold.

"""

##################################
# Import External packages
##################################
import time
import re
import phonenumbers

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from pubsublogger import publisher
from Helper import Process


class Phone(AbstractModule):
    """
    Phone module for AIL framework
    """

    # regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required)
    # reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
    REG_PHONE = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})')

    def __init__(self):
        super(Phone, self).__init__()

        # Waiting time in secondes between to message proccessed
        self.pending_seconds = 1

    def compute(self, message):
        paste = Paste.Paste(message)
        content = paste.get_p_content()

        # List of the regex results in the Paste, may be null
        results = self.REG_PHONE.findall(content)

        # If the list is greater than 4, we consider the Paste may contain a list of phone numbers
        if len(results) > 4:
            self.redis_logger.debug(results)
            self.redis_logger.warning('{} contains PID (phone numbers)'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="phone-number";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')

            # Send to duplicate
            self.process.populate_set_out(message, 'Duplicate')

            stats = {}
            for phone_number in results:
                try:

@@ -51,32 +75,10 @@ def search_phone(message):

                    pass

            for country_code in stats:
                if stats[country_code] > 4:
                    self.redis_logger.warning('{} contains Phone numbers with country code {}'.format(paste.p_name, country_code))


if __name__ == '__main__':

    module = Phone()
    module.run()
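The lines between the two hunks above (filling the per-country stats dict from the regex matches) are not shown in this diff. Purely as an illustration of how the phonenumbers package is typically used for that kind of counting, and not the module's actual code, a small sketch:

    # Illustrative only: count candidate numbers per country calling code.
    import phonenumbers

    def count_by_country(candidates):
        """Return {country_calling_code: occurrences} for parseable candidates."""
        stats = {}
        for candidate in candidates:
            try:
                # None as region: only numbers with an explicit +CC prefix parse
                parsed = phonenumbers.parse(candidate, None)
            except phonenumbers.NumberParseException:
                continue
            stats[parsed.country_code] = stats.get(parsed.country_code, 0) + 1
        return stats

    print(count_by_country(['+33 6 12 34 56 78', '+1-202-555-0143', 'not a number']))
    # {33: 1, 1: 1}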

bin/TermTrackerMod.py

@@ -5,30 +5,26 @@ The TermTracker Module

===================
"""

##################################
# Import External packages
##################################
import os
import sys
import time
import signal

##################################
# Import Project packages
##################################
from Helper import Process
from pubsublogger import publisher
from module.abstract_module import AbstractModule
import NotificationHelper
from packages import Item
from packages import Term
from lib import Tracker


class TimeoutException(Exception):
    pass

@@ -36,67 +32,60 @@ def timeout_handler(signum, frame):

    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)


class TermTrackerMod(AbstractModule):

    mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\nitem id: {}\nurl: {}{}"

    """
    TermTrackerMod module for AIL framework
    """

    def __init__(self):
        super(TermTrackerMod, self).__init__()

        self.pending_seconds = 5

        self.max_execution_time = self.process.config.getint('TermTrackerMod', "max_execution_time")

        self.full_item_url = self.process.config.get("Notifications", "ail_domain") + "/object/item?id="

        # loads tracked words
        self.list_tracked_words = Term.get_tracked_words_list()
        self.last_refresh_word = time.time()
        self.set_tracked_words_list = Term.get_set_tracked_words_list()
        self.last_refresh_set = time.time()

        # Send module state to logs
        self.redis_logger.info("Module %s initialized"%(self._module_name()))

    def compute(self, item_id):
        # Cast message as Item
        item_date = Item.get_item_date(item_id)
        item_content = Item.get_item_content(item_id)

        signal.alarm(self.max_execution_time)

        dict_words_freq = None
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            self.redis_logger.warning("{0} processing timeout".format(item_id))
        else:
            signal.alarm(0)

        if dict_words_freq:
            # create token statistics
            #for word in dict_words_freq:
            #    Term.create_token_statistics(item_date, word, dict_words_freq[word])

            # check solo words
            for word in self.list_tracked_words:
                if word in dict_words_freq:
                    self.new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in self.set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]

@@ -106,19 +95,42 @@ if __name__ == "__main__":

                    if word in dict_words_freq:
                        nb_uniq_word += 1
                if nb_uniq_word >= nb_words_threshold:
                    self.new_term_found(word_set, 'set', item_id, item_date)

        # refresh Tracked term
        if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
            self.list_tracked_words = Term.get_tracked_words_list()
            self.last_refresh_word = time.time()
            self.redis_logger.debug('Tracked word refreshed')

        if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
            self.set_tracked_words_list = Term.get_set_tracked_words_list()
            self.last_refresh_set = time.time()
            self.redis_logger.debug('Tracked set refreshed')

    def new_term_found(self, term, term_type, item_id, item_date):
        uuid_list = Term.get_term_uuid_list(term, term_type)

        self.redis_logger.info('new tracked term found: {} in {}'.format(term, item_id))

        for term_uuid in uuid_list:
            Term.add_tracked_item(term_uuid, item_id, item_date)

            tags_to_add = Term.get_term_tags(term_uuid)
            for tag in tags_to_add:
                msg = '{};{}'.format(tag, item_id)
                self.process.populate_set_out(msg, 'Tags')

            mail_to_notify = Term.get_term_mails(term_uuid)
            if mail_to_notify:
                mail_subject = Tracker.get_email_subject(term_uuid)
                mail_body = TermTrackerMod.mail_body_template.format(term, item_id, self.full_item_url, item_id)
                for mail in mail_to_notify:
                    self.redis_logger.debug('Send Mail {}'.format(mail_subject))
                    NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body)


if __name__ == '__main__':

    module = TermTrackerMod()
    module.run()

bin/Web.py

@@ -9,61 +9,71 @@ This module tries to parse URLs and warns if some defined contry code are presen

"""

##################################
# Import External packages
##################################
import redis
import pprint
import time
import os
import dns.exception
from pyfaup.faup import Faup
import re
# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
from Helper import Process


class Web(AbstractModule):
    """
    Web module for AIL framework
    """

    # Used to prevent concat with empty fields due to url parsing
    def avoidNone(self, a_string):
        if a_string is None:
            return ""
        else:
            return a_string

    def __init__(self):
        """
        Init Web
        """
        super(Web, self).__init__()

        # REDIS Cache
        self.r_serv2 = redis.StrictRedis(
            host=self.process.config.get("Redis_Cache", "host"),
            port=self.process.config.getint("Redis_Cache", "port"),
            db=self.process.config.getint("Redis_Cache", "db"),
            decode_responses=True)

        # Country to log as critical
        self.cc_critical = self.process.config.get("Url", "cc_critical")

        # FUNCTIONS #
        self.redis_logger.info("Script URL subscribed to channel web_categ")

        # FIXME For retro compatibility
        self.channel = 'web_categ'

        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                          self.process.config.get("Directories", "protocolsfile"))

        # Get all uri from protocolsfile (Used for Curve)
        uri_scheme = ""
        with open(protocolsfile_path, 'r') as scheme_file:

@@ -71,90 +81,114 @@ if __name__ == "__main__":

                uri_scheme += scheme[:-1]+"|"
        uri_scheme = uri_scheme[:-1]

        self.url_regex = "((?i:"+uri_scheme + \
            ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

        self.prec_filename = None

        # Send module state to logs
        self.redis_logger.info("Module %s initialized" % (self.module_name))

    def compute(self, message):
        """
        Search for Web links from given message
        """
        # Extract item
        filename, score = message.split()

        if self.prec_filename is None or filename != self.prec_filename:
            domains_list = set()
            PST = Paste.Paste(filename)
            client = ip2asn()

            detected_urls = PST.get_regex(self.url_regex)
            if len(detected_urls) > 0:
                to_print = 'Web;{};{};{};'.format(
                    PST.p_source, PST.p_date, PST.p_name)
                self.redis_logger.info('{}Detected {} URL;{}'.format(
                    to_print, len(detected_urls), PST.p_rel_path))

            for url in detected_urls:
                self.redis_logger.debug("match regex: %s" % (url))

                # self.redis_logger.debug("match regex search: %s"%(url))

                to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                self.process.populate_set_out(to_send, 'Url')
                self.redis_logger.debug("url_parsed: %s" % (to_send))

                self.faup.decode(url)
                domain = self.faup.get_domain()
                subdomain = self.faup.get_subdomain()

                self.redis_logger.debug('{} Published'.format(url))

                if subdomain is not None:
                    # TODO: # FIXME: remove me
                    try:
                        subdomain = subdomain.decode()
                    except:
                        pass

                if domain is not None:
                    # TODO: # FIXME: remove me
                    try:
                        domain = domain.decode()
                    except:
                        pass
                    domains_list.add(domain)

                hostl = self.avoidNone(subdomain) + self.avoidNone(domain)

                try:
                    socket.setdefaulttimeout(1)
                    ip = socket.gethostbyname(hostl)
                    # If the resolver is not giving any IPv4 address,
                    # ASN/CC lookup is skip.
                    l = client.lookup(ip, qType='IP')
                except ipaddress.AddressValueError:
                    self.redis_logger.error(
                        'ASN/CC lookup failed for IP {}'.format(ip))
                    continue
                except:
                    self.redis_logger.error(
                        'Resolver IPv4 address failed for host {}'.format(hostl))
                    continue

                cc = getattr(l, 'cc')
                asn = ''
                if getattr(l, 'asn') is not None:
                    asn = getattr(l, 'asn')[2:]  # remobe b'

                # EU is not an official ISO 3166 code (but used by RIPE
                # IP allocation)
                if cc is not None and cc != "EU":
                    self.redis_logger.debug('{};{};{};{}'.format(hostl, asn, cc,
                                                                 pycountry.countries.get(alpha_2=cc).name))
                    if cc == self.cc_critical:
                        to_print = 'Url;{};{};{};Detected {} {}'.format(
                            PST.p_source, PST.p_date, PST.p_name,
                            hostl, cc)
                        self.redis_logger.info(to_print)
                else:
                    self.redis_logger.debug('{};{};{}'.format(hostl, asn, cc))

            A_values = lib_refine.checking_A_record(self.r_serv2,
                                                    domains_list)

            if A_values[0] >= 1:
                PST.__setattr__(self.channel, A_values)
                PST.save_attribute_redis(self.channel, (A_values[0],
                                                        list(A_values[1])))

                pprint.pprint(A_values)
                # self.redis_logger.info('Url;{};{};{};Checked {} URL;{}'.format(
                #     PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))

            self.prec_filename = filename


if __name__ == '__main__':

    module = Web()
    module.run()
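One detail of the regex change above that is easy to miss: the old url_regex was built from plain capturing groups, so re.findall returned tuples of sub-groups rather than whole URLs, while the new pattern makes the inner groups non-capturing ((?:...)) and wraps everything in a single outer group, which is what lets the rewritten compute() iterate directly over detected URLs. A toy illustration of that difference, with simplified patterns rather than the module's actual regex:

    # Toy patterns only; the real url_regex in Web.py is far longer.
    import re

    text = 'see http://example.com/a and https://test.org/b'

    old_style = r'(http|https)\://([a-z\.]+)(/[a-z]+)'          # capturing groups
    new_style = r'((?:http|https)\://(?:[a-z\.]+)(?:/[a-z]+))'  # one outer capture

    print(re.findall(old_style, text))
    # [('http', 'example.com', '/a'), ('https', 'test.org', '/b')]  -> tuples of pieces

    print(re.findall(new_style, text))
    # ['http://example.com/a', 'https://test.org/b']                -> whole URLs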

bin/WebStats.py

@ -10,36 +10,136 @@ It consider the TLD, Domain and protocol.
""" """
##################################
# Import External packages
##################################
import time import time
import datetime import datetime
import redis import redis
import os import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher from pubsublogger import publisher
from Helper import Process
from pyfaup.faup import Faup from pyfaup.faup import Faup
# Config Var
threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression
threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look
max_set_cardinality = 10 # The cardinality of the progression set
num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past
def analyse(server, field_name, date, url_parsed): ##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import lib_words
from packages.Date import Date
from Helper import Process
class WebStats(AbstractModule):
"""
WebStats module for AIL framework
"""
# Config Var
THRESHOLD_TOTAL_SUM = 200 # Above this value, a keyword is eligible for a progression
THRESHOLD_INCREASE = 1.0 # The percentage representing the keyword occurence since num_day_to_look
MAX_SET_CARDINALITY = 10 # The cardinality of the progression set
NUM_DAY_TO_LOOK = 5 # the detection of the progression start num_day_to_look in the past
def __init__(self):
super(WebStats, self).__init__()
# Send module state to logs
self.redis_logger.info("Module %s initialized"%(self.module_name))
# Sent to the logging a description of the module
self.redis_logger.info("Makes statistics about valid URL")
self.pending_seconds = 5*60
# REDIS #
self.r_serv_trend = redis.StrictRedis(
host=self.process.config.get("ARDB_Trending", "host"),
port=self.process.config.get("ARDB_Trending", "port"),
db=self.process.config.get("ARDB_Trending", "db"),
decode_responses=True)
# FILE CURVE SECTION #
self.csv_path_proto = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "protocolstrending_csv"))
self.protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "protocolsfile"))
self.csv_path_tld = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "tldstrending_csv"))
self.tldsfile_path = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "tldsfile"))
self.csv_path_domain = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "domainstrending_csv"))
self.faup = Faup()
self.generate_new_graph = False
def computeNone(self):
if self.generate_new_graph:
self.generate_new_graph = False
today = datetime.date.today()
year = today.year
month = today.month
self.redis_logger.debug('Building protocol graph')
lib_words.create_curve_with_word_file(self.r_serv_trend, csv_path_proto,
protocolsfile_path, year,
month)
self.redis_logger.debug('Building tld graph')
lib_words.create_curve_with_word_file(self.r_serv_trend, csv_path_tld,
tldsfile_path, year,
month)
self.redis_logger.debug('Building domain graph')
lib_words.create_curve_from_redis_set(self.r_serv_trend, csv_path_domain,
"domain", year,
month)
self.redis_logger.debug('end building')
def compute(self, message):
self.generate_new_graph = True
# Do something with the message from the queue
url, date, path = message.split()
self.faup.decode(url)
url_parsed = self.faup.get()
# Scheme analysis
self.analyse('scheme', date, url_parsed)
# Tld analysis
self.analyse('tld', date, url_parsed)
# Domain analysis
self.analyse('domain', date, url_parsed)
self.compute_progression('scheme', self.NUM_DAY_TO_LOOK, url_parsed)
self.compute_progression('tld', self.NUM_DAY_TO_LOOK, url_parsed)
self.compute_progression('domain', self.NUM_DAY_TO_LOOK, url_parsed)
def analyse(self, field_name, date, url_parsed):
    field = url_parsed[field_name]

    if field is not None:
        try: # faup version
            field = field.decode()
        except:
            pass

        self.r_serv_trend.hincrby(field, date, 1)

        if field_name == "domain": # save domain in a set for the monthly plot
            domain_set_name = "domain_set_" + date[0:6]
            self.r_serv_trend.sadd(domain_set_name, field)
            self.redis_logger.debug("added in " + domain_set_name + ": " + field)

def get_date_range(self, num_day):
    curr_date = datetime.date.today()
    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
    date_list = []

@@ -48,14 +148,17 @@ def get_date_range(num_day):
        date_list.append(date.substract_day(i))
    return date_list

def compute_progression_word(self, num_day, keyword):
    """
    Compute the progression for one keyword
    """
    date_range = self.get_date_range(num_day)
    # check if this keyword is eligible for progression
    keyword_total_sum = 0
    value_list = []
    for date in date_range: # get value up to date_range
        curr_value = self.r_serv_trend.hget(keyword, date)
        value_list.append(int(curr_value if curr_value is not None else 0))
        keyword_total_sum += int(curr_value) if curr_value is not None else 0
    oldest_value = value_list[-1] if value_list[-1] != 0 else 1 # avoid zero division

@@ -71,121 +174,34 @@ def compute_progression_word(server, num_day, keyword):
    return (keyword_increase, keyword_total_sum)

def compute_progression(self, field_name, num_day, url_parsed):
    """
    Recompute the top_progression zset:
    - compute the progression of the current field
    - re-compute the progression of each of the first 2*self.MAX_SET_CARDINALITY fields already in the top_progression zset
    """
    redis_progression_name_set = "z_top_progression_" + field_name

    keyword = url_parsed[field_name]
    if keyword is not None:

        # compute the progression of the current word
        keyword_increase, keyword_total_sum = self.compute_progression_word(num_day, keyword)

        # re-compute the progression of the first 2*self.MAX_SET_CARDINALITY entries
        current_top = self.r_serv_trend.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*self.MAX_SET_CARDINALITY)
        for word, value in current_top:
            word_inc, word_tot_sum = self.compute_progression_word(num_day, word)
            self.r_serv_trend.zrem(redis_progression_name_set, word)
            if (word_tot_sum > self.THRESHOLD_TOTAL_SUM) and (word_inc > self.THRESHOLD_INCREASE):
                self.r_serv_trend.zadd(redis_progression_name_set, float(word_inc), word)

        # filter before adding
        if (keyword_total_sum > self.THRESHOLD_TOTAL_SUM) and (keyword_increase > self.THRESHOLD_INCREASE):
            self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_increase), keyword)
if __name__ == '__main__':

    module = WebStats()
    module.run()
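The z_top_progression_<field> sorted sets maintained by compute_progression() are only written here; reading them back (for display or reporting) happens in other components. As a purely illustrative sketch, assuming a redis-py client pointed at the same ARDB_Trending instance, they could be queried like this:

import redis

# Assumed connection parameters; in AIL they come from the
# ARDB_Trending section of the configuration, as in WebStats.__init__.
r = redis.StrictRedis(host='localhost', port=6382, db=3, decode_responses=True)

# Ten most progressing domains, highest progression score first.
top_domains = r.zrevrangebyscore('z_top_progression_domain',
                                 '+inf', '-inf',
                                 withscores=True, start=0, num=10)

for domain, progression in top_domains:
    print(domain, progression)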

bin/module/__init__.py — new file (empty)

bin/module/abstract_module.py — new file
@@ -0,0 +1,98 @@
# coding: utf-8
"""
Base Class for AIL Modules
"""
##################################
# Import External packages
##################################
from abc import ABC, abstractmethod
import time
##################################
# Import Project packages
##################################
from pubsublogger import publisher
from Helper import Process
class AbstractModule(ABC):
"""
Abstract Module class
"""
def __init__(self, module_name=None, queue_name=None):
"""
Init Module
module_name: str; set the module name if different from the instance ClassName
queue_name: str; set the queue name if different from the instance ClassName
"""
# Module name if provided else instance className
self.module_name = module_name if module_name else self._module_name()
# Queue name if provided else instance className
self.queue_name = queue_name if queue_name else self._module_name()
# Init Redis Logger
self.redis_logger = publisher
# Port of the redis instance used by pubsublogger
self.redis_logger.port = 6380
# Channel name to publish logs
self.redis_logger.channel = 'Script'
# TODO modify generic channel Script to a namespaced channel like:
# publish module logs to script:<ModuleName> channel
# self.redis_logger.channel = 'script:%s'%(self.module_name)
# Run module endlessly
self.proceed = True
# Waiting time in seconds between two processed messages
self.pending_seconds = 10
# Setup the I/O queues
self.process = Process(self.queue_name)
def run(self):
"""
Run Module endless process
"""
# Endless loop processing messages from the input queue
while self.proceed:
# Get one message (paste) from the QueueIn (copy of Redis_Global publish)
message = self.process.get_from_set()
if message is None:
self.computeNone()
# Wait before next process
self.redis_logger.debug('%s, waiting for new message, Idling %ds'%(self.module_name, self.pending_seconds))
time.sleep(self.pending_seconds)
continue
try:
# Module processing with the message from the queue
self.compute(message)
except Exception as err:
self.redis_logger.error("Error in module %s: %s"%(self.module_name, err))
def _module_name(self):
"""
Returns the instance class name (ie. the Module Name)
"""
return self.__class__.__name__
@abstractmethod
def compute(self, message):
"""
Main method of the Module to implement
"""
pass
def computeNone(self):
"""
Method of the Module when there is no message
"""
pass
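As a complement to the template module shown further below, here is a hypothetical concrete module built on this base class; the class name, the matching section in bin/packages/modules.cfg and the message handling are assumptions for illustration, not part of this commit. A subclass only has to implement compute(), and may override computeNone() and pending_seconds.

from module.abstract_module import AbstractModule


class MyModule(AbstractModule):
    """
    Hypothetical example module; a 'MyModule' section would also have to
    exist in bin/packages/modules.cfg to define its input/output queues.
    """

    def __init__(self):
        super(MyModule, self).__init__()
        # Wait 30 seconds between two polls when the queue is empty
        self.pending_seconds = 30

    def compute(self, message):
        # Process one message from the input queue and forward it as-is
        self.redis_logger.debug('%s: processing %s' % (self.module_name, message))
        self.process.populate_set_out(message)


if __name__ == '__main__':
    module = MyModule()
    module.run()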

bin/template.py
@@ -1,45 +1,57 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

"""
The Template Module
======================

This module is a template for new modules.

"""

##################################
# Import External packages
##################################
import time
from pubsublogger import publisher

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from Helper import Process


class Template(AbstractModule):
    """
    Template module for AIL framework
    """

    def __init__(self):
        super(Template, self).__init__()

        # Send module state to logs
        self.redis_logger.info("Module %s initialized" % (self.module_name))

        # Pending time between two computations, in seconds
        self.pending_seconds = 10

    def computeNone(self):
        """
        Compute when no message is in the queue
        """
        self.redis_logger.debug("No message in queue")

    def compute(self, message):
        """
        Compute a message from the queue
        """
        self.redis_logger.debug("Compute message in queue")


if __name__ == '__main__':

    module = Template()
    module.run()