Merge pull request #97 from osagit/abstract-module

feat: module factorization
Thirion Aurélien 2021-04-02 16:32:06 +02:00 committed by GitHub
commit 77a708845c
11 changed files with 1076 additions and 841 deletions
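
The abstract base class factored out by this PR (module/abstract_module.py) is among the 11 changed files but is not shown in this excerpt. The sketch below is a rough reconstruction of the contract the refactored modules rely on (redis_logger, process, module_name, pending_seconds, compute(), computeNone(), run()), inferred only from how the subclasses below use it; names and defaults are assumptions, not the actual implementation.

# Minimal sketch of the AbstractModule contract inferred from the subclasses in this diff.
# The real module/abstract_module.py is part of this PR but not shown here.
import time
from abc import ABC, abstractmethod

from pubsublogger import publisher   # assumed to back redis_logger
from Helper import Process


class AbstractModule(ABC):

    def __init__(self, module_name=None):
        # Queue/config section name defaults to the class name (assumption)
        self.module_name = module_name if module_name else self._module_name()
        # Redis-backed logger shared by all modules
        self.redis_logger = publisher
        self.redis_logger.port = 6380
        self.redis_logger.channel = 'Script'
        # I/O queues and configuration for this section
        self.process = Process(self.module_name)
        # Seconds to sleep when the input queue is empty
        self.pending_seconds = 10

    def _module_name(self):
        return self.__class__.__name__

    def computeNone(self):
        """Hook called when no message is available; overridden by e.g. WebStats."""
        pass

    @abstractmethod
    def compute(self, message):
        """Process one message pulled from the input queue."""
        pass

    def run(self):
        """Endless loop: pull a message, dispatch to compute(), idle otherwise."""
        while True:
            message = self.process.get_from_set()
            if message:
                self.compute(message)
            else:
                self.computeNone()
                self.redis_logger.debug('{} is idling {}s'.format(self.module_name, self.pending_seconds))
                time.sleep(self.pending_seconds)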

View file

@@ -36,68 +36,96 @@ Requirements
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
"""

##################################
# Import External packages
##################################
import os
import argparse
import time
import re

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from pubsublogger import publisher
from packages import Paste
from Helper import Process


class Categ(AbstractModule):
    """
    Categ module for AIL framework
    """

    def __init__(self):
        """
        Init Categ
        """
        super(Categ, self).__init__()

        self.matchingThreshold = self.process.config.getint("Categ", "matchingThreshold")

        # SCRIPT PARSER #
        parser = argparse.ArgumentParser(description='Start Categ module on files.')

        parser.add_argument(
            '-d', type=str, default="../files/",
            help='Path to the directory containing the category files.',
            action='store')

        args = parser.parse_args()

        self.redis_logger.info("Script Categ started")

        categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey']
        tmp_dict = {}
        for filename in categories:
            bname = os.path.basename(filename)
            tmp_dict[bname] = []
            with open(os.path.join(args.d, filename), 'r') as f:
                patterns = [r'%s' % (re.escape(s.strip())) for s in f]
                tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)

        self.categ_items = tmp_dict.items()

        prec_filename = None

    def compute(self, message):
        # Cast message as paste
        paste = Paste.Paste(message)
        # Get paste content
        content = paste.get_p_content()

        # init categories found
        is_categ_found = False

        # Search for pattern categories in paste content
        for categ, pattern in self.categ_items:
            found = set(re.findall(pattern, content))
            lenfound = len(found)
            if lenfound >= self.matchingThreshold:
                is_categ_found = True
                msg = '{} {}'.format(paste.p_rel_path, lenfound)

                self.redis_logger.debug('%s;%s %s' % (self.module_name, msg, categ))

                # Export message to categ queue
                self.process.populate_set_out(msg, categ)

                self.redis_logger.info(
                    'Categ;{};{};{};Detected {} as {};{}'.format(
                        paste.p_source, paste.p_date, paste.p_name,
                        lenfound, categ, paste.p_rel_path))

        if not is_categ_found:
            self.redis_logger.debug('No %s found in this paste: %s' % (self.module_name, paste.p_name))


if __name__ == '__main__':

    module = Categ()
    module.run()

View file

@@ -9,131 +9,148 @@ The ZMQ_Sub_Indexer modules is fetching the list of files to be processed
and index each file with a full-text indexer (Whoosh until now).
"""

##################################
# Import External packages
##################################
import time
import shutil
import os
from os.path import join, getsize
from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from Helper import Process


class Indexer(AbstractModule):
    """
    Indexer module for AIL framework
    """

    # Time to wait in seconds between two index's size variable compute
    TIME_WAIT = 60*15  # sec

    def __init__(self):
        """
        Init Instance
        """
        super(Indexer, self).__init__()

        # Indexer configuration - index dir and schema setup
        self.baseindexpath = join(os.environ['AIL_HOME'],
                                  self.process.config.get("Indexer", "path"))
        self.indexRegister_path = join(os.environ['AIL_HOME'],
                                       self.process.config.get("Indexer", "register"))
        self.indexertype = self.process.config.get("Indexer", "type")
        self.INDEX_SIZE_THRESHOLD = self.process.config.getint(
            "Indexer", "index_max_size")

        self.indexname = None
        self.schema = None
        self.ix = None

        if self.indexertype == "whoosh":
            self.schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
                                                                  unique=True),
                                 content=TEXT)
            if not os.path.exists(self.baseindexpath):
                os.mkdir(self.baseindexpath)

            # create the index register if not present
            time_now = int(time.time())
            if not os.path.isfile(self.indexRegister_path):  # index are not organised
                self.redis_logger.debug("Indexes are not organized")
                self.redis_logger.debug(
                    "moving all files in folder 'old_index' ")
                # move all files to old_index folder
                self.move_index_into_old_index_folder()
                self.redis_logger.debug("Creating new index")
                # create all_index.txt
                with open(self.indexRegister_path, 'w') as f:
                    f.write(str(time_now))
                # create dir
                os.mkdir(join(self.baseindexpath, str(time_now)))

            with open(self.indexRegister_path, "r") as f:
                allIndex = f.read()
                allIndex = allIndex.split()  # format [time1\ntime2]
                allIndex.sort()

                try:
                    self.indexname = allIndex[-1].strip('\n\r')
                except IndexError as e:
                    self.indexname = time_now

                self.indexpath = join(self.baseindexpath, str(self.indexname))
                if not exists_in(self.indexpath):
                    self.ix = create_in(self.indexpath, self.schema)
                else:
                    self.ix = open_dir(self.indexpath)

                self.last_refresh = time_now

    def compute(self, message):
        try:
            PST = Paste.Paste(message)

            docpath = message.split(" ", -1)[-1]

            paste = PST.get_p_content()
            self.redis_logger.debug("Indexing - " + self.indexname + " :", docpath)

            # Avoid calculating the index's size at each message
            if(time.time() - self.last_refresh > self.TIME_WAIT):
                self.last_refresh = time.time()
                if self.check_index_size() >= self.INDEX_SIZE_THRESHOLD*(1000*1000):
                    timestamp = int(time.time())
                    self.redis_logger.debug("Creating new index", timestamp)
                    self.indexpath = join(self.baseindexpath, str(timestamp))
                    self.indexname = str(timestamp)
                    # update all_index
                    with open(self.indexRegister_path, "a") as f:
                        f.write('\n'+str(timestamp))
                    # create new dir
                    os.mkdir(self.indexpath)
                    self.ix = create_in(self.indexpath, self.schema)

            if self.indexertype == "whoosh":
                indexwriter = self.ix.writer()
                indexwriter.update_document(
                    title=docpath,
                    path=docpath,
                    content=paste)
                indexwriter.commit()

        except IOError:
            self.redis_logger.debug("CRC Checksum Failed on :", PST.p_path)
            self.redis_logger.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))

    def check_index_size(self):
        """
        return in bytes
        """
        the_index_name = join(self.baseindexpath, self.indexname)
        cur_sum = 0
        for root, dirs, files in os.walk(the_index_name):
            cur_sum += sum(getsize(join(root, name)) for name in files)

        return cur_sum

    def move_index_into_old_index_folder(self):
        for cur_file in os.listdir(self.baseindexpath):
            if not cur_file == "old_index":
                shutil.move(join(self.baseindexpath, cur_file), join(
                    join(self.baseindexpath, "old_index"), cur_file))


if __name__ == '__main__':

    module = Indexer()
    module.run()
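
The Indexer keeps one Whoosh index per time window and rolls over when the on-disk size crosses index_max_size. For reference, a minimal standalone Whoosh round-trip under the same schema, using a temporary directory and a hypothetical document:

# Minimal Whoosh round-trip with the same schema the Indexer uses.
# The directory and document content are hypothetical.
import tempfile

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT)

indexpath = tempfile.mkdtemp()
ix = create_in(indexpath, schema) if not exists_in(indexpath) else open_dir(indexpath)

writer = ix.writer()
writer.update_document(title='item/2021/04/02/demo.gz',
                       path='item/2021/04/02/demo.gz',
                       content='leaked credentials example')
writer.commit()

with ix.searcher() as searcher:
    results = searcher.search(QueryParser("content", ix.schema).parse("credentials"))
    print([hit['path'] for hit in results])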

View file

@@ -12,160 +12,170 @@ RSA private key, certificate messages
"""

##################################
# Import External packages
##################################
import time
from enum import Enum
from pubsublogger import publisher

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from Helper import Process


class KeyEnum(Enum):
    PGP_MESSAGE = '-----BEGIN PGP MESSAGE-----'
    PGP_PUBLIC_KEY_BLOCK = '-----BEGIN PGP PUBLIC KEY BLOCK-----'
    PGP_PRIVATE_KEY_BLOCK = '-----BEGIN PGP PRIVATE KEY BLOCK-----'
    PGP_SIGNATURE = '-----BEGIN PGP SIGNATURE-----'
    CERTIFICATE = '-----BEGIN CERTIFICATE-----'
    PUBLIC_KEY = '-----BEGIN PUBLIC KEY-----'
    PRIVATE_KEY = '-----BEGIN PRIVATE KEY-----'
    ENCRYPTED_PRIVATE_KEY = '-----BEGIN ENCRYPTED PRIVATE KEY-----'
    OPENSSH_PRIVATE_KEY = '-----BEGIN OPENSSH PRIVATE KEY-----'
    SSH2_ENCRYPTED_PRIVATE_KEY = '---- BEGIN SSH2 ENCRYPTED PRIVATE KEY ----'
    OPENVPN_STATIC_KEY_V1 = '-----BEGIN OpenVPN Static key V1-----'
    RSA_PRIVATE_KEY = '-----BEGIN RSA PRIVATE KEY-----'
    DSA_PRIVATE_KEY = '-----BEGIN DSA PRIVATE KEY-----'
    EC_PRIVATE_KEY = '-----BEGIN EC PRIVATE KEY-----'


class Keys(AbstractModule):
    """
    Keys module for AIL framework
    """

    def __init__(self):
        super(Keys, self).__init__()

        # Waiting time in secondes between to message proccessed
        self.pending_seconds = 1

    def compute(self, message):
        paste = Paste.Paste(message)
        content = paste.get_p_content()

        find = False
        get_pgp_content = False

        if KeyEnum.PGP_MESSAGE.value in content:
            self.redis_logger.warning('{} has a PGP enc message'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="pgp-message";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            get_pgp_content = True
            find = True

        if KeyEnum.PGP_PUBLIC_KEY_BLOCK.value in content:
            msg = 'infoleak:automatic-detection="pgp-public-key-block";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            get_pgp_content = True

        if KeyEnum.PGP_SIGNATURE.value in content:
            msg = 'infoleak:automatic-detection="pgp-signature";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            get_pgp_content = True

        if KeyEnum.CERTIFICATE.value in content:
            self.redis_logger.warning('{} has a certificate message'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="certificate";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.RSA_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has a RSA private key message'.format(paste.p_name))
            print('rsa private key message found')

            msg = 'infoleak:automatic-detection="rsa-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has a private key message'.format(paste.p_name))
            print('private key message found')

            msg = 'infoleak:automatic-detection="private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.ENCRYPTED_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an encrypted private key message'.format(paste.p_name))
            print('encrypted private key message found')

            msg = 'infoleak:automatic-detection="encrypted-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.OPENSSH_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an openssh private key message'.format(paste.p_name))
            print('openssh private key message found')

            msg = 'infoleak:automatic-detection="private-ssh-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.SSH2_ENCRYPTED_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an ssh2 private key message'.format(paste.p_name))
            print('SSH2 private key message found')

            msg = 'infoleak:automatic-detection="private-ssh-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.OPENVPN_STATIC_KEY_V1.value in content:
            self.redis_logger.warning('{} has an openssh private key message'.format(paste.p_name))
            print('OpenVPN Static key message found')

            msg = 'infoleak:automatic-detection="vpn-static-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.DSA_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has a dsa private key message'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="dsa-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.EC_PRIVATE_KEY.value in content:
            self.redis_logger.warning('{} has an ec private key message'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="ec-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.PGP_PRIVATE_KEY_BLOCK.value in content:
            self.redis_logger.warning('{} has a pgp private key block message'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="pgp-private-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        if KeyEnum.PUBLIC_KEY.value in content:
            self.redis_logger.warning('{} has a public key message'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="public-key";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')
            find = True

        # pgp content
        if get_pgp_content:
            self.process.populate_set_out(message, 'PgpDump')

        if find:
            # Send to duplicate
            self.process.populate_set_out(message, 'Duplicate')
            self.redis_logger.debug(message)


if __name__ == '__main__':

    module = Keys()
    module.run()
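
The per-marker if blocks above all follow the same shape (warn, tag, flag). Purely as an illustration of an alternative, not part of this PR, the same detection could be table-driven with a marker-to-tag mapping; the sketch below trims KeyEnum to two members to stay self-contained, and TAG_BY_KEY is a hypothetical name:

# Illustrative only: table-driven variant of the Keys detection loop.
from enum import Enum


class KeyEnum(Enum):   # trimmed to two members for the illustration
    CERTIFICATE = '-----BEGIN CERTIFICATE-----'
    RSA_PRIVATE_KEY = '-----BEGIN RSA PRIVATE KEY-----'


TAG_BY_KEY = {
    KeyEnum.CERTIFICATE: 'certificate',
    KeyEnum.RSA_PRIVATE_KEY: 'rsa-private-key',
}


def detect_keys(content):
    """Return the infoleak tag names whose marker appears in content."""
    return [tag for key, tag in TAG_BY_KEY.items() if key.value in content]


print(detect_keys('-----BEGIN RSA PRIVATE KEY----- ...'))   # ['rsa-private-key']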

View file

@@ -5,151 +5,157 @@
"""

##################################
# Import External packages
##################################
import time
import datetime
import redis
import os

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process
from packages import Paste
import ConfigLoader


class ModuleStats(AbstractModule):
    """
    Module Statistics module for AIL framework
    """

    # Config Var
    MAX_SET_CARDINALITY = 8

    def __init__(self):

        super(ModuleStats, self).__init__()

        # Waiting time in secondes between to message proccessed
        self.pending_seconds = 20

        # Sent to the logging a description of the module
        self.redis_logger.info("Makes statistics about valid URL")

        # REDIS #
        self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending")

        self.r_serv_pasteName = ConfigLoader.ConfigLoader().get_redis_conn("Redis_Paste_Name")

    def compute(self, message):

        if len(message.split(';')) > 1:
            self.compute_most_posted(message)
        else:
            self.compute_provider_info(message)

    def get_date_range(self, num_day):
        curr_date = datetime.date.today()
        date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
        date_list = []

        for i in range(0, num_day+1):
            date_list.append(date.substract_day(i))

        return date_list

    def compute_most_posted(self, message):
        module, num, keyword, paste_date = message.split(';')

        redis_progression_name_set = 'top_'+ module +'_set_' + paste_date

        # Add/Update in Redis
        self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num))

        # Compute Most Posted
        date = self.get_date_range(0)[0]
        # check if this keyword is eligible for progression
        keyword_total_sum = 0

        curr_value = self.r_serv_trend.hget(date, module+'-'+keyword)
        keyword_total_sum += int(curr_value) if curr_value is not None else 0

        if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY:
            self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)

        else: # not in set
            member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
            # Member set is a list of (value, score) pairs
            if int(member_set[0][1]) < keyword_total_sum:
                #remove min from set and add the new one
                self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
                self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0])
                self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
                self.redis_logger.debug(redis_progression_name_set)

    def compute_provider_info(self, message):
        redis_all_provider = 'all_provider_set'

        paste = Paste.Paste(message)

        paste_baseName = paste.p_name.split('.')[0]
        paste_size = paste._get_p_size()
        paste_provider = paste.p_source
        paste_date = str(paste._get_p_date())
        redis_sum_size_set = 'top_size_set_' + paste_date
        redis_avg_size_name_set = 'top_avg_size_set_' + paste_date
        redis_providers_name_set = 'providers_set_' + paste_date

        # Add/Update in Redis
        self.r_serv_pasteName.sadd(paste_baseName, message)
        self.r_serv_trend.sadd(redis_all_provider, paste_provider)

        num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1))
        sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size))
        new_avg = float(sum_size) / float(num_paste)
        self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg)

        #
        # Compute Most Posted
        #

        # Size
        if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil":
            self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider)
            self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
        else: #set full capacity
            member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1)
            # Member set is a list of (value, score) pairs
            if float(member_set[0][1]) < new_avg:
                #remove min from set and add the new one
                self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
                self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0])
                self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider)
                self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0])
                self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)

        # Num
        # if set not full or provider already present
        if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil":
            self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
        else: #set at full capacity
            member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
            # Member set is a list of (value, score) pairs
            if int(member_set[0][1]) < num_paste:
                #remove min from set and add the new one
                self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
                self.r_serv_trend.zrem(member_set[0][0])
                self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)


if __name__ == '__main__':

    module = ModuleStats()
    module.run()
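
Both compute_most_posted and compute_provider_info implement the same capped "top N" sorted-set pattern: add while the set is below MAX_SET_CARDINALITY, otherwise evict the current minimum when the new score beats it. A standalone sketch of that pattern with hypothetical key and member names (note that recent redis-py versions take zadd(name, {member: score}) rather than the positional form used above):

# Standalone sketch of the capped top-N pattern used by ModuleStats.
import redis

r = redis.StrictRedis(decode_responses=True)
MAX_SET_CARDINALITY = 8


def add_to_top_set(set_name, member, score):
    if r.zcard(set_name) < MAX_SET_CARDINALITY:
        r.zadd(set_name, {member: float(score)})
        return
    # Set is full: fetch the current minimum (member, score) pair
    current_min = r.zrangebyscore(set_name, '-inf', '+inf', withscores=True, start=0, num=1)
    if current_min and float(current_min[0][1]) < score:
        r.zrem(set_name, current_min[0][0])
        r.zadd(set_name, {member: float(score)})


add_to_top_set('top_credential_set_20210402', 'leaked-keyword', 42)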

View file

@@ -11,72 +11,74 @@ It apply phone number regexes on paste content and warn if above a threshold.
"""

##################################
# Import External packages
##################################
import time
import re
import phonenumbers

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from pubsublogger import publisher
from Helper import Process


class Phone(AbstractModule):
    """
    Phone module for AIL framework
    """

    # regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required)
    # reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
    REG_PHONE = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})')

    def __init__(self):
        super(Phone, self).__init__()

        # Waiting time in secondes between to message proccessed
        self.pending_seconds = 1

    def compute(self, message):
        paste = Paste.Paste(message)
        content = paste.get_p_content()

        # List of the regex results in the Paste, may be null
        results = self.REG_PHONE.findall(content)

        # If the list is greater than 4, we consider the Paste may contain a list of phone numbers
        if len(results) > 4:
            self.redis_logger.debug(results)
            self.redis_logger.warning('{} contains PID (phone numbers)'.format(paste.p_name))

            msg = 'infoleak:automatic-detection="phone-number";{}'.format(message)
            self.process.populate_set_out(msg, 'Tags')

            # Send to duplicate
            self.process.populate_set_out(message, 'Duplicate')

            stats = {}
            for phone_number in results:
                try:
                    x = phonenumbers.parse(phone_number, None)
                    country_code = x.country_code
                    if stats.get(country_code) is None:
                        stats[country_code] = 1
                    else:
                        stats[country_code] = stats[country_code] + 1
                except:
                    pass
            for country_code in stats:
                if stats[country_code] > 4:
                    self.redis_logger.warning('{} contains Phone numbers with country code {}'.format(paste.p_name, country_code))


if __name__ == '__main__':

    module = Phone()
    module.run()
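
The country-code tally relies on phonenumbers.parse() returning an object with a country_code attribute; a compact standalone version of that counting step, using hypothetical numbers and a Counter:

# Standalone version of Phone's country-code tally.
from collections import Counter
import phonenumbers

candidates = ['+33 1 23 45 67 89', '+33 6 12 34 56 78', '+1 415 555 2671', 'not a number']

stats = Counter()
for number in candidates:
    try:
        parsed = phonenumbers.parse(number, None)
        stats[parsed.country_code] += 1
    except phonenumbers.NumberParseException:
        pass

print(stats)   # e.g. Counter({33: 2, 1: 1})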

View file

@@ -5,30 +5,26 @@ The TermTracker Module
===================
"""

##################################
# Import External packages
##################################
import os
import sys
import time
import signal

##################################
# Import Project packages
##################################
from Helper import Process
from pubsublogger import publisher
from module.abstract_module import AbstractModule
import NotificationHelper
from packages import Item
from packages import Term
from lib import Tracker


class TimeoutException(Exception):
    pass

@@ -36,89 +32,105 @@ def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)


class TermTrackerMod(AbstractModule):

    mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\nitem id: {}\nurl: {}{}"

    """
    TermTrackerMod module for AIL framework
    """

    def __init__(self):
        super(TermTrackerMod, self).__init__()

        self.pending_seconds = 5

        self.max_execution_time = self.process.config.getint('TermTrackerMod', "max_execution_time")

        self.full_item_url = self.process.config.get("Notifications", "ail_domain") + "/object/item?id="

        # loads tracked words
        self.list_tracked_words = Term.get_tracked_words_list()
        self.last_refresh_word = time.time()
        self.set_tracked_words_list = Term.get_set_tracked_words_list()
        self.last_refresh_set = time.time()

        # Send module state to logs
        self.redis_logger.info("Module %s initialized"%(self._module_name()))

    def compute(self, item_id):
        # Cast message as Item
        item_date = Item.get_item_date(item_id)
        item_content = Item.get_item_content(item_id)

        signal.alarm(self.max_execution_time)

        dict_words_freq = None
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            self.redis_logger.warning("{0} processing timeout".format(item_id))
        else:
            signal.alarm(0)

        if dict_words_freq:
            # create token statistics
            #for word in dict_words_freq:
            #    Term.create_token_statistics(item_date, word, dict_words_freq[word])

            # check solo words
            for word in self.list_tracked_words:
                if word in dict_words_freq:
                    self.new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in self.set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]
                nb_uniq_word = 0

                for word in list_words:
                    if word in dict_words_freq:
                        nb_uniq_word += 1
                if nb_uniq_word >= nb_words_threshold:
                    self.new_term_found(word_set, 'set', item_id, item_date)

        # refresh Tracked term
        if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
            self.list_tracked_words = Term.get_tracked_words_list()
            self.last_refresh_word = time.time()
            self.redis_logger.debug('Tracked word refreshed')

        if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
            self.set_tracked_words_list = Term.get_set_tracked_words_list()
            self.last_refresh_set = time.time()
            self.redis_logger.debug('Tracked set refreshed')

    def new_term_found(self, term, term_type, item_id, item_date):
        uuid_list = Term.get_term_uuid_list(term, term_type)
        self.redis_logger.info('new tracked term found: {} in {}'.format(term, item_id))

        for term_uuid in uuid_list:
            Term.add_tracked_item(term_uuid, item_id, item_date)

            tags_to_add = Term.get_term_tags(term_uuid)
            for tag in tags_to_add:
                msg = '{};{}'.format(tag, item_id)
                self.process.populate_set_out(msg, 'Tags')

            mail_to_notify = Term.get_term_mails(term_uuid)
            if mail_to_notify:
                mail_subject = Tracker.get_email_subject(term_uuid)
                mail_body = TermTrackerMod.mail_body_template.format(term, item_id, self.full_item_url, item_id)
                for mail in mail_to_notify:
                    self.redis_logger.debug('Send Mail {}'.format(mail_subject))
                    NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body)


if __name__ == '__main__':

    module = TermTrackerMod()
    module.run()
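
TermTrackerMod bounds Term.get_text_word_frequency() with SIGALRM so one oversized item cannot stall the queue. The pattern in isolation (Unix-only, main thread only; the slow function below is a stand-in):

# The SIGALRM timeout pattern used by TermTrackerMod, in isolation.
import signal
import time


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)


def slow_tokenize(text):
    time.sleep(10)             # pretend this is an expensive word-frequency pass
    return {'example': 1}


max_execution_time = 2         # seconds; the module reads this from its config section
signal.alarm(max_execution_time)
try:
    words = slow_tokenize('some item content')
except TimeoutException:
    print('processing timeout')
else:
    signal.alarm(0)            # cancel the pending alarm on success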

View file

@@ -9,152 +9,186 @@ This module tries to parse URLs and warns if some defined contry code are presen
"""

##################################
# Import External packages
##################################
import redis
import pprint
import time
import os
import dns.exception
from pyfaup.faup import Faup
import re
# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress

##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
from Helper import Process


class Web(AbstractModule):
    """
    Web module for AIL framework
    """

    # Used to prevent concat with empty fields due to url parsing
    def avoidNone(self, a_string):
        if a_string is None:
            return ""
        else:
            return a_string

    def __init__(self):
        """
        Init Web
        """
        super(Web, self).__init__()

        # REDIS Cache
        self.r_serv2 = redis.StrictRedis(
            host=self.process.config.get("Redis_Cache", "host"),
            port=self.process.config.getint("Redis_Cache", "port"),
            db=self.process.config.getint("Redis_Cache", "db"),
            decode_responses=True)

        # Country to log as critical
        self.cc_critical = self.process.config.get("Url", "cc_critical")

        # FUNCTIONS #
        self.redis_logger.info("Script URL subscribed to channel web_categ")

        # FIXME For retro compatibility
        self.channel = 'web_categ'

        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                          self.process.config.get("Directories", "protocolsfile"))
        # Get all uri from protocolsfile (Used for Curve)
        uri_scheme = ""
        with open(protocolsfile_path, 'r') as scheme_file:
            for scheme in scheme_file:
                uri_scheme += scheme[:-1]+"|"
        uri_scheme = uri_scheme[:-1]

        self.url_regex = "((?i:"+uri_scheme + \
            ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

        self.prec_filename = None

        # Send module state to logs
        self.redis_logger.info("Module %s initialized" % (self.module_name))

    def compute(self, message):
        """
        Search for Web links from given message
        """
        # Extract item
        filename, score = message.split()

        if self.prec_filename is None or filename != self.prec_filename:
            domains_list = set()
            PST = Paste.Paste(filename)
            client = ip2asn()

            detected_urls = PST.get_regex(self.url_regex)
            if len(detected_urls) > 0:
                to_print = 'Web;{};{};{};'.format(
                    PST.p_source, PST.p_date, PST.p_name)
                self.redis_logger.info('{}Detected {} URL;{}'.format(
                    to_print, len(detected_urls), PST.p_rel_path))

            for url in detected_urls:
                self.redis_logger.debug("match regex: %s" % (url))

                # self.redis_logger.debug("match regex search: %s"%(url))

                to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                self.process.populate_set_out(to_send, 'Url')
                self.redis_logger.debug("url_parsed: %s" % (to_send))

                self.faup.decode(url)
                domain = self.faup.get_domain()
                subdomain = self.faup.get_subdomain()

                self.redis_logger.debug('{} Published'.format(url))

                if subdomain is not None:
                    # TODO: # FIXME: remove me
                    try:
                        subdomain = subdomain.decode()
                    except:
                        pass

                if domain is not None:
                    # TODO: # FIXME: remove me
                    try:
                        domain = domain.decode()
                    except:
                        pass
                    domains_list.add(domain)

                hostl = self.avoidNone(subdomain) + self.avoidNone(domain)

                try:
                    socket.setdefaulttimeout(1)
                    ip = socket.gethostbyname(hostl)
                    # If the resolver is not giving any IPv4 address,
                    # ASN/CC lookup is skip.
                    l = client.lookup(ip, qType='IP')
                except ipaddress.AddressValueError:
                    self.redis_logger.error(
                        'ASN/CC lookup failed for IP {}'.format(ip))
                    continue
                except:
                    self.redis_logger.error(
                        'Resolver IPv4 address failed for host {}'.format(hostl))
                    continue

                cc = getattr(l, 'cc')
                asn = ''
                if getattr(l, 'asn') is not None:
                    asn = getattr(l, 'asn')[2:]  # remobe b'

                # EU is not an official ISO 3166 code (but used by RIPE
                # IP allocation)
                if cc is not None and cc != "EU":
                    self.redis_logger.debug('{};{};{};{}'.format(hostl, asn, cc,
                                                                 pycountry.countries.get(alpha_2=cc).name))
                    if cc == self.cc_critical:
                        to_print = 'Url;{};{};{};Detected {} {}'.format(
                            PST.p_source, PST.p_date, PST.p_name,
                            hostl, cc)
                        self.redis_logger.info(to_print)
                else:
                    self.redis_logger.debug('{};{};{}'.format(hostl, asn, cc))

            A_values = lib_refine.checking_A_record(self.r_serv2,
                                                    domains_list)

            if A_values[0] >= 1:
                PST.__setattr__(self.channel, A_values)
                PST.save_attribute_redis(self.channel, (A_values[0],
                                                        list(A_values[1])))

                pprint.pprint(A_values)
                # self.redis_logger.info('Url;{};{};{};Checked {} URL;{}'.format(
                #     PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))

        self.prec_filename = filename


if __name__ == '__main__':

    module = Web()
    module.run()
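
The Web module leans on pyfaup for URL decomposition; a short standalone sketch of the calls it makes. Depending on the pyfaup build the getters may return bytes, which is why the module above guards with decode():

# Standalone sketch of the pyfaup calls used by the Web module.
from pyfaup.faup import Faup

faup = Faup()
faup.decode('https://mail.example.com/login?user=test')

domain = faup.get_domain()
subdomain = faup.get_subdomain()


def as_text(value):
    # Mirrors the decode() fallback in Web.compute()
    return value.decode() if isinstance(value, bytes) else (value or "")


print(as_text(subdomain), as_text(domain))   # e.g. mail example.com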

View file

@ -10,182 +10,198 @@ It consider the TLD, Domain and protocol.
""" """
##################################
# Import External packages
##################################
import time import time
import datetime import datetime
import redis import redis
import os import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher from pubsublogger import publisher
from Helper import Process
from pyfaup.faup import Faup from pyfaup.faup import Faup
# Config Var
threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression
threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look
max_set_cardinality = 10 # The cardinality of the progression set
num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past
def analyse(server, field_name, date, url_parsed): ##################################
field = url_parsed[field_name] # Import Project packages
if field is not None: ##################################
try: # faup version from module.abstract_module import AbstractModule
field = field.decode() from packages import lib_words
except: from packages.Date import Date
pass from Helper import Process
server.hincrby(field, date, 1)
if field_name == "domain": #save domain in a set for the monthly plot
domain_set_name = "domain_set_" + date[0:6]
server.sadd(domain_set_name, field)
print("added in " + domain_set_name +": "+ field)
def get_date_range(num_day):
curr_date = datetime.date.today()
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
date_list = []
for i in range(0, num_day+1):
date_list.append(date.substract_day(i))
return date_list
# Compute the progression for one keyword
def compute_progression_word(server, num_day, keyword):
date_range = get_date_range(num_day)
# check if this keyword is eligible for progression
keyword_total_sum = 0
value_list = []
for date in date_range: # get value up to date_range
curr_value = server.hget(keyword, date)
value_list.append(int(curr_value if curr_value is not None else 0))
keyword_total_sum += int(curr_value) if curr_value is not None else 0
oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division
# The progression is based on the ratio: value[i] / value[i-1]
keyword_increase = 0
value_list_reversed = value_list[:]
value_list_reversed.reverse()
for i in range(1, len(value_list_reversed)):
divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1
keyword_increase += value_list_reversed[i] / divisor
return (keyword_increase, keyword_total_sum)
''' class WebStats(AbstractModule):
recompute the set top_progression zset """
- Compute the current field progression WebStats module for AIL framework
- re-compute the current progression for each first 2*max_set_cardinality fields in the top_progression_zset """
'''
def compute_progression(server, field_name, num_day, url_parsed):
redis_progression_name_set = "z_top_progression_"+field_name
keyword = url_parsed[field_name] # Config Var
if keyword is not None: THRESHOLD_TOTAL_SUM = 200 # Above this value, a keyword is eligible for a progression
THRESHOLD_INCREASE = 1.0 # The percentage representing the keyword occurence since num_day_to_look
MAX_SET_CARDINALITY = 10 # The cardinality of the progression set
NUM_DAY_TO_LOOK = 5 # the detection of the progression start num_day_to_look in the past
#compute the progression of the current word
keyword_increase, keyword_total_sum = compute_progression_word(server, num_day, keyword)
#re-compute the progression of 2*max_set_cardinality def __init__(self):
current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality) super(WebStats, self).__init__()
for word, value in current_top:
word_inc, word_tot_sum = compute_progression_word(server, num_day, word)
server.zrem(redis_progression_name_set, word)
if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase):
server.zadd(redis_progression_name_set, float(word_inc), word)
# filter before adding # Send module state to logs
if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase): self.redis_logger.info("Module %s initialized"%(self.module_name))
server.zadd(redis_progression_name_set, float(keyword_increase), keyword) # Send the module description to the logger
self.redis_logger.info("Makes statistics about valid URL")
self.pending_seconds = 5*60
# REDIS #
self.r_serv_trend = redis.StrictRedis(
host=self.process.config.get("ARDB_Trending", "host"),
port=self.process.config.get("ARDB_Trending", "port"),
db=self.process.config.get("ARDB_Trending", "db"),
decode_responses=True)
# FILE CURVE SECTION #
self.csv_path_proto = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "protocolstrending_csv"))
self.protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "protocolsfile"))
self.csv_path_tld = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "tldstrending_csv"))
self.tldsfile_path = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "tldsfile"))
self.csv_path_domain = os.path.join(os.environ['AIL_HOME'],
self.process.config.get("Directories", "domainstrending_csv"))
self.faup = Faup()
self.generate_new_graph = False
def computeNone(self):
if self.generate_new_graph:
self.generate_new_graph = False
today = datetime.date.today()
year = today.year
month = today.month
self.redis_logger.debug('Building protocol graph')
lib_words.create_curve_with_word_file(self.r_serv_trend, self.csv_path_proto,
self.protocolsfile_path, year,
month)
self.redis_logger.debug('Building tld graph')
lib_words.create_curve_with_word_file(self.r_serv_trend, self.csv_path_tld,
self.tldsfile_path, year,
month)
self.redis_logger.debug('Building domain graph')
lib_words.create_curve_from_redis_set(self.r_serv_trend, self.csv_path_domain,
"domain", year,
month)
self.redis_logger.debug('end building')
def compute(self, message):
self.generate_new_graph = True
# Do something with the message from the queue
url, date, path = message.split()
self.faup.decode(url)
url_parsed = self.faup.get()
# Scheme analysis
self.analyse('scheme', date, url_parsed)
# Tld analysis
self.analyse('tld', date, url_parsed)
# Domain analysis
self.analyse('domain', date, url_parsed)
self.compute_progression('scheme', self.NUM_DAY_TO_LOOK, url_parsed)
self.compute_progression('tld', self.NUM_DAY_TO_LOOK, url_parsed)
self.compute_progression('domain', self.NUM_DAY_TO_LOOK, url_parsed)
def analyse(self, field_name, date, url_parsed):
field = url_parsed[field_name]
if field is not None:
try: # faup version
field = field.decode()
except:
pass
self.r_serv_trend.hincrby(field, date, 1)
if field_name == "domain": #save domain in a set for the monthly plot
domain_set_name = "domain_set_" + date[0:6]
self.r_serv_trend.sadd(domain_set_name, field)
self.redis_logger.debug("added in " + domain_set_name +": "+ field)
def get_date_range(self, num_day):
curr_date = datetime.date.today()
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
date_list = []
for i in range(0, num_day+1):
date_list.append(date.substract_day(i))
return date_list
def compute_progression_word(self, num_day, keyword):
"""
Compute the progression for one keyword
"""
date_range = self.get_date_range(num_day)
# check if this keyword is eligible for progression
keyword_total_sum = 0
value_list = []
for date in date_range: # get value up to date_range
curr_value = self.r_serv_trend.hget(keyword, date)
value_list.append(int(curr_value if curr_value is not None else 0))
keyword_total_sum += int(curr_value) if curr_value is not None else 0
oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division
# The progression is based on the ratio: value[i] / value[i-1]
keyword_increase = 0
value_list_reversed = value_list[:]
value_list_reversed.reverse()
for i in range(1, len(value_list_reversed)):
divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1
keyword_increase += value_list_reversed[i] / divisor
return (keyword_increase, keyword_total_sum)
def compute_progression(self, field_name, num_day, url_parsed):
"""
recompute the set top_progression zset
- Compute the current field progression
- re-compute the current progression for each first 2*self.MAX_SET_CARDINALITY fields in the top_progression_zset
"""
redis_progression_name_set = "z_top_progression_"+field_name
keyword = url_parsed[field_name]
if keyword is not None:
#compute the progression of the current word
keyword_increase, keyword_total_sum = self.compute_progression_word(num_day, keyword)
#re-compute the progression of 2*self.MAX_SET_CARDINALITY
current_top = self.r_serv_trend.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*self.MAX_SET_CARDINALITY)
for word, value in current_top:
word_inc, word_tot_sum = self.compute_progression_word(num_day, word)
self.r_serv_trend.zrem(redis_progression_name_set, word)
if (word_tot_sum > self.THRESHOLD_TOTAL_SUM) and (word_inc > self.THRESHOLD_INCREASE):
self.r_serv_trend.zadd(redis_progression_name_set, float(word_inc), word)
# filter before adding
if (keyword_total_sum > self.THRESHOLD_TOTAL_SUM) and (keyword_increase > self.THRESHOLD_INCREASE):
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_increase), keyword)
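As a worked illustration of the metric implemented by compute_progression_word(), the standalone sketch below recomputes keyword_increase and keyword_total_sum for a made-up list of daily counts (newest day first, as get_date_range() produces them); the keyword values and numbers are illustrative only.

# Worked example of the progression metric (illustrative counts only)
value_list = [40, 30, 20, 10, 5, 1]          # today .. NUM_DAY_TO_LOOK days ago

keyword_total_sum = sum(value_list)          # 106: below THRESHOLD_TOTAL_SUM (200)

chronological = list(reversed(value_list))   # oldest .. newest
keyword_increase = 0.0
for i in range(1, len(chronological)):
    divisor = chronological[i - 1] if chronological[i - 1] != 0 else 1
    keyword_increase += chronological[i] / divisor

# keyword_increase ~= 11.83 (> THRESHOLD_INCREASE), but the keyword would still
# be skipped because keyword_total_sum does not exceed THRESHOLD_TOTAL_SUM
print(keyword_increase, keyword_total_sum)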
if __name__ == '__main__': if __name__ == '__main__':
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
# Port of the redis instance used by pubsublogger
publisher.port = 6380
# Script is the default channel used for the modules.
publisher.channel = 'Script'
# Section name in bin/packages/modules.cfg module = WebStats()
config_section = 'WebStats' module.run()
# Setup the I/O queues
p = Process(config_section)
# Sent to the logging a description of the module
publisher.info("Makes statistics about valid URL")
# REDIS #
r_serv_trend = redis.StrictRedis(
host=p.config.get("ARDB_Trending", "host"),
port=p.config.get("ARDB_Trending", "port"),
db=p.config.get("ARDB_Trending", "db"),
decode_responses=True)
# FILE CURVE SECTION #
csv_path_proto = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "protocolstrending_csv"))
protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "protocolsfile"))
csv_path_tld = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "tldstrending_csv"))
tldsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "tldsfile"))
csv_path_domain = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "domainstrending_csv"))
faup = Faup()
generate_new_graph = False
# Endless loop getting messages from the input queue
while True:
# Get one message from the input queue
message = p.get_from_set()
if message is None:
if generate_new_graph:
generate_new_graph = False
today = datetime.date.today()
year = today.year
month = today.month
print('Building protocol graph')
lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
protocolsfile_path, year,
month)
print('Building tld graph')
lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
tldsfile_path, year,
month)
print('Building domain graph')
lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
"domain", year,
month)
print('end building')
publisher.debug("{} queue is empty, waiting".format(config_section))
print('sleeping')
time.sleep(5*60)
continue
else:
generate_new_graph = True
# Do something with the message from the queue
url, date, path = message.split()
faup.decode(url)
url_parsed = faup.get()
# Scheme analysis
analyse(r_serv_trend, 'scheme', date, url_parsed)
# Tld analysis
analyse(r_serv_trend, 'tld', date, url_parsed)
# Domain analysis
analyse(r_serv_trend, 'domain', date, url_parsed)
compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)
compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)
compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)
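The Redis layout that analyse() and compute_progression() maintain can be summarised with a short redis-py sketch; the host, port, db and sample values below are placeholders, not the ARDB_Trending settings from the AIL config.

import redis

# Placeholder connection; in AIL the ARDB_Trending host/port/db come from the config
r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

date = '20210402'

# analyse(): one hash per keyword (scheme, tld or domain value), one field per day
r.hincrby('onion', date, 1)

# analyse(): monthly set of seen domains, used for the domain curve
r.sadd('domain_set_' + date[0:6], 'example.onion')

# compute_progression(): per-field sorted set of trending keywords
# (recent redis-py expects a mapping here; the module above uses the older
# positional zadd(name, score, member) form)
r.zadd('z_top_progression_tld', {'onion': 11.83})
print(r.zrevrange('z_top_progression_tld', 0, 9, withscores=True))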

bin/module/__init__.py Normal file
View file

@@ -0,0 +1,98 @@
# coding: utf-8
"""
Base Class for AIL Modules
"""
##################################
# Import External packages
##################################
from abc import ABC, abstractmethod
import time
##################################
# Import Project packages
##################################
from pubsublogger import publisher
from Helper import Process
class AbstractModule(ABC):
"""
Abstract Module class
"""
def __init__(self, module_name=None, queue_name=None):
"""
Init Module
module_name: str; set the module name if different from the instance ClassName
"""
# Module name if provided else instance className
self.module_name = module_name if module_name else self._module_name()
# Queue name if provided else instance className
self.queue_name = queue_name if queue_name else self._module_name()
# Init Redis Logger
self.redis_logger = publisher
# Port of the redis instance used by pubsublogger
self.redis_logger.port = 6380
# Channel name to publish logs
self.redis_logger.channel = 'Script'
# TODO modify generic channel Script to a namespaced channel like:
# publish module logs to script:<ModuleName> channel
# self.redis_logger.channel = 'script:%s'%(self.module_name)
# Run module endlessly
self.proceed = True
# Waiting time in seconds between two processed messages
self.pending_seconds = 10
# Setup the I/O queues
self.process = Process(self.queue_name)
def run(self):
"""
Run Module endless process
"""
# Endless loop processing messages from the input queue
while self.proceed:
# Get one message (paste) from the QueueIn (copy of Redis_Global publish)
message = self.process.get_from_set()
if message is None:
self.computeNone()
# Wait before next process
self.redis_logger.debug('%s, waiting for new message, Idling %ds'%(self.module_name, self.pending_seconds))
time.sleep(self.pending_seconds)
continue
try:
# Module processing with the message from the queue
self.compute(message)
except Exception as err:
self.redis_logger.error("Error in module %s: %s"%(self.module_name, err))
def _module_name(self):
"""
Returns the instance class name (ie. the Module Name)
"""
return self.__class__.__name__
@abstractmethod
def compute(self, message):
"""
Main method of the Module to implement
"""
pass
def computeNone(self):
"""
Method of the Module when there is no message
"""
pass
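For reference, a hypothetical subclass shows the minimum a module has to provide; it assumes the AIL environment (bin/ on the Python path, a matching 'Dummy' section in bin/packages/modules.cfg and its Redis queues), so the names here are illustrative only.

from module.abstract_module import AbstractModule


class Dummy(AbstractModule):
    """
    Hypothetical module: logs every message pulled from its input queue
    """

    def __init__(self):
        # The class name 'Dummy' is used as both module name and queue name
        super(Dummy, self).__init__()
        # Wait 30s between polls when the queue is empty
        self.pending_seconds = 30

    def compute(self, message):
        self.redis_logger.info('Dummy received: %s' % (message,))


if __name__ == '__main__':
    module = Dummy()
    module.run()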

View file

@@ -1,45 +1,57 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*-coding:UTF-8 -* # -*-coding:UTF-8 -*
""" """
Template for new modules The Template Module
======================
This module is a template for new modules
""" """
##################################
# Import External packages
##################################
import time import time
from pubsublogger import publisher from pubsublogger import publisher
##################################
# Import Project packages
##################################
from module.abstract_module import AbstractModule
from Helper import Process from Helper import Process
def do_something(message): class Template(AbstractModule):
return None """
Template module for AIL framework
"""
def __init__(self):
super(Template, self).__init__()
# Send module state to logs
self.redis_logger.info("Module %s initialized"%(self.module_name))
# Pending time between two computations, in seconds
self.pending_seconds = 10
def computeNone(self):
"""
Compute when no message in queue
"""
self.redis_logger.debug("No message in queue")
def compute(self, message):
"""
Compute a message in queue
"""
self.redis_logger.debug("Compute message in queue")
if __name__ == '__main__': if __name__ == '__main__':
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
# Port of the redis instance used by pubsublogger
publisher.port = 6380
# Script is the default channel used for the modules.
publisher.channel = 'Script'
# Section name in bin/packages/modules.cfg module = Template()
config_section = '<section name>' module.run()
# Setup the I/O queues
p = Process(config_section)
# Sent to the logging a description of the module
publisher.info("<description of the module>")
# Endless loop getting messages from the input queue
while True:
# Get one message from the input queue
message = p.get_from_set()
if message is None:
publisher.debug("{} queue is empty, waiting".format(config_section))
time.sleep(1)
continue
# Do something with the message from the queue
something_has_been_done = do_something(message)
# (Optional) Send that thing to the next queue
p.populate_set_out(something_has_been_done)
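The class-based template does not repeat the optional populate_set_out() step from the old loop; since AbstractModule keeps the Helper.Process instance as self.process, a subclass can still forward results the same way. A minimal, hypothetical sketch (module and queue names are illustrative):

from module.abstract_module import AbstractModule


class Forwarder(AbstractModule):
    """
    Hypothetical module: forwards each processed message to the next queue
    """

    def compute(self, message):
        # Placeholder processing step
        result = message
        # (Optional) send the result to the next queue, as the old template did
        self.process.populate_set_out(result)


if __name__ == '__main__':
    module = Forwarder()
    module.run()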