chg: [statistics] ARDB migration

Terrtia 2022-09-08 10:31:57 +02:00
parent d27d47dc70
commit aa6ba61050
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
13 changed files with 482 additions and 420 deletions

View file

@ -16,6 +16,13 @@ import re
import string
from itertools import chain
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from lib import Statistics
from packages import Item
from pubsublogger import publisher
@ -48,6 +55,7 @@ def is_valid_iban(iban):
return True
return False
# # TODO: SET
def check_all_iban(l_iban, obj_id):
nb_valid_iban = 0
for iban in l_iban:
@ -61,7 +69,8 @@ def check_all_iban(l_iban, obj_id):
if is_valid_iban(iban):
print('------')
nb_valid_iban = nb_valid_iban + 1
server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)
Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1)
if(nb_valid_iban > 0):
to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id))
@ -70,9 +79,6 @@ def check_all_iban(l_iban, obj_id):
msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
p.populate_set_out(msg, 'Tags')
#Send to duplicate
p.populate_set_out(obj_id, 'Duplicate')
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
@ -82,13 +88,6 @@ if __name__ == "__main__":
p = Process(config_section)
max_execution_time = p.config.getint("BankAccount", "max_execution_time")
# ARDB #
server_statistics = redis.StrictRedis(
host=p.config.get("ARDB_Statistics", "host"),
port=p.config.getint("ARDB_Statistics", "port"),
db=p.config.getint("ARDB_Statistics", "db"),
decode_responses=True)
publisher.info("BankAccount started")
#iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
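For reference, the Statistics.add_iban_country_stats_by_date helper that replaces the removed server_statistics.hincrby call is defined in the lib/Statistics.py hunk later in this commit. A minimal sketch of the equivalent operation, assuming r_statistics points at the ARDB_Statistics database (connection parameters below are placeholders):

import redis

# placeholder connection parameters for the ARDB_Statistics database
r_statistics = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

def add_iban_country_stats_by_date(date, tld, nb):
    # per-day hash keyed by the two-letter IBAN country prefix, e.g. 'iban_by_country:20220908'
    r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))

# BankAccount.py now calls the helper instead of touching ARDB directly:
# Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1)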

View file

@ -314,14 +314,6 @@ if __name__ == '__main__':
print('splash url: {}'.format(splash_url))
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
r_serv_metadata = redis.StrictRedis(
host=p.config.get("ARDB_Metadata", "host"),
port=p.config.getint("ARDB_Metadata", "port"),
db=p.config.getint("ARDB_Metadata", "db"),
decode_responses=True)
r_cache = redis.StrictRedis(
host=p.config.get("Redis_Cache", "host"),
port=p.config.getint("Redis_Cache", "port"),

View file

@ -15,6 +15,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
from lib import Statistics
from lib import Tag
from lib import Users
from lib.objects import Decodeds
@ -35,6 +36,8 @@ r_serv_tracker = config_loader.get_redis_conn("ARDB_Tracker")
r_serv_tags = config_loader.get_redis_conn("ARDB_Tags")
r_crawler = config_loader.get_redis_conn("ARDB_Onion")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
r_serv_trend = config_loader.get_redis_conn("ARDB_Trending")
r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
config_loader = None
# # - - CONFIGS - - # #
@ -358,9 +361,16 @@ def items_migration():
def get_last_crawled_domains(domain_type):
return r_crawler.lrange(f'last_{domain_type}', 0 ,-1)
def get_domains_blacklist(domain_type):
return r_crawler.smembers(f'blacklist_{domain_type}')
def crawler_migration():
print('CRAWLER MIGRATION...')
for domain_type in ['onion', 'regular']:
for domain in get_domains_blacklist(domain_type):
crawlers.add_domain_blacklist(domain_type, domain)
# for domain_type in ['onion', 'regular']:
# for row in get_last_crawled_domains(domain_type):
# dom_row, epoch = row.rsplit(';', 1)
@ -368,19 +378,18 @@ def crawler_migration():
# print(domain, port, epoch)
# #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch)
for cookiejar_uuid in old_crawlers.get_all_cookiejar():
meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
#print(meta)
#crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid)
#crawlers._set_cookiejar_date(meta['date'])
for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
print(cookie_uuid)
#crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid)
# for cookiejar_uuid in old_crawlers.get_all_cookiejar():
# meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
# #print(meta)
# crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid)
# crawlers._set_cookiejar_date(meta['date'])
#
# for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
# print(cookie_uuid)
# crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid)
# TODO: auto crawler -> to Fix / change
# TODO: crawlers queues
###############################
@ -689,12 +698,89 @@ def subtypes_obj_migration():
#
# Credential:
# HSET 'credential_by_tld:'+date, tld, 1
def get_all_provider():
return r_serv_trend.smembers('all_provider_set')
def get_item_source_stats_by_date(date, source):
stats = {}
stats['num'] = r_serv_trend.hget(f'{source}_num', date)
stats['size'] = r_serv_trend.hget(f'{source}_size', date)
stats['avg'] = r_serv_trend.hget(f'{source}_avg', date)
return stats
def get_item_stats_size_avg_by_date(date):
return r_serv_trend.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
def get_item_stats_nb_by_date(date):
return r_serv_trend.zrange(f'providers_set_{date}', 0, -1, withscores=True)
def get_top_stats_module(module_name, date):
return r_serv_trend.zrange(f'top_{module_name}_set_{date}', 0, -1, withscores=True)
def get_module_tld_stats_by_date(module, date):
return r_statistics.hgetall(f'{module}_by_tld:{date}')
def statistics_migration():
# paste_by_modules_timeout
# Date full history => lot of keys
# top_size_set_{date}
# top_avg_size_set_{date}
# 'providers_set_{date}
sources = get_all_provider()
for date in Date.get_date_range_today('20180101'):
size_avg = get_item_stats_size_avg_by_date(date)
nb_items = get_item_stats_nb_by_date(date)
# top_size_set_{date}
# top_avg_size_set_{date}
# 'providers_set_{date}
# ITEM STATS
for source in sources:
source_stat = get_item_source_stats_by_date(date, source)
Statistics._create_item_stats_size_nb(date, source, source_stat['num'], source_stat['size'], source_stat['avg'])
# # MODULE STATS
# for module in ['credential', 'mail', 'SQLInjection']:
# stats = get_module_tld_stats_by_date(module, date)
# for tld in stats:
# if tld:
# print(module, date, tld, stats[tld])
# Statistics.add_module_tld_stats_by_date(module, date, tld, stats[tld])
# for module in ['credential']:
# # TOP STATS
# top_module = get_top_stats_module(module, date)
# for keyword, total_sum in top_module:
# print(date, module, keyword, total_sum)
# #Statistics._add_module_stats(module, total_sum, keyword, date)
pass
if __name__ == '__main__':
core_migration()
#core_migration()
# user_migration()
# tags_migration()
#items_migration()
@ -706,6 +792,7 @@ if __name__ == '__main__':
# ail_2_ail_migration()
# trackers_migration()
# investigations_migration()
statistics_migration()
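A possible spot-check once statistics_migration() has run is to compare a single source/day pair between the legacy ARDB_Trending hashes and the new Statistics API. A sketch, assuming it is added to this migration script (where r_serv_trend and Statistics are already available); the date and source are invented examples:

def check_migrated_source_stats(date='20220101', source='submitted'):
    # date and source are arbitrary examples, not values taken from the commit
    old_num = r_serv_trend.hget(f'{source}_num', date)   # legacy ARDB_Trending value
    old_avg = r_serv_trend.hget(f'{source}_avg', date)
    new_num = Statistics.get_nb_items_processed_by_day_and_source(date, source)
    new_avg = Statistics.get_items_av_size_by_day_and_source(date, source)
    print(f'{source} {date}: num {old_num} -> {new_num}, avg {old_avg} -> {new_avg}')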

View file

@ -217,8 +217,12 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x"
sleep 0.1
# screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x"
# sleep 0.1
screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x"
sleep 0.1
@ -267,8 +271,6 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Mail.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./PgpDump.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Cryptocurrency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cryptocurrencies.py; read x"
@ -277,8 +279,6 @@ function launching_scripts {
sleep 0.1
screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./ModuleStats.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "MISPtheHIVEfeeder" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./MISP_The_Hive_feeder.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x"

View file

@ -1,220 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Mails Module
======================
This module is consuming the Redis-list created by the Categ module.
It apply mail regexes on item content and warn if above a threshold.
"""
import os
import re
import sys
import uuid
import redis
import time
import datetime
import dns.resolver
import dns.exception
from multiprocessing import Process as Proc
from pubsublogger import publisher
from Helper import Process
from pyfaup.faup import Faup
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
import Item
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
## LOAD CONFIG ##
config_loader = ConfigLoader.ConfigLoader()
server_statistics = config_loader.get_redis_conn("ARDB_Statistics")
r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
dns_server = config_loader.get_config_str('Mail', 'dns')
config_loader = None
## -- ##
def is_mxdomain_in_cache(mxdomain):
return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
def save_mxdomain_in_cache(mxdomain):
r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1))
def check_mx_record(set_mxdomains, dns_server):
"""Check if emails MX domains are responding.
:param adress_set: -- (set) This is a set of emails domains
:return: (int) Number of adress with a responding and valid MX domains
"""
resolver = dns.resolver.Resolver()
resolver.nameservers = [dns_server]
resolver.timeout = 5.0
resolver.lifetime = 2.0
valid_mxdomain = []
for mxdomain in set_mxdomains:
# check if is in cache
# # TODO:
if is_mxdomain_in_cache(mxdomain):
valid_mxdomain.append(mxdomain)
else:
# DNS resolution
try:
answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
if answers:
save_mxdomain_in_cache(mxdomain)
valid_mxdomain.append(mxdomain)
# DEBUG
# print('---')
# print(answers.response)
# print(answers.qname)
# print(answers.rdtype)
# print(answers.rdclass)
# print(answers.nameserver)
# print()
except dns.resolver.NoNameservers:
publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
print('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
publisher.debug('NoAnswer, The response did not contain an answer to the question.')
print('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
publisher.debug('SyntaxError: EmptyLabel')
print('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
#save_mxdomain_in_cache(mxdomain)
publisher.debug('The query name does not exist.')
print('The query name does not exist.')
except dns.name.LabelTooLong:
publisher.debug('The Label is too long')
print('The Label is too long')
except dns.exception.Timeout:
print('dns timeout')
#save_mxdomain_in_cache(mxdomain)
except Exception as e:
print(e)
return valid_mxdomain
def extract_all_emails(redis_key, item_content):
all_emails = re.findall(email_regex, item_content)
if len(all_emails) > 1:
r_serv_cache.sadd(redis_key, *all_emails)
r_serv_cache.expire(redis_key, 360)
elif all_emails:
r_serv_cache.sadd(redis_key, all_emails[0])
r_serv_cache.expire(redis_key, 360)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Mail'
faup = Faup()
p = Process(config_section)
publisher.info("Mails module started")
# Numbers of Mails needed to Tags
mail_threshold = 10
max_execution_time = 30
email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4()))
while True:
message = p.get_from_set()
if message is not None:
item_id, score = message.split()
print(item_id)
item_content = Item.get_item_content(item_id)
proc = Proc(target=extract_all_emails, args=(redis_key, item_content, ))
try:
proc.start()
proc.join(max_execution_time)
if proc.is_alive():
proc.terminate()
p.incr_module_timeout_statistic()
err_mess = "Mails: processing timeout: {}".format(item_id)
print(err_mess)
publisher.info(err_mess)
continue
else:
all_emails = r_serv_cache.smembers(redis_key)
r_serv_cache.delete(redis_key)
proc.terminate()
except KeyboardInterrupt:
print("Caught KeyboardInterrupt, terminating workers")
proc.terminate()
sys.exit(0)
# get MXdomains
set_mxdomains = set()
dict_mxdomains_email = {}
for email in all_emails:
mxdomain = email.split('@')[1].lower()
if not mxdomain in dict_mxdomains_email:
dict_mxdomains_email[mxdomain] = []
set_mxdomains.add(mxdomain)
dict_mxdomains_email[mxdomain].append(email)
## TODO: add MAIL trackers
valid_mx = check_mx_record(set_mxdomains, dns_server)
item_date = Item.get_item_date(item_id)
num_valid_email = 0
for domain_mx in valid_mx:
num_valid_email += len(dict_mxdomains_email[domain_mx])
for email in dict_mxdomains_email[domain_mx]:
msg = 'mail;{};{};{}'.format(1, email, item_date)
p.populate_set_out(msg, 'ModuleStats')
# Create country stats
faup.decode(email)
tld = faup.get()['tld']
try:
tld = tld.decode()
except:
pass
server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1)
msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date, Item.get_item_basename(item_id), num_valid_email, item_id)
if num_valid_email > mail_threshold:
print('{} Checked {} e-mail(s)'.format(item_id, num_valid_email))
publisher.warning(msg)
#Send to duplicate
p.populate_set_out(item_id, 'Duplicate')
#tags
msg = 'infoleak:automatic-detection="mail";{}'.format(item_id)
p.populate_set_out(msg, 'Tags')
else:
publisher.info(msg)
else:
time.sleep(10)

View file

@ -1,156 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module makes statistics for some modules and providers
"""
##################################
# Import External packages #
##################################
import time
import datetime
import redis
import os
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from modules.abstract_module import AbstractModule
from packages.Date import Date
from packages import Paste
import ConfigLoader
class ModuleStats(AbstractModule):
"""
Module Statistics module for AIL framework
"""
# Config Var
MAX_SET_CARDINALITY = 8
def __init__(self):
super(ModuleStats, self).__init__()
# Waiting time in secondes between to message proccessed
self.pending_seconds = 20
# Sent to the logging a description of the module
self.redis_logger.info("Makes statistics about valid URL")
# REDIS #
self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending")
def compute(self, message):
if len(message.split(';')) > 1:
self.compute_most_posted(message)
else:
self.compute_provider_info(message)
def get_date_range(self, num_day):
curr_date = datetime.date.today()
date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
date_list = []
for i in range(0, num_day+1):
date_list.append(date.substract_day(i))
return date_list
def compute_most_posted(self, message):
module, num, keyword, paste_date = message.split(';')
redis_progression_name_set = 'top_'+ module +'_set_' + paste_date
# Add/Update in Redis
self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num))
# Compute Most Posted
date = self.get_date_range(0)[0]
# check if this keyword is eligible for progression
keyword_total_sum = 0
curr_value = self.r_serv_trend.hget(date, module+'-'+keyword)
keyword_total_sum += int(curr_value) if curr_value is not None else 0
if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY:
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
else: # not in set
member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < keyword_total_sum:
#remove min from set and add the new one
self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0])
self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
self.redis_logger.debug(redis_progression_name_set)
def compute_provider_info(self, message):
redis_all_provider = 'all_provider_set'
paste = Paste.Paste(message)
paste_baseName = paste.p_name.split('.')[0]
paste_size = paste._get_p_size()
paste_provider = paste.p_source
paste_date = str(paste._get_p_date())
redis_sum_size_set = 'top_size_set_' + paste_date
redis_avg_size_name_set = 'top_avg_size_set_' + paste_date
redis_providers_name_set = 'providers_set_' + paste_date
# Add/Update in Redis
self.r_serv_trend.sadd(redis_all_provider, paste_provider)
num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1))
sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size))
new_avg = float(sum_size) / float(num_paste)
self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg)
#
# Compute Most Posted
#
# Size
if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil":
self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider)
self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
else: #set full capacity
member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if float(member_set[0][1]) < new_avg:
#remove min from set and add the new one
self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0])
self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider)
self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0])
self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
# Num
# if set not full or provider already present
if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil":
self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
else: #set at full capacity
member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < num_paste:
#remove min from set and add the new one
self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
self.r_serv_trend.zrem(member_set[0][0])
self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
if __name__ == '__main__':
module = ModuleStats()
module.run()

View file

@ -10,9 +10,133 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
config_loader = ConfigLoader.ConfigLoader()
r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics")
r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
#r_serv_trend = ConfigLoader().get_redis_conn("ARDB_Trending")
config_loader = None
PIE_CHART_MAX_CARDINALITY = 8
def incr_module_timeout_statistic(module_name):
curr_date = datetime.date.today()
r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
r_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
def create_item_statistics(item_id, source, size):
pass
def get_item_sources():
return r_statistics.smembers('all_provider_set')
def get_nb_items_processed_by_day_and_source(date, source):
nb_items = r_statistics.hget(f'{source}_num', date)
if not nb_items:
nb_items = 0
return int(nb_items)
def get_items_total_size_by_day_and_source(date, source):
total_size = r_statistics.hget(f'{source}_size', date)
if not total_size:
total_size = 0
return float(total_size)
def get_items_av_size_by_day_and_source(date, source):
av_size = r_statistics.hget(f'{source}_avg', date)
if not av_size:
av_size = 0
return float(av_size)
def _create_item_stats_size_nb(date, source, num, size, avg):
r_statistics.hset(f'{source}_num', date, num)
r_statistics.hset(f'{source}_size', date, size)
r_statistics.hset(f'{source}_avg', date, avg)
def get_item_stats_size_avg_by_date(date):
return r_statistics.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
def get_item_stats_nb_by_date(date):
return r_statistics.zrange(f'providers_set_{date}', 0, -1, withscores=True)
def _set_item_stats_nb_by_date(date, source):
return r_statistics.zrange(f'providers_set_{date}', )
# # TODO: load ZSET IN CACHE => FAST UPDATE
def update_item_stats_size_nb(item_id, source, size, date):
# Add/Update in Redis
r_statistics.sadd('all_provider_set', source)
nb_items = int(r_statistics.hincrby(f'{source}_num', date, 1))
sum_size = float(r_statistics.hincrbyfloat(f'{source}_size', date, size))
new_avg = sum_size / nb_items
r_statistics.hset(f'{source}_avg', date, new_avg)
# TOP Items Size
if r_statistics.zcard(f'top_avg_size_set_{date}') < PIE_CHART_MAX_CARDINALITY:
r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
else:
member_set = r_statistics.zrangebyscore(f'top_avg_size_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if float(member_set[0][1]) < new_avg:
# remove min from set and add the new one
r_statistics.zrem(f'top_avg_size_set_{date}', member_set[0][0])
r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
# TOP Nb Items
if r_statistics.zcard(f'providers_set_{date}') < PIE_CHART_MAX_CARDINALITY or r_statistics.zscore(f'providers_set_{date}', source) is not None:
r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
else: # zset at full capacity
member_set = r_statistics.zrangebyscore(f'providers_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < nb_items:
# remove min from set and add the new one
r_statistics.zrem(f'providers_set_{date}', member_set[0][0])
r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
# keyword num
def _add_module_stats(module_name, total_sum, keyword, date):
r_statistics.zadd(f'top_{module_name}_set_{date}', float(total_sum), keyword)
# # TODO: ONE HSET BY MODULE / CUSTOM STATS
def update_module_stats(module_name, num, keyword, date):
# Add/Update in Redis
r_statistics.hincrby(date, f'{module_name}-{keyword}', int(num)) # # TODO: RENAME ME !!!!!!!!!!!!!!!!!!!!!!!!!
# Compute Most Posted
# check if this keyword is eligible for progression
keyword_total_sum = 0
curr_value = r_statistics.hget(date, f'{module_name}-{keyword}')
keyword_total_sum += int(curr_value) if curr_value is not None else 0
if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY:
r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
else: # zset at full capacity
member_set = r_statistics.zrangebyscore(f'top_{module_name}_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
# Member set is a list of (value, score) pairs
if int(member_set[0][1]) < keyword_total_sum:
#remove min from set and add the new one
r_statistics.zrem(f'top_{module_name}_set_{date}', member_set[0][0])
r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
def get_module_tld_stats_by_tld_date(date, tld):
nb_tld = r_statistics.hget(f'credential_by_tld:{date}', tld)
if not nb_tld:
nb_tld = 0
return int(nb_tld)
def get_module_tld_stats_by_date(module, date):
return r_statistics.hgetall(f'{module}_by_tld:{date}')
def add_module_tld_stats_by_date(module, date, tld, nb):
r_statistics.hincrby(f'{module}_by_tld:{date}', tld, int(nb))
def get_iban_country_stats_by_date(date):
return r_statistics.hgetall(f'iban_by_country:{date}')
def add_iban_country_stats_by_date(date, tld, nb):
r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))
# r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
# r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
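Taken together, the module-facing surface added to lib/Statistics.py boils down to two update helpers plus per-day getters. A short usage sketch; the item id, source, date and keyword below are invented examples:

from lib import Statistics

date = '20220908'  # example date

# Item stats, as ModuleStats.py now records once per processed item
Statistics.update_item_stats_size_nb('submitted/2022/09/08/example.gz',  # hypothetical item id
                                     'submitted', 1024.0, date)
print(Statistics.get_nb_items_processed_by_day_and_source(date, 'submitted'))
print(Statistics.get_items_av_size_by_day_and_source(date, 'submitted'))

# Module stats: per-keyword top counters and per-TLD counters
Statistics.update_module_stats('mail', 3, 'example.com', date)
Statistics.add_module_tld_stats_by_date('credential', date, 'com', 5)
print(Statistics.get_module_tld_stats_by_date('credential', date))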

View file

@ -616,6 +616,12 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
else:
return ({'error':'invalid input'}, 400)
def get_domains_blacklist(domain_type):
return r_serv_onion.smembers(f'blacklist_{domain_type}')
def add_domain_blacklist(domain_type, domain):
r_serv_onion.sadd(f'blacklist_{domain_type}', domain)
##-- CRAWLER GLOBAL --##
#### AUTOMATIC CRAWLER ####
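These two blacklist helpers are what crawler_migration() in the migration script above relies on. A trivial usage sketch (the domain is a placeholder):

add_domain_blacklist('onion', 'example.onion')   # placeholder domain
print(get_domains_blacklist('onion'))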

View file

@ -42,6 +42,7 @@ from modules.abstract_module import AbstractModule
from packages.Item import Item
from lib import ConfigLoader
from lib import regex_helper
from lib import Statistics
class Credential(AbstractModule):
@ -96,6 +97,7 @@ class Credential(AbstractModule):
item_content = item.get_content()
# TODO: USE SETS
# Extract all credentials
all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time)
@ -117,9 +119,6 @@ class Credential(AbstractModule):
print(f"========> Found more than 10 credentials in this file : {item.get_id()}")
self.redis_logger.warning(to_print)
# Send to duplicate
self.send_message_to_queue(item.get_id(), 'Duplicate')
msg = f'infoleak:automatic-detection="credential";{item.get_id()}'
self.send_message_to_queue(msg, 'Tags')
@ -158,6 +157,7 @@ class Credential(AbstractModule):
print(f"=======> Probably on : {discovered_sites}")
date = datetime.now().strftime("%Y%m")
nb_tlds = {}
for cred in all_credentials:
maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
self.faup.decode(maildomains)
@ -167,7 +167,9 @@ class Credential(AbstractModule):
tld = tld.decode()
except:
pass
self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
nb_tlds[tld] = nb_tlds.get(tld, 0) + 1
for tld in nb_tlds:
Statistics.add_module_tld_stats_by_date('credential', date, tld, nb_tlds[tld])
else:
self.redis_logger.info(to_print)
print(f'found {nb_cred} credentials')

View file

@ -75,9 +75,6 @@ class CreditCards(AbstractModule):
if (len(creditcard_set) > 0):
self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}')
#Send to duplicate
self.send_message_to_queue(item.get_id(), 'Duplicate')
msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}'
self.send_message_to_queue(msg, 'Tags')

177 bin/modules/Mail.py Executable file
View file

@ -0,0 +1,177 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Mails Module
======================
This module consumes the Redis list created by the Categ module.
It applies mail regexes to item content and raises a warning when the number of valid e-mails exceeds a threshold.
"""
import os
import re
import redis
import sys
import time
import datetime
import dns.resolver
import dns.exception
from pyfaup.faup import Faup
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from modules.abstract_module import AbstractModule
from lib.objects.Items import Item
from lib.ConfigLoader import ConfigLoader
from lib import Statistics
class Mail(AbstractModule):
"""
Mail module for the AIL framework
"""
def __init__(self):
super(Mail, self).__init__()
config_loader = ConfigLoader()
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
self.dns_server = config_loader.get_config_str('Mail', 'dns')
self.faup = Faup()
# Number of mails needed before tagging
self.mail_threshold = 10
self.regex_timeout = 30
self.email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
re.compile(self.email_regex)
def is_mxdomain_in_cache(self, mxdomain):
return self.r_cache.exists(f'mxdomain:{mxdomain}')
def save_mxdomain_in_cache(self, mxdomain):
self.r_cache.setex(f'mxdomain:{mxdomain}', 1, datetime.timedelta(days=1))
def check_mx_record(self, set_mxdomains):
"""Check if emails MX domains are responding.
:param adress_set: -- (set) This is a set of emails domains
:return: (int) Number of adress with a responding and valid MX domains
"""
resolver = dns.resolver.Resolver()
resolver.nameservers = [self.dns_server]
resolver.timeout = 5.0
resolver.lifetime = 2.0
valid_mxdomain = []
for mxdomain in set_mxdomains:
# check if is in cache
# # TODO:
if self.is_mxdomain_in_cache(mxdomain):
valid_mxdomain.append(mxdomain)
else:
# DNS resolution
try:
answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
if answers:
self.save_mxdomain_in_cache(mxdomain)
valid_mxdomain.append(mxdomain)
# DEBUG
# print('---')
# print(answers.response)
# print(answers.qname)
# print(answers.rdtype)
# print(answers.rdclass)
# print(answers.nameserver)
# print()
except dns.resolver.NoNameservers:
self.redis_logger.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
print('NoNameserver, No non-broken nameservers are available to answer the query.')
except dns.resolver.NoAnswer:
self.redis_logger.debug('NoAnswer, The response did not contain an answer to the question.')
print('NoAnswer, The response did not contain an answer to the question.')
except dns.name.EmptyLabel:
self.redis_logger.debug('SyntaxError: EmptyLabel')
print('SyntaxError: EmptyLabel')
except dns.resolver.NXDOMAIN:
#save_mxdomain_in_cache(mxdomain)
self.redis_logger.debug('The query name does not exist.')
print('The query name does not exist.')
except dns.name.LabelTooLong:
self.redis_logger.debug('The Label is too long')
print('The Label is too long')
except dns.exception.Timeout:
print('dns timeout')
#save_mxdomain_in_cache(mxdomain)
except Exception as e:
print(e)
return valid_mxdomain
# # TODO: sanitize mails
def compute(self, message):
item_id, score = message.split()
item = Item(item_id)
item_date = item.get_date()
mails = self.regex_findall(self.email_regex, item_id, item.get_content())
mxdomains_email = {}
for mail in mails:
mxdomain = mail.rsplit('@', 1)[1].lower()
if not mxdomain in mxdomains_email:
mxdomains_email[mxdomain] = set()
mxdomains_email[mxdomain].add(mail)
## TODO: add MAIL trackers
valid_mx = self.check_mx_record(mxdomains_email.keys())
print(f'valid_mx: {valid_mx}')
mx_tlds = {}
num_valid_email = 0
for domain_mx in valid_mx:
nb_mails = len(mxdomains_email[domain_mx])
num_valid_email += nb_mails
# Create domain mail stats
msg = f'mail;{nb_mails};{domain_mx};{item_date}'
self.send_message_to_queue(msg, 'ModuleStats')
# Create country stats
self.faup.decode(domain_mx)
tld = self.faup.get()['tld']
try:
tld = tld.decode()
except:
pass
mx_tlds[tld] = mx_tlds.get(tld, 0) + nb_mails
for tld in mx_tlds:
Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld])
msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}'
if num_valid_email > self.mail_threshold:
print(f'{item_id} Checked {num_valid_email} e-mail(s)')
self.redis_logger.warning(msg)
# Tags
msg = f'infoleak:automatic-detection="mail";{item_id}'
self.send_message_to_queue(msg, 'Tags')
else:
self.redis_logger.info(msg)
if __name__ == '__main__':
module = Mail()
#module.compute('tests/2021/01/01/mails.gz 50')
module.run()
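For a quick standalone exercise of the MX-resolution path outside the queue, the module can be instantiated and check_mx_record called directly. The domains below are placeholders and results depend on the DNS server configured in the Mail section:

mail_module = Mail()
# resolves MX records through the configured resolver; positive hits are cached for one day
valid = mail_module.check_mx_record({'gmail.com', 'no-such-mx-domain.invalid'})
print(valid)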

54 bin/modules/ModuleStats.py Executable file
View file

@ -0,0 +1,54 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This module computes statistics for some modules and item providers
"""
##################################
# Import External packages #
##################################
import os
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages #
##################################
from modules.abstract_module import AbstractModule
from lib.objects.Items import Item
from lib import Statistics
class ModuleStats(AbstractModule):
"""
ModuleStats module for the AIL framework
"""
def __init__(self):
super(ModuleStats, self).__init__()
# Waiting time in seconds between two processed messages
self.pending_seconds = 20
def compute(self, message):
# MODULE STATS
if len(message.split(';')) > 1:
module_name, num, keyword, date = message.split(';')
Statistics.update_module_stats(module_name, num, keyword, date)
# ITEM STATS
else:
item_id = message
item = Item(item_id)
source = item.get_source()
date = item.get_date()
size = item.get_size()
Statistics.update_item_stats_size_nb(item_id, source, size, date)
if __name__ == '__main__':
module = ModuleStats()
module.run()
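The two branches of compute() correspond to the two message shapes the module consumes: a semicolon-separated keyword-stats tuple such as the one Mail.py sends, or a bare item id for provider/size stats. A sketch with invented values:

stats_module = ModuleStats()

# keyword statistics: 'module;num;keyword;date' (example values)
stats_module.compute('mail;3;example.com;20220908')

# provider/item statistics: the message is just an item id (hypothetical path)
stats_module.compute('submitted/2022/09/08/example.gz')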

View file

@ -66,7 +66,7 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R
[CreditCards]
subscribe = Redis_CreditCards
publish = Redis_ModuleStats,Redis_Tags
publish = Redis_Tags
[BankAccount]
subscribe = Redis_Global