Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-23 06:37:15 +00:00)

Commit aa6ba61050 (parent d27d47dc70)
chg: [statistics] ARDB migration

13 changed files with 482 additions and 420 deletions
@@ -16,6 +16,13 @@ import re
import string
from itertools import chain

+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages        #
+##################################
+from lib import Statistics
+
from packages import Item
from pubsublogger import publisher

@@ -48,6 +55,7 @@ def is_valid_iban(iban):
        return True
    return False

+# # TODO: SET
def check_all_iban(l_iban, obj_id):
    nb_valid_iban = 0
    for iban in l_iban:

@@ -61,7 +69,8 @@ def check_all_iban(l_iban, obj_id):
        if is_valid_iban(iban):
            print('------')
            nb_valid_iban = nb_valid_iban + 1
-            server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)
+            Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1)
+
    if(nb_valid_iban > 0):
        to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id))

@@ -70,9 +79,6 @@ def check_all_iban(l_iban, obj_id):
        msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
        p.populate_set_out(msg, 'Tags')

-        #Send to duplicate
-        p.populate_set_out(obj_id, 'Duplicate')
-
if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

@@ -82,13 +88,6 @@ if __name__ == "__main__":
    p = Process(config_section)
    max_execution_time = p.config.getint("BankAccount", "max_execution_time")

-    # ARDB #
-    server_statistics = redis.StrictRedis(
-        host=p.config.get("ARDB_Statistics", "host"),
-        port=p.config.getint("ARDB_Statistics", "port"),
-        db=p.config.getint("ARDB_Statistics", "db"),
-        decode_responses=True)
-
    publisher.info("BankAccount started")

    #iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
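Note: the hunks above (the module using the "BankAccount" config section) drop the module's private ARDB_Statistics connection and route the IBAN country counter through the new Statistics helper added later in this commit. A minimal sketch of the before/after call pattern, assuming a `date` string in `YYYYMMDD` form and the two-letter country prefix of the IBAN:

```python
# Sketch only: the helper added in bin/lib/Statistics.py in this commit wraps
# the same hash increment that the module previously issued directly.
def add_iban_country_stats_by_date(date, tld, nb):
    # e.g. 'iban_by_country:20220101' -> {'DE': 3, 'FR': 1, ...}
    r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))

# Old module code:  server_statistics.hincrby('iban_by_country:' + date, iban[0:2], 1)
# New module code:  Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1)
```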
@@ -314,14 +314,6 @@ if __name__ == '__main__':

    print('splash url: {}'.format(splash_url))

-    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
-
-    r_serv_metadata = redis.StrictRedis(
-        host=p.config.get("ARDB_Metadata", "host"),
-        port=p.config.getint("ARDB_Metadata", "port"),
-        db=p.config.getint("ARDB_Metadata", "db"),
-        decode_responses=True)
-
    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
@@ -15,6 +15,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
+from lib import Statistics
from lib import Tag
from lib import Users
from lib.objects import Decodeds

@@ -35,6 +36,8 @@ r_serv_tracker = config_loader.get_redis_conn("ARDB_Tracker")
r_serv_tags = config_loader.get_redis_conn("ARDB_Tags")
r_crawler = config_loader.get_redis_conn("ARDB_Onion")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
+r_serv_trend = config_loader.get_redis_conn("ARDB_Trending")
+r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
config_loader = None
# # - - CONFIGS - - # #
@@ -358,9 +361,16 @@ def items_migration():
def get_last_crawled_domains(domain_type):
    return r_crawler.lrange(f'last_{domain_type}', 0 ,-1)

+def get_domains_blacklist(domain_type):
+    return r_crawler.smembers(f'blacklist_{domain_type}')
+
def crawler_migration():
    print('CRAWLER MIGRATION...')

+    for domain_type in ['onion', 'regular']:
+        for domain in get_domains_blacklist(domain_type):
+            crawlers.add_domain_blacklist(domain_type, domain)
+
    # for domain_type in ['onion', 'regular']:
    #     for row in get_last_crawled_domains(domain_type):
    #         dom_row, epoch = row.rsplit(';', 1)
@@ -368,19 +378,18 @@ def crawler_migration():
    #         print(domain, port, epoch)
    #         #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch)

-    for cookiejar_uuid in old_crawlers.get_all_cookiejar():
-        meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
-        #print(meta)
+    # for cookiejar_uuid in old_crawlers.get_all_cookiejar():
+    #     meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True)
+    #     #print(meta)
    #     crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid)
    #     crawlers._set_cookiejar_date(meta['date'])
-        for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
-            print(cookie_uuid)
+    #
+    #     for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True):
+    #         print(cookie_uuid)
    #         crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid)

    # TODO: auto crawler -> to Fix / change

    # TODO: crawlers queues

    ###############################
@@ -689,12 +698,89 @@ def subtypes_obj_migration():
    #
    # Credential:
    #     HSET 'credential_by_tld:'+date, tld, 1

+def get_all_provider():
+    return r_serv_trend.smembers('all_provider_set')
+
+def get_item_source_stats_by_date(date, source):
+    stats = {}
+    stats['num'] = r_serv_trend.hget(f'{source}_num', date)
+    stats['size'] = r_serv_trend.hget(f'{source}_size', date)
+    stats['avg'] = r_serv_trend.hget(f'{source}_avg', date)
+    return stats
+
+def get_item_stats_size_avg_by_date(date):
+    return r_serv_trend.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
+
+def get_item_stats_nb_by_date(date):
+    return r_serv_trend.zrange(f'providers_set_{date}', 0, -1, withscores=True)
+
+def get_top_stats_module(module_name, date):
+    return r_serv_trend.zrange(f'top_{module_name}_set_{date}', 0, -1, withscores=True)
+
+def get_module_tld_stats_by_date(module, date):
+    return r_statistics.hgetall(f'{module}_by_tld:{date}')
+
def statistics_migration():
+    # paste_by_modules_timeout
+
+    # Date full history => lot of keys
+
+    # top_size_set_{date}
+    # top_avg_size_set_{date}
+
+    # 'providers_set_{date}
+
+    sources = get_all_provider()
+    for date in Date.get_date_range_today('20180101'):
+
+        size_avg = get_item_stats_size_avg_by_date(date)
+
+        nb_items = get_item_stats_nb_by_date(date)
+
+        # top_size_set_{date}
+        # top_avg_size_set_{date}
+
+        # 'providers_set_{date}
+
+        # ITEM STATS
+        for source in sources:
+            source_stat = get_item_source_stats_by_date(date, source)
+            Statistics._create_item_stats_size_nb(date, source, source_stat['num'], source_stat['size'], source_stat['avg'])
+
+        # # MODULE STATS
+        # for module in ['credential', 'mail', 'SQLInjection']:
+        #     stats = get_module_tld_stats_by_date(module, date)
+        #     for tld in stats:
+        #         if tld:
+        #             print(module, date, tld, stats[tld])
+        #             Statistics.add_module_tld_stats_by_date(module, date, tld, stats[tld])
+        # for module in ['credential']:
+        #     # TOP STATS
+        #     top_module = get_top_stats_module(module, date)
+        #     for keyword, total_sum in top_module:
+        #         print(date, module, keyword, total_sum)
+        #         #Statistics._add_module_stats(module, total_sum, keyword, date)
+
    pass


if __name__ == '__main__':

-    core_migration()
+    #core_migration()
    # user_migration()
    # tags_migration()
    #items_migration()

@@ -706,6 +792,7 @@ if __name__ == '__main__':
    # ail_2_ail_migration()
    # trackers_migration()
    # investigations_migration()
+    statistics_migration()
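To make the re-keying in statistics_migration() concrete, here is a hedged sketch of the data movement for one source on one day. The key names come from the old ModuleStats code and the new Statistics helpers in this commit; the literal values are invented for illustration.

```python
# Illustration only: values are made up, key names are the ones used above.
date, source = '20220101', 'crawled'

# Old layout (ARDB_Trending): per-source hashes keyed by day.
#   HGET crawled_num  20220101  -> '42'     (items processed)
#   HGET crawled_size 20220101  -> '1337.0' (total size)
#   HGET crawled_avg  20220101  -> '31.8'   (average size)
stats = get_item_source_stats_by_date(date, source)

# New layout (ARDB_Statistics): the same three per-source hashes,
# written in one call by the helper added in this commit.
Statistics._create_item_stats_size_nb(date, source,
                                      stats['num'], stats['size'], stats['avg'])
```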
@@ -217,8 +217,12 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x"
    sleep 0.1
+    screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x"
+    sleep 0.1
    # screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x"
    # sleep 0.1
+    screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x"
+    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x"
    sleep 0.1

@@ -267,8 +271,6 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x"
    sleep 0.1
-    screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Mail.py; read x"
-    sleep 0.1
    screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./PgpDump.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Cryptocurrency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cryptocurrencies.py; read x"

@@ -277,8 +279,6 @@ function launching_scripts {
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x"
    sleep 0.1
-    screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./ModuleStats.py; read x"
-    sleep 0.1
    screen -S "Script_AIL" -X screen -t "MISPtheHIVEfeeder" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./MISP_The_Hive_feeder.py; read x"
    sleep 0.1
    screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x"
bin/Mail.py (220 lines deleted)

@@ -1,220 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-
-"""
-The Mails Module
-======================
-
-This module is consuming the Redis-list created by the Categ module.
-
-It apply mail regexes on item content and warn if above a threshold.
-
-"""
-
-import os
-import re
-import sys
-import uuid
-import redis
-import time
-import datetime
-
-import dns.resolver
-import dns.exception
-
-from multiprocessing import Process as Proc
-
-from pubsublogger import publisher
-from Helper import Process
-
-from pyfaup.faup import Faup
-
-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
-import Item
-
-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
-import ConfigLoader
-
-## LOAD CONFIG ##
-config_loader = ConfigLoader.ConfigLoader()
-server_statistics = config_loader.get_redis_conn("ARDB_Statistics")
-r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
-
-dns_server = config_loader.get_config_str('Mail', 'dns')
-
-config_loader = None
-## -- ##
-
-def is_mxdomain_in_cache(mxdomain):
-    return r_serv_cache.exists('mxdomain:{}'.format(mxdomain))
-
-def save_mxdomain_in_cache(mxdomain):
-    r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1))
-
-def check_mx_record(set_mxdomains, dns_server):
-    """Check if emails MX domains are responding.
-
-    :param adress_set: -- (set) This is a set of emails domains
-    :return: (int) Number of adress with a responding and valid MX domains
-
-    """
-    resolver = dns.resolver.Resolver()
-    resolver.nameservers = [dns_server]
-    resolver.timeout = 5.0
-    resolver.lifetime = 2.0
-
-    valid_mxdomain = []
-    for mxdomain in set_mxdomains:
-
-        # check if is in cache
-        # # TODO:
-        if is_mxdomain_in_cache(mxdomain):
-            valid_mxdomain.append(mxdomain)
-        else:
-
-            # DNS resolution
-            try:
-                answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
-                if answers:
-                    save_mxdomain_in_cache(mxdomain)
-                    valid_mxdomain.append(mxdomain)
-                    # DEBUG
-                    # print('---')
-                    # print(answers.response)
-                    # print(answers.qname)
-                    # print(answers.rdtype)
-                    # print(answers.rdclass)
-                    # print(answers.nameserver)
-                    # print()
-
-            except dns.resolver.NoNameservers:
-                publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
-                print('NoNameserver, No non-broken nameservers are available to answer the query.')
-            except dns.resolver.NoAnswer:
-                publisher.debug('NoAnswer, The response did not contain an answer to the question.')
-                print('NoAnswer, The response did not contain an answer to the question.')
-            except dns.name.EmptyLabel:
-                publisher.debug('SyntaxError: EmptyLabel')
-                print('SyntaxError: EmptyLabel')
-            except dns.resolver.NXDOMAIN:
-                #save_mxdomain_in_cache(mxdomain)
-                publisher.debug('The query name does not exist.')
-                print('The query name does not exist.')
-            except dns.name.LabelTooLong:
-                publisher.debug('The Label is too long')
-                print('The Label is too long')
-            except dns.exception.Timeout:
-                print('dns timeout')
-                #save_mxdomain_in_cache(mxdomain)
-            except Exception as e:
-                print(e)
-    return valid_mxdomain
-
-def extract_all_emails(redis_key, item_content):
-    all_emails = re.findall(email_regex, item_content)
-    if len(all_emails) > 1:
-        r_serv_cache.sadd(redis_key, *all_emails)
-        r_serv_cache.expire(redis_key, 360)
-    elif all_emails:
-        r_serv_cache.sadd(redis_key, all_emails[0])
-        r_serv_cache.expire(redis_key, 360)
-
-if __name__ == "__main__":
-    publisher.port = 6380
-    publisher.channel = "Script"
-
-    config_section = 'Mail'
-
-    faup = Faup()
-
-    p = Process(config_section)
-
-    publisher.info("Mails module started")
-
-    # Numbers of Mails needed to Tags
-    mail_threshold = 10
-
-    max_execution_time = 30
-
-    email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
-
-    redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4()))
-
-    while True:
-        message = p.get_from_set()
-
-        if message is not None:
-            item_id, score = message.split()
-
-            print(item_id)
-
-            item_content = Item.get_item_content(item_id)
-
-            proc = Proc(target=extract_all_emails, args=(redis_key, item_content, ))
-            try:
-                proc.start()
-                proc.join(max_execution_time)
-                if proc.is_alive():
-                    proc.terminate()
-                    p.incr_module_timeout_statistic()
-                    err_mess = "Mails: processing timeout: {}".format(item_id)
-                    print(err_mess)
-                    publisher.info(err_mess)
-                    continue
-                else:
-                    all_emails = r_serv_cache.smembers(redis_key)
-                    r_serv_cache.delete(redis_key)
-                    proc.terminate()
-            except KeyboardInterrupt:
-                print("Caught KeyboardInterrupt, terminating workers")
-                proc.terminate()
-                sys.exit(0)
-
-            # get MXdomains
-            set_mxdomains = set()
-            dict_mxdomains_email = {}
-            for email in all_emails:
-                mxdomain = email.split('@')[1].lower()
-                if not mxdomain in dict_mxdomains_email:
-                    dict_mxdomains_email[mxdomain] = []
-                    set_mxdomains.add(mxdomain)
-                dict_mxdomains_email[mxdomain].append(email)
-
-            ## TODO: add MAIL trackers
-
-            valid_mx = check_mx_record(set_mxdomains, dns_server)
-
-            item_date = Item.get_item_date(item_id)
-
-            num_valid_email = 0
-            for domain_mx in valid_mx:
-                num_valid_email += len(dict_mxdomains_email[domain_mx])
-
-                for email in dict_mxdomains_email[domain_mx]:
-                    msg = 'mail;{};{};{}'.format(1, email, item_date)
-                    p.populate_set_out(msg, 'ModuleStats')
-
-                    # Create country stats
-                    faup.decode(email)
-                    tld = faup.get()['tld']
-                    try:
-                        tld = tld.decode()
-                    except:
-                        pass
-                    server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1)
-
-            msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date, Item.get_item_basename(item_id), num_valid_email, item_id)
-
-            if num_valid_email > mail_threshold:
-                print('{} Checked {} e-mail(s)'.format(item_id, num_valid_email))
-                publisher.warning(msg)
-                #Send to duplicate
-                p.populate_set_out(item_id, 'Duplicate')
-                #tags
-                msg = 'infoleak:automatic-detection="mail";{}'.format(item_id)
-                p.populate_set_out(msg, 'Tags')
-            else:
-                publisher.info(msg)
-
-        else:
-            time.sleep(10)
@@ -1,156 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-"""
-This module makes statistics for some modules and providers
-
-"""
-
-##################################
-# Import External packages       #
-##################################
-import time
-import datetime
-import redis
-import os
-import sys
-
-sys.path.append(os.environ['AIL_BIN'])
-##################################
-# Import Project packages        #
-##################################
-from modules.abstract_module import AbstractModule
-from packages.Date import Date
-from packages import Paste
-import ConfigLoader
-
-
-class ModuleStats(AbstractModule):
-    """
-    Module Statistics module for AIL framework
-    """
-
-    # Config Var
-    MAX_SET_CARDINALITY = 8
-
-
-    def __init__(self):
-
-        super(ModuleStats, self).__init__()
-
-        # Waiting time in secondes between to message proccessed
-        self.pending_seconds = 20
-
-        # Sent to the logging a description of the module
-        self.redis_logger.info("Makes statistics about valid URL")
-
-        # REDIS #
-        self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending")
-
-    def compute(self, message):
-
-        if len(message.split(';')) > 1:
-            self.compute_most_posted(message)
-        else:
-            self.compute_provider_info(message)
-
-
-    def get_date_range(self, num_day):
-        curr_date = datetime.date.today()
-        date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
-        date_list = []
-
-        for i in range(0, num_day+1):
-            date_list.append(date.substract_day(i))
-        return date_list
-
-
-    def compute_most_posted(self, message):
-        module, num, keyword, paste_date = message.split(';')
-
-        redis_progression_name_set = 'top_'+ module +'_set_' + paste_date
-
-        # Add/Update in Redis
-        self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num))
-
-        # Compute Most Posted
-        date = self.get_date_range(0)[0]
-        # check if this keyword is eligible for progression
-        keyword_total_sum = 0
-
-        curr_value = self.r_serv_trend.hget(date, module+'-'+keyword)
-        keyword_total_sum += int(curr_value) if curr_value is not None else 0
-
-        if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY:
-            self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
-
-        else: # not in set
-            member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
-            # Member set is a list of (value, score) pairs
-            if int(member_set[0][1]) < keyword_total_sum:
-                #remove min from set and add the new one
-                self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
-                self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0])
-                self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword)
-                self.redis_logger.debug(redis_progression_name_set)
-
-
-    def compute_provider_info(self, message):
-        redis_all_provider = 'all_provider_set'
-
-        paste = Paste.Paste(message)
-
-        paste_baseName = paste.p_name.split('.')[0]
-        paste_size = paste._get_p_size()
-        paste_provider = paste.p_source
-        paste_date = str(paste._get_p_date())
-        redis_sum_size_set = 'top_size_set_' + paste_date
-        redis_avg_size_name_set = 'top_avg_size_set_' + paste_date
-        redis_providers_name_set = 'providers_set_' + paste_date
-
-        # Add/Update in Redis
-        self.r_serv_trend.sadd(redis_all_provider, paste_provider)
-
-        num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1))
-        sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size))
-        new_avg = float(sum_size) / float(num_paste)
-        self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg)
-
-        #
-        # Compute Most Posted
-        #
-
-        # Size
-        if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil":
-            self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider)
-            self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
-        else: #set full capacity
-            member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1)
-            # Member set is a list of (value, score) pairs
-            if float(member_set[0][1]) < new_avg:
-                #remove min from set and add the new one
-                self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
-                self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0])
-                self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider)
-                self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0])
-                self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider)
-
-        # Num
-        # if set not full or provider already present
-        if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil":
-            self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
-        else: #set at full capacity
-            member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1)
-            # Member set is a list of (value, score) pairs
-            if int(member_set[0][1]) < num_paste:
-                #remove min from set and add the new one
-                self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')')
-                self.r_serv_trend.zrem(member_set[0][0])
-                self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider)
-
-
-if __name__ == '__main__':
-
-    module = ModuleStats()
-    module.run()
@@ -10,9 +10,133 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader

config_loader = ConfigLoader.ConfigLoader()
-r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics")
+r_statistics = config_loader.get_redis_conn("ARDB_Statistics")
+#r_serv_trend = ConfigLoader().get_redis_conn("ARDB_Trending")
config_loader = None

+PIE_CHART_MAX_CARDINALITY = 8
+
def incr_module_timeout_statistic(module_name):
    curr_date = datetime.date.today()
-    r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
+    r_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)

+def create_item_statistics(item_id, source, size):
+    pass
+
+def get_item_sources():
+    return r_statistics.smembers('all_provider_set')
+
+def get_nb_items_processed_by_day_and_source(date, source):
+    nb_items = r_statistics.hget(f'{source}_num', date)
+    if not nb_items:
+        nb_items = 0
+    return int(nb_items)
+
+def get_items_total_size_by_day_and_source(date, source):
+    total_size = r_statistics.hget(f'{source}_size', date)
+    if not total_size:
+        total_size = 0
+    return float(total_size)
+
+def get_items_av_size_by_day_and_source(date, source):
+    av_size = r_statistics.hget(f'{source}_avg', date)
+    if not av_size:
+        av_size = 0
+    return float(av_size)
+
+def _create_item_stats_size_nb(date, source, num, size, avg):
+    r_statistics.hset(f'{source}_num', date, num)
+    r_statistics.hset(f'{source}_size', date, size)
+    r_statistics.hset(f'{source}_avg', date, avg)
+
+def get_item_stats_size_avg_by_date(date):
+    return r_statistics.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True)
+
+def get_item_stats_nb_by_date(date):
+    return r_statistics.zrange(f'providers_set_{date}', 0, -1, withscores=True)
+
+def _set_item_stats_nb_by_date(date, source):
+    return r_statistics.zrange(f'providers_set_{date}', )
+
+# # TODO: load ZSET IN CACHE => FAST UPDATE
+def update_item_stats_size_nb(item_id, source, size, date):
+    # Add/Update in Redis
+    r_statistics.sadd('all_provider_set', source)
+
+    nb_items = int(r_statistics.hincrby(f'{source}_num', date, 1))
+    sum_size = float(r_statistics.hincrbyfloat(f'{source}_size', date, size))
+    new_avg = sum_size / nb_items
+    r_statistics.hset(f'{source}_avg', date, new_avg)
+
+    # TOP Items Size
+    if r_statistics.zcard(f'top_size_set_{date}') < PIE_CHART_MAX_CARDINALITY:
+        r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
+    else:
+        member_set = r_statistics.zrangebyscore(f'top_avg_size_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
+        # Member set is a list of (value, score) pairs
+        if float(member_set[0][1]) < new_avg:
+            # remove min from set and add the new one
+            r_statistics.zrem(f'top_avg_size_set_{date}', member_set[0][0])
+            r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source)
+
+    # TOP Nb Items
+    if r_statistics.zcard(f'providers_set_{date}') < PIE_CHART_MAX_CARDINALITY or r_statistics.zscore(f'providers_set_{date}', source) != None:
+        r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
+    else: # zset at full capacity
+        member_set = r_statistics.zrangebyscore(f'providers_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
+        # Member set is a list of (value, score) pairs
+        if int(member_set[0][1]) < nb_items:
+            # remove min from set and add the new one
+            r_statistics.zrem(member_set[0][0])
+            r_statistics.zadd(f'providers_set_{date}', float(nb_items), source)
+
+    # keyword num
+
+def _add_module_stats(module_name, total_sum, keyword, date):
+    r_statistics.zadd(f'top_{module_name}_set_{date}', float(total_sum), keyword)
+
+# # TODO: ONE HSET BY MODULE / CUSTOM STATS
+def update_module_stats(module_name, num, keyword, date):
+
+    # Add/Update in Redis
+    r_statistics.hincrby(date, f'{module_name}-{keyword}', int(num)) # # TODO: RENAME ME !!!!!!!!!!!!!!!!!!!!!!!!!
+
+    # Compute Most Posted
+    # check if this keyword is eligible for progression
+    keyword_total_sum = 0
+
+    curr_value = r_statistics.hget(date, module_name+'-'+keyword)
+    keyword_total_sum += int(curr_value) if curr_value is not None else 0
+
+    if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY:
+        r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
+    else: # zset at full capacity
+        member_set = r_statistics.zrangebyscore(f'top_{module_name}_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1)
+        # Member set is a list of (value, score) pairs
+        if int(member_set[0][1]) < keyword_total_sum:
+            #remove min from set and add the new one
+            r_statistics.zrem(f'top_{module_name}_set_{date}', member_set[0][0])
+            r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword)
+
+def get_module_tld_stats_by_tld_date(date, tld):
+    nb_tld = r_statistics.hget(f'credential_by_tld:{date}', tld)
+    if not nb_tld:
+        nb_tld = 0
+    return int(nb_tld)
+
+def get_module_tld_stats_by_date(module, date):
+    return r_statistics.hgetall(f'{module}_by_tld:{date}')
+
+def add_module_tld_stats_by_date(module, date, tld, nb):
+    r_statistics.hincrby(f'{module}_by_tld:{date}', tld, int(nb))
+
+def get_iban_country_stats_by_date(date):
+    return r_statistics.hgetall(f'iban_by_country:{date}')
+
+def add_iban_country_stats_by_date(date, tld, nb):
+    r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))
+
+# r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
+# r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
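A short usage sketch of the module-facing API added above, written the way the migrated modules in this commit call it; the literal arguments (item id, sizes, dates) are examples, not values taken from the commit:

```python
# Per-item statistics, as sent by the new ModuleStats module for every item:
Statistics.update_item_stats_size_nb('submitted/2022/01/01/foo.gz',
                                     'submitted', 4096, '20220101')

# Per-module keyword statistics, e.g. 3 mails seen on one MX domain:
Statistics.update_module_stats('mail', 3, 'example.com', '20220101')

# Country/TLD counters used by the BankAccount and Credential modules:
Statistics.add_iban_country_stats_by_date('20220101', 'DE', 1)
Statistics.add_module_tld_stats_by_date('credential', '202201', 'com', 2)

# Read side, e.g. for the statistics pages:
Statistics.get_module_tld_stats_by_date('credential', '202201')  # -> {'com': '2', ...}
```

The "top" zsets (`top_avg_size_set_{date}`, `providers_set_{date}`, `top_{module}_set_{date}`) are capped at PIE_CHART_MAX_CARDINALITY = 8 by evicting the current minimum score whenever a larger one arrives, mirroring the logic of the old ModuleStats module.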
@@ -616,6 +616,12 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
    else:
        return ({'error':'invalid input'}, 400)

+def get_domains_blacklist(domain_type):
+    return r_serv_onion.smembers(f'blacklist_{domain_type}')
+
+def add_domain_blacklist(domain_type, domain):
+    r_serv_onion.sadd(f'blacklist_{domain_type}', domain)
+
##-- CRAWLER GLOBAL --##

#### AUTOMATIC CRAWLER ####
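These two helpers are what crawler_migration() (earlier in this commit) uses to carry the onion/regular blacklists over. A hedged usage sketch; the import path and the sample domain are assumptions for illustration:

```python
from lib import crawlers  # import path assumed; sketch only

# Re-add a blacklisted onion domain in the new crawler store (SADD under
# 'blacklist_onion')...
crawlers.add_domain_blacklist('onion', 'example2471ab3.onion')

# ...and read the full blacklist back as a set of domain strings.
blacklist = crawlers.get_domains_blacklist('onion')
```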
@@ -42,6 +42,7 @@ from modules.abstract_module import AbstractModule
from packages.Item import Item
from lib import ConfigLoader
from lib import regex_helper
+from lib import Statistics


class Credential(AbstractModule):

@@ -96,6 +97,7 @@ class Credential(AbstractModule):

        item_content = item.get_content()

+        # TODO: USE SETS
        # Extract all credentials
        all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time)

@@ -117,9 +119,6 @@ class Credential(AbstractModule):
            print(f"========> Found more than 10 credentials in this file : {item.get_id()}")
            self.redis_logger.warning(to_print)

-            # Send to duplicate
-            self.send_message_to_queue(item.get_id(), 'Duplicate')
-
            msg = f'infoleak:automatic-detection="credential";{item.get_id()}'
            self.send_message_to_queue(msg, 'Tags')

@@ -158,6 +157,7 @@ class Credential(AbstractModule):
            print(f"=======> Probably on : {discovered_sites}")

            date = datetime.now().strftime("%Y%m")
+            nb_tlds = {}
            for cred in all_credentials:
                maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
                self.faup.decode(maildomains)

@@ -167,7 +167,9 @@ class Credential(AbstractModule):
                    tld = tld.decode()
                except:
                    pass
-                self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
+                nb_tlds[tld] = nb_tlds.get(tld, 0) + 1
+            for tld in nb_tlds:
+                Statistics.add_module_tld_stats_by_date('credential', date, tld, nb_tlds[tld])
        else:
            self.redis_logger.info(to_print)
            print(f'found {nb_cred} credentials')
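The Credential change above replaces one HINCRBY per credential with a local counter that is flushed once per distinct TLD. A minimal sketch of that batching pattern, with hypothetical TLD values:

```python
# Count locally, then write one increment per distinct TLD.
nb_tlds = {}
for tld in ['com', 'com', 'fr']:   # e.g. TLDs extracted from credentials
    nb_tlds[tld] = nb_tlds.get(tld, 0) + 1

for tld, nb in nb_tlds.items():
    # one Redis write per TLD instead of one per credential
    Statistics.add_module_tld_stats_by_date('credential', date, tld, nb)
```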
@@ -75,9 +75,6 @@ class CreditCards(AbstractModule):
        if (len(creditcard_set) > 0):
            self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}')

-            #Send to duplicate
-            self.send_message_to_queue(item.get_id(), 'Duplicate')
-
            msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}'
            self.send_message_to_queue(msg, 'Tags')
bin/modules/Mail.py (new executable file, 177 lines)

@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+"""
+The Mails Module
+======================
+
+This module is consuming the Redis-list created by the Categ module.
+
+It apply mail regexes on item content and warn if above a threshold.
+
+"""
+
+import os
+import re
+import redis
+import sys
+import time
+import datetime
+
+import dns.resolver
+import dns.exception
+
+from pyfaup.faup import Faup
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages        #
+##################################
+from modules.abstract_module import AbstractModule
+from lib.objects.Items import Item
+from lib.ConfigLoader import ConfigLoader
+from lib import Statistics
+
+
+class Mail(AbstractModule):
+    """
+    Module Mail module for AIL framework
+    """
+
+    def __init__(self):
+        super(Mail, self).__init__()
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.dns_server = config_loader.get_config_str('Mail', 'dns')
+
+        self.faup = Faup()
+
+        # Numbers of Mails needed to Tags
+        self.mail_threshold = 10
+
+        self.regex_timeout = 30
+        self.email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"
+        re.compile(self.email_regex)
+
+    def is_mxdomain_in_cache(self, mxdomain):
+        return self.r_cache.exists(f'mxdomain:{mxdomain}')
+
+    def save_mxdomain_in_cache(self, mxdomain):
+        self.r_cache.setex(f'mxdomain:{mxdomain}', 1, datetime.timedelta(days=1))
+
+    def check_mx_record(self, set_mxdomains):
+        """Check if emails MX domains are responding.
+
+        :param adress_set: -- (set) This is a set of emails domains
+        :return: (int) Number of adress with a responding and valid MX domains
+
+        """
+        resolver = dns.resolver.Resolver()
+        resolver.nameservers = [self.dns_server]
+        resolver.timeout = 5.0
+        resolver.lifetime = 2.0
+
+        valid_mxdomain = []
+        for mxdomain in set_mxdomains:
+
+            # check if is in cache
+            # # TODO:
+            if self.is_mxdomain_in_cache(mxdomain):
+                valid_mxdomain.append(mxdomain)
+            else:
+
+                # DNS resolution
+                try:
+                    answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX)
+                    if answers:
+                        self.save_mxdomain_in_cache(mxdomain)
+                        valid_mxdomain.append(mxdomain)
+                        # DEBUG
+                        # print('---')
+                        # print(answers.response)
+                        # print(answers.qname)
+                        # print(answers.rdtype)
+                        # print(answers.rdclass)
+                        # print(answers.nameserver)
+                        # print()
+
+                except dns.resolver.NoNameservers:
+                    self.redis_logger.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
+                    print('NoNameserver, No non-broken nameservers are available to answer the query.')
+                except dns.resolver.NoAnswer:
+                    self.redis_logger.debug('NoAnswer, The response did not contain an answer to the question.')
+                    print('NoAnswer, The response did not contain an answer to the question.')
+                except dns.name.EmptyLabel:
+                    self.redis_logger.debug('SyntaxError: EmptyLabel')
+                    print('SyntaxError: EmptyLabel')
+                except dns.resolver.NXDOMAIN:
+                    #save_mxdomain_in_cache(mxdomain)
+                    self.redis_logger.debug('The query name does not exist.')
+                    print('The query name does not exist.')
+                except dns.name.LabelTooLong:
+                    self.redis_logger.debug('The Label is too long')
+                    print('The Label is too long')
+                except dns.exception.Timeout:
+                    print('dns timeout')
+                    #save_mxdomain_in_cache(mxdomain)
+                except Exception as e:
+                    print(e)
+        return valid_mxdomain
+
+    # # TODO: sanityze mails
+    def compute(self, message):
+        item_id, score = message.split()
+        item = Item(item_id)
+        item_date = item.get_date()
+
+        mails = self.regex_findall(self.email_regex, item_id, item.get_content())
+        mxdomains_email = {}
+        for mail in mails:
+            mxdomain = mail.rsplit('@', 1)[1].lower()
+            if not mxdomain in mxdomains_email:
+                mxdomains_email[mxdomain] = set()
+            mxdomains_email[mxdomain].add(mail)
+
+        ## TODO: add MAIL trackers
+
+        valid_mx = self.check_mx_record(mxdomains_email.keys())
+        print(f'valid_mx: {valid_mx}')
+        mx_tlds = {}
+        num_valid_email = 0
+        for domain_mx in valid_mx:
+            nb_mails = len(mxdomains_email[domain_mx])
+            num_valid_email += nb_mails
+
+            # Create doamin_mail stats
+            msg = f'mail;{nb_mails};{domain_mx};{item_date}'
+            self.send_message_to_queue(msg, 'ModuleStats')
+
+            # Create country stats
+            self.faup.decode(domain_mx)
+            tld = self.faup.get()['tld']
+            try:
+                tld = tld.decode()
+            except:
+                pass
+            mx_tlds[tld] = mx_tlds.get(tld, 0) + nb_mails
+        for tld in mx_tlds:
+            Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld])
+
+        if num_valid_email > self.mail_threshold:
+            msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}'
+            print(f'{item_id} Checked {num_valid_email} e-mail(s)')
+            self.redis_logger.warning(msg)
+            # Tags
+            msg = f'infoleak:automatic-detection="mail";{item_id}'
+            self.send_message_to_queue(msg, 'Tags')
+        else:
+            self.redis_logger.info(msg)
+
+
+if __name__ == '__main__':
+    module = Mail()
+    #module.compute('tests/2021/01/01/mails.gz 50')
+    module.run()
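The rewritten compute() groups addresses by their mail domain before doing a single cached MX lookup per domain, instead of resolving per address as the deleted bin/Mail.py did. A small self-contained illustration of that grouping (the sample addresses are invented):

```python
mails = ['alice@example.com', 'bob@example.com', 'carol@example.org']

mxdomains_email = {}
for mail in mails:
    mxdomain = mail.rsplit('@', 1)[1].lower()
    mxdomains_email.setdefault(mxdomain, set()).add(mail)

# mxdomains_email == {'example.com': {'alice@example.com', 'bob@example.com'},
#                     'example.org': {'carol@example.org'}}
# check_mx_record() is then called once per domain, not once per address.
```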
bin/modules/ModuleStats.py (new executable file, 54 lines)

@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+This module makes statistics for some modules and providers
+
+"""
+
+##################################
+# Import External packages       #
+##################################
+import os
+import sys
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages        #
+##################################
+from modules.abstract_module import AbstractModule
+from lib.objects.Items import Item
+from lib import Statistics
+
+
+class ModuleStats(AbstractModule):
+    """
+    Module Statistics module for AIL framework
+    """
+
+
+    def __init__(self):
+
+        super(ModuleStats, self).__init__()
+
+        # Waiting time in secondes between to message proccessed
+        self.pending_seconds = 20
+
+    def compute(self, message):
+
+        # MODULE STATS
+        if len(message.split(';')) > 1:
+            module_name, num, keyword, date = message.split(';')
+            Statistics.update_module_stats(module_name, num, keyword, date)
+        # ITEM STATS
+        else:
+            item_id = message
+            item = Item(item_id)
+            source = item.get_source()
+            date = item.get_date()
+            size = item.get_size()
+            Statistics.update_item_stats_size_nb(item_id, source, size, date)
+
+
+if __name__ == '__main__':
+
+    module = ModuleStats()
+    module.run()
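The rewritten ModuleStats module distinguishes its two input formats only by the presence of ';' separators. A hedged sketch of the two message shapes it expects, based on the producers visible in this commit (`module` here is a ModuleStats() instance as created in the `__main__` block):

```python
# 1) Module statistics message, e.g. emitted by the new Mail module above:
#    '<module>;<count>;<keyword>;<date>'
module.compute('mail;3;example.com;20220101')

# 2) Item statistics message: a bare item id, as forwarded by whichever
#    module feeds the Redis_ModuleStats queue with items.
module.compute('submitted/2022/01/01/foo.gz')
```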
@@ -66,7 +66,7 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R

[CreditCards]
subscribe = Redis_CreditCards
-publish = Redis_ModuleStats,Redis_Tags
+publish = Redis_Tags

[BankAccount]
subscribe = Redis_Global