fix: [Credential module] regex timeout

Terrtia committed 2020-05-20 17:03:58 +02:00
parent 85a0d944cc
commit 4601003509
GPG key ID: 1E1B1F50D84613D0 (no known key found for this signature in database)

3 changed files with 147 additions and 90 deletions

bin/Credential.py

@@ -7,7 +7,7 @@ The Credential Module
 This module is consuming the Redis-list created by the Categ module.
-It apply credential regexes on paste content and warn if above a threshold.
+It apply credential regexes on item content and warn if above a threshold.
 It also split the username and store it into redis for searching purposes.
@@ -24,24 +24,37 @@ Redis organization:
 """
 import time
+import os
 import sys
-from packages import Paste
-from pubsublogger import publisher
-from Helper import Process
 import datetime
 import re
 import redis
 from pyfaup.faup import Faup
+from pubsublogger import publisher
+from Helper import Process
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
+import Item
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+import regex_helper
+
+## LOAD CONFIG ##
+config_loader = ConfigLoader.ConfigLoader()
+server_cred = config_loader.get_redis_conn("ARDB_TermCred")
+server_statistics = config_loader.get_redis_conn("ARDB_Statistics")
+minimumLengthThreshold = config_loader.get_config_int("Credential", "minimumLengthThreshold")
+criticalNumberToAlert = config_loader.get_config_int("Credential", "criticalNumberToAlert")
+minTopPassList = config_loader.get_config_int("Credential", "minTopPassList")
+config_loader = None
+## -- ##
 
 import signal
 
-class TimeoutException(Exception):
-    pass
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-signal.signal(signal.SIGALRM, timeout_handler)
 max_execution_time = 30
 
 #split username with spec. char or with upper case, distinguish start with upper
@@ -58,107 +71,58 @@ if __name__ == "__main__":
     publisher.port = 6380
     publisher.channel = "Script"
     config_section = "Credential"
+    module_name = "Credential"
     p = Process(config_section)
     publisher.info("Find credentials")
 
-    minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold")
-
     faup = Faup()
-    server_cred = redis.StrictRedis(
-        host=p.config.get("ARDB_TermCred", "host"),
-        port=p.config.get("ARDB_TermCred", "port"),
-        db=p.config.get("ARDB_TermCred", "db"),
-        decode_responses=True)
-    server_statistics = redis.StrictRedis(
-        host=p.config.get("ARDB_Statistics", "host"),
-        port=p.config.getint("ARDB_Statistics", "port"),
-        db=p.config.getint("ARDB_Statistics", "db"),
-        decode_responses=True)
-
-    criticalNumberToAlert = p.config.getint("Credential", "criticalNumberToAlert")
-    minTopPassList = p.config.getint("Credential", "minTopPassList")
 
-    regex_web = "((?:https?:\/\/)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
+    regex_web = "((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
     #regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
     regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
     regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"
 
+    redis_cache_key = regex_helper.generate_redis_cache_key(module_name)
+
     while True:
         message = p.get_from_set()
         if message is None:
             publisher.debug("Script Credential is Idling 10s")
+            #print('sleeping 10s')
             time.sleep(10)
             continue
 
-        filepath, count = message.split(' ')
-        paste = Paste.Paste(filepath)
-        content = paste.get_p_content()
-        item_id = filepath
+        item_id, count = message.split()
+        item_content = Item.get_item_content(item_id)
 
-        # max execution time on regex
-        signal.alarm(max_execution_time)
-        try:
-            creds = set(re.findall(regex_cred, content))
-        except TimeoutException:
-            p.incr_module_timeout_statistic() # add encoder type
-            err_mess = "Credential: processing timeout: {}".format(item_id)
-            print(err_mess)
-            publisher.info(err_mess)
-            continue
-        else:
-            signal.alarm(0)
-
-        if len(creds) == 0:
+        # Extract all credentials
+        all_credentials = regex_helper.regex_findall(module_name, redis_cache_key, regex_cred, item_content, max_time=max_execution_time)
+        if not all_credentials:
             continue
 
-        signal.alarm(max_execution_time)
-        try:
-            sites = re.findall(regex_web, content) #Use to count occurences
-        except TimeoutException:
-            p.incr_module_timeout_statistic()
-            err_mess = "Credential: site, processing timeout: {}".format(item_id)
-            print(err_mess)
-            publisher.info(err_mess)
-            sites = []
-        else:
-            signal.alarm(0)
-        sites_set = set(sites)
+        all_sites = regex_helper.regex_findall(module_name, redis_cache_key, regex_web, item_content, max_time=max_execution_time)
 
-        message = 'Checked {} credentials found.'.format(len(creds))
-        if sites_set:
-            message += ' Related websites: {}'.format( (', '.join(sites_set)) )
-
-        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path)
+        message = 'Checked {} credentials found.'.format(len(all_credentials))
+        if all_sites:
+            message += ' Related websites: {}'.format( (', '.join(all_sites)) )
+        print(message)
 
-        print('\n '.join(creds))
+        to_print = 'Credential;{};{};{};{};{}'.format(Item.get_source(item_id), Item.get_item_date(item_id), Item.get_item_basename(item_id), message, item_id)
 
         #num of creds above tresh, publish an alert
-        if len(creds) > criticalNumberToAlert:
-            print("========> Found more than 10 credentials in this file : {}".format( filepath ))
+        if len(all_credentials) > criticalNumberToAlert:
+            print("========> Found more than 10 credentials in this file : {}".format( item_id ))
             publisher.warning(to_print)
             #Send to duplicate
-            p.populate_set_out(filepath, 'Duplicate')
+            p.populate_set_out(item_id, 'Duplicate')
 
-            msg = 'infoleak:automatic-detection="credential";{}'.format(filepath)
+            msg = 'infoleak:automatic-detection="credential";{}'.format(item_id)
             p.populate_set_out(msg, 'Tags')
 
-            #Put in form, count occurences, then send to moduleStats
-            signal.alarm(max_execution_time)
-            try:
-                site_occurence = re.findall(regex_site_for_stats, content)
-            except TimeoutException:
-                p.incr_module_timeout_statistic()
-                err_mess = "Credential: site occurence, processing timeout: {}".format(item_id)
-                print(err_mess)
-                publisher.info(err_mess)
-                site_occurence = []
-            else:
-                signal.alarm(0)
+            site_occurence = regex_helper.regex_findall(module_name, redis_cache_key, regex_site_for_stats, item_content, max_time=max_execution_time, r_set=False)
 
             creds_sites = {}
@@ -169,7 +133,7 @@ if __name__ == "__main__":
                 else:
                     creds_sites[site_domain] = 1
 
-            for url in sites:
+            for url in all_sites:
                 faup.decode(url)
                 domain = faup.get()['domain']
                 ## TODO: # FIXME: remove me
@@ -184,15 +148,15 @@ if __name__ == "__main__":
 
             for site, num in creds_sites.items(): # Send for each different site to moduleStats
-                mssg = 'credential;{};{};{}'.format(num, site, paste.p_date)
+                mssg = 'credential;{};{};{}'.format(num, site, Item.get_item_date(item_id))
                 print(mssg)
                 p.populate_set_out(mssg, 'ModuleStats')
 
-            if sites_set:
-                print("=======> Probably on : {}".format(', '.join(sites_set)))
+            if all_sites:
+                print("=======> Probably on : {}".format(', '.join(all_sites)))
 
             date = datetime.datetime.now().strftime("%Y%m")
-            for cred in creds:
+            for cred in all_credentials:
                 maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
                 faup.decode(maildomains)
                 tld = faup.get()['tld']
@@ -204,17 +168,17 @@ if __name__ == "__main__":
                 server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
         else:
             publisher.info(to_print)
-            print('found {} credentials'.format(len(creds)))
+            print('found {} credentials'.format(len(all_credentials)))
 
         #for searching credential in termFreq
-        for cred in creds:
+        for cred in all_credentials:
             cred = cred.split('@')[0] #Split to ignore mail address
 
             #unique number attached to unique path
            uniq_num_path = server_cred.incr(REDIS_KEY_NUM_PATH)
-            server_cred.hmset(REDIS_KEY_ALL_PATH_SET, {filepath: uniq_num_path})
-            server_cred.hmset(REDIS_KEY_ALL_PATH_SET_REV, {uniq_num_path: filepath})
+            server_cred.hmset(REDIS_KEY_ALL_PATH_SET, {item_id: uniq_num_path})
+            server_cred.hmset(REDIS_KEY_ALL_PATH_SET_REV, {uniq_num_path: item_id})
 
             #unique number attached to unique username
             uniq_num_cred = server_cred.hget(REDIS_KEY_ALL_CRED_SET, cred)
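
Why the rewrite: signal.alarm() only raises TimeoutException between Python bytecode instructions, so a catastrophically backtracking regex stuck inside the C matching engine was never actually interrupted. Running re.findall in a child process, as the new regex_helper does, lets the caller terminate it unconditionally. A minimal self-contained sketch of that pattern, handing results back through a multiprocessing.Queue instead of the Redis cache the helper uses (the function and variable names here are illustrative, not from the commit):

import re
from multiprocessing import Process, Queue

def _worker(q, regex, content):
    # child process: a stuck regex can be killed from the parent
    q.put(re.findall(regex, content))

def findall_with_timeout(regex, content, max_time=30):
    q = Queue()
    proc = Process(target=_worker, args=(q, regex, content))
    proc.start()
    proc.join(max_time)
    if proc.is_alive():   # still matching after max_time seconds
        proc.terminate()
        return []         # same contract as regex_helper: empty result on timeout
    return q.get() if not q.empty() else []

print(findall_with_timeout(r'\d+', 'a1 b22 c333'))   # ['1', '22', '333']

regex_helper passes results back through Redis instead of a Queue; the 360 s expire it sets on the temporary key means the data cleans itself up even if the parent never reads it.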

bin/lib/regex_helper.py Executable file

@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+"""
+Regex Helper
+"""
+
+import os
+import re
+import sys
+import uuid
+
+from multiprocessing import Process as Proc
+
+sys.path.append(os.environ['AIL_BIN'])
+from pubsublogger import publisher
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import ConfigLoader
+import statistics
+
+## LOAD CONFIG ##
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
+config_loader = None
+## -- ##
+
+publisher.port = 6380
+publisher.channel = "Script"
+
+def generate_redis_cache_key(module_name):
+    # one key per module instance so concurrent modules do not mix results
+    return '{}_extracted:{}'.format(module_name, str(uuid.uuid4()))
+
+def _regex_findall(redis_key, regex, item_content, r_set):
+    # runs in a child process: results are handed back through Redis
+    all_items = re.findall(regex, item_content)
+    if r_set:
+        if len(all_items) > 1:
+            r_serv_cache.sadd(redis_key, *all_items)
+            r_serv_cache.expire(redis_key, 360)
+        elif all_items:
+            r_serv_cache.sadd(redis_key, all_items[0])
+            r_serv_cache.expire(redis_key, 360)
+    else:
+        if len(all_items) > 1:
+            r_serv_cache.lpush(redis_key, *all_items)
+            r_serv_cache.expire(redis_key, 360)
+        elif all_items:
+            r_serv_cache.lpush(redis_key, all_items[0])
+            r_serv_cache.expire(redis_key, 360)
+
+def regex_findall(module_name, redis_key, regex, item_content, max_time=30, r_set=True):
+    proc = Proc(target=_regex_findall, args=(redis_key, regex, item_content, r_set, ))
+    try:
+        proc.start()
+        proc.join(max_time)
+        if proc.is_alive():
+            proc.terminate()
+            statistics.incr_module_timeout_statistic(module_name)
+            err_mess = "{}: processing timeout: {}".format(module_name, redis_key) # the item id is not in scope here, log the cache key
+            print(err_mess)
+            publisher.info(err_mess)
+            return []
+        else:
+            if r_set:
+                all_items = r_serv_cache.smembers(redis_key)
+            else:
+                all_items = r_serv_cache.lrange(redis_key, 0, -1)
+            r_serv_cache.delete(redis_key)
+            proc.terminate()
+            return all_items
+    except KeyboardInterrupt:
+        print("Caught KeyboardInterrupt, terminating workers")
+        proc.terminate()
+        sys.exit(0)
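
Call pattern for the new helper, assuming AIL_BIN is exported and the Redis_Cache server from the config is reachable; the regexes and content below are placeholders, not values from the commit:

import regex_helper

module_name = 'Credential'
redis_cache_key = regex_helper.generate_redis_cache_key(module_name)

item_content = 'login: admin@example.com:hunter2'

# r_set=True (the default) deduplicates, like the old set(re.findall(...))
creds = regex_helper.regex_findall(module_name, redis_cache_key,
                                   r'[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}',
                                   item_content, max_time=30)

# r_set=False keeps duplicates, for occurrence counting (site_occurence above)
sites = regex_helper.regex_findall(module_name, redis_cache_key,
                                   r'@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:',
                                   item_content, max_time=30, r_set=False)

Either call comes back empty after max_time seconds, so callers only need an emptiness check instead of a try/except around every regex.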

bin/lib/statistics.py Executable file

@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import datetime
+import os
+import redis
+import sys
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics")
+config_loader = None
+
+def incr_module_timeout_statistic(module_name):
+    curr_date = datetime.date.today()
+    r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
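
The counter written by incr_module_timeout_statistic can be read back with a plain HGET on the daily hash; this read side is not part of the commit, just a sketch of how the key layout is consumed:

import datetime
from statistics import r_serv_statistics  # reuses the ARDB_Statistics connection above

today = datetime.date.today().strftime("%Y%m%d")
count = r_serv_statistics.hget(today, 'paste_by_modules_timeout:Credential')
print(count or 0)  # hget returns None when no timeout was recorded today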