#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The Credential Module
=====================

This module consumes the Redis list created by the Categ module.

It applies credential regexes to the paste content and warns when the number
of credentials found is above a threshold.

It also splits the usernames and stores them in Redis for searching purposes.

Redis organization:
    uniqNumForUsername: unique number attached to a unique username
    uniqNumForPath: unique number attached to a unique path
        -> uniq numbers are used to avoid string duplication
    AllCredentials: hash whose keys are usernames and whose values are their uniq numbers
    AllCredentialsRev: the opposite of AllCredentials, uniqNum -> username
    AllPath: hash whose keys are paths and whose values are their uniq numbers
    AllPathRev: the opposite of AllPath, uniqNum -> path
    CredToPathMapping_uniqNumForUsername -> (set) -> uniqNumForPath

"""

import time
import sys
import datetime
import re
import signal

import redis
from pyfaup.faup import Faup

from packages import Paste
from pubsublogger import publisher
from Helper import Process


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)
max_execution_time = 30

# Split usernames on special characters or on case changes; parts starting with upper case are kept distinct
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"
REDIS_KEY_NUM_USERNAME = 'uniqNumForUsername'
REDIS_KEY_NUM_PATH = 'uniqNumForPath'
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold")

    faup = Faup()
    server_cred = redis.StrictRedis(
        host=p.config.get("ARDB_TermCred", "host"),
        port=p.config.getint("ARDB_TermCred", "port"),
        db=p.config.getint("ARDB_TermCred", "db"),
        decode_responses=True)
    server_statistics = redis.StrictRedis(
        host=p.config.get("ARDB_Statistics", "host"),
        port=p.config.getint("ARDB_Statistics", "port"),
        db=p.config.getint("ARDB_Statistics", "db"),
        decode_responses=True)

    criticalNumberToAlert = p.config.getint("Credential", "criticalNumberToAlert")
    minTopPassList = p.config.getint("Credential", "minTopPassList")

    regex_web = r"((?:https?://)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    #regex_cred = r"[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9_\-]+"
    # email address, then up to 10 separator characters (spaces, newlines, colon, ...), then the password
    regex_cred = r"[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}[\r\n :_\-]{1,10}[a-zA-Z0-9_\-]+"
    regex_site_for_stats = r"@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("Script Credential is Idling 10s")
            #print('sleeping 10s')
            time.sleep(10)
            continue

        filepath, count = message.split(' ')

        paste = Paste.Paste(filepath)
        content = paste.get_p_content()

        item_id = filepath

        # cap the execution time of the credential regex
        signal.alarm(max_execution_time)
        try:
            creds = set(re.findall(regex_cred, content))
        except TimeoutException:
            p.incr_module_timeout_statistic()  # add encoder type
            err_mess = "Credential: processing timeout: {}".format(item_id)
            print(err_mess)
            publisher.info(err_mess)
            continue
        else:
            signal.alarm(0)

        if len(creds) == 0:
            continue

        # cap the execution time of the website regex
        signal.alarm(max_execution_time)
        try:
            sites = re.findall(regex_web, content)  # used to count occurrences
        except TimeoutException:
            p.incr_module_timeout_statistic()
            err_mess = "Credential: site, processing timeout: {}".format(item_id)
            print(err_mess)
            publisher.info(err_mess)
            sites = []
        else:
            signal.alarm(0)

        sites_set = set(sites)

        message = 'Checked {} credentials found.'.format(len(creds))
        if sites_set:
            message += ' Related websites: {}'.format(', '.join(sites_set))

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path)

        print('\n '.join(creds))

        # number of credentials above the threshold, publish an alert
        if len(creds) > criticalNumberToAlert:
            print("========> Found more than {} credentials in this file : {}".format(criticalNumberToAlert, filepath))
            publisher.warning(to_print)
            # Send to Duplicate
            p.populate_set_out(filepath, 'Duplicate')

            msg = 'infoleak:automatic-detection="credential";{}'.format(filepath)
            p.populate_set_out(msg, 'Tags')

            # Put in form, count occurrences, then send to ModuleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites:
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1

            for url in sites:
                faup.decode(url)
                domain = faup.get()['domain']
                ## TODO: # FIXME: remove me
                try:
                    domain = domain.decode()
                except AttributeError:
                    pass
                if domain in creds_sites:
                    creds_sites[domain] += 1
                else:
                    creds_sites[domain] = 1

            for site, num in creds_sites.items():  # send the count for each different site to ModuleStats
                mssg = 'credential;{};{};{}'.format(num, site, paste.p_date)
                print(mssg)
                p.populate_set_out(mssg, 'ModuleStats')

            if sites_set:
                print("=======> Probably on : {}".format(', '.join(sites_set)))

            date = datetime.datetime.now().strftime("%Y%m")
            for cred in creds:
                maildomains = re.findall(r"@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
                faup.decode(maildomains)
                tld = faup.get()['tld']
                ## TODO: # FIXME: remove me
                try:
                    tld = tld.decode()
                except AttributeError:
                    pass
                server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
        else:
            publisher.info(to_print)
            print('found {} credentials'.format(len(creds)))

        # index the credentials for the term-frequency search
        for cred in creds:
            cred = cred.split('@')[0]  # split to ignore the mail domain

            # unique number attached to a unique path
            uniq_num_path = server_cred.incr(REDIS_KEY_NUM_PATH)
            server_cred.hset(REDIS_KEY_ALL_PATH_SET, filepath, uniq_num_path)
            server_cred.hset(REDIS_KEY_ALL_PATH_SET_REV, uniq_num_path, filepath)

            # unique number attached to a unique username
            uniq_num_cred = server_cred.hget(REDIS_KEY_ALL_CRED_SET, cred)
            if uniq_num_cred is None:
                # the credential does not exist yet, create new entries
                uniq_num_cred = server_cred.incr(REDIS_KEY_NUM_USERNAME)
                server_cred.hset(REDIS_KEY_ALL_CRED_SET, cred, uniq_num_cred)
                server_cred.hset(REDIS_KEY_ALL_CRED_SET_REV, uniq_num_cred, cred)

            # Add the mapping between the credential and the path
            server_cred.sadd(REDIS_KEY_MAP_CRED_TO_PATH+'_'+str(uniq_num_cred), uniq_num_path)

            # Split the credential on capital letters, numbers, dots and so on.
            # Add each part to Redis, pointing back to the credential's unique number.
            splitedCred = re.findall(REGEX_CRED, cred)
            for partCred in splitedCred:
                if len(partCred) > minimumLengthThreshold:
                    server_cred.sadd(partCred, uniq_num_cred)