#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The Pasties Module
======================
This module spots domain-pasties services for further processing
"""

##################################
# Import External packages
##################################
import os
import sys
import time

from pyfaup.faup import Faup

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers

# TODO add url validator


class Pasties(AbstractModule):
    """
    Pasties module for AIL framework
    """

    def __init__(self):
        super().__init__()
        self.faup = Faup()

        config_loader = ConfigLoader()
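        # Redis cache, used to deduplicate crawler submissions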
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        self.pasties = {}
        self.urls_blocklist = set()
        self.load_pasties_domains()

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def load_pasties_domains(self):
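        """Load the watched pastie domains and the URL blocklist from the AIL_HOME files directory."""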
        self.pasties = {}
        self.urls_blocklist = set()

        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
        if os.path.exists(domains_pasties):
            with open(domains_pasties) as f:
                for line in f:
                    url = line.strip()
                    if url:  # TODO validate line
                        self.faup.decode(url)
                        url_decoded = self.faup.get()
                        host = url_decoded['host']
                        # if url_decoded.get('port', ''):
                        #     host = f'{host}:{url_decoded["port"]}'
                        path = url_decoded.get('resource_path') or ''
                        # Normalize paths to a trailing slash; treat '/' as no path
                        if path and path != '/':
                            if path[-1] != '/':
                                path = f'{path}/'
                        else:
                            path = None

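                        # Track specific paths per host when given; an empty set means
                        # the whole domain is watched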
                        if host in self.pasties:
                            if path:
                                self.pasties[host].add(path)
                        else:
                            if path:
                                self.pasties[host] = {path}
                            else:
                                self.pasties[host] = set()

        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
        if os.path.exists(url_blocklist):
            with open(url_blocklist) as f:
                for line in f:
                    url = line.strip()
                    if not url:  # skip empty lines
                        continue
                    self.faup.decode(url)
                    url_decoded = self.faup.get()
                    host = url_decoded['host']
                    # if url_decoded.get('port', ''):
                    #     host = f'{host}:{url_decoded["port"]}'
                    path = url_decoded.get('resource_path') or ''  # guard against a None path leaking into the URL string
                    url = f'{host}{path}'
                    if url_decoded['query_string']:
                        url = url + url_decoded['query_string']
                    self.urls_blocklist.add(url)

    def send_to_crawler(self, url, obj_id):
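        """Queue a crawler task for the URL, unless it was already submitted in the last 24 hours."""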
        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)  # 24 hours
            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)

    def compute(self, message):
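        """Check whether the URL belongs to a watched pastie domain and, if so, send it to the crawler."""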
        url = message.split()[0]  # the message payload carries the URL as its first field

        self.faup.decode(url)
        url_decoded = self.faup.get()
        url_host = url_decoded['host']
        # if url_decoded.get('port', ''):
        #     url_host = f'{url_host}:{url_decoded["port"]}'
        path = url_decoded.get('resource_path') or ''  # guard against a None path
        if url_host in self.pasties:
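            # Blocklist entries are stored without a scheme, so strip it before the lookup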
            if url.startswith('http://'):
                if url[7:] in self.urls_blocklist:
                    return None
            elif url.startswith('https://'):
                if url[8:] in self.urls_blocklist:
                    return None
            else:
                if url in self.urls_blocklist:
                    return None

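            # An empty path set means the whole domain is watched: crawl any URL with
            # a non-trivial path. Otherwise, only crawl URLs that extend one of the
            # registered pastie paths (the registered path itself is skipped).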
            if not self.pasties[url_host]:
                if path and path != '/':
                    self.logger.info(f'send to crawler {url_host} {url}')
                    self.send_to_crawler(url, self.obj.id)
            else:
                if path.endswith('/'):
                    path_end = path[:-1]
                else:
                    path_end = f'{path}/'
                for url_path in self.pasties[url_host]:
                    if path.startswith(url_path):
                        if url_path != path and url_path != path_end:
                            self.logger.info(f'send to crawler {url_path} {url}')
                            self.send_to_crawler(url, self.obj.id)
                            break


if __name__ == '__main__':
    module = Pasties()
    module.run()