#!/usr/bin/env python3.5
# -*-coding:UTF-8 -*

"""
The Web Module
============================

This module parses the URLs found in a paste and warns when one of them resolves to the configured critical country code.

"""

import redis
import pprint
import time
import os
import dns.exception
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
from pyfaup.faup import Faup
import re

# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress

from Helper import Process

# Used to prevent concat with empty fields due to url parsing
def avoidNone(a_string):
    """Return *a_string* unchanged, or an empty string when it is None.

    Lets optional URL components be concatenated without a None check
    at every call site.
    """
    return "" if a_string is None else a_string

def load_uri_schemes(path):
    """Read URI scheme names from the protocols file at *path*.

    The file is expected to hold one scheme per line. Surrounding
    whitespace is stripped and blank lines are skipped, so a missing
    trailing newline or an empty line cannot corrupt a scheme name or add
    an empty alternative to the URL regex.

    :param path: path of the protocols file
    :return: list of scheme names, in file order
    """
    with open(path, 'r') as scheme_file:
        return [line.strip() for line in scheme_file if line.strip()]


def build_url_regex(schemes):
    """Build the URL-matching regular expression for the given schemes.

    Scheme names are escaped so that characters such as ``+`` or ``.``
    (e.g. ``svn+ssh``) are matched literally instead of acting as regex
    metacharacters.

    :param schemes: iterable of URI scheme names
    :return: the regular expression, as a string
    """
    scheme_alternation = "|".join(re.escape(scheme) for scheme in schemes)

    # Optional "user:password@" prefix.
    userinfo = r"([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*"

    # Dotted-quad IPv4 address; the first octet may not be 0.
    octet_first = r"(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
    octet_mid = r"(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
    octet_last = r"(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
    ipv4 = octet_first + r"\." + octet_mid + r"\." + octet_mid + r"\." + octet_last

    # Host name ending in a known generic TLD or any two-letter TLD.
    hostname = (r"([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\."
                r"(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|"
                r"coop|museum|[a-zA-Z]{2})")

    port = r"(\:[0-9]+)*"
    path = r"(/($|[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*"

    return ("(" + scheme_alternation + r")\://" + userinfo
            + "(" + ipv4 + "|localhost|" + hostname + ")"
            + port + path)


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Web'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # Protocol file path
    protocolsfile_path = os.path.join(
        os.environ['AIL_HOME'],
        p.config.get("Directories", "protocolsfile"))

    # Country to log as critical
    cc_critical = p.config.get("Url", "cc_critical")

    # FUNCTIONS #
    publisher.info("Script URL subscribed to channel web_categ")

    # FIXME For retro compatibility
    channel = 'web_categ'

    message = p.get_from_set()
    prec_filename = None
    faup = Faup()

    # Get all uri from protocolsfile (Used for Curve) and compile the URL
    # pattern once, outside the processing loop.
    url_regex = build_url_regex(load_uri_schemes(protocolsfile_path))
    url_pattern = re.compile(url_regex)

    while True:
        if message is not None:
            # The message holds two whitespace-separated fields; only the
            # filename is used here.
            filename, _score = message.split()

            # Skip a paste that was just processed.
            if filename != prec_filename:
                domains_list = []
                PST = Paste.Paste(filename)
                client = ip2asn()

                # Iterate over match objects so that each URL found in the
                # paste is handled once; group(0) is the whole URL.
                for matching_url in url_pattern.finditer(PST.get_p_content()):
                    url = matching_url.group(0)

                    to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                    p.populate_set_out(to_send, 'Url')

                    faup.decode(url)
                    domain = faup.get_domain()
                    subdomain = faup.get_subdomain()

                    domains_list.append(domain)

                    publisher.debug('{} Published'.format(url))

                    # NOTE(review): subdomain and domain are joined without
                    # a separator -- confirm that this yields the intended
                    # host name for Faup's output.
                    hostl = avoidNone(subdomain) + avoidNone(domain)
                    try:
                        socket.setdefaulttimeout(1)
                        ip = socket.gethostbyname(hostl)
                    except (OSError, UnicodeError):
                        # If the resolver is not giving any IPv4 address
                        # (or the name cannot be encoded), the ASN/CC
                        # lookup is skipped.
                        continue

                    try:
                        asn_info = client.lookup(ip, qType='IP')
                    except ipaddress.AddressValueError:
                        continue
                    cc = asn_info.cc
                    asn = asn_info.asn

                    # EU is not an official ISO 3166 code (but used by RIPE
                    # IP allocation)
                    if cc is not None and cc != "EU":
                        # Depending on the pycountry version an unknown
                        # code either raises KeyError or returns None.
                        try:
                            country = pycountry.countries.get(alpha_2=cc)
                        except KeyError:
                            country = None
                        country_name = None if country is None else country.name
                        print(hostl, asn, cc, country_name)
                        if cc == cc_critical:
                            to_print = 'Url;{};{};{};Detected {} {}'.format(
                                PST.p_source, PST.p_date, PST.p_name,
                                hostl, cc)
                            #publisher.warning(to_print)
                            print(to_print)
                    else:
                        print(hostl, asn, cc)

                A_values = lib_refine.checking_A_record(r_serv2,
                                                        domains_list)
                if A_values[0] >= 1:
                    setattr(PST, channel, A_values)
                    PST.save_attribute_redis(channel, (A_values[0],
                                             list(A_values[1])))

                    pprint.pprint(A_values)
                    publisher.info('Url;{};{};{};Checked {} URL;{}'.format(
                        PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_path))
            prec_filename = filename

        else:
            publisher.debug("Script url is Idling 10s")
            print('Sleeping')
            time.sleep(10)

        message = p.get_from_set()