ail-framework/bin/Onion.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Onion Module
============================

This module extract url from item and returning only ones which are tor
related (.onion). All These urls are send to the crawler discovery queue.

Requirements
------------

*Need running Redis instances. (Redis)

"""
import time
import datetime
import os
import sys
import re

# project packages
from module.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers
from lib import regex_helper
from packages.Item import Item

## Manually fetch first page if crawler is disabled
# import base64
# import subprocess
#
# torclient_host = '127.0.0.1'
# torclient_port = 9050
#
# def fetch(p, r_cache, urls, domains):
#     now = datetime.datetime.now()
#     path = os.path.join('onions', str(now.year).zfill(4),
#                         str(now.month).zfill(2),
#                         str(now.day).zfill(2),
#                         str(int(time.mktime(now.utctimetuple()))))
#     failed = []
#     downloaded = []
#     print('{} Urls to fetch'.format(len(urls)))
#     for url, domain in zip(urls, domains):
#         if r_cache.exists(url) or url in failed:
#             continue
#         to_fetch = base64.standard_b64encode(url.encode('utf8'))
#         print('fetching url: {}'.format(to_fetch))
#         process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
#                                    stdout=subprocess.PIPE)
#         while process.poll() is None:
#             time.sleep(1)
#
#         if process.returncode == 0:
#             r_cache.setbit(url, 0, 1)
#             r_cache.expire(url, 360000)
#             downloaded.append(url)
#             print('downloaded : {}'.format(downloaded))
#             '''tempfile = process.stdout.read().strip()
#             tempfile = tempfile.decode('utf8')
#             #with open(tempfile, 'r') as f:
#                 filename = path + domain + '.gz'
#                 fetched = f.read()
#                 content = base64.standard_b64decode(fetched)
#                 save_path = os.path.join(os.environ['AIL_HOME'],
#                                          p.config.get("Directories", "pastes"),
#                                          filename)
#                 dirname = os.path.dirname(save_path)
#                 if not os.path.exists(dirname):
#                     os.makedirs(dirname)
#                 with open(save_path, 'w') as ff:
#                     ff.write(content)
#                 p.populate_set_out(save_path, 'Global')
#                 p.populate_set_out(url, 'ValidOnion')
#                 p.populate_set_out(fetched, 'FetchedOnion')'''
#             yield url
#             #os.unlink(tempfile)
#         else:
#             r_cache.setbit(url, 0, 0)
#             r_cache.expire(url, 3600)
#             failed.append(url)
#             print('Failed at downloading', url)
#             print(process.stdout.read())
#     print('Failed:', len(failed), 'Downloaded:', len(downloaded))


class Onion(AbstractModule):
    """docstring for Onion module."""

    def __init__(self):
        super(Onion, self).__init__()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
        self.r_onion = config_loader.get_redis_conn("ARDB_Onion")

        self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")
        # regex timeout
        self.regex_timeout = 30

        self.faup = crawlers.get_faup()
        self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)

        # activate_crawler = p.config.get("Crawler", "activate_crawler")


        self.url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        self.i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        re.compile(self.url_regex)
        re.compile(self.i2p_regex)

        self.redis_logger.info(f"Module: {self.module_name} Launched")

        # TEMP var: SAVE I2P Domain (future I2P crawler)
        self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")

    def compute(self, message):
        # list of tuples: (url, subdomains, domain)
        urls_to_crawl = []

        id, score = message.split()
        item = Item(id)
        item_content = item.get_content()

        # max execution time on regex
        res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
        for x in res:
            # String to tuple
            x = x[2:-2].replace(" '", "").split("',")
            url = x[0]
            subdomain = x[4].lower()
            self.faup.decode(url)
            url_unpack = self.faup.get()
            try:    ## TODO: # FIXME: check faup version
                domain = url_unpack['domain'].decode().lower()
            except Exception as e:
                domain = url_unpack['domain'].lower()

            if crawlers.is_valid_onion_domain(domain):
                urls_to_crawl.append((url, subdomain, domain))

        to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'
        if not urls_to_crawl:
            self.redis_logger.info(f'{to_print}Onion related;{item.get_id()}')
            return

        # TAG Item
        msg = f'infoleak:automatic-detection="onion";{item.get_id()}'
        self.send_message_to_queue('Tags', msg)

        if crawlers.is_crawler_activated():
            for to_crawl in urls_to_crawl:
                print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}')
                crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
        else:
            print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
            self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
            # keep manual fetcher ????
            ## Manually fetch first page if crawler is disabled
            # for url in fetch(p, r_cache, urls, domains_list):
            #     publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))

if __name__ == "__main__":

    module = Onion()
    module.run()
decode with redis connection 2018-05-04 11:53:29 +00:00			`#!/usr/bin/env python3`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 09:43:40 +00:00			`# --coding:UTF-8 -`
			`"""`
chg: [ApiKey] refactor module + tests 2021-05-19 12:54:34 +00:00			`The Onion Module`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 09:43:40 +00:00			`============================`

chg: [ApiKey] refactor module + tests 2021-05-19 12:54:34 +00:00			`This module extract url from item and returning only ones which are tor`
			`related (.onion). All These urls are send to the crawler discovery queue.`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 09:43:40 +00:00
			`Requirements`
			`------------`

			`*Need running Redis instances. (Redis)`

			`"""`
Big cleanup, pep8 2014-08-14 15:55:18 +00:00			`import time`
The onion module now fetches the URLs it finds. 2014-08-31 20:42:12 +00:00			`import datetime`
			`import os`
chg: [AIL items + Onion] create AIL item objects + Onion module refactor 2021-05-14 12:42:16 +00:00			`import sys`
chg: [Crawler] change BDD, save i2p links 2018-08-21 13:54:53 +00:00			`import re`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 09:43:40 +00:00
chg: [ApiKey] refactor module + tests 2021-05-19 12:54:34 +00:00			`# project packages`
chg: [AIL items + Onion] create AIL item objects + Onion module refactor 2021-05-14 12:42:16 +00:00			`from module.abstract_module import AbstractModule`
			`from lib.ConfigLoader import ConfigLoader`
			`from lib import crawlers`
			`from lib import regex_helper`
			`from packages.Item import Item`

			`## Manually fetch first page if crawler is disabled`
			`# import base64`
			`# import subprocess`
			`#`
			`# torclient_host = '127.0.0.1'`
			`# torclient_port = 9050`
			`#`
			`# def fetch(p, r_cache, urls, domains):`
			`# now = datetime.datetime.now()`
			`# path = os.path.join('onions', str(now.year).zfill(4),`
			`# str(now.month).zfill(2),`
			`# str(now.day).zfill(2),`
			`# str(int(time.mktime(now.utctimetuple()))))`
			`# failed = []`
			`# downloaded = []`
			`# print('{} Urls to fetch'.format(len(urls)))`
			`# for url, domain in zip(urls, domains):`
			`# if r_cache.exists(url) or url in failed:`
			`# continue`
			`# to_fetch = base64.standard_b64encode(url.encode('utf8'))`
			`# print('fetching url: {}'.format(to_fetch))`
			`# process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],`
			`# stdout=subprocess.PIPE)`
			`# while process.poll() is None:`
			`# time.sleep(1)`
			`#`
			`# if process.returncode == 0:`
			`# r_cache.setbit(url, 0, 1)`
			`# r_cache.expire(url, 360000)`
			`# downloaded.append(url)`
			`# print('downloaded : {}'.format(downloaded))`
			`# '''tempfile = process.stdout.read().strip()`
			`# tempfile = tempfile.decode('utf8')`
			`# #with open(tempfile, 'r') as f:`
			`# filename = path + domain + '.gz'`
			`# fetched = f.read()`
			`# content = base64.standard_b64decode(fetched)`
			`# save_path = os.path.join(os.environ['AIL_HOME'],`
			`# p.config.get("Directories", "pastes"),`
			`# filename)`
			`# dirname = os.path.dirname(save_path)`
			`# if not os.path.exists(dirname):`
			`# os.makedirs(dirname)`
			`# with open(save_path, 'w') as ff:`
			`# ff.write(content)`
			`# p.populate_set_out(save_path, 'Global')`
			`# p.populate_set_out(url, 'ValidOnion')`
			`# p.populate_set_out(fetched, 'FetchedOnion')'''`
			`# yield url`
			`# #os.unlink(tempfile)`
			`# else:`
			`# r_cache.setbit(url, 0, 0)`
			`# r_cache.expire(url, 3600)`
			`# failed.append(url)`
			`# print('Failed at downloading', url)`
			`# print(process.stdout.read())`
			`# print('Failed:', len(failed), 'Downloaded:', len(downloaded))`


			`class Onion(AbstractModule):`
			`"""docstring for Onion module."""`

			`def __init__(self):`
			`super(Onion, self).__init__()`

			`config_loader = ConfigLoader()`
			`self.r_cache = config_loader.get_redis_conn("Redis_Cache")`
			`self.r_onion = config_loader.get_redis_conn("ARDB_Onion")`

			`self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")`
			`# regex timeout`
			`self.regex_timeout = 30`

			`self.faup = crawlers.get_faup()`
			`self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)`

			`# activate_crawler = p.config.get("Crawler", "activate_crawler")`


			`self.url_regex = "((http\|https\|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)@)((25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9])\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[0-9])\|localhost\|([a-zA-Z0-9\-]+\.)[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)(/($\|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"`
			`self.i2p_regex = "((http\|https\|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)@)((25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9])\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[1-9]\|0)\.(25[0-5]\|2[0-4][0-9]\|[0-1]{1}[0-9]{2}\|[1-9]{1}[0-9]{1}\|[0-9])\|localhost\|([a-zA-Z0-9\-]+\.)[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)(/($\|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"`
			`re.compile(self.url_regex)`
			`re.compile(self.i2p_regex)`

			`self.redis_logger.info(f"Module: {self.module_name} Launched")`

			`# TEMP var: SAVE I2P Domain (future I2P crawler)`
			`self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")`

			`def compute(self, message):`
			`# list of tuples: (url, subdomains, domain)`
			`urls_to_crawl = []`

			`id, score = message.split()`
			`item = Item(id)`
			`item_content = item.get_content()`

			`# max execution time on regex`
			`res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)`
			`for x in res:`
			`# String to tuple`
			`x = x[2:-2].replace(" '", "").split("',")`
			`url = x[0]`
			`subdomain = x[4].lower()`
			`self.faup.decode(url)`
			`url_unpack = self.faup.get()`
			`try: ## TODO: # FIXME: check faup version`
			`domain = url_unpack['domain'].decode().lower()`
			`except Exception as e:`
			`domain = url_unpack['domain'].lower()`

			`if crawlers.is_valid_onion_domain(domain):`
			`urls_to_crawl.append((url, subdomain, domain))`

			`to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'`
			`if not urls_to_crawl:`
			`self.redis_logger.info(f'{to_print}Onion related;{item.get_id()}')`
			`return`

			`# TAG Item`
			`msg = f'infoleak:automatic-detection="onion";{item.get_id()}'`
			`self.send_message_to_queue('Tags', msg)`

			`if crawlers.is_crawler_activated():`
			`for to_crawl in urls_to_crawl:`
chg: [launcher + modules] add module tests (Onion module) 2021-05-17 16:03:30 +00:00			`print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}')`
chg: [AIL items + Onion] create AIL item objects + Onion module refactor 2021-05-14 12:42:16 +00:00			`crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())`
The onion module now fetches the URLs it finds. 2014-08-31 20:42:12 +00:00			`else:`
chg: [launcher + modules] add module tests (Onion module) 2021-05-17 16:03:30 +00:00			`print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')`
chg: [AIL items + Onion] create AIL item objects + Onion module refactor 2021-05-14 12:42:16 +00:00			`self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')`
			`# keep manual fetcher ????`
			`## Manually fetch first page if crawler is disabled`
			`# for url in fetch(p, r_cache, urls, domains_list):`
			`# publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))`
The onion module now fetches the URLs it finds. 2014-08-31 20:42:12 +00:00
completely remove ZMQ_PubSub.py 2014-08-20 13:14:57 +00:00			`if __name__ == "__main__":`
chg: [AIL items + Onion] create AIL item objects + Onion module refactor 2021-05-14 12:42:16 +00:00
			`module = Onion()`
			`module.run()`