ail-framework/bin/crawlers/Crawler.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import time

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain
from lib.objects import Screenshots

class Crawler(AbstractModule):

    def __init__(self):
        """Initialise the crawler module.

        Reads the crawler defaults from the 'Crawler' configuration section,
        asks ``crawlers`` to update the blacklist and the captures cache,
        fetches the Lacus client and resets the per-capture state attributes.
        """
        super().__init__(logger_channel='Crawler')

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        loader = ConfigLoader()
        self.r_log_submit = loader.get_redis_conn('Redis_Log_submit')

        # Default capture options, read from the 'Crawler' config section
        self.default_har = loader.get_config_boolean('Crawler', 'default_har')
        self.default_screenshot = loader.get_config_boolean('Crawler', 'default_screenshot')
        self.default_depth_limit = loader.get_config_int('Crawler', 'default_depth_limit')

        # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES

        # update hardcoded blacklist
        crawlers.load_blacklist()
        # update captures cache
        crawlers.reload_crawler_captures()

        # LACUS client used to enqueue captures and fetch their results
        self.lacus = crawlers.get_lacus()

        # Per-capture state, (re)assigned by compute() for each saved capture
        self.har = self.screenshot = None
        self.root_item = None
        self.har_dir = self.items_dir = None
        self.domain = None

        # Send module state to logs
        self.redis_logger.info('Crawler initialized')

    def print_crawler_start_info(self, url, domain, domain_url):
        """Print a green start banner followed by the crawl target details.

        :param url: value printed on the 'url' line
        :param domain: value printed on both the 'crawler type' and 'domain' lines
        :param domain_url: value printed on the 'domain_url' line
        """
        # Two leading blank lines and one trailing blank line frame the banner.
        banner_lines = (
            '',
            '',
            '\033[92m------------------START CRAWLER------------------\033[0m',
            f'crawler type:     {domain}',
            '\033[92m-------------------------------------------------\033[0m',
            f'url:         {url}',
            f'domain:      {domain}',
            f'domain_url:  {domain_url}',
            '',
        )
        print('\n'.join(banner_lines))

    def get_message(self):
        """Run one scheduling cycle, then sleep ``self.pending_seconds``.

        First launches a queued task if the number of captures is below the
        configured maximum, then checks one tracked capture: a capture that is
        DONE is saved through compute() and removed, any other status only
        gets its tracking entry updated. Nothing is returned.
        """
        # Check if a new Capture can be Launched
        nb_captures = crawlers.get_nb_crawler_captures()
        if nb_captures < crawlers.get_crawler_max_captures():
            queued_task = crawlers.get_crawler_task_from_queue()
            if queued_task:
                print(queued_task)
                task_uuid, priority = queued_task
                self.enqueue_capture(task_uuid, priority)

        # Check if a Capture is Done
        tracked = crawlers.get_crawler_capture()
        if tracked:
            print(tracked)
            # the capture uuid is the first field of the first returned row
            capture_uuid = tracked[0][0]
            status = self.lacus.get_capture_status(capture_uuid)
            if status == crawlers.CaptureStatus.DONE:
                self.compute(capture_uuid)
                crawlers.remove_crawler_capture(capture_uuid)
                print('capture', capture_uuid, 'completed')
            else:
                # TODO ADD GLOBAL TIMEOUT-> Save start time
                crawlers.update_crawler_capture(capture_uuid)
                print(capture_uuid, status, int(time.time()))

        time.sleep(self.pending_seconds)

    def enqueue_capture(self, task_uuid, priority):
        """Submit a crawler task to Lacus and register the resulting capture.

        :param task_uuid: uuid of the task to load with crawlers.get_crawler_task
        :param priority: task priority; any value other than 0 sets ``force``
        :return: the capture uuid returned by ``self.lacus.enqueue``
        """
        task = crawlers.get_crawler_task(task_uuid)
        print(task)
        # Example task:
        # task = {
        #         'uuid': task_uuid,
        #         'url': 'https://foo.be',
        #         'domain': 'foo.be',
        #         'depth': 1,
        #         'har': True,
        #         'screenshot': True,
        #         'user_agent': crawlers.get_default_user_agent(),
        #         'cookiejar': [],
        #         'header': '',
        #         'proxy': 'force_tor',
        #         'parent': 'manual',
        # }

        # TODO unpack cookiejar

        # TODO HEADER

        lacus_settings = {
            'url': task['url'],
            'depth': task['depth'],
            'user_agent': task['user_agent'],
            'proxy': task['proxy'],
            # the task cookiejar is not forwarded yet (see TODO above)
            'cookies': [],
            'force': priority != 0,
            'general_timeout_in_sec': 90,
        }
        capture_uuid = self.lacus.enqueue(**lacus_settings)

        crawlers.add_crawler_capture(task_uuid, capture_uuid)
        print(task_uuid, capture_uuid, 'launched')
        return capture_uuid

    # CRAWL DOMAIN
    # TODO: CATCH ERRORS
    def compute(self, capture_uuid):
        """Save a finished capture and update the crawled domain.

        Loads the task linked to the capture, fetches the capture from Lacus,
        saves it through save_capture_response(), then updates the domain
        date range and history, records the last crawled domain and clears
        the task.

        :param capture_uuid: uuid of the capture to fetch and save
        """
        print('saving capture', capture_uuid)

        task_uuid = crawlers.get_crawler_capture_task_uuid(capture_uuid)
        task = crawlers.get_crawler_task(task_uuid)

        task_domain = task['domain']
        print(task_domain)
        self.domain = Domain(task_domain)

        # TODO CHANGE EPOCH
        crawl_epoch = int(time.time())
        origin = task['parent']
        print(task)

        capture = self.lacus.get_capture(capture_uuid)
        print(capture['status'])

        # Reset the per-capture state read by save_capture_response()
        self.har = task['har']
        self.screenshot = task['screenshot']
        capture_date = crawlers.get_current_date(separator=True)
        self.har_dir = crawlers.get_date_har_dir(capture_date)
        self.items_dir = crawlers.get_date_crawled_items_source(capture_date)
        self.root_item = None

        # Save Capture
        # NOTE(review): save_capture_response may replace self.domain on an
        # external redirection, so everything below (including the domain type
        # passed to clear_crawler_task) uses the final domain, which is not
        # necessarily the task's domain - confirm this is intended.
        self.save_capture_response(origin, capture)

        self.domain.update_daterange(capture_date.replace('/', ''))
        # Origin + History
        if self.root_item:
            # domain.add_ports(port)
            self.domain.set_last_origin(origin)
            self.domain.add_history(crawl_epoch, root_item=self.root_item)
        elif self.domain.was_up():
            # no item was saved: the epoch itself is stored as root_item
            self.domain.add_history(crawl_epoch, root_item=crawl_epoch)

        crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, crawl_epoch)
        crawlers.clear_crawler_task(task_uuid, self.domain.get_domain_type())

    def save_capture_response(self, parent_id, entries):
        """Save one capture entry, then recurse into its children.

        When the entry carries HTML, an item is created and relayed to the
        'Mixer' queue, the crawler feeder counter is incremented, the item is
        tagged through the 'Tags' queue and its metadata is stored. The
        screenshot and HAR are saved only if enabled for the current capture
        and present in the entry. The first item created during a capture
        becomes ``self.root_item``.

        :param parent_id: parent recorded in the metadata of the item created
            from this entry; children get the new item id as parent, or this
            same value when no item was created
        :param entries: capture dict; keys read here are 'error',
            'last_redirected_url', 'html', 'png', 'har' and 'children'
        """
        print(entries.keys())
        if 'error' in entries:
            # TODO IMPROVE ERROR MESSAGE
            self.redis_logger.warning(str(entries['error']))
            print(entries['error'])
            if entries.get('html'):
                print('retrieved content')

        # TODO LOGS IF != domain
        last_url = entries.get('last_redirected_url')
        if last_url:
            current_domain = crawlers.unpack_url(last_url)['domain']
            # REDIRECTION TODO CHECK IF WEB
            # Only a redirection seen before any item was saved switches the
            # capture to the redirected domain.
            if current_domain != self.domain.id and not self.root_item:
                redirection = f'External redirection {self.domain.id} -> {current_domain}'
                self.redis_logger.warning(redirection)
                print(redirection)
                self.domain = Domain(current_domain)
        else:
            # TODO LAST URL
            # FIXME: falls back to a plain http URL built from the domain id
            last_url = f'http://{self.domain.id}'

        html = entries.get('html')
        if html:
            item_id = crawlers.create_item_id(self.items_dir, self.domain.id)
            print(item_id)
            gzip64encoded = crawlers.get_gzipped_b64_item(item_id, html)
            # send item to Global
            self.send_message_to_queue(f'{item_id} {gzip64encoded}', 'Mixer')
            # increase nb of paste by feeder name
            self.r_log_submit.hincrby('mixer_cache:list_feeder', 'crawler', 1)

            # Tag
            self.send_message_to_queue(f'infoleak:submission="crawler";{item_id}', 'Tags')

            crawlers.create_item_metadata(item_id, self.domain.id, last_url, parent_id)
            if self.root_item is None:
                self.root_item = item_id
            # children of this entry are attached to the item just created
            parent_id = item_id

            # SCREENSHOT
            if self.screenshot and entries.get('png'):
                screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
                if screenshot:
                    # Create Correlations
                    screenshot.add_correlation('item', '', item_id)
                    screenshot.add_correlation('domain', '', self.domain.id)
            # HAR
            if self.har and entries.get('har'):
                crawlers.save_har(self.har_dir, item_id, entries['har'])

        # Next Children
        for child in entries.get('children') or ():
            self.save_capture_response(parent_id, child)


if __name__ == '__main__':
    # Script entry point: run the crawler module with debug enabled.
    crawler_module = Crawler()
    crawler_module.debug = True
    crawler_module.run()


##################################
##################################
##################################
##################################
##################################


# from Helper import Process
# from pubsublogger import publisher


# ======== FUNCTIONS ========


# def update_auto_crawler():
#     current_epoch = int(time.time())
#     list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
#     for elem_to_crawl in list_to_crawl:
#         mess, type = elem_to_crawl.rsplit(';', 1)
#         redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)
#         redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)

# Extract info from url (url, domain, domain url, ...)
# def unpack_url(url):
#     to_crawl = {}
#     faup.decode(url)
#     url_unpack = faup.get()
#     to_crawl['domain'] = to_crawl['domain'].lower()
#     new_url_host = url_host.lower()
#     url_lower_case = url.replace(url_host, new_url_host, 1)
#
#     if url_unpack['scheme'] is None:
#         to_crawl['scheme'] = 'http'
#         url= 'http://{}'.format(url_lower_case)
#     else:
#         try:
#             scheme = url_unpack['scheme'].decode()
#         except Exception as e:
#             scheme = url_unpack['scheme']
#         if scheme in default_proto_map:
#             to_crawl['scheme'] = scheme
#             url = url_lower_case
#         else:
#             redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
#             to_crawl['scheme'] = 'http'
#             url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
#
#     if url_unpack['port'] is None:
#         to_crawl['port'] = default_proto_map[to_crawl['scheme']]
#     else:
#         try:
#             port = url_unpack['port'].decode()
#         except:
#             port = url_unpack['port']
#         # Verify port number                        #################### make function to verify/correct port number
#         try:
#             int(port)
#         # Invalid port Number
#         except Exception as e:
#             port = default_proto_map[to_crawl['scheme']]
#         to_crawl['port'] = port
#
#     #if url_unpack['query_string'] is None:
#     #    if to_crawl['port'] == 80:
#     #        to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
#     #    else:
#     #        to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
#     #else:
#     #    to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())
#
#     to_crawl['url'] = url
#     if to_crawl['port'] == 80:
#         to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
#     else:
#         to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
#
#     try:
#         to_crawl['tld'] = url_unpack['tld'].decode()
#     except:
#         to_crawl['tld'] = url_unpack['tld']
#
#     return to_crawl

# ##################################################### add ftp ???
        # update_auto_crawler()

                # # add next auto Crawling in queue:
                # if to_crawl['paste'] == 'auto':
                #     redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
                #     # update list, last auto crawled domains
                #     redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
                #     redis_crawler.ltrim('last_auto_crawled', 0, 9)
                #
chg: [crawler + core + cve] migrate crawler to lacus + add new CVE object and correlation + migrate core 2022-10-25 14:25:19 +00:00			`#!/usr/bin/env python3`
			`# --coding:UTF-8 -`

			`import os`
			`import sys`
			`import time`

			`sys.path.append(os.environ['AIL_BIN'])`
			`##################################`
			`# Import Project packages`
			`##################################`
			`from modules.abstract_module import AbstractModule`
			`from lib import crawlers`
			`from lib.ConfigLoader import ConfigLoader`
			`from lib.objects.Domains import Domain`
			`from lib.objects import Screenshots`

			`class Crawler(AbstractModule):`

			`def __init__(self):`
			`super(Crawler, self, ).__init__(logger_channel='Crawler')`

			`# Waiting time in seconds between to message processed`
			`self.pending_seconds = 1`

			`config_loader = ConfigLoader()`
			`self.r_log_submit = config_loader.get_redis_conn('Redis_Log_submit')`

			`self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')`
			`self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')`
			`self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')`

			`# TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES`

			`# update hardcoded blacklist`
			`crawlers.load_blacklist()`
			`# update captures cache`
			`crawlers.reload_crawler_captures()`

			`# LACUS`
			`self.lacus = crawlers.get_lacus()`

			`# Capture`
			`self.har = None`
			`self.screenshot = None`
			`self.root_item = None`
			`self.har_dir = None`
			`self.items_dir = None`
			`self.domain = None`

			`# Send module state to logs`
			`self.redis_logger.info('Crawler initialized')`

			`def print_crawler_start_info(self, url, domain, domain_url):`
			`print()`
			`print()`
			`print('\033[92m------------------START CRAWLER------------------\033[0m')`
			`print(f'crawler type: {domain}')`
			`print('\033[92m-------------------------------------------------\033[0m')`
			`print(f'url: {url}')`
			`print(f'domain: {domain}')`
			`print(f'domain_url: {domain_url}')`
			`print()`

			`def get_message(self):`
			`# Check if a new Capture can be Launched`
			`if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():`
			`task_row = crawlers.get_crawler_task_from_queue()`
			`if task_row:`
			`print(task_row)`
			`task_uuid, priority = task_row`
			`self.enqueue_capture(task_uuid, priority)`

			`# Check if a Capture is Done`
			`capture = crawlers.get_crawler_capture()`
			`if capture:`
			`print(capture)`
			`capture_uuid = capture[0][0]`
			`capture_status = self.lacus.get_capture_status(capture_uuid)`
			`if capture_status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time`
			`crawlers.update_crawler_capture(capture_uuid)`
			`print(capture_uuid, capture_status, int(time.time()))`
			`else:`
			`self.compute(capture_uuid)`
			`crawlers.remove_crawler_capture(capture_uuid)`
			`print('capture', capture_uuid, 'completed')`


			`time.sleep(self.pending_seconds)`

			`def enqueue_capture(self, task_uuid, priority):`
			`task = crawlers.get_crawler_task(task_uuid)`
			`print(task)`
			`# task = {`
			`# 'uuid': task_uuid,`
			`# 'url': 'https://foo.be',`
			`# 'domain': 'foo.be',`
			`# 'depth': 1,`
			`# 'har': True,`
			`# 'screenshot': True,`
			`# 'user_agent': crawlers.get_default_user_agent(),`
			`# 'cookiejar': [],`
			`# 'header': '',`
			`# 'proxy': 'force_tor',`
			`# 'parent': 'manual',`
			`# }`
			`url = task['url']`
			`force = priority != 0`

			`# TODO unpack cookiejar`

			`# TODO HEADER`

			`capture_uuid = self.lacus.enqueue(url=url,`
			`depth=task['depth'],`
			`user_agent=task['user_agent'],`
			`proxy=task['proxy'],`
			`cookies=[],`
			`force=force,`
			`general_timeout_in_sec=90)`

			`crawlers.add_crawler_capture(task_uuid, capture_uuid)`
			`print(task_uuid, capture_uuid, 'launched')`
			`return capture_uuid`

			`# CRAWL DOMAIN`
			`# TODO: CATCH ERRORS`
			`def compute(self, capture_uuid):`

			`print('saving capture', capture_uuid)`

			`task_uuid = crawlers.get_crawler_capture_task_uuid(capture_uuid)`
			`task = crawlers.get_crawler_task(task_uuid)`

			`print(task['domain'])`

			`self.domain = Domain(task['domain'])`

			`# TODO CHANGE EPOCH`
			`epoch = int(time.time())`
			`parent_id = task['parent']`
			`print(task)`

			`entries = self.lacus.get_capture(capture_uuid)`
			`print(entries['status'])`
			`self.har = task['har']`
			`self.screenshot = task['screenshot']`
			`str_date = crawlers.get_current_date(separator=True)`
			`self.har_dir = crawlers.get_date_har_dir(str_date)`
			`self.items_dir = crawlers.get_date_crawled_items_source(str_date)`
			`self.root_item = None`

			`# Save Capture`
			`self.save_capture_response(parent_id, entries)`

			`self.domain.update_daterange(str_date.replace('/', ''))`
			`# Origin + History`
			`if self.root_item:`
			`# domain.add_ports(port)`
			`self.domain.set_last_origin(parent_id)`
			`self.domain.add_history(epoch, root_item=self.root_item)`
			`elif self.domain.was_up():`
			`self.domain.add_history(epoch, root_item=epoch)`

			`crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)`
			`crawlers.clear_crawler_task(task_uuid, self.domain.get_domain_type())`

			`def save_capture_response(self, parent_id, entries):`
			`print(entries.keys())`
			`if 'error' in entries:`
			`# TODO IMPROVE ERROR MESSAGE`
			`self.redis_logger.warning(str(entries['error']))`
			`print(entries['error'])`
			`if entries.get('html'):`
			`print('retrieved content')`
			`# print(entries.get('html'))`

			`# TODO LOGS IF != domain`
			`if 'last_redirected_url' in entries and entries['last_redirected_url']:`
			`last_url = entries['last_redirected_url']`
			`unpacked_last_url = crawlers.unpack_url(last_url)`
			`current_domain = unpacked_last_url['domain']`
			`# REDIRECTION TODO CHECK IF WEB`
			`if current_domain != self.domain.id and not self.root_item:`
			`self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')`
			`print(f'External redirection {self.domain.id} -> {current_domain}')`
			`if not self.root_item:`
			`self.domain = Domain(current_domain)`
			`# TODO LAST URL`
			`# FIXME`
			`else:`
			`last_url = f'http://{self.domain.id}'`

			`if 'html' in entries and entries['html']:`
			`item_id = crawlers.create_item_id(self.items_dir, self.domain.id)`
			`print(item_id)`
			`gzip64encoded = crawlers.get_gzipped_b64_item(item_id, entries['html'])`
			`# send item to Global`
			`relay_message = f'{item_id} {gzip64encoded}'`
			`self.send_message_to_queue(relay_message, 'Mixer')`
			`# increase nb of paste by feeder name`
			`self.r_log_submit.hincrby('mixer_cache:list_feeder', 'crawler', 1)`

			`# Tag`
			`msg = f'infoleak:submission="crawler";{item_id}'`
			`self.send_message_to_queue(msg, 'Tags')`

			`crawlers.create_item_metadata(item_id, self.domain.id, last_url, parent_id)`
			`if self.root_item is None:`
			`self.root_item = item_id`
			`parent_id = item_id`

			`# SCREENSHOT`
			`if self.screenshot:`
			`if 'png' in entries and entries['png']:`
			`screenshot = Screenshots.create_screenshot(entries['png'], b64=False)`
			`if screenshot:`
			`# Create Correlations`
			`screenshot.add_correlation('item', '', item_id)`
			`screenshot.add_correlation('domain', '', self.domain.id)`
			`# HAR`
			`if self.har:`
			`if 'har' in entries and entries['har']:`
			`crawlers.save_har(self.har_dir, item_id, entries['har'])`
			`# Next Children`
			`entries_children = entries.get('children')`
			`if entries_children:`
			`for children in entries_children:`
			`self.save_capture_response(parent_id, children)`


			`if __name__ == '__main__':`
			`module = Crawler()`
			`module.debug = True`
			`# module.compute(('ooooo', 0))`
			`module.run()`


			`##################################`
			`##################################`
			`##################################`
			`##################################`
			`##################################`


			`# from Helper import Process`
			`# from pubsublogger import publisher`


			`# ======== FUNCTIONS ========`


			`# def update_auto_crawler():`
			`# current_epoch = int(time.time())`
			`# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)`
			`# for elem_to_crawl in list_to_crawl:`
			`# mess, type = elem_to_crawl.rsplit(';', 1)`
			`# redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)`
			`# redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)`

			`# Extract info form url (url, domain, domain url, ...)`
			`# def unpack_url(url):`
			`# to_crawl = {}`
			`# faup.decode(url)`
			`# url_unpack = faup.get()`
			`# to_crawl['domain'] = to_crawl['domain'].lower()`
			`# new_url_host = url_host.lower()`
			`# url_lower_case = url.replace(url_host, new_url_host, 1)`
			`#`
			`# if url_unpack['scheme'] is None:`
			`# to_crawl['scheme'] = 'http'`
			`# url= 'http://{}'.format(url_lower_case)`
			`# else:`
			`# try:`
			`# scheme = url_unpack['scheme'].decode()`
			`# except Exception as e:`
			`# scheme = url_unpack['scheme']`
			`# if scheme in default_proto_map:`
			`# to_crawl['scheme'] = scheme`
			`# url = url_lower_case`
			`# else:`
			`# redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))`
			`# to_crawl['scheme'] = 'http'`
			`# url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))`
			`#`
			`# if url_unpack['port'] is None:`
			`# to_crawl['port'] = default_proto_map[to_crawl['scheme']]`
			`# else:`
			`# try:`
			`# port = url_unpack['port'].decode()`
			`# except:`
			`# port = url_unpack['port']`
			`# # Verify port number #################### make function to verify/correct port number`
			`# try:`
			`# int(port)`
			`# # Invalid port Number`
			`# except Exception as e:`
			`# port = default_proto_map[to_crawl['scheme']]`
			`# to_crawl['port'] = port`
			`#`
			`# #if url_unpack['query_string'] is None:`
			`# # if to_crawl['port'] == 80:`
			`# # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())`
			`# # else:`
			`# # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])`
			`# #else:`
			`# # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())`
			`#`
			`# to_crawl['url'] = url`
			`# if to_crawl['port'] == 80:`
			`# to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)`
			`# else:`
			`# to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])`
			`#`
			`# try:`
			`# to_crawl['tld'] = url_unpack['tld'].decode()`
			`# except:`
			`# to_crawl['tld'] = url_unpack['tld']`
			`#`
			`# return to_crawl`

			`# ##################################################### add ftp ???`
			`# update_auto_crawler()`

			`# # add next auto Crawling in queue:`
			`# if to_crawl['paste'] == 'auto':`
			`# redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))`
			`# # update list, last auto crawled domains`
			`# redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))`
			`# redis_crawler.ltrim('last_auto_crawled', 0, 9)`
			`#`