ail-framework/bin/crawlers/Crawler.py

328 lines
13 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
2023-05-12 13:29:53 +00:00
import logging.config
import sys
import time
2023-03-14 16:36:42 +00:00
from requests.exceptions import ConnectionError
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
2023-05-12 13:29:53 +00:00
from lib import ail_logger
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.objects import CookiesNames
2023-07-06 09:26:32 +00:00
from lib.objects import Etags
from lib.objects.Domains import Domain
from lib.objects.Items import Item
from lib.objects import Screenshots
from lib.objects import Titles
2023-05-12 13:29:53 +00:00
logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
2023-03-14 16:36:42 +00:00
class Crawler(AbstractModule):
def __init__(self):
2023-05-12 13:29:53 +00:00
super(Crawler, self, ).__init__()
self.logger = logging.getLogger(f'{self.__class__.__name__}')
# Waiting time in seconds between to message processed
self.pending_seconds = 1
config_loader = ConfigLoader()
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')
# TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES
# update hardcoded blacklist
crawlers.load_blacklist()
# update captures cache
crawlers.reload_crawler_captures()
2023-03-14 16:36:42 +00:00
self.crawler_scheduler = crawlers.CrawlerScheduler()
# LACUS
self.lacus = crawlers.get_lacus()
2023-04-04 12:15:35 +00:00
self.is_lacus_up = crawlers.is_lacus_connected(delta_check=0)
# Capture
self.har = None
self.screenshot = None
self.root_item = None
self.date = None
self.items_dir = None
self.original_domain = None
self.domain = None
# TODO Replace with warning list ???
self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a' # blank
'27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748', # not found
'3e66bf4cc250a68c10f8a30643d73e50e68bf1d4a38d4adc5bfc4659ca2974c0'} # 404
# Send module state to logs
2023-05-12 13:29:53 +00:00
self.logger.info('Crawler initialized')
2023-03-14 16:36:42 +00:00
def refresh_lacus_status(self):
try:
lacus_up = self.is_lacus_up
2023-03-14 16:36:42 +00:00
self.is_lacus_up = crawlers.get_lacus().is_up
# refresh lacus
2023-04-11 07:44:49 +00:00
if not lacus_up and self.is_lacus_up:
self.lacus = crawlers.get_lacus()
2023-03-14 16:36:42 +00:00
except:
self.is_lacus_up = False
if not self.is_lacus_up:
print("Can't reach lacus server", int(time.time()))
time.sleep(30)
def print_crawler_start_info(self, url, domain_url):
print()
print()
print('\033[92m------------------START CRAWLER------------------\033[0m')
2023-03-14 16:36:42 +00:00
print(f'crawler type: {self.domain}')
print('\033[92m-------------------------------------------------\033[0m')
print(f'url: {url}')
2023-03-14 16:36:42 +00:00
print(f'domain: {self.domain}')
print(f'domain_url: {domain_url}')
print()
def get_message(self):
2023-03-14 16:36:42 +00:00
# Crawler Scheduler
self.crawler_scheduler.update_queue()
self.crawler_scheduler.process_queue()
self.refresh_lacus_status() # TODO LOG ERROR
2023-03-14 16:36:42 +00:00
if not self.is_lacus_up:
return None
# Check if a new Capture can be Launched
if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
task_row = crawlers.add_task_to_lacus_queue()
if task_row:
task_uuid, priority = task_row
2023-03-14 16:36:42 +00:00
try:
self.enqueue_capture(task_uuid, priority)
except ConnectionError:
print(task_row)
task = crawlers.CrawlerTask(task_uuid)
task.add_to_db_crawler_queue(priority)
self.refresh_lacus_status()
return None
# Get CrawlerCapture Object
capture = crawlers.get_crawler_capture()
if capture:
2023-03-14 16:36:42 +00:00
try:
status = self.lacus.get_capture_status(capture.uuid)
if status == crawlers.CaptureStatus.DONE:
return capture
elif status == crawlers.CaptureStatus.UNKNOWN:
capture_start = capture.get_start_time(r_str=False)
if int(time.time()) - capture_start > 600: # TODO ADD in new crawler config
task = capture.get_task()
task.reset()
capture.delete()
else:
capture.update(status)
else:
2023-03-14 16:36:42 +00:00
capture.update(status)
print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
except ConnectionError:
print(capture.uuid)
capture.update(self, -1)
self.refresh_lacus_status()
time.sleep(self.pending_seconds)
def enqueue_capture(self, task_uuid, priority):
task = crawlers.CrawlerTask(task_uuid)
2023-03-14 16:36:42 +00:00
# print(task)
# task = {
# 'uuid': task_uuid,
# 'url': 'https://foo.be',
# 'domain': 'foo.be',
# 'depth': 1,
# 'har': True,
# 'screenshot': True,
# 'user_agent': crawlers.get_default_user_agent(),
# 'cookiejar': [],
# 'header': '',
# 'proxy': 'force_tor',
# 'parent': 'manual',
# }
2023-03-14 16:36:42 +00:00
url = task.get_url()
force = priority != 0
# TODO timeout
# TODO HEADER
2023-03-14 16:36:42 +00:00
# capture_uuid = self.lacus.enqueue(url='https://cpg.circl.lu:7000',
# force=force,
# general_timeout_in_sec=120)
capture_uuid = self.lacus.enqueue(url=url,
depth=task.get_depth(),
user_agent=task.get_user_agent(),
proxy=task.get_proxy(),
cookies=task.get_cookies(),
force=force,
2023-03-14 16:36:42 +00:00
general_timeout_in_sec=90) # TODO increase timeout if onion ????
crawlers.create_capture(capture_uuid, task_uuid)
print(task.uuid, capture_uuid, 'launched')
return capture_uuid
# CRAWL DOMAIN
def compute(self, capture):
print('saving capture', capture.uuid)
task = capture.get_task()
domain = task.get_domain()
print(domain)
self.domain = Domain(domain)
self.original_domain = Domain(domain)
epoch = int(time.time())
parent_id = task.get_parent()
entries = self.lacus.get_capture(capture.uuid)
2023-06-18 13:09:09 +00:00
print(entries.get('status'))
self.har = task.get_har()
self.screenshot = task.get_screenshot()
2023-03-14 16:36:42 +00:00
# DEBUG
# self.har = True
# self.screenshot = True
self.date = crawlers.get_current_date(separator=True)
self.items_dir = crawlers.get_date_crawled_items_source(self.date)
self.root_item = None
# Save Capture
self.save_capture_response(parent_id, entries)
self.domain.update_daterange(self.date.replace('/', ''))
# Origin + History + tags
if self.root_item:
self.domain.set_last_origin(parent_id)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
self.domain.add_history(epoch, root_item=self.root_item)
if self.domain != self.original_domain:
self.original_domain.update_daterange(self.date.replace('/', ''))
if self.root_item:
self.original_domain.set_last_origin(parent_id)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
self.original_domain.add_history(epoch, root_item=self.root_item)
crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
2023-03-14 16:36:42 +00:00
print('capture:', capture.uuid, 'completed')
print('task: ', task.uuid, 'completed')
print()
task.remove()
def save_capture_response(self, parent_id, entries):
print(entries.keys())
if 'error' in entries:
# TODO IMPROVE ERROR MESSAGE
2023-05-12 13:29:53 +00:00
self.logger.warning(str(entries['error']))
2023-06-18 13:09:09 +00:00
print(entries.get('error'))
if entries.get('html'):
print('retrieved content')
# print(entries.get('html'))
2023-06-18 13:09:09 +00:00
if 'last_redirected_url' in entries and entries.get('last_redirected_url'):
last_url = entries['last_redirected_url']
unpacked_last_url = crawlers.unpack_url(last_url)
current_domain = unpacked_last_url['domain']
2023-03-14 16:36:42 +00:00
# REDIRECTION TODO CHECK IF TYPE CHANGE
if current_domain != self.domain.id and not self.root_item:
2023-05-12 13:29:53 +00:00
self.logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
print(f'External redirection {self.domain.id} -> {current_domain}')
if not self.root_item:
self.domain = Domain(current_domain)
# TODO LAST URL
# FIXME
else:
last_url = f'http://{self.domain.id}'
2023-06-18 13:09:09 +00:00
if 'html' in entries and entries.get('html'):
item_id = crawlers.create_item_id(self.items_dir, self.domain.id)
item = Item(item_id)
print(item.id)
gzip64encoded = crawlers.get_gzipped_b64_item(item.id, entries['html'])
# send item to Global
relay_message = f'crawler {gzip64encoded}'
self.add_message_to_queue(obj=item, message=relay_message, queue='Importers')
# Tag # TODO replace me with metadata to tags
msg = f'infoleak:submission="crawler"' # TODO FIXME
self.add_message_to_queue(obj=item, message=msg, queue='Tags')
# TODO replace me with metadata to add
crawlers.create_item_metadata(item_id, last_url, parent_id)
if self.root_item is None:
self.root_item = item_id
parent_id = item_id
title_content = crawlers.extract_title_from_html(entries['html'])
if title_content:
title = Titles.create_title(title_content)
title.add(item.get_date(), item)
# SCREENSHOT
if self.screenshot:
2023-06-18 13:09:09 +00:00
if 'png' in entries and entries.get('png'):
screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
if screenshot:
if not screenshot.is_tags_safe():
unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item.add_tag(unsafe_tag)
# Remove Placeholder pages # TODO Replace with warning list ???
if screenshot.id not in self.placeholder_screenshots:
# Create Correlations
screenshot.add_correlation('item', '', item_id)
screenshot.add_correlation('domain', '', self.domain.id)
# HAR
if self.har:
2023-06-18 13:09:09 +00:00
if 'har' in entries and entries.get('har'):
har_id = crawlers.create_har_id(self.date, item_id)
crawlers.save_har(har_id, entries['har'])
for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
print(cookie_name)
cookie = CookiesNames.create(cookie_name)
cookie.add(self.date.replace('/', ''), self.domain)
2023-07-06 09:26:32 +00:00
for etag_content in crawlers.extract_etag_from_har(entries['har']):
print(etag_content)
etag = Etags.create(etag_content)
etag.add(self.date.replace('/', ''), self.domain)
crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', ''))
# Next Children
entries_children = entries.get('children')
if entries_children:
for children in entries_children:
self.save_capture_response(parent_id, children)
if __name__ == '__main__':
module = Crawler()
module.debug = True
module.run()