chg: [crawler] add crawler scheduler

This commit is contained in:
Terrtia 2023-03-14 17:36:42 +01:00
parent ae6f8af09f
commit 925d67a35e
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
8 changed files with 864 additions and 286 deletions

View file

@ -5,6 +5,8 @@ import os
import sys import sys
import time import time
from requests.exceptions import ConnectionError
sys.path.append(os.environ['AIL_BIN']) sys.path.append(os.environ['AIL_BIN'])
################################## ##################################
# Import Project packages # Import Project packages
@ -15,6 +17,7 @@ from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain from lib.objects.Domains import Domain
from lib.objects import Screenshots from lib.objects import Screenshots
class Crawler(AbstractModule): class Crawler(AbstractModule):
def __init__(self): def __init__(self):
@ -37,8 +40,11 @@ class Crawler(AbstractModule):
# update captures cache # update captures cache
crawlers.reload_crawler_captures() crawlers.reload_crawler_captures()
self.crawler_scheduler = crawlers.CrawlerScheduler()
# LACUS # LACUS
self.lacus = crawlers.get_lacus() self.lacus = crawlers.get_lacus()
self.is_lacus_up = crawlers.get_lacus().is_up
# Capture # Capture
self.har = None self.har = None
@ -51,44 +57,70 @@ class Crawler(AbstractModule):
# Send module state to logs # Send module state to logs
self.redis_logger.info('Crawler initialized') self.redis_logger.info('Crawler initialized')
def print_crawler_start_info(self, url, domain, domain_url): def refresh_lacus_status(self):
try:
self.is_lacus_up = crawlers.get_lacus().is_up
except:
self.is_lacus_up = False
if not self.is_lacus_up:
print("Can't reach lacus server", int(time.time()))
time.sleep(30)
def print_crawler_start_info(self, url, domain_url):
print() print()
print() print()
print('\033[92m------------------START CRAWLER------------------\033[0m') print('\033[92m------------------START CRAWLER------------------\033[0m')
print(f'crawler type: {domain}') print(f'crawler type: {self.domain}')
print('\033[92m-------------------------------------------------\033[0m') print('\033[92m-------------------------------------------------\033[0m')
print(f'url: {url}') print(f'url: {url}')
print(f'domain: {domain}') print(f'domain: {self.domain}')
print(f'domain_url: {domain_url}') print(f'domain_url: {domain_url}')
print() print()
def get_message(self): def get_message(self):
# Crawler Scheduler
self.crawler_scheduler.update_queue()
self.crawler_scheduler.process_queue()
self.refresh_lacus_status() # TODO LOG ERROR
if not self.is_lacus_up:
return None
# Check if a new Capture can be Launched # Check if a new Capture can be Launched
if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures(): if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
task_row = crawlers.add_task_to_lacus_queue() task_row = crawlers.add_task_to_lacus_queue()
if task_row: if task_row:
print(task_row)
task_uuid, priority = task_row task_uuid, priority = task_row
self.enqueue_capture(task_uuid, priority) try:
self.enqueue_capture(task_uuid, priority)
except ConnectionError:
print(task_row)
task = crawlers.CrawlerTask(task_uuid)
task.add_to_db_crawler_queue(priority)
self.refresh_lacus_status()
return None
# Get CrawlerCapture Object # Get CrawlerCapture Object
capture = crawlers.get_crawler_capture() capture = crawlers.get_crawler_capture()
if capture: if capture:
print(capture.uuid) try:
status = self.lacus.get_capture_status(capture.uuid) status = self.lacus.get_capture_status(capture.uuid)
if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time
capture.update(status) capture.update(status)
print(capture.uuid, status, int(time.time())) print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
else: else:
self.compute(capture) return capture
capture.delete() # TODO DELETE TASK ONLY IF NOT SCHEDULED TASKS
print('capture', capture.uuid, 'completed') except ConnectionError:
print(capture.uuid)
capture.update(-1)
self.refresh_lacus_status()
time.sleep(self.pending_seconds) time.sleep(self.pending_seconds)
def enqueue_capture(self, task_uuid, priority): def enqueue_capture(self, task_uuid, priority):
task = crawlers.CrawlerTask(task_uuid) task = crawlers.CrawlerTask(task_uuid)
print(task) # print(task)
# task = { # task = {
# 'uuid': task_uuid, # 'uuid': task_uuid,
# 'url': 'https://foo.be', # 'url': 'https://foo.be',
@ -102,11 +134,15 @@ class Crawler(AbstractModule):
# 'proxy': 'force_tor', # 'proxy': 'force_tor',
# 'parent': 'manual', # 'parent': 'manual',
# } # }
url = task.get_url() url = task.get_url()
force = priority != 0 force = priority != 0
# TODO timeout # TODO timeout
# TODO HEADER # TODO HEADER
# capture_uuid = self.lacus.enqueue(url='https://cpg.circl.lu:7000',
# force=force,
# general_timeout_in_sec=120)
capture_uuid = self.lacus.enqueue(url=url, capture_uuid = self.lacus.enqueue(url=url,
depth=task.get_depth(), depth=task.get_depth(),
@ -114,14 +150,13 @@ class Crawler(AbstractModule):
proxy=task.get_proxy(), proxy=task.get_proxy(),
cookies=task.get_cookies(), cookies=task.get_cookies(),
force=force, force=force,
general_timeout_in_sec=90) general_timeout_in_sec=90) # TODO increase timeout if onion ????
crawlers.create_capture(capture_uuid, task_uuid) crawlers.create_capture(capture_uuid, task_uuid)
print(task.uuid, capture_uuid, 'launched') print(task.uuid, capture_uuid, 'launched')
return capture_uuid return capture_uuid
# CRAWL DOMAIN # CRAWL DOMAIN
# TODO: CATCH ERRORS
def compute(self, capture): def compute(self, capture):
print('saving capture', capture.uuid) print('saving capture', capture.uuid)
@ -131,7 +166,6 @@ class Crawler(AbstractModule):
self.domain = Domain(domain) self.domain = Domain(domain)
# TODO CHANGE EPOCH
epoch = int(time.time()) epoch = int(time.time())
parent_id = task.get_parent() parent_id = task.get_parent()
@ -139,6 +173,9 @@ class Crawler(AbstractModule):
print(entries['status']) print(entries['status'])
self.har = task.get_har() self.har = task.get_har()
self.screenshot = task.get_screenshot() self.screenshot = task.get_screenshot()
# DEBUG
# self.har = True
# self.screenshot = True
str_date = crawlers.get_current_date(separator=True) str_date = crawlers.get_current_date(separator=True)
self.har_dir = crawlers.get_date_har_dir(str_date) self.har_dir = crawlers.get_date_har_dir(str_date)
self.items_dir = crawlers.get_date_crawled_items_source(str_date) self.items_dir = crawlers.get_date_crawled_items_source(str_date)
@ -156,7 +193,10 @@ class Crawler(AbstractModule):
self.domain.add_history(epoch, root_item=epoch) self.domain.add_history(epoch, root_item=epoch)
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch) crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
task.clear() print('capture:', capture.uuid, 'completed')
print('task: ', task.uuid, 'completed')
print()
task.remove()
def save_capture_response(self, parent_id, entries): def save_capture_response(self, parent_id, entries):
print(entries.keys()) print(entries.keys())
@ -168,12 +208,11 @@ class Crawler(AbstractModule):
print('retrieved content') print('retrieved content')
# print(entries.get('html')) # print(entries.get('html'))
# TODO LOGS IF != domain
if 'last_redirected_url' in entries and entries['last_redirected_url']: if 'last_redirected_url' in entries and entries['last_redirected_url']:
last_url = entries['last_redirected_url'] last_url = entries['last_redirected_url']
unpacked_last_url = crawlers.unpack_url(last_url) unpacked_last_url = crawlers.unpack_url(last_url)
current_domain = unpacked_last_url['domain'] current_domain = unpacked_last_url['domain']
# REDIRECTION TODO CHECK IF WEB # REDIRECTION TODO CHECK IF TYPE CHANGE
if current_domain != self.domain.id and not self.root_item: if current_domain != self.domain.id and not self.root_item:
self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}') self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
print(f'External redirection {self.domain.id} -> {current_domain}') print(f'External redirection {self.domain.id} -> {current_domain}')
@ -225,92 +264,4 @@ class Crawler(AbstractModule):
if __name__ == '__main__': if __name__ == '__main__':
module = Crawler() module = Crawler()
module.debug = True module.debug = True
# module.compute(('ooooo', 0))
module.run() module.run()
##################################
##################################
##################################
##################################
##################################
# def update_auto_crawler():
# current_epoch = int(time.time())
# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
# for elem_to_crawl in list_to_crawl:
# mess, type = elem_to_crawl.rsplit(';', 1)
# redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)
# redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)
# Extract info form url (url, domain, domain url, ...)
# def unpack_url(url):
# to_crawl = {}
# faup.decode(url)
# url_unpack = faup.get()
# to_crawl['domain'] = to_crawl['domain'].lower()
# new_url_host = url_host.lower()
# url_lower_case = url.replace(url_host, new_url_host, 1)
#
# if url_unpack['scheme'] is None:
# to_crawl['scheme'] = 'http'
# url= 'http://{}'.format(url_lower_case)
# else:
# try:
# scheme = url_unpack['scheme'].decode()
# except Exception as e:
# scheme = url_unpack['scheme']
# if scheme in default_proto_map:
# to_crawl['scheme'] = scheme
# url = url_lower_case
# else:
# redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
# to_crawl['scheme'] = 'http'
# url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
#
# if url_unpack['port'] is None:
# to_crawl['port'] = default_proto_map[to_crawl['scheme']]
# else:
# try:
# port = url_unpack['port'].decode()
# except:
# port = url_unpack['port']
# # Verify port number #################### make function to verify/correct port number
# try:
# int(port)
# # Invalid port Number
# except Exception as e:
# port = default_proto_map[to_crawl['scheme']]
# to_crawl['port'] = port
#
# #if url_unpack['query_string'] is None:
# # if to_crawl['port'] == 80:
# # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
# # else:
# # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
# #else:
# # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())
#
# to_crawl['url'] = url
# if to_crawl['port'] == 80:
# to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
# else:
# to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
#
# try:
# to_crawl['tld'] = url_unpack['tld'].decode()
# except:
# to_crawl['tld'] = url_unpack['tld']
#
# return to_crawl
# ##################################################### add ftp ???
# update_auto_crawler()
# # add next auto Crawling in queue:
# if to_crawl['paste'] == 'auto':
# redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
# # update list, last auto crawled domains
# redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
# redis_crawler.ltrim('last_auto_crawled', 0, 9)
#

View file

@ -19,8 +19,9 @@ import uuid
from enum import IntEnum, unique from enum import IntEnum, unique
from datetime import datetime, timedelta from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from urllib.parse import urlparse, urljoin from urllib.parse import urlparse, urljoin
#from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pylacus import PyLacus from pylacus import PyLacus
@ -44,8 +45,6 @@ r_db = config_loader.get_db_conn("Kvrocks_DB")
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler") r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")
r_cache = config_loader.get_redis_conn("Redis_Cache") r_cache = config_loader.get_redis_conn("Redis_Cache")
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes") ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes")
HAR_DIR = config_loader.get_files_directory('har') HAR_DIR = config_loader.get_files_directory('har')
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler") activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
@ -181,8 +180,8 @@ def extract_favicon_from_html(html, url):
################################################################################ ################################################################################
# # TODO: handle prefix cookies # # TODO:
# # TODO: fill empty fields # # TODO: REVIEW ME THEN REMOVE ME
def create_cookie_crawler(cookie_dict, domain, crawler_type='web'): def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
# check cookie domain field # check cookie domain field
if not 'domain' in cookie_dict: if not 'domain' in cookie_dict:
@ -201,13 +200,6 @@ def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z' cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
return cookie_dict return cookie_dict
def load_crawler_cookies(cookiejar_uuid, domain, crawler_type='web'):
cookies = get_cookiejar_cookies_list(cookiejar_uuid)
all_cookies = []
for cookie_dict in cookies:
all_cookies.append(create_cookie_crawler(cookie_dict, domain, crawler_type=crawler_type))
return all_cookies
################################################################################ ################################################################################
################################################################################ ################################################################################
################################################################################ ################################################################################
@ -695,7 +687,285 @@ def load_blacklist():
except Exception as e: except Exception as e:
print(e) print(e)
#### CRAWLER STATE #### #### CRAWLER Scheduler ####
@unique
class ScheduleStatus(IntEnum):
"""The status of the capture"""
UNKNOWN = -1
QUEUED = 0
SCHEDULED = 1
ONGOING = 2
def get_schedulers_uuid():
return r_crawler.smembers('scheduler:schedules')
def get_schedulers_metas():
schedulers = []
for schedule_uuid in get_schedulers_uuid():
schedule = CrawlerSchedule(schedule_uuid)
schedulers.append(schedule.get_meta_status())
return schedulers
class CrawlerScheduler:
def __init__(self):
self.min_frequency = 60 # TODO ADD IN CONFIG
def update_queue(self):
for schedule_uuid in get_schedulers_uuid():
schedule = CrawlerSchedule(schedule_uuid)
# check if already in scheduler queue
if schedule.is_scheduled():
continue
if schedule.is_tasked():
continue
# EXPIRE ????
time_next_run = 0.0
frequency = schedule.get_frequency() # optional or later -> cron
if frequency == 'hourly':
time_next_run = (datetime.now() + timedelta(hours=1)).timestamp()
elif frequency == 'daily':
time_next_run = (datetime.now() + timedelta(days=1)).timestamp()
elif frequency == 'weekly':
time_next_run = (datetime.now() + timedelta(weeks=1)).timestamp()
elif frequency == 'monthly':
time_next_run = (datetime.now() + relativedelta(months=1)).timestamp()
else:
months, weeks, days, hours, minutes = frequency.split(':')
if not months:
months = 0
if not weeks:
weeks = 0
if not days:
days = 0
if not hours:
hours = 0
if not minutes:
minutes = 0
current_time = datetime.now().timestamp()
time_next_run = (datetime.now() + relativedelta(months=int(months), weeks=int(weeks),
days=int(days), hours=int(hours),
minutes=int(minutes))).timestamp()
# Make sure the next capture is not scheduled within too short an interval
interval_next_capture = time_next_run - current_time
if interval_next_capture < self.min_frequency:
# self.logger.warning(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
print(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
time_next_run = (datetime.now() + timedelta(seconds=self.min_frequency)).timestamp()
schedule.set_next_run(time_next_run)
print('scheduled:', schedule_uuid)
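For reference, a minimal standalone sketch (not part of the commit) of how the custom 'months:weeks:days:hours:minutes' string handled above maps to a next-run timestamp; empty fields count as zero, exactly as in update_queue():

from datetime import datetime
from dateutil.relativedelta import relativedelta

def next_run_from_frequency(frequency: str) -> float:
    # e.g. '0:1:2:0:30' -> now + 1 week, 2 days and 30 minutes
    months, weeks, days, hours, minutes = (int(x) if x else 0 for x in frequency.split(':'))
    return (datetime.now() + relativedelta(months=months, weeks=weeks, days=days,
                                           hours=hours, minutes=minutes)).timestamp()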
def process_queue(self):
now = datetime.now().timestamp()
for raw_schedule in r_crawler.zrangebyscore('scheduler:queue', '-inf', int(now), withscores=True):
schedule_uuid, next_run = raw_schedule
schedule = CrawlerSchedule(schedule_uuid)
if not schedule.exists():
continue
meta = schedule.get_meta()
task_uuid = create_task(meta['url'], depth=meta['depth'], har=meta['har'], screenshot=meta['screenshot'],
header=meta['header'],
cookiejar=meta['cookiejar'], proxy=meta['proxy'],
user_agent=meta['user_agent'], parent='scheduler', priority=40)
if task_uuid:
schedule.set_task(task_uuid)
r_crawler.zrem('scheduler:queue', schedule_uuid)
# TODO Expire -> stuck in crawler queue or reached delta
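To make the storage model explicit, a minimal inspection sketch (not part of the commit; it assumes a redis client pointed at the Kvrocks_Crawler database, and the key names are taken from the code above, everything else is illustrative):

import redis

r_crawler = redis.Redis(decode_responses=True)  # assumption: local default instance
for schedule_uuid in r_crawler.smembers('scheduler:schedules'):
    next_run = r_crawler.zscore('scheduler:queue', schedule_uuid)  # None until update_queue() queued it
    meta = r_crawler.hgetall(f'schedule:{schedule_uuid}')          # url, frequency, user, task, ...
    print(schedule_uuid, meta.get('url'), meta.get('frequency'), next_run)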
class CrawlerSchedule:
def __init__(self, schedule_uuid):
self.uuid = schedule_uuid
def exists(self):
return r_crawler.exists(f'schedule:{self.uuid}')
def get_frequency(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'frequency')
def get_user(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'user')
def get_date(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'date')
def get_captures(self): # only scheduled capture ????? exclude manual/discovery
pass
def get_status(self):
if self.is_scheduled():
return ScheduleStatus.SCHEDULED
if self.is_tasked():
if self.is_ongoing():
return ScheduleStatus.ONGOING
else:
return ScheduleStatus.QUEUED
return ScheduleStatus.UNKNOWN
def get_task_uuid(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'task')
def is_tasked(self):
task_uuid = self.get_task_uuid()
if task_uuid:
task = CrawlerTask(task_uuid)
tasked = task.exists()
if not tasked:
r_crawler.hdel(f'schedule:{self.uuid}', 'task')
return tasked
return False
def get_task(self):
task_uuid = self.get_task_uuid()
if task_uuid:
return CrawlerTask(task_uuid)
def set_task(self, task_uuid):
return r_crawler.hset(f'schedule:{self.uuid}', 'task', task_uuid)
def is_ongoing(self):
task = self.get_task()
if task:
return task.is_ongoing()
return False
def get_next_run(self, r_str=False):
next_run = r_crawler.zscore('scheduler:queue', self.uuid)
if next_run and r_str:
next_run = time.strftime('%Y-%m-%d - %H:%M:%S', time.localtime(int(next_run)))
return next_run
def set_next_run(self, time_next_run):
r_crawler.zadd('scheduler:queue', mapping={self.uuid: time_next_run})
def is_scheduled(self):
return bool(r_crawler.zscore('scheduler:queue', self.uuid))
def get_url(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'url')
def get_depth(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'depth')
def get_har(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'har') == 'True'
def get_screenshot(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'screenshot') == 'True'
def get_header(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'header')
def get_cookiejar(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'cookiejar')
def get_proxy(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'proxy')
def get_user_agent(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'user_agent')
def _set_field(self, field, value):
return r_crawler.hset(f'schedule:{self.uuid}', field, value)
def get_meta(self, ui=False):
meta = {
'uuid': self.uuid,
'date': self.get_date(),
'frequency': self.get_frequency(),
'user': self.get_user(),
'url': self.get_url(),
'depth': self.get_depth(),
'har': self.get_har(),
'screenshot': self.get_screenshot(),
'user_agent': self.get_user_agent(),
'cookiejar': self.get_cookiejar(),
'header': self.get_header(),
'proxy': self.get_proxy(),
}
status = self.get_status()
if ui:
status = status.name
r_str = True
else:
r_str = False
meta['status'] = status
meta['next_run'] = self.get_next_run(r_str=r_str)
return meta
def get_meta_status(self): # TODO: Description ? Frequency ???
meta = {'uuid': self.uuid,
'url': self.get_url(),
'user': self.get_user(),
'next_run': self.get_next_run(r_str=True)}
status = self.get_status()
if isinstance(status, ScheduleStatus):
status = status.name
meta['status'] = status
return meta
def create(self, frequency, user, url,
depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
if self.exists():
raise Exception('Error: Schedule already exists')
url_decoded = unpack_url(url)
url = url_decoded['url']
self._set_field('date', datetime.now().strftime("%Y-%m-%d"))
self._set_field('frequency', frequency)
self._set_field('user', user)
self._set_field('url', url)
self._set_field('depth', int(depth))
self._set_field('har', str(har))
self._set_field('screenshot', str(screenshot))
if cookiejar:
self._set_field('cookiejar', cookiejar)
if header:
self._set_field('header', header)
if proxy:
if proxy == 'web':
proxy = None
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
proxy = 'force_tor'
self._set_field('proxy', proxy)
if user_agent:
self._set_field('user_agent', user_agent)
r_crawler.sadd('scheduler:schedules', self.uuid)
def delete(self):
# remove from schedule queue
r_crawler.zrem('scheduler:queue', self.uuid)
# delete task
task = self.get_task()
if task:
task.delete()
# delete meta
r_crawler.delete(f'schedule:{self.uuid}')
r_crawler.srem('scheduler:schedules', self.uuid)
def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
schedule_uuid = gen_uuid()
schedule = CrawlerSchedule(schedule_uuid)
schedule.create(frequency, user, url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar, proxy=proxy, user_agent=user_agent)
return schedule_uuid
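A minimal usage sketch (assumptions: this module is importable as lib.crawlers and the configured Kvrocks/Redis backends are reachable; the user and URL are placeholders):

from lib import crawlers

schedule_uuid = crawlers.create_schedule('daily', 'user@example.org', 'http://example.onion')
schedule = crawlers.CrawlerSchedule(schedule_uuid)
print(schedule.get_meta(ui=True))  # status as a name; next_run is filled once the Crawler module schedules it

The running Crawler module then picks the schedule up via CrawlerScheduler.update_queue() and process_queue(), which turn it into a CrawlerTask with priority 40.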
# TODO sanitize UUID
def api_delete_schedule(data):
schedule_uuid = data.get('uuid')
schedule = CrawlerSchedule(schedule_uuid)
if not schedule.exists():
return {'error': 'unknown schedule uuid', 'uuid': schedule_uuid}, 404
return schedule.delete(), 200
#### CRAWLER CAPTURE #### #### CRAWLER CAPTURE ####
@ -709,7 +979,15 @@ def reload_crawler_captures():
r_cache.delete('crawler:captures') r_cache.delete('crawler:captures')
for capture_uuid in get_crawler_captures(): for capture_uuid in get_crawler_captures():
capture = CrawlerCapture(capture_uuid) capture = CrawlerCapture(capture_uuid)
r_cache.zadd('crawler:captures', {capture.uuid: 0}) capture.update(None)
def _clear_captures():
for capture_uuid in get_crawler_captures():
capture = CrawlerCapture(capture_uuid)
task = capture.get_task()
task.delete()
capture.delete()
print(capture_uuid, 'deleted')
@unique @unique
class CaptureStatus(IntEnum): class CaptureStatus(IntEnum):
@ -741,6 +1019,9 @@ class CrawlerCapture:
def get_status(self): def get_status(self):
return r_cache.hget(f'crawler:capture:{self.uuid}', 'status') return r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
def is_ongoing(self):
return self.get_status() == CaptureStatus.ONGOING
def create(self, task_uuid): def create(self, task_uuid):
if self.exists(): if self.exists():
raise Exception(f'Error: Capture {self.uuid} already exists') raise Exception(f'Error: Capture {self.uuid} already exists')
@ -752,20 +1033,26 @@ class CrawlerCapture:
r_cache.zadd('crawler:captures', {self.uuid: launch_time}) r_cache.zadd('crawler:captures', {self.uuid: launch_time})
def update(self, status): def update(self, status):
last_check = int(time.time()) # Error or Reload
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status) if not status:
r_cache.zadd('crawler:captures', {self.uuid: last_check}) r_cache.hset(f'crawler:capture:{self.uuid}', 'status', CaptureStatus.UNKNOWN)
r_cache.zadd('crawler:captures', {self.uuid: 0})
else:
last_check = int(time.time())
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
r_cache.zadd('crawler:captures', {self.uuid: last_check})
def remove(self): # TODO INCOMPLETE # Crawler
def remove(self):
r_crawler.zrem('crawler:captures', self.uuid) r_crawler.zrem('crawler:captures', self.uuid)
r_cache.delete(f'crawler:capture:{self.uuid}')
r_crawler.hdel('crawler:captures:tasks', self.uuid) r_crawler.hdel('crawler:captures:tasks', self.uuid)
# TODO # Manual
# TODO DELETE TASK ???
def delete(self): def delete(self):
# task = self.get_task() # remove Capture from crawler queue
# task.delete() r_cache.zrem('crawler:captures', self.uuid)
r_cache.delete(f'crawler:capture:{self.uuid}') self.remove()
def create_capture(capture_uuid, task_uuid): def create_capture(capture_uuid, task_uuid):
@ -792,9 +1079,13 @@ def get_captures_status():
'uuid': task.uuid, 'uuid': task.uuid,
'domain': dom.get_id(), 'domain': dom.get_id(),
'type': dom.get_domain_type(), 'type': dom.get_domain_type(),
'start_time': capture.get_start_time(), ############### TODO 'start_time': capture.get_start_time(),
'status': capture.get_status(), 'status': capture.get_status(),
} }
capture_status = capture.get_status()
if capture_status:
capture_status = CaptureStatus(int(capture_status)).name
meta['status'] = capture_status
status.append(meta) status.append(meta)
return status return status
@ -872,6 +1163,12 @@ class CrawlerTask:
def get_capture(self): def get_capture(self):
return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture') return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture')
def is_ongoing(self):
capture_uuid = self.get_capture()
if capture_uuid:
return CrawlerCapture(capture_uuid).is_ongoing()
return False
def _set_field(self, field, value): def _set_field(self, field, value):
return r_crawler.hset(f'crawler:task:{self.uuid}', field, value) return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)
@ -923,8 +1220,6 @@ class CrawlerTask:
proxy = None proxy = None
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion': elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
proxy = 'force_tor' proxy = 'force_tor'
if not user_agent:
user_agent = get_default_user_agent()
# TODO SANITIZE COOKIEJAR -> UUID # TODO SANITIZE COOKIEJAR -> UUID
@ -934,13 +1229,11 @@ class CrawlerTask:
self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query) self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query)
return self.uuid return self.uuid
# TODO ADD TASK STATUS -----
self._set_field('domain', domain) self._set_field('domain', domain)
self._set_field('url', url) self._set_field('url', url)
self._set_field('depth', int(depth)) self._set_field('depth', int(depth))
self._set_field('har', har) self._set_field('har', har)
self._set_field('screenshot', screenshot) self._set_field('screenshot', screenshot)
self._set_field('user_agent', user_agent)
self._set_field('parent', parent) self._set_field('parent', parent)
if cookiejar: if cookiejar:
@ -949,30 +1242,45 @@ class CrawlerTask:
self._set_field('header', header) self._set_field('header', header)
if proxy: if proxy:
self._set_field('proxy', proxy) self._set_field('proxy', proxy)
if user_agent:
self._set_field('user_agent', user_agent)
r_crawler.hset('crawler:queue:hash', hash_query, self.uuid) r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
self._set_field('hash', hash_query) self._set_field('hash', hash_query)
r_crawler.zadd('crawler:queue', {self.uuid: priority}) r_crawler.zadd('crawler:queue', {self.uuid: priority})
self.add_to_db_crawler_queue(priority)
# UI # UI
domain_type = dom.get_domain_type() domain_type = dom.get_domain_type()
r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid) r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
self._set_field('queue', domain_type) self._set_field('queue', domain_type)
return self.uuid return self.uuid
def lacus_queue(self): def add_to_db_crawler_queue(self, priority):
r_crawler.sadd('crawler:queue:queued', self.uuid) r_crawler.zadd('crawler:queue', {self.uuid: priority})
def start(self):
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
def clear(self): # Crawler
r_crawler.hdel('crawler:queue:hash', self.get_hash()) def remove(self): # zrem cache + DB
r_crawler.srem(f'crawler:queue:type:{self.get_queue()}', self.uuid) capture_uuid = self.get_capture()
r_crawler.srem('crawler:queue:queued', self.uuid) if capture_uuid:
capture = CrawlerCapture(capture_uuid)
def delete(self): capture.remove()
self.clear() queue_type = self.get_queue()
if queue_type:
r_crawler.srem(f'crawler:queue:type:{queue_type}', self.uuid)
task_hash = self.get_hash()
if task_hash:
r_crawler.hdel('crawler:queue:hash', task_hash)
# meta
r_crawler.delete(f'crawler:task:{self.uuid}') r_crawler.delete(f'crawler:task:{self.uuid}')
# r_crawler.zadd('crawler:queue', {self.uuid: priority})
# Manual
def delete(self):
# queue
r_crawler.zrem('crawler:queue', self.uuid)
self.remove()
# TODO move to class ??? # TODO move to class ???
@ -990,7 +1298,7 @@ def add_task_to_lacus_queue():
return None return None
task_uuid, priority = task_uuid[0] task_uuid, priority = task_uuid[0]
task = CrawlerTask(task_uuid) task = CrawlerTask(task_uuid)
task.lacus_queue() task.start()
return task.uuid, priority return task.uuid, priority
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100 # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
@ -1006,29 +1314,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
proxy=proxy, user_agent=user_agent, parent=parent, priority=priority) proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
return task_uuid return task_uuid
######################################################################
######################################################################
# def get_task_status(task_uuid): ## -- CRAWLER TASK -- ##
# domain = r_crawler.hget(f'crawler:task:{task_uuid}', 'domain')
# dom = Domain(domain)
# meta = {
# 'uuid': task_uuid,
# 'domain': dom.get_id(),
# 'domain_type': dom.get_domain_type(),
# 'start_time': r_crawler.hget(f'crawler:task:{task_uuid}', 'start_time'),
# 'status': 'test',
# }
# return meta
# def get_crawlers_tasks_status():
# tasks_status = []
# tasks = r_crawler.smembers('crawler:queue:queued')
# for task_uuid in tasks:
# tasks_status.append(get_task_status(task_uuid))
# return tasks_status
##-- CRAWLER TASK --##
#### CRAWLER TASK API #### #### CRAWLER TASK API ####
@ -1071,13 +1358,25 @@ def api_add_crawler_task(data, user_id=None):
return {'error': 'The access to this cookiejar is restricted'}, 403 return {'error': 'The access to this cookiejar is restricted'}, 403
cookiejar_uuid = cookiejar.uuid cookiejar_uuid = cookiejar.uuid
# if auto_crawler: frequency = data.get('frequency', None)
# try: if frequency:
# crawler_delta = int(crawler_delta) if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
# if crawler_delta < 0: if not isinstance(frequency, dict):
# return ({'error':'invalid delta between two pass of the crawler'}, 400) return {'error': 'Invalid frequency'}, 400
# except ValueError: else:
# return ({'error':'invalid delta between two pass of the crawler'}, 400) try:
months = int(frequency.get('months', 0))
weeks = int(frequency.get('weeks', 0))
days = int(frequency.get('days', 0))
hours = int(frequency.get('hours', 0))
minutes = int(frequency.get('minutes', 0))
except (TypeError, ValueError):
return {'error': 'Invalid frequency'}, 400
if min(months, weeks, days, hours, minutes) < 0:
return {'error': 'Invalid frequency'}, 400
if max(months, weeks, days, hours, minutes) <= 0:
return {'error': 'Invalid frequency'}, 400
frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
# PROXY # PROXY
proxy = data.get('proxy', None) proxy = data.get('proxy', None)
@ -1088,15 +1387,16 @@ def api_add_crawler_task(data, user_id=None):
if verify[1] != 200: if verify[1] != 200:
return verify return verify
# TODO ############################################################################################################# if frequency:
# auto_crawler = auto_crawler # TODO verify user
# crawler_delta = crawler_delta return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
parent = 'manual' cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None), 200
else:
# TODO HEADERS # TODO HEADERS
# TODO USER AGENT # TODO USER AGENT
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, cookiejar=cookiejar_uuid, return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
proxy=proxy, user_agent=None, parent='manual', priority=90), 200 cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None,
parent='manual', priority=90), 200
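An illustrative payload for api_add_crawler_task (all values are placeholders): 'frequency' accepts either a keyword or a custom dict, which is normalised to the 'months:weeks:days:hours:minutes' string used by the scheduler; with a frequency the request creates a schedule, without one it creates a one-shot manual task with priority 90.

data = {
    'url': 'http://example.onion',
    'depth': 1,
    'har': True,
    'screenshot': True,
    # 'hourly' / 'daily' / 'weekly' / 'monthly', or a custom dict as below
    'frequency': {'months': 0, 'weeks': 1, 'days': 0, 'hours': 0, 'minutes': 0},
}
# res, status_code = crawlers.api_add_crawler_task(data, user_id='user@example.org')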
#### #### #### ####
@ -1108,13 +1408,6 @@ def api_add_crawler_task(data, user_id=None):
################################################################################### ###################################################################################
#### CRAWLER GLOBAL #### #### CRAWLER GLOBAL ####
# TODO: # FIXME: config db, dynamic load # TODO: # FIXME: config db, dynamic load
@ -1124,55 +1417,8 @@ def is_crawler_activated():
def get_crawler_all_types(): def get_crawler_all_types():
return ['onion', 'web'] return ['onion', 'web']
def sanitize_crawler_types(l_crawler_types):
all_crawler_types = get_crawler_all_types()
if not l_crawler_types:
return all_crawler_types
for crawler_type in l_crawler_types:
if crawler_type not in all_crawler_types:
return all_crawler_types
return l_crawler_types
##-- CRAWLER GLOBAL --## ##-- CRAWLER GLOBAL --##
#### AUTOMATIC CRAWLER ####
def get_auto_crawler_all_domain(l_crawler_types=[]):
l_crawler_types = sanitize_crawler_types(l_crawler_types)
if len(l_crawler_types) == 1:
return r_serv_onion.smembers(f'auto_crawler_url:{l_crawler_types[0]}')
else:
l_keys_name = []
for crawler_type in l_crawler_types:
l_keys_name.append(f'auto_crawler_url:{crawler_type}')
return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:])
def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
r_serv_onion.zadd('crawler_auto_queue', {f'{message};{domain_type}': int(time.time() + delta)})
# update list, last auto crawled domains
r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}')
r_serv_onion.ltrim('last_auto_crawled', 0, 9)
def update_auto_crawler_queue():
current_epoch = int(time.time())
# check if current_epoch > domain_next_epoch
l_queue = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, current_epoch)
for elem in l_queue:
mess, domain_type = elem.rsplit(';', 1)
print(domain_type)
print(mess)
r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
##-- AUTOMATIC CRAWLER --##
#### CRAWLER TASK ####
##-- CRAWLER TASK --##
#### #### #### ####
@ -1207,6 +1453,8 @@ def save_har(har_dir, item_id, har_content):
# # # #
# # # # # # # # # # # # # # # # # # # # # # # #
#### PROXY ####
def api_verify_proxy(proxy_url): def api_verify_proxy(proxy_url):
parsed_proxy = urlparse(proxy_url) parsed_proxy = urlparse(proxy_url)
if parsed_proxy.scheme and parsed_proxy.hostname and parsed_proxy.port: if parsed_proxy.scheme and parsed_proxy.hostname and parsed_proxy.port:
@ -1237,13 +1485,7 @@ class CrawlerProxy:
def get_url(self): def get_url(self):
return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url') return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
############################################################################################### #### CRAWLER LACUS ####
###############################################################################################
###############################################################################################
###############################################################################################
# # # # CRAWLER LACUS # # # #
def get_lacus_url(): def get_lacus_url():
return r_db.hget('crawler:lacus', 'url') return r_db.hget('crawler:lacus', 'url')
@ -1363,12 +1605,7 @@ def api_set_crawler_max_captures(data):
save_nb_max_captures(nb_captures) save_nb_max_captures(nb_captures)
return nb_captures, 200 return nb_captures, 200
## PROXY ## ## TEST ##
# TODO SAVE PROXY URL + ADD PROXY TESTS
# -> name + url
## PROXY ##
def is_test_ail_crawlers_successful(): def is_test_ail_crawlers_successful():
return r_db.hget('crawler:tor:test', 'success') == 'True' return r_db.hget('crawler:tor:test', 'success') == 'True'
@ -1380,7 +1617,6 @@ def save_test_ail_crawlers_result(test_success, message):
r_db.hset('crawler:tor:test', 'success', str(test_success)) r_db.hset('crawler:tor:test', 'success', str(test_success))
r_db.hset('crawler:tor:test', 'message', message) r_db.hset('crawler:tor:test', 'message', message)
# TODO CREATE TEST TASK
def test_ail_crawlers(): def test_ail_crawlers():
# # TODO: test web domain # # TODO: test web domain
if not ping_lacus(): if not ping_lacus():
@ -1431,10 +1667,11 @@ def test_ail_crawlers():
#### ---- #### #### ---- ####
# TODO MOVE ME # TODO MOVE ME IN CRAWLER OR FLASK
load_blacklist() load_blacklist()
# if __name__ == '__main__': # if __name__ == '__main__':
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd') # task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
# print(task.get_meta()) # print(task.get_meta())
# _clear_captures()

View file

@ -61,9 +61,9 @@ def create_json_response(data, status_code):
def crawlers_dashboard(): def crawlers_dashboard():
is_manager_connected = crawlers.get_lacus_connection_metadata() is_manager_connected = crawlers.get_lacus_connection_metadata()
crawlers_status = crawlers.get_captures_status() crawlers_status = crawlers.get_captures_status()
print(crawlers_status) # print(crawlers_status)
crawlers_latest_stats = crawlers.get_crawlers_stats() crawlers_latest_stats = crawlers.get_crawlers_stats()
print(crawlers_latest_stats) # print(crawlers_latest_stats)
date = crawlers.get_current_date() date = crawlers.get_current_date()
return render_template("dashboard_crawler.html", date=date, return render_template("dashboard_crawler.html", date=date,
is_manager_connected=is_manager_connected, is_manager_connected=is_manager_connected,
@ -77,6 +77,7 @@ def crawlers_dashboard():
def crawler_dashboard_json(): def crawler_dashboard_json():
crawlers_status = crawlers.get_captures_status() crawlers_status = crawlers.get_captures_status()
crawlers_latest_stats = crawlers.get_crawlers_stats() crawlers_latest_stats = crawlers.get_crawlers_stats()
# print(crawlers_status)
return jsonify({'crawlers_status': crawlers_status, return jsonify({'crawlers_status': crawlers_status,
'stats': crawlers_latest_stats}) 'stats': crawlers_latest_stats})
@ -106,13 +107,24 @@ def send_to_spider():
# POST val # POST val
url = request.form.get('url_to_crawl') url = request.form.get('url_to_crawl')
crawler_type = request.form.get('crawler_queue_type') crawler_type = request.form.get('crawler_queue_type')
auto_crawler = request.form.get('crawler_type') # TODO Auto Crawler
crawler_delta = request.form.get('crawler_epoch') # TODO Auto Crawler
screenshot = request.form.get('screenshot') screenshot = request.form.get('screenshot')
har = request.form.get('har') har = request.form.get('har')
depth_limit = request.form.get('depth_limit') depth_limit = request.form.get('depth_limit')
cookiejar_uuid = request.form.get('cookiejar') cookiejar_uuid = request.form.get('cookiejar')
# Frequency
if request.form.get('crawler_scheduler'):
frequency = request.form.get('frequency')
if frequency == 'custom':
months = request.form.get('frequency_months', 0)
weeks = request.form.get('frequency_weeks', 0)
days = request.form.get('frequency_days', 0)
hours = request.form.get('frequency_hours', 0)
minutes = request.form.get('frequency_minutes', 0)
frequency = {'months': months, 'weeks': weeks, 'days': days, 'hours': hours, 'minutes': minutes}
else:
frequency = None
# PROXY # PROXY
proxy = request.form.get('proxy_name') proxy = request.form.get('proxy_name')
if proxy: if proxy:
@ -129,7 +141,7 @@ def send_to_spider():
cookiejar_uuid = cookiejar_uuid.rsplit(':') cookiejar_uuid = cookiejar_uuid.rsplit(':')
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '') cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot} data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
if proxy: if proxy:
data['proxy'] = proxy data['proxy'] = proxy
if cookiejar_uuid: if cookiejar_uuid:
@ -142,6 +154,43 @@ def send_to_spider():
return redirect(url_for('crawler_splash.manual')) return redirect(url_for('crawler_splash.manual'))
@crawler_splash.route("/crawlers/scheduler", methods=['GET'])
@login_required
@login_read_only
def scheduler_dashboard():
schedulers = crawlers.get_schedulers_metas()
# print(schedulers)
# TODO list currently queued ?
return render_template("crawler_scheduler_dashboard.html",
schedulers=schedulers,
is_manager_connected=crawlers.get_lacus_connection_metadata())
@crawler_splash.route("/crawlers/schedule", methods=['GET'])
@login_required
@login_read_only
def schedule_show():
schedule_uuid = request.args.get('uuid')
schedule = crawlers.CrawlerSchedule(schedule_uuid)
if not schedule.exists():
abort(404)
meta = schedule.get_meta(ui=True)
return render_template("crawler_schedule_uuid.html",
meta=meta)
@crawler_splash.route("/crawlers/schedule/delete", methods=['GET'])
@login_required
@login_analyst
def schedule_delete():
schedule_uuid = request.args.get('uuid')
schedule = crawlers.CrawlerSchedule(schedule_uuid)
if not schedule.exists():
abort(404)
res = crawlers.api_delete_schedule({'uuid': schedule_uuid})
if res[1] != 200:
return create_json_response(res[0], res[1])
return redirect(url_for('crawler_splash.scheduler_dashboard'))
@crawler_splash.route("/crawlers/last/domains", methods=['GET']) @crawler_splash.route("/crawlers/last/domains", methods=['GET'])
@login_required @login_required
@login_read_only @login_read_only
@ -228,11 +277,11 @@ def showDomain():
dict_domain['epoch'] = curr_epoch dict_domain['epoch'] = curr_epoch
dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch)) dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch))
print(dict_domain['epoch']) # print(dict_domain['epoch'])
dict_domain['crawler_history_items'] = [] dict_domain['crawler_history_items'] = []
for item_id in domain.get_crawled_items_by_epoch(epoch): for item_id in domain.get_crawled_items_by_epoch(epoch):
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options=['crawler'])) dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options={'crawler'}))
if dict_domain['crawler_history_items']: if dict_domain['crawler_history_items']:
dict_domain['random_item'] = random.choice(dict_domain['crawler_history_items']) dict_domain['random_item'] = random.choice(dict_domain['crawler_history_items'])
@ -521,7 +570,7 @@ def crawler_cookiejar_show():
@crawler_splash.route('/crawler/cookie/delete', methods=['GET']) @crawler_splash.route('/crawler/cookie/delete', methods=['GET'])
@login_required @login_required
@login_read_only @login_analyst
def crawler_cookiejar_cookie_delete(): def crawler_cookiejar_cookie_delete():
user_id = current_user.get_id() user_id = current_user.get_id()
cookie_uuid = request.args.get('uuid') cookie_uuid = request.args.get('uuid')
@ -536,7 +585,7 @@ def crawler_cookiejar_cookie_delete():
@crawler_splash.route('/crawler/cookiejar/delete', methods=['GET']) @crawler_splash.route('/crawler/cookiejar/delete', methods=['GET'])
@login_required @login_required
@login_read_only @login_analyst
def crawler_cookiejar_delete(): def crawler_cookiejar_delete():
user_id = current_user.get_id() user_id = current_user.get_id()
cookiejar_uuid = request.args.get('uuid') cookiejar_uuid = request.args.get('uuid')
@ -699,7 +748,7 @@ def crawler_lacus_settings_crawler_manager():
api_key = request.form.get('api_key') api_key = request.form.get('api_key')
res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key}) res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key})
print(res) # print(res)
if res[1] != 200: if res[1] != 200:
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1] return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
else: else:

View file

@ -66,21 +66,59 @@
<div class="d-flex mt-3"> <div class="d-flex mt-3">
<i class="fas fa-user-ninja mt-1"></i> &nbsp;Manual&nbsp;&nbsp; <i class="fas fa-user-ninja mt-1"></i> &nbsp;Manual&nbsp;&nbsp;
<div class="custom-control custom-switch"> <div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type"> <input class="custom-control-input" type="checkbox" name="crawler_scheduler" value="True" id="crawler_scheduler">
<label class="custom-control-label" for="crawler_type"> <label class="custom-control-label" for="crawler_scheduler">
<i class="fas fa-clock"></i> &nbsp;Automatic <i class="fas fa-clock"></i> &nbsp;Scheduler
</label> </label>
</div> </div>
</div> </div>
<div class="input-group mt-2 mb-2" id="crawler_epoch_input">
<div class="input-group-prepend"> <div id="frequency_inputs">
<span class="input-group-text bg-light"><i class="fas fa-clock"></i>&nbsp;</span> <div class="mb-4">
</div> <select class="custom-select" id="frequency" name="frequency" onchange="frequency_selector_update(this);">
<input class="form-control" type="number" id="crawler_epoch" value="3600" min="1" name="crawler_epoch" required> <option value="hourly">Hourly</option>
<div class="input-group-append"> <option value="daily">Daily</option>
<span class="input-group-text">Time (seconds) between each crawling</span> <option value="weekly">Weekly</option>
</div> <option value="monthly">Monthly</option>
</div> <option value="custom">Custom</option>
</select>
</div>
<div id="custom_frequency">
<h5><i class="fas fa-clock"></i>&nbsp;Adjust crawling interval as needed</h5>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Months</b></span>
</div>
<input class="form-control" type="number" id="frequency_months" value="0" min="0" name="frequency_months" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Weeks</b></span>
</div>
<input class="form-control" type="number" id="frequency_weeks" value="0" min="0" name="frequency_weeks" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Days</b></span>
</div>
<input class="form-control" type="number" id="frequency_days" value="0" min="0" name="frequency_days" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Hours</b></span>
</div>
<input class="form-control" type="number" id="frequency_hours" value="0" min="0" name="frequency_hours" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Minutes</b></span>
</div>
<input class="form-control" type="number" id="frequency_minutes" value="0" min="0" name="frequency_minutes" required>
</div>
</div>
</div>
</div> </div>
@ -165,8 +203,9 @@ $(document).ready(function(){
$("#nav_manual_crawler").addClass("active"); $("#nav_manual_crawler").addClass("active");
queue_type_selector_input_controler() queue_type_selector_input_controler()
manual_crawler_input_controler(); manual_crawler_input_controler();
$("#custom_frequency").hide();
$('#crawler_type').on("change", function () { $('#crawler_scheduler').on("change", function () {
manual_crawler_input_controler(); manual_crawler_input_controler();
}); });
@ -190,10 +229,18 @@ function toggle_sidebar(){
} }
function manual_crawler_input_controler() { function manual_crawler_input_controler() {
if($('#crawler_type').is(':checked')){ if($('#crawler_scheduler').is(':checked')){
$("#crawler_epoch_input").show(); $("#frequency_inputs").show();
}else{ }else{
$("#crawler_epoch_input").hide(); $("#frequency_inputs").hide();
}
}
function frequency_selector_update(obj) {
if(obj.value === "custom") {
$("#custom_frequency").show();
}else{
$("#custom_frequency").hide();
} }
} }

View file

@ -0,0 +1,199 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
<div class="card my-1">
<div class="card-header bg-dark text-white">
<h4 class="card-title"><b>{{ meta['url'] }}</b></h4>
</div>
<div class="card-body">
<div class="row">
<div class="col-lg-8">
<table class="table table-hover">
<tbody>
<tr>
<th>UUID</th>
<td>{{ meta['uuid'] }}</td>
</tr>
<tr>
<th>Url</th>
<td>{{ meta['url'] }}</td>
</tr>
<tr>
<th>Frequency</th>
<td>{{ meta['frequency'] }}</td>
</tr>
<tr>
<th>Creator</th>
<td>{{ meta['user'] }}</td>
</tr>
<tr>
<th>Date</th>
<td>{{ meta['date'] }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ meta['status'] }}</td>
</tr>
<tr>
<th>Next Run</th>
<td>
{% if not meta['next_run'] %}
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
{% else %}
{{ meta['next_run'] }}
{% endif %}
</td>
</tr>
</tbody>
</table>
<h4>Config:</h4>
<table class="table table-hover">
<tbody>
<tr>
<th><i class="fas fa-water"></i> Depth</th>
<td>{{ meta['depth'] }}</td>
</tr>
<tr>
<th><i class="fas fa-image"></i> Screenshot</th>
<td>
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" id="html_content_id" {% if meta['screenshot'] %}checked{% endif %} disabled>
<label class="custom-control-label" for="html_content_id">
<i class="fas fa-image"></i> &nbsp;
</label>
</div>
</td>
</tr>
<tr>
<th><i class="fas fa-file"></i> &nbsp;Har</th>
<td>
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" id="html_content_id" {% if meta['har'] %}checked{% endif %} disabled>
<label class="custom-control-label" for="html_content_id">
<i class="fas fa-file"></i> &nbsp;
</label>
</div>
</td>
</tr>
<tr>
<th><i class="fas fa-cookie-bite"></i> Cookiejar</th>
<td>
{% if not meta['cookiejar'] %}
-
{% else %}
<a href="{{ url_for('crawler_splash.crawler_cookiejar_show') }}?uuid={{meta['cookiejar']}}">
{{ meta['cookiejar'] }}
</a>
{% endif %}
</td>
</tr>
<tr>
<th>Proxy</th>
<td>
{% if not meta['proxy'] %}
-
{% else %}
{{ meta['proxy'] }}
{% endif %}
</td>
</tr>
<tr>
<th>User Agent</th>
<td>
{% if meta['user_agent'] %}
{{ meta['user_agent'] }}
{% else %}
Default
{% endif %}
</td>
</tr>
{% if meta['header'] %}
<tr>
<th>header</th>
<td>{{ meta['header'] }}</td>
</tr>
{% endif %}
</tbody>
</table>
</div>
<div class="col-lg-4">
<div>
<a href="{{ url_for('crawler_splash.schedule_delete') }}?uuid={{meta['uuid']}}">
<button type="button" class="btn btn-danger">
<i class="fas fa-trash-alt"></i> <b>Delete</b>
</button>
</a>
{# <a href="{{ url_for('investigations_b.edit_investigation') }}?uuid={{meta['uuid']}}">#}
{# <button type="button" class="btn btn-info">#}
{# <i class="fas fa-pencil-alt"></i> <b>Edit</b>#}
{# </button>#}
{# </a>#}
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
<script>
var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_monitoring_crawler").addClass("active");
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -0,0 +1,94 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/dataTables.bootstrap.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/dataTables.bootstrap.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
{% include 'crawler/crawler_disabled.html' %}
<h1>Schedulers:</h1>
<table class="table mt-1 table-hover table-borderless table-striped" id="table_scheduler">
<thead class="thead-dark">
<tr>
<th>Url</th>
<th>Status</th>
<th>Next Run</th>
<th>User</th>
</tr>
</thead>
<tbody id="tbody_last_crawled">
{% for meta in schedulers %}
<tr>
<td><a href="{{ url_for('crawler_splash.schedule_show') }}?uuid={{ meta['uuid'] }}">{{ meta['url'] }}</a></td>
<td>{{ meta['status'] }}</td>
<td>
{% if not meta['next_run'] %}
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
{% else %}
{{ meta['next_run'] }}
{% endif %}
</td>
<td>{{ meta['user'] }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</body>
<script>
var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_scheduler_crawler").addClass("active");
$('#table_scheduler').DataTable();
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -208,8 +208,9 @@ function refresh_crawler_status(){
$('#stat_web_total').text(data.stats['web']['crawled']); $('#stat_web_total').text(data.stats['web']['crawled']);
$('#stat_web_queue').text(data.stats['web']['queue']); $('#stat_web_queue').text(data.stats['web']['queue']);
$("#tbody_crawler_onion_info").empty();
if(data.crawlers_status.length!=0){ if(data.crawlers_status.length!=0){
$("#tbody_crawler_onion_info").empty();
var tableRef = document.getElementById('tbody_crawler_onion_info'); var tableRef = document.getElementById('tbody_crawler_onion_info');
for (var i = 0; i < data.crawlers_status.length; i++) { for (var i = 0; i < data.crawlers_status.length; i++) {
var crawler = data.crawlers_status[i]; var crawler = data.crawlers_status[i];

View file

@ -35,8 +35,8 @@
</a> </a>
</li> </li>
<li class="nav-item"> <li class="nav-item">
<a class="nav-link" href="{{url_for('hiddenServices.auto_crawler')}}" id="nav_auto_crawler"> <a class="nav-link" href="{{url_for('crawler_splash.scheduler_dashboard')}}" id="nav_scheduler_crawler">
<i class="fas fa-sync"></i> Automatic Crawler <i class="fas fa-sync"></i> Scheduler
</a> </a>
</li> </li>
<li class="nav-item"> <li class="nav-item">