chg: [crawler] add crawler scheduler

This commit is contained in:
Terrtia 2023-03-14 17:36:42 +01:00
parent ae6f8af09f
commit 925d67a35e
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
8 changed files with 864 additions and 286 deletions

View file

@ -5,6 +5,8 @@ import os
import sys
import time
from requests.exceptions import ConnectionError
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
@ -15,6 +17,7 @@ from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain
from lib.objects import Screenshots
class Crawler(AbstractModule):
def __init__(self):
@ -37,8 +40,11 @@ class Crawler(AbstractModule):
# update captures cache
crawlers.reload_crawler_captures()
self.crawler_scheduler = crawlers.CrawlerScheduler()
# LACUS
self.lacus = crawlers.get_lacus()
self.is_lacus_up = crawlers.get_lacus().is_up
# Capture
self.har = None
@ -51,44 +57,70 @@ class Crawler(AbstractModule):
# Send module state to logs
self.redis_logger.info('Crawler initialized')
def print_crawler_start_info(self, url, domain, domain_url):
def refresh_lacus_status(self):
try:
self.is_lacus_up = crawlers.get_lacus().is_up
except:
self.is_lacus_up = False
if not self.is_lacus_up:
print("Can't reach lacus server", int(time.time()))
time.sleep(30)
def print_crawler_start_info(self, url, domain_url):
print()
print()
print('\033[92m------------------START CRAWLER------------------\033[0m')
print(f'crawler type: {domain}')
print(f'crawler type: {self.domain}')
print('\033[92m-------------------------------------------------\033[0m')
print(f'url: {url}')
print(f'domain: {domain}')
print(f'domain: {self.domain}')
print(f'domain_url: {domain_url}')
print()
def get_message(self):
# Crawler Scheduler
self.crawler_scheduler.update_queue()
self.crawler_scheduler.process_queue()
self.refresh_lacus_status() # TODO LOG ERROR
if not self.is_lacus_up:
return None
# Check if a new Capture can be Launched
if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
task_row = crawlers.add_task_to_lacus_queue()
if task_row:
print(task_row)
task_uuid, priority = task_row
self.enqueue_capture(task_uuid, priority)
try:
self.enqueue_capture(task_uuid, priority)
except ConnectionError:
print(task_row)
task = crawlers.CrawlerTask(task_uuid)
task.add_to_db_crawler_queue(priority)
self.refresh_lacus_status()
return None
# Get CrawlerCapture Object
capture = crawlers.get_crawler_capture()
if capture:
print(capture.uuid)
status = self.lacus.get_capture_status(capture.uuid)
if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time
capture.update(status)
print(capture.uuid, status, int(time.time()))
else:
self.compute(capture)
capture.delete() # TODO DELETE TASK ONLY IF NOT SCHEDULED TASKS
print('capture', capture.uuid, 'completed')
try:
status = self.lacus.get_capture_status(capture.uuid)
if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time
capture.update(status)
print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
else:
return capture
except ConnectionError:
print(capture.uuid)
capture.update(self, -1)
self.refresh_lacus_status()
time.sleep(self.pending_seconds)
def enqueue_capture(self, task_uuid, priority):
task = crawlers.CrawlerTask(task_uuid)
print(task)
# print(task)
# task = {
# 'uuid': task_uuid,
# 'url': 'https://foo.be',
@ -102,11 +134,15 @@ class Crawler(AbstractModule):
# 'proxy': 'force_tor',
# 'parent': 'manual',
# }
url = task.get_url()
force = priority != 0
# TODO timeout
# TODO HEADER
# capture_uuid = self.lacus.enqueue(url='https://cpg.circl.lu:7000',
# force=force,
# general_timeout_in_sec=120)
capture_uuid = self.lacus.enqueue(url=url,
depth=task.get_depth(),
@ -114,14 +150,13 @@ class Crawler(AbstractModule):
proxy=task.get_proxy(),
cookies=task.get_cookies(),
force=force,
general_timeout_in_sec=90)
general_timeout_in_sec=90) # TODO increase timeout if onion ????
crawlers.create_capture(capture_uuid, task_uuid)
print(task.uuid, capture_uuid, 'launched')
return capture_uuid
# CRAWL DOMAIN
# TODO: CATCH ERRORS
def compute(self, capture):
print('saving capture', capture.uuid)
@ -131,7 +166,6 @@ class Crawler(AbstractModule):
self.domain = Domain(domain)
# TODO CHANGE EPOCH
epoch = int(time.time())
parent_id = task.get_parent()
@ -139,6 +173,9 @@ class Crawler(AbstractModule):
print(entries['status'])
self.har = task.get_har()
self.screenshot = task.get_screenshot()
# DEBUG
# self.har = True
# self.screenshot = True
str_date = crawlers.get_current_date(separator=True)
self.har_dir = crawlers.get_date_har_dir(str_date)
self.items_dir = crawlers.get_date_crawled_items_source(str_date)
@ -156,7 +193,10 @@ class Crawler(AbstractModule):
self.domain.add_history(epoch, root_item=epoch)
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
task.clear()
print('capture:', capture.uuid, 'completed')
print('task: ', task.uuid, 'completed')
print()
task.remove()
def save_capture_response(self, parent_id, entries):
print(entries.keys())
@ -168,12 +208,11 @@ class Crawler(AbstractModule):
print('retrieved content')
# print(entries.get('html'))
# TODO LOGS IF != domain
if 'last_redirected_url' in entries and entries['last_redirected_url']:
last_url = entries['last_redirected_url']
unpacked_last_url = crawlers.unpack_url(last_url)
current_domain = unpacked_last_url['domain']
# REDIRECTION TODO CHECK IF WEB
# REDIRECTION TODO CHECK IF TYPE CHANGE
if current_domain != self.domain.id and not self.root_item:
self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
print(f'External redirection {self.domain.id} -> {current_domain}')
@ -225,92 +264,4 @@ class Crawler(AbstractModule):
if __name__ == '__main__':
module = Crawler()
module.debug = True
# module.compute(('ooooo', 0))
module.run()
##################################
##################################
##################################
##################################
##################################
# def update_auto_crawler():
# current_epoch = int(time.time())
# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
# for elem_to_crawl in list_to_crawl:
# mess, type = elem_to_crawl.rsplit(';', 1)
# redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)
# redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)
# Extract info form url (url, domain, domain url, ...)
# def unpack_url(url):
# to_crawl = {}
# faup.decode(url)
# url_unpack = faup.get()
# to_crawl['domain'] = to_crawl['domain'].lower()
# new_url_host = url_host.lower()
# url_lower_case = url.replace(url_host, new_url_host, 1)
#
# if url_unpack['scheme'] is None:
# to_crawl['scheme'] = 'http'
# url= 'http://{}'.format(url_lower_case)
# else:
# try:
# scheme = url_unpack['scheme'].decode()
# except Exception as e:
# scheme = url_unpack['scheme']
# if scheme in default_proto_map:
# to_crawl['scheme'] = scheme
# url = url_lower_case
# else:
# redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
# to_crawl['scheme'] = 'http'
# url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
#
# if url_unpack['port'] is None:
# to_crawl['port'] = default_proto_map[to_crawl['scheme']]
# else:
# try:
# port = url_unpack['port'].decode()
# except:
# port = url_unpack['port']
# # Verify port number #################### make function to verify/correct port number
# try:
# int(port)
# # Invalid port Number
# except Exception as e:
# port = default_proto_map[to_crawl['scheme']]
# to_crawl['port'] = port
#
# #if url_unpack['query_string'] is None:
# # if to_crawl['port'] == 80:
# # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
# # else:
# # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
# #else:
# # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())
#
# to_crawl['url'] = url
# if to_crawl['port'] == 80:
# to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
# else:
# to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
#
# try:
# to_crawl['tld'] = url_unpack['tld'].decode()
# except:
# to_crawl['tld'] = url_unpack['tld']
#
# return to_crawl
# ##################################################### add ftp ???
# update_auto_crawler()
# # add next auto Crawling in queue:
# if to_crawl['paste'] == 'auto':
# redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
# # update list, last auto crawled domains
# redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
# redis_crawler.ltrim('last_auto_crawled', 0, 9)
#

View file

@ -19,8 +19,9 @@ import uuid
from enum import IntEnum, unique
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from urllib.parse import urlparse, urljoin
#from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
from pylacus import PyLacus
@ -44,8 +45,6 @@ r_db = config_loader.get_db_conn("Kvrocks_DB")
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes")
HAR_DIR = config_loader.get_files_directory('har')
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
@ -181,8 +180,8 @@ def extract_favicon_from_html(html, url):
################################################################################
# # TODO: handle prefix cookies
# # TODO: fill empty fields
# # TODO:
# # TODO: REVIEW ME THEN REMOVE ME
def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
# check cookie domain filed
if not 'domain' in cookie_dict:
@ -201,13 +200,6 @@ def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
return cookie_dict
def load_crawler_cookies(cookiejar_uuid, domain, crawler_type='web'):
cookies = get_cookiejar_cookies_list(cookiejar_uuid)
all_cookies = []
for cookie_dict in cookies:
all_cookies.append(create_cookie_crawler(cookie_dict, domain, crawler_type=crawler_type))
return all_cookies
################################################################################
################################################################################
################################################################################
@ -695,7 +687,285 @@ def load_blacklist():
except Exception as e:
print(e)
#### CRAWLER STATE ####
#### CRAWLER Scheduler ####
@unique
class ScheduleStatus(IntEnum):
"""The status of the capture"""
UNKNOWN = -1
QUEUED = 0
SCHEDULED = 1
ONGOING = 2
def get_schedulers_uuid():
return r_crawler.smembers('scheduler:schedules')
def get_schedulers_metas():
schedulers = []
for schedule_uuid in get_schedulers_uuid():
schedule = CrawlerSchedule(schedule_uuid)
schedulers.append(schedule.get_meta_status())
return schedulers
class CrawlerScheduler:
def __init__(self):
self.min_frequency = 60 # TODO ADD IN CONFIG
def update_queue(self):
for schedule_uuid in get_schedulers_uuid():
schedule = CrawlerSchedule(schedule_uuid)
# check if already in scheduler queue
if schedule.is_scheduled():
continue
if schedule.is_tasked():
continue
# EXPIRE ????
time_next_run = 0.0
frequency = schedule.get_frequency() # optional or later -> cron
if frequency == 'hourly':
time_next_run = (datetime.now() + timedelta(hours=1)).timestamp()
elif frequency == 'daily':
time_next_run = (datetime.now() + timedelta(days=1)).timestamp()
elif frequency == 'weekly':
time_next_run = (datetime.now() + timedelta(weeks=1)).timestamp()
elif frequency == 'monthly':
time_next_run = (datetime.now() + relativedelta(months=1)).timestamp()
else:
months, weeks, days, hours, minutes = frequency.split(':')
if not months:
months = 0
if not weeks:
weeks = 0
if not days:
days = 0
if not hours:
hours = 0
if not minutes:
minutes = 0
current_time = datetime.now().timestamp()
time_next_run = (datetime.now() + relativedelta(months=int(months), weeks=int(weeks),
days=int(days), hours=int(hours),
minutes=int(minutes))).timestamp()
# Make sure the next capture is not scheduled for in a too short interval
interval_next_capture = time_next_run - current_time
if interval_next_capture < self.min_frequency:
# self.logger.warning(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
print(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
time_next_run = (datetime.now() + timedelta(seconds=self.min_frequency)).timestamp()
schedule.set_next_run(time_next_run)
print('scheduled:', schedule_uuid)
def process_queue(self):
now = datetime.now().timestamp()
for raw_schedule in r_crawler.zrangebyscore('scheduler:queue', '-inf', int(now), withscores=True):
schedule_uuid, next_run = raw_schedule
schedule = CrawlerSchedule(schedule_uuid)
if not schedule.exists():
return None
meta = schedule.get_meta()
task_uuid = create_task(meta['url'], depth=meta['depth'], har=meta['har'], screenshot=meta['screenshot'],
header=meta['header'],
cookiejar=meta['cookiejar'], proxy=meta['proxy'],
user_agent=meta['user_agent'], parent='scheduler', priority=40)
if task_uuid:
schedule.set_task(task_uuid)
r_crawler.zrem('scheduler:queue', schedule_uuid)
# TODO Expire -> stuck in crawler queue or reached delta
class CrawlerSchedule:
def __init__(self, schedule_uuid):
self.uuid = schedule_uuid
def exists(self):
return r_crawler.exists(f'schedule:{self.uuid}')
def get_frequency(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'frequency')
def get_user(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'user')
def get_date(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'date')
def get_captures(self): # only scheduled capture ????? exclude manual/discovery
pass
def get_status(self):
if self.is_scheduled():
return ScheduleStatus.SCHEDULED
if self.is_tasked():
if self.is_ongoing():
return ScheduleStatus.ONGOING
else:
return ScheduleStatus.QUEUED
return ScheduleStatus.UNKNOWN
def get_task_uuid(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'task')
def is_tasked(self):
task_uuid = self.get_task_uuid()
if task_uuid:
task = CrawlerTask(task_uuid)
tasked = task.exists()
if not tasked:
r_crawler.hdel(f'schedule:{self.uuid}', 'task')
return tasked
return False
def get_task(self):
task_uuid = self.get_task_uuid()
if task_uuid:
return CrawlerTask(task_uuid)
def set_task(self, task_uuid):
return r_crawler.hset(f'schedule:{self.uuid}', 'task', task_uuid)
def is_ongoing(self):
task = self.get_task()
if task:
return task.is_ongoing()
return False
def get_next_run(self, r_str=False):
next_run = r_crawler.zscore('scheduler:queue', self.uuid)
if next_run and r_str:
next_run = time.strftime('%Y-%m-%d - %H:%M:%S', time.localtime(int(next_run)))
return next_run
def set_next_run(self, time_next_run):
r_crawler.zadd('scheduler:queue', mapping={self.uuid: time_next_run})
def is_scheduled(self):
return bool(r_crawler.zscore('scheduler:queue', self.uuid))
def get_url(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'url')
def get_depth(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'depth')
def get_har(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'har') == 'True'
def get_screenshot(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'screenshot') == 'True'
def get_header(self):
r_crawler.hget(f'schedule:{self.uuid}', 'header')
def get_cookiejar(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'cookiejar')
def get_proxy(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'proxy')
def get_user_agent(self):
return r_crawler.hget(f'schedule:{self.uuid}', 'user_agent')
def _set_field(self, field, value):
return r_crawler.hset(f'schedule:{self.uuid}', field, value)
def get_meta(self, ui=False):
meta = {
'uuid': self.uuid,
'date': self.get_date(),
'frequency': self.get_frequency(),
'user': self.get_user(),
'url': self.get_url(),
'depth': self.get_depth(),
'har': self.get_har(),
'screenshot': self.get_screenshot(),
'user_agent': self.get_user_agent(),
'cookiejar': self.get_cookiejar(),
'header': self.get_header(),
'proxy': self.get_proxy(),
}
status = self.get_status()
if ui:
status = status.name
r_str = True
else:
r_str = False
meta['status'] = status
meta['next_run'] = self.get_next_run(r_str=r_str)
return meta
def get_meta_status(self): # TODO: Description ? Frequency ???
meta = {'uuid': self.uuid,
'url': self.get_url(),
'user': self.get_user(),
'next_run': self.get_next_run(r_str=True)}
status = self.get_status()
if isinstance(status, ScheduleStatus):
status = status.name
meta['status'] = status
return meta
def create(self, frequency, user, url,
depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
if self.exists():
raise Exception('Error: Monitor already exists')
url_decoded = unpack_url(url)
url = url_decoded['url']
self._set_field('date', datetime.now().strftime("%Y-%m-%d"))
self._set_field('frequency', frequency)
self._set_field('user', user)
self._set_field('url', url)
self._set_field('depth', int(depth))
self._set_field('har', str(har))
self._set_field('screenshot', str(screenshot))
if cookiejar:
self._set_field('cookiejar', cookiejar)
if header:
self._set_field('header', header)
if proxy:
if proxy == 'web':
proxy = None
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
proxy = 'force_tor'
self._set_field('proxy', proxy)
if user_agent:
self._set_field('user_agent', user_agent)
r_crawler.sadd('scheduler:schedules', self.uuid)
def delete(self):
# remove from schedule queue
r_crawler.zrem('scheduler:queue', self.uuid)
# delete task
task = self.get_task()
if task:
task.delete()
# delete meta
r_crawler.delete(f'schedule:{self.uuid}')
r_crawler.srem('scheduler:schedules', self.uuid)
def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
schedule_uuid = gen_uuid()
schedule = CrawlerSchedule(schedule_uuid)
schedule.create(frequency, user, url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar, proxy=proxy, user_agent=user_agent)
return schedule_uuid
# TODO sanityze UUID
def api_delete_schedule(data):
schedule_uuid = data.get('uuid')
schedule = CrawlerSchedule(schedule_uuid)
if not schedule.exists():
return {'error': 'unknown schedule uuid', 'uuid': schedule}, 404
return schedule.delete(), 200
#### CRAWLER CAPTURE ####
@ -709,7 +979,15 @@ def reload_crawler_captures():
r_cache.delete('crawler:captures')
for capture_uuid in get_crawler_captures():
capture = CrawlerCapture(capture_uuid)
r_cache.zadd('crawler:captures', {capture.uuid: 0})
capture.update(None)
def _clear_captures():
for capture_uuid in get_crawler_captures():
capture = CrawlerCapture(capture_uuid)
task = capture.get_task()
task.delete()
capture.delete()
print(capture_uuid, 'deleted')
@unique
class CaptureStatus(IntEnum):
@ -741,6 +1019,9 @@ class CrawlerCapture:
def get_status(self):
return r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
def is_ongoing(self):
return self.get_status() == CaptureStatus.ONGOING
def create(self, task_uuid):
if self.exists():
raise Exception(f'Error: Capture {self.uuid} already exists')
@ -752,20 +1033,26 @@ class CrawlerCapture:
r_cache.zadd('crawler:captures', {self.uuid: launch_time})
def update(self, status):
last_check = int(time.time())
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
r_cache.zadd('crawler:captures', {self.uuid: last_check})
# Error or Reload
if not status:
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', CaptureStatus.UNKNOWN)
r_cache.zadd('crawler:captures', {self.uuid: 0})
else:
last_check = int(time.time())
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
r_cache.zadd('crawler:captures', {self.uuid: last_check})
def remove(self): # TODO INCOMPLETE
# Crawler
def remove(self):
r_crawler.zrem('crawler:captures', self.uuid)
r_cache.delete(f'crawler:capture:{self.uuid}')
r_crawler.hdel('crawler:captures:tasks', self.uuid)
# TODO
# TODO DELETE TASK ???
# Manual
def delete(self):
# task = self.get_task()
# task.delete()
r_cache.delete(f'crawler:capture:{self.uuid}')
# remove Capture from crawler queue
r_cache.zrem('crawler:captures', self.uuid)
self.remove()
def create_capture(capture_uuid, task_uuid):
@ -792,9 +1079,13 @@ def get_captures_status():
'uuid': task.uuid,
'domain': dom.get_id(),
'type': dom.get_domain_type(),
'start_time': capture.get_start_time(), ############### TODO
'start_time': capture.get_start_time(),
'status': capture.get_status(),
}
capture_status = capture.get_status()
if capture_status:
capture_status = CaptureStatus(int(capture_status)).name
meta['status'] = capture_status
status.append(meta)
return status
@ -872,6 +1163,12 @@ class CrawlerTask:
def get_capture(self):
return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture')
def is_ongoing(self):
capture_uuid = self.get_capture()
if capture_uuid:
return CrawlerCapture(capture_uuid).is_ongoing()
return False
def _set_field(self, field, value):
return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)
@ -923,8 +1220,6 @@ class CrawlerTask:
proxy = None
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
proxy = 'force_tor'
if not user_agent:
user_agent = get_default_user_agent()
# TODO SANITIZE COOKIEJAR -> UUID
@ -934,13 +1229,11 @@ class CrawlerTask:
self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query)
return self.uuid
# TODO ADD TASK STATUS -----
self._set_field('domain', domain)
self._set_field('url', url)
self._set_field('depth', int(depth))
self._set_field('har', har)
self._set_field('screenshot', screenshot)
self._set_field('user_agent', user_agent)
self._set_field('parent', parent)
if cookiejar:
@ -949,30 +1242,45 @@ class CrawlerTask:
self._set_field('header', header)
if proxy:
self._set_field('proxy', proxy)
if user_agent:
self._set_field('user_agent', user_agent)
r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
self._set_field('hash', hash_query)
r_crawler.zadd('crawler:queue', {self.uuid: priority})
self.add_to_db_crawler_queue(priority)
# UI
domain_type = dom.get_domain_type()
r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
self._set_field('queue', domain_type)
return self.uuid
def lacus_queue(self):
r_crawler.sadd('crawler:queue:queued', self.uuid)
def add_to_db_crawler_queue(self, priority):
r_crawler.zadd('crawler:queue', {self.uuid: priority})
def start(self):
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
def clear(self):
r_crawler.hdel('crawler:queue:hash', self.get_hash())
r_crawler.srem(f'crawler:queue:type:{self.get_queue()}', self.uuid)
r_crawler.srem('crawler:queue:queued', self.uuid)
def delete(self):
self.clear()
# Crawler
def remove(self): # zrem cache + DB
capture_uuid = self.get_capture()
if capture_uuid:
capture = CrawlerCapture(capture_uuid)
capture.remove()
queue_type = self.get_queue()
if queue_type:
r_crawler.srem(f'crawler:queue:type:{queue_type}', self.uuid)
task_hash = self.get_hash()
if task_hash:
r_crawler.hdel('crawler:queue:hash', task_hash)
# meta
r_crawler.delete(f'crawler:task:{self.uuid}')
# r_crawler.zadd('crawler:queue', {self.uuid: priority})
# Manual
def delete(self):
# queue
r_crawler.zrem('crawler:queue', self.uuid)
self.remove()
# TODO move to class ???
@ -990,7 +1298,7 @@ def add_task_to_lacus_queue():
return None
task_uuid, priority = task_uuid[0]
task = CrawlerTask(task_uuid)
task.lacus_queue()
task.start()
return task.uuid, priority
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
@ -1006,29 +1314,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
return task_uuid
######################################################################
######################################################################
# def get_task_status(task_uuid):
# domain = r_crawler.hget(f'crawler:task:{task_uuid}', 'domain')
# dom = Domain(domain)
# meta = {
# 'uuid': task_uuid,
# 'domain': dom.get_id(),
# 'domain_type': dom.get_domain_type(),
# 'start_time': r_crawler.hget(f'crawler:task:{task_uuid}', 'start_time'),
# 'status': 'test',
# }
# return meta
# def get_crawlers_tasks_status():
# tasks_status = []
# tasks = r_crawler.smembers('crawler:queue:queued')
# for task_uuid in tasks:
# tasks_status.append(get_task_status(task_uuid))
# return tasks_status
##-- CRAWLER TASK --##
## -- CRAWLER TASK -- ##
#### CRAWLER TASK API ####
@ -1071,13 +1358,25 @@ def api_add_crawler_task(data, user_id=None):
return {'error': 'The access to this cookiejar is restricted'}, 403
cookiejar_uuid = cookiejar.uuid
# if auto_crawler:
# try:
# crawler_delta = int(crawler_delta)
# if crawler_delta < 0:
# return ({'error':'invalid delta between two pass of the crawler'}, 400)
# except ValueError:
# return ({'error':'invalid delta between two pass of the crawler'}, 400)
frequency = data.get('frequency', None)
if frequency:
if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
if not isinstance(frequency, dict):
return {'error': 'Invalid frequency'}, 400
else:
try:
months = int(frequency.get('months', 0))
weeks = int(frequency.get('weeks', 0))
days = int(frequency.get('days', 0))
hours = int(frequency.get('hours', 0))
minutes = int(frequency.get('minutes', 0))
except (TypeError, ValueError):
return {'error': 'Invalid frequency'}, 400
if min(months, weeks, days, hours, minutes) < 0:
return {'error': 'Invalid frequency'}, 400
if max(months, weeks, days, hours, minutes) <= 0:
return {'error': 'Invalid frequency'}, 400
frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
# PROXY
proxy = data.get('proxy', None)
@ -1088,15 +1387,16 @@ def api_add_crawler_task(data, user_id=None):
if verify[1] != 200:
return verify
# TODO #############################################################################################################
# auto_crawler = auto_crawler
# crawler_delta = crawler_delta
parent = 'manual'
# TODO HEADERS
# TODO USER AGENT
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, cookiejar=cookiejar_uuid,
proxy=proxy, user_agent=None, parent='manual', priority=90), 200
if frequency:
# TODO verify user
return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None), 200
else:
# TODO HEADERS
# TODO USER AGENT
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None,
parent='manual', priority=90), 200
#### ####
@ -1108,13 +1408,6 @@ def api_add_crawler_task(data, user_id=None):
###################################################################################
#### CRAWLER GLOBAL ####
# TODO: # FIXME: config db, dynamic load
@ -1124,55 +1417,8 @@ def is_crawler_activated():
def get_crawler_all_types():
return ['onion', 'web']
def sanitize_crawler_types(l_crawler_types):
all_crawler_types = get_crawler_all_types()
if not l_crawler_types:
return all_crawler_types
for crawler_type in l_crawler_types:
if crawler_type not in all_crawler_types:
return all_crawler_types
return l_crawler_types
##-- CRAWLER GLOBAL --##
#### AUTOMATIC CRAWLER ####
def get_auto_crawler_all_domain(l_crawler_types=[]):
l_crawler_types = sanitize_crawler_types(l_crawler_types)
if len(l_crawler_types) == 1:
return r_serv_onion.smembers(f'auto_crawler_url:{l_crawler_types[0]}')
else:
l_keys_name = []
for crawler_type in l_crawler_types:
l_keys_name.append(f'auto_crawler_url:{crawler_type}')
return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:])
def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
r_serv_onion.zadd('crawler_auto_queue', {f'{message};{domain_type}': int(time.time() + delta)})
# update list, last auto crawled domains
r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}')
r_serv_onion.ltrim('last_auto_crawled', 0, 9)
def update_auto_crawler_queue():
current_epoch = int(time.time())
# check if current_epoch > domain_next_epoch
l_queue = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, current_epoch)
for elem in l_queue:
mess, domain_type = elem.rsplit(';', 1)
print(domain_type)
print(mess)
r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
##-- AUTOMATIC CRAWLER --##
#### CRAWLER TASK ####
##-- CRAWLER TASK --##
#### ####
@ -1207,6 +1453,8 @@ def save_har(har_dir, item_id, har_content):
# #
# # # # # # # # # # # #
#### PROXY ####
def api_verify_proxy(proxy_url):
parsed_proxy = urlparse(proxy_url)
if parsed_proxy.scheme and parsed_proxy.hostname and parsed_proxy.port:
@ -1237,13 +1485,7 @@ class CrawlerProxy:
def get_url(self):
return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'url')
###############################################################################################
###############################################################################################
###############################################################################################
###############################################################################################
# # # # CRAWLER LACUS # # # #
#### CRAWLER LACUS ####
def get_lacus_url():
return r_db.hget('crawler:lacus', 'url')
@ -1363,12 +1605,7 @@ def api_set_crawler_max_captures(data):
save_nb_max_captures(nb_captures)
return nb_captures, 200
## PROXY ##
# TODO SAVE PROXY URL + ADD PROXY TESTS
# -> name + url
## PROXY ##
## TEST ##
def is_test_ail_crawlers_successful():
return r_db.hget('crawler:tor:test', 'success') == 'True'
@ -1380,7 +1617,6 @@ def save_test_ail_crawlers_result(test_success, message):
r_db.hset('crawler:tor:test', 'success', str(test_success))
r_db.hset('crawler:tor:test', 'message', message)
# TODO CREATE TEST TASK
def test_ail_crawlers():
# # TODO: test web domain
if not ping_lacus():
@ -1431,10 +1667,11 @@ def test_ail_crawlers():
#### ---- ####
# TODO MOVE ME
# TODO MOVE ME IN CRAWLER OR FLASK
load_blacklist()
# if __name__ == '__main__':
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
# print(task.get_meta())
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
# print(task.get_meta())
# _clear_captures()

View file

@ -61,9 +61,9 @@ def create_json_response(data, status_code):
def crawlers_dashboard():
is_manager_connected = crawlers.get_lacus_connection_metadata()
crawlers_status = crawlers.get_captures_status()
print(crawlers_status)
# print(crawlers_status)
crawlers_latest_stats = crawlers.get_crawlers_stats()
print(crawlers_latest_stats)
# print(crawlers_latest_stats)
date = crawlers.get_current_date()
return render_template("dashboard_crawler.html", date=date,
is_manager_connected=is_manager_connected,
@ -77,6 +77,7 @@ def crawlers_dashboard():
def crawler_dashboard_json():
crawlers_status = crawlers.get_captures_status()
crawlers_latest_stats = crawlers.get_crawlers_stats()
# print(crawlers_status)
return jsonify({'crawlers_status': crawlers_status,
'stats': crawlers_latest_stats})
@ -106,13 +107,24 @@ def send_to_spider():
# POST val
url = request.form.get('url_to_crawl')
crawler_type = request.form.get('crawler_queue_type')
auto_crawler = request.form.get('crawler_type') # TODO Auto Crawler
crawler_delta = request.form.get('crawler_epoch') # TODO Auto Crawler
screenshot = request.form.get('screenshot')
har = request.form.get('har')
depth_limit = request.form.get('depth_limit')
cookiejar_uuid = request.form.get('cookiejar')
# Frequency
if request.form.get('crawler_scheduler'):
frequency = request.form.get('frequency')
if frequency == 'custom':
months = request.form.get('frequency_months', 0)
weeks = request.form.get('frequency_weeks', 0)
days = request.form.get('frequency_days', 0)
hours = request.form.get('frequency_hours', 0)
minutes = request.form.get('frequency_minutes', 0)
frequency = {'months': months, 'weeks': weeks, 'days': days, 'hours': hours, 'minutes': minutes}
else:
frequency = None
# PROXY
proxy = request.form.get('proxy_name')
if proxy:
@ -129,7 +141,7 @@ def send_to_spider():
cookiejar_uuid = cookiejar_uuid.rsplit(':')
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot}
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
if proxy:
data['proxy'] = proxy
if cookiejar_uuid:
@ -142,6 +154,43 @@ def send_to_spider():
return redirect(url_for('crawler_splash.manual'))
@crawler_splash.route("/crawlers/scheduler", methods=['GET'])
@login_required
@login_read_only
def scheduler_dashboard():
schedulers = crawlers.get_schedulers_metas()
# print(schedulers)
# TODO list currently queued ?
return render_template("crawler_scheduler_dashboard.html",
schedulers=schedulers,
is_manager_connected=crawlers.get_lacus_connection_metadata())
@crawler_splash.route("/crawlers/schedule", methods=['GET'])
@login_required
@login_read_only
def schedule_show():
schedule_uuid = request.args.get('uuid')
schedule = crawlers.CrawlerSchedule(schedule_uuid)
if not schedule.exists():
abort(404)
meta = schedule.get_meta(ui=True)
return render_template("crawler_schedule_uuid.html",
meta=meta)
@crawler_splash.route("/crawlers/schedule/delete", methods=['GET'])
@login_required
@login_analyst
def schedule_delete():
schedule_uuid = request.args.get('uuid')
schedule = crawlers.CrawlerSchedule(schedule_uuid)
if not schedule.exists():
abort(404)
res = crawlers.api_delete_schedule({'uuid': schedule_uuid})
if res[1] != 200:
return create_json_response(res[0], res[1])
return redirect(url_for('crawler_splash.scheduler_dashboard'))
@crawler_splash.route("/crawlers/last/domains", methods=['GET'])
@login_required
@login_read_only
@ -228,11 +277,11 @@ def showDomain():
dict_domain['epoch'] = curr_epoch
dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch))
print(dict_domain['epoch'])
# print(dict_domain['epoch'])
dict_domain['crawler_history_items'] = []
for item_id in domain.get_crawled_items_by_epoch(epoch):
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options=['crawler']))
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options={'crawler'}))
if dict_domain['crawler_history_items']:
dict_domain['random_item'] = random.choice(dict_domain['crawler_history_items'])
@ -521,7 +570,7 @@ def crawler_cookiejar_show():
@crawler_splash.route('/crawler/cookie/delete', methods=['GET'])
@login_required
@login_read_only
@login_analyst
def crawler_cookiejar_cookie_delete():
user_id = current_user.get_id()
cookie_uuid = request.args.get('uuid')
@ -536,7 +585,7 @@ def crawler_cookiejar_cookie_delete():
@crawler_splash.route('/crawler/cookiejar/delete', methods=['GET'])
@login_required
@login_read_only
@login_analyst
def crawler_cookiejar_delete():
user_id = current_user.get_id()
cookiejar_uuid = request.args.get('uuid')
@ -699,7 +748,7 @@ def crawler_lacus_settings_crawler_manager():
api_key = request.form.get('api_key')
res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key})
print(res)
# print(res)
if res[1] != 200:
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
else:

View file

@ -66,21 +66,59 @@
<div class="d-flex mt-3">
<i class="fas fa-user-ninja mt-1"></i> &nbsp;Manual&nbsp;&nbsp;
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">
<label class="custom-control-label" for="crawler_type">
<i class="fas fa-clock"></i> &nbsp;Automatic
<input class="custom-control-input" type="checkbox" name="crawler_scheduler" value="True" id="crawler_scheduler">
<label class="custom-control-label" for="crawler_scheduler">
<i class="fas fa-clock"></i> &nbsp;Scheduler
</label>
</div>
</div>
<div class="input-group mt-2 mb-2" id="crawler_epoch_input">
<div class="input-group-prepend">
<span class="input-group-text bg-light"><i class="fas fa-clock"></i>&nbsp;</span>
</div>
<input class="form-control" type="number" id="crawler_epoch" value="3600" min="1" name="crawler_epoch" required>
<div class="input-group-append">
<span class="input-group-text">Time (seconds) between each crawling</span>
</div>
</div>
<div id="frequency_inputs">
<div class="mb-4">
<select class="custom-select" id="frequency" name="frequency" onchange="frequency_selector_update(this);">
<option value="hourly">Hourly</option>
<option value="daily">Daily</option>
<option value="weekly">Weekly</option>
<option value="monthly">Monthly</option>
<option value="custom">Custom</option>
</select>
</div>
<div id="custom_frequency">
<h5><i class="fas fa-clock"></i>&nbsp;Adjust crawling interval as needed</h5>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Months</b></span>
</div>
<input class="form-control" type="number" id="frequency_months" value="0" min="0" name="frequency_months" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Weeks</b></span>
</div>
<input class="form-control" type="number" id="frequency_weeks" value="0" min="0" name="frequency_weeks" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Days</b></span>
</div>
<input class="form-control" type="number" id="frequency_days" value="0" min="0" name="frequency_days" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Hours</b></span>
</div>
<input class="form-control" type="number" id="frequency_hours" value="0" min="0" name="frequency_hours" required>
</div>
<div class="input-group">
<div class="input-group-prepend">
<span class="input-group-text bg-light" style="width: 90px"><b>Minutes</b></span>
</div>
<input class="form-control" type="number" id="frequency_minutes" value="0" min="0" name="frequency_minutes" required>
</div>
</div>
</div>
</div>
@ -165,8 +203,9 @@ $(document).ready(function(){
$("#nav_manual_crawler").addClass("active");
queue_type_selector_input_controler()
manual_crawler_input_controler();
$("#custom_frequency").hide();
$('#crawler_type').on("change", function () {
$('#crawler_scheduler').on("change", function () {
manual_crawler_input_controler();
});
@ -190,10 +229,18 @@ function toggle_sidebar(){
}
function manual_crawler_input_controler() {
if($('#crawler_type').is(':checked')){
$("#crawler_epoch_input").show();
if($('#crawler_scheduler').is(':checked')){
$("#frequency_inputs").show();
}else{
$("#crawler_epoch_input").hide();
$("#frequency_inputs").hide();
}
}
function frequency_selector_update(obj) {
if(obj.value === "custom") {
$("#custom_frequency").show();
}else{
$("#custom_frequency").hide();
}
}

View file

@ -0,0 +1,199 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
<div class="card my-1">
<div class="card-header bg-dark text-white">
<h4 class="card-title"><b>{{ meta['url'] }}</b></h4>
</div>
<div class="card-body">
<div class="row">
<div class="col-lg-8">
<table class="table table-hover">
<tbody>
<tr>
<th>UUID</th>
<td>{{ meta['uuid'] }}</td>
</tr>
<tr>
<th>Url</th>
<td>{{ meta['url'] }}</td>
</tr>
<tr>
<th>Frequency</th>
<td>{{ meta['frequency'] }}</td>
</tr>
<tr>
<th>Creator</th>
<td>{{ meta['user'] }}</td>
</tr>
<tr>
<th>Date</th>
<td>{{ meta['date'] }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ meta['status'] }}</td>
</tr>
<tr>
<th>Next Run</th>
<td>
{% if not meta['next_run'] %}
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
{% else %}
{{ meta['next_run'] }}
{% endif %}
</td>
</tr>
</tbody>
</table>
<h4>Config:</h4>
<table class="table table-hover">
<tbody>
<tr>
<th><i class="fas fa-water"></i> Depth</th>
<td>{{ meta['depth'] }}</td>
</tr>
<tr>
<th><i class="fas fa-image"></i> Screenshot</th>
<td>
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" id="html_content_id" {% if meta['screenshot'] %}checked{% endif %} disabled>
<label class="custom-control-label" for="html_content_id">
<i class="fas fa-image"></i> &nbsp;
</label>
</div>
</td>
</tr>
<tr>
<th><i class="fas fa-file"></i> &nbsp;Har</th>
<td>
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" id="html_content_id" {% if meta['har'] %}checked{% endif %} disabled>
<label class="custom-control-label" for="html_content_id">
<i class="fas fa-file"></i> &nbsp;
</label>
</div>
</td>
</tr>
<tr>
<th><i class="fas fa-cookie-bite"></i> Cookiejar</th>
<td>
{% if not meta['cookiejar'] %}
-
{% else %}
<a href="{{ url_for('crawler_splash.crawler_cookiejar_show') }}?uuid={{meta['cookiejar']}}">
{{ meta['cookiejar'] }}
</a>
{% endif %}
</td>
</tr>
<tr>
<th>Proxy</th>
<td>
{% if not meta['proxy'] %}
-
{% else %}
{{ meta['proxy'] }}
{% endif %}
</td>
</tr>
<tr>
<th>User Agent</th>
<td>
{% if meta['user_agent'] %}
{{ meta['user_agent'] }}
{% else %}
Default
{% endif %}
</td>
</tr>
{% if meta['header'] %}
<tr>
<th>header</th>
<td>{{ meta['header'] }}</td>
</tr>
{% endif %}
</tbody>
</table>
</div>
<div class="col-lg-4">
<div>
<a href="{{ url_for('crawler_splash.schedule_delete') }}?uuid={{meta['uuid']}}">
<button type="button" class="btn btn-danger">
<i class="fas fa-trash-alt"></i> <b>Delete</b>
</button>
</a>
{# <a href="{{ url_for('investigations_b.edit_investigation') }}?uuid={{meta['uuid']}}">#}
{# <button type="button" class="btn btn-info">#}
{# <i class="fas fa-pencil-alt"></i> <b>Edit</b>#}
{# </button>#}
{# </a>#}
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</body>
<script>
var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_monitoring_crawler").addClass("active");
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -0,0 +1,94 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/dataTables.bootstrap.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/dataTables.bootstrap.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
{% include 'crawler/crawler_disabled.html' %}
<h1>Schedulers:</h1>
<table class="table mt-1 table-hover table-borderless table-striped" id="table_scheduler">
<thead class="thead-dark">
<tr>
<th>Url</th>
<th>Status</th>
<th>Next Run</th>
<th>User</th>
</tr>
</thead>
<tbody id="tbody_last_crawled">
{% for meta in schedulers %}
<tr>
<td><a href="{{ url_for('crawler_splash.schedule_show') }}?uuid={{ meta['uuid'] }}">{{ meta['url'] }}</a></td>
<td>{{ meta['status'] }}</td>
<td>
{% if not meta['next_run'] %}
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
{% else %}
{{ meta['next_run'] }}
{% endif %}
</td>
<td>{{ meta['user'] }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</body>
<script>
var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_scheduler_crawler").addClass("active");
$('#table_scheduler').DataTable();
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -208,8 +208,9 @@ function refresh_crawler_status(){
$('#stat_web_total').text(data.stats['web']['crawled']);
$('#stat_web_queue').text(data.stats['web']['queue']);
$("#tbody_crawler_onion_info").empty();
if(data.crawlers_status.length!=0){
$("#tbody_crawler_onion_info").empty();
var tableRef = document.getElementById('tbody_crawler_onion_info');
for (var i = 0; i < data.crawlers_status.length; i++) {
var crawler = data.crawlers_status[i];

View file

@ -35,8 +35,8 @@
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="{{url_for('hiddenServices.auto_crawler')}}" id="nav_auto_crawler">
<i class="fas fa-sync"></i> Automatic Crawler
<a class="nav-link" href="{{url_for('crawler_splash.scheduler_dashboard')}}" id="nav_scheduler_crawler">
<i class="fas fa-sync"></i> Scheduler
</a>
</li>
<li class="nav-item">