mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
chg: [crawler] add crawler scheduler
This commit is contained in:
parent
ae6f8af09f
commit
925d67a35e
8 changed files with 864 additions and 286 deletions
|
@ -5,6 +5,8 @@ import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from requests.exceptions import ConnectionError
|
||||||
|
|
||||||
sys.path.append(os.environ['AIL_BIN'])
|
sys.path.append(os.environ['AIL_BIN'])
|
||||||
##################################
|
##################################
|
||||||
# Import Project packages
|
# Import Project packages
|
||||||
|
@ -15,6 +17,7 @@ from lib.ConfigLoader import ConfigLoader
|
||||||
from lib.objects.Domains import Domain
|
from lib.objects.Domains import Domain
|
||||||
from lib.objects import Screenshots
|
from lib.objects import Screenshots
|
||||||
|
|
||||||
|
|
||||||
class Crawler(AbstractModule):
|
class Crawler(AbstractModule):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -37,8 +40,11 @@ class Crawler(AbstractModule):
|
||||||
# update captures cache
|
# update captures cache
|
||||||
crawlers.reload_crawler_captures()
|
crawlers.reload_crawler_captures()
|
||||||
|
|
||||||
|
self.crawler_scheduler = crawlers.CrawlerScheduler()
|
||||||
|
|
||||||
# LACUS
|
# LACUS
|
||||||
self.lacus = crawlers.get_lacus()
|
self.lacus = crawlers.get_lacus()
|
||||||
|
self.is_lacus_up = crawlers.get_lacus().is_up
|
||||||
|
|
||||||
# Capture
|
# Capture
|
||||||
self.har = None
|
self.har = None
|
||||||
|
@ -51,44 +57,70 @@ class Crawler(AbstractModule):
|
||||||
# Send module state to logs
|
# Send module state to logs
|
||||||
self.redis_logger.info('Crawler initialized')
|
self.redis_logger.info('Crawler initialized')
|
||||||
|
|
||||||
def print_crawler_start_info(self, url, domain, domain_url):
|
def refresh_lacus_status(self):
    """Query the Lacus server and cache the result in ``self.is_lacus_up``.

    Any error raised while querying Lacus is recorded as "server down".
    When the server is down a message is printed and the call sleeps for
    30 seconds before returning.
    """
    try:
        self.is_lacus_up = crawlers.get_lacus().is_up
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
    # and SystemExit still propagate and the module can be stopped.
    except Exception:
        self.is_lacus_up = False
    if not self.is_lacus_up:
        print("Can't reach lacus server", int(time.time()))
        time.sleep(30)
|
||||||
|
|
||||||
|
def print_crawler_start_info(self, url, domain_url):
|
||||||
print()
|
print()
|
||||||
print()
|
print()
|
||||||
print('\033[92m------------------START CRAWLER------------------\033[0m')
|
print('\033[92m------------------START CRAWLER------------------\033[0m')
|
||||||
print(f'crawler type: {domain}')
|
print(f'crawler type: {self.domain}')
|
||||||
print('\033[92m-------------------------------------------------\033[0m')
|
print('\033[92m-------------------------------------------------\033[0m')
|
||||||
print(f'url: {url}')
|
print(f'url: {url}')
|
||||||
print(f'domain: {domain}')
|
print(f'domain: {self.domain}')
|
||||||
print(f'domain_url: {domain_url}')
|
print(f'domain_url: {domain_url}')
|
||||||
print()
|
print()
|
||||||
|
|
||||||
def get_message(self):
    """Run one scheduling/polling cycle and return a finished capture, if any.

    Steps, in order:
      1. Update and process the crawler scheduler queue.
      2. Refresh the Lacus status; give up for this cycle if Lacus is down.
      3. If fewer captures are running than the configured maximum, pull one
         task from the queue and enqueue it on Lacus.
      4. Fetch one capture and ask Lacus for its status.

    Returns:
        The capture object when Lacus reports it as DONE, otherwise ``None``
        (after sleeping ``self.pending_seconds`` when step 4 was reached
        without a finished capture).
    """
    # Crawler Scheduler
    self.crawler_scheduler.update_queue()
    self.crawler_scheduler.process_queue()

    self.refresh_lacus_status()  # TODO LOG ERROR
    if not self.is_lacus_up:
        return None

    # Check if a new Capture can be Launched
    if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
        task_row = crawlers.add_task_to_lacus_queue()
        if task_row:
            task_uuid, priority = task_row
            try:
                self.enqueue_capture(task_uuid, priority)
            except ConnectionError:
                # Lacus went away: put the task back in the crawler queue
                # with its priority so it is not lost.
                print(task_row)
                task = crawlers.CrawlerTask(task_uuid)
                task.add_to_db_crawler_queue(priority)
                self.refresh_lacus_status()
                return None

    # Get CrawlerCapture Object
    capture = crawlers.get_crawler_capture()
    if capture:
        try:
            status = self.lacus.get_capture_status(capture.uuid)
            if status != crawlers.CaptureStatus.DONE:  # TODO ADD GLOBAL TIMEOUT-> Save start time
                capture.update(status)
                print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
            else:
                return capture

        except ConnectionError:
            print(capture.uuid)
            # update() takes only the status; the previous call passed
            # ``self`` as an extra argument and raised TypeError here.
            capture.update(-1)
            self.refresh_lacus_status()

    time.sleep(self.pending_seconds)
|
||||||
|
|
||||||
def enqueue_capture(self, task_uuid, priority):
|
def enqueue_capture(self, task_uuid, priority):
|
||||||
task = crawlers.CrawlerTask(task_uuid)
|
task = crawlers.CrawlerTask(task_uuid)
|
||||||
print(task)
|
# print(task)
|
||||||
# task = {
|
# task = {
|
||||||
# 'uuid': task_uuid,
|
# 'uuid': task_uuid,
|
||||||
# 'url': 'https://foo.be',
|
# 'url': 'https://foo.be',
|
||||||
|
@ -102,11 +134,15 @@ class Crawler(AbstractModule):
|
||||||
# 'proxy': 'force_tor',
|
# 'proxy': 'force_tor',
|
||||||
# 'parent': 'manual',
|
# 'parent': 'manual',
|
||||||
# }
|
# }
|
||||||
|
|
||||||
url = task.get_url()
|
url = task.get_url()
|
||||||
force = priority != 0
|
force = priority != 0
|
||||||
# TODO timeout
|
# TODO timeout
|
||||||
|
|
||||||
# TODO HEADER
|
# TODO HEADER
|
||||||
|
# capture_uuid = self.lacus.enqueue(url='https://cpg.circl.lu:7000',
|
||||||
|
# force=force,
|
||||||
|
# general_timeout_in_sec=120)
|
||||||
|
|
||||||
capture_uuid = self.lacus.enqueue(url=url,
|
capture_uuid = self.lacus.enqueue(url=url,
|
||||||
depth=task.get_depth(),
|
depth=task.get_depth(),
|
||||||
|
@ -114,14 +150,13 @@ class Crawler(AbstractModule):
|
||||||
proxy=task.get_proxy(),
|
proxy=task.get_proxy(),
|
||||||
cookies=task.get_cookies(),
|
cookies=task.get_cookies(),
|
||||||
force=force,
|
force=force,
|
||||||
general_timeout_in_sec=90)
|
general_timeout_in_sec=90) # TODO increase timeout if onion ????
|
||||||
|
|
||||||
crawlers.create_capture(capture_uuid, task_uuid)
|
crawlers.create_capture(capture_uuid, task_uuid)
|
||||||
print(task.uuid, capture_uuid, 'launched')
|
print(task.uuid, capture_uuid, 'launched')
|
||||||
return capture_uuid
|
return capture_uuid
|
||||||
|
|
||||||
# CRAWL DOMAIN
|
# CRAWL DOMAIN
|
||||||
# TODO: CATCH ERRORS
|
|
||||||
def compute(self, capture):
|
def compute(self, capture):
|
||||||
print('saving capture', capture.uuid)
|
print('saving capture', capture.uuid)
|
||||||
|
|
||||||
|
@ -131,7 +166,6 @@ class Crawler(AbstractModule):
|
||||||
|
|
||||||
self.domain = Domain(domain)
|
self.domain = Domain(domain)
|
||||||
|
|
||||||
# TODO CHANGE EPOCH
|
|
||||||
epoch = int(time.time())
|
epoch = int(time.time())
|
||||||
parent_id = task.get_parent()
|
parent_id = task.get_parent()
|
||||||
|
|
||||||
|
@ -139,6 +173,9 @@ class Crawler(AbstractModule):
|
||||||
print(entries['status'])
|
print(entries['status'])
|
||||||
self.har = task.get_har()
|
self.har = task.get_har()
|
||||||
self.screenshot = task.get_screenshot()
|
self.screenshot = task.get_screenshot()
|
||||||
|
# DEBUG
|
||||||
|
# self.har = True
|
||||||
|
# self.screenshot = True
|
||||||
str_date = crawlers.get_current_date(separator=True)
|
str_date = crawlers.get_current_date(separator=True)
|
||||||
self.har_dir = crawlers.get_date_har_dir(str_date)
|
self.har_dir = crawlers.get_date_har_dir(str_date)
|
||||||
self.items_dir = crawlers.get_date_crawled_items_source(str_date)
|
self.items_dir = crawlers.get_date_crawled_items_source(str_date)
|
||||||
|
@ -156,7 +193,10 @@ class Crawler(AbstractModule):
|
||||||
self.domain.add_history(epoch, root_item=epoch)
|
self.domain.add_history(epoch, root_item=epoch)
|
||||||
|
|
||||||
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
|
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
|
||||||
task.clear()
|
print('capture:', capture.uuid, 'completed')
|
||||||
|
print('task: ', task.uuid, 'completed')
|
||||||
|
print()
|
||||||
|
task.remove()
|
||||||
|
|
||||||
def save_capture_response(self, parent_id, entries):
|
def save_capture_response(self, parent_id, entries):
|
||||||
print(entries.keys())
|
print(entries.keys())
|
||||||
|
@ -168,12 +208,11 @@ class Crawler(AbstractModule):
|
||||||
print('retrieved content')
|
print('retrieved content')
|
||||||
# print(entries.get('html'))
|
# print(entries.get('html'))
|
||||||
|
|
||||||
# TODO LOGS IF != domain
|
|
||||||
if 'last_redirected_url' in entries and entries['last_redirected_url']:
|
if 'last_redirected_url' in entries and entries['last_redirected_url']:
|
||||||
last_url = entries['last_redirected_url']
|
last_url = entries['last_redirected_url']
|
||||||
unpacked_last_url = crawlers.unpack_url(last_url)
|
unpacked_last_url = crawlers.unpack_url(last_url)
|
||||||
current_domain = unpacked_last_url['domain']
|
current_domain = unpacked_last_url['domain']
|
||||||
# REDIRECTION TODO CHECK IF WEB
|
# REDIRECTION TODO CHECK IF TYPE CHANGE
|
||||||
if current_domain != self.domain.id and not self.root_item:
|
if current_domain != self.domain.id and not self.root_item:
|
||||||
self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
|
self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
|
||||||
print(f'External redirection {self.domain.id} -> {current_domain}')
|
print(f'External redirection {self.domain.id} -> {current_domain}')
|
||||||
|
@ -225,92 +264,4 @@ class Crawler(AbstractModule):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
module = Crawler()
|
module = Crawler()
|
||||||
module.debug = True
|
module.debug = True
|
||||||
# module.compute(('ooooo', 0))
|
|
||||||
module.run()
|
module.run()
|
||||||
|
|
||||||
|
|
||||||
##################################
|
|
||||||
##################################
|
|
||||||
##################################
|
|
||||||
##################################
|
|
||||||
##################################
|
|
||||||
|
|
||||||
# def update_auto_crawler():
|
|
||||||
# current_epoch = int(time.time())
|
|
||||||
# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
|
|
||||||
# for elem_to_crawl in list_to_crawl:
|
|
||||||
# mess, type = elem_to_crawl.rsplit(';', 1)
|
|
||||||
# redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)
|
|
||||||
# redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)
|
|
||||||
|
|
||||||
# Extract info from url (url, domain, domain url, ...)
|
|
||||||
# def unpack_url(url):
|
|
||||||
# to_crawl = {}
|
|
||||||
# faup.decode(url)
|
|
||||||
# url_unpack = faup.get()
|
|
||||||
# to_crawl['domain'] = to_crawl['domain'].lower()
|
|
||||||
# new_url_host = url_host.lower()
|
|
||||||
# url_lower_case = url.replace(url_host, new_url_host, 1)
|
|
||||||
#
|
|
||||||
# if url_unpack['scheme'] is None:
|
|
||||||
# to_crawl['scheme'] = 'http'
|
|
||||||
# url= 'http://{}'.format(url_lower_case)
|
|
||||||
# else:
|
|
||||||
# try:
|
|
||||||
# scheme = url_unpack['scheme'].decode()
|
|
||||||
# except Exception as e:
|
|
||||||
# scheme = url_unpack['scheme']
|
|
||||||
# if scheme in default_proto_map:
|
|
||||||
# to_crawl['scheme'] = scheme
|
|
||||||
# url = url_lower_case
|
|
||||||
# else:
|
|
||||||
# redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
|
|
||||||
# to_crawl['scheme'] = 'http'
|
|
||||||
# url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
|
|
||||||
#
|
|
||||||
# if url_unpack['port'] is None:
|
|
||||||
# to_crawl['port'] = default_proto_map[to_crawl['scheme']]
|
|
||||||
# else:
|
|
||||||
# try:
|
|
||||||
# port = url_unpack['port'].decode()
|
|
||||||
# except:
|
|
||||||
# port = url_unpack['port']
|
|
||||||
# # Verify port number #################### make function to verify/correct port number
|
|
||||||
# try:
|
|
||||||
# int(port)
|
|
||||||
# # Invalid port Number
|
|
||||||
# except Exception as e:
|
|
||||||
# port = default_proto_map[to_crawl['scheme']]
|
|
||||||
# to_crawl['port'] = port
|
|
||||||
#
|
|
||||||
# #if url_unpack['query_string'] is None:
|
|
||||||
# # if to_crawl['port'] == 80:
|
|
||||||
# # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
|
|
||||||
# # else:
|
|
||||||
# # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
|
|
||||||
# #else:
|
|
||||||
# # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())
|
|
||||||
#
|
|
||||||
# to_crawl['url'] = url
|
|
||||||
# if to_crawl['port'] == 80:
|
|
||||||
# to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
|
|
||||||
# else:
|
|
||||||
# to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
|
|
||||||
#
|
|
||||||
# try:
|
|
||||||
# to_crawl['tld'] = url_unpack['tld'].decode()
|
|
||||||
# except:
|
|
||||||
# to_crawl['tld'] = url_unpack['tld']
|
|
||||||
#
|
|
||||||
# return to_crawl
|
|
||||||
|
|
||||||
# ##################################################### add ftp ???
|
|
||||||
# update_auto_crawler()
|
|
||||||
|
|
||||||
# # add next auto Crawling in queue:
|
|
||||||
# if to_crawl['paste'] == 'auto':
|
|
||||||
# redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
|
|
||||||
# # update list, last auto crawled domains
|
|
||||||
# redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
|
|
||||||
# redis_crawler.ltrim('last_auto_crawled', 0, 9)
|
|
||||||
#
|
|
||||||
|
|
|
@ -19,8 +19,9 @@ import uuid
|
||||||
|
|
||||||
from enum import IntEnum, unique
|
from enum import IntEnum, unique
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
from dateutil.relativedelta import relativedelta
|
||||||
from urllib.parse import urlparse, urljoin
|
from urllib.parse import urlparse, urljoin
|
||||||
#from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from pylacus import PyLacus
|
from pylacus import PyLacus
|
||||||
|
|
||||||
|
@ -44,8 +45,6 @@ r_db = config_loader.get_db_conn("Kvrocks_DB")
|
||||||
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")
|
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")
|
||||||
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||||
|
|
||||||
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
|
|
||||||
|
|
||||||
ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes")
|
ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes")
|
||||||
HAR_DIR = config_loader.get_files_directory('har')
|
HAR_DIR = config_loader.get_files_directory('har')
|
||||||
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
|
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
|
||||||
|
@ -181,8 +180,8 @@ def extract_favicon_from_html(html, url):
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
# # TODO: handle prefix cookies
|
# # TODO:
|
||||||
# # TODO: fill empty fields
|
# # TODO: REVIEW ME THEN REMOVE ME
|
||||||
def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
|
def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
|
||||||
# check cookie domain filed
|
# check cookie domain filed
|
||||||
if not 'domain' in cookie_dict:
|
if not 'domain' in cookie_dict:
|
||||||
|
@ -201,13 +200,6 @@ def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
|
||||||
cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
|
cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
|
||||||
return cookie_dict
|
return cookie_dict
|
||||||
|
|
||||||
def load_crawler_cookies(cookiejar_uuid, domain, crawler_type='web'):
|
|
||||||
cookies = get_cookiejar_cookies_list(cookiejar_uuid)
|
|
||||||
all_cookies = []
|
|
||||||
for cookie_dict in cookies:
|
|
||||||
all_cookies.append(create_cookie_crawler(cookie_dict, domain, crawler_type=crawler_type))
|
|
||||||
return all_cookies
|
|
||||||
|
|
||||||
################################################################################
|
################################################################################
|
||||||
################################################################################
|
################################################################################
|
||||||
################################################################################
|
################################################################################
|
||||||
|
@ -695,7 +687,285 @@ def load_blacklist():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
#### CRAWLER STATE ####
|
#### CRAWLER Scheduler ####
|
||||||
|
|
||||||
|
@unique
class ScheduleStatus(IntEnum):
    """The status of a crawler schedule."""
    UNKNOWN = -1    # neither in the scheduler queue nor bound to a task
    QUEUED = 0      # bound to a task whose capture is not ongoing
    SCHEDULED = 1   # waiting in the scheduler queue for its next run
    ONGOING = 2     # bound to a task whose capture is ongoing
|
||||||
|
|
||||||
|
def get_schedulers_uuid():
    """Return the members of the ``scheduler:schedules`` set (schedule UUIDs)."""
    schedule_uuids = r_crawler.smembers('scheduler:schedules')
    return schedule_uuids
|
||||||
|
|
||||||
|
def get_schedulers_metas():
    """Return the status metadata dict of every registered schedule as a list."""
    return [CrawlerSchedule(schedule_uuid).get_meta_status()
            for schedule_uuid in get_schedulers_uuid()]
|
||||||
|
|
||||||
|
class CrawlerScheduler:
    """Turn stored crawl schedules into crawler tasks.

    ``update_queue`` gives every idle schedule a next-run time in the
    ``scheduler:queue`` sorted set; ``process_queue`` creates a crawler task
    for every schedule whose run time has been reached.
    """

    def __init__(self):
        # Smallest allowed delay, in seconds, for a custom frequency.
        self.min_frequency = 60  # TODO ADD IN CONFIG

    def _compute_next_run(self, frequency):
        """Return the next run time (epoch seconds, float) for *frequency*.

        *frequency* is 'hourly', 'daily', 'weekly', 'monthly' or a custom
        'months:weeks:days:hours:minutes' string; empty fields count as 0.
        Custom intervals shorter than ``self.min_frequency`` seconds are
        raised to that minimum.

        Raises:
            ValueError: the custom string does not hold five integer fields.
        """
        now = datetime.now()
        named_steps = {
            'hourly': timedelta(hours=1),
            'daily': timedelta(days=1),
            'weekly': timedelta(weeks=1),
            'monthly': relativedelta(months=1),
        }
        if frequency in named_steps:
            return (now + named_steps[frequency]).timestamp()

        fields = frequency.split(':')
        if len(fields) != 5:
            raise ValueError(f'invalid frequency: {frequency}')
        months, weeks, days, hours, minutes = (int(field) if field else 0 for field in fields)
        time_next_run = (now + relativedelta(months=months, weeks=weeks,
                                             days=days, hours=hours,
                                             minutes=minutes)).timestamp()
        # Make sure the next capture is not scheduled for in a too short interval
        interval_next_capture = time_next_run - now.timestamp()
        if interval_next_capture < self.min_frequency:
            # self.logger.warning(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
            print(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
            time_next_run = (now + timedelta(seconds=self.min_frequency)).timestamp()
        return time_next_run

    def update_queue(self):
        """Queue every schedule that is neither already queued nor bound to a live task.

        Schedules with a missing or malformed frequency are reported and
        skipped so that one bad entry cannot stop the others.
        """
        for schedule_uuid in get_schedulers_uuid():
            schedule = CrawlerSchedule(schedule_uuid)
            # check if already in scheduler queue
            if schedule.is_scheduled():
                continue
            if schedule.is_tasked():
                continue

            # EXPIRE ????

            frequency = schedule.get_frequency()  # optional or later -> cron
            if not frequency:
                print(f'Error: schedule {schedule_uuid} has no frequency')
                continue
            try:
                time_next_run = self._compute_next_run(frequency)
            except ValueError:
                print(f'Error: invalid frequency {frequency} for schedule {schedule_uuid}')
                continue

            schedule.set_next_run(time_next_run)
            print('scheduled:', schedule_uuid)

    def process_queue(self):
        """Create a crawler task for every queued schedule whose run time has passed."""
        now = datetime.now().timestamp()
        for schedule_uuid, _next_run in r_crawler.zrangebyscore('scheduler:queue', '-inf', int(now), withscores=True):
            schedule = CrawlerSchedule(schedule_uuid)
            if not schedule.exists():
                # Stale entry: drop it and keep going. Returning here would
                # skip every other due schedule and leave this entry at the
                # head of the queue.
                r_crawler.zrem('scheduler:queue', schedule_uuid)
                continue
            meta = schedule.get_meta()
            task_uuid = create_task(meta['url'], depth=meta['depth'], har=meta['har'], screenshot=meta['screenshot'],
                                    header=meta['header'],
                                    cookiejar=meta['cookiejar'], proxy=meta['proxy'],
                                    user_agent=meta['user_agent'], parent='scheduler', priority=40)
            if task_uuid:
                schedule.set_task(task_uuid)
                # NOTE(review): removal is kept under the success branch so a
                # failed task creation is retried on the next cycle - confirm
                # against the original indentation.
                r_crawler.zrem('scheduler:queue', schedule_uuid)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO Expire -> stuck in crawler queue or reached delta
|
||||||
|
class CrawlerSchedule:
    """A recurring crawl definition stored in the ``schedule:<uuid>`` hash.

    The next run time lives in the ``scheduler:queue`` sorted set and the set
    ``scheduler:schedules`` lists every schedule UUID.
    """

    def __init__(self, schedule_uuid):
        self.uuid = schedule_uuid

    def _get_field(self, field):
        """Return the raw value of *field* from the schedule hash (None if unset)."""
        return r_crawler.hget(f'schedule:{self.uuid}', field)

    def _set_field(self, field, value):
        return r_crawler.hset(f'schedule:{self.uuid}', field, value)

    def exists(self):
        return r_crawler.exists(f'schedule:{self.uuid}')

    def get_frequency(self):
        return self._get_field('frequency')

    def get_user(self):
        return self._get_field('user')

    def get_date(self):
        return self._get_field('date')

    def get_captures(self):  # only scheduled capture ????? exclude manual/discovery
        """Not implemented yet; always returns None."""
        pass

    def get_status(self):
        """Return the ScheduleStatus derived from the queue and the bound task."""
        if self.is_scheduled():
            return ScheduleStatus.SCHEDULED
        if self.is_tasked():
            if self.is_ongoing():
                return ScheduleStatus.ONGOING
            else:
                return ScheduleStatus.QUEUED
        return ScheduleStatus.UNKNOWN

    def get_task_uuid(self):
        return self._get_field('task')

    def is_tasked(self):
        """Return whether the bound task still exists.

        A reference to a task that no longer exists is removed from the hash.
        """
        task_uuid = self.get_task_uuid()
        if task_uuid:
            task = CrawlerTask(task_uuid)
            tasked = task.exists()
            if not tasked:
                r_crawler.hdel(f'schedule:{self.uuid}', 'task')
            return tasked
        return False

    def get_task(self):
        """Return the bound CrawlerTask, or None when no task uuid is stored."""
        task_uuid = self.get_task_uuid()
        if task_uuid:
            return CrawlerTask(task_uuid)

    def set_task(self, task_uuid):
        return self._set_field('task', task_uuid)

    def is_ongoing(self):
        task = self.get_task()
        if task:
            return task.is_ongoing()
        return False

    def get_next_run(self, r_str=False):
        """Return the queued run time, formatted as local time when *r_str* is true."""
        next_run = r_crawler.zscore('scheduler:queue', self.uuid)
        if next_run and r_str:
            next_run = time.strftime('%Y-%m-%d - %H:%M:%S', time.localtime(int(next_run)))
        return next_run

    def set_next_run(self, time_next_run):
        r_crawler.zadd('scheduler:queue', mapping={self.uuid: time_next_run})

    def is_scheduled(self):
        # zscore returns None for an absent member; comparing with None keeps
        # a member stored with score 0 from being reported as unscheduled.
        return r_crawler.zscore('scheduler:queue', self.uuid) is not None

    def get_url(self):
        return self._get_field('url')

    def get_depth(self):
        return self._get_field('depth')

    def get_har(self):
        # stored as str(bool) by create()
        return self._get_field('har') == 'True'

    def get_screenshot(self):
        # stored as str(bool) by create()
        return self._get_field('screenshot') == 'True'

    def get_header(self):
        # The value was previously fetched but never returned, so get_meta()
        # always reported header=None.
        return self._get_field('header')

    def get_cookiejar(self):
        return self._get_field('cookiejar')

    def get_proxy(self):
        return self._get_field('proxy')

    def get_user_agent(self):
        return self._get_field('user_agent')

    def get_meta(self, ui=False):
        """Return all schedule fields plus 'status' and 'next_run'.

        With ``ui=True`` the status is given by name and the next run as a
        formatted string; otherwise the enum member and raw score are used.
        """
        meta = {
            'uuid': self.uuid,
            'date': self.get_date(),
            'frequency': self.get_frequency(),
            'user': self.get_user(),
            'url': self.get_url(),
            'depth': self.get_depth(),
            'har': self.get_har(),
            'screenshot': self.get_screenshot(),
            'user_agent': self.get_user_agent(),
            'cookiejar': self.get_cookiejar(),
            'header': self.get_header(),
            'proxy': self.get_proxy(),
        }
        status = self.get_status()
        if ui:
            status = status.name
            r_str = True
        else:
            r_str = False
        meta['status'] = status
        meta['next_run'] = self.get_next_run(r_str=r_str)
        return meta

    def get_meta_status(self):  # TODO: Description ? Frequency ???
        """Return a short summary: uuid, url, user, formatted next run and status name."""
        meta = {'uuid': self.uuid,
                'url': self.get_url(),
                'user': self.get_user(),
                'next_run': self.get_next_run(r_str=True)}
        status = self.get_status()
        if isinstance(status, ScheduleStatus):
            status = status.name
        meta['status'] = status
        return meta

    def create(self, frequency, user, url,
               depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
        """Store a new schedule and register it in ``scheduler:schedules``.

        Raises:
            Exception: a schedule with this uuid already exists.
        """
        if self.exists():
            raise Exception('Error: Monitor already exists')

        url_decoded = unpack_url(url)
        url = url_decoded['url']

        self._set_field('date', datetime.now().strftime("%Y-%m-%d"))
        self._set_field('frequency', frequency)
        self._set_field('user', user)
        self._set_field('url', url)
        self._set_field('depth', int(depth))
        self._set_field('har', str(har))
        self._set_field('screenshot', str(screenshot))

        if cookiejar:
            self._set_field('cookiejar', cookiejar)
        if header:
            self._set_field('header', header)
        if proxy:
            if proxy == 'web':
                proxy = None
            elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
                proxy = 'force_tor'
        # 'web' is normalised to None above; only write the field when a
        # proxy remains, because None is not a storable hash value.
        if proxy:
            self._set_field('proxy', proxy)
        if user_agent:
            self._set_field('user_agent', user_agent)

        r_crawler.sadd('scheduler:schedules', self.uuid)

    def delete(self):
        """Remove the schedule from the queue, delete its task and its stored fields."""
        # remove from schedule queue
        r_crawler.zrem('scheduler:queue', self.uuid)

        # delete task
        task = self.get_task()
        if task:
            task.delete()

        # delete meta
        r_crawler.delete(f'schedule:{self.uuid}')
        r_crawler.srem('scheduler:schedules', self.uuid)
|
||||||
|
|
||||||
|
def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
    """Create a schedule under a freshly generated uuid and return that uuid."""
    new_uuid = gen_uuid()
    options = {
        'depth': depth,
        'har': har,
        'screenshot': screenshot,
        'header': header,
        'cookiejar': cookiejar,
        'proxy': proxy,
        'user_agent': user_agent,
    }
    CrawlerSchedule(new_uuid).create(frequency, user, url, **options)
    return new_uuid
|
||||||
|
|
||||||
|
# TODO sanitize UUID
|
||||||
|
def api_delete_schedule(data):
    """Delete the schedule named by ``data['uuid']``.

    Returns:
        ``(result, 200)`` where *result* is whatever ``CrawlerSchedule.delete``
        returns, or ``(error_dict, 404)`` when the uuid is unknown.
    """
    schedule_uuid = data.get('uuid')
    schedule = CrawlerSchedule(schedule_uuid)
    if not schedule.exists():
        # Echo the uuid string; the CrawlerSchedule object itself was
        # returned before and is not a serialisable payload value.
        return {'error': 'unknown schedule uuid', 'uuid': schedule_uuid}, 404
    return schedule.delete(), 200
|
||||||
|
|
||||||
#### CRAWLER CAPTURE ####
|
#### CRAWLER CAPTURE ####
|
||||||
|
|
||||||
|
@ -709,7 +979,15 @@ def reload_crawler_captures():
|
||||||
r_cache.delete('crawler:captures')
|
r_cache.delete('crawler:captures')
|
||||||
for capture_uuid in get_crawler_captures():
|
for capture_uuid in get_crawler_captures():
|
||||||
capture = CrawlerCapture(capture_uuid)
|
capture = CrawlerCapture(capture_uuid)
|
||||||
r_cache.zadd('crawler:captures', {capture.uuid: 0})
|
capture.update(None)
|
||||||
|
|
||||||
|
def _clear_captures():
    """Delete every known capture together with its task, printing each uuid."""
    for capture_uuid in get_crawler_captures():
        capture = CrawlerCapture(capture_uuid)
        # the task is removed first, then the capture itself
        capture.get_task().delete()
        capture.delete()
        print(capture_uuid, 'deleted')
|
||||||
|
|
||||||
@unique
|
@unique
|
||||||
class CaptureStatus(IntEnum):
|
class CaptureStatus(IntEnum):
|
||||||
|
@ -741,6 +1019,9 @@ class CrawlerCapture:
|
||||||
def get_status(self):
|
def get_status(self):
|
||||||
return r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
|
return r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
|
||||||
|
|
||||||
|
def is_ongoing(self):
    """Return True when the stored capture status is ONGOING.

    The status is read back from the cache as text, so it is converted to an
    integer before the comparison; comparing the raw value with the enum
    member never matched. A missing or non-numeric status yields False.
    """
    status = self.get_status()
    if status is None:
        return False
    try:
        return int(status) == CaptureStatus.ONGOING
    except (TypeError, ValueError):
        return False
|
||||||
|
|
||||||
def create(self, task_uuid):
|
def create(self, task_uuid):
|
||||||
if self.exists():
|
if self.exists():
|
||||||
raise Exception(f'Error: Capture {self.uuid} already exists')
|
raise Exception(f'Error: Capture {self.uuid} already exists')
|
||||||
|
@ -752,20 +1033,26 @@ class CrawlerCapture:
|
||||||
r_cache.zadd('crawler:captures', {self.uuid: launch_time})
|
r_cache.zadd('crawler:captures', {self.uuid: launch_time})
|
||||||
|
|
||||||
def update(self, status):
    """Store the capture status and refresh its position in ``crawler:captures``.

    A falsy *status* (error or reload) is stored as UNKNOWN with score 0;
    any other status is stored with the current time as score.
    """
    # Enum members are converted to plain integers before being written:
    # the Redis client serialises integers through repr(), which for an enum
    # member is '<Name.MEMBER: value>' rather than the number.
    # NOTE(review): the falsy test also matches a status equal to 0 - confirm
    # that this is intended.
    # Error or Reload
    if not status:
        r_cache.hset(f'crawler:capture:{self.uuid}', 'status', int(CaptureStatus.UNKNOWN))
        r_cache.zadd('crawler:captures', {self.uuid: 0})
    else:
        last_check = int(time.time())
        if isinstance(status, int):
            status = int(status)
        r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
        r_cache.zadd('crawler:captures', {self.uuid: last_check})
|
||||||
|
|
||||||
def remove(self): # TODO INCOMPLETE
|
# Crawler
|
||||||
|
def remove(self):
|
||||||
r_crawler.zrem('crawler:captures', self.uuid)
|
r_crawler.zrem('crawler:captures', self.uuid)
|
||||||
|
r_cache.delete(f'crawler:capture:{self.uuid}')
|
||||||
r_crawler.hdel('crawler:captures:tasks', self.uuid)
|
r_crawler.hdel('crawler:captures:tasks', self.uuid)
|
||||||
|
|
||||||
# TODO
|
# Manual
|
||||||
# TODO DELETE TASK ???
|
|
||||||
def delete(self):
|
def delete(self):
|
||||||
# task = self.get_task()
|
# remove Capture from crawler queue
|
||||||
# task.delete()
|
r_cache.zrem('crawler:captures', self.uuid)
|
||||||
r_cache.delete(f'crawler:capture:{self.uuid}')
|
self.remove()
|
||||||
|
|
||||||
|
|
||||||
def create_capture(capture_uuid, task_uuid):
|
def create_capture(capture_uuid, task_uuid):
|
||||||
|
@ -792,9 +1079,13 @@ def get_captures_status():
|
||||||
'uuid': task.uuid,
|
'uuid': task.uuid,
|
||||||
'domain': dom.get_id(),
|
'domain': dom.get_id(),
|
||||||
'type': dom.get_domain_type(),
|
'type': dom.get_domain_type(),
|
||||||
'start_time': capture.get_start_time(), ############### TODO
|
'start_time': capture.get_start_time(),
|
||||||
'status': capture.get_status(),
|
'status': capture.get_status(),
|
||||||
}
|
}
|
||||||
|
capture_status = capture.get_status()
|
||||||
|
if capture_status:
|
||||||
|
capture_status = CaptureStatus(int(capture_status)).name
|
||||||
|
meta['status'] = capture_status
|
||||||
status.append(meta)
|
status.append(meta)
|
||||||
return status
|
return status
|
||||||
|
|
||||||
|
@ -872,6 +1163,12 @@ class CrawlerTask:
|
||||||
def get_capture(self):
|
def get_capture(self):
|
||||||
return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture')
|
return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture')
|
||||||
|
|
||||||
|
def is_ongoing(self):
    """
    Tell whether the capture attached to this task is still ongoing.

    :return: False when the task has no capture recorded, otherwise whatever
             the linked CrawlerCapture reports from its own is_ongoing().
    """
    linked_capture = self.get_capture()
    if not linked_capture:
        # No capture uuid stored on the task hash.
        return False
    return CrawlerCapture(linked_capture).is_ongoing()
|
||||||
|
|
||||||
def _set_field(self, field, value):
|
def _set_field(self, field, value):
|
||||||
return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)
|
return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)
|
||||||
|
|
||||||
|
@ -923,8 +1220,6 @@ class CrawlerTask:
|
||||||
proxy = None
|
proxy = None
|
||||||
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
|
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
|
||||||
proxy = 'force_tor'
|
proxy = 'force_tor'
|
||||||
if not user_agent:
|
|
||||||
user_agent = get_default_user_agent()
|
|
||||||
|
|
||||||
# TODO SANITIZE COOKIEJAR -> UUID
|
# TODO SANITIZE COOKIEJAR -> UUID
|
||||||
|
|
||||||
|
@ -934,13 +1229,11 @@ class CrawlerTask:
|
||||||
self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query)
|
self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query)
|
||||||
return self.uuid
|
return self.uuid
|
||||||
|
|
||||||
# TODO ADD TASK STATUS -----
|
|
||||||
self._set_field('domain', domain)
|
self._set_field('domain', domain)
|
||||||
self._set_field('url', url)
|
self._set_field('url', url)
|
||||||
self._set_field('depth', int(depth))
|
self._set_field('depth', int(depth))
|
||||||
self._set_field('har', har)
|
self._set_field('har', har)
|
||||||
self._set_field('screenshot', screenshot)
|
self._set_field('screenshot', screenshot)
|
||||||
self._set_field('user_agent', user_agent)
|
|
||||||
self._set_field('parent', parent)
|
self._set_field('parent', parent)
|
||||||
|
|
||||||
if cookiejar:
|
if cookiejar:
|
||||||
|
@ -949,30 +1242,45 @@ class CrawlerTask:
|
||||||
self._set_field('header', header)
|
self._set_field('header', header)
|
||||||
if proxy:
|
if proxy:
|
||||||
self._set_field('proxy', proxy)
|
self._set_field('proxy', proxy)
|
||||||
|
if user_agent:
|
||||||
|
self._set_field('user_agent', user_agent)
|
||||||
|
|
||||||
r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
|
r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
|
||||||
self._set_field('hash', hash_query)
|
self._set_field('hash', hash_query)
|
||||||
r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
||||||
|
self.add_to_db_crawler_queue(priority)
|
||||||
# UI
|
# UI
|
||||||
domain_type = dom.get_domain_type()
|
domain_type = dom.get_domain_type()
|
||||||
r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
|
r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
|
||||||
self._set_field('queue', domain_type)
|
self._set_field('queue', domain_type)
|
||||||
return self.uuid
|
return self.uuid
|
||||||
|
|
||||||
def lacus_queue(self):
|
def add_to_db_crawler_queue(self, priority):
|
||||||
r_crawler.sadd('crawler:queue:queued', self.uuid)
|
r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
||||||
|
|
||||||
|
def start(self):
|
||||||
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||||
|
|
||||||
def clear(self):
|
# Crawler
|
||||||
r_crawler.hdel('crawler:queue:hash', self.get_hash())
|
def remove(self): # zrem cache + DB
|
||||||
r_crawler.srem(f'crawler:queue:type:{self.get_queue()}', self.uuid)
|
capture_uuid = self.get_capture()
|
||||||
r_crawler.srem('crawler:queue:queued', self.uuid)
|
if capture_uuid:
|
||||||
|
capture = CrawlerCapture(capture_uuid)
|
||||||
def delete(self):
|
capture.remove()
|
||||||
self.clear()
|
queue_type = self.get_queue()
|
||||||
|
if queue_type:
|
||||||
|
r_crawler.srem(f'crawler:queue:type:{queue_type}', self.uuid)
|
||||||
|
task_hash = self.get_hash()
|
||||||
|
if task_hash:
|
||||||
|
r_crawler.hdel('crawler:queue:hash', task_hash)
|
||||||
|
# meta
|
||||||
r_crawler.delete(f'crawler:task:{self.uuid}')
|
r_crawler.delete(f'crawler:task:{self.uuid}')
|
||||||
# r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
|
||||||
|
|
||||||
|
# Manual
|
||||||
|
def delete(self):
|
||||||
|
# queue
|
||||||
|
r_crawler.zrem('crawler:queue', self.uuid)
|
||||||
|
self.remove()
|
||||||
|
|
||||||
|
|
||||||
# TODO move to class ???
|
# TODO move to class ???
|
||||||
|
@ -990,7 +1298,7 @@ def add_task_to_lacus_queue():
|
||||||
return None
|
return None
|
||||||
task_uuid, priority = task_uuid[0]
|
task_uuid, priority = task_uuid[0]
|
||||||
task = CrawlerTask(task_uuid)
|
task = CrawlerTask(task_uuid)
|
||||||
task.lacus_queue()
|
task.start()
|
||||||
return task.uuid, priority
|
return task.uuid, priority
|
||||||
|
|
||||||
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
|
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
|
||||||
|
@ -1006,29 +1314,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
|
||||||
proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
|
proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
|
||||||
return task_uuid
|
return task_uuid
|
||||||
|
|
||||||
######################################################################
|
|
||||||
######################################################################
|
|
||||||
|
|
||||||
# def get_task_status(task_uuid):
|
## -- CRAWLER TASK -- ##
|
||||||
# domain = r_crawler.hget(f'crawler:task:{task_uuid}', 'domain')
|
|
||||||
# dom = Domain(domain)
|
|
||||||
# meta = {
|
|
||||||
# 'uuid': task_uuid,
|
|
||||||
# 'domain': dom.get_id(),
|
|
||||||
# 'domain_type': dom.get_domain_type(),
|
|
||||||
# 'start_time': r_crawler.hget(f'crawler:task:{task_uuid}', 'start_time'),
|
|
||||||
# 'status': 'test',
|
|
||||||
# }
|
|
||||||
# return meta
|
|
||||||
|
|
||||||
# def get_crawlers_tasks_status():
|
|
||||||
# tasks_status = []
|
|
||||||
# tasks = r_crawler.smembers('crawler:queue:queued')
|
|
||||||
# for task_uuid in tasks:
|
|
||||||
# tasks_status.append(get_task_status(task_uuid))
|
|
||||||
# return tasks_status
|
|
||||||
|
|
||||||
##-- CRAWLER TASK --##
|
|
||||||
|
|
||||||
#### CRAWLER TASK API ####
|
#### CRAWLER TASK API ####
|
||||||
|
|
||||||
|
@ -1071,13 +1358,25 @@ def api_add_crawler_task(data, user_id=None):
|
||||||
return {'error': 'The access to this cookiejar is restricted'}, 403
|
return {'error': 'The access to this cookiejar is restricted'}, 403
|
||||||
cookiejar_uuid = cookiejar.uuid
|
cookiejar_uuid = cookiejar.uuid
|
||||||
|
|
||||||
# if auto_crawler:
|
frequency = data.get('frequency', None)
|
||||||
# try:
|
if frequency:
|
||||||
# crawler_delta = int(crawler_delta)
|
if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
|
||||||
# if crawler_delta < 0:
|
if not isinstance(frequency, dict):
|
||||||
# return ({'error':'invalid delta between two pass of the crawler'}, 400)
|
return {'error': 'Invalid frequency'}, 400
|
||||||
# except ValueError:
|
else:
|
||||||
# return ({'error':'invalid delta between two pass of the crawler'}, 400)
|
try:
|
||||||
|
months = int(frequency.get('months', 0))
|
||||||
|
weeks = int(frequency.get('weeks', 0))
|
||||||
|
days = int(frequency.get('days', 0))
|
||||||
|
hours = int(frequency.get('hours', 0))
|
||||||
|
minutes = int(frequency.get('minutes', 0))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return {'error': 'Invalid frequency'}, 400
|
||||||
|
if min(months, weeks, days, hours, minutes) < 0:
|
||||||
|
return {'error': 'Invalid frequency'}, 400
|
||||||
|
if max(months, weeks, days, hours, minutes) <= 0:
|
||||||
|
return {'error': 'Invalid frequency'}, 400
|
||||||
|
frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
|
||||||
|
|
||||||
# PROXY
|
# PROXY
|
||||||
proxy = data.get('proxy', None)
|
proxy = data.get('proxy', None)
|
||||||
|
@ -1088,15 +1387,16 @@ def api_add_crawler_task(data, user_id=None):
|
||||||
if verify[1] != 200:
|
if verify[1] != 200:
|
||||||
return verify
|
return verify
|
||||||
|
|
||||||
# TODO #############################################################################################################
|
if frequency:
|
||||||
# auto_crawler = auto_crawler
|
# TODO verify user
|
||||||
# crawler_delta = crawler_delta
|
return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||||
parent = 'manual'
|
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None), 200
|
||||||
|
else:
|
||||||
# TODO HEADERS
|
# TODO HEADERS
|
||||||
# TODO USER AGENT
|
# TODO USER AGENT
|
||||||
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, cookiejar=cookiejar_uuid,
|
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||||
proxy=proxy, user_agent=None, parent='manual', priority=90), 200
|
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None,
|
||||||
|
parent='manual', priority=90), 200
|
||||||
|
|
||||||
|
|
||||||
#### ####
|
#### ####
|
||||||
|
@ -1108,13 +1408,6 @@ def api_add_crawler_task(data, user_id=None):
|
||||||
###################################################################################
|
###################################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#### CRAWLER GLOBAL ####
|
#### CRAWLER GLOBAL ####
|
||||||
|
|
||||||
# TODO: # FIXME: config db, dynamic load
|
# TODO: # FIXME: config db, dynamic load
|
||||||
|
@ -1124,55 +1417,8 @@ def is_crawler_activated():
|
||||||
def get_crawler_all_types():
|
def get_crawler_all_types():
|
||||||
return ['onion', 'web']
|
return ['onion', 'web']
|
||||||
|
|
||||||
def sanitize_crawler_types(l_crawler_types):
|
|
||||||
all_crawler_types = get_crawler_all_types()
|
|
||||||
if not l_crawler_types:
|
|
||||||
return all_crawler_types
|
|
||||||
for crawler_type in l_crawler_types:
|
|
||||||
if crawler_type not in all_crawler_types:
|
|
||||||
return all_crawler_types
|
|
||||||
return l_crawler_types
|
|
||||||
|
|
||||||
##-- CRAWLER GLOBAL --##
|
##-- CRAWLER GLOBAL --##
|
||||||
|
|
||||||
#### AUTOMATIC CRAWLER ####
|
|
||||||
|
|
||||||
def get_auto_crawler_all_domain(l_crawler_types=[]):
|
|
||||||
l_crawler_types = sanitize_crawler_types(l_crawler_types)
|
|
||||||
if len(l_crawler_types) == 1:
|
|
||||||
return r_serv_onion.smembers(f'auto_crawler_url:{l_crawler_types[0]}')
|
|
||||||
else:
|
|
||||||
l_keys_name = []
|
|
||||||
for crawler_type in l_crawler_types:
|
|
||||||
l_keys_name.append(f'auto_crawler_url:{crawler_type}')
|
|
||||||
return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:])
|
|
||||||
|
|
||||||
def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
|
|
||||||
r_serv_onion.zadd('crawler_auto_queue', {f'{message};{domain_type}': int(time.time() + delta)})
|
|
||||||
# update list, last auto crawled domains
|
|
||||||
r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}')
|
|
||||||
r_serv_onion.ltrim('last_auto_crawled', 0, 9)
|
|
||||||
|
|
||||||
def update_auto_crawler_queue():
|
|
||||||
current_epoch = int(time.time())
|
|
||||||
# check if current_epoch > domain_next_epoch
|
|
||||||
l_queue = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, current_epoch)
|
|
||||||
for elem in l_queue:
|
|
||||||
mess, domain_type = elem.rsplit(';', 1)
|
|
||||||
print(domain_type)
|
|
||||||
print(mess)
|
|
||||||
r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
|
|
||||||
|
|
||||||
|
|
||||||
##-- AUTOMATIC CRAWLER --##
|
|
||||||
|
|
||||||
#### CRAWLER TASK ####
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##-- CRAWLER TASK --##
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#### ####
|
#### ####
|
||||||
|
|
||||||
|
@ -1207,6 +1453,8 @@ def save_har(har_dir, item_id, har_content):
|
||||||
# #
|
# #
|
||||||
# # # # # # # # # # # #
|
# # # # # # # # # # # #
|
||||||
|
|
||||||
|
#### PROXY ####
|
||||||
|
|
||||||
def api_verify_proxy(proxy_url):
|
def api_verify_proxy(proxy_url):
|
||||||
parsed_proxy = urlparse(proxy_url)
|
parsed_proxy = urlparse(proxy_url)
|
||||||
if parsed_proxy.scheme and parsed_proxy.hostname and parsed_proxy.port:
|
if parsed_proxy.scheme and parsed_proxy.hostname and parsed_proxy.port:
|
||||||
|
@ -1237,13 +1485,7 @@ class CrawlerProxy:
|
||||||
def get_url(self):
    """
    Return the URL stored for this proxy.

    Reads the 'url' field of the `crawler:proxy:<uuid>` hash through
    r_crawler.

    :return: the stored URL, or None when the field is not set.
    """
    # NOTE(review): assumes the instance keeps its identifier in self.uuid,
    # as CrawlerCapture and CrawlerTask do - confirm against __init__.
    return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
|
||||||
|
|
||||||
###############################################################################################
|
#### CRAWLER LACUS ####
|
||||||
###############################################################################################
|
|
||||||
###############################################################################################
|
|
||||||
###############################################################################################
|
|
||||||
|
|
||||||
|
|
||||||
# # # # CRAWLER LACUS # # # #
|
|
||||||
|
|
||||||
def get_lacus_url():
|
def get_lacus_url():
|
||||||
return r_db.hget('crawler:lacus', 'url')
|
return r_db.hget('crawler:lacus', 'url')
|
||||||
|
@ -1363,12 +1605,7 @@ def api_set_crawler_max_captures(data):
|
||||||
save_nb_max_captures(nb_captures)
|
save_nb_max_captures(nb_captures)
|
||||||
return nb_captures, 200
|
return nb_captures, 200
|
||||||
|
|
||||||
## PROXY ##
|
## TEST ##
|
||||||
|
|
||||||
# TODO SAVE PROXY URL + ADD PROXY TESTS
|
|
||||||
# -> name + url
|
|
||||||
|
|
||||||
## PROXY ##
|
|
||||||
|
|
||||||
def is_test_ail_crawlers_successful():
|
def is_test_ail_crawlers_successful():
|
||||||
return r_db.hget('crawler:tor:test', 'success') == 'True'
|
return r_db.hget('crawler:tor:test', 'success') == 'True'
|
||||||
|
@ -1380,7 +1617,6 @@ def save_test_ail_crawlers_result(test_success, message):
|
||||||
r_db.hset('crawler:tor:test', 'success', str(test_success))
|
r_db.hset('crawler:tor:test', 'success', str(test_success))
|
||||||
r_db.hset('crawler:tor:test', 'message', message)
|
r_db.hset('crawler:tor:test', 'message', message)
|
||||||
|
|
||||||
# TODO CREATE TEST TASK
|
|
||||||
def test_ail_crawlers():
|
def test_ail_crawlers():
|
||||||
# # TODO: test web domain
|
# # TODO: test web domain
|
||||||
if not ping_lacus():
|
if not ping_lacus():
|
||||||
|
@ -1431,10 +1667,11 @@ def test_ail_crawlers():
|
||||||
#### ---- ####
|
#### ---- ####
|
||||||
|
|
||||||
|
|
||||||
# TODO MOVE ME
|
# TODO MOVE ME IN CRAWLER OR FLASK
|
||||||
load_blacklist()
|
load_blacklist()
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
# if __name__ == '__main__':
|
||||||
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
|
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
|
||||||
# print(task.get_meta())
|
# print(task.get_meta())
|
||||||
|
# _clear_captures()
|
||||||
|
|
||||||
|
|
|
@ -61,9 +61,9 @@ def create_json_response(data, status_code):
|
||||||
def crawlers_dashboard():
|
def crawlers_dashboard():
|
||||||
is_manager_connected = crawlers.get_lacus_connection_metadata()
|
is_manager_connected = crawlers.get_lacus_connection_metadata()
|
||||||
crawlers_status = crawlers.get_captures_status()
|
crawlers_status = crawlers.get_captures_status()
|
||||||
print(crawlers_status)
|
# print(crawlers_status)
|
||||||
crawlers_latest_stats = crawlers.get_crawlers_stats()
|
crawlers_latest_stats = crawlers.get_crawlers_stats()
|
||||||
print(crawlers_latest_stats)
|
# print(crawlers_latest_stats)
|
||||||
date = crawlers.get_current_date()
|
date = crawlers.get_current_date()
|
||||||
return render_template("dashboard_crawler.html", date=date,
|
return render_template("dashboard_crawler.html", date=date,
|
||||||
is_manager_connected=is_manager_connected,
|
is_manager_connected=is_manager_connected,
|
||||||
|
@ -77,6 +77,7 @@ def crawlers_dashboard():
|
||||||
def crawler_dashboard_json():
|
def crawler_dashboard_json():
|
||||||
crawlers_status = crawlers.get_captures_status()
|
crawlers_status = crawlers.get_captures_status()
|
||||||
crawlers_latest_stats = crawlers.get_crawlers_stats()
|
crawlers_latest_stats = crawlers.get_crawlers_stats()
|
||||||
|
# print(crawlers_status)
|
||||||
|
|
||||||
return jsonify({'crawlers_status': crawlers_status,
|
return jsonify({'crawlers_status': crawlers_status,
|
||||||
'stats': crawlers_latest_stats})
|
'stats': crawlers_latest_stats})
|
||||||
|
@ -106,13 +107,24 @@ def send_to_spider():
|
||||||
# POST val
|
# POST val
|
||||||
url = request.form.get('url_to_crawl')
|
url = request.form.get('url_to_crawl')
|
||||||
crawler_type = request.form.get('crawler_queue_type')
|
crawler_type = request.form.get('crawler_queue_type')
|
||||||
auto_crawler = request.form.get('crawler_type') # TODO Auto Crawler
|
|
||||||
crawler_delta = request.form.get('crawler_epoch') # TODO Auto Crawler
|
|
||||||
screenshot = request.form.get('screenshot')
|
screenshot = request.form.get('screenshot')
|
||||||
har = request.form.get('har')
|
har = request.form.get('har')
|
||||||
depth_limit = request.form.get('depth_limit')
|
depth_limit = request.form.get('depth_limit')
|
||||||
cookiejar_uuid = request.form.get('cookiejar')
|
cookiejar_uuid = request.form.get('cookiejar')
|
||||||
|
|
||||||
|
# Frequency
|
||||||
|
if request.form.get('crawler_scheduler'):
|
||||||
|
frequency = request.form.get('frequency')
|
||||||
|
if frequency == 'custom':
|
||||||
|
months = request.form.get('frequency_months', 0)
|
||||||
|
weeks = request.form.get('frequency_weeks', 0)
|
||||||
|
days = request.form.get('frequency_days', 0)
|
||||||
|
hours = request.form.get('frequency_hours', 0)
|
||||||
|
minutes = request.form.get('frequency_minutes', 0)
|
||||||
|
frequency = {'months': months, 'weeks': weeks, 'days': days, 'hours': hours, 'minutes': minutes}
|
||||||
|
else:
|
||||||
|
frequency = None
|
||||||
|
|
||||||
# PROXY
|
# PROXY
|
||||||
proxy = request.form.get('proxy_name')
|
proxy = request.form.get('proxy_name')
|
||||||
if proxy:
|
if proxy:
|
||||||
|
@ -129,7 +141,7 @@ def send_to_spider():
|
||||||
cookiejar_uuid = cookiejar_uuid.rsplit(':')
|
cookiejar_uuid = cookiejar_uuid.rsplit(':')
|
||||||
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
|
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
|
||||||
|
|
||||||
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot}
|
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
|
||||||
if proxy:
|
if proxy:
|
||||||
data['proxy'] = proxy
|
data['proxy'] = proxy
|
||||||
if cookiejar_uuid:
|
if cookiejar_uuid:
|
||||||
|
@ -142,6 +154,43 @@ def send_to_spider():
|
||||||
return redirect(url_for('crawler_splash.manual'))
|
return redirect(url_for('crawler_splash.manual'))
|
||||||
|
|
||||||
|
|
||||||
|
@crawler_splash.route("/crawlers/scheduler", methods=['GET'])
@login_required
@login_read_only
def scheduler_dashboard():
    """
    Render the scheduler dashboard.

    Passes the metadata of every crawler schedule and the Lacus connection
    metadata to the `crawler_scheduler_dashboard.html` template.
    """
    # TODO list currently queued ?
    template_context = {
        'schedulers': crawlers.get_schedulers_metas(),
        'is_manager_connected': crawlers.get_lacus_connection_metadata(),
    }
    return render_template("crawler_scheduler_dashboard.html", **template_context)
|
||||||
|
|
||||||
|
@crawler_splash.route("/crawlers/schedule", methods=['GET'])
@login_required
@login_read_only
def schedule_show():
    """
    Render the detail page of one crawler schedule.

    The schedule is selected by the `uuid` query parameter; the request is
    aborted with a 404 when no such schedule exists.
    """
    requested = crawlers.CrawlerSchedule(request.args.get('uuid'))
    if requested.exists():
        return render_template("crawler_schedule_uuid.html",
                               meta=requested.get_meta(ui=True))
    abort(404)
|
||||||
|
|
||||||
|
@crawler_splash.route("/crawlers/schedule/delete", methods=['GET'])
@login_required
@login_analyst
def schedule_delete():
    """
    Delete the crawler schedule named by the `uuid` query parameter.

    Aborts with a 404 when the schedule does not exist. On success the user
    is redirected to the scheduler dashboard; otherwise the API error payload
    is returned as JSON with the API's status code.
    """
    target_uuid = request.args.get('uuid')
    if not crawlers.CrawlerSchedule(target_uuid).exists():
        abort(404)

    # api_delete_schedule answers with a (payload, status_code) pair.
    outcome = crawlers.api_delete_schedule({'uuid': target_uuid})
    if outcome[1] == 200:
        return redirect(url_for('crawler_splash.scheduler_dashboard'))
    return create_json_response(outcome[0], outcome[1])
|
||||||
|
|
||||||
|
|
||||||
@crawler_splash.route("/crawlers/last/domains", methods=['GET'])
|
@crawler_splash.route("/crawlers/last/domains", methods=['GET'])
|
||||||
@login_required
|
@login_required
|
||||||
@login_read_only
|
@login_read_only
|
||||||
|
@ -228,11 +277,11 @@ def showDomain():
|
||||||
dict_domain['epoch'] = curr_epoch
|
dict_domain['epoch'] = curr_epoch
|
||||||
dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch))
|
dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch))
|
||||||
|
|
||||||
print(dict_domain['epoch'])
|
# print(dict_domain['epoch'])
|
||||||
|
|
||||||
dict_domain['crawler_history_items'] = []
|
dict_domain['crawler_history_items'] = []
|
||||||
for item_id in domain.get_crawled_items_by_epoch(epoch):
|
for item_id in domain.get_crawled_items_by_epoch(epoch):
|
||||||
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options=['crawler']))
|
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options={'crawler'}))
|
||||||
if dict_domain['crawler_history_items']:
|
if dict_domain['crawler_history_items']:
|
||||||
dict_domain['random_item'] = random.choice(dict_domain['crawler_history_items'])
|
dict_domain['random_item'] = random.choice(dict_domain['crawler_history_items'])
|
||||||
|
|
||||||
|
@ -521,7 +570,7 @@ def crawler_cookiejar_show():
|
||||||
|
|
||||||
@crawler_splash.route('/crawler/cookie/delete', methods=['GET'])
|
@crawler_splash.route('/crawler/cookie/delete', methods=['GET'])
|
||||||
@login_required
|
@login_required
|
||||||
@login_read_only
|
@login_analyst
|
||||||
def crawler_cookiejar_cookie_delete():
|
def crawler_cookiejar_cookie_delete():
|
||||||
user_id = current_user.get_id()
|
user_id = current_user.get_id()
|
||||||
cookie_uuid = request.args.get('uuid')
|
cookie_uuid = request.args.get('uuid')
|
||||||
|
@ -536,7 +585,7 @@ def crawler_cookiejar_cookie_delete():
|
||||||
|
|
||||||
@crawler_splash.route('/crawler/cookiejar/delete', methods=['GET'])
|
@crawler_splash.route('/crawler/cookiejar/delete', methods=['GET'])
|
||||||
@login_required
|
@login_required
|
||||||
@login_read_only
|
@login_analyst
|
||||||
def crawler_cookiejar_delete():
|
def crawler_cookiejar_delete():
|
||||||
user_id = current_user.get_id()
|
user_id = current_user.get_id()
|
||||||
cookiejar_uuid = request.args.get('uuid')
|
cookiejar_uuid = request.args.get('uuid')
|
||||||
|
@ -699,7 +748,7 @@ def crawler_lacus_settings_crawler_manager():
|
||||||
api_key = request.form.get('api_key')
|
api_key = request.form.get('api_key')
|
||||||
|
|
||||||
res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key})
|
res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key})
|
||||||
print(res)
|
# print(res)
|
||||||
if res[1] != 200:
|
if res[1] != 200:
|
||||||
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
|
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -66,21 +66,59 @@
|
||||||
<div class="d-flex mt-3">
|
<div class="d-flex mt-3">
|
||||||
<i class="fas fa-user-ninja mt-1"></i> Manual
|
<i class="fas fa-user-ninja mt-1"></i> Manual
|
||||||
<div class="custom-control custom-switch">
|
<div class="custom-control custom-switch">
|
||||||
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">
|
<input class="custom-control-input" type="checkbox" name="crawler_scheduler" value="True" id="crawler_scheduler">
|
||||||
<label class="custom-control-label" for="crawler_type">
|
<label class="custom-control-label" for="crawler_scheduler">
|
||||||
<i class="fas fa-clock"></i> Automatic
|
<i class="fas fa-clock"></i> Scheduler
|
||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="input-group mt-2 mb-2" id="crawler_epoch_input">
|
|
||||||
<div class="input-group-prepend">
|
<div id="frequency_inputs">
|
||||||
<span class="input-group-text bg-light"><i class="fas fa-clock"></i> </span>
|
<div class="mb-4">
|
||||||
</div>
|
<select class="custom-select" id="frequency" name="frequency" onchange="frequency_selector_update(this);">
|
||||||
<input class="form-control" type="number" id="crawler_epoch" value="3600" min="1" name="crawler_epoch" required>
|
<option value="hourly">Hourly</option>
|
||||||
<div class="input-group-append">
|
<option value="daily">Daily</option>
|
||||||
<span class="input-group-text">Time (seconds) between each crawling</span>
|
<option value="weekly">Weekly</option>
|
||||||
</div>
|
<option value="monthly">Monthly</option>
|
||||||
</div>
|
<option value="custom">Custom</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div id="custom_frequency">
|
||||||
|
<h5><i class="fas fa-clock"></i> Adjust crawling interval as needed</h5>
|
||||||
|
<div class="input-group">
|
||||||
|
<div class="input-group-prepend">
|
||||||
|
<span class="input-group-text bg-light" style="width: 90px"><b>Months</b></span>
|
||||||
|
</div>
|
||||||
|
<input class="form-control" type="number" id="frequency_months" value="0" min="0" name="frequency_months" required>
|
||||||
|
</div>
|
||||||
|
<div class="input-group">
|
||||||
|
<div class="input-group-prepend">
|
||||||
|
<span class="input-group-text bg-light" style="width: 90px"><b>Weeks</b></span>
|
||||||
|
</div>
|
||||||
|
<input class="form-control" type="number" id="frequency_weeks" value="0" min="0" name="frequency_weeks" required>
|
||||||
|
</div>
|
||||||
|
<div class="input-group">
|
||||||
|
<div class="input-group-prepend">
|
||||||
|
<span class="input-group-text bg-light" style="width: 90px"><b>Days</b></span>
|
||||||
|
</div>
|
||||||
|
<input class="form-control" type="number" id="frequency_days" value="0" min="0" name="frequency_days" required>
|
||||||
|
</div>
|
||||||
|
<div class="input-group">
|
||||||
|
<div class="input-group-prepend">
|
||||||
|
<span class="input-group-text bg-light" style="width: 90px"><b>Hours</b></span>
|
||||||
|
</div>
|
||||||
|
<input class="form-control" type="number" id="frequency_hours" value="0" min="0" name="frequency_hours" required>
|
||||||
|
</div>
|
||||||
|
<div class="input-group">
|
||||||
|
<div class="input-group-prepend">
|
||||||
|
<span class="input-group-text bg-light" style="width: 90px"><b>Minutes</b></span>
|
||||||
|
</div>
|
||||||
|
<input class="form-control" type="number" id="frequency_minutes" value="0" min="0" name="frequency_minutes" required>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
@ -165,8 +203,9 @@ $(document).ready(function(){
|
||||||
$("#nav_manual_crawler").addClass("active");
|
$("#nav_manual_crawler").addClass("active");
|
||||||
queue_type_selector_input_controler()
|
queue_type_selector_input_controler()
|
||||||
manual_crawler_input_controler();
|
manual_crawler_input_controler();
|
||||||
|
$("#custom_frequency").hide();
|
||||||
|
|
||||||
$('#crawler_type').on("change", function () {
|
$('#crawler_scheduler').on("change", function () {
|
||||||
manual_crawler_input_controler();
|
manual_crawler_input_controler();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -190,10 +229,18 @@ function toggle_sidebar(){
|
||||||
}
|
}
|
||||||
|
|
||||||
function manual_crawler_input_controler() {
|
function manual_crawler_input_controler() {
|
||||||
if($('#crawler_type').is(':checked')){
|
if($('#crawler_scheduler').is(':checked')){
|
||||||
$("#crawler_epoch_input").show();
|
$("#frequency_inputs").show();
|
||||||
}else{
|
}else{
|
||||||
$("#crawler_epoch_input").hide();
|
$("#frequency_inputs").hide();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function frequency_selector_update(obj) {
|
||||||
|
if(obj.value === "custom") {
|
||||||
|
$("#custom_frequency").show();
|
||||||
|
}else{
|
||||||
|
$("#custom_frequency").hide();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,199 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>AIL-Framework</title>
|
||||||
|
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
|
||||||
|
<!-- Core CSS -->
|
||||||
|
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
|
||||||
|
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
|
||||||
|
|
||||||
|
<!-- JS -->
|
||||||
|
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
{% include 'nav_bar.html' %}
|
||||||
|
|
||||||
|
<div class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
|
||||||
|
{% include 'crawler/menu_sidebar.html' %}
|
||||||
|
|
||||||
|
<div class="col-12 col-lg-10" id="core_content">
|
||||||
|
|
||||||
|
<div class="card my-1">
|
||||||
|
<div class="card-header bg-dark text-white">
|
||||||
|
<h4 class="card-title"><b>{{ meta['url'] }}</b></h4>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-lg-8">
|
||||||
|
|
||||||
|
<table class="table table-hover">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>UUID</th>
|
||||||
|
<td>{{ meta['uuid'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Url</th>
|
||||||
|
<td>{{ meta['url'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Frequency</th>
|
||||||
|
<td>{{ meta['frequency'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Creator</th>
|
||||||
|
<td>{{ meta['user'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Date</th>
|
||||||
|
<td>{{ meta['date'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Status</th>
|
||||||
|
<td>{{ meta['status'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Next Run</th>
|
||||||
|
<td>
|
||||||
|
{% if not meta['next_run'] %}
|
||||||
|
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
|
||||||
|
{% else %}
|
||||||
|
{{ meta['next_run'] }}
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<h4>Config:</h4>
|
||||||
|
|
||||||
|
<table class="table table-hover">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th><i class="fas fa-water"></i> Depth</th>
|
||||||
|
<td>{{ meta['depth'] }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th><i class="fas fa-image"></i> Screenshot</th>
|
||||||
|
<td>
|
||||||
|
<div class="custom-control custom-switch">
|
||||||
|
<input class="custom-control-input" type="checkbox" id="html_content_id" {% if meta['screenshot'] %}checked{% endif %} disabled>
|
||||||
|
<label class="custom-control-label" for="html_content_id">
|
||||||
|
<i class="fas fa-image"></i>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{# Read-only HAR switch. It needs its own id: the Screenshot switch in the previous row already uses "html_content_id", and element ids must be unique for the label's "for" to bind to the right input. #}
<tr>
    <th><i class="fas fa-file"></i> Har</th>
    <td>
        <div class="custom-control custom-switch">
            <input class="custom-control-input" type="checkbox" id="har_content_id" {% if meta['har'] %}checked{% endif %} disabled>
            <label class="custom-control-label" for="har_content_id">
                <i class="fas fa-file"></i>
            </label>
        </div>
    </td>
</tr>
|
||||||
|
<tr>
|
||||||
|
<th><i class="fas fa-cookie-bite"></i> Cookiejar</th>
|
||||||
|
<td>
|
||||||
|
{% if not meta['cookiejar'] %}
|
||||||
|
-
|
||||||
|
{% else %}
|
||||||
|
<a href="{{ url_for('crawler_splash.crawler_cookiejar_show') }}?uuid={{meta['cookiejar']}}">
|
||||||
|
{{ meta['cookiejar'] }}
|
||||||
|
</a>
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Proxy</th>
|
||||||
|
<td>
|
||||||
|
{% if not meta['proxy'] %}
|
||||||
|
-
|
||||||
|
{% else %}
|
||||||
|
{{ meta['proxy'] }}
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>User Agent</th>
|
||||||
|
<td>
|
||||||
|
{% if meta['user_agent'] %}
|
||||||
|
{{ meta['user_agent'] }}
|
||||||
|
{% else %}
|
||||||
|
Default
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% if meta['header'] %}
|
||||||
|
<tr>
|
||||||
|
<th>header</th>
|
||||||
|
<td>{{ meta['header'] }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endif %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<div class="col-lg-4">
|
||||||
|
<div>
|
||||||
|
{# Delete action for this schedule. The link itself carries the Bootstrap button classes, because nesting a <button> inside an <a> is invalid HTML (interactive content inside an anchor). Target URL and label are unchanged. #}
<a class="btn btn-danger" role="button" href="{{ url_for('crawler_splash.schedule_delete') }}?uuid={{meta['uuid']}}">
    <i class="fas fa-trash-alt"></i> <b>Delete</b>
</a>
|
||||||
|
{# <a href="{{ url_for('investigations_b.edit_investigation') }}?uuid={{meta['uuid']}}">#}
|
||||||
|
{# <button type="button" class="btn btn-info">#}
|
||||||
|
{# <i class="fas fa-pencil-alt"></i> <b>Edit</b>#}
|
||||||
|
{# </button>#}
|
||||||
|
{# </a>#}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Page-level global; not referenced by this page's own script.
// NOTE(review): kept in case an included template relies on it - confirm before removing.
var chart = {};

// Once the DOM is ready, mark the Crawler section and the sidebar entry as active.
$(document).ready(function(){
    $("#page-Crawler").addClass("active");
    // A schedule's detail page belongs to the Scheduler sidebar entry (the same id the
    // scheduler dashboard activates), not to the monitoring entry.
    $("#nav_scheduler_crawler").addClass("active");
});
|
||||||
|
|
||||||
|
// Collapse or expand the left sidebar.
// When the menu is currently visible it is hidden and the layout classes that
// reserve its column are removed; otherwise the menu is shown and they are restored.
function toggle_sidebar(){
    var showSidebar = !$('#nav_menu').is(':visible');

    $('#nav_menu').toggle(showSidebar);
    $('#side_menu').toggleClass('border-right col-lg-2', showSidebar);
    $('#core_content').toggleClass('col-lg-10', showSidebar);
}
|
||||||
|
|
||||||
|
</script>
|
|
@ -0,0 +1,94 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>AIL-Framework</title>
|
||||||
|
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
|
||||||
|
<!-- Core CSS -->
|
||||||
|
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
|
||||||
|
<link href="{{ url_for('static', filename='css/dataTables.bootstrap.min.css') }}" rel="stylesheet">
|
||||||
|
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
|
||||||
|
|
||||||
|
<!-- JS -->
|
||||||
|
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
|
||||||
|
<script src="{{ url_for('static', filename='js/dataTables.bootstrap.min.js')}}"></script>
|
||||||
|
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
|
||||||
|
{% include 'nav_bar.html' %}
|
||||||
|
|
||||||
|
<div class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
|
||||||
|
{% include 'crawler/menu_sidebar.html' %}
|
||||||
|
|
||||||
|
<div class="col-12 col-lg-10" id="core_content">
|
||||||
|
|
||||||
|
{% include 'crawler/crawler_disabled.html' %}
|
||||||
|
|
||||||
|
<h1>Schedulers:</h1>
|
||||||
|
|
||||||
|
<table class="table mt-1 table-hover table-borderless table-striped" id="table_scheduler">
|
||||||
|
<thead class="thead-dark">
|
||||||
|
<tr>
|
||||||
|
<th>Url</th>
|
||||||
|
<th>Status</th>
|
||||||
|
<th>Next Run</th>
|
||||||
|
<th>User</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="tbody_last_crawled">
|
||||||
|
{% for meta in schedulers %}
|
||||||
|
<tr>
|
||||||
|
<td><a href="{{ url_for('crawler_splash.schedule_show') }}?uuid={{ meta['uuid'] }}">{{ meta['url'] }}</a></td>
|
||||||
|
<td>{{ meta['status'] }}</td>
|
||||||
|
<td>
|
||||||
|
{% if not meta['next_run'] %}
|
||||||
|
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
|
||||||
|
{% else %}
|
||||||
|
{{ meta['next_run'] }}
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
<td>{{ meta['user'] }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
</body>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Page-level global; not referenced by this page's own script.
var chart = {};

// Once the DOM is ready: highlight the Crawler section and the Scheduler sidebar
// entry, then turn the schedulers table into a DataTable.
$(document).ready(function(){
    $("#page-Crawler, #nav_scheduler_crawler").addClass("active");
    $('#table_scheduler').DataTable();
});
|
||||||
|
|
||||||
|
// Collapse or expand the left sidebar.
// When the menu is currently visible it is hidden and the layout classes that
// reserve its column are removed; otherwise the menu is shown and they are restored.
function toggle_sidebar(){
    var showSidebar = !$('#nav_menu').is(':visible');

    $('#nav_menu').toggle(showSidebar);
    $('#side_menu').toggleClass('border-right col-lg-2', showSidebar);
    $('#core_content').toggleClass('col-lg-10', showSidebar);
}
|
||||||
|
|
||||||
|
</script>
|
|
@ -208,8 +208,9 @@ function refresh_crawler_status(){
|
||||||
$('#stat_web_total').text(data.stats['web']['crawled']);
|
$('#stat_web_total').text(data.stats['web']['crawled']);
|
||||||
$('#stat_web_queue').text(data.stats['web']['queue']);
|
$('#stat_web_queue').text(data.stats['web']['queue']);
|
||||||
|
|
||||||
|
$("#tbody_crawler_onion_info").empty();
|
||||||
|
|
||||||
if(data.crawlers_status.length!=0){
|
if(data.crawlers_status.length!=0){
|
||||||
$("#tbody_crawler_onion_info").empty();
|
|
||||||
var tableRef = document.getElementById('tbody_crawler_onion_info');
|
var tableRef = document.getElementById('tbody_crawler_onion_info');
|
||||||
for (var i = 0; i < data.crawlers_status.length; i++) {
|
for (var i = 0; i < data.crawlers_status.length; i++) {
|
||||||
var crawler = data.crawlers_status[i];
|
var crawler = data.crawlers_status[i];
|
||||||
|
|
|
@ -35,8 +35,8 @@
|
||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
<li class="nav-item">
|
<li class="nav-item">
|
||||||
<a class="nav-link" href="{{url_for('hiddenServices.auto_crawler')}}" id="nav_auto_crawler">
|
<a class="nav-link" href="{{url_for('crawler_splash.scheduler_dashboard')}}" id="nav_scheduler_crawler">
|
||||||
<i class="fas fa-sync"></i> Automatic Crawler
|
<i class="fas fa-sync"></i> Scheduler
|
||||||
</a>
|
</a>
|
||||||
</li>
|
</li>
|
||||||
<li class="nav-item">
|
<li class="nav-item">
|
||||||
|
|
Loading…
Reference in a new issue