mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-30 01:37:17 +00:00
chg: [crawler] add crawler scheduler
This commit is contained in:
parent
ae6f8af09f
commit
925d67a35e
8 changed files with 864 additions and 286 deletions
|
@ -5,6 +5,8 @@ import os
|
|||
import sys
|
||||
import time
|
||||
|
||||
from requests.exceptions import ConnectionError
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages
|
||||
|
@ -15,6 +17,7 @@ from lib.ConfigLoader import ConfigLoader
|
|||
from lib.objects.Domains import Domain
|
||||
from lib.objects import Screenshots
|
||||
|
||||
|
||||
class Crawler(AbstractModule):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -37,8 +40,11 @@ class Crawler(AbstractModule):
|
|||
# update captures cache
|
||||
crawlers.reload_crawler_captures()
|
||||
|
||||
self.crawler_scheduler = crawlers.CrawlerScheduler()
|
||||
|
||||
# LACUS
|
||||
self.lacus = crawlers.get_lacus()
|
||||
self.is_lacus_up = crawlers.get_lacus().is_up
|
||||
|
||||
# Capture
|
||||
self.har = None
|
||||
|
@ -51,44 +57,70 @@ class Crawler(AbstractModule):
|
|||
# Send module state to logs
|
||||
self.redis_logger.info('Crawler initialized')
|
||||
|
||||
def print_crawler_start_info(self, url, domain, domain_url):
|
||||
def refresh_lacus_status(self):
    """Refresh ``self.is_lacus_up`` from the Lacus client.

    Any error raised while querying the client is treated as "Lacus is down".
    When Lacus is down, a message with the current epoch is printed and the
    call sleeps 30 seconds before returning.
    """
    try:
        self.is_lacus_up = crawlers.get_lacus().is_up
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit can still stop the module.
        self.is_lacus_up = False
    if not self.is_lacus_up:
        print("Can't reach lacus server", int(time.time()))
        time.sleep(30)
|
||||
|
||||
def print_crawler_start_info(self, url, domain_url):
    """Print a start banner for the crawl of ``self.domain``.

    :param url: URL being crawled
    :param domain_url: URL of the domain being crawled

    The domain is read from ``self.domain``; it is no longer a parameter, so
    the prints that still referenced a local ``domain`` name were dropped.
    """
    print()
    print()
    print('\033[92m------------------START CRAWLER------------------\033[0m')
    print(f'crawler type: {self.domain}')
    print('\033[92m-------------------------------------------------\033[0m')
    print(f'url: {url}')
    print(f'domain: {self.domain}')
    print(f'domain_url: {domain_url}')
    print()
|
||||
|
||||
def get_message(self):
    """Run one scheduling/polling cycle and return a finished capture, if any.

    Steps:
      1. Let the crawler scheduler refresh and process its queue.
      2. Refresh the Lacus status; give up for this cycle if Lacus is down.
      3. If there is room for another capture, take the next task and
         enqueue it in Lacus. On a connection error the task is put back in
         the crawler queue with its priority.
      4. Poll the status of one tracked capture.

    :return: the capture object once Lacus reports it as DONE, otherwise None
    """
    # Crawler Scheduler
    self.crawler_scheduler.update_queue()
    self.crawler_scheduler.process_queue()

    self.refresh_lacus_status()  # TODO LOG ERROR
    if not self.is_lacus_up:
        return None

    # Check if a new Capture can be Launched
    if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
        task_row = crawlers.add_task_to_lacus_queue()
        if task_row:
            task_uuid, priority = task_row
            try:
                self.enqueue_capture(task_uuid, priority)
            except ConnectionError:
                print(task_row)
                # Lacus unreachable: re-queue the task so it is not lost
                task = crawlers.CrawlerTask(task_uuid)
                task.add_to_db_crawler_queue(priority)
                self.refresh_lacus_status()
                return None

    # Get CrawlerCapture Object
    capture = crawlers.get_crawler_capture()
    if capture:
        try:
            status = self.lacus.get_capture_status(capture.uuid)
            if status != crawlers.CaptureStatus.DONE:  # TODO ADD GLOBAL TIMEOUT-> Save start time
                capture.update(status)
                print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
            else:
                return capture

        except ConnectionError:
            print(capture.uuid)
            # update() takes the status only; passing ``self`` as well raised
            # a TypeError inside this handler.
            capture.update(-1)
            self.refresh_lacus_status()

    # NOTE(review): pending_seconds is not set in the visible code; presumably
    # it comes from the base module class - confirm.
    time.sleep(self.pending_seconds)
|
||||
|
||||
def enqueue_capture(self, task_uuid, priority):
|
||||
task = crawlers.CrawlerTask(task_uuid)
|
||||
print(task)
|
||||
# print(task)
|
||||
# task = {
|
||||
# 'uuid': task_uuid,
|
||||
# 'url': 'https://foo.be',
|
||||
|
@ -102,11 +134,15 @@ class Crawler(AbstractModule):
|
|||
# 'proxy': 'force_tor',
|
||||
# 'parent': 'manual',
|
||||
# }
|
||||
|
||||
url = task.get_url()
|
||||
force = priority != 0
|
||||
# TODO timeout
|
||||
|
||||
# TODO HEADER
|
||||
# capture_uuid = self.lacus.enqueue(url='https://cpg.circl.lu:7000',
|
||||
# force=force,
|
||||
# general_timeout_in_sec=120)
|
||||
|
||||
capture_uuid = self.lacus.enqueue(url=url,
|
||||
depth=task.get_depth(),
|
||||
|
@ -114,14 +150,13 @@ class Crawler(AbstractModule):
|
|||
proxy=task.get_proxy(),
|
||||
cookies=task.get_cookies(),
|
||||
force=force,
|
||||
general_timeout_in_sec=90)
|
||||
general_timeout_in_sec=90) # TODO increase timeout if onion ????
|
||||
|
||||
crawlers.create_capture(capture_uuid, task_uuid)
|
||||
print(task.uuid, capture_uuid, 'launched')
|
||||
return capture_uuid
|
||||
|
||||
# CRAWL DOMAIN
|
||||
# TODO: CATCH ERRORS
|
||||
def compute(self, capture):
|
||||
print('saving capture', capture.uuid)
|
||||
|
||||
|
@ -131,7 +166,6 @@ class Crawler(AbstractModule):
|
|||
|
||||
self.domain = Domain(domain)
|
||||
|
||||
# TODO CHANGE EPOCH
|
||||
epoch = int(time.time())
|
||||
parent_id = task.get_parent()
|
||||
|
||||
|
@ -139,6 +173,9 @@ class Crawler(AbstractModule):
|
|||
print(entries['status'])
|
||||
self.har = task.get_har()
|
||||
self.screenshot = task.get_screenshot()
|
||||
# DEBUG
|
||||
# self.har = True
|
||||
# self.screenshot = True
|
||||
str_date = crawlers.get_current_date(separator=True)
|
||||
self.har_dir = crawlers.get_date_har_dir(str_date)
|
||||
self.items_dir = crawlers.get_date_crawled_items_source(str_date)
|
||||
|
@ -156,7 +193,10 @@ class Crawler(AbstractModule):
|
|||
self.domain.add_history(epoch, root_item=epoch)
|
||||
|
||||
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
|
||||
task.clear()
|
||||
print('capture:', capture.uuid, 'completed')
|
||||
print('task: ', task.uuid, 'completed')
|
||||
print()
|
||||
task.remove()
|
||||
|
||||
def save_capture_response(self, parent_id, entries):
|
||||
print(entries.keys())
|
||||
|
@ -168,12 +208,11 @@ class Crawler(AbstractModule):
|
|||
print('retrieved content')
|
||||
# print(entries.get('html'))
|
||||
|
||||
# TODO LOGS IF != domain
|
||||
if 'last_redirected_url' in entries and entries['last_redirected_url']:
|
||||
last_url = entries['last_redirected_url']
|
||||
unpacked_last_url = crawlers.unpack_url(last_url)
|
||||
current_domain = unpacked_last_url['domain']
|
||||
# REDIRECTION TODO CHECK IF WEB
|
||||
# REDIRECTION TODO CHECK IF TYPE CHANGE
|
||||
if current_domain != self.domain.id and not self.root_item:
|
||||
self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
|
||||
print(f'External redirection {self.domain.id} -> {current_domain}')
|
||||
|
@ -225,92 +264,4 @@ class Crawler(AbstractModule):
|
|||
if __name__ == '__main__':
|
||||
module = Crawler()
|
||||
module.debug = True
|
||||
# module.compute(('ooooo', 0))
|
||||
module.run()
|
||||
|
||||
|
||||
##################################
|
||||
##################################
|
||||
##################################
|
||||
##################################
|
||||
##################################
|
||||
|
||||
# def update_auto_crawler():
|
||||
# current_epoch = int(time.time())
|
||||
# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
|
||||
# for elem_to_crawl in list_to_crawl:
|
||||
# mess, type = elem_to_crawl.rsplit(';', 1)
|
||||
# redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)
|
||||
# redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)
|
||||
|
||||
# Extract info from url (url, domain, domain url, ...)
|
||||
# def unpack_url(url):
|
||||
# to_crawl = {}
|
||||
# faup.decode(url)
|
||||
# url_unpack = faup.get()
|
||||
# to_crawl['domain'] = to_crawl['domain'].lower()
|
||||
# new_url_host = url_host.lower()
|
||||
# url_lower_case = url.replace(url_host, new_url_host, 1)
|
||||
#
|
||||
# if url_unpack['scheme'] is None:
|
||||
# to_crawl['scheme'] = 'http'
|
||||
# url= 'http://{}'.format(url_lower_case)
|
||||
# else:
|
||||
# try:
|
||||
# scheme = url_unpack['scheme'].decode()
|
||||
# except Exception as e:
|
||||
# scheme = url_unpack['scheme']
|
||||
# if scheme in default_proto_map:
|
||||
# to_crawl['scheme'] = scheme
|
||||
# url = url_lower_case
|
||||
# else:
|
||||
# redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
|
||||
# to_crawl['scheme'] = 'http'
|
||||
# url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
|
||||
#
|
||||
# if url_unpack['port'] is None:
|
||||
# to_crawl['port'] = default_proto_map[to_crawl['scheme']]
|
||||
# else:
|
||||
# try:
|
||||
# port = url_unpack['port'].decode()
|
||||
# except:
|
||||
# port = url_unpack['port']
|
||||
# # Verify port number #################### make function to verify/correct port number
|
||||
# try:
|
||||
# int(port)
|
||||
# # Invalid port Number
|
||||
# except Exception as e:
|
||||
# port = default_proto_map[to_crawl['scheme']]
|
||||
# to_crawl['port'] = port
|
||||
#
|
||||
# #if url_unpack['query_string'] is None:
|
||||
# # if to_crawl['port'] == 80:
|
||||
# # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
|
||||
# # else:
|
||||
# # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
|
||||
# #else:
|
||||
# # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())
|
||||
#
|
||||
# to_crawl['url'] = url
|
||||
# if to_crawl['port'] == 80:
|
||||
# to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
|
||||
# else:
|
||||
# to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
|
||||
#
|
||||
# try:
|
||||
# to_crawl['tld'] = url_unpack['tld'].decode()
|
||||
# except:
|
||||
# to_crawl['tld'] = url_unpack['tld']
|
||||
#
|
||||
# return to_crawl
|
||||
|
||||
# ##################################################### add ftp ???
|
||||
# update_auto_crawler()
|
||||
|
||||
# # add next auto Crawling in queue:
|
||||
# if to_crawl['paste'] == 'auto':
|
||||
# redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
|
||||
# # update list, last auto crawled domains
|
||||
# redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
|
||||
# redis_crawler.ltrim('last_auto_crawled', 0, 9)
|
||||
#
|
||||
|
|
|
@ -19,8 +19,9 @@ import uuid
|
|||
|
||||
from enum import IntEnum, unique
|
||||
from datetime import datetime, timedelta
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from urllib.parse import urlparse, urljoin
|
||||
#from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from pylacus import PyLacus
|
||||
|
||||
|
@ -44,8 +45,6 @@ r_db = config_loader.get_db_conn("Kvrocks_DB")
|
|||
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")
|
||||
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||
|
||||
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
|
||||
|
||||
ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes")
|
||||
HAR_DIR = config_loader.get_files_directory('har')
|
||||
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
|
||||
|
@ -181,8 +180,8 @@ def extract_favicon_from_html(html, url):
|
|||
|
||||
################################################################################
|
||||
|
||||
# # TODO: handle prefix cookies
|
||||
# # TODO: fill empty fields
|
||||
# # TODO:
|
||||
# # TODO: REVIEW ME THEN REMOVE ME
|
||||
def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
|
||||
# check cookie domain filed
|
||||
if not 'domain' in cookie_dict:
|
||||
|
@ -201,13 +200,6 @@ def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
|
|||
cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
|
||||
return cookie_dict
|
||||
|
||||
def load_crawler_cookies(cookiejar_uuid, domain, crawler_type='web'):
|
||||
cookies = get_cookiejar_cookies_list(cookiejar_uuid)
|
||||
all_cookies = []
|
||||
for cookie_dict in cookies:
|
||||
all_cookies.append(create_cookie_crawler(cookie_dict, domain, crawler_type=crawler_type))
|
||||
return all_cookies
|
||||
|
||||
################################################################################
|
||||
################################################################################
|
||||
################################################################################
|
||||
|
@ -695,7 +687,285 @@ def load_blacklist():
|
|||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
#### CRAWLER STATE ####
|
||||
#### CRAWLER Scheduler ####
|
||||
|
||||
@unique
class ScheduleStatus(IntEnum):
    """The status of a crawler schedule."""
    # Neither waiting in the scheduler queue nor attached to an existing task
    UNKNOWN = -1
    # A task exists for the schedule but its capture is not ongoing
    QUEUED = 0
    # Waiting in the scheduler queue for its next run
    SCHEDULED = 1
    # The task's capture is ongoing
    ONGOING = 2
|
||||
|
||||
def get_schedulers_uuid():
    """Return the members of the 'scheduler:schedules' set (schedule UUIDs)."""
    schedules_uuid = r_crawler.smembers('scheduler:schedules')
    return schedules_uuid
|
||||
|
||||
def get_schedulers_metas():
    """Return the status metadata dict of every registered schedule, as a list."""
    return [CrawlerSchedule(schedule_uuid).get_meta_status()
            for schedule_uuid in get_schedulers_uuid()]
|
||||
|
||||
class CrawlerScheduler:
    """Turn recurring crawler schedules into crawler tasks.

    ``update_queue`` gives every idle schedule a next-run timestamp in the
    'scheduler:queue' sorted set; ``process_queue`` creates a crawler task
    for every schedule whose timestamp has passed.
    """

    def __init__(self):
        # Minimal delay (seconds) enforced for custom-frequency schedules
        self.min_frequency = 60  # TODO ADD IN CONFIG

    def _get_time_next_run(self, frequency, now):
        """Compute the next-run timestamp for ``frequency`` counted from ``now``.

        :param frequency: 'hourly', 'daily', 'weekly', 'monthly' or a custom
            'months:weeks:days:hours:minutes' string (empty fields count as 0)
        :param now: ``datetime`` used as the reference point
        :return: POSIX timestamp (float), or None when the frequency is
            missing or malformed
        """
        if not frequency:
            return None
        if frequency == 'hourly':
            return (now + timedelta(hours=1)).timestamp()
        if frequency == 'daily':
            return (now + timedelta(days=1)).timestamp()
        if frequency == 'weekly':
            return (now + timedelta(weeks=1)).timestamp()
        if frequency == 'monthly':
            return (now + relativedelta(months=1)).timestamp()

        # Custom frequency: 'months:weeks:days:hours:minutes'
        try:
            months, weeks, days, hours, minutes = (int(field) if field else 0
                                                   for field in frequency.split(':'))
        except ValueError:
            # wrong number of fields or a non-integer field
            return None
        time_next_run = (now + relativedelta(months=months, weeks=weeks, days=days,
                                             hours=hours, minutes=minutes)).timestamp()
        # Make sure the next capture is not scheduled for in a too short interval
        interval_next_capture = time_next_run - now.timestamp()
        if interval_next_capture < self.min_frequency:
            # self.logger.warning(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
            print(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
            time_next_run = (now + timedelta(seconds=self.min_frequency)).timestamp()
        return time_next_run

    def update_queue(self):
        """Give a next-run time to every schedule that is neither queued nor tasked.

        Schedules with a missing or malformed frequency are reported and
        skipped instead of raising and aborting the whole update.
        """
        for schedule_uuid in get_schedulers_uuid():
            schedule = CrawlerSchedule(schedule_uuid)
            # check if already in scheduler queue
            if schedule.is_scheduled():
                continue
            if schedule.is_tasked():
                continue

            # EXPIRE ????

            frequency = schedule.get_frequency()  # optional or later -> cron
            time_next_run = self._get_time_next_run(frequency, datetime.now())
            if time_next_run is None:
                print(f'Invalid frequency for schedule {schedule_uuid}: {frequency}')
                continue

            schedule.set_next_run(time_next_run)
            print('scheduled:', schedule_uuid)

    def process_queue(self):
        """Create a crawler task for every schedule whose next-run time has passed.

        Queue entries whose schedule no longer exists are dropped, and the
        remaining due entries are still processed.
        """
        now = datetime.now().timestamp()
        for raw_schedule in r_crawler.zrangebyscore('scheduler:queue', '-inf', int(now), withscores=True):
            schedule_uuid, _next_run = raw_schedule
            schedule = CrawlerSchedule(schedule_uuid)
            if not schedule.exists():
                # Stale entry: returning here would block every later due
                # schedule and leave the entry in the queue forever.
                r_crawler.zrem('scheduler:queue', schedule_uuid)
                continue
            meta = schedule.get_meta()
            task_uuid = create_task(meta['url'], depth=meta['depth'], har=meta['har'], screenshot=meta['screenshot'],
                                    header=meta['header'],
                                    cookiejar=meta['cookiejar'], proxy=meta['proxy'],
                                    user_agent=meta['user_agent'], parent='scheduler', priority=40)
            if task_uuid:
                schedule.set_task(task_uuid)
                # NOTE(review): original nesting was lost in this view; the
                # schedule is dequeued only once a task was created - confirm.
                r_crawler.zrem('scheduler:queue', schedule_uuid)
|
||||
|
||||
|
||||
# TODO Expire -> stuck in crawler queue or reached delta
|
||||
class CrawlerSchedule:
    """A recurring crawl definition stored in the 'schedule:<uuid>' hash.

    The schedule's next run lives in the 'scheduler:queue' sorted set and
    its uuid is registered in the 'scheduler:schedules' set.
    """

    def __init__(self, schedule_uuid):
        self.uuid = schedule_uuid

    def _get_field(self, field):
        """Return the raw value of ``field`` in the schedule hash (None if unset)."""
        return r_crawler.hget(f'schedule:{self.uuid}', field)

    def exists(self):
        return r_crawler.exists(f'schedule:{self.uuid}')

    def get_frequency(self):
        return self._get_field('frequency')

    def get_user(self):
        return self._get_field('user')

    def get_date(self):
        return self._get_field('date')

    def get_captures(self):  # only scheduled capture ????? exclude manual/discovery
        # Not implemented yet: always returns None.
        pass

    def get_status(self):
        """Return the ``ScheduleStatus`` derived from the queue and the attached task."""
        if self.is_scheduled():
            return ScheduleStatus.SCHEDULED
        if self.is_tasked():
            if self.is_ongoing():
                return ScheduleStatus.ONGOING
            return ScheduleStatus.QUEUED
        return ScheduleStatus.UNKNOWN

    def get_task_uuid(self):
        return self._get_field('task')

    def is_tasked(self):
        """Return whether the schedule's task still exists.

        A reference to a task that no longer exists is removed from the
        schedule hash as a side effect.
        """
        task_uuid = self.get_task_uuid()
        if task_uuid:
            task = CrawlerTask(task_uuid)
            tasked = task.exists()
            if not tasked:
                r_crawler.hdel(f'schedule:{self.uuid}', 'task')
            return tasked
        return False

    def get_task(self):
        """Return the attached ``CrawlerTask``, or None when no task uuid is stored."""
        task_uuid = self.get_task_uuid()
        if task_uuid:
            return CrawlerTask(task_uuid)
        return None

    def set_task(self, task_uuid):
        return r_crawler.hset(f'schedule:{self.uuid}', 'task', task_uuid)

    def is_ongoing(self):
        task = self.get_task()
        if task:
            return task.is_ongoing()
        return False

    def get_next_run(self, r_str=False):
        """Return the next-run score from the scheduler queue.

        :param r_str: when True, format the score as a local
            '%Y-%m-%d - %H:%M:%S' string
        :return: the score, its formatted form, or None when not queued
        """
        next_run = r_crawler.zscore('scheduler:queue', self.uuid)
        if next_run is not None and r_str:
            next_run = time.strftime('%Y-%m-%d - %H:%M:%S', time.localtime(int(next_run)))
        return next_run

    def set_next_run(self, time_next_run):
        r_crawler.zadd('scheduler:queue', mapping={self.uuid: time_next_run})

    def is_scheduled(self):
        # Test presence, not truthiness: a queued member whose score is 0 is
        # still scheduled.
        return r_crawler.zscore('scheduler:queue', self.uuid) is not None

    def get_url(self):
        return self._get_field('url')

    def get_depth(self):
        return self._get_field('depth')

    def get_har(self):
        # stored as str(bool) by create()
        return self._get_field('har') == 'True'

    def get_screenshot(self):
        # stored as str(bool) by create()
        return self._get_field('screenshot') == 'True'

    def get_header(self):
        # The value was previously fetched but never returned.
        return self._get_field('header')

    def get_cookiejar(self):
        return self._get_field('cookiejar')

    def get_proxy(self):
        return self._get_field('proxy')

    def get_user_agent(self):
        return self._get_field('user_agent')

    def _set_field(self, field, value):
        return r_crawler.hset(f'schedule:{self.uuid}', field, value)

    def get_meta(self, ui=False):
        """Return the full metadata dict of the schedule.

        :param ui: when True, 'status' is the status name and 'next_run' is
            a formatted string; otherwise the enum member and the raw score
        """
        meta = {
            'uuid': self.uuid,
            'date': self.get_date(),
            'frequency': self.get_frequency(),
            'user': self.get_user(),
            'url': self.get_url(),
            'depth': self.get_depth(),
            'har': self.get_har(),
            'screenshot': self.get_screenshot(),
            'user_agent': self.get_user_agent(),
            'cookiejar': self.get_cookiejar(),
            'header': self.get_header(),
            'proxy': self.get_proxy(),
        }
        status = self.get_status()
        if ui:
            status = status.name
            r_str = True
        else:
            r_str = False
        meta['status'] = status
        meta['next_run'] = self.get_next_run(r_str=r_str)
        return meta

    def get_meta_status(self):  # TODO: Description ? Frequency ???
        """Return a short metadata dict: uuid, url, user, next run and status name."""
        meta = {'uuid': self.uuid,
                'url': self.get_url(),
                'user': self.get_user(),
                'next_run': self.get_next_run(r_str=True)}
        status = self.get_status()
        if isinstance(status, ScheduleStatus):
            status = status.name
        meta['status'] = status
        return meta

    def create(self, frequency, user, url,
               depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
        """Store the schedule and register it in 'scheduler:schedules'.

        :raises Exception: if a schedule with this uuid already exists
        """
        if self.exists():
            raise Exception('Error: Monitor already exists')

        url_decoded = unpack_url(url)
        url = url_decoded['url']

        self._set_field('date', datetime.now().strftime("%Y-%m-%d"))
        self._set_field('frequency', frequency)
        self._set_field('user', user)
        self._set_field('url', url)
        self._set_field('depth', int(depth))
        self._set_field('har', str(har))
        self._set_field('screenshot', str(screenshot))

        if cookiejar:
            self._set_field('cookiejar', cookiejar)
        if header:
            self._set_field('header', header)
        # Normalise the proxy first, then store it only when one remains:
        # 'web' means "no proxy" and must not be written as None.
        if proxy == 'web':
            proxy = None
        elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
            proxy = 'force_tor'
        if proxy:
            self._set_field('proxy', proxy)
        if user_agent:
            self._set_field('user_agent', user_agent)

        r_crawler.sadd('scheduler:schedules', self.uuid)

    def delete(self):
        """Remove the schedule from the queue, delete its task and its metadata."""
        # remove from schedule queue
        r_crawler.zrem('scheduler:queue', self.uuid)

        # delete task
        task = self.get_task()
        if task:
            task.delete()

        # delete meta
        r_crawler.delete(f'schedule:{self.uuid}')
        r_crawler.srem('scheduler:schedules', self.uuid)
|
||||
|
||||
def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
    """Create a schedule under a newly generated uuid and return that uuid."""
    schedule_uuid = gen_uuid()
    options = {
        'depth': depth,
        'har': har,
        'screenshot': screenshot,
        'header': header,
        'cookiejar': cookiejar,
        'proxy': proxy,
        'user_agent': user_agent,
    }
    CrawlerSchedule(schedule_uuid).create(frequency, user, url, **options)
    return schedule_uuid
|
||||
|
||||
# TODO sanitize UUID
|
||||
def api_delete_schedule(data):
    """Delete the schedule whose uuid is given in ``data['uuid']``.

    :param data: mapping with a 'uuid' key
    :return: ``(result, status_code)``: an error dict with 404 when the
        schedule does not exist, otherwise the result of ``delete()`` with 200
    """
    schedule_uuid = data.get('uuid')
    schedule = CrawlerSchedule(schedule_uuid)
    if not schedule.exists():
        # Report the uuid that was asked for, not the CrawlerSchedule object.
        return {'error': 'unknown schedule uuid', 'uuid': schedule_uuid}, 404
    return schedule.delete(), 200
|
||||
|
||||
#### CRAWLER CAPTURE ####
|
||||
|
||||
|
@ -709,7 +979,15 @@ def reload_crawler_captures():
|
|||
r_cache.delete('crawler:captures')
|
||||
for capture_uuid in get_crawler_captures():
|
||||
capture = CrawlerCapture(capture_uuid)
|
||||
r_cache.zadd('crawler:captures', {capture.uuid: 0})
|
||||
capture.update(None)
|
||||
|
||||
def _clear_captures():
    """Delete every known crawler capture together with its task."""
    for capture_uuid in get_crawler_captures():
        capture = CrawlerCapture(capture_uuid)
        # the task goes first, then the capture itself
        capture.get_task().delete()
        capture.delete()
        print(capture_uuid, 'deleted')
|
||||
|
||||
@unique
|
||||
class CaptureStatus(IntEnum):
|
||||
|
@ -741,6 +1019,9 @@ class CrawlerCapture:
|
|||
def get_status(self):
|
||||
return r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
|
||||
|
||||
def is_ongoing(self):
    """Return True when the stored capture status is ``CaptureStatus.ONGOING``.

    ``get_status()`` returns the raw value read from the cache, which other
    code in this module converts with ``int()`` before use. Comparing that
    raw value directly with the enum never matches, so it is normalised to
    an int first. A missing or non-numeric status counts as not ongoing.
    """
    status = self.get_status()
    if status is None:
        return False
    try:
        return int(status) == CaptureStatus.ONGOING
    except (TypeError, ValueError):
        return False
|
||||
|
||||
def create(self, task_uuid):
|
||||
if self.exists():
|
||||
raise Exception(f'Error: Capture {self.uuid} already exists')
|
||||
|
@ -752,20 +1033,26 @@ class CrawlerCapture:
|
|||
r_cache.zadd('crawler:captures', {self.uuid: launch_time})
|
||||
|
||||
def update(self, status):
|
||||
last_check = int(time.time())
|
||||
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
|
||||
r_cache.zadd('crawler:captures', {self.uuid: last_check})
|
||||
# Error or Reload
|
||||
if not status:
|
||||
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', CaptureStatus.UNKNOWN)
|
||||
r_cache.zadd('crawler:captures', {self.uuid: 0})
|
||||
else:
|
||||
last_check = int(time.time())
|
||||
r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
|
||||
r_cache.zadd('crawler:captures', {self.uuid: last_check})
|
||||
|
||||
def remove(self): # TODO INCOMPLETE
|
||||
# Crawler
|
||||
def remove(self):
|
||||
r_crawler.zrem('crawler:captures', self.uuid)
|
||||
r_cache.delete(f'crawler:capture:{self.uuid}')
|
||||
r_crawler.hdel('crawler:captures:tasks', self.uuid)
|
||||
|
||||
# TODO
|
||||
# TODO DELETE TASK ???
|
||||
# Manual
|
||||
def delete(self):
|
||||
# task = self.get_task()
|
||||
# task.delete()
|
||||
r_cache.delete(f'crawler:capture:{self.uuid}')
|
||||
# remove Capture from crawler queue
|
||||
r_cache.zrem('crawler:captures', self.uuid)
|
||||
self.remove()
|
||||
|
||||
|
||||
def create_capture(capture_uuid, task_uuid):
|
||||
|
@ -792,9 +1079,13 @@ def get_captures_status():
|
|||
'uuid': task.uuid,
|
||||
'domain': dom.get_id(),
|
||||
'type': dom.get_domain_type(),
|
||||
'start_time': capture.get_start_time(), ############### TODO
|
||||
'start_time': capture.get_start_time(),
|
||||
'status': capture.get_status(),
|
||||
}
|
||||
capture_status = capture.get_status()
|
||||
if capture_status:
|
||||
capture_status = CaptureStatus(int(capture_status)).name
|
||||
meta['status'] = capture_status
|
||||
status.append(meta)
|
||||
return status
|
||||
|
||||
|
@ -872,6 +1163,12 @@ class CrawlerTask:
|
|||
def get_capture(self):
|
||||
return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture')
|
||||
|
||||
def is_ongoing(self):
    """Return whether the capture attached to this task is ongoing.

    A task without a capture uuid is reported as not ongoing.
    """
    capture_uuid = self.get_capture()
    if not capture_uuid:
        return False
    return CrawlerCapture(capture_uuid).is_ongoing()
|
||||
|
||||
def _set_field(self, field, value):
|
||||
return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)
|
||||
|
||||
|
@ -923,8 +1220,6 @@ class CrawlerTask:
|
|||
proxy = None
|
||||
elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
|
||||
proxy = 'force_tor'
|
||||
if not user_agent:
|
||||
user_agent = get_default_user_agent()
|
||||
|
||||
# TODO SANITIZE COOKIEJAR -> UUID
|
||||
|
||||
|
@ -934,13 +1229,11 @@ class CrawlerTask:
|
|||
self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query)
|
||||
return self.uuid
|
||||
|
||||
# TODO ADD TASK STATUS -----
|
||||
self._set_field('domain', domain)
|
||||
self._set_field('url', url)
|
||||
self._set_field('depth', int(depth))
|
||||
self._set_field('har', har)
|
||||
self._set_field('screenshot', screenshot)
|
||||
self._set_field('user_agent', user_agent)
|
||||
self._set_field('parent', parent)
|
||||
|
||||
if cookiejar:
|
||||
|
@ -949,30 +1242,45 @@ class CrawlerTask:
|
|||
self._set_field('header', header)
|
||||
if proxy:
|
||||
self._set_field('proxy', proxy)
|
||||
if user_agent:
|
||||
self._set_field('user_agent', user_agent)
|
||||
|
||||
r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
|
||||
self._set_field('hash', hash_query)
|
||||
r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
||||
self.add_to_db_crawler_queue(priority)
|
||||
# UI
|
||||
domain_type = dom.get_domain_type()
|
||||
r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
|
||||
self._set_field('queue', domain_type)
|
||||
return self.uuid
|
||||
|
||||
def lacus_queue(self):
|
||||
r_crawler.sadd('crawler:queue:queued', self.uuid)
|
||||
def add_to_db_crawler_queue(self, priority):
|
||||
r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
||||
|
||||
def start(self):
|
||||
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
def clear(self):
|
||||
r_crawler.hdel('crawler:queue:hash', self.get_hash())
|
||||
r_crawler.srem(f'crawler:queue:type:{self.get_queue()}', self.uuid)
|
||||
r_crawler.srem('crawler:queue:queued', self.uuid)
|
||||
|
||||
def delete(self):
|
||||
self.clear()
|
||||
# Crawler
|
||||
def remove(self): # zrem cache + DB
|
||||
capture_uuid = self.get_capture()
|
||||
if capture_uuid:
|
||||
capture = CrawlerCapture(capture_uuid)
|
||||
capture.remove()
|
||||
queue_type = self.get_queue()
|
||||
if queue_type:
|
||||
r_crawler.srem(f'crawler:queue:type:{queue_type}', self.uuid)
|
||||
task_hash = self.get_hash()
|
||||
if task_hash:
|
||||
r_crawler.hdel('crawler:queue:hash', task_hash)
|
||||
# meta
|
||||
r_crawler.delete(f'crawler:task:{self.uuid}')
|
||||
# r_crawler.zadd('crawler:queue', {self.uuid: priority})
|
||||
|
||||
# Manual
|
||||
def delete(self):
|
||||
# queue
|
||||
r_crawler.zrem('crawler:queue', self.uuid)
|
||||
self.remove()
|
||||
|
||||
|
||||
# TODO move to class ???
|
||||
|
@ -990,7 +1298,7 @@ def add_task_to_lacus_queue():
|
|||
return None
|
||||
task_uuid, priority = task_uuid[0]
|
||||
task = CrawlerTask(task_uuid)
|
||||
task.lacus_queue()
|
||||
task.start()
|
||||
return task.uuid, priority
|
||||
|
||||
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
|
||||
|
@ -1006,29 +1314,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
|
|||
proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
|
||||
return task_uuid
|
||||
|
||||
######################################################################
|
||||
######################################################################
|
||||
|
||||
# def get_task_status(task_uuid):
|
||||
# domain = r_crawler.hget(f'crawler:task:{task_uuid}', 'domain')
|
||||
# dom = Domain(domain)
|
||||
# meta = {
|
||||
# 'uuid': task_uuid,
|
||||
# 'domain': dom.get_id(),
|
||||
# 'domain_type': dom.get_domain_type(),
|
||||
# 'start_time': r_crawler.hget(f'crawler:task:{task_uuid}', 'start_time'),
|
||||
# 'status': 'test',
|
||||
# }
|
||||
# return meta
|
||||
|
||||
# def get_crawlers_tasks_status():
|
||||
# tasks_status = []
|
||||
# tasks = r_crawler.smembers('crawler:queue:queued')
|
||||
# for task_uuid in tasks:
|
||||
# tasks_status.append(get_task_status(task_uuid))
|
||||
# return tasks_status
|
||||
|
||||
##-- CRAWLER TASK --##
|
||||
## -- CRAWLER TASK -- ##
|
||||
|
||||
#### CRAWLER TASK API ####
|
||||
|
||||
|
@ -1071,13 +1358,25 @@ def api_add_crawler_task(data, user_id=None):
|
|||
return {'error': 'The access to this cookiejar is restricted'}, 403
|
||||
cookiejar_uuid = cookiejar.uuid
|
||||
|
||||
# if auto_crawler:
|
||||
# try:
|
||||
# crawler_delta = int(crawler_delta)
|
||||
# if crawler_delta < 0:
|
||||
# return ({'error':'invalid delta between two pass of the crawler'}, 400)
|
||||
# except ValueError:
|
||||
# return ({'error':'invalid delta between two pass of the crawler'}, 400)
|
||||
frequency = data.get('frequency', None)
|
||||
if frequency:
|
||||
if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
|
||||
if not isinstance(frequency, dict):
|
||||
return {'error': 'Invalid frequency'}, 400
|
||||
else:
|
||||
try:
|
||||
months = int(frequency.get('months', 0))
|
||||
weeks = int(frequency.get('weeks', 0))
|
||||
days = int(frequency.get('days', 0))
|
||||
hours = int(frequency.get('hours', 0))
|
||||
minutes = int(frequency.get('minutes', 0))
|
||||
except (TypeError, ValueError):
|
||||
return {'error': 'Invalid frequency'}, 400
|
||||
if min(months, weeks, days, hours, minutes) < 0:
|
||||
return {'error': 'Invalid frequency'}, 400
|
||||
if max(months, weeks, days, hours, minutes) <= 0:
|
||||
return {'error': 'Invalid frequency'}, 400
|
||||
frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
|
||||
|
||||
# PROXY
|
||||
proxy = data.get('proxy', None)
|
||||
|
@ -1088,15 +1387,16 @@ def api_add_crawler_task(data, user_id=None):
|
|||
if verify[1] != 200:
|
||||
return verify
|
||||
|
||||
# TODO #############################################################################################################
|
||||
# auto_crawler = auto_crawler
|
||||
# crawler_delta = crawler_delta
|
||||
parent = 'manual'
|
||||
|
||||
# TODO HEADERS
|
||||
# TODO USER AGENT
|
||||
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, cookiejar=cookiejar_uuid,
|
||||
proxy=proxy, user_agent=None, parent='manual', priority=90), 200
|
||||
if frequency:
|
||||
# TODO verify user
|
||||
return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None), 200
|
||||
else:
|
||||
# TODO HEADERS
|
||||
# TODO USER AGENT
|
||||
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None,
|
||||
parent='manual', priority=90), 200
|
||||
|
||||
|
||||
#### ####
|
||||
|
@ -1108,13 +1408,6 @@ def api_add_crawler_task(data, user_id=None):
|
|||
###################################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#### CRAWLER GLOBAL ####
|
||||
|
||||
# TODO: # FIXME: config db, dynamic load
|
||||
|
@ -1124,55 +1417,8 @@ def is_crawler_activated():
|
|||
def get_crawler_all_types():
    """Return the list of crawler types handled by the crawler: onion and web."""
    # Hard-coded list of supported types.
    crawler_types = ['onion', 'web']
    return crawler_types
|
||||
|
||||
def sanitize_crawler_types(l_crawler_types):
    """Validate a selection of crawler types.

    Returns ``l_crawler_types`` unchanged when it is non-empty and every entry
    is one of the types returned by ``get_crawler_all_types()``; otherwise
    returns the full list of known types.
    """
    known_types = get_crawler_all_types()
    # Empty selection, or at least one unknown entry -> fall back to every type.
    if not l_crawler_types or any(c_type not in known_types for c_type in l_crawler_types):
        return known_types
    return l_crawler_types
|
||||
|
||||
##-- CRAWLER GLOBAL --##
|
||||
|
||||
#### AUTOMATIC CRAWLER ####
|
||||
|
||||
def get_auto_crawler_all_domain(l_crawler_types=None):
    """Return the auto-crawler entries stored for the requested crawler types.

    :param l_crawler_types: crawler types to query. The value is passed through
        ``sanitize_crawler_types``, so an empty/``None`` selection (or one
        containing an unknown type) is replaced by every known type.
    :return: the members of the ``auto_crawler_url:<type>`` set when a single
        type is requested, otherwise the union of the sets of all requested
        types.
    """
    # ``None`` replaces the former mutable ``[]`` default; both are falsy, so
    # sanitize_crawler_types() treats them the same way.
    l_crawler_types = sanitize_crawler_types(l_crawler_types)
    l_keys_name = [f'auto_crawler_url:{crawler_type}' for crawler_type in l_crawler_types]
    if len(l_keys_name) == 1:
        return r_serv_onion.smembers(l_keys_name[0])
    return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:])
|
||||
|
||||
def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
    """Schedule the next automatic crawl and record the domain as recently crawled.

    The entry ``<message>;<domain_type>`` is added to the ``crawler_auto_queue``
    sorted set, scored with the current time plus ``delta``. The
    ``last_auto_crawled`` list receives ``<domain>:<port>;<epoch>`` and is
    trimmed to its 10 newest entries.
    """
    queue_member = f'{message};{domain_type}'
    next_run = int(time.time() + delta)
    r_serv_onion.zadd('crawler_auto_queue', {queue_member: next_run})

    # update list, last auto crawled domains
    history_entry = f'{domain}:{port};{epoch}'
    r_serv_onion.lpush('last_auto_crawled', history_entry)
    r_serv_onion.ltrim('last_auto_crawled', 0, 9)
|
||||
|
||||
def update_auto_crawler_queue():
    """Push every due auto-crawler entry to its type-specific priority queue.

    Entries of ``crawler_auto_queue`` scored between 0 and the current epoch
    are split on their last ``;`` into a message and a domain type; the message
    is then added to the ``<domain_type>_crawler_priority_queue`` set.
    """
    now = int(time.time())
    # Entries whose scheduled epoch has already passed.
    due_entries = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, now)
    for entry in due_entries:
        message, domain_type = entry.rsplit(';', 1)
        print(domain_type)
        print(message)
        r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', message)
|
||||
|
||||
|
||||
##-- AUTOMATIC CRAWLER --##
|
||||
|
||||
#### CRAWLER TASK ####
|
||||
|
||||
|
||||
|
||||
##-- CRAWLER TASK --##
|
||||
|
||||
|
||||
|
||||
#### ####
|
||||
|
||||
|
@ -1207,6 +1453,8 @@ def save_har(har_dir, item_id, har_content):
|
|||
# #
|
||||
# # # # # # # # # # # #
|
||||
|
||||
#### PROXY ####
|
||||
|
||||
def api_verify_proxy(proxy_url):
|
||||
parsed_proxy = urlparse(proxy_url)
|
||||
if parsed_proxy.scheme and parsed_proxy.hostname and parsed_proxy.port:
|
||||
|
@ -1237,13 +1485,7 @@ class CrawlerProxy:
|
|||
def get_url(self):
    """Return the URL stored for this proxy.

    Reads the ``url`` field of the ``crawler:proxy:<uuid>`` hash.
    """
    # Two typos fixed: the Redis hash getter is ``hget`` (there is no ``hgrt``
    # command) and the attribute was misspelled ``uuif``.
    # NOTE(review): assumes the instance keeps its identifier in ``self.uuid`` - confirm against __init__.
    return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
|
||||
|
||||
###############################################################################################
|
||||
###############################################################################################
|
||||
###############################################################################################
|
||||
###############################################################################################
|
||||
|
||||
|
||||
# # # # CRAWLER LACUS # # # #
|
||||
#### CRAWLER LACUS ####
|
||||
|
||||
def get_lacus_url():
    """Return the value of the ``url`` field of the ``crawler:lacus`` hash."""
    lacus_url = r_db.hget('crawler:lacus', 'url')
    return lacus_url
|
||||
|
@ -1363,12 +1605,7 @@ def api_set_crawler_max_captures(data):
|
|||
save_nb_max_captures(nb_captures)
|
||||
return nb_captures, 200
|
||||
|
||||
## PROXY ##
|
||||
|
||||
# TODO SAVE PROXY URL + ADD PROXY TESTS
|
||||
# -> name + url
|
||||
|
||||
## PROXY ##
|
||||
## TEST ##
|
||||
|
||||
def is_test_ail_crawlers_successful():
    """Tell whether the stored crawler test result is the string ``'True'``."""
    # The result is stored as text in the ``success`` field of ``crawler:tor:test``.
    stored_result = r_db.hget('crawler:tor:test', 'success')
    return stored_result == 'True'
|
||||
|
@ -1380,7 +1617,6 @@ def save_test_ail_crawlers_result(test_success, message):
|
|||
r_db.hset('crawler:tor:test', 'success', str(test_success))
|
||||
r_db.hset('crawler:tor:test', 'message', message)
|
||||
|
||||
# TODO CREATE TEST TASK
|
||||
def test_ail_crawlers():
|
||||
# # TODO: test web domain
|
||||
if not ping_lacus():
|
||||
|
@ -1431,10 +1667,11 @@ def test_ail_crawlers():
|
|||
#### ---- ####
|
||||
|
||||
|
||||
# TODO MOVE ME
|
||||
# TODO MOVE ME IN CRAWLER OR FLASK
|
||||
load_blacklist()
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
|
||||
# print(task.get_meta())
|
||||
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
|
||||
# print(task.get_meta())
|
||||
# _clear_captures()
|
||||
|
||||
|
|
|
@ -61,9 +61,9 @@ def create_json_response(data, status_code):
|
|||
def crawlers_dashboard():
|
||||
is_manager_connected = crawlers.get_lacus_connection_metadata()
|
||||
crawlers_status = crawlers.get_captures_status()
|
||||
print(crawlers_status)
|
||||
# print(crawlers_status)
|
||||
crawlers_latest_stats = crawlers.get_crawlers_stats()
|
||||
print(crawlers_latest_stats)
|
||||
# print(crawlers_latest_stats)
|
||||
date = crawlers.get_current_date()
|
||||
return render_template("dashboard_crawler.html", date=date,
|
||||
is_manager_connected=is_manager_connected,
|
||||
|
@ -77,6 +77,7 @@ def crawlers_dashboard():
|
|||
def crawler_dashboard_json():
    """Return the captures status and the crawler statistics as a JSON response."""
    payload = {
        'crawlers_status': crawlers.get_captures_status(),
        'stats': crawlers.get_crawlers_stats(),
    }
    return jsonify(payload)
|
||||
|
@ -106,13 +107,24 @@ def send_to_spider():
|
|||
# POST val
|
||||
url = request.form.get('url_to_crawl')
|
||||
crawler_type = request.form.get('crawler_queue_type')
|
||||
auto_crawler = request.form.get('crawler_type') # TODO Auto Crawler
|
||||
crawler_delta = request.form.get('crawler_epoch') # TODO Auto Crawler
|
||||
screenshot = request.form.get('screenshot')
|
||||
har = request.form.get('har')
|
||||
depth_limit = request.form.get('depth_limit')
|
||||
cookiejar_uuid = request.form.get('cookiejar')
|
||||
|
||||
# Frequency
|
||||
if request.form.get('crawler_scheduler'):
|
||||
frequency = request.form.get('frequency')
|
||||
if frequency == 'custom':
|
||||
months = request.form.get('frequency_months', 0)
|
||||
weeks = request.form.get('frequency_weeks', 0)
|
||||
days = request.form.get('frequency_days', 0)
|
||||
hours = request.form.get('frequency_hours', 0)
|
||||
minutes = request.form.get('frequency_minutes', 0)
|
||||
frequency = {'months': months, 'weeks': weeks, 'days': days, 'hours': hours, 'minutes': minutes}
|
||||
else:
|
||||
frequency = None
|
||||
|
||||
# PROXY
|
||||
proxy = request.form.get('proxy_name')
|
||||
if proxy:
|
||||
|
@ -129,7 +141,7 @@ def send_to_spider():
|
|||
cookiejar_uuid = cookiejar_uuid.rsplit(':')
|
||||
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
|
||||
|
||||
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot}
|
||||
data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
|
||||
if proxy:
|
||||
data['proxy'] = proxy
|
||||
if cookiejar_uuid:
|
||||
|
@ -142,6 +154,43 @@ def send_to_spider():
|
|||
return redirect(url_for('crawler_splash.manual'))
|
||||
|
||||
|
||||
@crawler_splash.route("/crawlers/scheduler", methods=['GET'])
@login_required
@login_read_only
def scheduler_dashboard():
    """Render the dashboard listing the crawler schedules."""
    # TODO list currently queued ?
    template_args = {
        'schedulers': crawlers.get_schedulers_metas(),
        'is_manager_connected': crawlers.get_lacus_connection_metadata(),
    }
    return render_template("crawler_scheduler_dashboard.html", **template_args)
|
||||
|
||||
@crawler_splash.route("/crawlers/schedule", methods=['GET'])
@login_required
@login_read_only
def schedule_show():
    """Render the detail page of the schedule named by the ``uuid`` query argument.

    Aborts with HTTP 404 when the schedule does not exist.
    """
    schedule = crawlers.CrawlerSchedule(request.args.get('uuid'))
    if not schedule.exists():
        abort(404)
    return render_template("crawler_schedule_uuid.html", meta=schedule.get_meta(ui=True))
|
||||
|
||||
@crawler_splash.route("/crawlers/schedule/delete", methods=['GET'])
@login_required
@login_analyst
def schedule_delete():
    """Delete the schedule named by the ``uuid`` query argument.

    Aborts with HTTP 404 when the schedule does not exist. When the delete API
    answers with a status other than 200 its payload is returned as a JSON
    response; otherwise the user is redirected to the scheduler dashboard.
    """
    schedule_uuid = request.args.get('uuid')
    if not crawlers.CrawlerSchedule(schedule_uuid).exists():
        abort(404)

    response = crawlers.api_delete_schedule({'uuid': schedule_uuid})
    status_code = response[1]
    if status_code != 200:
        return create_json_response(response[0], status_code)
    return redirect(url_for('crawler_splash.scheduler_dashboard'))
|
||||
|
||||
|
||||
@crawler_splash.route("/crawlers/last/domains", methods=['GET'])
|
||||
@login_required
|
||||
@login_read_only
|
||||
|
@ -228,11 +277,11 @@ def showDomain():
|
|||
dict_domain['epoch'] = curr_epoch
|
||||
dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch))
|
||||
|
||||
print(dict_domain['epoch'])
|
||||
# print(dict_domain['epoch'])
|
||||
|
||||
dict_domain['crawler_history_items'] = []
|
||||
for item_id in domain.get_crawled_items_by_epoch(epoch):
|
||||
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options=['crawler']))
|
||||
dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options={'crawler'}))
|
||||
if dict_domain['crawler_history_items']:
|
||||
dict_domain['random_item'] = random.choice(dict_domain['crawler_history_items'])
|
||||
|
||||
|
@ -521,7 +570,7 @@ def crawler_cookiejar_show():
|
|||
|
||||
@crawler_splash.route('/crawler/cookie/delete', methods=['GET'])
|
||||
@login_required
|
||||
@login_read_only
|
||||
@login_analyst
|
||||
def crawler_cookiejar_cookie_delete():
|
||||
user_id = current_user.get_id()
|
||||
cookie_uuid = request.args.get('uuid')
|
||||
|
@ -536,7 +585,7 @@ def crawler_cookiejar_cookie_delete():
|
|||
|
||||
@crawler_splash.route('/crawler/cookiejar/delete', methods=['GET'])
|
||||
@login_required
|
||||
@login_read_only
|
||||
@login_analyst
|
||||
def crawler_cookiejar_delete():
|
||||
user_id = current_user.get_id()
|
||||
cookiejar_uuid = request.args.get('uuid')
|
||||
|
@ -699,7 +748,7 @@ def crawler_lacus_settings_crawler_manager():
|
|||
api_key = request.form.get('api_key')
|
||||
|
||||
res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key})
|
||||
print(res)
|
||||
# print(res)
|
||||
if res[1] != 200:
|
||||
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
|
||||
else:
|
||||
|
|
|
@ -66,21 +66,59 @@
|
|||
<div class="d-flex mt-3">
|
||||
<i class="fas fa-user-ninja mt-1"></i> Manual
|
||||
<div class="custom-control custom-switch">
|
||||
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">
|
||||
<label class="custom-control-label" for="crawler_type">
|
||||
<i class="fas fa-clock"></i> Automatic
|
||||
<input class="custom-control-input" type="checkbox" name="crawler_scheduler" value="True" id="crawler_scheduler">
|
||||
<label class="custom-control-label" for="crawler_scheduler">
|
||||
<i class="fas fa-clock"></i> Scheduler
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="input-group mt-2 mb-2" id="crawler_epoch_input">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light"><i class="fas fa-clock"></i> </span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="crawler_epoch" value="3600" min="1" name="crawler_epoch" required>
|
||||
<div class="input-group-append">
|
||||
<span class="input-group-text">Time (seconds) between each crawling</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="frequency_inputs">
|
||||
<div class="mb-4">
|
||||
<select class="custom-select" id="frequency" name="frequency" onchange="frequency_selector_update(this);">
|
||||
<option value="hourly">Hourly</option>
|
||||
<option value="daily">Daily</option>
|
||||
<option value="weekly">Weekly</option>
|
||||
<option value="monthly">Monthly</option>
|
||||
<option value="custom">Custom</option>
|
||||
</select>
|
||||
</div>
|
||||
<div id="custom_frequency">
|
||||
<h5><i class="fas fa-clock"></i> Adjust crawling interval as needed</h5>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light" style="width: 90px"><b>Months</b></span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="frequency_months" value="0" min="0" name="frequency_months" required>
|
||||
</div>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light" style="width: 90px"><b>Weeks</b></span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="frequency_weeks" value="0" min="0" name="frequency_weeks" required>
|
||||
</div>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light" style="width: 90px"><b>Days</b></span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="frequency_days" value="0" min="0" name="frequency_days" required>
|
||||
</div>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light" style="width: 90px"><b>Hours</b></span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="frequency_hours" value="0" min="0" name="frequency_hours" required>
|
||||
</div>
|
||||
<div class="input-group">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light" style="width: 90px"><b>Minutes</b></span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="frequency_minutes" value="0" min="0" name="frequency_minutes" required>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
@ -165,8 +203,9 @@ $(document).ready(function(){
|
|||
$("#nav_manual_crawler").addClass("active");
|
||||
queue_type_selector_input_controler()
|
||||
manual_crawler_input_controler();
|
||||
$("#custom_frequency").hide();
|
||||
|
||||
$('#crawler_type').on("change", function () {
|
||||
$('#crawler_scheduler').on("change", function () {
|
||||
manual_crawler_input_controler();
|
||||
});
|
||||
|
||||
|
@ -190,10 +229,18 @@ function toggle_sidebar(){
|
|||
}
|
||||
|
||||
function manual_crawler_input_controler() {
|
||||
if($('#crawler_type').is(':checked')){
|
||||
$("#crawler_epoch_input").show();
|
||||
if($('#crawler_scheduler').is(':checked')){
|
||||
$("#frequency_inputs").show();
|
||||
}else{
|
||||
$("#crawler_epoch_input").hide();
|
||||
$("#frequency_inputs").hide();
|
||||
}
|
||||
}
|
||||
|
||||
// Show the custom interval inputs only while the "custom" frequency is selected.
function frequency_selector_update(obj) {
    $("#custom_frequency").toggle(obj.value === "custom");
}
|
||||
|
||||
|
|
|
@ -0,0 +1,199 @@
|
|||
<!DOCTYPE html>
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>AIL-Framework</title>
|
||||
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
|
||||
<!-- Core CSS -->
|
||||
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
|
||||
|
||||
<!-- JS -->
|
||||
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
{% include 'nav_bar.html' %}
|
||||
|
||||
<div class="container-fluid">
|
||||
<div class="row">
|
||||
|
||||
{% include 'crawler/menu_sidebar.html' %}
|
||||
|
||||
<div class="col-12 col-lg-10" id="core_content">
|
||||
|
||||
<div class="card my-1">
|
||||
<div class="card-header bg-dark text-white">
|
||||
<h4 class="card-title"><b>{{ meta['url'] }}</b></h4>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
|
||||
<div class="row">
|
||||
<div class="col-lg-8">
|
||||
|
||||
<table class="table table-hover">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>UUID</th>
|
||||
<td>{{ meta['uuid'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Url</th>
|
||||
<td>{{ meta['url'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Frequency</th>
|
||||
<td>{{ meta['frequency'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Creator</th>
|
||||
<td>{{ meta['user'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Date</th>
|
||||
<td>{{ meta['date'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Status</th>
|
||||
<td>{{ meta['status'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Next Run</th>
|
||||
<td>
|
||||
{% if not meta['next_run'] %}
|
||||
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
|
||||
{% else %}
|
||||
{{ meta['next_run'] }}
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h4>Config:</h4>
|
||||
|
||||
<table class="table table-hover">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th><i class="fas fa-water"></i> Depth</th>
|
||||
<td>{{ meta['depth'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><i class="fas fa-image"></i> Screenshot</th>
|
||||
<td>
|
||||
<div class="custom-control custom-switch">
|
||||
<input class="custom-control-input" type="checkbox" id="html_content_id" {% if meta['screenshot'] %}checked{% endif %} disabled>
|
||||
<label class="custom-control-label" for="html_content_id">
|
||||
<i class="fas fa-image"></i>
|
||||
</label>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><i class="fas fa-file"></i> Har</th>
|
||||
<td>
|
||||
<div class="custom-control custom-switch">
|
||||
<input class="custom-control-input" type="checkbox" id="har_content_id" {% if meta['har'] %}checked{% endif %} disabled>
|
||||
<label class="custom-control-label" for="har_content_id">
|
||||
<i class="fas fa-file"></i>
|
||||
</label>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><i class="fas fa-cookie-bite"></i> Cookiejar</th>
|
||||
<td>
|
||||
{% if not meta['cookiejar'] %}
|
||||
-
|
||||
{% else %}
|
||||
<a href="{{ url_for('crawler_splash.crawler_cookiejar_show') }}?uuid={{meta['cookiejar']}}">
|
||||
{{ meta['cookiejar'] }}
|
||||
</a>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Proxy</th>
|
||||
<td>
|
||||
{% if not meta['proxy'] %}
|
||||
-
|
||||
{% else %}
|
||||
{{ meta['proxy'] }}
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>User Agent</th>
|
||||
<td>
|
||||
{% if meta['user_agent'] %}
|
||||
{{ meta['user_agent'] }}
|
||||
{% else %}
|
||||
Default
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% if meta['header'] %}
|
||||
<tr>
|
||||
<th>header</th>
|
||||
<td>{{ meta['header'] }}</td>
|
||||
</tr>
|
||||
{% endif %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div class="col-lg-4">
|
||||
<div>
|
||||
<a href="{{ url_for('crawler_splash.schedule_delete') }}?uuid={{meta['uuid']}}">
|
||||
<button type="button" class="btn btn-danger">
|
||||
<i class="fas fa-trash-alt"></i> <b>Delete</b>
|
||||
</button>
|
||||
</a>
|
||||
{# <a href="{{ url_for('investigations_b.edit_investigation') }}?uuid={{meta['uuid']}}">#}
|
||||
{# <button type="button" class="btn btn-info">#}
|
||||
{# <i class="fas fa-pencil-alt"></i> <b>Edit</b>#}
|
||||
{# </button>#}
|
||||
{# </a>#}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</body>
|
||||
|
||||
<script>
|
||||
var chart = {};
|
||||
$(document).ready(function(){
|
||||
$("#page-Crawler").addClass("active");
|
||||
$("#nav_monitoring_crawler").addClass("active");
|
||||
});
|
||||
|
||||
// Collapse or expand the side navigation and resize the main content column accordingly.
function toggle_sidebar(){
    var nav_menu = $('#nav_menu');
    var side_menu = $('#side_menu');
    var core_content = $('#core_content');

    if (nav_menu.is(':visible')) {
        nav_menu.hide();
        side_menu.removeClass('border-right col-lg-2');
        core_content.removeClass('col-lg-10');
    } else {
        nav_menu.show();
        side_menu.addClass('border-right col-lg-2');
        core_content.addClass('col-lg-10');
    }
}
|
||||
|
||||
</script>
|
|
@ -0,0 +1,94 @@
|
|||
<!DOCTYPE html>
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>AIL-Framework</title>
|
||||
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
|
||||
<!-- Core CSS -->
|
||||
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/dataTables.bootstrap.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
|
||||
|
||||
<!-- JS -->
|
||||
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/dataTables.bootstrap.min.js')}}"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
{% include 'nav_bar.html' %}
|
||||
|
||||
<div class="container-fluid">
|
||||
<div class="row">
|
||||
|
||||
{% include 'crawler/menu_sidebar.html' %}
|
||||
|
||||
<div class="col-12 col-lg-10" id="core_content">
|
||||
|
||||
{% include 'crawler/crawler_disabled.html' %}
|
||||
|
||||
<h1>Schedulers:</h1>
|
||||
|
||||
<table class="table mt-1 table-hover table-borderless table-striped" id="table_scheduler">
|
||||
<thead class="thead-dark">
|
||||
<tr>
|
||||
<th>Url</th>
|
||||
<th>Status</th>
|
||||
<th>Next Run</th>
|
||||
<th>User</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="tbody_last_crawled">
|
||||
{% for meta in schedulers %}
|
||||
<tr>
|
||||
<td><a href="{{ url_for('crawler_splash.schedule_show') }}?uuid={{ meta['uuid'] }}">{{ meta['url'] }}</a></td>
|
||||
<td>{{ meta['status'] }}</td>
|
||||
<td>
|
||||
{% if not meta['next_run'] %}
|
||||
<b class="text-danger"><i class="fas fa-exclamation-triangle"></i> Please verify that the crawler module is running ...</b>
|
||||
{% else %}
|
||||
{{ meta['next_run'] }}
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>{{ meta['user'] }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</body>
|
||||
|
||||
<script>
|
||||
var chart = {};
|
||||
$(document).ready(function(){
|
||||
$("#page-Crawler").addClass("active");
|
||||
$("#nav_scheduler_crawler").addClass("active");
|
||||
$('#table_scheduler').DataTable();
|
||||
});
|
||||
|
||||
// Collapse or expand the side navigation and resize the main content column accordingly.
function toggle_sidebar(){
    var nav_menu = $('#nav_menu');
    var side_menu = $('#side_menu');
    var core_content = $('#core_content');

    if (nav_menu.is(':visible')) {
        nav_menu.hide();
        side_menu.removeClass('border-right col-lg-2');
        core_content.removeClass('col-lg-10');
    } else {
        nav_menu.show();
        side_menu.addClass('border-right col-lg-2');
        core_content.addClass('col-lg-10');
    }
}
|
||||
|
||||
</script>
|
|
@ -208,8 +208,9 @@ function refresh_crawler_status(){
|
|||
$('#stat_web_total').text(data.stats['web']['crawled']);
|
||||
$('#stat_web_queue').text(data.stats['web']['queue']);
|
||||
|
||||
$("#tbody_crawler_onion_info").empty();
|
||||
|
||||
if(data.crawlers_status.length!=0){
|
||||
$("#tbody_crawler_onion_info").empty();
|
||||
var tableRef = document.getElementById('tbody_crawler_onion_info');
|
||||
for (var i = 0; i < data.crawlers_status.length; i++) {
|
||||
var crawler = data.crawlers_status[i];
|
||||
|
|
|
@ -35,8 +35,8 @@
|
|||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{{url_for('hiddenServices.auto_crawler')}}" id="nav_auto_crawler">
|
||||
<i class="fas fa-sync"></i> Automatic Crawler
|
||||
<a class="nav-link" href="{{url_for('crawler_splash.scheduler_dashboard')}}" id="nav_scheduler_crawler">
|
||||
<i class="fas fa-sync"></i> Scheduler
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
|
|
Loading…
Reference in a new issue