chg: [crawler] refactor crawler tasks + migrate cookiejars + add proxy option
HOWTO.md
@@ -131,29 +131,29 @@ Finally, you can quit this program by pressing either ``<q>`` or ``<C-c>``.
 Crawler
 ---------------------
 
-In AIL, you can crawl Tor hidden services. Don't forget to review the proxy configuration of your Tor client and especially if you enabled the SOCKS5 proxy and binding on the appropriate IP address reachable via the dockers where Splash runs.
+In AIL, you can crawl websites and Tor hidden services. Don't forget to review the proxy configuration of your Tor client, especially if you enabled the SOCKS5 proxy.
 
+[//]: # (and binding on the appropriate IP address reachable via the dockers where Splash runs.)
 
 ### Installation
 
-[Install AIL-Splash-Manager](https://github.com/ail-project/ail-splash-manager)
+[Install Lacus](https://github.com/ail-project/lacus)
 
 ### Configuration
 
-1. Search the Splash-Manager API key. This API key is generated when you launch the manager for the first time.
-(located in your Splash Manager directory ``ail-splash-manager/token_admin.txt``)
+1. Lacus URL:
 
-2. Splash Manager URL and API Key:
 In the webinterface, go to ``Crawlers>Settings`` and click on the Edit button
-![Splash Manager Config](./doc/screenshots/splash_manager_config_edit_1.png?raw=true "AIL framework Splash Manager Config")
 
-![Splash Manager Config](./doc/screenshots/splash_manager_config_edit_2.png?raw=true "AIL framework Splash Manager Config")
+![Splash Manager Config](./doc/screenshots/lacus_config.png?raw=true "AIL Lacus Config")
 
-3. Launch AIL Crawlers:
+![Splash Manager Config](./doc/screenshots/lacus_config_edit.png?raw=true "AIL Lacus Config")
 
+2. Launch AIL Crawlers:
 Choose the number of crawlers you want to launch
-![Splash Manager Nb Crawlers Config](./doc/screenshots/splash_manager_nb_crawlers_1.png?raw=true "AIL framework Nb Crawlers Config")
-![Splash Manager Nb Crawlers Config](./doc/screenshots/splash_manager_nb_crawlers_2.png?raw=true "AIL framework Nb Crawlers Config")
+![Splash Manager Nb Crawlers Config](./doc/screenshots/crawler_nb_captures.png?raw=true "AIL Lacus Nb Crawlers Config")
+![Splash Manager Nb Crawlers Config](./doc/screenshots/crawler_nb_captures_edit.png?raw=true "AIL Lacus Nb Crawlers Config")
 
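For context on what the configured Lacus URL is used for: the refactored Crawler module drives Lacus through the PyLacus client. The sketch below reuses the method names and keyword arguments that appear in `Crawler.enqueue_capture()` and `Crawler.get_message()` in this commit; the `PyLacus` import and instantiation, the example URL, and the polling loop are illustrative assumptions, not code from this diff.

```python
# Minimal sketch, assuming pylacus is installed and a Lacus instance listens on
# http://127.0.0.1:7100 (replace with the URL set in Crawlers > Settings).
import time
from pylacus import PyLacus, CaptureStatus  # assumed client-side import; AIL wraps this in its crawlers helper

lacus = PyLacus('http://127.0.0.1:7100')  # assumed setup, not shown in the diff

# Same keyword arguments as self.lacus.enqueue() in Crawler.enqueue_capture()
capture_uuid = lacus.enqueue(url='http://example.onion',
                             depth=1,
                             user_agent='Mozilla/5.0',
                             proxy='force_tor',
                             cookies=[],
                             force=False,
                             general_timeout_in_sec=90)

# Same polling pattern as Crawler.get_message()
while lacus.get_capture_status(capture_uuid) != CaptureStatus.DONE:
    time.sleep(1)

entries = lacus.get_capture(capture_uuid)  # HAR, screenshot, HTML, ...
print(entries['status'])
```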
@@ -65,31 +65,29 @@ class Crawler(AbstractModule):
     def get_message(self):
         # Check if a new Capture can be Launched
         if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
-            task_row = crawlers.get_crawler_task_from_queue()
+            task_row = crawlers.add_task_to_lacus_queue()
             if task_row:
                 print(task_row)
                 task_uuid, priority = task_row
                 self.enqueue_capture(task_uuid, priority)
 
-        # Check if a Capture is Done
+        # Get CrawlerCapture Object
         capture = crawlers.get_crawler_capture()
         if capture:
-            print(capture)
-            capture_uuid = capture[0][0]
-            capture_status = self.lacus.get_capture_status(capture_uuid)
-            if capture_status != crawlers.CaptureStatus.DONE:  # TODO ADD GLOBAL TIMEOUT -> Save start time
-                crawlers.update_crawler_capture(capture_uuid)
-                print(capture_uuid, capture_status, int(time.time()))
+            print(capture.uuid)
+            status = self.lacus.get_capture_status(capture.uuid)
+            if status != crawlers.CaptureStatus.DONE:  # TODO ADD GLOBAL TIMEOUT -> Save start time
+                capture.update(status)
+                print(capture.uuid, status, int(time.time()))
             else:
-                self.compute(capture_uuid)
-                crawlers.remove_crawler_capture(capture_uuid)
-                print('capture', capture_uuid, 'completed')
+                self.compute(capture)
+                capture.delete()  # TODO DELETE TASK ONLY IF NOT SCHEDULED TASKS
+                print('capture', capture.uuid, 'completed')
 
         time.sleep(self.pending_seconds)
 
     def enqueue_capture(self, task_uuid, priority):
-        task = crawlers.get_crawler_task(task_uuid)
+        task = crawlers.CrawlerTask(task_uuid)
         print(task)
         # task = {
         #     'uuid': task_uuid,
@@ -104,47 +102,43 @@ class Crawler(AbstractModule):
         #     'proxy': 'force_tor',
         #     'parent': 'manual',
         # }
-        url = task['url']
+        url = task.get_url()
         force = priority != 0
-        # TODO unpack cookiejar
+        # TODO timeout
 
         # TODO HEADER
 
         capture_uuid = self.lacus.enqueue(url=url,
-                                          depth=task['depth'],
-                                          user_agent=task['user_agent'],
-                                          proxy=task['proxy'],
-                                          cookies=[],
+                                          depth=task.get_depth(),
+                                          user_agent=task.get_user_agent(),
+                                          proxy=task.get_proxy(),
+                                          cookies=task.get_cookies(),
                                           force=force,
                                           general_timeout_in_sec=90)
 
-        crawlers.add_crawler_capture(task_uuid, capture_uuid)
-        print(task_uuid, capture_uuid, 'launched')
+        crawlers.create_capture(capture_uuid, task_uuid)
+        print(task.uuid, capture_uuid, 'launched')
        return capture_uuid
 
     # CRAWL DOMAIN
     # TODO: CATCH ERRORS
-    def compute(self, capture_uuid):
-
-        print('saving capture', capture_uuid)
-
-        task_uuid = crawlers.get_crawler_capture_task_uuid(capture_uuid)
-        task = crawlers.get_crawler_task(task_uuid)
-
-        print(task['domain'])
-
-        self.domain = Domain(task['domain'])
+    def compute(self, capture):
+        print('saving capture', capture.uuid)
+
+        task = capture.get_task()
+        domain = task.get_domain()
+        print(domain)
+
+        self.domain = Domain(domain)
 
         # TODO CHANGE EPOCH
         epoch = int(time.time())
-        parent_id = task['parent']
-        print(task)
+        parent_id = task.get_parent()
 
-        entries = self.lacus.get_capture(capture_uuid)
+        entries = self.lacus.get_capture(capture.uuid)
         print(entries['status'])
-        self.har = task['har']
-        self.screenshot = task['screenshot']
+        self.har = task.get_har()
+        self.screenshot = task.get_screenshot()
         str_date = crawlers.get_current_date(separator=True)
         self.har_dir = crawlers.get_date_har_dir(str_date)
         self.items_dir = crawlers.get_date_crawled_items_source(str_date)
@@ -156,14 +150,13 @@ class Crawler(AbstractModule):
         self.domain.update_daterange(str_date.replace('/', ''))
         # Origin + History
         if self.root_item:
-            # domain.add_ports(port)
             self.domain.set_last_origin(parent_id)
             self.domain.add_history(epoch, root_item=self.root_item)
         elif self.domain.was_up():
             self.domain.add_history(epoch, root_item=epoch)
 
         crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
-        crawlers.clear_crawler_task(task_uuid, self.domain.get_domain_type())
+        task.clear()
 
     def save_capture_response(self, parent_id, entries):
         print(entries.keys())
@@ -242,14 +235,6 @@ if __name__ == '__main__':
 ##################################
 ##################################
 
-# from Helper import Process
-# from pubsublogger import publisher
-
-
-# ======== FUNCTIONS ========
-
-
 # def update_auto_crawler():
 #     current_epoch = int(time.time())
 #     list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
@@ -91,7 +91,7 @@ class Onion(AbstractModule):
         if onion_urls:
             if crawlers.is_crawler_activated():
                 for domain in domains:  # TODO LOAD DEFAULT SCREENSHOT + HAR
-                    task_uuid = crawlers.add_crawler_task(domain, parent=item.get_id())
+                    task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0)
                     if task_uuid:
                         print(f'{domain} added to crawler queue: {task_uuid}')
                     else:
@@ -10,8 +10,8 @@ This module spots zerobins-like services for further processing
 # Import External packages
 ##################################
 import os
-import sys
 import re
+import sys
 
 sys.path.append(os.environ['AIL_BIN'])
 ##################################
@@ -30,7 +30,7 @@ class Zerobins(AbstractModule):
         super(Zerobins, self).__init__()
 
         binz = [
             r'^https:\/\/(zerobin||privatebin)\..*$',  # historical ones
         ]
 
         self.regex = re.compile('|'.join(binz))
@@ -59,13 +59,13 @@ class Zerobins(AbstractModule):
         if len(matching_binz) > 0:
             for bin_url in matching_binz:
                 print(f'send {bin_url} to crawler')
-                crawlers.add_crawler_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
-                                          parent='manual', priority=10)
+                # TODO Change priority ???
+                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
+                                     parent='manual', priority=60)
 
         self.redis_logger.debug("Compute message in queue")
 
 
-# TODO TEST ME
 if __name__ == '__main__':
     module = Zerobins()
     module.run()
Binary files:
  doc/screenshots/crawler_nb_captures.png       (new file, 27 KiB)
  doc/screenshots/crawler_nb_captures_edit.png  (new file, 67 KiB)
  doc/screenshots/lacus_config.png              (new file, 73 KiB)
  doc/screenshots/lacus_config_edit.png         (new file, 66 KiB)
  4 screenshots removed (filenames not shown: 104 KiB, 66 KiB, 51 KiB, 65 KiB)
@@ -60,7 +60,7 @@ def create_json_response(data, status_code):
 @login_read_only
 def crawlers_dashboard():
     is_manager_connected = crawlers.get_lacus_connection_metadata()
-    crawlers_status = crawlers.get_crawler_capture_status()
+    crawlers_status = crawlers.get_captures_status()
     print(crawlers_status)
     crawlers_latest_stats = crawlers.get_crawlers_stats()
     print(crawlers_latest_stats)
@@ -75,7 +75,7 @@ def crawlers_dashboard():
 @login_required
 @login_read_only
 def crawler_dashboard_json():
-    crawlers_status = crawlers.get_crawler_capture_status()
+    crawlers_status = crawlers.get_captures_status()
     crawlers_latest_stats = crawlers.get_crawlers_stats()
 
     return jsonify({'crawlers_status': crawlers_status,
@@ -106,7 +106,6 @@ def send_to_spider():
     # POST val
     url = request.form.get('url_to_crawl')
     crawler_type = request.form.get('crawler_queue_type')
-    proxy = request.form.get('proxy_name')
     auto_crawler = request.form.get('crawler_type')  # TODO Auto Crawler
     crawler_delta = request.form.get('crawler_epoch')  # TODO Auto Crawler
     screenshot = request.form.get('screenshot')
@@ -114,7 +113,13 @@ def send_to_spider():
     depth_limit = request.form.get('depth_limit')
     cookiejar_uuid = request.form.get('cookiejar')
 
-    if crawler_type == 'onion':
+    # PROXY
+    proxy = request.form.get('proxy_name')
+    if proxy:
+        res = crawlers.api_verify_proxy(proxy)
+        if res[1] != 200:
+            return create_json_response(res[0], res[1])
+    elif crawler_type == 'onion':
         proxy = 'force_tor'
 
     if cookiejar_uuid:
@@ -129,6 +134,7 @@ def send_to_spider():
     data['proxy'] = proxy
     if cookiejar_uuid:
         data['cookiejar'] = cookiejar_uuid
+    # print(data)
     res = crawlers.api_add_crawler_task(data, user_id=user_id)
 
     if res[1] != 200:
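The submit form now accepts a free-form proxy (see the template change further down, with the placeholder `[scheme]://[username]:[password]@[hostname]:[port]`), which `send_to_spider` passes through `crawlers.api_verify_proxy()` before queuing the task. That helper's implementation is not part of this diff, so the check below is only an illustrative sketch of the expected URL shape, not the real validation.

```python
# Illustrative only: the real validation lives in crawlers.api_verify_proxy().
from urllib.parse import urlparse

def looks_like_proxy_url(proxy: str) -> bool:
    """Rough check for the [scheme]://[username]:[password]@[hostname]:[port] format."""
    try:
        parsed = urlparse(proxy)
        return bool(parsed.scheme) and bool(parsed.hostname) and bool(parsed.port)
    except ValueError:  # raised for malformed port numbers
        return False

print(looks_like_proxy_url('socks5://user:password@127.0.0.1:9050'))  # True
print(looks_like_proxy_url('not-a-proxy'))                            # False
```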
@@ -655,36 +661,6 @@ def crawler_cookiejar_cookie_json_add_post():
 # --- Cookiejar ---#
 
 
-@crawler_splash.route('/crawler/settings/crawlers_to_lauch', methods=['GET', 'POST'])
-@login_required
-@login_admin
-def crawler_splash_setings_crawlers_to_lauch():
-    if request.method == 'POST':
-        dict_splash_name = {}
-        for crawler_name in list(request.form):
-            dict_splash_name[crawler_name] = request.form.get(crawler_name)
-        res = crawlers.api_set_nb_crawlers_to_launch(dict_splash_name)
-        if res[1] != 200:
-            return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
-        else:
-            return redirect(url_for('crawler_splash.crawler_splash_setings'))
-    else:
-        nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch_ui()
-        return render_template("settings_edit_crawlers_to_launch.html",
-                               nb_crawlers_to_launch=nb_crawlers_to_launch)
-
-
-@crawler_splash.route('/crawler/settings/relaunch_crawler', methods=['GET'])
-@login_required
-@login_admin
-def crawler_splash_setings_relaunch_crawler():
-    crawlers.relaunch_crawlers()
-    return redirect(url_for('crawler_splash.crawler_splash_setings'))
-
-
-## - - ##
 
 #### LACUS ####
 
 @crawler_splash.route('/crawler/settings', methods=['GET'])
@@ -693,6 +669,7 @@ def crawler_splash_setings_relaunch_crawler():
 def crawler_settings():
     lacus_url = crawlers.get_lacus_url()
     api_key = crawlers.get_hidden_lacus_api_key()
+    nb_captures = crawlers.get_crawler_max_captures()
 
     is_manager_connected = crawlers.get_lacus_connection_metadata(force_ping=True)
     is_crawler_working = crawlers.is_test_ail_crawlers_successful()
@@ -701,14 +678,13 @@ def crawler_settings():
     # TODO REGISTER PROXY
     # all_proxies = crawlers.get_all_proxies_metadata()
 
-    # nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
     # crawler_full_config = Config_DB.get_full_config_by_section('crawler')
 
     return render_template("settings_crawler.html",
                            is_manager_connected=is_manager_connected,
                            lacus_url=lacus_url, api_key=api_key,
+                           nb_captures=nb_captures,
                            # all_proxies=all_proxies,
-                           # nb_crawlers_to_launch=nb_crawlers_to_launch,
                            is_crawler_working=is_crawler_working,
                            crawler_error_mess=crawler_error_mess,
                            )
@@ -733,6 +709,22 @@ def crawler_lacus_settings_crawler_manager():
     api_key = crawlers.get_lacus_api_key()
     return render_template("settings_edit_lacus_crawler.html", lacus_url=lacus_url, api_key=api_key)
 
+
+@crawler_splash.route('/crawler/settings/crawlers_to_launch', methods=['GET', 'POST'])
+@login_required
+@login_admin
+def crawler_settings_crawlers_to_launch():
+    if request.method == 'POST':
+        nb_captures = request.form.get('nb_captures')
+        res = crawlers.api_set_crawler_max_captures({'nb': nb_captures})
+        if res[1] != 200:
+            return create_json_response(res[0], res[1])
+        else:
+            return redirect(url_for('crawler_splash.crawler_settings'))
+    else:
+        nb_captures = crawlers.get_crawler_max_captures()
+        return render_template("settings_edit_crawlers_to_launch.html",
+                               nb_captures=nb_captures)
+
 
 @crawler_splash.route('/crawler/settings/crawler/test', methods=['GET'])
 @login_required
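The new `crawler_settings_crawlers_to_launch` route above replaces the per-Splash counters with a single `nb_captures` value, the maximum number of concurrent Lacus captures. A hedged sketch of driving it outside the browser follows; the route path and the `nb_captures` form field come from the diff, while the base URL, port, TLS handling and session cookie are assumptions about a typical local AIL setup.

```python
# Sketch only: assumes a local AIL web interface on the default port with a
# self-signed certificate, and an already-authenticated admin session cookie.
import requests

AIL_URL = 'https://127.0.0.1:7000'                     # adjust to your instance
cookies = {'session': '<valid admin session cookie>'}  # placeholder value

r = requests.post(f'{AIL_URL}/crawler/settings/crawlers_to_launch',
                  data={'nb_captures': 4},             # max concurrent captures
                  cookies=cookies,
                  verify=False)                        # self-signed certificate assumed
print(r.status_code, r.url)
```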
@@ -1,31 +1,31 @@
 {%if not is_manager_connected['status']%}
 <div class="alert alert-secondary text-center my-2" role="alert">
     <h1><i class="fas fa-times-circle text-danger"></i> Crawler Disabled</h1>
     <p>
         {%if 'error' in is_manager_connected%}
             <b>{{is_manager_connected['status_code']}}</b>
             <br>
             <b>Error:</b> {{is_manager_connected['error']}}
         {%else%}
-            <b>Error:</b> core/Crawler_manager not launched
+            <b>Error:</b> Lacus not connected
         {%endif%}
     </p>
     <div style="max-width: 500px;">
         <ul class="list-group my-3">
-            <li class="list-group-item bg-dark text-white">Splash Manager Features:</li>
-            <li class="list-group-item">Install and run Splash crawlers on another server</li>
-            <li class="list-group-item">Handle proxies (Web and tor)</li>
-            <li class="list-group-item">Launch/Kill Splash Dockers</li>
-            <li class="list-group-item">Restart crawlers on crash</li>
+            <li class="list-group-item bg-dark text-white"><h3>Lacus Features:</h3></li>
+            <li class="list-group-item">Install and run crawlers on another server</li>
+            <li class="list-group-item">Handle proxies ( <i class="fab fa-html5"></i> Web and <i class="fas fa-user-secret"></i> tor)</li>
+            <li class="list-group-item">Multiple Concurrent Captures</li>
+            <li class="list-group-item">HOW TO</li>
             <li class="list-group-item">
                 <div class="d-flex justify-content-center">
-                    <a class="btn btn-info" href="https://github.com/ail-project/ail-splash-manager" role="button">
-                        <i class="fab fa-github"></i> Install and Configure AIL-Splash-Manager
+                    <a class="btn btn-info" href="https://github.com/ail-project/lacus" role="button">
+                        <i class="fab fa-github"></i> Install and Configure Lacus
                     </a>
                 </div>
             </li>
         </ul>
+    </div>
 </div>
-</div>
 
 {%endif%}
@@ -60,13 +60,8 @@
             {%endfor%}
         </select>
     </div>
-    <div id="div_proxy_name">
-        <select class="custom-select form-control" name="proxy_name" id="proxy_name">
-            <option value="None" selected>Use a proxy</option>
-            {%for proxy in proxies%}
-                <option value="{{proxy}}">{{proxy}}</option>
-            {%endfor%}
-        </select>
+    <div class="input-group" id="div_proxy_name">
+        <input type="text" class="form-control" id="proxy_name" name="proxy_name" placeholder="Expected Format: [scheme]://[username]:[password]@[hostname]:[port]">
     </div>
     <div class="d-flex mt-3">
         <i class="fas fa-user-ninja mt-1"></i> Manual
@@ -33,7 +33,7 @@
             <h5 class="card-title"><i class="fas fa-cookie-bite"></i> Edit Cookie: {{cookie_uuid}}</h5>
         </div>
         <div class="col-4">
-            <a class="btn btn-danger float-right" href="{{ url_for('crawler_splash.crawler_cookiejar_cookie_delete') }}?cookiejar_uuid={{cookiejar_uuid}}&cookie_uuid={{cookie_uuid}}">
+            <a class="btn btn-danger float-right" href="{{ url_for('crawler_splash.crawler_cookiejar_cookie_delete') }}?uuid={{cookie_uuid}}">
                 <i class="fas fa-trash-alt"></i>
             </a>
         </div>
@@ -53,7 +53,7 @@
                 </div>
             {% endif %}
         </span>
-        <h4>Splash Crawler Manager</h4>
+        <h4>AIL Lacus Crawler</h4>
     </div>
     <div class="card-body">
 
@@ -92,52 +92,43 @@
             </div>
         </div>
 
-        <div class="card border-secondary">
-            <div class="card-body text-dark">
-                <h5 class="card-title">All Proxies:</h5>
-                <table class="table table-striped">
-                    <thead class="bg-info text-white">
-                        <tr>
-                            <th>Proxy name</th>
-                            <th>URL</th>
-                            <th>Crawler Type</th>
-                            <th>Description</th>
-                            <th></th>
-                        </tr>
-                    </thead>
-                    <tbody>
-                        {% for proxy_name in all_proxies %}
-                            <tr>
-                                <td>
-                                    {{proxy_name}}
-                                </td>
-                                <td>
-                                    {{all_proxies[proxy_name]['url']}}
-                                </td>
-                                <td>
-                                    {%if all_proxies[proxy_name]['crawler_type']=='tor'%}
-                                        <i class="fas fa-user-secret"></i>
-                                    {%else%}
-                                        <i class="fab fa-html5"></i>
-                                    {%endif%}
-                                    {{all_proxies[proxy_name]['crawler_type']}}
-                                </td>
-                                <td>
-                                    {{all_proxies[proxy_name]['description']}}
-                                </td>
-                                <td>
-                                    <div class="d-flex justify-content-end">
-                                        <!-- <button class="btn btn-outline-dark px-1 py-0">
-                                            <i class="fas fa-pencil-alt"></i>
-                                        </button> -->
-                                    </div>
-                                </td>
-                            </tr>
-                        {% endfor %}
-                    </tbody>
-                </table>
-            </div>
-        </div>
+{#        <div class="card border-secondary">#}
+{#            <div class="card-body text-dark">#}
+{#                <h5 class="card-title">All Proxies:</h5>#}
+{#                <table class="table table-striped">#}
+{#                    <thead class="bg-info text-white">#}
+{#                        <tr>#}
+{#                            <th>Proxy name</th>#}
+{#                            <th>URL</th>#}
+{#                            <th>Description</th>#}
+{#                            <th></th>#}
+{#                        </tr>#}
+{#                    </thead>#}
+{#                    <tbody>#}
+{#                        {% for proxy_name in all_proxies %}#}
+{#                            <tr>#}
+{#                                <td>#}
+{#                                    {{proxy_name}}#}
+{#                                </td>#}
+{#                                <td>#}
+{#                                    {{all_proxies[proxy_name]['url']}}#}
+{#                                </td>#}
+{#                                <td>#}
+{#                                    {{all_proxies[proxy_name]['description']}}#}
+{#                                </td>#}
+{#                                <td>#}
+{#                                    <div class="d-flex justify-content-end">#}
+{#                                        <!-- <button class="btn btn-outline-dark px-1 py-0">#}
+{#                                            <i class="fas fa-pencil-alt"></i>#}
+{#                                        </button> -->#}
+{#                                    </div>#}
+{#                                </td>#}
+{#                            </tr>#}
+{#                        {% endfor %}#}
+{#                    </tbody>#}
+{#                </table>#}
+{#            </div>#}
+{#        </div>#}
 
     </div>
 </div>
@@ -176,25 +167,12 @@
 
     <div class="card border-secondary my-4">
         <div class="card-body text-dark">
-            <h5 class="card-title">Number of Crawlers to Launch:</h5>
-            <table class="table table-sm">
-                <tbody>
-                    {%for crawler in nb_crawlers_to_launch%}
-                        <tr>
-                            <td>{{crawler}}</td>
-                            <td>{{nb_crawlers_to_launch[crawler]}}</td>
-                        </tr>
-                    {%endfor%}
-                </tbody>
-            </table>
-            <a href="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}">
+            <h5 class="card-title">
+                Number of Concurrent Crawlers to Launch: <b class="text-primary">{{ nb_captures }}</b>
+            </h5>
+            <a href="{{ url_for('crawler_splash.crawler_settings_crawlers_to_launch') }}">
                 <button type="button" class="btn btn-info">
-                    Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
-                </button>
-            </a>
-            <a href="{{ url_for('crawler_splash.crawler_splash_setings_relaunch_crawler') }}">
-                <button type="button" class="btn btn-danger">
-                    ReLaunch Crawlers <i class="fas fa-redo"></i>
+                    Edit <i class="fas fa-pencil-alt"></i>
                 </button>
             </a>
         </div>
@@ -26,22 +26,17 @@
 
     <div class="col-12 col-lg-10" id="core_content">
 
-        <form action="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}" method="post" enctype="multipart/form-data">
-            <h5 class="card-title">Number of Crawlers to Launch:</h5>
-            <table class="table table-sm">
-                <tbody>
-                    {%for crawler_name in nb_crawlers_to_launch%}
-                        <tr>
-                            <td>{{crawler_name}}</td>
-                            <td>
-                                <input class="form-control" type="number" id="{{crawler_name}}" value="{{nb_crawlers_to_launch[crawler_name]}}" min="0" name="{{crawler_name}}" required>
-                            </td>
-                        </tr>
-                    {%endfor%}
-                </tbody>
-            </table>
-            <button type="submit" class="btn btn-primary">Edit <i class="fas fa-pencil-alt"></i></button>
-        </form>
+        <div class="card my-2">
+            <div class="card-header bg-dark text-white">
+                <form action="{{ url_for('crawler_splash.crawler_settings_crawlers_to_launch') }}" method="post" enctype="multipart/form-data">
+                    <h3 class="card-title">Number of Concurrent Crawlers to Launch:</h3>
+                    <input class="form-control" type="number" id="nb_captures" value="{{ nb_captures }}" min="1" name="nb_captures" required>
+                    <button type="submit" class="btn btn-primary my-2">Edit <i class="fas fa-pencil-alt"></i></button>
+                </form>
+            </div>
+        </div>
 
     </div>
 </div>