chg: [crawler] refactor crawler tasks + migrate cookiejars + add proxy option

This commit is contained in:
Terrtia 2023-02-21 12:22:49 +01:00
parent c04bc7bb57
commit 6842efc15d
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
19 changed files with 568 additions and 788 deletions

View file

@ -131,29 +131,29 @@ Finally, you can quit this program by pressing either ``<q>`` or ``<C-c>``.
Crawler
---------------------
In AIL, you can crawl Tor hidden services. Don't forget to review the proxy configuration of your Tor client and especially if you enabled the SOCKS5 proxy and binding on the appropriate IP address reachable via the dockers where Splash runs.
In AIL, you can crawl websites and Tor hidden services. Don't forget to review the proxy configuration of your Tor client, especially whether you enabled the SOCKS5 proxy
[//]: # (and binding on the appropriate IP address reachable via the dockers where Splash runs.)
### Installation
[Install AIL-Splash-Manager](https://github.com/ail-project/ail-splash-manager)
[Install Lacus](https://github.com/ail-project/lacus)
### Configuration
1. Search the Splash-Manager API key. This API key is generated when you launch the manager for the first time.
(located in your Splash Manager directory ``ail-splash-manager/token_admin.txt``)
2. Splash Manager URL and API Key:
1. Lacus URL:
In the web interface, go to ``Crawlers>Settings`` and click on the Edit button
![Splash Manager Config](./doc/screenshots/splash_manager_config_edit_1.png?raw=true "AIL framework Splash Manager Config")
![Splash Manager Config](./doc/screenshots/splash_manager_config_edit_2.png?raw=true "AIL framework Splash Manager Config")
![Splash Manager Config](./doc/screenshots/lacus_config.png?raw=true "AIL Lacus Config")
3. Launch AIL Crawlers:
![Splash Manager Config](./doc/screenshots/lacus_config_edit.png?raw=true "AIL Lacus Config")
2. Launch AIL Crawlers:
Choose the number of crawlers you want to launch
![Splash Manager Nb Crawlers Config](./doc/screenshots/splash_manager_nb_crawlers_1.png?raw=true "AIL framework Nb Crawlers Config")
![Splash Manager Nb Crawlers Config](./doc/screenshots/splash_manager_nb_crawlers_2.png?raw=true "AIL framework Nb Crawlers Config")
![Splash Manager Nb Crawlers Config](./doc/screenshots/crawler_nb_captures.png?raw=true "AIL Lacus Nb Crawlers Config")
![Splash Manager Nb Crawlers Config](./doc/screenshots/crawler_nb_captures_edit.png?raw=true "AIL Lacus Nb Crawlers Config")

View file

@ -65,31 +65,29 @@ class Crawler(AbstractModule):
def get_message(self):
# Check if a new Capture can be Launched
if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
task_row = crawlers.get_crawler_task_from_queue()
task_row = crawlers.add_task_to_lacus_queue()
if task_row:
print(task_row)
task_uuid, priority = task_row
self.enqueue_capture(task_uuid, priority)
# Check if a Capture is Done
# Get CrawlerCapture Object
capture = crawlers.get_crawler_capture()
if capture:
print(capture)
capture_uuid = capture[0][0]
capture_status = self.lacus.get_capture_status(capture_uuid)
if capture_status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time
crawlers.update_crawler_capture(capture_uuid)
print(capture_uuid, capture_status, int(time.time()))
print(capture.uuid)
status = self.lacus.get_capture_status(capture.uuid)
if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time
capture.update(status)
print(capture.uuid, status, int(time.time()))
else:
self.compute(capture_uuid)
crawlers.remove_crawler_capture(capture_uuid)
print('capture', capture_uuid, 'completed')
self.compute(capture)
capture.delete() # TODO DELETE TASK ONLY IF NOT SCHEDULED TASKS
print('capture', capture.uuid, 'completed')
time.sleep(self.pending_seconds)
def enqueue_capture(self, task_uuid, priority):
task = crawlers.get_crawler_task(task_uuid)
task = crawlers.CrawlerTask(task_uuid)
print(task)
# task = {
# 'uuid': task_uuid,
@ -104,47 +102,43 @@ class Crawler(AbstractModule):
# 'proxy': 'force_tor',
# 'parent': 'manual',
# }
url = task['url']
url = task.get_url()
force = priority != 0
# TODO unpack cookiejar
# TODO timeout
# TODO HEADER
capture_uuid = self.lacus.enqueue(url=url,
depth=task['depth'],
user_agent=task['user_agent'],
proxy=task['proxy'],
cookies=[],
depth=task.get_depth(),
user_agent=task.get_user_agent(),
proxy=task.get_proxy(),
cookies=task.get_cookies(),
force=force,
general_timeout_in_sec=90)
crawlers.add_crawler_capture(task_uuid, capture_uuid)
print(task_uuid, capture_uuid, 'launched')
crawlers.create_capture(capture_uuid, task_uuid)
print(task.uuid, capture_uuid, 'launched')
return capture_uuid
# CRAWL DOMAIN
# TODO: CATCH ERRORS
def compute(self, capture_uuid):
def compute(self, capture):
print('saving capture', capture.uuid)
print('saving capture', capture_uuid)
task = capture.get_task()
domain = task.get_domain()
print(domain)
task_uuid = crawlers.get_crawler_capture_task_uuid(capture_uuid)
task = crawlers.get_crawler_task(task_uuid)
print(task['domain'])
self.domain = Domain(task['domain'])
self.domain = Domain(domain)
# TODO CHANGE EPOCH
epoch = int(time.time())
parent_id = task['parent']
print(task)
parent_id = task.get_parent()
entries = self.lacus.get_capture(capture_uuid)
entries = self.lacus.get_capture(capture.uuid)
print(entries['status'])
self.har = task['har']
self.screenshot = task['screenshot']
self.har = task.get_har()
self.screenshot = task.get_screenshot()
str_date = crawlers.get_current_date(separator=True)
self.har_dir = crawlers.get_date_har_dir(str_date)
self.items_dir = crawlers.get_date_crawled_items_source(str_date)
@ -156,14 +150,13 @@ class Crawler(AbstractModule):
self.domain.update_daterange(str_date.replace('/', ''))
# Origin + History
if self.root_item:
# domain.add_ports(port)
self.domain.set_last_origin(parent_id)
self.domain.add_history(epoch, root_item=self.root_item)
elif self.domain.was_up():
self.domain.add_history(epoch, root_item=epoch)
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
crawlers.clear_crawler_task(task_uuid, self.domain.get_domain_type())
task.clear()
def save_capture_response(self, parent_id, entries):
print(entries.keys())
@ -242,14 +235,6 @@ if __name__ == '__main__':
##################################
##################################
# from Helper import Process
# from pubsublogger import publisher
# ======== FUNCTIONS ========
# def update_auto_crawler():
# current_epoch = int(time.time())
# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)

File diff suppressed because it is too large Load diff

View file

@ -91,7 +91,7 @@ class Onion(AbstractModule):
if onion_urls:
if crawlers.is_crawler_activated():
for domain in domains: # TODO LOAD DEFAULT SCREENSHOT + HAR
task_uuid = crawlers.add_crawler_task(domain, parent=item.get_id())
task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0)
if task_uuid:
print(f'{domain} added to crawler queue: {task_uuid}')
else:

View file

@ -10,8 +10,8 @@ This module spots zerobins-like services for further processing
# Import External packages
##################################
import os
import sys
import re
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
@ -30,7 +30,7 @@ class Zerobins(AbstractModule):
super(Zerobins, self).__init__()
binz = [
r'^https:\/\/(zerobin||privatebin)\..*$', # historical ones
r'^https:\/\/(zerobin||privatebin)\..*$', # historical ones
]
self.regex = re.compile('|'.join(binz))
@ -59,13 +59,13 @@ class Zerobins(AbstractModule):
if len(matching_binz) > 0:
for bin_url in matching_binz:
print(f'send {bin_url} to crawler')
crawlers.add_crawler_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
parent='manual', priority=10)
# TODO Change priority ???
crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
parent='manual', priority=60)
self.redis_logger.debug("Compute message in queue")
# TODO TEST ME
if __name__ == '__main__':
module = Zerobins()
module.run()

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 51 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 65 KiB

View file

@ -60,7 +60,7 @@ def create_json_response(data, status_code):
@login_read_only
def crawlers_dashboard():
is_manager_connected = crawlers.get_lacus_connection_metadata()
crawlers_status = crawlers.get_crawler_capture_status()
crawlers_status = crawlers.get_captures_status()
print(crawlers_status)
crawlers_latest_stats = crawlers.get_crawlers_stats()
print(crawlers_latest_stats)
@ -75,7 +75,7 @@ def crawlers_dashboard():
@login_required
@login_read_only
def crawler_dashboard_json():
crawlers_status = crawlers.get_crawler_capture_status()
crawlers_status = crawlers.get_captures_status()
crawlers_latest_stats = crawlers.get_crawlers_stats()
return jsonify({'crawlers_status': crawlers_status,
@ -106,7 +106,6 @@ def send_to_spider():
# POST val
url = request.form.get('url_to_crawl')
crawler_type = request.form.get('crawler_queue_type')
proxy = request.form.get('proxy_name')
auto_crawler = request.form.get('crawler_type') # TODO Auto Crawler
crawler_delta = request.form.get('crawler_epoch') # TODO Auto Crawler
screenshot = request.form.get('screenshot')
@ -114,7 +113,13 @@ def send_to_spider():
depth_limit = request.form.get('depth_limit')
cookiejar_uuid = request.form.get('cookiejar')
if crawler_type == 'onion':
# PROXY
proxy = request.form.get('proxy_name')
if proxy:
res = crawlers.api_verify_proxy(proxy)
if res[1] != 200:
return create_json_response(res[0], res[1])
elif crawler_type == 'onion':
proxy = 'force_tor'
if cookiejar_uuid:
@ -129,6 +134,7 @@ def send_to_spider():
data['proxy'] = proxy
if cookiejar_uuid:
data['cookiejar'] = cookiejar_uuid
# print(data)
res = crawlers.api_add_crawler_task(data, user_id=user_id)
if res[1] != 200:
@ -655,36 +661,6 @@ def crawler_cookiejar_cookie_json_add_post():
# --- Cookiejar ---#
@crawler_splash.route('/crawler/settings/crawlers_to_lauch', methods=['GET', 'POST'])
@login_required
@login_admin
def crawler_splash_setings_crawlers_to_lauch():
if request.method == 'POST':
dict_splash_name = {}
for crawler_name in list(request.form):
dict_splash_name[crawler_name] = request.form.get(crawler_name)
res = crawlers.api_set_nb_crawlers_to_launch(dict_splash_name)
if res[1] != 200:
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
else:
return redirect(url_for('crawler_splash.crawler_splash_setings'))
else:
nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch_ui()
return render_template("settings_edit_crawlers_to_launch.html",
nb_crawlers_to_launch=nb_crawlers_to_launch)
@crawler_splash.route('/crawler/settings/relaunch_crawler', methods=['GET'])
@login_required
@login_admin
def crawler_splash_setings_relaunch_crawler():
crawlers.relaunch_crawlers()
return redirect(url_for('crawler_splash.crawler_splash_setings'))
## - - ##
#### LACUS ####
@crawler_splash.route('/crawler/settings', methods=['GET'])
@ -693,6 +669,7 @@ def crawler_splash_setings_relaunch_crawler():
def crawler_settings():
lacus_url = crawlers.get_lacus_url()
api_key = crawlers.get_hidden_lacus_api_key()
nb_captures = crawlers.get_crawler_max_captures()
is_manager_connected = crawlers.get_lacus_connection_metadata(force_ping=True)
is_crawler_working = crawlers.is_test_ail_crawlers_successful()
@ -701,14 +678,13 @@ def crawler_settings():
# TODO REGISTER PROXY
# all_proxies = crawlers.get_all_proxies_metadata()
# nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
# crawler_full_config = Config_DB.get_full_config_by_section('crawler')
return render_template("settings_crawler.html",
is_manager_connected=is_manager_connected,
lacus_url=lacus_url, api_key=api_key,
nb_captures=nb_captures,
# all_proxies=all_proxies,
# nb_crawlers_to_launch=nb_crawlers_to_launch,
is_crawler_working=is_crawler_working,
crawler_error_mess=crawler_error_mess,
)
@ -733,6 +709,22 @@ def crawler_lacus_settings_crawler_manager():
api_key = crawlers.get_lacus_api_key()
return render_template("settings_edit_lacus_crawler.html", lacus_url=lacus_url, api_key=api_key)
@crawler_splash.route('/crawler/settings/crawlers_to_launch', methods=['GET', 'POST'])
@login_required
@login_admin
def crawler_settings_crawlers_to_launch():
if request.method == 'POST':
nb_captures = request.form.get('nb_captures')
res = crawlers.api_set_crawler_max_captures({'nb': nb_captures})
if res[1] != 200:
return create_json_response(res[0], res[1])
else:
return redirect(url_for('crawler_splash.crawler_settings'))
else:
nb_captures = crawlers.get_crawler_max_captures()
return render_template("settings_edit_crawlers_to_launch.html",
nb_captures=nb_captures)
@crawler_splash.route('/crawler/settings/crawler/test', methods=['GET'])
@login_required

View file

@ -1,31 +1,31 @@
{%if not is_manager_connected['status']%}
<div class="alert alert-secondary text-center my-2" role="alert">
<h1><i class="fas fa-times-circle text-danger"></i> Crawler Disabled</h1>
<p>
{%if 'error' in is_manager_connected%}
<b>{{is_manager_connected['status_code']}}</b>
<br>
<b>Error:</b> {{is_manager_connected['error']}}
{%else%}
<b>Error:</b> core/Crawler_manager not launched
{%endif%}
</p>
<div style="max-width: 500px;">
<ul class="list-group my-3">
<li class="list-group-item bg-dark text-white">Splash Manager Features:</li>
<li class="list-group-item">Install and run Splash crawlers on another server</li>
<li class="list-group-item">Handle proxies (Web and tor)</li>
<li class="list-group-item">Launch/Kill Splash Dockers</li>
<li class="list-group-item">Restart crawlers on crash</li>
<li class="list-group-item">
<div class="d-flex justify-content-center">
<a class="btn btn-info" href="https://github.com/ail-project/ail-splash-manager" role="button">
<i class="fab fa-github"></i> Install and Configure AIL-Splash-Manager
</a>
</div>
</li>
</ul>
<div class="alert alert-secondary text-center my-2" role="alert">
<h1><i class="fas fa-times-circle text-danger"></i> Crawler Disabled</h1>
<p>
{%if 'error' in is_manager_connected%}
<b>{{is_manager_connected['status_code']}}</b>
<br>
<b>Error:</b> {{is_manager_connected['error']}}
{%else%}
<b>Error:</b> Lacus not connected
{%endif%}
</p>
<div style="max-width: 500px;">
<ul class="list-group my-3">
<li class="list-group-item bg-dark text-white"><h3>Lacus Features:</h3></li>
<li class="list-group-item">Install and run crawlers on another server</li>
<li class="list-group-item">Handle proxies ( <i class="fab fa-html5"></i> Web and <i class="fas fa-user-secret"></i> tor)</li>
<li class="list-group-item">Multiple Concurrent Captures</li>
<li class="list-group-item">HOW TO</li>
<li class="list-group-item">
<div class="d-flex justify-content-center">
<a class="btn btn-info" href="https://github.com/ail-project/lacus" role="button">
<i class="fab fa-github"></i> Install and Configure Lacus
</a>
</div>
</li>
</ul>
</div>
</div>
</div>
{%endif%}

View file

@ -60,13 +60,8 @@
{%endfor%}
</select>
</div>
<div id="div_proxy_name">
<select class="custom-select form-control" name="proxy_name" id="proxy_name">
<option value="None" selected>Use a proxy</option>
{%for proxy in proxies%}
<option value="{{proxy}}">{{proxy}}</option>
{%endfor%}
</select>
<div class="input-group" id="div_proxy_name">
<input type="text" class="form-control" id="proxy_name" name="proxy_name" placeholder="Expected Format: [scheme]://[username]:[password]@[hostname]:[port]">
</div>
<div class="d-flex mt-3">
<i class="fas fa-user-ninja mt-1"></i> &nbsp;Manual&nbsp;&nbsp;

View file

@ -33,7 +33,7 @@
<h5 class="card-title"><i class="fas fa-cookie-bite"></i> Edit Cookie: {{cookie_uuid}}</h5>
</div>
<div class="col-4">
<a class="btn btn-danger float-right" href="{{ url_for('crawler_splash.crawler_cookiejar_cookie_delete') }}?cookiejar_uuid={{cookiejar_uuid}}&cookie_uuid={{cookie_uuid}}">
<a class="btn btn-danger float-right" href="{{ url_for('crawler_splash.crawler_cookiejar_cookie_delete') }}?uuid={{cookie_uuid}}">
<i class="fas fa-trash-alt"></i>
</a>
</div>

View file

@ -53,7 +53,7 @@
</div>
{% endif %}
</span>
<h4>Splash Crawler Manager</h4>
<h4>AIL Lacus Crawler</h4>
</div>
<div class="card-body">
@ -92,52 +92,43 @@
</div>
</div>
<div class="card border-secondary">
<div class="card-body text-dark">
<h5 class="card-title">All Proxies:</h5>
<table class="table table-striped">
<thead class="bg-info text-white">
<tr>
<th>Proxy name</th>
<th>URL</th>
<th>Crawler Type</th>
<th>Description</th>
<th></th>
</tr>
</thead>
<tbody>
{% for proxy_name in all_proxies %}
<tr>
<td>
{{proxy_name}}
</td>
<td>
{{all_proxies[proxy_name]['url']}}
</td>
<td>
{%if all_proxies[proxy_name]['crawler_type']=='tor'%}
<i class="fas fa-user-secret"></i>
{%else%}
<i class="fab fa-html5"></i>
{%endif%}
{{all_proxies[proxy_name]['crawler_type']}}
</td>
<td>
{{all_proxies[proxy_name]['description']}}
</td>
<td>
<div class="d-flex justify-content-end">
<!-- <button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button> -->
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
{# <div class="card border-secondary">#}
{# <div class="card-body text-dark">#}
{# <h5 class="card-title">All Proxies:</h5>#}
{# <table class="table table-striped">#}
{# <thead class="bg-info text-white">#}
{# <tr>#}
{# <th>Proxy name</th>#}
{# <th>URL</th>#}
{# <th>Description</th>#}
{# <th></th>#}
{# </tr>#}
{# </thead>#}
{# <tbody>#}
{# {% for proxy_name in all_proxies %}#}
{# <tr>#}
{# <td>#}
{# {{proxy_name}}#}
{# </td>#}
{# <td>#}
{# {{all_proxies[proxy_name]['url']}}#}
{# </td>#}
{# <td>#}
{# {{all_proxies[proxy_name]['description']}}#}
{# </td>#}
{# <td>#}
{# <div class="d-flex justify-content-end">#}
{# <!-- <button class="btn btn-outline-dark px-1 py-0">#}
{# <i class="fas fa-pencil-alt"></i>#}
{# </button> -->#}
{# </div>#}
{# </td>#}
{# </tr>#}
{# {% endfor %}#}
{# </tbody>#}
{# </table>#}
{# </div>#}
{# </div>#}
</div>
</div>
@ -176,25 +167,12 @@
<div class="card border-secondary my-4">
<div class="card-body text-dark">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
<tbody>
{%for crawler in nb_crawlers_to_launch%}
<tr>
<td>{{crawler}}</td>
<td>{{nb_crawlers_to_launch[crawler]}}</td>
</tr>
{%endfor%}
</tbody>
</table>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}">
<h5 class="card-title">
Number of Concurrent Crawlers to Launch: &nbsp;&nbsp;<b class="text-primary">{{ nb_captures }}</b>
</h5>
<a href="{{ url_for('crawler_splash.crawler_settings_crawlers_to_launch') }}">
<button type="button" class="btn btn-info">
Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
</button>
</a>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_relaunch_crawler') }}">
<button type="button" class="btn btn-danger">
ReLaunch Crawlers <i class="fas fa-redo"></i>
Edit <i class="fas fa-pencil-alt"></i>
</button>
</a>
</div>

View file

@ -26,22 +26,17 @@
<div class="col-12 col-lg-10" id="core_content">
<form action="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}" method="post" enctype="multipart/form-data">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
<tbody>
{%for crawler_name in nb_crawlers_to_launch%}
<tr>
<td>{{crawler_name}}</td>
<td>
<input class="form-control" type="number" id="{{crawler_name}}" value="{{nb_crawlers_to_launch[crawler_name]}}" min="0" name="{{crawler_name}}" required>
</td>
</tr>
{%endfor%}
</tbody>
</table>
<button type="submit" class="btn btn-primary">Edit <i class="fas fa-pencil-alt"></i></button>
</form>
<div class="card my-2">
<div class="card-header bg-dark text-white">
<form action="{{ url_for('crawler_splash.crawler_settings_crawlers_to_launch') }}" method="post" enctype="multipart/form-data">
<h3 class="card-title">Number of Concurrent Crawlers to Launch:</h3>
<input class="form-control" type="number" id="nb_captures" value="{{ nb_captures }}" min="1" name="nb_captures" required>
<button type="submit" class="btn btn-primary my-2">Edit <i class="fas fa-pencil-alt"></i></button>
</form>
</div>
</div>
</div>
</div>