From f7964fb5e6308dd9604b351b43943273eb6efc26 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Thu, 6 Feb 2025 15:10:06 +0100
Subject: [PATCH] chg: [crawler] add option to control whether the crawler
should crawl onion domains that have not yet been classified as safe or
unsafe
---
 bin/crawlers/Crawler.py                   |  6 ++-
 bin/lib/crawlers.py                       | 41 ++++++++++++++++++-
 var/www/blueprints/crawler_splash.py      | 15 ++++++-
 .../crawler_splash/settings_crawler.html  | 30 +++++++++++++-
4 files changed, 87 insertions(+), 5 deletions(-)
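The core of this patch is the new `unknown` parameter of `check_if_onion_is_safe()`: when the unknown-onion filter is off, a 404 from the lookup service (a domain that has never been classified) is treated as crawlable. A minimal stand-alone sketch of that decision, with a simplified `is_safe()` standing in for the real function and illustrative return shapes for `_onion_lookup()`:

```python
# Sketch of the decision added in bin/lib/crawlers.py (simplified, not AIL code).
# Per this patch, _onion_lookup() may return:
#   - a dict (a classified onion, or {'error': ...} on lookup failure),
#   - a list whose second element is an HTTP status, 404 meaning "never classified".

def is_safe(lookup_result, filter_unknown):
    """Return True if the onion may be crawled."""
    if isinstance(lookup_result, dict):
        if lookup_result.get('error'):
            raise RuntimeError(lookup_result['error'])  # OnionFilteringError in AIL
        return False  # handling of classified onions is elided in the hunk below
    if not filter_unknown and isinstance(lookup_result, list):
        if len(lookup_result) > 1 and lookup_result[1] == 404:
            return True  # unknown onion, and the unknown filter is off
    return False

assert is_safe(['not found', 404], filter_unknown=False) is True
assert is_safe(['not found', 404], filter_unknown=True) is False
```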
diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index a017bf52..e312282b 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -58,6 +58,7 @@ class Crawler(AbstractModule):
config_loader = ConfigLoader()
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
+ self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False)
self.last_config_check = int(time.time())
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@@ -145,6 +146,7 @@ class Crawler(AbstractModule):
# Refresh Config
if int(time.time()) - self.last_config_check > 60:
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
+ self.filter_unknown_onion = crawlers.is_onion_filter_unknown()
self.last_config_check = int(time.time())
# Check if a new Capture can be Launched
@@ -156,7 +158,7 @@ class Crawler(AbstractModule):
if self.filter_unsafe_onion:
if domain.endswith('.onion'):
try:
- if not crawlers.check_if_onion_is_safe(domain):
+ if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion):
# print('---------------------------------------------------------')
# print('DOMAIN FILTERED')
task.delete()
@@ -388,7 +390,7 @@ class Crawler(AbstractModule):
# Filter Domain
if self.filter_unsafe_onion:
if current_domain.endswith('.onion'):
- if not crawlers.check_if_onion_is_safe(current_domain):
+ if not crawlers.check_if_onion_is_safe(current_domain, unknown=self.filter_unknown_onion):
return False
# TODO LAST URL
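Crawler.py re-reads both filter flags at most once a minute instead of hitting Redis on every loop iteration. The same pattern, extracted into a generic hedged sketch (the `Refresher` name is illustrative; AIL inlines the timestamp check):

```python
import time

class Refresher:
    """Re-read a flag at most once every `interval` seconds."""

    def __init__(self, loader, interval=60):
        self.loader = loader        # e.g. crawlers.is_onion_filter_unknown
        self.interval = interval
        self.value = loader()
        self.last_check = int(time.time())

    def get(self):
        # Only call the loader again once the refresh interval has elapsed.
        if int(time.time()) - self.last_check > self.interval:
            self.value = self.loader()
            self.last_check = int(time.time())
        return self.value

flag = Refresher(lambda: True)  # stand-in loader for the sketch
print(flag.get())
```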
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index dcd01397..4f8805f6 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url):
return {'error': f'Timeout Error'}
-def check_if_onion_is_safe(onion_url):
+def check_if_onion_is_safe(onion_url, unknown):
resp = _onion_lookup(onion_url)
if resp:
if isinstance(resp, dict):
@@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url):
elif 'error' in resp:
if resp['error']:
raise OnionFilteringError(resp['error'])
+ elif not unknown:
+ if isinstance(resp, list):
+ if len(resp) > 1:
+ if resp[1] == 404:
+ return True
return False
@@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state):
return True
return False
+# # Crawl Unknown Onion # #
+def _is_onion_filter_unknown():
+ unknown = r_crawler.hget('crawler:onion_filter', 'unknown')
+ if unknown is None:
+ r_crawler.hset('crawler:onion_filter', 'unknown', str(False))
+ filter_enabled = False
+ else:
+ filter_enabled = unknown == 'True'
+ r_cache.set('crawler:onion_filter:unknown', str(filter_enabled))
+ return filter_enabled
+
+def is_onion_filter_unknown(cache=True):
+ if cache:
+ res = r_cache.get('crawler:onion_filter:unknown')
+ if res is None:
+ unknown = _is_onion_filter_unknown()
+ r_cache.set('crawler:onion_filter:unknown', str(unknown))
+ return unknown
+ else:
+ return res == 'True'
+ else:
+ return _is_onion_filter_unknown()
+
+def change_onion_filter_unknown_state(new_state):
+ old_state = is_onion_filter_unknown(cache=False)
+ if old_state != new_state:
+ r_crawler.hset('crawler:onion_filter', 'unknown', str(new_state))
+ r_cache.set('crawler:onion_filter:unknown', str(new_state))
+ update_time = time.time()
+ r_crawler.hset('crawler:onion_filter', 'update_time', update_time)
+ r_cache.set('crawler:onion_filter:last_update_time', update_time)
+ return True
+ return False
+
#### ---- ####
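The new helpers reuse the existing onion-filter storage pattern: the authoritative flag lives in the persistent hash `crawler:onion_filter` (field `unknown`), is mirrored as a plain string key in the cache DB, and an `update_time` is recorded on every change. A self-contained redis-py sketch of the same pattern (the connection setup is illustrative; AIL wires `r_crawler`/`r_cache` through its ConfigLoader):

```python
import time
import redis

r_db = redis.Redis(db=0, decode_responses=True)     # persistent DB (r_crawler in AIL)
r_cache = redis.Redis(db=1, decode_responses=True)  # cache DB (r_cache in AIL)

def get_flag(cache=True):
    if cache:
        res = r_cache.get('crawler:onion_filter:unknown')
        if res is not None:
            return res == 'True'
    raw = r_db.hget('crawler:onion_filter', 'unknown')
    if raw is None:  # first read ever: initialize the field to False
        r_db.hset('crawler:onion_filter', 'unknown', str(False))
        raw = 'False'
    enabled = raw == 'True'
    r_cache.set('crawler:onion_filter:unknown', str(enabled))
    return enabled

def set_flag(new_state):
    if get_flag(cache=False) == new_state:
        return False  # nothing to do
    r_db.hset('crawler:onion_filter', 'unknown', str(new_state))
    r_cache.set('crawler:onion_filter:unknown', str(new_state))
    now = time.time()
    r_db.hset('crawler:onion_filter', 'update_time', now)
    r_cache.set('crawler:onion_filter:last_update_time', now)
    return True
```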
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index b21fb8ff..74084536 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -997,6 +997,7 @@ def crawler_settings():
crawler_error_mess = crawlers.get_test_ail_crawlers_message()
is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
+ is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)
# TODO REGISTER PROXY
# all_proxies = crawlers.get_all_proxies_metadata()
@@ -1011,6 +1012,7 @@ def crawler_settings():
is_crawler_working=is_crawler_working,
crawler_error_mess=crawler_error_mess,
is_onion_filter_enabled=is_onion_filter_enabled,
+ is_onion_filter_unknown=is_onion_filter_unknown
)
@@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion():
filter_unsafe_onion = True
else:
filter_unsafe_onion = False
- print(filter_unsafe_onion)
crawlers.change_onion_filter_state(filter_unsafe_onion)
return redirect(url_for('crawler_splash.crawler_settings'))
+@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET'])
+@login_required
+@login_admin
+def crawler_filter_unknown_onion():
+ filter_unknown_onion = request.args.get('state')
+ if filter_unknown_onion == 'enable':
+ filter_unknown_onion = True
+ else:
+ filter_unknown_onion = False
+ crawlers.change_onion_filter_unknown_state(filter_unknown_onion)
+ return redirect(url_for('crawler_splash.crawler_settings'))
+
# --- LACUS ---#
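The new admin route mirrors `crawler_filter_unsafe_onion`: a GET with `state=enable` turns the filter on, and any other value turns it off. An illustrative client-side call (host, port, and an already-authenticated admin session are assumptions):

```python
import requests

session = requests.Session()  # must already carry an authenticated admin cookie
session.get(
    'https://127.0.0.1:7000/crawler/settings/crawler/filter_unknown_onion',
    params={'state': 'enable'},  # any value other than 'enable' disables the filter
    verify=False,                # AIL instances often use a self-signed certificate
)
```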
diff --git a/var/www/templates/crawler/crawler_splash/settings_crawler.html b/var/www/templates/crawler/crawler_splash/settings_crawler.html
index 7577ff55..8c1aa6ee 100644
--- a/var/www/templates/crawler/crawler_splash/settings_crawler.html
+++ b/var/www/templates/crawler/crawler_splash/settings_crawler.html
@@ -243,7 +243,7 @@
{% if is_onion_filter_enabled %}
-
+
Disable Onion Filter
@@ -254,6 +254,34 @@
{% endif %}
+
+
+
+
+ Unknown Onion Filter: {% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %}
+
+ This option controls whether the crawler should crawl onion domains that have not yet been classified as safe or unsafe.
+
+
+ If disabled: The crawler will process domains that have never been checked, potentially discovering new useful content but also increasing the risk of encountering unsafe materials.
+ If enabled: The crawler will only process domains that have been explicitly identified as safe, reducing risk but potentially missing new, unclassified domains.
+
+
+ Disabling this filter is useful for users who want to explore uncharted domains while still benefiting from the filter_unsafe_onion
+ protection. However, doing so increases the likelihood of encountering harmful content, so caution is advised.
+ {% if is_onion_filter_unknown %}
+	<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=disable">
+		Disable Unknown Onion Filter
+	</a>
+ {% else %}
+	<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=enable">
+		Enable Unknown Onion Filter
+	</a>
+ {% endif %}
+
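Taken together, the two settings give the following behaviour for .onion capture tasks; a compact truth-table sketch (the `should_crawl` helper and its string labels are illustrative, not AIL code):

```python
def should_crawl(filter_unsafe, filter_unknown, classification):
    """classification: 'safe', 'unsafe' or 'unknown' (illustrative labels)."""
    if not filter_unsafe:
        return True  # no lookup at all: everything is crawled
    if classification == 'safe':
        return True
    if classification == 'unknown':
        return not filter_unknown  # the new flag only affects unclassified onions
    return False  # classified unsafe: always dropped

for unknown_flag in (False, True):
    print(unknown_flag, should_crawl(True, unknown_flag, 'unknown'))
# False True  -> unknown onions crawled when the unknown filter is off
# True False  -> unknown onions dropped when the unknown filter is on
```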