From f7964fb5e6308dd9604b351b43943273eb6efc26 Mon Sep 17 00:00:00 2001 From: terrtia Date: Thu, 6 Feb 2025 15:10:06 +0100 Subject: [PATCH] chg: [crawler] add option to control whether the crawler should proceed with crawling onion domains that have not yet been classified as safe or unsafe. --- bin/crawlers/Crawler.py | 6 ++- bin/lib/crawlers.py | 41 ++++++++++++++++++- var/www/blueprints/crawler_splash.py | 15 ++++++- .../crawler_splash/settings_crawler.html | 30 +++++++++++++- 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py index a017bf52..e312282b 100755 --- a/bin/crawlers/Crawler.py +++ b/bin/crawlers/Crawler.py @@ -58,6 +58,7 @@ class Crawler(AbstractModule): config_loader = ConfigLoader() self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False) + self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False) self.last_config_check = int(time.time()) self.default_har = config_loader.get_config_boolean('Crawler', 'default_har') @@ -145,6 +146,7 @@ class Crawler(AbstractModule): # Refresh Config if int(time.time()) - self.last_config_check > 60: self.filter_unsafe_onion = crawlers.is_onion_filter_enabled() + self.filter_unknown_onion = crawlers.is_onion_filter_unknown() self.last_config_check = int(time.time()) # Check if a new Capture can be Launched @@ -156,7 +158,7 @@ class Crawler(AbstractModule): if self.filter_unsafe_onion: if domain.endswith('.onion'): try: - if not crawlers.check_if_onion_is_safe(domain): + if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion): # print('---------------------------------------------------------') # print('DOMAIN FILTERED') task.delete() @@ -388,7 +390,7 @@ class Crawler(AbstractModule): # Filter Domain if self.filter_unsafe_onion: if current_domain.endswith('.onion'): - if not crawlers.check_if_onion_is_safe(current_domain, 
unknown=self.filter_unknown_onion): return False # TODO LAST URL diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index dcd01397..4f8805f6 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url): return {'error': f'Timeout Error'} -def check_if_onion_is_safe(onion_url): +def check_if_onion_is_safe(onion_url, unknown): resp = _onion_lookup(onion_url) if resp: if isinstance(resp, dict): @@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url): elif 'error' in resp: if resp['error']: raise OnionFilteringError(resp['error']) + elif not unknown: + if isinstance(resp, list): + if len(resp) > 1: + if resp[1] == 404: + return True return False @@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state): return True return False +# # Crawl Unknown Onion # # +def _is_onion_filter_unknown(): + unknown = r_crawler.hget('crawler:onion_filter', 'unknown') + if unknown is None: + r_crawler.hset('crawler:onion_filter', 'unknown', str(False)) + filter_enabled = False + else: + filter_enabled = unknown == 'True' + r_cache.set('crawler:onion_filter:unknown', str(filter_enabled)) + return filter_enabled + +def is_onion_filter_unknown(cache=True): + if cache: + res = r_cache.get('crawler:onion_filter:unknown') + if res is None: + unknown = _is_onion_filter_unknown() + r_cache.set('crawler:onion_filter:unknown', str(unknown)) + return unknown + else: + return res == 'True' + else: + return _is_onion_filter_unknown() + +def change_onion_filter_unknown_state(new_state): + old_state = is_onion_filter_unknown(cache=False) + if old_state != new_state: + r_crawler.hset('crawler:onion_filter', 'unknown', str(new_state)) + r_cache.set('crawler:onion_filter:unknown', str(new_state)) + update_time = time.time() + r_crawler.hset('crawler:onion_filter', 'update_time', update_time) + r_cache.set('crawler:onion_filter:last_update_time', update_time) + return True + return False + #### ---- #### diff --git 
a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index b21fb8ff..74084536 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -997,6 +997,7 @@ def crawler_settings(): crawler_error_mess = crawlers.get_test_ail_crawlers_message() is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False) + is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False) # TODO REGISTER PROXY # all_proxies = crawlers.get_all_proxies_metadata() @@ -1011,6 +1012,7 @@ def crawler_settings(): is_crawler_working=is_crawler_working, crawler_error_mess=crawler_error_mess, is_onion_filter_enabled=is_onion_filter_enabled, + is_onion_filter_unknown=is_onion_filter_unknown ) @@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion(): filter_unsafe_onion = True else: filter_unsafe_onion = False - print(filter_unsafe_onion) crawlers.change_onion_filter_state(filter_unsafe_onion) return redirect(url_for('crawler_splash.crawler_settings')) +@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET']) +@login_required +@login_admin +def crawler_filter_unknown_onion(): + filter_unknown_onion = request.args.get('state') + if filter_unknown_onion == 'enable': + filter_unknown_onion = True + else: + filter_unknown_onion = False + crawlers.change_onion_filter_unknown_state(filter_unknown_onion) + return redirect(url_for('crawler_splash.crawler_settings')) + # --- LACUS ---# diff --git a/var/www/templates/crawler/crawler_splash/settings_crawler.html b/var/www/templates/crawler/crawler_splash/settings_crawler.html index 7577ff55..8c1aa6ee 100644 --- a/var/www/templates/crawler/crawler_splash/settings_crawler.html +++ b/var/www/templates/crawler/crawler_splash/settings_crawler.html @@ -243,7 +243,7 @@

{% if is_onion_filter_enabled %} - @@ -254,6 +254,34 @@ {% endif %} + +
+ +
+ Crawl Unknown Onion:   {% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %} +
+

This option controls whether the crawler should proceed with crawling onion domains that have not yet been classified as safe or unsafe.

+ + + +

This option is useful for users who want to explore uncharted domains while still benefiting from the filter_unsafe_onion protection. However, enabling this option increases the likelihood of encountering harmful content, so caution is advised.

+ {% if is_onion_filter_unknown %} + + + + {% else %} + + + + {% endif %} +