chg: [crawler] add option to control whether the crawler should proceed with crawling onion domains that have not yet been classified as safe or unsafe.

This commit is contained in:
terrtia 2025-02-06 15:10:06 +01:00
parent f01cfe70be
commit f7964fb5e6
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 87 additions and 5 deletions

View file

@ -58,6 +58,7 @@ class Crawler(AbstractModule):
config_loader = ConfigLoader()
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False)
self.last_config_check = int(time.time())
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@ -145,6 +146,7 @@ class Crawler(AbstractModule):
# Refresh Config
if int(time.time()) - self.last_config_check > 60:
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
self.filter_unknown_onion = crawlers.is_onion_filter_unknown()
self.last_config_check = int(time.time())
# Check if a new Capture can be Launched
@ -156,7 +158,7 @@ class Crawler(AbstractModule):
if self.filter_unsafe_onion:
if domain.endswith('.onion'):
try:
if not crawlers.check_if_onion_is_safe(domain):
if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion):
# print('---------------------------------------------------------')
# print('DOMAIN FILTERED')
task.delete()
@ -388,7 +390,7 @@ class Crawler(AbstractModule):
# Filter Domain
if self.filter_unsafe_onion:
if current_domain.endswith('.onion'):
if not crawlers.check_if_onion_is_safe(current_domain):
if not crawlers.check_if_onion_is_safe(current_domain, unknown=self.filter_unknown_onion):
return False
# TODO LAST URL

View file

@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url):
return {'error': f'Timeout Error'}
def check_if_onion_is_safe(onion_url):
def check_if_onion_is_safe(onion_url, unknown):
resp = _onion_lookup(onion_url)
if resp:
if isinstance(resp, dict):
@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url):
elif 'error' in resp:
if resp['error']:
raise OnionFilteringError(resp['error'])
elif not unknown:
if isinstance(resp, list):
if len(resp) > 1:
if resp[1] == 404:
return True
return False
@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state):
return True
return False
# # Crawl Unknown Onion # #
def _is_onion_filter_unknown():
    """Read the 'unknown' onion-filter flag from the DB (bypassing the cache).

    On first access the field does not exist yet: it is initialized to
    'False' (filter disabled). The cached copy is refreshed on every call.
    Returns the flag as a bool.
    """
    raw_state = r_crawler.hget('crawler:onion_filter', 'unknown')
    if raw_state is not None:
        enabled = raw_state == 'True'
    else:
        # First access: persist the default (disabled) state in the DB.
        r_crawler.hset('crawler:onion_filter', 'unknown', str(False))
        enabled = False
    r_cache.set('crawler:onion_filter:unknown', str(enabled))
    return enabled
def is_onion_filter_unknown(cache=True):
    """Return True if the 'unknown' onion filter is enabled.

    :param cache: when True (default) serve the cached value if present;
                  when False force a DB read (which also refreshes the cache).
    """
    if not cache:
        return _is_onion_filter_unknown()
    cached = r_cache.get('crawler:onion_filter:unknown')
    if cached is not None:
        return cached == 'True'
    # Cache miss: rebuild from the DB and repopulate the cache.
    state = _is_onion_filter_unknown()
    r_cache.set('crawler:onion_filter:unknown', str(state))
    return state
def change_onion_filter_unknown_state(new_state):
    """Persist a new state for the 'unknown' onion filter.

    Updates both the DB and the cache, and records the update timestamp
    so workers can detect the config change.

    :param new_state: desired bool state of the filter
    :return: True if the state actually changed, False if already `new_state`
    """
    if is_onion_filter_unknown(cache=False) == new_state:
        return False
    r_crawler.hset('crawler:onion_filter', 'unknown', str(new_state))
    r_cache.set('crawler:onion_filter:unknown', str(new_state))
    now = time.time()
    r_crawler.hset('crawler:onion_filter', 'update_time', now)
    r_cache.set('crawler:onion_filter:last_update_time', now)
    return True
#### ---- ####

View file

@ -997,6 +997,7 @@ def crawler_settings():
crawler_error_mess = crawlers.get_test_ail_crawlers_message()
is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)
# TODO REGISTER PROXY
# all_proxies = crawlers.get_all_proxies_metadata()
@ -1011,6 +1012,7 @@ def crawler_settings():
is_crawler_working=is_crawler_working,
crawler_error_mess=crawler_error_mess,
is_onion_filter_enabled=is_onion_filter_enabled,
is_onion_filter_unknown=is_onion_filter_unknown
)
@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion():
filter_unsafe_onion = True
else:
filter_unsafe_onion = False
print(filter_unsafe_onion)
crawlers.change_onion_filter_state(filter_unsafe_onion)
return redirect(url_for('crawler_splash.crawler_settings'))
@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET'])
@login_required
@login_admin
def crawler_filter_unknown_onion():
    """Admin endpoint: toggle the 'unknown' onion filter via ?state=enable|disable.

    Any value other than 'enable' (including a missing parameter) disables
    the filter. Redirects back to the crawler settings page.
    """
    new_state = request.args.get('state') == 'enable'
    crawlers.change_onion_filter_unknown_state(new_state)
    return redirect(url_for('crawler_splash.crawler_settings'))
# --- LACUS ---#

View file

@ -243,7 +243,7 @@
</p>
{% if is_onion_filter_enabled %}
<a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
<button class="btn btn-danger mx-4 my-2">
<button class="btn btn-danger my-2">
<i class="fa-solid fa-xmark"></i> Disable Onion Filter
</button>
</a>
@ -254,6 +254,34 @@
</button>
</a>
{% endif %}
<hr class="border-1 my-4">
<h5 class="card-title">
Unknown Onion Filter: &nbsp;&nbsp;<b class="text-primary"><span class="text-{% if is_onion_filter_unknown %}success{% else %}secondary{% endif %}">{% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %}</span></b>
</h5>
<p>This option controls whether the crawler should proceed with crawling onion domains that have <strong>not yet been classified</strong> as safe or unsafe.</p>
<ul>
<li><strong>If disabled:</strong> The crawler will process domains that have never been checked, potentially discovering new useful content but also increasing the risk of encountering unsafe materials.</li>
<li><strong>If enabled:</strong> The crawler will only process domains that have been explicitly identified as safe, reducing risk but potentially missing new, unclassified domains.</li>
</ul>
<p>This option is useful for users who want to explore uncharted domains while still benefiting from the <code>filter_unsafe_onion</code> protection. However, enabling this option increases the likelihood of encountering harmful content, so caution is advised.</p>
{% if is_onion_filter_unknown %}
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=disable">
<button class="btn btn-secondary my-2">
<i class="fa-solid fa-xmark"></i> Disable Unknown Onion Filter
</button>
</a>
{% else %}
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=enable">
<button class="btn btn-info my-2">
<i class="fa-solid fa-check"></i> Enable Unknown Onion Filter
</button>
</a>
{% endif %}
</div>
</div>