mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-02-14 13:26:24 +00:00
chg: [crawler] add option to control whether the crawler should proceed with crawling onion domains that have not yet been classified as safe or unsafe.
This commit is contained in:
parent
f01cfe70be
commit
f7964fb5e6
4 changed files with 87 additions and 5 deletions
|
@ -58,6 +58,7 @@ class Crawler(AbstractModule):
|
|||
config_loader = ConfigLoader()
|
||||
|
||||
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
|
||||
self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False)
|
||||
self.last_config_check = int(time.time())
|
||||
|
||||
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
|
||||
|
@ -145,6 +146,7 @@ class Crawler(AbstractModule):
|
|||
# Refresh Config
|
||||
if int(time.time()) - self.last_config_check > 60:
|
||||
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
|
||||
self.filter_unknown_onion = crawlers.is_onion_filter_unknown()
|
||||
self.last_config_check = int(time.time())
|
||||
|
||||
# Check if a new Capture can be Launched
|
||||
|
@ -156,7 +158,7 @@ class Crawler(AbstractModule):
|
|||
if self.filter_unsafe_onion:
|
||||
if domain.endswith('.onion'):
|
||||
try:
|
||||
if not crawlers.check_if_onion_is_safe(domain):
|
||||
if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion):
|
||||
# print('---------------------------------------------------------')
|
||||
# print('DOMAIN FILTERED')
|
||||
task.delete()
|
||||
|
@ -388,7 +390,7 @@ class Crawler(AbstractModule):
|
|||
# Filter Domain
|
||||
if self.filter_unsafe_onion:
|
||||
if current_domain.endswith('.onion'):
|
||||
if not crawlers.check_if_onion_is_safe(current_domain):
|
||||
if not crawlers.check_if_onion_is_safe(current_domain, unknown=self.filter_unknown_onion):
|
||||
return False
|
||||
|
||||
# TODO LAST URL
|
||||
|
|
|
@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url):
|
|||
return {'error': f'Timeout Error'}
|
||||
|
||||
|
||||
def check_if_onion_is_safe(onion_url):
|
||||
def check_if_onion_is_safe(onion_url, unknown):
|
||||
resp = _onion_lookup(onion_url)
|
||||
if resp:
|
||||
if isinstance(resp, dict):
|
||||
|
@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url):
|
|||
elif 'error' in resp:
|
||||
if resp['error']:
|
||||
raise OnionFilteringError(resp['error'])
|
||||
elif not unknown:
|
||||
if isinstance(resp, list):
|
||||
if len(resp) > 1:
|
||||
if resp[1] == 404:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state):
|
|||
return True
|
||||
return False
|
||||
|
||||
# # Crawl Unknown Onion # #
|
||||
def _is_onion_filter_unknown():
    """Read the 'unknown' onion filter flag from the persistent store.

    Looks up the 'unknown' field of the 'crawler:onion_filter' hash in
    r_crawler. When the field is missing it is initialised to 'False'.
    The resolved state is then mirrored into r_cache under
    'crawler:onion_filter:unknown'.

    :return: True if the stored value is the string 'True', else False
    """
    stored = r_crawler.hget('crawler:onion_filter', 'unknown')
    if stored is not None:
        # Values are stored as str(bool), so compare against the literal.
        filter_enabled = stored == 'True'
    else:
        # First access: persist the default (filter off).
        r_crawler.hset('crawler:onion_filter', 'unknown', str(False))
        filter_enabled = False
    r_cache.set('crawler:onion_filter:unknown', str(filter_enabled))
    return filter_enabled
|
||||
|
||||
def is_onion_filter_unknown(cache=True):
    """Return whether the 'unknown onion' filter is enabled.

    :param cache: when True, answer from r_cache if a value is present and
                  fall back to the persistent store otherwise; when False,
                  always read the persistent store.
    :return: bool state of the filter
    """
    if not cache:
        return _is_onion_filter_unknown()

    cached = r_cache.get('crawler:onion_filter:unknown')
    if cached is not None:
        return cached == 'True'

    # Cache miss: resolve from the persistent store and refresh the cache.
    state = _is_onion_filter_unknown()
    r_cache.set('crawler:onion_filter:unknown', str(state))
    return state
|
||||
|
||||
def change_onion_filter_unknown_state(new_state):
    """Set the 'unknown onion' filter flag if it differs from the stored one.

    On change, the new state and the update timestamp are written to both
    the 'crawler:onion_filter' hash in r_crawler and the matching r_cache
    keys.

    :param new_state: desired filter state (compared against the stored bool)
    :return: True if the state was changed, False if it was already set
    """
    if is_onion_filter_unknown(cache=False) == new_state:
        # Nothing to do: leave the stored value and timestamps untouched.
        return False

    r_crawler.hset('crawler:onion_filter', 'unknown', str(new_state))
    r_cache.set('crawler:onion_filter:unknown', str(new_state))

    # Record when the filter configuration last changed.
    now = time.time()
    r_crawler.hset('crawler:onion_filter', 'update_time', now)
    r_cache.set('crawler:onion_filter:last_update_time', now)
    return True
|
||||
|
||||
#### ---- ####
|
||||
|
||||
|
||||
|
|
|
@ -997,6 +997,7 @@ def crawler_settings():
|
|||
crawler_error_mess = crawlers.get_test_ail_crawlers_message()
|
||||
|
||||
is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
|
||||
is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)
|
||||
|
||||
# TODO REGISTER PROXY
|
||||
# all_proxies = crawlers.get_all_proxies_metadata()
|
||||
|
@ -1011,6 +1012,7 @@ def crawler_settings():
|
|||
is_crawler_working=is_crawler_working,
|
||||
crawler_error_mess=crawler_error_mess,
|
||||
is_onion_filter_enabled=is_onion_filter_enabled,
|
||||
is_onion_filter_unknown=is_onion_filter_unknown
|
||||
)
|
||||
|
||||
|
||||
|
@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion():
|
|||
filter_unsafe_onion = True
|
||||
else:
|
||||
filter_unsafe_onion = False
|
||||
print(filter_unsafe_onion)
|
||||
crawlers.change_onion_filter_state(filter_unsafe_onion)
|
||||
return redirect(url_for('crawler_splash.crawler_settings'))
|
||||
|
||||
@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET'])
@login_required
@login_admin
def crawler_filter_unknown_onion():
    """Toggle the 'unknown onion' filter from the crawler settings page.

    Reads the 'state' query parameter: the value 'enable' turns the filter
    on, anything else (including a missing parameter) turns it off. Then
    redirects back to the crawler settings view.
    """
    enable_filter = request.args.get('state') == 'enable'
    crawlers.change_onion_filter_unknown_state(enable_filter)
    return redirect(url_for('crawler_splash.crawler_settings'))
|
||||
|
||||
|
||||
# --- LACUS ---#
|
||||
|
|
|
@ -243,7 +243,7 @@
|
|||
</p>
|
||||
{% if is_onion_filter_enabled %}
|
||||
<a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
|
||||
<button class="btn btn-danger mx-4 my-2">
|
||||
<button class="btn btn-danger my-2">
|
||||
<i class="fa-solid fa-xmark"></i> Disable Onion Filter
|
||||
</button>
|
||||
</a>
|
||||
|
@ -254,6 +254,34 @@
|
|||
</button>
|
||||
</a>
|
||||
{% endif %}
|
||||
|
||||
<hr class="border-1 my-4">
|
||||
|
||||
<h5 class="card-title">
|
||||
Unknown Onion Filter: <b class="text-primary"><span class="text-{% if is_onion_filter_unknown %}success{% else %}secondary{% endif %}">{% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %}</span></b>
|
||||
</h5>
|
||||
<p>This option controls whether the crawler should proceed with crawling onion domains that have <strong>not yet been classified</strong> as safe or unsafe.</p>
|
||||
|
||||
<ul>
|
||||
<li><strong>If disabled:</strong> The crawler will process domains that have never been checked, potentially discovering new useful content but also increasing the risk of encountering unsafe materials.</li>
|
||||
<li><strong>If enabled:</strong> The crawler will only process domains that have been explicitly identified as safe, reducing risk but potentially missing new, unclassified domains.</li>
|
||||
</ul>
|
||||
|
||||
<p>Leaving this filter disabled is useful for users who want to explore uncharted domains while still benefiting from the <code>filter_unsafe_onion</code> protection. However, doing so increases the likelihood of encountering harmful content, so caution is advised.</p>
|
||||
{% if is_onion_filter_unknown %}
|
||||
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=disable">
|
||||
<button class="btn btn-secondary my-2">
|
||||
<i class="fa-solid fa-xmark"></i> Disable Unknown Onion Filter
|
||||
</button>
|
||||
</a>
|
||||
{% else %}
|
||||
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=enable">
|
||||
<button class="btn btn-info my-2">
|
||||
<i class="fa-solid fa-check"></i> Enable Unknown Onion Filter
|
||||
</button>
|
||||
</a>
|
||||
{% endif %}
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue