Mirror of https://github.com/ail-project/ail-framework.git (synced 2025-02-15 05:46:22 +00:00)
chg: [crawler] add option to control whether the crawler should proceed with crawling onion domains that have not yet been classified as safe or unsafe.
commit f7964fb5e6 (parent f01cfe70be)
4 changed files with 87 additions and 5 deletions
@@ -58,6 +58,7 @@ class Crawler(AbstractModule):
         config_loader = ConfigLoader()
 
         self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
+        self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False)
         self.last_config_check = int(time.time())
 
         self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@@ -145,6 +146,7 @@ class Crawler(AbstractModule):
         # Refresh Config
         if int(time.time()) - self.last_config_check > 60:
             self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
+            self.filter_unknown_onion = crawlers.is_onion_filter_unknown()
             self.last_config_check = int(time.time())
 
         # Check if a new Capture can be Launched
@@ -156,7 +158,7 @@ class Crawler(AbstractModule):
         if self.filter_unsafe_onion:
             if domain.endswith('.onion'):
                 try:
-                    if not crawlers.check_if_onion_is_safe(domain):
+                    if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion):
                         # print('---------------------------------------------------------')
                         # print('DOMAIN FILTERED')
                         task.delete()
@@ -388,7 +390,7 @@ class Crawler(AbstractModule):
         # Filter Domain
         if self.filter_unsafe_onion:
             if current_domain.endswith('.onion'):
-                if not crawlers.check_if_onion_is_safe(current_domain):
+                if not crawlers.check_if_onion_is_safe(current_domain, unknown=self.filter_unknown_onion):
                     return False
 
         # TODO LAST URL
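Taken together, the two call sites above apply the same gate both before a queued capture is launched and before a discovered link is followed. A condensed sketch of that gate is below; the wrapper function and its except branch are illustrative, only the inner check comes from the diff:

    # Condensed view of the crawler-side gate. check_if_onion_is_safe() and the
    # two filter flags come from the diff; the wrapper itself is illustrative.
    from lib import crawlers  # AIL's crawler helper module

    def should_capture(domain, filter_unsafe_onion, filter_unknown_onion):
        if filter_unsafe_onion and domain.endswith('.onion'):
            try:
                if not crawlers.check_if_onion_is_safe(domain, unknown=filter_unknown_onion):
                    return False  # task.delete() at the first call site, return False at the second
            except crawlers.OnionFilteringError:
                return False  # assumption: a failing lookup is treated as unsafe
        return True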
@@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url):
             return {'error': f'Timeout Error'}
 
 
-def check_if_onion_is_safe(onion_url):
+def check_if_onion_is_safe(onion_url, unknown):
     resp = _onion_lookup(onion_url)
     if resp:
         if isinstance(resp, dict):
@@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url):
             elif 'error' in resp:
                 if resp['error']:
                     raise OnionFilteringError(resp['error'])
+        elif not unknown:
+            if isinstance(resp, list):
+                if len(resp) > 1:
+                    if resp[1] == 404:
+                        return True
     return False
 
 
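The new elif branch only fires when the lookup response is truthy but not a dict classification. A self-contained illustration of its behaviour, assuming (as the branch structure suggests, this is not documented in the diff) that the lookup service returns a [body, 404] list for onions it has never classified:

    # Stand-in for the new branch in check_if_onion_is_safe(); lookup_resp
    # plays the role of _onion_lookup()'s return value (shape inferred).
    def is_safe(lookup_resp, unknown):
        if lookup_resp:
            if isinstance(lookup_resp, dict):
                if lookup_resp.get('error'):
                    raise RuntimeError(lookup_resp['error'])  # OnionFilteringError in AIL
            elif not unknown:
                # Unclassified onion: the lookup service answered 404.
                if isinstance(lookup_resp, list) and len(lookup_resp) > 1 and lookup_resp[1] == 404:
                    return True
        return False

    assert is_safe(['Not Found', 404], unknown=False) is True   # never classified: crawl it
    assert is_safe(['Not Found', 404], unknown=True) is False   # never classified: filter it
    assert is_safe(None, unknown=False) is False                # no answer at all: filter it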
@@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state):
         return True
     return False
 
 
+# # Crawl Unknown Onion # #
+
+def _is_onion_filter_unknown():
+    unknown = r_crawler.hget('crawler:onion_filter', 'unknown')
+    if unknown is None:
+        r_crawler.hset('crawler:onion_filter', 'unknown', str(False))
+        filter_enabled = False
+    else:
+        filter_enabled = unknown == 'True'
+    r_cache.set('crawler:onion_filter:unknown', str(filter_enabled))
+    return filter_enabled
+
+
+def is_onion_filter_unknown(cache=True):
+    if cache:
+        res = r_cache.get('crawler:onion_filter:unknown')
+        if res is None:
+            unknown = _is_onion_filter_unknown()
+            r_cache.set('crawler:onion_filter:unknown', str(unknown))
+            return unknown
+        else:
+            return res == 'True'
+    else:
+        return _is_onion_filter_unknown()
+
+
+def change_onion_filter_unknown_state(new_state):
+    old_state = is_onion_filter_unknown(cache=False)
+    if old_state != new_state:
+        r_crawler.hset('crawler:onion_filter', 'unknown', str(new_state))
+        r_cache.set('crawler:onion_filter:unknown', str(new_state))
+        update_time = time.time()
+        r_crawler.hset('crawler:onion_filter', 'update_time', update_time)
+        r_cache.set('crawler:onion_filter:last_update_time', update_time)
+        return True
+    return False
 
 
 #### ---- ####
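The three helpers above use the same dual-store pattern as the existing unsafe-onion filter: the authoritative flag lives in the crawler:onion_filter hash, a hot copy lives under the crawler:onion_filter:unknown cache key, and both are stored as the strings 'True'/'False' since Redis has no boolean type. A minimal standalone sketch of that pattern with redis-py; the connection parameters are placeholders, AIL wires its r_crawler and r_cache handles up through its ConfigLoader:

    import time
    import redis

    # Placeholder connections; in AIL, r_crawler and r_cache are separate stores.
    r_crawler = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
    r_cache = redis.Redis(host='localhost', port=6379, db=1, decode_responses=True)

    def set_unknown_filter(enabled: bool) -> None:
        # Redis stores strings, so the boolean round-trips via str() / == 'True'.
        r_crawler.hset('crawler:onion_filter', 'unknown', str(enabled))
        r_cache.set('crawler:onion_filter:unknown', str(enabled))
        now = time.time()
        r_crawler.hset('crawler:onion_filter', 'update_time', now)
        r_cache.set('crawler:onion_filter:last_update_time', now)

    def get_unknown_filter() -> bool:
        cached = r_cache.get('crawler:onion_filter:unknown')
        if cached is not None:
            return cached == 'True'
        return r_crawler.hget('crawler:onion_filter', 'unknown') == 'True'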
@@ -997,6 +997,7 @@ def crawler_settings():
     crawler_error_mess = crawlers.get_test_ail_crawlers_message()
 
     is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
+    is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)
 
     # TODO REGISTER PROXY
     # all_proxies = crawlers.get_all_proxies_metadata()
@@ -1011,6 +1012,7 @@ def crawler_settings():
                            is_crawler_working=is_crawler_working,
                            crawler_error_mess=crawler_error_mess,
                            is_onion_filter_enabled=is_onion_filter_enabled,
+                           is_onion_filter_unknown=is_onion_filter_unknown
                            )
 
 
@@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion():
         filter_unsafe_onion = True
     else:
         filter_unsafe_onion = False
-    print(filter_unsafe_onion)
     crawlers.change_onion_filter_state(filter_unsafe_onion)
     return redirect(url_for('crawler_splash.crawler_settings'))
 
 
+@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET'])
+@login_required
+@login_admin
+def crawler_filter_unknown_onion():
+    filter_unknown_onion = request.args.get('state')
+    if filter_unknown_onion == 'enable':
+        filter_unknown_onion = True
+    else:
+        filter_unknown_onion = False
+    crawlers.change_onion_filter_unknown_state(filter_unknown_onion)
+    return redirect(url_for('crawler_splash.crawler_settings'))
+
 
 # --- LACUS ---#
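Like the existing unsafe-onion toggle, the new endpoint is an authenticated GET that redirects back to the settings page, and any state value other than 'enable' disables the filter. A hypothetical client-side toggle; the base URL, port, and session cookie are placeholders:

    import requests

    AIL_URL = 'https://127.0.0.1:7000'  # placeholder for a local AIL instance
    session = requests.Session()
    session.verify = False  # assumption: self-signed certificate on a local instance
    session.cookies.set('session', '<logged-in session cookie>')

    # state=enable turns the unknown-onion filter on; anything else turns it off.
    resp = session.get(f'{AIL_URL}/crawler/settings/crawler/filter_unknown_onion',
                       params={'state': 'enable'}, allow_redirects=False)
    print(resp.status_code)  # expect a 302 redirect to the settings page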
@@ -243,7 +243,7 @@
                         </p>
                         {% if is_onion_filter_enabled %}
                             <a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
-                                <button class="btn btn-danger mx-4 my-2">
+                                <button class="btn btn-danger my-2">
                                     <i class="fa-solid fa-xmark"></i> Disable Onion Filter
                                 </button>
                             </a>
@@ -254,6 +254,34 @@
                                 </button>
                             </a>
                         {% endif %}
 
+                        <hr class="border-1 my-4">
+
+                        <h5 class="card-title">
+                            Crawl Unknown Onion: <b class="text-primary"><span class="text-{% if is_onion_filter_unknown %}success{% else %}secondary{% endif %}">{% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %}</span></b>
+                        </h5>
+                        <p>This option controls whether the crawler should proceed with crawling onion domains that have <strong>not yet been classified</strong> as safe or unsafe.</p>
+
+                        <ul>
+                            <li><strong>If disabled:</strong> The crawler will process domains that have never been checked, potentially discovering new useful content but also increasing the risk of encountering unsafe materials.</li>
+                            <li><strong>If enabled:</strong> The crawler will only process domains that have been explicitly identified as safe, reducing risk but potentially missing new, unclassified domains.</li>
+                        </ul>
+
+                        <p>This option is useful for users who want to explore uncharted domains while still benefiting from the <code>filter_unsafe_onion</code> protection. However, enabling this option increases the likelihood of encountering harmful content, so caution is advised.</p>
+                        {% if is_onion_filter_unknown %}
+                            <a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=disable">
+                                <button class="btn btn-secondary my-2">
+                                    <i class="fa-solid fa-xmark"></i> Disable Unknown Onion Filter
+                                </button>
+                            </a>
+                        {% else %}
+                            <a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=enable">
+                                <button class="btn btn-info my-2">
+                                    <i class="fa-solid fa-check"></i> Enable Unknown Onion Filter
+                                </button>
+                            </a>
+                        {% endif %}
+
                     </div>
                 </div>