Mirror of https://github.com/ail-project/ail-framework.git, synced 2025-02-14 13:26:24 +00:00
chg: [Filter unsafe onion] add a new unsafe onion filter option
parent 98652a1013
commit f01cfe70be
5 changed files with 198 additions and 36 deletions
@@ -17,7 +17,7 @@ from modules.abstract_module import AbstractModule
from lib import ail_logger
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.exceptions import TimeoutException
from lib.exceptions import TimeoutException, OnionFilteringError
from lib.Tag import get_domain_vanity_tags
from lib.objects import CookiesNames
from lib.objects import Etags
@@ -57,6 +57,9 @@ class Crawler(AbstractModule):

        config_loader = ConfigLoader()

        self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
        self.last_config_check = int(time.time())

        self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
        self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
        self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')
@@ -139,11 +142,31 @@ class Crawler(AbstractModule):
        if not self.is_lacus_up:
            return None

        # Refresh Config
        if int(time.time()) - self.last_config_check > 60:
            self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
            self.last_config_check = int(time.time())

        # Check if a new Capture can be Launched
        if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
            task_row = crawlers.add_task_to_lacus_queue()
            if task_row:
                task, priority = task_row
                domain = task.get_domain()
                if self.filter_unsafe_onion:
                    if domain.endswith('.onion'):
                        try:
                            if not crawlers.check_if_onion_is_safe(domain):
                                # print('---------------------------------------------------------')
                                # print('DOMAIN FILTERED')
                                task.delete()
                                return None
                        except OnionFilteringError:
                            task.reset()
                            self.logger.warning(f'Onion Filtering Connection Error, {task.uuid} Send back in queue')
                            time.sleep(10)
                            return None

                task.start()
                task_uuid = task.uuid
                try:
@@ -301,41 +324,46 @@ class Crawler(AbstractModule):
        self.root_item = None

        # Save Capture
        self.save_capture_response(parent_id, entries)

        if self.parent != 'lookup':
            # Update domain first/last seen
            self.domain.update_daterange(self.date.replace('/', ''))
            # Origin + History + tags
            if self.root_item:
                self.domain.set_last_origin(parent_id)
                # Vanity
                self.domain.update_vanity_cluster()
                domain_vanity = self.domain.get_vanity()
                if domain_vanity in self.vanity_tags:
                    for tag in self.vanity_tags[domain_vanity]:
                        self.domain.add_tag(tag)
            # Tags
            for tag in task.get_tags():
                self.domain.add_tag(tag)
            # Crawler stats
            self.domain.add_history(epoch, root_item=self.root_item)

            if self.domain != self.original_domain:
                self.original_domain.update_daterange(self.date.replace('/', ''))
        saved = self.save_capture_response(parent_id, entries)
        if saved:
            if self.parent != 'lookup':
                # Update domain first/last seen
                self.domain.update_daterange(self.date.replace('/', ''))
                # Origin + History + tags
                if self.root_item:
                    self.original_domain.set_last_origin(parent_id)
                    self.domain.set_last_origin(parent_id)
                    # Vanity
                    self.domain.update_vanity_cluster()
                    domain_vanity = self.domain.get_vanity()
                    if domain_vanity in self.vanity_tags:
                        for tag in self.vanity_tags[domain_vanity]:
                            self.domain.add_tag(tag)
                # Tags
                for tag in task.get_tags():
                    self.domain.add_tag(tag)
                self.original_domain.add_history(epoch, root_item=self.root_item)
                # crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)
                # Crawler stats
                self.domain.add_history(epoch, root_item=self.root_item)

                crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
                print('capture:', capture.uuid, 'completed')
                print('task: ', task.uuid, 'completed')
                print()
                if self.domain != self.original_domain:
                    self.original_domain.update_daterange(self.date.replace('/', ''))
                    if self.root_item:
                        self.original_domain.set_last_origin(parent_id)
                        # Tags
                        for tag in task.get_tags():
                            self.domain.add_tag(tag)
                    self.original_domain.add_history(epoch, root_item=self.root_item)
                    # crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)

                crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
                print('capture:', capture.uuid, 'completed')
                print('task: ', task.uuid, 'completed')
                print()
        else:
            print('capture:', capture.uuid, 'Unsafe Content Filtered')
            print('task: ', task.uuid, 'Unsafe Content Filtered')
            print()
        task.remove()
        self.root_item = None

    def save_capture_response(self, parent_id, entries):
        print(entries.keys())
@@ -357,6 +385,12 @@ class Crawler(AbstractModule):
                print(f'External redirection {self.domain.id} -> {current_domain}')
                if not self.root_item:
                    self.domain = Domain(current_domain)
                    # Filter Domain
                    if self.filter_unsafe_onion:
                        if current_domain.endswith('.onion'):
                            if not crawlers.check_if_onion_is_safe(current_domain):
                                return False

                # TODO LAST URL
                # FIXME
            else:
@@ -449,6 +483,7 @@ class Crawler(AbstractModule):
        if entries_children:
            for children in entries_children:
                self.save_capture_response(parent_id, children)
        return True


if __name__ == '__main__':
@@ -39,12 +39,14 @@ sys.path.append(os.environ['AIL_BIN'])
from packages import git_status
from packages import Date
from lib import ail_orgs
from lib.exceptions import OnionFilteringError
from lib.ConfigLoader import ConfigLoader
from lib.regex_helper import regex_findall
from lib.objects.Domains import Domain
from lib.objects.Titles import Title
from lib.objects import HHHashs
from lib.objects.Items import Item
from lib import Tag

config_loader = ConfigLoader()
r_db = config_loader.get_db_conn("Kvrocks_DB")
@@ -2269,13 +2271,87 @@ def test_ail_crawlers():

#### ---- ####

# TODO CHECK MIGRATION - Rest API

# TODO MIGRATE ME
# def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
#     # validate url
#     if url is None or url=='' or url=='\n':
#         return ({'error':'invalid depth limit'}, 400)
# # # # # # # # # # # # #
#                       #
#   CONTENT FILTERING   #
#                       #
# # # # # # # # # # # # #

def _onion_lookup(onion_url):
    try:
        commit_id = git_status.get_last_commit_id_from_local()
        user_agent = f'AIL-{commit_id}'
        headers = {'User-Agent': user_agent}
        response = requests.get(f'https://onion.ail-project.org/api/lookup/{onion_url}', timeout=10, headers=headers)
        if response.status_code == 200:
            json_response = response.json()
            return json_response
        else:
            print(response)
            return {'error': f'{response.status_code}'}
    except requests.exceptions.ConnectionError:
        return {'error': f'Connection Error'}
    except requests.exceptions.ReadTimeout:
        return {'error': f'Timeout Error'}


def check_if_onion_is_safe(onion_url):
    resp = _onion_lookup(onion_url)
    if resp:
        if isinstance(resp, dict):
            if 'tags' in resp:
                return Tag.is_tags_safe(resp['tags'])
            elif 'error' in resp:
                if resp['error']:
                    raise OnionFilteringError(resp['error'])
    return False


def _is_onion_filter_enabled():
    enabled = r_crawler.hget('crawler:onion_filter', 'enabled')
    if enabled is None:
        r_crawler.hset('crawler:onion_filter', 'enabled', str(True))
        filter_enabled = True
    else:
        filter_enabled = enabled == 'True'
    r_cache.set('crawler:onion_filter:state', str(filter_enabled))
    return filter_enabled

def is_onion_filter_enabled(cache=True):
    if cache:
        res = r_cache.get('crawler:onion_filter:state')
        if res is None:
            enabled = _is_onion_filter_enabled()
            r_cache.set('crawler:onion_filter:state', str(enabled))
            return enabled
        else:
            return res == 'True'
    else:
        return _is_onion_filter_enabled()

def get_onion_filter_last_update_time():
    last_update_time = r_cache.get('crawler:onion_filter:last_update_time')
    if not last_update_time:
        last_update_time = r_crawler.hget('crawler:onion_filter', 'update_time')
        if not last_update_time:
            last_update_time = 0
        last_update_time = float(last_update_time)
        r_cache.set('crawler:onion_filter:last_update_time', last_update_time)
    return float(last_update_time)

def change_onion_filter_state(new_state):
    old_state = is_onion_filter_enabled(cache=False)
    if old_state != new_state:
        r_crawler.hset('crawler:onion_filter', 'enabled', str(new_state))
        r_cache.set('crawler:onion_filter:state', str(new_state))
        update_time = time.time()
        r_crawler.hset('crawler:onion_filter', 'update_time', update_time)
        r_cache.set('crawler:onion_filter:last_update_time', update_time)
        return True
    return False

#### ---- ####


# TODO MOVE ME IN CRAWLER OR FLASK
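A minimal usage sketch of the new helpers (not part of this commit), assuming an AIL environment where the lib package is importable and the Kvrocks/Redis connections used by the crawlers module are configured; the .onion address below is a placeholder:

from lib import crawlers
from lib.exceptions import OnionFilteringError

domain = 'examplexamplexample.onion'  # placeholder onion address

# cached flag; re-read from Kvrocks on a cache miss
if crawlers.is_onion_filter_enabled():
    try:
        # queries https://onion.ail-project.org/api/lookup/<domain> and checks the returned tags
        if crawlers.check_if_onion_is_safe(domain):
            print('domain looks safe, capture can be queued')
        else:
            print('domain filtered')  # the crawler deletes the task in this case
    except OnionFilteringError as e:
        print(f'lookup unavailable: {e}')  # the crawler resets the task and retries later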
@@ -26,3 +26,6 @@ class MISPConnectionError(AILError):

class AILObjectUnknown(AILError):
    pass

class OnionFilteringError(AILError):
    pass
@@ -996,6 +996,8 @@ def crawler_settings():
    is_crawler_working = crawlers.is_test_ail_crawlers_successful()
    crawler_error_mess = crawlers.get_test_ail_crawlers_message()

    is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)

    # TODO REGISTER PROXY
    # all_proxies = crawlers.get_all_proxies_metadata()
@@ -1008,6 +1010,7 @@ def crawler_settings():
                           # all_proxies=all_proxies,
                           is_crawler_working=is_crawler_working,
                           crawler_error_mess=crawler_error_mess,
                           is_onion_filter_enabled=is_onion_filter_enabled,
                           )
@@ -1054,4 +1057,18 @@ def crawler_settings_crawler_test():
    crawlers.test_ail_crawlers()
    return redirect(url_for('crawler_splash.crawler_settings'))

@crawler_splash.route('/crawler/settings/crawler/filter_unsafe_onion', methods=['GET'])
@login_required
@login_admin
def crawler_filter_unsafe_onion():
    filter_unsafe_onion = request.args.get('state')
    if filter_unsafe_onion == 'enable':
        filter_unsafe_onion = True
    else:
        filter_unsafe_onion = False
    print(filter_unsafe_onion)
    crawlers.change_onion_filter_state(filter_unsafe_onion)
    return redirect(url_for('crawler_splash.crawler_settings'))


# --- LACUS ---#
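For illustration only (not in this commit): toggling the filter over HTTP, assuming a locally running AIL web instance and an authenticated admin session; the base URL and cookie value are placeholders, while the endpoint path and the state parameter come from the route above:

import requests

AIL_URL = 'https://127.0.0.1:7000'  # placeholder: local AIL web instance
session = requests.Session()
session.cookies.set('session', '<admin-session-cookie>')  # placeholder: taken from an admin web login

# any state value other than 'enable' is treated as disable by the route above
resp = session.get(f'{AIL_URL}/crawler/settings/crawler/filter_unsafe_onion',
                   params={'state': 'enable'}, verify=False)
print(resp.status_code)  # the route redirects back to the crawler settings page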
@@ -226,6 +226,37 @@
        </div>
    </div> -->

    <div class="card border-secondary my-4">
        <div class="card-body text-dark">
            <h5 class="card-title">
                Filter Unsafe Onion: <b class="text-primary"><span class="text-{% if is_onion_filter_enabled %}success{% else %}danger{% endif %}">{{ is_onion_filter_enabled }}</span></b>
            </h5>
            <p>
                This option enables filtering of onion domains that are considered unsafe because they contain violent content, child sexual abuse material (CSAM), or other harmful material. When enabled, the system will attempt to identify and exclude such domains from crawling.<br><br>

                <span class="text-danger"><i class="fa-solid fa-triangle-exclamation fa-2x"></i></span><b> Disabling this option may result in crawling and downloading content that includes CSAM, extreme violence, or other harmful materials.</b><br> Users are strongly advised to keep this feature enabled to avoid unintentional exposure to such content.<br><br>

                🔍 How It Works: The filtering mechanism leverages known blocklists, heuristics, and automated detection techniques to reduce the risk of crawling unsafe content. While no filtering system is perfect, we continuously strive to improve detection and minimize exposure to harmful materials.<br><br>

                By using this feature, you benefit from an added layer of protection, but please note that some unsafe onion domains may still bypass detection due to evolving content and obfuscation techniques.<br>
                We encourage users to remain cautious and use this feature as an additional safeguard.
            </p>
            {% if is_onion_filter_enabled %}
                <a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
                    <button class="btn btn-danger mx-4 my-2">
                        <i class="fa-solid fa-xmark"></i> Disable Onion Filter
                    </button>
                </a>
            {% else %}
                <a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=enable">
                    <button class="btn btn-success my-2">
                        <i class="fa-solid fa-check"></i> Enable Onion Filter
                    </button>
                </a>
            {% endif %}
        </div>
    </div>

    <a href="{{ url_for('crawler_splash.crawler_blacklist') }}">
        <button type="button" class="btn btn-outline-danger">Blacklisted domains</button>
    </a>
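The card above displays the same persisted flag that the blueprint and the crawler read. A minimal sketch (not from this commit) of inspecting that state directly, assuming the usual AIL ConfigLoader setup; the connection names 'Kvrocks_Crawler' and 'Redis_Cache' are assumptions, while the key names come from the content-filtering functions above:

from lib.ConfigLoader import ConfigLoader

config_loader = ConfigLoader()
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")  # assumed connection name
r_cache = config_loader.get_redis_conn("Redis_Cache")     # assumed connection name

print(r_crawler.hget('crawler:onion_filter', 'enabled'))      # persisted flag: 'True' / 'False'
print(r_crawler.hget('crawler:onion_filter', 'update_time'))  # epoch of the last state change
print(r_cache.get('crawler:onion_filter:state'))              # cached copy used by the crawler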