chg: [crawler] add unsafe tag if domain contains an unsafe screenshot

This commit is contained in:
Terrtia 2023-05-10 16:28:19 +02:00
parent 37c71b8438
commit 6b60041db2
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0

View file

@ -15,6 +15,7 @@ from modules.abstract_module import AbstractModule
from lib import crawlers from lib import crawlers
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain from lib.objects.Domains import Domain
from lib.objects.Items import Item
from lib.objects import Screenshots from lib.objects import Screenshots
@ -53,6 +54,9 @@ class Crawler(AbstractModule):
self.items_dir = None self.items_dir = None
self.domain = None self.domain = None
# TODO Replace with warning list ???
self.placeholder_screenshots = {'27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748'}
# Send module state to logs # Send module state to logs
self.redis_logger.info('Crawler initialized') self.redis_logger.info('Crawler initialized')
@ -248,8 +252,13 @@ class Crawler(AbstractModule):
if 'png' in entries and entries['png']: if 'png' in entries and entries['png']:
screenshot = Screenshots.create_screenshot(entries['png'], b64=False) screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
if screenshot: if screenshot:
# Remove Errors pages # TODO Replace with warning list ??? if not screenshot.is_tags_safe():
if screenshot.id not in ['27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748']: unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item = Item(item_id)
item.add_tag(unsafe_tag)
# Remove Placeholder pages # TODO Replace with warning list ???
if screenshot.id not in self.placeholder_screenshots:
# Create Correlations # Create Correlations
screenshot.add_correlation('item', '', item_id) screenshot.add_correlation('item', '', item_id)
screenshot.add_correlation('domain', '', self.domain.id) screenshot.add_correlation('domain', '', self.domain.id)