chg: [crawler] tag domain by vanity
Some checks failed
CI / ail_test (3.10) (push) Has been cancelled
CI / ail_test (3.7) (push) Has been cancelled
CI / ail_test (3.8) (push) Has been cancelled
CI / ail_test (3.9) (push) Has been cancelled

This commit is contained in:
terrtia 2024-10-10 11:03:07 +02:00
parent 72f4733242
commit 2ead8c21aa
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 30 additions and 0 deletions

View file

@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
from lib import ail_logger from lib import ail_logger
from lib import crawlers from lib import crawlers
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from lib.Tag import get_domain_vanity_tags
from lib.objects import CookiesNames from lib.objects import CookiesNames
from lib.objects import Etags from lib.objects import Etags
from lib.objects.Domains import Domain from lib.objects.Domains import Domain
@ -40,6 +41,9 @@ class Crawler(AbstractModule):
self.tracker_yara = Tracker_Yara(queue=False) self.tracker_yara = Tracker_Yara(queue=False)
self.vanity_tags = get_domain_vanity_tags()
print('vanity tags:', self.vanity_tags)
config_loader = ConfigLoader() config_loader = ConfigLoader()
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har') self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@ -271,7 +275,12 @@ class Crawler(AbstractModule):
# Origin + History + tags # Origin + History + tags
if self.root_item: if self.root_item:
self.domain.set_last_origin(parent_id) self.domain.set_last_origin(parent_id)
# Vanity
self.domain.update_vanity_cluster() self.domain.update_vanity_cluster()
domain_vanity = self.domain.get_vanity()
if domain_vanity in self.vanity_tags:
for tag in self.vanity_tags[domain_vanity]:
self.domain.add_tag(tag)
# Tags # Tags
for tag in task.get_tags(): for tag in task.get_tags():
self.domain.add_tag(tag) self.domain.add_tag(tag)

View file

@ -1521,6 +1521,24 @@ def refresh_auto_push():
# --- TAG AUTO PUSH --- # # --- TAG AUTO PUSH --- #
def get_domain_vanity_tags():
    """Load the vanity -> tags mapping from ``$AIL_HOME/files/vanity_tags``.

    The file is expected to hold a JSON object mapping a tag to the vanity
    strings that should receive it, e.g. ``{"<tag>": ["<vanity>", ...]}``.
    Only tags accepted by ``is_taxonomie_tag`` or ``is_galaxy_tag`` are kept.

    Returns:
        dict: ``{vanity: [tag, ...]}``. Empty when the file is missing,
        is not valid JSON, or does not contain a JSON object.

    Raises:
        KeyError: if the ``AIL_HOME`` environment variable is not set.
    """
    vanity = {}
    path = os.path.join(os.environ['AIL_HOME'], 'files/vanity_tags')

    # Keep the try body limited to the I/O and parsing that can actually fail.
    try:
        with open(path, encoding='utf-8') as f:
            ltags = json.load(f)
    except FileNotFoundError:
        # The vanity tags file is optional.
        return vanity
    except json.decoder.JSONDecodeError:
        print('Error files/vanity_tags, Invalid JSON')
        return vanity

    if not ltags:
        return vanity
    if not isinstance(ltags, dict):
        # Valid JSON but not an object (e.g. a list): indexing it by tag
        # would raise TypeError and abort the caller's start-up.
        print('Error files/vanity_tags, Invalid format: JSON object expected')
        return vanity

    for tag, s_vanities in ltags.items():
        if not (is_taxonomie_tag(tag) or is_galaxy_tag(tag)):
            continue
        if isinstance(s_vanities, str):
            # A bare string is treated as one vanity rather than being
            # iterated character by character.
            s_vanities = [s_vanities]
        elif not isinstance(s_vanities, (list, tuple)):
            # Ignore malformed entries instead of failing on iteration.
            continue
        for s_vanity in s_vanities:
            vanity.setdefault(s_vanity, []).append(tag)
    return vanity
################################################################################### ###################################################################################
################################################################################### ###################################################################################
################################################################################### ###################################################################################

View file

@ -411,6 +411,9 @@ class Domain(AbstractObject):
r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id) r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id)
r_crawler.sadd(f'domain:language:{self.id}', language) r_crawler.sadd(f'domain:language:{self.id}', language)
def get_vanity(self, len_vanity=4):
    """Return the vanity computed by ``get_domain_vanity`` for this domain.

    :param len_vanity: forwarded unchanged to ``get_domain_vanity``
        (presumably the vanity length to keep -- confirm against that helper).
    :return: whatever ``get_domain_vanity`` returns for ``self.id``.
    """
    domain_id = self.id
    vanity = get_domain_vanity(domain_id, len_vanity=len_vanity)
    return vanity
def update_vanity_cluster(self): def update_vanity_cluster(self):
if self.get_domain_type() == 'onion': if self.get_domain_type() == 'onion':
update_vanity_cluster(self.id) update_vanity_cluster(self.id)