From 2ead8c21aafd08c3219c316813905e7220652d8c Mon Sep 17 00:00:00 2001
From: terrtia
Date: Thu, 10 Oct 2024 11:03:07 +0200
Subject: [PATCH] chg: [crawler] tag domain by vanity

---
Review notes (free-form area, ignored by git am):
- Line breaks, indentation and blank context lines were lost in this copy of
  the patch and are reconstructed here from the syntax and the hunk counts;
  verify them against the target tree before applying.
- bin/lib/Tag.py: get_domain_vanity_tags() is documented, reads the file as
  UTF-8, and no longer raises TypeError when files/vanity_tags holds valid
  JSON that is not an object, or a tag whose value is not a list. The
  diffstat and the Tag.py hunk size are updated accordingly; the Tag.py
  index hashes below predate this revision.

 bin/crawlers/Crawler.py    |  9 +++++++++
 bin/lib/Tag.py             | 36 ++++++++++++++++++++++++++++++++++++
 bin/lib/objects/Domains.py |  3 +++
 3 files changed, 48 insertions(+)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index fb626c2e..abd4b12f 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.Tag import get_domain_vanity_tags
 from lib.objects import CookiesNames
 from lib.objects import Etags
 from lib.objects.Domains import Domain
@@ -40,6 +41,9 @@ class Crawler(AbstractModule):
 
         self.tracker_yara = Tracker_Yara(queue=False)
 
+        self.vanity_tags = get_domain_vanity_tags()
+        print('vanity tags:', self.vanity_tags)
+
         config_loader = ConfigLoader()
 
         self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@@ -271,7 +275,12 @@ class Crawler(AbstractModule):
             # Origin + History + tags
             if self.root_item:
                 self.domain.set_last_origin(parent_id)
+                # Vanity
                 self.domain.update_vanity_cluster()
+                domain_vanity = self.domain.get_vanity()
+                if domain_vanity in self.vanity_tags:
+                    for tag in self.vanity_tags[domain_vanity]:
+                        self.domain.add_tag(tag)
                 # Tags
                 for tag in task.get_tags():
                     self.domain.add_tag(tag)
diff --git a/bin/lib/Tag.py b/bin/lib/Tag.py
index 650b34d0..86bffd3a 100755
--- a/bin/lib/Tag.py
+++ b/bin/lib/Tag.py
@@ -1521,6 +1521,42 @@ def refresh_auto_push():
 
 # --- TAG AUTO PUSH --- #
 
+def get_domain_vanity_tags():
+    """Load the vanity -> tags mapping used to auto-tag crawled domains.
+
+    Reads ``$AIL_HOME/files/vanity_tags``, expected to be a JSON object mapping
+    each tag to a list of vanity strings. Only tags accepted by
+    is_taxonomie_tag() or is_galaxy_tag() are kept.
+
+    :return: dict mapping each vanity string to the list of tags to apply;
+             empty if the file is missing, is not valid JSON or is not a
+             JSON object
+    """
+    vanity = {}
+    try:
+        with open(os.path.join(os.environ['AIL_HOME'], 'files/vanity_tags'), encoding='utf-8') as f:
+            ltags = json.load(f)
+    except FileNotFoundError:
+        # The file is optional: without it no vanity tag is configured
+        return vanity
+    except json.decoder.JSONDecodeError:
+        print('Error files/vanity_tags, Invalid JSON')
+        return vanity
+    # Valid JSON that is not an object (list, string, ...) cannot be read as tag -> vanities
+    if not isinstance(ltags, dict):
+        print('Error files/vanity_tags, expected a JSON object')
+        return vanity
+    for tag, l_vanity in ltags.items():
+        # A bare string would otherwise be iterated character by character
+        if isinstance(l_vanity, str):
+            l_vanity = [l_vanity]
+        elif not isinstance(l_vanity, list):
+            continue
+        if is_taxonomie_tag(tag) or is_galaxy_tag(tag):
+            for s_vanity in l_vanity:
+                vanity.setdefault(s_vanity, []).append(tag)
+    return vanity
+
 ###################################################################################
 ###################################################################################
 ###################################################################################
diff --git a/bin/lib/objects/Domains.py b/bin/lib/objects/Domains.py
index 88e94dd9..b20208eb 100755
--- a/bin/lib/objects/Domains.py
+++ b/bin/lib/objects/Domains.py
@@ -411,6 +411,9 @@ class Domain(AbstractObject):
         r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id)
         r_crawler.sadd(f'domain:language:{self.id}', language)
 
+    def get_vanity(self, len_vanity=4):
+        return get_domain_vanity(self.id, len_vanity=len_vanity)
+
     def update_vanity_cluster(self):
         if self.get_domain_type() == 'onion':
             update_vanity_cluster(self.id)