From 109ce56a4ac0238af80f4b6bdeb2b142d9de7cfa Mon Sep 17 00:00:00 2001
From: terrtia
Date: Thu, 9 Jan 2025 11:09:36 +0100
Subject: [PATCH] fix: [crawler] title extraction; handle SIGALRM raised by
 signal.alarm during time.sleep

---
 bin/crawlers/Crawler.py        | 31 ++++++++++++++++++++++++++++---
 bin/lib/crawlers.py            | 26 +++-----------------------
 bin/lib/exceptions.py          |  6 +++++-
 bin/modules/abstract_module.py |  7 +++++--
 4 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 0ae27380..309b649a 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.exceptions import TimeoutException
 from lib.Tag import get_domain_vanity_tags
 from lib.objects import CookiesNames
 from lib.objects import Etags
@@ -30,6 +31,15 @@ from trackers.Tracker_Yara import Tracker_Yara
 
 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
 
+# SIGNAL ALARM
+import signal
+def timeout_handler(signum, frame):
+    raise TimeoutException
+
+
+signal.signal(signal.SIGALRM, timeout_handler)
+
+
 class Crawler(AbstractModule):
 
     def __init__(self):
@@ -104,7 +114,10 @@ class Crawler(AbstractModule):
             self.is_lacus_up = False
         if not self.is_lacus_up:
             print("Can't reach lacus server", int(time.time()))
-            time.sleep(30)
+            try:
+                time.sleep(30)
+            except TimeoutException:
+                pass
 
     def print_crawler_start_info(self, url, domain_url):
         print()
@@ -183,7 +196,10 @@ class Crawler(AbstractModule):
                     capture.update(-1)
 
         self.refresh_lacus_status()
-        time.sleep(self.pending_seconds)
+        try:
+            time.sleep(self.pending_seconds)
+        except TimeoutException:
+            pass
 
     def enqueue_capture(self, task_uuid, priority):
         task = crawlers.CrawlerTask(task_uuid)
@@ -364,7 +380,16 @@ class Crawler(AbstractModule):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
 
-            title_content = crawlers.extract_title_from_html(entries['html'], item_id)
+            # TITLE
+            signal.alarm(60)
+            try:
+                title_content = crawlers.extract_title_from_html(entries['html'])
+            except TimeoutException:
+                self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
+                title_content = None
+            else:
+                signal.alarm(0)
+
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)
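In Crawler.py, title extraction is now guarded by a POSIX alarm: signal.alarm(60) arms a one-shot SIGALRM before the parse, the handler installed at module load turns that signal into a TimeoutException, and the else branch disarms the timer once parsing finishes in time. Below is a minimal, self-contained sketch of the same pattern, assuming a POSIX system and the main thread; the time_limit context manager and slow_parse function are illustrative only and are not part of this patch, which arms the alarm inline.

    import signal
    import time
    from contextlib import contextmanager

    class TimeoutException(Exception):
        pass

    def _timeout_handler(signum, frame):
        raise TimeoutException

    # SIGALRM handlers can only be installed in the main thread, on POSIX.
    signal.signal(signal.SIGALRM, _timeout_handler)

    @contextmanager
    def time_limit(seconds):
        # Illustrative helper, not part of AIL.
        signal.alarm(seconds)   # arm a one-shot alarm
        try:
            yield
        finally:
            signal.alarm(0)     # always disarm, even if the body raised

    def slow_parse():
        time.sleep(120)         # stands in for a pathological HTML parse
        return 'title'

    try:
        with time_limit(60):
            title = slow_parse()
    except TimeoutException:
        title = None            # give up on the title, keep crawling

Unlike the sketch, the patch clears the alarm in an else branch rather than a finally, so an unexpected exception inside extract_title_from_html would leave the alarm armed and the SIGALRM would fire later, in whatever code happens to be running at that moment. The try/except wrappers around the sleeps below absorb exactly that case.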
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index c8a4db2f..7a75e0ce 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -28,19 +28,6 @@ from pylacus import PyLacus
 
 from pyfaup.faup import Faup
 
-
-import signal
-
-class TimeoutException(Exception):
-    pass
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-
-signal.signal(signal.SIGALRM, timeout_handler)
-
-
 # interact with splash_crawler API
 import requests
 requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -73,7 +60,7 @@ config_loader = None
 
 faup = Faup()
 
-logger_crawler = logging.getLogger('crawlers.log')
+# logger_crawler = logging.getLogger('crawlers.log')
 
 # # # # # # # # # #
 
@@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
 # # # # # # # # # #
 
 
-def extract_title_from_html(html, item_id):
-    # signal.alarm(60)
-    # try:
+# /!\ REQUIRE ALARM SIGNAL
+def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
         title = title.string
         if title:
             return str(title)
-    # except TimeoutException:
-    #     signal.alarm(0)
-    #     logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
-    # else:
-    #     signal.alarm(0)
-    # signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
diff --git a/bin/lib/exceptions.py b/bin/lib/exceptions.py
index 1c21933f..66a30683 100755
--- a/bin/lib/exceptions.py
+++ b/bin/lib/exceptions.py
@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 
-from pymisp import PyMISPError
+# from pymisp import PyMISPError
+
+# SIGNAL ALARM
+class TimeoutException(Exception):
+    pass
 
 class AILError(Exception):
     def __init__(self, message):
diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py
index f984bf4a..7478cb87 100644
--- a/bin/modules/abstract_module.py
+++ b/bin/modules/abstract_module.py
@@ -21,7 +21,7 @@ sys.path.append(os.environ['AIL_BIN'])
 from lib import ail_logger
 from lib.ail_queues import AILQueue
 from lib import regex_helper
-from lib.exceptions import ModuleQueueError
+from lib.exceptions import ModuleQueueError, TimeoutException
 from lib.objects.ail_objects import get_obj_from_global_id
 
 logging.config.dictConfig(ail_logger.get_config(name='modules'))
@@ -193,7 +193,10 @@ class AbstractModule(ABC):
                 self.computeNone()
             # Wait before next process
             self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
-            time.sleep(self.pending_seconds)
+            try:
+                time.sleep(self.pending_seconds)
+            except TimeoutException:
+                pass
 
     def _module_name(self):
         """
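Why wrap the idle time.sleep() calls at all? signal.alarm() sets a single process-wide timer: an alarm armed for a parse and never cleared is delivered later to whichever code is executing, and the installed handler raises TimeoutException right there. The most likely victim is one of the long idle sleeps in Crawler.py or AbstractModule.run(), which previously let the exception propagate and kill the module loop. A short demonstration of that failure mode, assuming a POSIX system (SIGALRM does not exist on Windows); this is a standalone sketch, not AIL code:

    import signal
    import time

    class TimeoutException(Exception):
        pass

    def _timeout_handler(signum, frame):
        raise TimeoutException

    signal.signal(signal.SIGALRM, _timeout_handler)

    signal.alarm(2)        # a stale alarm left armed by an earlier code path
    try:
        time.sleep(30)     # idle wait, as in AbstractModule.run()
    except TimeoutException:
        pass               # the alarm fired mid-sleep; swallow it and move on
    print('module loop survives the stray SIGALRM')

Swallowing the exception costs at most one shortened nap, whereas without the except clause the stray SIGALRM would abort the whole run() loop.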