From 0287a1380b7a9f0cb26b0d012af09c9609967fb9 Mon Sep 17 00:00:00 2001 From: terrtia Date: Wed, 8 Jan 2025 15:14:54 +0100 Subject: [PATCH] fix: [crawler] log timeout + debug signal timeout --- bin/crawlers/Crawler.py | 2 +- bin/lib/crawlers.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py index 0ae89121..0ae27380 100755 --- a/bin/crawlers/Crawler.py +++ b/bin/crawlers/Crawler.py @@ -364,7 +364,7 @@ class Crawler(AbstractModule): dom_hash.add(self.date.replace('/', ''), item) dom_hash.add_correlation('domain', '', self.domain.id) - title_content = crawlers.extract_title_from_html(entries['html']) + title_content = crawlers.extract_title_from_html(entries['html'], item_id) if title_content: title = Titles.create_title(title_content) title.add(item.get_date(), item) diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 50e7a575..a0d907ec 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -10,6 +10,7 @@ import base64 import gzip import hashlib import json +import logging import os import pickle import re @@ -72,6 +73,8 @@ config_loader = None faup = Faup() +logger_crawler = logging.getLogger('crawlers.log') + # # # # # # # # # # # DOMAINS # @@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url): # # # # # # # # # # -def extract_title_from_html(html): +def extract_title_from_html(html, item_id): signal.alarm(60) try: soup = BeautifulSoup(html, 'html.parser') @@ -333,8 +336,10 @@ def extract_title_from_html(html): return str(title) except TimeoutException: signal.alarm(0) + logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}') else: signal.alarm(0) + signal.alarm(0) return '' def extract_description_from_html(html):