fix: [crawler] log timeout + debug signal timeout
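
Pass the item id into extract_title_from_html() so a BeautifulSoup parser
timeout can be logged against the item that triggered it, and add a trailing
signal.alarm(0) so the SIGALRM timer is also cleared on the fall-through path.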

terrtia 2025-01-08 15:14:54 +01:00
parent 38d1d01d8a
commit 0287a1380b
GPG key ID: 1E1B1F50D84613D0
2 changed files with 7 additions and 2 deletions


@@ -364,7 +364,7 @@ class Crawler(AbstractModule):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
-            title_content = crawlers.extract_title_from_html(entries['html'])
+            title_content = crawlers.extract_title_from_html(entries['html'], item_id)
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)


@@ -10,6 +10,7 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
 import os
 import pickle
 import re
@@ -72,6 +73,8 @@ config_loader = None
 faup = Faup()
 
+logger_crawler = logging.getLogger('crawlers.log')
+
 # # # # # # # #
 #             #
 #   DOMAINS   #
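
A logger created with logging.getLogger() only emits records once a handler
is attached somewhere in the process; that wiring is outside this diff. A
minimal sketch of how the new logger_crawler could surface its warnings,
assuming a plain stderr handler (AIL's actual logging configuration is not
part of this commit):

    import logging

    # Hypothetical wiring; the real handler setup is not shown in this diff.
    logger_crawler = logging.getLogger('crawlers.log')
    handler = logging.StreamHandler()  # write records to stderr
    handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
    logger_crawler.addHandler(handler)
    logger_crawler.setLevel(logging.WARNING)

    # Matches the call added in the last hunk below (the item id is illustrative).
    logger_crawler.warning('BeautifulSoup HTML parser timeout: crawled/2025/01/08/example')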
@@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
 #             #
 # # # # # # # #
 
-def extract_title_from_html(html):
+def extract_title_from_html(html, item_id):
     signal.alarm(60)
     try:
         soup = BeautifulSoup(html, 'html.parser')
@@ -333,8 +336,10 @@ def extract_title_from_html(html):
                 return str(title)
     except TimeoutException:
         signal.alarm(0)
+        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
     else:
         signal.alarm(0)
+    signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
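
For context, these functions rely on a SIGALRM-based timeout: signal.alarm(60)
arms a 60-second timer, and a signal handler turns the alarm into the
TimeoutException caught above. The handler installation sits outside this
diff; below is a self-contained sketch of the pattern, with the handler name
assumed. Note that SIGALRM is Unix-only and only interrupts the main thread.

    import signal

    from bs4 import BeautifulSoup


    class TimeoutException(Exception):
        """Raised by the SIGALRM handler when parsing takes too long."""


    def _timeout_handler(signum, frame):  # name assumed; not shown in the diff
        raise TimeoutException


    signal.signal(signal.SIGALRM, _timeout_handler)


    def extract_title_from_html(html, item_id):
        signal.alarm(60)  # arm a 60-second timer
        try:
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.title
            if title and title.string:
                return str(title.string)
        except TimeoutException:
            signal.alarm(0)  # disarm after a timeout
            print(f'timeout: {item_id}')  # stand-in for logger_crawler.warning
        else:
            signal.alarm(0)  # disarm on the no-exception path
        signal.alarm(0)  # the fix: also disarm on the fall-through path
        return ''

One path still leaves the timer armed: the early return inside try skips both
the else clause and the trailing alarm(0). Wrapping the body in try/finally
with signal.alarm(0) in the finally block would cover every exit, including
that one.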