From 0287a1380b7a9f0cb26b0d012af09c9609967fb9 Mon Sep 17 00:00:00 2001
From: terrtia <or1994@hotmail.fr>
Date: Wed, 8 Jan 2025 15:14:54 +0100
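Subject: [PATCH] fix: [crawler] log timeout + debug signal timeout

Pass the item id to crawlers.extract_title_from_html() so that a
BeautifulSoup HTML parser timeout is logged as a warning, together with
the id of the item being parsed, on the new 'crawlers.log' logger.

Also call signal.alarm(0) right before the function's final return.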
---
 bin/crawlers/Crawler.py | 2 +-
 bin/lib/crawlers.py     | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 0ae89121..0ae27380 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -364,7 +364,7 @@ class Crawler(AbstractModule):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
 
-            title_content = crawlers.extract_title_from_html(entries['html'])
+            title_content = crawlers.extract_title_from_html(entries['html'], item_id)
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 50e7a575..a0d907ec 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -10,6 +10,7 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
 import os
 import pickle
 import re
@@ -72,6 +73,8 @@ config_loader = None
 
 faup = Faup()
 
+logger_crawler = logging.getLogger('crawlers.log')
+
 # # # # # # # #
 #             #
 #   DOMAINS   #
@@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
 #             #
 # # # # # # # #
 
-def extract_title_from_html(html):
+def extract_title_from_html(html, item_id):
     signal.alarm(60)
     try:
         soup = BeautifulSoup(html, 'html.parser')
@@ -333,8 +336,10 @@ def extract_title_from_html(html):
                 return str(title)
     except TimeoutException:
         signal.alarm(0)
+        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
     else:
         signal.alarm(0)
+    signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):