fix: [crawler] log timeout + debug signal timeout
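
Pass the item id into extract_title_from_html() so a BeautifulSoup parser
timeout can be logged against the item that triggered it, and add a trailing
signal.alarm(0) so the SIGALRM timer is also cleared on the fall-through path.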

terrtia 2025-01-08 15:14:54 +01:00
parent 38d1d01d8a
commit 0287a1380b
GPG key ID: 1E1B1F50D84613D0
2 changed files with 7 additions and 2 deletions


@@ -364,7 +364,7 @@ class Crawler(AbstractModule):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
-            title_content = crawlers.extract_title_from_html(entries['html'])
+            title_content = crawlers.extract_title_from_html(entries['html'], item_id)
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)


@@ -10,6 +10,7 @@ import base64
 import gzip
 import hashlib
 import json
+import logging
 import os
 import pickle
 import re
@@ -72,6 +73,8 @@ config_loader = None
 faup = Faup()
 
+logger_crawler = logging.getLogger('crawlers.log')
+
 # # # # # # # #
 #             #
 #   DOMAINS   #
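
A logger created with logging.getLogger() only emits records once a handler
is attached somewhere in the process; that wiring is outside this diff. A
minimal sketch of how the new logger_crawler could surface its warnings,
assuming a plain stderr handler (AIL's actual logging configuration is not
part of this commit):

    import logging

    # Hypothetical wiring; the real handler setup is not shown in this diff.
    logger_crawler = logging.getLogger('crawlers.log')
    handler = logging.StreamHandler()  # write records to stderr
    handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
    logger_crawler.addHandler(handler)
    logger_crawler.setLevel(logging.WARNING)

    # Matches the call added in the last hunk below (the item id is illustrative).
    logger_crawler.warning('BeautifulSoup HTML parser timeout: crawled/2025/01/08/example')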
@@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
 #             #
 # # # # # # # #
 
-def extract_title_from_html(html):
+def extract_title_from_html(html, item_id):
     signal.alarm(60)
     try:
         soup = BeautifulSoup(html, 'html.parser')
@@ -333,8 +336,10 @@ def extract_title_from_html(html):
                 return str(title)
     except TimeoutException:
         signal.alarm(0)
+        logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
     else:
         signal.alarm(0)
+    signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
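
For context, these functions rely on a SIGALRM-based timeout: signal.alarm(60)
arms a 60-second timer, and a signal handler turns the alarm into the
TimeoutException caught above. The handler installation sits outside this
diff; below is a self-contained sketch of the pattern, with the handler name
assumed. Note that SIGALRM is Unix-only and only interrupts the main thread.

    import signal

    from bs4 import BeautifulSoup


    class TimeoutException(Exception):
        """Raised by the SIGALRM handler when parsing takes too long."""


    def _timeout_handler(signum, frame):  # name assumed; not shown in the diff
        raise TimeoutException


    signal.signal(signal.SIGALRM, _timeout_handler)


    def extract_title_from_html(html, item_id):
        signal.alarm(60)  # arm a 60-second timer
        try:
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.title
            if title and title.string:
                return str(title.string)
        except TimeoutException:
            signal.alarm(0)  # disarm after a timeout
            print(f'timeout: {item_id}')  # stand-in for logger_crawler.warning
        else:
            signal.alarm(0)  # disarm on the no-exception path
        signal.alarm(0)  # the fix: also disarm on the fall-through path
        return ''

One path still leaves the timer armed: the early return inside try skips both
the else clause and the trailing alarm(0). Wrapping the body in try/finally
with signal.alarm(0) in the finally block would cover every exit, including
that one.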