fix: [title BeautifulSoup] add signal alarm timeout, BeautifulSoup html.parser can get stuck

This commit is contained in:
terrtia 2025-01-07 17:21:58 +01:00
parent 8692d9b45b
commit 868da3c6bc
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0

View file

@ -27,6 +27,19 @@ from pylacus import PyLacus
from pyfaup.faup import Faup from pyfaup.faup import Faup
import signal
class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a guarded operation runs too long."""
def timeout_handler(signum, frame):
    """Signal handler: abort the interrupted code by raising TimeoutException.

    :param signum: number of the delivered signal (unused)
    :param frame: stack frame that was interrupted (unused)
    :raises TimeoutException: always
    """
    raise TimeoutException()
# Route SIGALRM to timeout_handler, so an expired signal.alarm() raises
# TimeoutException in the code that is running at that moment.
signal.signal(signal.SIGALRM, timeout_handler)
# interact with splash_crawler API # interact with splash_crawler API
import requests import requests
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@ -310,12 +323,18 @@ def extract_favicon_from_html(html, url):
# # # # # # # # # # # # # # # #
def extract_title_from_html(html):
    """Return the text of the ``<title>`` element of *html*.

    BeautifulSoup's ``html.parser`` can get stuck on some documents, so the
    parsing is guarded by a 60-second SIGALRM alarm; the module-level SIGALRM
    handler turns an expired alarm into a TimeoutException.

    :param html: HTML document to parse
    :return: the title as a ``str``, or ``''`` when the document has no
             title, the title has no single string content, or parsing
             timed out
    """
    signal.alarm(60)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title
        if title:
            title = title.string
            if title:
                return str(title)
    except TimeoutException:
        # Parser exceeded the time budget: fall through to the empty title.
        pass
    finally:
        # Always disarm the alarm. An `else:` clause is skipped when the try
        # body returns or raises another exception, which would leave the
        # alarm pending and fire TimeoutException later in unrelated code.
        signal.alarm(0)
    return ''
def extract_description_from_html(html): def extract_description_from_html(html):