mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-01-18 16:36:13 +00:00
fix: [title beautifulsoup] add signal timeout, BeautifulSoup html.parser gets stuck
This commit is contained in:
parent
8692d9b45b
commit
868da3c6bc
1 changed file with 24 additions and 5 deletions
|
@ -27,6 +27,19 @@ from pylacus import PyLacus
|
||||||
|
|
||||||
from pyfaup.faup import Faup
|
from pyfaup.faup import Faup
|
||||||
|
|
||||||
|
|
||||||
|
import signal
|
||||||
|
|
||||||
|
class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a guarded operation runs too long."""
|
||||||
|
|
||||||
|
def timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted code with TimeoutException.

    :param signum: number of the delivered signal (unused).
    :param frame: stack frame that was executing when the signal arrived
        (unused).
    :raises TimeoutException: always.
    """
    raise TimeoutException
|
||||||
|
|
||||||
|
|
||||||
|
# Route SIGALRM to timeout_handler so an expired signal.alarm() raises TimeoutException.
signal.signal(signal.SIGALRM, timeout_handler)
|
||||||
|
|
||||||
|
|
||||||
# interact with splash_crawler API
|
# interact with splash_crawler API
|
||||||
import requests
|
import requests
|
||||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
@ -310,12 +323,18 @@ def extract_favicon_from_html(html, url):
|
||||||
# # # # # # # #
|
# # # # # # # #
|
||||||
|
|
||||||
def extract_title_from_html(html):
    """Return the text of the HTML ``<title>`` element, or ``''``.

    Parsing is guarded by a 60-second SIGALRM timeout because
    BeautifulSoup's ``html.parser`` can get stuck on some inputs.

    :param html: HTML document to parse.
    :return: the title as ``str``; ``''`` when the document has no title,
        the title has no string content, or parsing timed out.
    """
    # Arm the alarm; the SIGALRM handler raises TimeoutException when it fires.
    signal.alarm(60)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title
        if title:
            title = title.string
            if title:
                return str(title)
    except TimeoutException:
        # Parser did not finish in time: fall through to the empty result.
        pass
    finally:
        # Always disarm, including on the early ``return`` above. A
        # ``try ... else`` clause is skipped when the try body returns,
        # which left the alarm pending so it fired later in unrelated code.
        signal.alarm(0)
    return ''
|
||||||
|
|
||||||
def extract_description_from_html(html):
|
def extract_description_from_html(html):
|
||||||
|
|
Loading…
Add table
Reference in a new issue