mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-01-18 16:36:13 +00:00
fix: [crawler] log timeout + debug signal timeout
This commit is contained in:
parent
38d1d01d8a
commit
0287a1380b
2 changed files with 7 additions and 2 deletions
|
@@ -364,7 +364,7 @@ class Crawler(AbstractModule):
|
|||
dom_hash.add(self.date.replace('/', ''), item)
|
||||
dom_hash.add_correlation('domain', '', self.domain.id)
|
||||
|
||||
title_content = crawlers.extract_title_from_html(entries['html'])
|
||||
title_content = crawlers.extract_title_from_html(entries['html'], item_id)
|
||||
if title_content:
|
||||
title = Titles.create_title(title_content)
|
||||
title.add(item.get_date(), item)
|
||||
|
|
|
@@ -10,6 +10,7 @@ import base64
|
|||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
|
@@ -72,6 +73,8 @@ config_loader = None
|
|||
|
||||
faup = Faup()
|
||||
|
||||
logger_crawler = logging.getLogger('crawlers.log')
|
||||
|
||||
# # # # # # # #
|
||||
# #
|
||||
# DOMAINS #
|
||||
|
@@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
|
|||
# #
|
||||
# # # # # # # #
|
||||
|
||||
def extract_title_from_html(html):
|
||||
def extract_title_from_html(html, item_id):
|
||||
signal.alarm(60)
|
||||
try:
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
@@ -333,8 +336,10 @@ def extract_title_from_html(html):
|
|||
return str(title)
|
||||
except TimeoutException:
|
||||
signal.alarm(0)
|
||||
logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
|
||||
else:
|
||||
signal.alarm(0)
|
||||
signal.alarm(0)
|
||||
return ''
|
||||
|
||||
def extract_description_from_html(html):
|
||||
|
|
Loading…
Add table
Reference in a new issue