Mirror of https://github.com/ail-project/ail-framework.git (synced 2025-01-18 16:36:13 +00:00)
fix: [crawler] log timeout + debug signal timeout
This commit is contained in:
parent 38d1d01d8a
commit 0287a1380b
2 changed files with 7 additions and 2 deletions
|
@ -364,7 +364,7 @@ class Crawler(AbstractModule):
|
||||||
dom_hash.add(self.date.replace('/', ''), item)
|
dom_hash.add(self.date.replace('/', ''), item)
|
||||||
dom_hash.add_correlation('domain', '', self.domain.id)
|
dom_hash.add_correlation('domain', '', self.domain.id)
|
||||||
|
|
||||||
title_content = crawlers.extract_title_from_html(entries['html'])
|
title_content = crawlers.extract_title_from_html(entries['html'], item_id)
|
||||||
if title_content:
|
if title_content:
|
||||||
title = Titles.create_title(title_content)
|
title = Titles.create_title(title_content)
|
||||||
title.add(item.get_date(), item)
|
title.add(item.get_date(), item)
|
||||||
|
|
|
@ -10,6 +10,7 @@ import base64
|
||||||
import gzip
|
import gzip
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import re
|
import re
|
||||||
|
@ -72,6 +73,8 @@ config_loader = None
|
||||||
|
|
||||||
faup = Faup()
|
faup = Faup()
|
||||||
|
|
||||||
|
logger_crawler = logging.getLogger('crawlers.log')
|
||||||
|
|
||||||
# # # # # # # #
|
# # # # # # # #
|
||||||
# #
|
# #
|
||||||
# DOMAINS #
|
# DOMAINS #
|
||||||
|
@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
|
||||||
# #
|
# #
|
||||||
# # # # # # # #
|
# # # # # # # #
|
||||||
|
|
||||||
def extract_title_from_html(html):
|
def extract_title_from_html(html, item_id):
|
||||||
signal.alarm(60)
|
signal.alarm(60)
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
@ -333,8 +336,10 @@ def extract_title_from_html(html):
|
||||||
return str(title)
|
return str(title)
|
||||||
except TimeoutException:
|
except TimeoutException:
|
||||||
signal.alarm(0)
|
signal.alarm(0)
|
||||||
|
logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
|
||||||
else:
|
else:
|
||||||
signal.alarm(0)
|
signal.alarm(0)
|
||||||
|
signal.alarm(0)
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def extract_description_from_html(html):
|
def extract_description_from_html(html):
|
||||||
|
|
Loading…
Add table
Reference in a new issue