fix: [crawler] title extraction, SIGALRM raised by signal.alarm and sleep

terrtia 2025-01-09 11:09:36 +01:00
parent 9425e01c85
commit 109ce56a4a
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 41 additions and 29 deletions
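The failure mode behind this fix: lib/crawlers.py installed a process-wide SIGALRM handler at import time, and an alarm that was scheduled but never cancelled could fire much later inside an unrelated time.sleep(), surfacing as an uncaught TimeoutException in the module loop. A minimal standalone sketch of that interaction (illustrative, Unix-only, not AIL code):

```python
import signal
import time

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# Process-wide handler, as lib/crawlers.py used to install at import time
signal.signal(signal.SIGALRM, timeout_handler)

signal.alarm(2)      # scheduled to bound some operation...
# ...which returns early without calling signal.alarm(0)

try:
    time.sleep(30)   # the stale alarm fires ~2s into this sleep
except TimeoutException:
    print("sleep interrupted by a leftover SIGALRM")
```

Hence the two-part fix below: every idle time.sleep() now tolerates TimeoutException, and the alarm is armed and cleared at the single call site that needs it.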

bin/crawlers/Crawler.py

@@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.exceptions import TimeoutException
 from lib.Tag import get_domain_vanity_tags
 from lib.objects import CookiesNames
 from lib.objects import Etags
@@ -30,6 +31,15 @@ from trackers.Tracker_Yara import Tracker_Yara
 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
 
+# SIGNAL ALARM
+import signal
+
+def timeout_handler(signum, frame):
+    raise TimeoutException
+
+signal.signal(signal.SIGALRM, timeout_handler)
+
+
 class Crawler(AbstractModule):
     def __init__(self):
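Two constraints of Python's signal module are worth keeping in mind with this import-time registration: handlers may only be installed from the main thread of the main interpreter, and a process has a single alarm timer, so alarms cannot nest; each signal.alarm(n) replaces any previously scheduled alarm and returns the seconds that were remaining. A small illustration, separate from the AIL code:

```python
import signal

signal.signal(signal.SIGALRM, lambda signum, frame: None)  # main thread only

signal.alarm(60)              # schedule an alarm in 60s
remaining = signal.alarm(10)  # replaces it; returns ~60 (seconds that were pending)
print(remaining)
signal.alarm(0)               # cancel the pending alarm
```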
@@ -104,7 +114,10 @@ class Crawler(AbstractModule):
             self.is_lacus_up = False
         if not self.is_lacus_up:
             print("Can't reach lacus server", int(time.time()))
-            time.sleep(30)
+            try:
+                time.sleep(30)
+            except TimeoutException:
+                pass
 
     def print_crawler_start_info(self, url, domain_url):
         print()
@@ -183,7 +196,10 @@ class Crawler(AbstractModule):
                 capture.update(-1)
         self.refresh_lacus_status()
 
-        time.sleep(self.pending_seconds)
+        try:
+            time.sleep(self.pending_seconds)
+        except TimeoutException:
+            pass
 
     def enqueue_capture(self, task_uuid, priority):
         task = crawlers.CrawlerTask(task_uuid)
@ -364,7 +380,16 @@ class Crawler(AbstractModule):
dom_hash.add(self.date.replace('/', ''), item)
dom_hash.add_correlation('domain', '', self.domain.id)
title_content = crawlers.extract_title_from_html(entries['html'], item_id)
# TITLE
signal.alarm(60)
try:
title_content = crawlers.extract_title_from_html(entries['html'])
except TimeoutException:
self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
title_content = None
else:
signal.alarm(0)
if title_content:
title = Titles.create_title(title_content)
title.add(item.get_date(), item)
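The commit clears the alarm in the else branch, which is sufficient here because the guarded sleeps elsewhere now absorb a leftover alarm. A try/finally variant guarantees the timer is cancelled even when the parse raises something other than TimeoutException; a sketch of such a wrapper (a hypothetical helper, not part of this commit, assuming the handler above is installed and AIL's lib.exceptions is importable):

```python
import signal

from lib.exceptions import TimeoutException  # as introduced by this commit

def run_with_alarm(seconds, func, *args, default=None):
    # Hypothetical helper: run func(*args) under a SIGALRM deadline.
    signal.alarm(seconds)
    try:
        return func(*args)
    except TimeoutException:
        return default
    finally:
        signal.alarm(0)  # cancelled on success and on any exception

# e.g.: title_content = run_with_alarm(60, crawlers.extract_title_from_html, entries['html'])
```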

bin/lib/crawlers.py

@@ -28,19 +28,6 @@ from pylacus import PyLacus
 from pyfaup.faup import Faup
 
-import signal
-
-class TimeoutException(Exception):
-    pass
-
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-
-signal.signal(signal.SIGALRM, timeout_handler)
-
 # interact with splash_crawler API
 import requests
 requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -73,7 +60,7 @@ config_loader = None
 faup = Faup()
 
-logger_crawler = logging.getLogger('crawlers.log')
+# logger_crawler = logging.getLogger('crawlers.log')
 
 # # # # # # # #
 # #
@@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
 # #
 # # # # # # # #
 
-def extract_title_from_html(html, item_id):
-    # signal.alarm(60)
-    # try:
+# /!\ REQUIRE ALARM SIGNAL
+def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
         title = title.string
         if title:
             return str(title)
-    # except TimeoutException:
-    #     signal.alarm(0)
-    #     logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
-    # else:
-    #     signal.alarm(0)
-    # signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
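After the refactor the function is pure parsing; the /!\ REQUIRE ALARM SIGNAL comment shifts responsibility for bounding it to the caller, since BeautifulSoup can take arbitrarily long on very large or malformed documents. Expected behavior on small inputs (assuming lib.crawlers is importable, e.g. with AIL_BIN on sys.path):

```python
from lib.crawlers import extract_title_from_html

print(extract_title_from_html('<html><head><title>Example</title></head></html>'))  # 'Example'
print(extract_title_from_html('<p>no title tag</p>'))                               # ''
```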

bin/lib/exceptions.py

@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 
-from pymisp import PyMISPError
+# from pymisp import PyMISPError
+
+# SIGNAL ALARM
+class TimeoutException(Exception):
+    pass
 
 class AILError(Exception):
     def __init__(self, message):
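TimeoutException now lives alongside the other AIL exceptions but subclasses Exception directly, not AILError, so a broad `except AILError` in module code will not swallow a timeout. A quick check of that distinction (assuming lib.exceptions is importable):

```python
from lib.exceptions import AILError, TimeoutException

try:
    raise TimeoutException
except AILError:
    print("not reached: TimeoutException is not an AILError")
except TimeoutException:
    print("timeouts are handled on their own path")
```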

bin/modules/abstract_module.py

@@ -21,7 +21,7 @@ sys.path.append(os.environ['AIL_BIN'])
 from lib import ail_logger
 from lib.ail_queues import AILQueue
 from lib import regex_helper
-from lib.exceptions import ModuleQueueError
+from lib.exceptions import ModuleQueueError, TimeoutException
 from lib.objects.ail_objects import get_obj_from_global_id
 
 logging.config.dictConfig(ail_logger.get_config(name='modules'))
@@ -193,7 +193,10 @@ class AbstractModule(ABC):
                 self.computeNone()
                 # Wait before next process
                 self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
-                time.sleep(self.pending_seconds)
+                try:
+                    time.sleep(self.pending_seconds)
+                except TimeoutException:
+                    pass
 
     def _module_name(self):
         """