fix: [crawler] title extraction, SIGALRM raised by signal.alarm during sleep

terrtia 2025-01-09 11:09:36 +01:00
parent 9425e01c85
commit 109ce56a4a
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 41 additions and 29 deletions

View file

@@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.exceptions import TimeoutException
 from lib.Tag import get_domain_vanity_tags
 from lib.objects import CookiesNames
 from lib.objects import Etags
@@ -30,6 +31,15 @@ from trackers.Tracker_Yara import Tracker_Yara

 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))

+# SIGNAL ALARM
+import signal
+
+def timeout_handler(signum, frame):
+    raise TimeoutException
+
+signal.signal(signal.SIGALRM, timeout_handler)
+
 class Crawler(AbstractModule):

     def __init__(self):
@@ -104,7 +114,10 @@ class Crawler(AbstractModule):
             self.is_lacus_up = False
         if not self.is_lacus_up:
             print("Can't reach lacus server", int(time.time()))
-            time.sleep(30)
+            try:
+                time.sleep(30)
+            except TimeoutException:
+                pass

     def print_crawler_start_info(self, url, domain_url):
         print()
@@ -183,7 +196,10 @@ class Crawler(AbstractModule):
                 capture.update(-1)
             self.refresh_lacus_status()

-        time.sleep(self.pending_seconds)
+        try:
+            time.sleep(self.pending_seconds)
+        except TimeoutException:
+            pass

     def enqueue_capture(self, task_uuid, priority):
         task = crawlers.CrawlerTask(task_uuid)
@@ -364,7 +380,16 @@ class Crawler(AbstractModule):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)

-        title_content = crawlers.extract_title_from_html(entries['html'], item_id)
+        # TITLE
+        signal.alarm(60)
+        try:
+            title_content = crawlers.extract_title_from_html(entries['html'])
+        except TimeoutException:
+            self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
+            title_content = None
+        else:
+            signal.alarm(0)
+
         if title_content:
             title = Titles.create_title(title_content)
             title.add(item.get_date(), item)
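
The watchdog pattern this commit settles on, as a standalone sketch (extract_title below is a stand-in for crawlers.extract_title_from_html, and title_with_watchdog is an illustrative name, not repo code):

    import signal

    from bs4 import BeautifulSoup

    class TimeoutException(Exception):
        pass

    def timeout_handler(signum, frame):
        raise TimeoutException

    # Installed once at module load, as in Crawler.py above.
    signal.signal(signal.SIGALRM, timeout_handler)

    def extract_title(html):
        # Stand-in for crawlers.extract_title_from_html.
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title
        return str(title.string) if title and title.string else ''

    def title_with_watchdog(html, seconds=60):
        signal.alarm(seconds)          # arm: SIGALRM fires after `seconds`
        try:
            title = extract_title(html)
        except TimeoutException:
            title = None               # parse blew the budget; the alarm already fired
        else:
            signal.alarm(0)            # disarm: parse finished in time
        return title

Note the asymmetry: only the else path disarms, because a raised TimeoutException means the timer already expired. Any other exception between alarm(seconds) and alarm(0) would leave the timer armed, which is plausibly how SIGALRM ends up firing inside the time.sleep calls this commit also guards.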

View file

@@ -28,19 +28,6 @@ from pylacus import PyLacus
 from pyfaup.faup import Faup

-import signal
-
-class TimeoutException(Exception):
-    pass
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-signal.signal(signal.SIGALRM, timeout_handler)
-
 # interact with splash_crawler API
 import requests
 requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -73,7 +60,7 @@ config_loader = None

 faup = Faup()

-logger_crawler = logging.getLogger('crawlers.log')
+# logger_crawler = logging.getLogger('crawlers.log')

 # # # # # # # #
 # #
@@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
 # #
 # # # # # # # #

-def extract_title_from_html(html, item_id):
-    # signal.alarm(60)
-    # try:
+# /!\ REQUIRE ALARM SIGNAL
+def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
         title = title.string
         if title:
             return str(title)
-    # except TimeoutException:
-    #     signal.alarm(0)
-    #     logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
-    # else:
-    #     signal.alarm(0)
-    # signal.alarm(0)
     return ''

 def extract_description_from_html(html):
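
The double `if title:` check in the rewritten extract_title_from_html is deliberate: soup.title is None when the page has no <title> tag, and title.string is None when the tag exists but is empty. A quick standalone illustration (bs4 only, sample HTML is made up):

    from bs4 import BeautifulSoup

    samples = [
        '<html><head><title>AIL</title></head></html>',  # title.string == 'AIL'
        '<html><head></head></html>',                    # no <title>: soup.title is None
        '<html><head><title></title></head></html>',     # empty tag: title.string is None
    ]
    for html in samples:
        title = BeautifulSoup(html, 'html.parser').title
        print(str(title.string) if title and title.string else '')  # 'AIL', '', ''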

View file

@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*

-from pymisp import PyMISPError
+# from pymisp import PyMISPError
+
+# SIGNAL ALARM
+class TimeoutException(Exception):
+    pass

 class AILError(Exception):
     def __init__(self, message):
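
Centralizing TimeoutException in lib/exceptions.py gives Crawler.py and abstract_module.py a single shared definition without importing lib.crawlers. It also subclasses Exception directly rather than AILError, so existing `except AILError` handlers cannot swallow the watchdog. A small check (assuming lib.exceptions is importable, as in the modules above):

    from lib.exceptions import AILError, TimeoutException

    try:
        raise TimeoutException
    except AILError:
        print('not reached: TimeoutException is not an AILError')
    except TimeoutException:
        print('watchdog signal handled on its own path')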

View file

@@ -21,7 +21,7 @@ sys.path.append(os.environ['AIL_BIN'])
 from lib import ail_logger
 from lib.ail_queues import AILQueue
 from lib import regex_helper
-from lib.exceptions import ModuleQueueError
+from lib.exceptions import ModuleQueueError, TimeoutException
 from lib.objects.ail_objects import get_obj_from_global_id

 logging.config.dictConfig(ail_logger.get_config(name='modules'))
@@ -193,7 +193,10 @@ class AbstractModule(ABC):
                     self.computeNone()
                 # Wait before next process
                 self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
-                time.sleep(self.pending_seconds)
+                try:
+                    time.sleep(self.pending_seconds)
+                except TimeoutException:
+                    pass

     def _module_name(self):
         """