mirror of
https://github.com/ail-project/ail-framework.git
synced 2025-01-18 08:26:15 +00:00
fix: [crawler] title extraction, SIGALRM raised by signal.alarm during sleep
This commit is contained in:
parent
9425e01c85
commit
109ce56a4a
4 changed files with 41 additions and 29 deletions
|
@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
|
|||
from lib import ail_logger
|
||||
from lib import crawlers
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
from lib.exceptions import TimeoutException
|
||||
from lib.Tag import get_domain_vanity_tags
|
||||
from lib.objects import CookiesNames
|
||||
from lib.objects import Etags
|
||||
|
@ -30,6 +31,15 @@ from trackers.Tracker_Yara import Tracker_Yara
|
|||
|
||||
logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
|
||||
|
||||
# SIGNAL ALARM
|
||||
import signal
|
||||
def timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted code with TimeoutException.

    Registered for SIGALRM at module level; the ``signum`` and ``frame``
    arguments are supplied by the ``signal`` module and are not used.

    :raises TimeoutException: always.
    """
    raise TimeoutException()
|
||||
|
||||
|
||||
signal.signal(signal.SIGALRM, timeout_handler)
|
||||
|
||||
|
||||
class Crawler(AbstractModule):
|
||||
|
||||
def __init__(self):
|
||||
|
@ -104,7 +114,10 @@ class Crawler(AbstractModule):
|
|||
self.is_lacus_up = False
|
||||
if not self.is_lacus_up:
|
||||
print("Can't reach lacus server", int(time.time()))
|
||||
time.sleep(30)
|
||||
try:
|
||||
time.sleep(30)
|
||||
except TimeoutException:
|
||||
pass
|
||||
|
||||
def print_crawler_start_info(self, url, domain_url):
|
||||
print()
|
||||
|
@ -183,7 +196,10 @@ class Crawler(AbstractModule):
|
|||
capture.update(-1)
|
||||
self.refresh_lacus_status()
|
||||
|
||||
time.sleep(self.pending_seconds)
|
||||
try:
|
||||
time.sleep(self.pending_seconds)
|
||||
except TimeoutException:
|
||||
pass
|
||||
|
||||
def enqueue_capture(self, task_uuid, priority):
|
||||
task = crawlers.CrawlerTask(task_uuid)
|
||||
|
@ -364,7 +380,16 @@ class Crawler(AbstractModule):
|
|||
dom_hash.add(self.date.replace('/', ''), item)
|
||||
dom_hash.add_correlation('domain', '', self.domain.id)
|
||||
|
||||
title_content = crawlers.extract_title_from_html(entries['html'], item_id)
|
||||
# TITLE
|
||||
signal.alarm(60)
|
||||
try:
|
||||
title_content = crawlers.extract_title_from_html(entries['html'])
|
||||
except TimeoutException:
|
||||
self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
|
||||
title_content = None
|
||||
else:
|
||||
signal.alarm(0)
|
||||
|
||||
if title_content:
|
||||
title = Titles.create_title(title_content)
|
||||
title.add(item.get_date(), item)
|
||||
|
|
|
@ -28,19 +28,6 @@ from pylacus import PyLacus
|
|||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
|
||||
import signal
|
||||
|
||||
class TimeoutException(Exception):
|
||||
pass
|
||||
|
||||
def timeout_handler(signum, frame):
|
||||
raise TimeoutException
|
||||
|
||||
|
||||
signal.signal(signal.SIGALRM, timeout_handler)
|
||||
|
||||
|
||||
# interact with splash_crawler API
|
||||
import requests
|
||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
|
@ -73,7 +60,7 @@ config_loader = None
|
|||
|
||||
faup = Faup()
|
||||
|
||||
logger_crawler = logging.getLogger('crawlers.log')
|
||||
# logger_crawler = logging.getLogger('crawlers.log')
|
||||
|
||||
# # # # # # # #
|
||||
# #
|
||||
|
@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
|
|||
# #
|
||||
# # # # # # # #
|
||||
|
||||
def extract_title_from_html(html, item_id):
|
||||
# signal.alarm(60)
|
||||
# try:
|
||||
# /!\ REQUIRE ALARM SIGNAL
|
||||
def extract_title_from_html(html):
    """Return the text of the HTML ``<title>`` element, or '' if unavailable.

    The function sets no timeout of its own: the caller is expected to arm a
    SIGALRM alarm around the call and handle the resulting TimeoutException
    (the crawler module does so with ``signal.alarm(60)``).

    :param html: HTML document to parse
    :return: title as ``str``; empty string when there is no title element
             or when its ``.string`` is empty/None
    """
    document = BeautifulSoup(html, 'html.parser')
    title_tag = document.title
    # No <title> element in the document
    if not title_tag:
        return ''
    title_text = title_tag.string
    # <title> present but without a usable string
    if not title_text:
        return ''
    return str(title_text)
|
||||
|
||||
def extract_description_from_html(html):
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
from pymisp import PyMISPError
|
||||
# from pymisp import PyMISPError
|
||||
|
||||
# SIGNAL ALARM
|
||||
class TimeoutException(Exception):
    """Signals that a SIGALRM-based timeout expired."""
|
||||
|
||||
class AILError(Exception):
|
||||
def __init__(self, message):
|
||||
|
|
|
@ -21,7 +21,7 @@ sys.path.append(os.environ['AIL_BIN'])
|
|||
from lib import ail_logger
|
||||
from lib.ail_queues import AILQueue
|
||||
from lib import regex_helper
|
||||
from lib.exceptions import ModuleQueueError
|
||||
from lib.exceptions import ModuleQueueError, TimeoutException
|
||||
from lib.objects.ail_objects import get_obj_from_global_id
|
||||
|
||||
logging.config.dictConfig(ail_logger.get_config(name='modules'))
|
||||
|
@ -193,7 +193,10 @@ class AbstractModule(ABC):
|
|||
self.computeNone()
|
||||
# Wait before next process
|
||||
self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
|
||||
time.sleep(self.pending_seconds)
|
||||
try:
|
||||
time.sleep(self.pending_seconds)
|
||||
except TimeoutException:
|
||||
pass
|
||||
|
||||
def _module_name(self):
|
||||
"""
|
||||
|
|
Loading…
Add table
Reference in a new issue