mirror of https://github.com/ail-project/ail-framework.git
fix: [crawler] title extraction, sigalarm raised by signal.alarm and sleep

parent 9425e01c85
commit 109ce56a4a

4 changed files with 41 additions and 29 deletions
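In short: the commit moves TimeoutException out of lib/crawlers.py into the shared lib/exceptions.py, installs a SIGALRM handler in Crawler.py, arms a 60-second alarm around BeautifulSoup title extraction, and wraps the affected time.sleep() calls in try/except TimeoutException, since an alarm that fires while the process is sleeping would otherwise escape the loop and kill the module.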
bin/crawlers/Crawler.py

@@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.exceptions import TimeoutException
 from lib.Tag import get_domain_vanity_tags
 from lib.objects import CookiesNames
 from lib.objects import Etags
@@ -30,6 +31,15 @@ from trackers.Tracker_Yara import Tracker_Yara
 
 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
 
+# SIGNAL ALARM
+import signal
+def timeout_handler(signum, frame):
+    raise TimeoutException
+
+
+signal.signal(signal.SIGALRM, timeout_handler)
+
+
 class Crawler(AbstractModule):
 
     def __init__(self):
@@ -104,7 +114,10 @@ class Crawler(AbstractModule):
             self.is_lacus_up = False
         if not self.is_lacus_up:
             print("Can't reach lacus server", int(time.time()))
-            time.sleep(30)
+            try:
+                time.sleep(30)
+            except TimeoutException:
+                pass
 
     def print_crawler_start_info(self, url, domain_url):
         print()
@@ -183,7 +196,10 @@ class Crawler(AbstractModule):
                 capture.update(-1)
                 self.refresh_lacus_status()
 
-        time.sleep(self.pending_seconds)
+        try:
+            time.sleep(self.pending_seconds)
+        except TimeoutException:
+            pass
 
     def enqueue_capture(self, task_uuid, priority):
         task = crawlers.CrawlerTask(task_uuid)
@@ -364,7 +380,16 @@ class Crawler(AbstractModule):
             dom_hash.add(self.date.replace('/', ''), item)
             dom_hash.add_correlation('domain', '', self.domain.id)
 
-        title_content = crawlers.extract_title_from_html(entries['html'], item_id)
+        # TITLE
+        signal.alarm(60)
+        try:
+            title_content = crawlers.extract_title_from_html(entries['html'])
+        except TimeoutException:
+            self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
+            title_content = None
+        else:
+            signal.alarm(0)
+
         if title_content:
            title = Titles.create_title(title_content)
            title.add(item.get_date(), item)
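The pattern added to Crawler.py is the classic Unix one-alarm timeout: install a SIGALRM handler once at module load, arm the alarm before a call that may hang (here, BeautifulSoup parsing of untrusted HTML), and disarm it only on the success path. Below is a minimal standalone sketch of that pattern; parse_title_with_timeout is a hypothetical wrapper name, and the logger/timeout parameters are illustrative, not part of the commit.

import signal

from lib import crawlers                     # import path as used in the diff
from lib.exceptions import TimeoutException  # moved here by this commit

def timeout_handler(signum, frame):
    # Turn the delivered SIGALRM into an exception wherever the
    # interpreter currently is, including inside BeautifulSoup.
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

def parse_title_with_timeout(html, item_id, logger, timeout=60):
    """Hypothetical wrapper mirroring the try/except/else added in Crawler.py."""
    signal.alarm(timeout)   # schedule SIGALRM `timeout` seconds from now
    try:
        title = crawlers.extract_title_from_html(html)
    except TimeoutException:
        logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
        title = None
    else:
        signal.alarm(0)     # parse finished in time: cancel the pending alarm
    return title

Note the asymmetry: signal.alarm(0) sits in the else branch because in the except branch the alarm has already fired and there is nothing left to cancel. The cost is that any other exception leaves the alarm armed, which is plausibly why the commit also wraps the time.sleep() calls.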
bin/lib/crawlers.py

@@ -28,19 +28,6 @@ from pylacus import PyLacus
 
 from pyfaup.faup import Faup
 
-
-import signal
-
-class TimeoutException(Exception):
-    pass
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-
-signal.signal(signal.SIGALRM, timeout_handler)
-
-
 # interact with splash_crawler API
 import requests
 requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -73,7 +60,7 @@ config_loader = None
 
 faup = Faup()
 
-logger_crawler = logging.getLogger('crawlers.log')
+# logger_crawler = logging.getLogger('crawlers.log')
 
 # # # # # # # #
 #             #
@@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
 #             #
 # # # # # # # #
 
-def extract_title_from_html(html, item_id):
-    # signal.alarm(60)
-    # try:
+# /!\ REQUIRE ALARM SIGNAL
+def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
         title = title.string
         if title:
             return str(title)
-    # except TimeoutException:
-    #     signal.alarm(0)
-    #     logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
-    # else:
-    #     signal.alarm(0)
-    # signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
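With the timeout machinery deleted from crawlers.py, extract_title_from_html() no longer guards itself; the "/!\ REQUIRE ALARM SIGNAL" comment makes arming SIGALRM the caller's job. A context manager is one way a caller could make the disarm unconditional. This is an alternative sketch, not what the commit does; alarm_after is a hypothetical helper name.

import signal
from contextlib import contextmanager

@contextmanager
def alarm_after(seconds):
    """Hypothetical helper: arm SIGALRM, always disarm on exit."""
    signal.alarm(seconds)   # SIGALRM fires after `seconds` unless cancelled
    try:
        yield
    finally:
        signal.alarm(0)     # disarm on success, timeout, or any other error

# Usage (TimeoutException is raised by the handler the caller installed):
# with alarm_after(60):
#     title = extract_title_from_html(html)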
bin/lib/exceptions.py

@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 
-from pymisp import PyMISPError
+# from pymisp import PyMISPError
+
+# SIGNAL ALARM
+class TimeoutException(Exception):
+    pass
 
 class AILError(Exception):
     def __init__(self, message):
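Moving TimeoutException into lib/exceptions.py gives it a home that Crawler.py, crawlers.py, and abstract_module.py can all import, presumably avoiding a circular import between lib and modules; the apparently unused PyMISPError import is commented out in passing.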
bin/modules/abstract_module.py

@@ -21,7 +21,7 @@ sys.path.append(os.environ['AIL_BIN'])
 from lib import ail_logger
 from lib.ail_queues import AILQueue
 from lib import regex_helper
-from lib.exceptions import ModuleQueueError
+from lib.exceptions import ModuleQueueError, TimeoutException
 from lib.objects.ail_objects import get_obj_from_global_id
 
 logging.config.dictConfig(ail_logger.get_config(name='modules'))
@@ -193,7 +193,10 @@ class AbstractModule(ABC):
                 self.computeNone()
                 # Wait before next process
                 self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
-                time.sleep(self.pending_seconds)
+                try:
+                    time.sleep(self.pending_seconds)
+                except TimeoutException:
+                    pass
 
     def _module_name(self):
         """
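Why wrap a plain time.sleep() at all? signal.alarm() delivers SIGALRM even while the process is sleeping, and the handler's TimeoutException then surfaces inside sleep(); without the try/except, an alarm left armed elsewhere would crash the module's idle loop. A small self-contained demo of the effect (Unix only, standard library only; nothing here is from the commit itself):

import signal
import time

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

signal.alarm(1)      # a "stray" alarm armed before the idle sleep
try:
    time.sleep(10)   # interrupted after ~1s: the handler raises mid-sleep
except TimeoutException:
    print("SIGALRM interrupted the sleep")   # prints after about 1 second
finally:
    signal.alarm(0)  # leave no alarm pending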