fix: [favicon] crawler favicon

This commit is contained in:
terrtia 2024-02-21 14:34:20 +01:00
parent c219febd71
commit 81c4dde7b0
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0

View file

@@ -20,6 +20,7 @@ from lib.ConfigLoader import ConfigLoader
 from lib.objects import CookiesNames
 from lib.objects import Etags
 from lib.objects.Domains import Domain
+from lib.objects import Favicons
 from lib.objects.Items import Item
 from lib.objects import Screenshots
 from lib.objects import Titles
@@ -198,6 +199,7 @@ class Crawler(AbstractModule):
 user_agent=task.get_user_agent(),
 proxy=task.get_proxy(),
 cookies=task.get_cookies(),
+with_favicon=True,
 force=force,
 general_timeout_in_sec=90) # TODO increase timeout if onion ????
@@ -245,6 +247,7 @@ class Crawler(AbstractModule):
 parent_id = task.get_parent()
 entries = self.lacus.get_capture(capture.uuid)
 print(entries.get('status'))
 self.har = task.get_har()
 self.screenshot = task.get_screenshot()
@@ -369,6 +372,12 @@ class Crawler(AbstractModule):
 etag.add(self.date.replace('/', ''), self.domain)
 crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', ''))
+# FAVICON
+if entries.get('potential_favicons'):
+for favicon in entries['potential_favicons']:
+fav = Favicons.create(favicon)
+fav.add(item.get_date(), item)
 # Next Children
 entries_children = entries.get('children')
 if entries_children: