fix: [crawler] crawler capture with empty task

terrtia 2024-09-16 10:50:34 +02:00
parent 907f370b29
commit 7b66ff6a8c
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0


@@ -430,7 +430,6 @@ def _reprocess_all_hars_hhhashs():
         extract_hhhash_by_id(har_id, domain, date)
 
-
 
 def _gzip_har(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
     new_id = f'{har_path}.gz'
@@ -1436,8 +1435,8 @@ class CrawlerCapture:
         launch_time = int(time.time())
         r_crawler.hset(f'crawler:task:{task_uuid}', 'capture', self.uuid)
         r_crawler.hset('crawler:captures:tasks', self.uuid, task_uuid)
-        r_crawler.zadd('crawler:captures', {self.uuid: launch_time})
         r_cache.hset(f'crawler:capture:{self.uuid}', 'launch_time', launch_time)
+        r_crawler.zadd('crawler:captures', {self.uuid: launch_time})
         r_cache.zadd('crawler:captures', {self.uuid: launch_time})
 
     def update(self, status):
@@ -1481,15 +1480,24 @@ def get_captures_status():
     for capture_uuid in get_crawler_captures():
         capture = CrawlerCapture(capture_uuid)
         task = capture.get_task()
-        domain = task.get_domain()
-        dom = Domain(domain)
-        meta = {
-            'uuid': task.uuid,
-            'domain': dom.get_id(),
-            'type': dom.get_domain_type(),
-            'start_time': capture.get_start_time(),
-            'status': capture.get_status(),
-        }
+        if not task:
+            meta = {
+                'uuid': 'UNKNOWN',
+                'domain': 'UNKNOWN',
+                'type': 'UNKNOWN',
+                'start_time': capture.get_start_time(),
+                'status': capture.get_status(),
+            }
+        else:
+            domain = task.get_domain()
+            dom = Domain(domain)
+            meta = {
+                'uuid': task.uuid,
+                'domain': dom.get_id(),
+                'type': dom.get_domain_type(),
+                'start_time': capture.get_start_time(),
+                'status': capture.get_status(),
+            }
         capture_status = capture.get_status()
         if capture_status:
             capture_status = CaptureStatus(int(capture_status)).name
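The hunk above is the fix named in the commit title: a capture can stay registered in 'crawler:captures' after its task entry is gone, in which case capture.get_task() returns a falsy value and the old code then failed on task.get_domain(). A minimal, runnable sketch of that failure mode and of the guard, using hypothetical stand-in classes rather than the project's real CrawlerCapture/CrawlerTask/Domain objects:

# Sketch only: FakeTask and FakeCapture are hypothetical stand-ins, not AIL classes.
class FakeTask:
    uuid = 'task-uuid'
    def get_domain(self):
        return 'example.onion'

class FakeCapture:
    def __init__(self, task=None):
        self._task = task
    def get_task(self):
        # None models a capture whose task was deleted (an "empty task")
        return self._task

def capture_meta(capture):
    task = capture.get_task()
    if not task:
        # Guard equivalent to the one added above: fall back to placeholder
        # metadata instead of calling methods on None
        return {'uuid': 'UNKNOWN', 'domain': 'UNKNOWN', 'type': 'UNKNOWN'}
    return {'uuid': task.uuid, 'domain': task.get_domain(), 'type': 'onion'}

print(capture_meta(FakeCapture(FakeTask())))  # normal capture
print(capture_meta(FakeCapture()))            # orphaned capture, no AttributeError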
@@ -1502,7 +1510,7 @@ def delete_captures():
         capture = CrawlerCapture(capture_uuid)
         capture.delete()
 
-##-- CRAWLER STATE --##
+## --CRAWLER STATE-- ##
 
 
 #### CRAWLER TASK ####
@@ -1848,13 +1856,13 @@ def api_add_crawler_task(data, user_org, user_id=None):
     if frequency:
         # TODO verify user
         task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
                                     cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
     else:
         # TODO HEADERS
         # TODO USER AGENT
         task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
                                 cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
                                 parent='manual', priority=90)
 
     return {'uuid': task_uuid}, 200
@@ -1919,10 +1927,10 @@ def create_item_id(item_dir, domain):
     # remove /
     domain = domain.replace('/', '_')
     if len(domain) > 215:
-        UUID = domain[-215:]+str(uuid.uuid4())
+        n_uuid = domain[-215:]+str(uuid.uuid4())
     else:
-        UUID = domain+str(uuid.uuid4())
-    return os.path.join(item_dir, UUID)
+        n_uuid = domain+str(uuid.uuid4())
+    return os.path.join(item_dir, n_uuid)
 
 # # # # # # # # # # # #
 # #
@@ -2160,15 +2168,15 @@ def test_ail_crawlers():
 load_blacklist()
 
 # if __name__ == '__main__':
 #     delete_captures()
 #
 #     item_id = 'crawled/2023/02/20/data.gz'
 #     item = Item(item_id)
 #     content = item.get_content()
 #     temp_url = ''
 #     r = extract_favicon_from_html(content, temp_url)
 #     print(r)
 #     _reprocess_all_hars_cookie_name()
 #     _reprocess_all_hars_etag()
 #     _gzip_all_hars()
 #     _reprocess_all_hars_hhhashs()