chg: [crawler har] compress HAR

Terrtia 2023-07-10 15:56:34 +02:00
parent c719990125
commit 28c647d370
GPG key ID: 1E1B1F50D84613D0 (no known key found for this signature in the database)
4 changed files with 32 additions and 11 deletions


@@ -268,7 +268,7 @@ def extract_author_from_html(html):

 def create_har_id(date, item_id):
     item_id = item_id.split('/')[-1]
-    return os.path.join(date, f'{item_id}.json')
+    return os.path.join(date, f'{item_id}.json.gz')

 def save_har(har_id, har_content):
     # create dir
@@ -277,8 +277,8 @@ def save_har(har_id, har_content):
         os.makedirs(har_dir)
     # save HAR
     filename = os.path.join(get_har_dir(), har_id)
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
+    with gzip.open(filename, 'wb') as f:
+        f.write(json.dumps(har_content).encode())

 def get_all_har_ids():
     har_ids = []
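For context, here is how the new write path behaves end to end. A minimal sketch using only the standard library, with a made-up directory and payload (the real paths come from get_har_dir() and create_har_id()); gzip.open() in 'wb' mode expects bytes, which is why the JSON string is now .encode()d before writing:

import gzip
import json
import os

# Hypothetical stand-ins for get_har_dir() and create_har_id()
har_dir = '/tmp/hars/2023-07-10'
os.makedirs(har_dir, exist_ok=True)
filename = os.path.join(har_dir, 'example.json.gz')

har_content = {'log': {'version': '1.2', 'entries': []}}
with gzip.open(filename, 'wb') as f:
    f.write(json.dumps(har_content).encode())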
@@ -308,11 +308,15 @@ def get_all_har_ids():

 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
-    with open(har_path) as f:
-        try:
-            return json.loads(f.read())
-        except json.decoder.JSONDecodeError:
-            return {}
+    try:
+        with gzip.open(har_path) as f:
+            try:
+                return json.loads(f.read())
+            except json.decoder.JSONDecodeError:
+                return {}
+    except Exception as e:
+        print(e)  # TODO LOGS
+        return {}

 def extract_cookies_names_from_har(har):
     cookies = set()
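Continuing the sketch above, the matching read path: gzip.open() defaults to binary mode and json.loads() accepts bytes directly on Python 3.6+, so no explicit decode is needed. The outer try mirrors the new get_har_content(), turning a missing or corrupt archive into an empty dict rather than an exception:

def read_har(filename):
    try:
        with gzip.open(filename) as f:
            try:
                return json.loads(f.read())
            except json.decoder.JSONDecodeError:
                return {}
    except Exception as e:  # e.g. FileNotFoundError, gzip.BadGzipFile
        print(e)
        return {}

assert read_har(filename) == har_content
assert read_har('/tmp/hars/does-not-exist.json.gz') == {}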
@@ -362,6 +366,22 @@ def _reprocess_all_hars_etag():
             etag = Etags.create(etag_content)
             etag.add(date, domain)

+def _gzip_all_hars():
+    for har_id in get_all_har_ids():
+        har_path = os.path.join(HAR_DIR, har_id)
+        new_id = f'{har_path}.gz'
+        if not har_id.endswith('.gz'):
+            if not os.path.exists(new_id):
+                with open(har_path, 'rb') as f:
+                    content = f.read()
+                if content:
+                    with gzip.open(new_id, 'wb') as f:
+                        r = f.write(content)
+                        print(r)
+            if os.path.exists(new_id) and os.path.exists(har_path):
+                os.remove(har_path)
+                print('delete:', har_path)
+
 # # # - - # # # # # # - - # # #

 ################################################################################
@@ -1944,3 +1964,4 @@ load_blacklist()
 # print(r)
 # _reprocess_all_hars_cookie_name()
 # _reprocess_all_hars_etag()
+# _gzip_all_hars()
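_gzip_all_hars() is a one-shot migration for HARs written before this change: it gzips every legacy file and deletes the original only once the .gz copy exists on disk, so it is safe to re-run after an interruption. Like the other _reprocess_* helpers, it is left as a commented-out call at the bottom of the module, to be run once by hand. A rough standalone equivalent, assuming legacy HARs are plain *.json files under some root directory (har_root is a placeholder, not a name from the codebase):

import glob

def gzip_legacy_hars(har_root):
    for har_path in glob.glob(os.path.join(har_root, '**', '*.json'), recursive=True):
        new_path = f'{har_path}.gz'
        if not os.path.exists(new_path):  # skip files migrated on a previous run
            with open(har_path, 'rb') as f:
                content = f.read()
            if content:  # never create an archive for an empty file
                with gzip.open(new_path, 'wb') as f:
                    f.write(content)
        if os.path.exists(new_path):  # delete the original only once the .gz exists
            os.remove(har_path)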


@@ -129,7 +129,7 @@ def get_item_url(item_id):

 def get_item_har(item_id):
     har = '/'.join(item_id.rsplit('/')[-4:])
-    har = f'{har}.json'
+    har = f'{har}.json.gz'
     path = os.path.join(ConfigLoader.get_hars_dir(), har)
     if os.path.isfile(path):
         return har
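The lookup in get_item_har() only changes the expected extension. A worked example with a hypothetical item id, assuming the last four path segments identify the HAR:

item_id = 'crawled/2023/07/10/example.onion-abc123'  # hypothetical id
har = '/'.join(item_id.rsplit('/')[-4:])             # '2023/07/10/example.onion-abc123'
har = f'{har}.json.gz'                               # '2023/07/10/example.onion-abc123.json.gz'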


@@ -389,7 +389,7 @@ class Domain(AbstractObject):
                 har = get_item_har(item_id)
                 if har:
                     print(har)
-                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json')
+                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
                 # Screenshot
                 screenshot = self._get_external_correlation('item', '', item_id, 'screenshot')
                 if screenshot and screenshot['screenshot']:
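Finally, the domain export keeps the HAR compressed inside the downloaded zip, so the entry name gains the .gz suffix as well. _write_in_zip_buffer() itself is not part of this diff; a plausible minimal version of the pattern, with hypothetical names and reusing filename from the first sketch, simply copies the on-disk file into the archive:

import zipfile
from io import BytesIO

def _write_in_zip_buffer(zf, filepath, entry_name):
    # sketch: store the (already gzipped) file as-is under entry_name
    with open(filepath, 'rb') as f:
        zf.writestr(entry_name, f.read())

buffer = BytesIO()
with zipfile.ZipFile(buffer, 'w') as zf:
    _write_in_zip_buffer(zf, filename, 'example.json.gz')
# consumers unzip, then gunzip the entry to recover the HAR JSON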