From 28c647d370aaff95e56e6adc912c39f273dff005 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Jul 2023 15:56:34 +0200
Subject: [PATCH] chg: [crawler har] compress HAR

---
 bin/lib/crawlers.py        | 37 +++++++++++++++++++++++++++++--------
 bin/lib/item_basic.py      |  2 +-
 bin/lib/objects/Domains.py |  2 +-
 bin/modules/Mixer.py       |  2 +-
 4 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 406d9ccf..d52e6a22 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -268,7 +268,7 @@ def extract_author_from_html(html):
 
 def create_har_id(date, item_id):
     item_id = item_id.split('/')[-1]
-    return os.path.join(date, f'{item_id}.json')
+    return os.path.join(date, f'{item_id}.json.gz')
 
 def save_har(har_id, har_content):
     # create dir
@@ -277,8 +277,8 @@ def save_har(har_id, har_content):
         os.makedirs(har_dir)
     # save HAR
     filename = os.path.join(get_har_dir(), har_id)
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
+    with gzip.open(filename, 'wb') as f:
+        f.write(json.dumps(har_content).encode())
 
 def get_all_har_ids():
     har_ids = []
@@ -308,11 +308,15 @@ def get_all_har_ids():
 
 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
-    with open(har_path) as f:
-        try:
-            return json.loads(f.read())
-        except json.decoder.JSONDecodeError:
-            return {}
+    try:
+        with gzip.open(har_path) as f:
+            try:
+                return json.loads(f.read())
+            except json.decoder.JSONDecodeError:
+                return {}
+    except Exception as e:
+        print(e) # TODO LOGS
+        return {}
 
 def extract_cookies_names_from_har(har):
     cookies = set()
@@ -362,6 +366,22 @@ def _reprocess_all_hars_etag():
             etag = Etags.create(etag_content)
             etag.add(date, domain)
 
+def _gzip_all_hars():
+    for har_id in get_all_har_ids():
+        har_path = os.path.join(HAR_DIR, har_id)
+        new_id = f'{har_path}.gz'
+        if not har_id.endswith('.gz'):
+            if not os.path.exists(new_id):
+                with open(har_path, 'rb') as f:
+                    content = f.read()
+                if content:
+                    with gzip.open(new_id, 'wb') as f:
+                        r = f.write(content)
+                        print(r)
+            if os.path.exists(new_id) and os.path.exists(har_path):
+                os.remove(har_path)
+                print('delete:', har_path)
+
 # # # - - # # #
 
 ################################################################################
@@ -1944,3 +1964,4 @@ load_blacklist()
 # print(r)
 # _reprocess_all_hars_cookie_name()
 # _reprocess_all_hars_etag()
+# _gzip_all_hars()
diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py
index fdfe1059..71fa5378 100755
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@@ -129,7 +129,7 @@ def get_item_url(item_id):
 
 def get_item_har(item_id):
     har = '/'.join(item_id.rsplit('/')[-4:])
-    har = f'{har}.json'
+    har = f'{har}.json.gz'
     path = os.path.join(ConfigLoader.get_hars_dir(), har)
     if os.path.isfile(path):
         return har
diff --git a/bin/lib/objects/Domains.py b/bin/lib/objects/Domains.py
index ad494f3d..ede2cbea 100755
--- a/bin/lib/objects/Domains.py
+++ b/bin/lib/objects/Domains.py
@@ -389,7 +389,7 @@ class Domain(AbstractObject):
                 har = get_item_har(item_id)
                 if har:
                     print(har)
-                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json')
+                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
             # Screenshot
             screenshot = self._get_external_correlation('item', '', item_id, 'screenshot')
             if screenshot and screenshot['screenshot']:
diff --git a/bin/modules/Mixer.py b/bin/modules/Mixer.py
index 49cd3046..59b202d1 100755
--- a/bin/modules/Mixer.py
+++ b/bin/modules/Mixer.py
@@ -131,7 +131,7 @@ class Mixer(AbstractModule):
 
             self.last_refresh = time.time()
             self.clear_feeders_stat()
-            time.sleep(0.5)
+        time.sleep(0.5)
 
     def computeNone(self):
         self.refresh_stats()
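
For reviewers who want to exercise the new round-trip in isolation: a minimal
sketch of the gzip+JSON semantics this patch introduces. The directory and the
helper names below are illustrative, not AIL APIs; only the 'wb'/.encode()
write path and the empty-dict fallback mirror the patched save_har() and
get_har_content().

    import gzip
    import json
    import os

    HAR_DIR = '/tmp/hars'  # example directory, not the configured HARS dir

    def save_har_gz(path, har_content):
        # gzip.open(..., 'wb') takes bytes, hence json.dumps(...).encode()
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with gzip.open(path, 'wb') as f:
            f.write(json.dumps(har_content).encode())

    def load_har_gz(path):
        # like the patched get_har_content(): any gzip or JSON error yields {}
        try:
            with gzip.open(path) as f:  # default mode is 'rb'
                return json.loads(f.read())
        except Exception:
            return {}

    path = os.path.join(HAR_DIR, '20230710', 'example.json.gz')
    save_har_gz(path, {'log': {'entries': []}})
    assert load_har_gz(path) == {'log': {'entries': []}}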