Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-13 01:58:22 +00:00)
chg: [crawler har] compress HAR

parent c719990125
commit 28c647d370

4 changed files with 32 additions and 11 deletions
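In short: HAR files captured by the crawler are now written gzip-compressed (item_id.json.gz instead of item_id.json), readers go through gzip, and a one-off helper compresses HARs already on disk. The hunks below assume the gzip module is imported in the touched files; the import lines are not part of the visible hunks. The new write/read round trip reduces to this standard-library sketch, with a placeholder dict standing in for a real HAR:

    import gzip
    import json

    har_content = {'log': {'entries': []}}  # stand-in for a real HAR

    # write: serialize to JSON, encode to bytes, gzip to disk
    with gzip.open('example.json.gz', 'wb') as f:
        f.write(json.dumps(har_content).encode())

    # read: gzip.open transparently decompresses; json.loads accepts bytes
    with gzip.open('example.json.gz') as f:
        assert json.loads(f.read()) == har_content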
@@ -268,7 +268,7 @@ def extract_author_from_html(html):
 
 def create_har_id(date, item_id):
     item_id = item_id.split('/')[-1]
-    return os.path.join(date, f'{item_id}.json')
+    return os.path.join(date, f'{item_id}.json.gz')
 
 def save_har(har_id, har_content):
     # create dir
@@ -277,8 +277,8 @@ def save_har(har_id, har_content):
         os.makedirs(har_dir)
     # save HAR
     filename = os.path.join(get_har_dir(), har_id)
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
+    with gzip.open(filename, 'wb') as f:
+        f.write(json.dumps(har_content).encode())
 
 def get_all_har_ids():
     har_ids = []
@@ -308,11 +308,15 @@ def get_all_har_ids():
 
 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
-    with open(har_path) as f:
-        try:
-            return json.loads(f.read())
-        except json.decoder.JSONDecodeError:
-            return {}
+    try:
+        with gzip.open(har_path) as f:
+            try:
+                return json.loads(f.read())
+            except json.decoder.JSONDecodeError:
+                return {}
+    except Exception as e:
+        print(e)  # TODO LOGS
+        return {}
 
 def extract_cookies_names_from_har(har):
     cookies = set()
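Note on backward compatibility: the new outer try/except is what keeps reads from crashing on HARs written before this change. gzip.open() on a plain-JSON file raises gzip.BadGzipFile (an OSError subclass, Python 3.8+) at read time, and the broad handler above turns that into an empty dict. A minimal illustration, with a hypothetical file name:

    import gzip

    with open('legacy.json', 'w') as f:  # an old, uncompressed HAR
        f.write('{}')

    try:
        with gzip.open('legacy.json') as f:
            f.read()
    except OSError as e:  # gzip.BadGzipFile: Not a gzipped file
        print(e)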
@@ -362,6 +366,22 @@ def _reprocess_all_hars_etag():
         etag = Etags.create(etag_content)
         etag.add(date, domain)
 
+def _gzip_all_hars():
+    for har_id in get_all_har_ids():
+        har_path = os.path.join(HAR_DIR, har_id)
+        new_id = f'{har_path}.gz'
+        if not har_id.endswith('.gz'):
+            if not os.path.exists(new_id):
+                with open(har_path, 'rb') as f:
+                    content = f.read()
+                if content:
+                    with gzip.open(new_id, 'wb') as f:
+                        r = f.write(content)
+                        print(r)
+                if os.path.exists(new_id) and os.path.exists(har_path):
+                    os.remove(har_path)
+                    print('delete:', har_path)
+
 # # # - - # # #
 
 ################################################################################
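_gzip_all_hars() above is the matching one-shot migration: it walks every known HAR id, gzips any file that does not already end in .gz, and removes the original once the compressed copy exists. It only runs when invoked by hand, via the commented-out call added at the bottom of the module in the next hunk. For what it's worth, the gzip documentation suggests streaming an existing file through shutil.copyfileobj rather than reading it fully into memory; a sketch of that alternative (not what the commit does):

    import gzip
    import shutil

    def gzip_file(path):
        # stream path into path.gz without holding the whole file in memory
        with open(path, 'rb') as src, gzip.open(f'{path}.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)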
@@ -1944,3 +1964,4 @@ load_blacklist()
 # print(r)
 # _reprocess_all_hars_cookie_name()
 # _reprocess_all_hars_etag()
+# _gzip_all_hars()
@@ -129,7 +129,7 @@ def get_item_url(item_id):
 
 def get_item_har(item_id):
     har = '/'.join(item_id.rsplit('/')[-4:])
-    har = f'{har}.json'
+    har = f'{har}.json.gz'
     path = os.path.join(ConfigLoader.get_hars_dir(), har)
     if os.path.isfile(path):
         return har
@@ -389,7 +389,7 @@ class Domain(AbstractObject):
             har = get_item_har(item_id)
             if har:
                 print(har)
-                _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json')
+                _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
         # Screenshot
         screenshot = self._get_external_correlation('item', '', item_id, 'screenshot')
         if screenshot and screenshot['screenshot']:
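Since the ZIP member is now stored as {basename}.json.gz, anything consuming a domain export has to gunzip the member before parsing it. A sketch, with hypothetical archive and member names:

    import gzip
    import json
    import zipfile

    with zipfile.ZipFile('domain_export.zip') as zf:
        with zf.open('item.json.gz') as member:
            har = json.loads(gzip.decompress(member.read()))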
@@ -131,7 +131,7 @@ class Mixer(AbstractModule):
 
             self.last_refresh = time.time()
             self.clear_feeders_stat()
             time.sleep(0.5)
 
     def computeNone(self):
         self.refresh_stats()