From 28c647d370aaff95e56e6adc912c39f273dff005 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Jul 2023 15:56:34 +0200
Subject: [PATCH] chg: [crawler har] compress HAR

---
 bin/lib/crawlers.py        | 37 +++++++++++++++++++++++++++++--------
 bin/lib/item_basic.py      |  2 +-
 bin/lib/objects/Domains.py |  2 +-
 bin/modules/Mixer.py       |  2 +-
 4 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 406d9ccf..d52e6a22 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -268,7 +268,7 @@ def extract_author_from_html(html):
 
 def create_har_id(date, item_id):
     item_id = item_id.split('/')[-1]
-    return os.path.join(date, f'{item_id}.json')
+    return os.path.join(date, f'{item_id}.json.gz')
 
 def save_har(har_id, har_content):
     # create dir
@@ -277,8 +277,8 @@ def save_har(har_id, har_content):
         os.makedirs(har_dir)
     # save HAR
     filename = os.path.join(get_har_dir(), har_id)
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
+    with gzip.open(filename, 'wb') as f:
+        f.write(json.dumps(har_content).encode())
 
 def get_all_har_ids():
     har_ids = []
@@ -308,11 +308,15 @@ def get_all_har_ids():
 
 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
-    with open(har_path) as f:
-        try:
-            return json.loads(f.read())
-        except json.decoder.JSONDecodeError:
-            return {}
+    try:
+        with gzip.open(har_path) as f:
+            try:
+                return json.loads(f.read())
+            except json.decoder.JSONDecodeError:
+                return {}
+    except Exception as e:
+        print(e) # TODO LOGS
+        return {}
 
 def extract_cookies_names_from_har(har):
     cookies = set()
@@ -362,6 +366,22 @@ def _reprocess_all_hars_etag():
             etag = Etags.create(etag_content)
             etag.add(date, domain)
 
+def _gzip_all_hars():
+    for har_id in get_all_har_ids():
+        har_path = os.path.join(HAR_DIR, har_id)
+        new_id = f'{har_path}.gz'
+        if not har_id.endswith('.gz'):
+            if not os.path.exists(new_id):
+                with open(har_path, 'rb') as f:
+                    content = f.read()
+                if content:
+                    with gzip.open(new_id, 'wb') as f:
+                        r = f.write(content)
+                        print(r)
+            if os.path.exists(new_id) and os.path.exists(har_path):
+                os.remove(har_path)
+                print('delete:', har_path)
+
 # # # - - # # #
 
 ################################################################################
@@ -1944,3 +1964,4 @@ load_blacklist()
 # print(r)
 # _reprocess_all_hars_cookie_name()
 # _reprocess_all_hars_etag()
+# _gzip_all_hars()
diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py
index fdfe1059..71fa5378 100755
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@@ -129,7 +129,7 @@ def get_item_url(item_id):
 
 def get_item_har(item_id):
     har = '/'.join(item_id.rsplit('/')[-4:])
-    har = f'{har}.json'
+    har = f'{har}.json.gz'
     path = os.path.join(ConfigLoader.get_hars_dir(), har)
     if os.path.isfile(path):
         return har
diff --git a/bin/lib/objects/Domains.py b/bin/lib/objects/Domains.py
index ad494f3d..ede2cbea 100755
--- a/bin/lib/objects/Domains.py
+++ b/bin/lib/objects/Domains.py
@@ -389,7 +389,7 @@ class Domain(AbstractObject):
                 har = get_item_har(item_id)
                 if har:
                     print(har)
-                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json')
+                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
             # Screenshot
             screenshot = self._get_external_correlation('item', '', item_id, 'screenshot')
             if screenshot and screenshot['screenshot']:
diff --git a/bin/modules/Mixer.py b/bin/modules/Mixer.py
index 49cd3046..59b202d1 100755
--- a/bin/modules/Mixer.py
+++ b/bin/modules/Mixer.py
@@ -131,7 +131,7 @@ class Mixer(AbstractModule):
 
             self.last_refresh = time.time()
             self.clear_feeders_stat()
-            time.sleep(0.5)
+        time.sleep(0.5)
 
     def computeNone(self):
         self.refresh_stats()
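
For reviewers who want to exercise the new round-trip in isolation: a minimal
sketch of the gzip+JSON semantics this patch introduces. The directory and the
helper names below are illustrative, not AIL APIs; only the 'wb'/.encode()
write path and the empty-dict fallback mirror the patched save_har() and
get_har_content().

    import gzip
    import json
    import os

    HAR_DIR = '/tmp/hars'  # example directory, not the configured HARS dir

    def save_har_gz(path, har_content):
        # gzip.open(..., 'wb') takes bytes, hence json.dumps(...).encode()
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with gzip.open(path, 'wb') as f:
            f.write(json.dumps(har_content).encode())

    def load_har_gz(path):
        # like the patched get_har_content(): any gzip or JSON error yields {}
        try:
            with gzip.open(path) as f:  # default mode is 'rb'
                return json.loads(f.read())
        except Exception:
            return {}

    path = os.path.join(HAR_DIR, '20230710', 'example.json.gz')
    save_har_gz(path, {'log': {'entries': []}})
    assert load_har_gz(path) == {'log': {'entries': []}}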