chg: [crawler har] compress HAR
parent c719990125
commit 28c647d370

4 changed files with 32 additions and 11 deletions
@@ -268,7 +268,7 @@ def extract_author_from_html(html):
 
 def create_har_id(date, item_id):
     item_id = item_id.split('/')[-1]
-    return os.path.join(date, f'{item_id}.json')
+    return os.path.join(date, f'{item_id}.json.gz')
 
 def save_har(har_id, har_content):
     # create dir
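The id returned by create_har_id() simply gains a .gz suffix. A tiny illustration with a hypothetical date prefix and item id (not taken from the repo):

    # keeps only the last path segment and appends '.json.gz' instead of '.json'
    create_har_id('2023/01/01', 'crawled/2023/01/01/submitted.onion')
    # -> '2023/01/01/submitted.onion.json.gz'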
@@ -277,8 +277,8 @@ def save_har(har_id, har_content):
         os.makedirs(har_dir)
     # save HAR
     filename = os.path.join(get_har_dir(), har_id)
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
+    with gzip.open(filename, 'wb') as f:
+        f.write(json.dumps(har_content).encode())
 
 def get_all_har_ids():
     har_ids = []
@@ -308,11 +308,15 @@ def get_all_har_ids():
 
 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
-    with open(har_path) as f:
-        try:
-            return json.loads(f.read())
-        except json.decoder.JSONDecodeError:
-            return {}
+    try:
+        with gzip.open(har_path) as f:
+            try:
+                return json.loads(f.read())
+            except json.decoder.JSONDecodeError:
+                return {}
+    except Exception as e:
+        print(e)  # TODO LOGS
+        return {}
 
 def extract_cookies_names_from_har(har):
     cookies = set()
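Taken together, save_har() and get_har_content() now round-trip the HAR through gzip: the JSON dump is encoded to bytes before being written with gzip.open(..., 'wb'), and reads go through gzip.open() before json.loads(). A self-contained sketch of that pattern using only the standard library (hypothetical path, not the framework's HAR_DIR helpers):

    import gzip
    import json

    har_content = {'log': {'entries': []}}     # any JSON-serialisable HAR dict
    filename = '/tmp/example.onion.json.gz'    # hypothetical location

    # write: dump to JSON, encode to bytes, compress on the fly (as in save_har)
    with gzip.open(filename, 'wb') as f:
        f.write(json.dumps(har_content).encode())

    # read: gzip.open transparently decompresses, json.loads parses (as in get_har_content)
    with gzip.open(filename) as f:
        assert json.loads(f.read()) == har_content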
@@ -362,6 +366,22 @@ def _reprocess_all_hars_etag():
         etag = Etags.create(etag_content)
         etag.add(date, domain)
 
+def _gzip_all_hars():
+    for har_id in get_all_har_ids():
+        har_path = os.path.join(HAR_DIR, har_id)
+        new_id = f'{har_path}.gz'
+        if not har_id.endswith('.gz'):
+            if not os.path.exists(new_id):
+                with open(har_path, 'rb') as f:
+                    content = f.read()
+                if content:
+                    with gzip.open(new_id, 'wb') as f:
+                        r = f.write(content)
+                        print(r)
+                    if os.path.exists(new_id) and os.path.exists(har_path):
+                        os.remove(har_path)
+                        print('delete:', har_path)
+
 # # # - - # # #
 
 ################################################################################
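_gzip_all_hars() is a one-shot migration helper: every HAR id that does not already end in .gz is re-written as <path>.gz, and the uncompressed original is removed once the compressed copy exists. A simplified sketch of the same loop over a plain directory tree (hypothetical har_dir standing in for HAR_DIR):

    import gzip
    import os

    har_dir = '/tmp/hars'  # hypothetical root directory

    for root, _, files in os.walk(har_dir):
        for name in files:
            if name.endswith('.gz'):
                continue                    # already migrated
            src = os.path.join(root, name)
            dst = f'{src}.gz'
            if os.path.exists(dst):
                continue                    # never overwrite an existing archive
            with open(src, 'rb') as f:
                content = f.read()
            if content:
                with gzip.open(dst, 'wb') as f:
                    f.write(content)
                os.remove(src)              # keep only the compressed copy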
@@ -1944,3 +1964,4 @@ load_blacklist()
 # print(r)
 # _reprocess_all_hars_cookie_name()
 # _reprocess_all_hars_etag()
+# _gzip_all_hars()

@@ -129,7 +129,7 @@ def get_item_url(item_id):
 
 def get_item_har(item_id):
     har = '/'.join(item_id.rsplit('/')[-4:])
-    har = f'{har}.json'
+    har = f'{har}.json.gz'
     path = os.path.join(ConfigLoader.get_hars_dir(), har)
     if os.path.isfile(path):
         return har
@@ -389,7 +389,7 @@ class Domain(AbstractObject):
                 har = get_item_har(item_id)
                 if har:
                     print(har)
-                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json')
+                    _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
                 # Screenshot
                 screenshot = self._get_external_correlation('item', '', item_id, 'screenshot')
                 if screenshot and screenshot['screenshot']: