fix: [crawler] cleanup

This commit is contained in:
Terrtia 2020-03-20 16:20:01 +01:00
parent 6cfd3fe36d
commit db634e8866
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 1 addition and 14 deletions

View file

@ -138,7 +138,3 @@ def save_har(har_dir, item_id, har_content):
filename = os.path.join(har_dir, item_id + '.json') filename = os.path.join(har_dir, item_id + '.json')
with open(filename, 'w') as f: with open(filename, 'w') as f:
f.write(json.dumps(har_content)) f.write(json.dumps(har_content))
if __name__ == "__main__":
all_cookies = load_cookies(get_cookies(), '3thxemke2x7hcibu.onion', crawler_type='onion')
print(json.dumps(all_cookies))

View file

@ -120,7 +120,7 @@ class TorSplashCrawler():
self.full_date = date['date_day'] self.full_date = date['date_day']
self.date_month = date['date_month'] self.date_month = date['date_month']
self.date_epoch = int(date['epoch']) self.date_epoch = int(date['epoch'])
self.png = True self.png = True
self.har = True self.har = True
self.cookies = cookies self.cookies = cookies
@ -177,9 +177,6 @@ class TorSplashCrawler():
error_log = (json.loads(response.body.decode())) error_log = (json.loads(response.body.decode()))
print(error_log) print(error_log)
else: else:
# DEBUG:
# print('----')
# print(response.data.keys())
item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0]) item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0])
self.save_crawled_item(item_id, response.data['html']) self.save_crawled_item(item_id, response.data['html'])
@ -190,14 +187,8 @@ class TorSplashCrawler():
crawler_splash.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port) crawler_splash.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
crawler_splash.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month) crawler_splash.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
#print(response.data['cookies'])
if 'cookies' in response.data: if 'cookies' in response.data:
all_cookies = response.data['cookies'] all_cookies = response.data['cookies']
# for cookie in all_cookies:
# print('------------------------')
# print(cookie['name'])
# print(cookie['value'])
# print(cookie)
else: else:
all_cookies = [] all_cookies = []