diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index ac42fdf8..d1417e6e 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -140,7 +140,7 @@ class TorSplashCrawler(): UUID = self.domains[0]+str(uuid.uuid4()) filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) - filename_har = os.path.join(self.crawled_har, UUID +'.png') + filename_har = os.path.join(self.crawled_har, UUID) # # TODO: modify me # save new paste on disk @@ -180,17 +180,12 @@ class TorSplashCrawler(): self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste) - dirname = os.path.dirname(filename_har) - if not os.path.exists(dirname): - os.makedirs(dirname) - if 'png' in response.data: size_screenshot = (len(response.data['png'])*3) /4 if size_screenshot < 5000000: #bytes image_content = base64.standard_b64decode(response.data['png'].encode()) hash = sha256(image_content).hexdigest() - print(hash) img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png') dirname = os.path.dirname(filename_img) @@ -202,13 +197,13 @@ class TorSplashCrawler(): # add item metadata self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash) # add sha256 metadata - self.r_serv_onion.zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1) + self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste) if 'har' in response.data: dirname = os.path.dirname(filename_har) if not os.path.exists(dirname): os.makedirs(dirname) - with open(filename_har+'har.txt', 'wb') as f: + with open(filename_har+'.json', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set diff --git a/bin/update-background.py b/bin/update-background.py index 81441beb..75f14e16 100755 --- a/bin/update-background.py +++ b/bin/update-background.py @@ -31,7 +31,7 @@ if __name__ == "__main__": db=cfg.getint("ARDB_DB", "db"), decode_responses=True) - if r_serv.scard('ail:update_v1.5') != 4: + if r_serv.scard('ail:update_v1.5') != 5: r_serv.delete('ail:update_error') r_serv.set('ail:update_in_progress', 'v1.5') r_serv.set('ail:current_background_update', 'v1.5') @@ -50,7 +50,10 @@ if __name__ == "__main__": if not r_serv.sismember('ail:update_v1.5', 'tags_background'): update_file = os.path.join(os.environ['AIL_HOME'], 'update', 'v1.4', 'Update-ARDB_Tags_background.py') process = subprocess.run(['python' ,update_file]) - if r_serv.scard('ail:update_v1.5') != 4: + if not r_serv.sismember('ail:update_v1.5', 'crawled_screenshot'): + update_file = os.path.join(os.environ['AIL_HOME'], 'update', 'v1.4', 'Update-ARDB_Onions_screenshots.py') + process = subprocess.run(['python' ,update_file]) + if r_serv.scard('ail:update_v1.5') != 5: r_serv.set('ail:update_error', 'Update v1.5 Failed, please relaunch the bin/update-background.py script') else: r_serv.delete('ail:update_in_progress') diff --git a/update/v1.4/Update-ARDB_Onions_screenshots.py b/update/v1.4/Update-ARDB_Onions_screenshots.py new file mode 100755 index 00000000..6b39a66b --- /dev/null +++ b/update/v1.4/Update-ARDB_Onions_screenshots.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import time +import redis +import datetime +import configparser + +from hashlib import sha256 + +def rreplace(s, old, new, occurrence): + li = s.rsplit(old, occurrence) + return new.join(li) + +def substract_date(date_from, date_to): + date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8])) + date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8])) + delta = date_to - date_from # timedelta + l_date = [] + for i in range(delta.days + 1): + date = date_from + datetime.timedelta(i) + l_date.append( date.strftime('%Y%m%d') ) + return l_date + +if __name__ == '__main__': + + start_deb = time.time() + + configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') + if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? \ + Or activate the virtualenv.') + cfg = configparser.ConfigParser() + cfg.read(configfile) + SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + NEW_SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"), 'screenshot') + + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/' + + r_serv = redis.StrictRedis( + host=cfg.get("ARDB_DB", "host"), + port=cfg.getint("ARDB_DB", "port"), + db=cfg.getint("ARDB_DB", "db"), + decode_responses=True) + + r_serv_metadata = redis.StrictRedis( + host=cfg.get("ARDB_Metadata", "host"), + port=cfg.getint("ARDB_Metadata", "port"), + db=cfg.getint("ARDB_Metadata", "db"), + decode_responses=True) + + r_serv_tag = redis.StrictRedis( + host=cfg.get("ARDB_Tags", "host"), + port=cfg.getint("ARDB_Tags", "port"), + db=cfg.getint("ARDB_Tags", "db"), + decode_responses=True) + + r_serv_onion = redis.StrictRedis( + host=cfg.get("ARDB_Onion", "host"), + port=cfg.getint("ARDB_Onion", "port"), + db=cfg.getint("ARDB_Onion", "db"), + decode_responses=True) + + r_serv.set('ail:current_background_script', 'crawled_screenshot') + r_serv.set('ail:current_background_script_stat', 0) + + ## Update Onion ## + print('Updating ARDB_Onion ...') + index = 0 + start = time.time() + + # clean down domain from db + date_from = '20180801' + date_today = datetime.date.today().strftime("%Y%m%d") + list_date = substract_date(date_from, date_today) + nb_done = 0 + last_progress = 0 + total_to_update = len(list_date) + for date in list_date: + screenshot_dir = os.path.join(SCREENSHOT_FOLDER, date[0:4], date[4:6], date[6:8]) + if os.path.isdir(screenshot_dir): + print(screenshot_dir) + for file in os.listdir(screenshot_dir): + if file.endswith(".png"): + index += 1 + #print(file) + + img_path = os.path.join(screenshot_dir, file) + with open(img_path, 'br') as f: + image_content = f.read() + + hash = sha256(image_content).hexdigest() + img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) + filename_img = os.path.join(NEW_SCREENSHOT_FOLDER, img_dir_path, hash[12:] +'.png') + dirname = os.path.dirname(filename_img) + if not os.path.exists(dirname): + os.makedirs(dirname) + if not os.path.exists(filename_img): + os.rename(img_path, filename_img) + + item = os.path.join('crawled', date[0:4], date[4:6], date[6:8], file[:-4]) + # add item metadata + r_serv_metadata.hset('paste_metadata:{}'.format(item), 'screenshot', hash) + # add sha256 metadata + r_serv_onion.sadd('screenshot:{}'.format(hash), item) + + if file.endswith('.pnghar.txt'): + har_path = os.path.join(screenshot_dir, file) + new_file = rreplace(file, '.pnghar.txt', '.json', 1) + new_har_path = os.path.join(screenshot_dir, new_file) + os.rename(har_path, new_har_path) + + progress = int((nb_done * 100) /total_to_update) + # update progress stats + if progress != last_progress: + r_serv.set('ail:current_background_script_stat', progress) + print('{}/{} screenshot updated {}%'.format(nb_done, total_to_update, progress)) + last_progress = progress + + nb_done += 1 + + r_serv.set('ail:current_background_script_stat', 100) + + + end = time.time() + print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start)) + print() + print('Done in {} s'.format(end - start_deb)) + + r_serv.sadd('ail:update_v1.5', 'crawled_screenshot') diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 311eaaff..d4251e5f 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -160,7 +160,7 @@ DiffMaxLineLength = int(cfg.get("Flask", "DiffMaxLineLength"))#Use to display t bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] -dict_update_description = {'v1.5':{'nb_background_update': 4, 'update_warning_message': 'An Update is running on the background. Some informations like Tags, screenshot can be', +dict_update_description = {'v1.5':{'nb_background_update': 5, 'update_warning_message': 'An Update is running on the background. Some informations like Tags, screenshot can be', 'update_warning_message_notice_me': 'missing from the UI.'} } diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 4389f4ce..adbbe87c 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -658,7 +658,10 @@ def show_domain(): unpack_url = faup.get() domain = unpack_url['domain'].decode() if not port: - port = unpack_url['port'].decode() + if unpack_url['port']: + port = unpack_url['port'].decode() + else: + port = 80 try: port = int(port) except: