From b4f06c21f99f03d4b4fb0df87434b2c9e9a12525 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 7 Jun 2019 13:47:44 +0200
Subject: [PATCH] chg: [backend crawler] domains: download one archive per
 domain (most recent crawl)

---
 bin/packages/HiddenServices.py                | 100 +++++++++++++++++-
 .../hiddenServices/Flask_hiddenServices.py    |  53 +++++++++-
 2 files changed, 150 insertions(+), 3 deletions(-)

diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index f1ed0767..869d3f7d 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -21,6 +21,9 @@ import gzip
 import redis
 import random
 
+from io import BytesIO
+import zipfile
+
 import configparser
 import sys
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
@@ -71,6 +74,7 @@ class HiddenServices(object):
             self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled"))
             self.paste_crawled_directory_name = cfg.get("Directories", "crawled")
             self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+            self.screenshot_directory_screenshot = os.path.join(self.screenshot_directory, 'screenshot')
         elif type == 'i2p':
             self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
             self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
@@ -230,6 +234,13 @@ class HiddenServices(object):
         return l_crawled_pastes
     '''
 
+    def get_item_screenshot(self, item):
+        screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+        if screenshot:
+            # the screenshot id is stored flat; rebuild the nested on-disk path (xx/xx/xx/...)
+            screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+            return screenshot
+        return ''
+
     def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1):
         l_screenshot_paste = []
         for paste in l_crawled_pastes:
@@ -237,9 +248,8 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')
 
-            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            screenshot = self.get_item_screenshot(paste)
             if screenshot:
-                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
                 l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})
 
         if len(l_screenshot_paste) > num_screenshot:
@@ -250,6 +260,35 @@ class HiddenServices(object):
         else:
             return l_screenshot_paste
 
+    def get_all_domain_screenshot(self, l_crawled_pastes, filename=False):
+        l_screenshot_paste = []
+        for paste in l_crawled_pastes:
+            ## FIXME: # TODO: remove me
+            origin_paste = paste
+            paste = paste.replace(self.paste_directory+'/', '')
+
+            screenshot = self.get_item_screenshot(paste)
+            if screenshot:
+                screenshot = screenshot + '.png'
+                screenshot_full_path = os.path.join(self.screenshot_directory_screenshot, screenshot)
+                if filename:
+                    screen_file_name = os.path.basename(paste) + '.png'
+                    l_screenshot_paste.append( (screenshot_full_path, screen_file_name) )
+                else:
+                    l_screenshot_paste.append(screenshot_full_path)
+        return l_screenshot_paste
+
+    def get_all_item_full_path(self, l_items, filename=False):
+        l_full_items = []
+        for item in l_items:
+            item = os.path.join(self.PASTES_FOLDER, item)
+            if filename:
+                file_name = os.path.basename(item) + '.gz'
+                l_full_items.append( (item, file_name) )
+            else:
+                l_full_items.append(item)
+        return l_full_items
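+
+    # NOTE: get_all_domain_screenshot() and get_all_item_full_path() return
+    # plain paths, or (full_path, archive_name) tuples when filename=True;
+    # create_domain_basic_archive() relies on the tuple form to name the
+    # entries it writes into the zip.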
+
     def get_crawled_pastes_by_date(self, date):
 
         pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
@@ -258,6 +297,63 @@ class HiddenServices(object):
             l_crawled_pastes = []
         return l_crawled_pastes
 
+    def get_all_har(self, l_pastes, filename=False):
+        all_har = []
+        for item in l_pastes:
+            if filename:
+                all_har.append( (self.get_item_har(item), os.path.basename(item) + '.json') )
+            else:
+                all_har.append(self.get_item_har(item))
+        return all_har
+
+
+    def get_item_har(self, item_path):
+        item_path = item_path.replace('{}/'.format(self.paste_crawled_directory_name), '', 1)
+        har_path = os.path.join(self.screenshot_directory, item_path) + '.json'
+        return har_path
+
+    def create_domain_basic_archive(self, l_pastes):
+        all_har = self.get_all_har(l_pastes, filename=True)
+        all_screenshot = self.get_all_domain_screenshot(l_pastes, filename=True)
+        all_items = self.get_all_item_full_path(l_pastes, filename=True)
+
+        # try:
+
+        # build the archive in memory
+        zip_buffer = BytesIO()
+
+        with zipfile.ZipFile(zip_buffer, "a") as zf:
+
+            #print(all_har)
+            self.write_in_zip_buffer(zf, all_har)
+            self.write_in_zip_buffer(zf, all_screenshot)
+            self.write_in_zip_buffer(zf, all_items)
+
+            # write the URL map: one '<item basename> : <url>' line per crawled item
+            map_file_content = self.get_metadata_file(l_pastes).encode()
+            zf.writestr('_URL_MAP_', map_file_content)
+
+        zip_buffer.seek(0)
+        return zip_buffer
+
+        # except Exception as e:
+        #     print(e)
+        #     return 'Server Error'
+
+    def write_in_zip_buffer(self, zf, list_file):
+        for file_path, file_name in list_file:
+            with open(file_path, "rb") as f:
+                file_content = f.read()
+                zf.writestr(file_name, file_content)
+
+    def get_metadata_file(self, list_items):
+        file_content = ''
+        dict_url = self.get_all_links(list_items)
+        for key in dict_url:
+            file_content = '{}\n{} : {}'.format(file_content, os.path.basename(key), dict_url[key])
+        return file_content
+
+
     '''
 
     def get_last_crawled_pastes_fileSearch(self):
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index ed5cbcd4..72fe6f66 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -11,7 +11,7 @@ import os
 import time
 import json
 from pyfaup.faup import Faup
-from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
+from flask import Flask, render_template, jsonify, request, send_file, Blueprint, redirect, url_for
 
 from Date import Date
 from HiddenServices import HiddenServices
@@ -35,6 +35,7 @@
 list_types=['onion', 'regular']
 dic_type_name={'onion':'Onion', 'regular':'Website'}
 
 # ============ FUNCTIONS ============
+
 def one():
     return 1
@@ -782,6 +783,56 @@ def show_domain():
                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
                            domain_tags=domain_tags, screenshot=screenshot)
 
+@hiddenServices.route("/crawlers/download_domain", methods=['GET'])
+def download_domain():
+    domain = request.args.get('domain')
+    epoch = request.args.get('epoch')
+    try:
+        epoch = int(epoch)
+    except (ValueError, TypeError):
+        epoch = None
+    port = request.args.get('port')
+    faup.decode(domain)
+    unpack_url = faup.get()
+
+    ## TODO: # FIXME: remove me
+    try:
+        domain = unpack_url['domain'].decode()
+    except:
+        domain = unpack_url['domain']
+
+    if not port:
+        if unpack_url['port']:
+            try:
+                port = unpack_url['port'].decode()
+            except:
+                port = unpack_url['port']
+        else:
+            port = 80
+    try:
+        port = int(port)
+    except (ValueError, TypeError):
+        port = 80
+    type = get_type_domain(domain)
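+    # get_type_domain() resolves whether the domain was crawled as an
+    # 'onion' or a 'regular' site; it selects the Redis metadata key
+    # ('onion_metadata:<domain>' / 'regular_metadata:<domain>') checked below.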
+    if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
+        return '404'
+    # # TODO: FIXME return 404
+
+    origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')
+
+    h = HiddenServices(domain, type, port=port)
+    item_core = h.get_domain_crawled_core_item(epoch=epoch)
+    if item_core:
+        l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
+    else:
+        l_pastes = []
+    #dict_links = h.get_all_links(l_pastes)
+
+    zip_file = h.create_domain_basic_archive(l_pastes)
+
+    return send_file(zip_file, attachment_filename='{}.zip'.format(domain), as_attachment=True)
+
+
 @hiddenServices.route("/hiddenServices/onion_son", methods=['GET'])
 def onion_son():
     onion_domain = request.args.get('onion_domain')