mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 08:38:28 +00:00
chg: [backend crawler] domains: download 1 archive by crawled (most recent)
This commit is contained in:
parent 0d3d4aae1d
commit b4f06c21f9
2 changed files with 150 additions and 3 deletions
@@ -21,6 +21,9 @@ import gzip
import redis
import random

from io import BytesIO
import zipfile

import configparser
import sys
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
@@ -71,6 +74,7 @@ class HiddenServices(object):
            self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled"))
            self.paste_crawled_directory_name = cfg.get("Directories", "crawled")
            self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
            self.screenshot_directory_screenshot = os.path.join(self.screenshot_directory, 'screenshot')
        elif type == 'i2p':
            self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
            self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
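For reference, a minimal standalone sketch (not part of the commit) of how the Directories options used above resolve to absolute paths; the sample option values and the AIL_HOME stand-in path are invented for illustration.

import os
import configparser

cfg = configparser.ConfigParser()
cfg.read_string('''
[Directories]
crawled = crawled
crawled_screenshot = crawled_screenshot
''')

ail_home = '/opt/ail'   # stands in for os.environ['AIL_HOME'], value invented
screenshot_directory = os.path.join(ail_home, cfg.get('Directories', 'crawled_screenshot'))
screenshot_directory_screenshot = os.path.join(screenshot_directory, 'screenshot')
print(screenshot_directory_screenshot)   # /opt/ail/crawled_screenshot/screenshot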
@@ -230,6 +234,13 @@ class HiddenServices(object):
        return l_crawled_pastes
    '''

    def get_item_screenshot(self, item):
        screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
        if screenshot:
            screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
            return screenshot
        return ''

    def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1):
        l_screenshot_paste = []
        for paste in l_crawled_pastes:
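A minimal standalone sketch (not part of the commit) of the path splitting that get_item_screenshot performs on the stored screenshot identifier; the helper name and the example identifier below are invented.

import os

def split_screenshot_path(screenshot_id):
    # the first 12 characters become six 2-character directories, the rest is the file name
    return os.path.join(screenshot_id[0:2], screenshot_id[2:4], screenshot_id[4:6],
                        screenshot_id[6:8], screenshot_id[8:10], screenshot_id[10:12],
                        screenshot_id[12:])

print(split_screenshot_path('a1b2c3d4e5f6deadbeef'))   # a1/b2/c3/d4/e5/f6/deadbeef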
@@ -237,9 +248,8 @@ class HiddenServices(object):
            origin_paste = paste
            paste= paste.replace(self.paste_directory+'/', '')

            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
            screenshot = self.get_item_screenshot(paste)
            if screenshot:
                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
                l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})

        if len(l_screenshot_paste) > num_screenshot:
@@ -250,6 +260,35 @@ class HiddenServices(object):
        else:
            return l_screenshot_paste

    def get_all_domain_screenshot(self, l_crawled_pastes, filename=False):
        l_screenshot_paste = []
        for paste in l_crawled_pastes:
            ## FIXME: # TODO: remove me
            origin_paste = paste
            paste= paste.replace(self.paste_directory+'/', '')

            screenshot = self.get_item_screenshot(paste)
            if screenshot:
                screenshot = screenshot + '.png'
                screenshot_full_path = os.path.join(self.screenshot_directory_screenshot, screenshot)
                if filename:
                    screen_file_name = os.path.basename(paste) + '.png'
                    l_screenshot_paste.append( (screenshot_full_path, screen_file_name) )
                else:
                    l_screenshot_paste.append(screenshot_full_path)
        return l_screenshot_paste

    def get_all_item_full_path(self, l_items, filename=False):
        l_full_items = []
        for item in l_items:
            item = os.path.join(self.PASTES_FOLDER, item)
            if filename:
                file_name = os.path.basename(item) + '.gz'
                l_full_items.append( (item, file_name) )
            else:
                l_full_items.append(item)
        return l_full_items

    def get_crawled_pastes_by_date(self, date):

        pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
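A minimal standalone sketch (not part of the commit) of how a YYYYMMDD date string maps to the crawled-pastes directory consumed by get_crawled_pastes_by_date; the base directory below is invented.

import os

def crawled_dir_for_date(base_dir, date):
    # '20181130' -> <base_dir>/2018/11/30
    return os.path.join(base_dir, date[0:4], date[4:6], date[6:8])

print(crawled_dir_for_date('/opt/ail/PASTES/crawled', '20181130'))   # /opt/ail/PASTES/crawled/2018/11/30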
@@ -258,6 +297,63 @@ class HiddenServices(object):
        l_crawled_pastes = []
        return l_crawled_pastes

    def get_all_har(self, l_pastes, filename=False):
        all_har = []
        for item in l_pastes:
            if filename:
                all_har.append( (self.get_item_har(item), os.path.basename(item) + '.json') )
            else:
                all_har.append(self.get_item_har(item))
        return all_har


    def get_item_har(self, item_path):
        item_path = item_path.replace('{}/'.format(self.paste_crawled_directory_name), '', 1)
        har_path = os.path.join(self.screenshot_directory, item_path) + '.json'
        return har_path

    def create_domain_basic_archive(self, l_pastes):
        all_har = self.get_all_har(l_pastes, filename=True)
        all_screenshot = self.get_all_domain_screenshot(l_pastes, filename=True)
        all_items = self.get_all_item_full_path(l_pastes, filename=True)

        # try:

        # zip buffer
        zip_buffer = BytesIO()

        with zipfile.ZipFile(zip_buffer, "a") as zf:

            #print(all_har)
            self.write_in_zip_buffer(zf, all_har)
            self.write_in_zip_buffer(zf, all_screenshot)
            self.write_in_zip_buffer(zf, all_items)

            # write map url
            map_file_content = self.get_metadata_file(l_pastes).encode()
            zf.writestr( '_URL_MAP_', BytesIO(map_file_content).getvalue())

        zip_buffer.seek(0)
        return zip_buffer

        # except Exception as e:
        #     print(e)
        #     return 'Server Error'

    def write_in_zip_buffer(self, zf, list_file):
        for file_path, file_name in list_file:
            with open(file_path, "rb") as f:
                har_content = f.read()
                zf.writestr( file_name, BytesIO(har_content).getvalue())

    def get_metadata_file(self, list_items):
        file_content = ''
        dict_url = self.get_all_links(list_items)
        for key in dict_url:
            file_content = '{}\n{} : {}'.format(file_content, os.path.basename(key), dict_url[key])
        return file_content


    '''
    def get_last_crawled_pastes_fileSearch(self):
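A minimal standalone sketch (not part of the commit) of the in-memory zip pattern used by create_domain_basic_archive and write_in_zip_buffer; the entry names and contents below are invented.

import zipfile
from io import BytesIO

zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, "a") as zf:
    # each entry mirrors a (full_path, file_name) pair collected by the helpers above
    for archive_name, content in [('item1.gz', b'fake item'), ('item1.png', b'fake screenshot')]:
        zf.writestr(archive_name, content)
    # the url map is a plain-text index written last
    zf.writestr('_URL_MAP_', 'item1 : http://example.onion/\n'.encode())

zip_buffer.seek(0)               # rewind so the archive can be read from the start
print(len(zip_buffer.read()))    # size in bytes of the generated archive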
@@ -11,7 +11,7 @@ import os
import time
import json
from pyfaup.faup import Faup
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
from flask import Flask, render_template, jsonify, request, send_file, Blueprint, redirect, url_for

from Date import Date
from HiddenServices import HiddenServices
@@ -35,6 +35,7 @@ list_types=['onion', 'regular']
dic_type_name={'onion':'Onion', 'regular':'Website'}

# ============ FUNCTIONS ============

def one():
    return 1

@@ -782,6 +783,56 @@ def show_domain():
                            origin_paste=origin_paste, origin_paste_name=origin_paste_name,
                            domain_tags=domain_tags, screenshot=screenshot)

@hiddenServices.route("/crawlers/download_domain", methods=['GET'])
def download_domain():
    domain = request.args.get('domain')
    epoch = request.args.get('epoch')
    try:
        epoch = int(epoch)
    except:
        epoch = None
    port = request.args.get('port')
    faup.decode(domain)
    unpack_url = faup.get()

    ## TODO: # FIXME: remove me
    try:
        domain = unpack_url['domain'].decode()
    except:
        domain = unpack_url['domain']

    if not port:
        if unpack_url['port']:
            try:
                port = unpack_url['port'].decode()
            except:
                port = unpack_url['port']
        else:
            port = 80
    try:
        port = int(port)
    except:
        port = 80
    type = get_type_domain(domain)
    if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
        return '404'
        # # TODO: FIXME return 404

    origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')

    h = HiddenServices(domain, type, port=port)
    item_core = h.get_domain_crawled_core_item(epoch=epoch)
    if item_core:
        l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
    else:
        l_pastes = []
    #dict_links = h.get_all_links(l_pastes)

    zip_file = h.create_domain_basic_archive(l_pastes)

    return send_file(zip_file, attachment_filename='test.zip', as_attachment=True)


@hiddenServices.route("/hiddenServices/onion_son", methods=['GET'])
def onion_son():
    onion_domain = request.args.get('onion_domain')
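A minimal standalone sketch (not part of the commit) showing how an in-memory zip is returned with send_file, as download_domain does above. The route and zip content are invented; note that Flask 2.x renamed attachment_filename to download_name.

import zipfile
from io import BytesIO
from flask import Flask, send_file

app = Flask(__name__)

@app.route('/demo/download_zip')
def demo_download_zip():
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'a') as zf:
        zf.writestr('hello.txt', b'hello')
    zip_buffer.seek(0)
    # Flask 1.x keyword, matching the code above; use download_name= on Flask 2.x
    return send_file(zip_buffer, attachment_filename='demo.zip', as_attachment=True)

if __name__ == '__main__':
    app.run()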