mirror of https://github.com/ail-project/ail-framework.git
chg: [crawled screenshot] use sha256 as filepath

parent 99b9c95638
commit 9868833c77

4 changed files with 45 additions and 19 deletions
@@ -233,9 +233,10 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')

-            paste = paste.replace(self.paste_crawled_directory_name, '')
-            if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
-                l_screenshot_paste.append({'screenshot': paste[1:], 'item': origin_paste})
+            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            if screenshot:
+                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+                l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})

         if len(l_screenshot_paste) > num_screenshot:
             l_random_screenshot = []
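The lookup above stops guessing a date-based path and instead expands the sha256 digest stored in the item's metadata into a nested filesystem path. A minimal sketch of that expansion, with a helper name of our choosing (the framework inlines the os.path.join call):

import os
from hashlib import sha256

def screenshot_rel_path(digest):
    # Six 2-character directory levels, then the remaining 52 hex characters.
    return os.path.join(digest[0:2], digest[2:4], digest[4:6],
                        digest[6:8], digest[8:10], digest[10:12], digest[12:])

digest = sha256(b'raw PNG bytes').hexdigest()
print(screenshot_rel_path(digest))  # e.g. 'ab/cd/ef/01/23/45/<remaining hex>'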
@@ -12,6 +12,8 @@ import redis
 import json
 import time

+from hashlib import sha256
+
 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
@@ -103,7 +105,8 @@ class TorSplashCrawler():
         self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                         self.p.config.get("Directories", "crawled"), date_str )

-        self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+        self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+        self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )

     def start_requests(self):
         yield SplashRequest(
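The crawler now keeps two roots under the crawled_screenshot directory: HAR dumps stay grouped by crawl date, while screenshots move to a date-independent subtree that the later hunks fill by hash. Assuming default configuration, the resulting layout is roughly:

crawled_screenshot/
    2019-02-11/                   # self.crawled_har: HAR dumps, per date
        <uuid>.pnghar.txt         # '.png' + 'har.txt', see the write below
    screenshot/                   # self.crawled_screenshot: content-addressed PNGs
        ab/cd/ef/01/23/45/<rest>.png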
@@ -135,13 +138,13 @@ class TorSplashCrawler():
             UUID = self.domains[0][-215:]+str(uuid.uuid4())
         else:
             UUID = self.domains[0]+str(uuid.uuid4())
-        filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+        filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
+        relative_filename_paste = os.path.join(self.crawler_path, UUID)
         filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+        filename_har = os.path.join(self.crawled_har, UUID +'.png')

         # # TODO: modify me
         # save new paste on disk
-        if self.save_crawled_paste(filename_paste, response.data['html']):
+        if self.save_crawled_paste(relative_filename_paste, response.data['html']):

             # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
             #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
@@ -170,14 +173,14 @@ class TorSplashCrawler():
             self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

             #create paste metadata
-            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
-            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
-            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
-            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)

-            self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
+            self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)

-            dirname = os.path.dirname(filename_screenshot)
+            dirname = os.path.dirname(filename_har)
             if not os.path.exists(dirname):
                 os.makedirs(dirname)
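Every paste_metadata key switches from the absolute filename to the path relative to the crawler root, so the key survives a move of AIL_HOME. A hypothetical before and after, with invented paths:

PASTES_FOLDER = '/home/ail/PASTES/'                       # hypothetical install path
item = PASTES_FOLDER + 'crawled/2019-02-11/example.onion/uuid42'

old_key = 'paste_metadata:{}'.format(item)                              # machine-specific
new_key = 'paste_metadata:{}'.format(item.replace(PASTES_FOLDER, ''))   # portable
print(new_key)  # paste_metadata:crawled/2019-02-11/example.onion/uuid42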
@@ -185,11 +188,27 @@ class TorSplashCrawler():
             size_screenshot = (len(response.data['png'])*3) /4

             if size_screenshot < 5000000: #bytes
-                with open(filename_screenshot, 'wb') as f:
-                    f.write(base64.standard_b64decode(response.data['png'].encode()))
+                image_content = base64.standard_b64decode(response.data['png'].encode())
+                hash = sha256(image_content).hexdigest()
+                print(hash)
+                img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
+                filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
+                dirname = os.path.dirname(filename_img)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                if not os.path.exists(filename_img):
+                    with open(filename_img, 'wb') as f:
+                        f.write(image_content)
+                # add item metadata
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
+                # add sha256 metadata
+                self.r_serv_onion.zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1)

             if 'har' in response.data:
-                with open(filename_screenshot+'har.txt', 'wb') as f:
+                dirname = os.path.dirname(filename_har)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                with open(filename_har+'har.txt', 'wb') as f:
                     f.write(json.dumps(response.data['har']).encode())

             # save external links in set
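This hunk is the core of the commit: the screenshot is written once under a path derived from the sha256 of its decoded bytes, so identical images captured for different items collapse into a single file, and the screenshot:<hash> sorted set counts which items reference it. A self-contained sketch of the same scheme, with a function name and root argument of our choosing:

import os
from hashlib import sha256

def save_screenshot(root, image_content):
    digest = sha256(image_content).hexdigest()
    filename_img = os.path.join(root, 'screenshot',
                                digest[0:2], digest[2:4], digest[4:6],
                                digest[6:8], digest[8:10], digest[10:12],
                                digest[12:] + '.png')
    os.makedirs(os.path.dirname(filename_img), exist_ok=True)
    if not os.path.exists(filename_img):  # same content, same path: write once
        with open(filename_img, 'wb') as f:
            f.write(image_content)
    return digest  # the caller stores this as the item's 'screenshot' field

print(save_screenshot('/tmp/ail_demo', b'\x89PNG not a real image'))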
@@ -167,7 +167,7 @@ dict_update_description = {'v1.5':{'nb_background_update': 4, 'update_warning_me
 UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')

 PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
-SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"), 'screenshot')

 REPO_ORIGIN = 'https://github.com/CIRCL/AIL-framework.git'
@@ -40,6 +40,12 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa

 # ============ FUNCTIONS ============

+def get_item_screenshot_path(item):
+    screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+    if screenshot:
+        screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+    return screenshot
+
 def showpaste(content_range, requested_path):
     if PASTES_FOLDER not in requested_path:
         # remove full path
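get_item_screenshot_path() is the read-side counterpart of the crawler change: it fetches the stored digest and expands it into the relative path under the new SCREENSHOT_FOLDER. A hypothetical Flask route, not part of this commit, showing how the two pieces could serve the image:

import os
from flask import Flask, send_from_directory

app = Flask(__name__)
# Mirrors the SCREENSHOT_FOLDER definition above; path assumed for the sketch.
SCREENSHOT_FOLDER = os.path.join(os.environ.get('AIL_HOME', '.'), 'crawled_screenshot', 'screenshot')

@app.route('/crawler/screenshot/<path:filename>')
def serve_screenshot(filename):
    # `filename` is a get_item_screenshot_path() value; the '.png' extension
    # is appended here because the stored relative path omits it.
    return send_from_directory(SCREENSHOT_FOLDER, filename + '.png')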
@@ -200,7 +206,7 @@ def showpaste(content_range, requested_path):
         crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_date_path()
+        crawler_metadata['screenshot'] = get_item_screenshot_path(requested_path)
     else:
         crawler_metadata['get_metadata'] = False
@@ -342,7 +348,7 @@ def show_item_min(requested_path , content_range=0):
         crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+relative_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_rel_path()
+        crawler_metadata['screenshot'] = get_item_screenshot_path(relative_path)
     else:
         crawler_metadata['get_metadata'] = False