chg: [crawled screenshot] use sha256 as filepath

Terrtia 2019-04-24 14:09:04 +02:00
parent 99b9c95638
commit 9868833c77
4 changed files with 45 additions and 19 deletions


@@ -233,9 +233,10 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')
             paste = paste.replace(self.paste_crawled_directory_name, '')
-            if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
-                l_screenshot_paste.append({'screenshot': paste[1:], 'item': origin_paste})
+            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            if screenshot:
+                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+                l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})
         if len(l_screenshot_paste) > num_screenshot:
             l_random_screenshot = []
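Both sides of this change rely on one convention: the sha256 hex digest stored under the item's paste_metadata hash is unpacked into six two-character directory levels, with the remainder of the digest as the filename stem. A minimal standalone sketch of that expansion (the helper name sha256_to_relative_path is illustrative, not from the repo):

    import os

    def sha256_to_relative_path(digest):
        # 'ab12cd34ef56...' -> 'ab/12/cd/34/ef/56/<remaining hex chars>'
        return os.path.join(digest[0:2], digest[2:4], digest[4:6],
                            digest[6:8], digest[8:10], digest[10:12],
                            digest[12:])

Sharding on digest prefixes caps each directory level at 256 entries, so no single directory grows unbounded as screenshots accumulate.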


@@ -12,6 +12,8 @@ import redis
 import json
 import time
+from hashlib import sha256
 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
@@ -103,7 +105,8 @@ class TorSplashCrawler():
             self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                             self.p.config.get("Directories", "crawled"), date_str )
-            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )

         def start_requests(self):
             yield SplashRequest(
@@ -135,13 +138,13 @@ class TorSplashCrawler():
                 UUID = self.domains[0][-215:]+str(uuid.uuid4())
             else:
                 UUID = self.domains[0]+str(uuid.uuid4())
-            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
-            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+            filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
+            relative_filename_paste = os.path.join(self.crawler_path, UUID)
+            filename_har = os.path.join(self.crawled_har, UUID +'.png')

             # # TODO: modify me
             # save new paste on disk
-            if self.save_crawled_paste(filename_paste, response.data['html']):
+            if self.save_crawled_paste(relative_filename_paste, response.data['html']):

                 # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
                 #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
@@ -170,14 +173,14 @@ class TorSplashCrawler():
                     self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

                 #create paste metadata
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)

-                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
+                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)

-                dirname = os.path.dirname(filename_screenshot)
+                dirname = os.path.dirname(filename_har)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)
@@ -185,11 +188,27 @@ class TorSplashCrawler():
                 size_screenshot = (len(response.data['png'])*3) /4
                 if size_screenshot < 5000000: #bytes
-                    with open(filename_screenshot, 'wb') as f:
-                        f.write(base64.standard_b64decode(response.data['png'].encode()))
+                    image_content = base64.standard_b64decode(response.data['png'].encode())
+                    hash = sha256(image_content).hexdigest()
+                    print(hash)
+                    img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
+                    filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
+                    dirname = os.path.dirname(filename_img)
+                    if not os.path.exists(dirname):
+                        os.makedirs(dirname)
+                    if not os.path.exists(filename_img):
+                        with open(filename_img, 'wb') as f:
+                            f.write(image_content)
+                    # add item metadata
+                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
+                    # add sha256 metadata
+                    self.r_serv_onion.zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1)

                 if 'har' in response.data:
-                    with open(filename_screenshot+'har.txt', 'wb') as f:
+                    dirname = os.path.dirname(filename_har)
+                    if not os.path.exists(dirname):
+                        os.makedirs(dirname)
+                    with open(filename_har+'har.txt', 'wb') as f:
                         f.write(json.dumps(response.data['har']).encode())

             # save external links in set
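Condensed, the new screenshot logic in the spider is content-addressed storage: decode the base64 PNG that Splash returned, hash the bytes, and write the file once under a digest-derived path. A simplified sketch, assuming screenshot_root points at the crawled_screenshot directory (the function name save_screenshot is illustrative):

    import base64
    import os
    from hashlib import sha256

    def save_screenshot(png_b64, screenshot_root):
        image_content = base64.standard_b64decode(png_b64.encode())
        digest = sha256(image_content).hexdigest()
        img_dir = os.path.join(screenshot_root, 'screenshot',
                               digest[0:2], digest[2:4], digest[4:6],
                               digest[6:8], digest[8:10], digest[10:12])
        filename_img = os.path.join(img_dir, digest[12:] + '.png')
        os.makedirs(img_dir, exist_ok=True)
        # identical captures hash to the same bytes and path,
        # so a duplicate screenshot is stored exactly once
        if not os.path.exists(filename_img):
            with open(filename_img, 'wb') as f:
                f.write(image_content)
        return digest

The digest, rather than a date-based filename, is what gets stored in the item's paste_metadata hash, and the screenshot:<hash> sorted set provides the reverse mapping from one image back to every item that produced it.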


@@ -167,7 +167,7 @@ dict_update_description = {'v1.5':{'nb_background_update': 4, 'update_warning_me
 UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')

 PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
-SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"), 'screenshot')

 REPO_ORIGIN = 'https://github.com/CIRCL/AIL-framework.git'
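A quick illustration of what the updated constant points at, under assumed values for AIL_HOME and the crawled_screenshot setting:

    import os
    # '/opt/AIL' and 'crawled_screenshot' are illustrative values, not the repo's guaranteed defaults
    SCREENSHOT_FOLDER = os.path.join('/opt/AIL', 'crawled_screenshot', 'screenshot')
    # -> '/opt/AIL/crawled_screenshot/screenshot', the root of the hash-sharded tree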


@@ -40,6 +40,12 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa
 # ============ FUNCTIONS ============

+def get_item_screenshot_path(item):
+    screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+    if screenshot:
+        screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+    return screenshot
+
 def showpaste(content_range, requested_path):
     if PASTES_FOLDER not in requested_path:
         # remove full path
@@ -200,7 +206,7 @@ def showpaste(content_range, requested_path):
         crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_date_path()
+        crawler_metadata['screenshot'] = get_item_screenshot_path(requested_path)
     else:
         crawler_metadata['get_metadata'] = False
@@ -342,7 +348,7 @@ def show_item_min(requested_path , content_range=0):
         crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+relative_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_rel_path()
+        crawler_metadata['screenshot'] = get_item_screenshot_path(relative_path)
     else:
         crawler_metadata['get_metadata'] = False
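Both views now hand the template a digest-derived relative path (or None when the item has no screenshot) instead of a date path. A hypothetical caller resolving it against the new SCREENSHOT_FOLDER; the item key below is made up, and the '.png' suffix assumption matches how the spider names the file on disk:

    # illustrative item key; real keys are relative paste paths
    rel = get_item_screenshot_path('crawled/2019/04/24/example.onion/some-uuid')
    if rel:
        screenshot_file = os.path.join(SCREENSHOT_FOLDER, rel + '.png')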