Mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-10 08:38:28 +00:00
chg: [Crawler] catch server response
This commit is contained in:
parent 6f0817365a
commit 0c63f2f24f
3 changed files with 83 additions and 63 deletions
@@ -38,6 +38,7 @@ class TorSplashCrawler():
                 },
             'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
+            'HTTPERROR_ALLOW_ALL': True,
             'DEPTH_LIMIT': crawler_depth_limit
             })
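Note on the added setting: Scrapy's HttpErrorMiddleware normally filters out non-2xx responses before they reach any spider callback, so without 'HTTPERROR_ALLOW_ALL': True the parse() changes below would never see the 504 this commit wants to catch. A minimal standalone sketch of the effect (hypothetical spider and test URL, not part of this commit):

    import scrapy

    class StatusSpider(scrapy.Spider):
        name = 'status_demo'
        start_urls = ['https://httpbin.org/status/504']   # hypothetical test endpoint
        custom_settings = {'HTTPERROR_ALLOW_ALL': True}   # without this, the 504 is dropped before parse()

        def parse(self, response):
            # with HTTPERROR_ALLOW_ALL on, error statuses arrive here as ordinary responses
            self.logger.info('got status %d for %s', response.status, response.url)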
@@ -96,7 +97,7 @@ class TorSplashCrawler():
         yield SplashRequest(
             self.start_urls,
             self.parse,
-            errback=self.errback_catcher,
+            #errback=self.errback_catcher,
             endpoint='render.json',
             meta={'father': self.original_paste},
             args={ 'html': 1,
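The errback is commented out here (and on the recursive request further down) because, with HTTPERROR_ALLOW_ALL enabled, HTTP error statuses are now delivered to parse() as normal responses; an errback only fires for non-HTTP failures such as DNS errors or timeouts. For reference, a hedged sketch of the standard Scrapy errback pattern that errback_catcher below follows (adapted from the Scrapy documentation, not part of this commit):

    from scrapy.spidermiddlewares.httperror import HttpError
    from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

    def errback_catcher(self, failure):
        # failure.check() tests which exception type the Twisted Failure wraps
        self.logger.error(repr(failure))
        if failure.check(HttpError):
            self.logger.error('HttpError on %s', failure.value.response.url)
        elif failure.check(DNSLookupError):
            self.logger.error('DNSLookupError on %s', failure.request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            self.logger.error('TimeoutError on %s', failure.request.url)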
@@ -109,84 +110,89 @@ class TorSplashCrawler():
     def parse(self,response):
         #print(response.headers)
         #print(response.status)
-
-        # # TODO: # FIXME:
-        self.r_cache.setbit(response.url, 0, 1)
-        self.r_cache.expire(response.url, 360000)
-
-        UUID = self.domains[0]+str(uuid.uuid4())
-        filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
-        relative_filename_paste = os.path.join(self.crawler_path, UUID)
-        filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
-
-        # save new paste on disk
-        if self.save_crawled_paste(filename_paste, response.data['html']):
-
-            # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
-            self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
-
-            self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
-            self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
-            self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
-
-            # create onion metadata
-            if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
-                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
-            self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
-
-            #create paste metadata
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
-
-            self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
-
-            dirname = os.path.dirname(filename_screenshot)
-            if not os.path.exists(dirname):
-                os.makedirs(dirname)
-
-            size_screenshot = (len(response.data['png'])*3) /4
-            if size_screenshot < 5000000: #bytes
-                with open(filename_screenshot, 'wb') as f:
-                    f.write(base64.standard_b64decode(response.data['png'].encode()))
-
-            #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
-            with open(filename_screenshot+'har.txt', 'wb') as f:
-                f.write(json.dumps(response.data['har']).encode())
-
-            # save external links in set
-            lext = LinkExtractor(deny_domains=self.domains, unique=True)
-            for link in lext.extract_links(response):
-                self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
-                self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
-
-            #le = LinkExtractor(unique=True)
-            le = LinkExtractor(allow_domains=self.domains, unique=True)
-            for link in le.extract_links(response):
-                self.r_cache.setbit(link, 0, 0)
-                self.r_cache.expire(link, 360000)
-                yield SplashRequest(
-                    link.url,
-                    self.parse,
-                    errback=self.errback_catcher,
-                    endpoint='render.json',
-                    meta={'father': relative_filename_paste},
-                    args={ 'html': 1,
-                    'png': 1,
-                    'render_all': 1,
-                    'har': 1,
-                    'wait': 10}
-                    #errback=self.errback_catcher
-                )
+        print(' | ')
+        if response.status == 504:
+            # down ?
+            print('504 detected')
+        #elif response.status in in range(400, 600):
+        elif response.status != 200:
+            print('other: {}'.format(response.status))
+        else:
+
+            UUID = self.domains[0]+str(uuid.uuid4())
+            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+            relative_filename_paste = os.path.join(self.crawler_path, UUID)
+            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+
+            # save new paste on disk
+            if self.save_crawled_paste(filename_paste, response.data['html']):
+
+                # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
+                self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+
+                self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
+                self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
+                self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
+
+                # create onion metadata
+                if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
+                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
+                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
+
+                #create paste metadata
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+
+                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
+
+                dirname = os.path.dirname(filename_screenshot)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+
+                size_screenshot = (len(response.data['png'])*3) /4
+                if size_screenshot < 5000000: #bytes
+                    with open(filename_screenshot, 'wb') as f:
+                        f.write(base64.standard_b64decode(response.data['png'].encode()))
+
+                #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
+                with open(filename_screenshot+'har.txt', 'wb') as f:
+                    f.write(json.dumps(response.data['har']).encode())
+
+                # save external links in set
+                lext = LinkExtractor(deny_domains=self.domains, unique=True)
+                for link in lext.extract_links(response):
+                    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+                    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
+
+                #le = LinkExtractor(unique=True)
+                le = LinkExtractor(allow_domains=self.domains, unique=True)
+                for link in le.extract_links(response):
+                    self.r_cache.setbit(link, 0, 0)
+                    self.r_cache.expire(link, 360000)
+                    yield SplashRequest(
+                        link.url,
+                        self.parse,
+                        #errback=self.errback_catcher,
+                        endpoint='render.json',
+                        meta={'father': relative_filename_paste},
+                        args={ 'html': 1,
+                        'png': 1,
+                        'render_all': 1,
+                        'har': 1,
+                        'wait': 10}
+                    )
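The size check in the block above runs on the still-encoded screenshot: Splash returns the PNG as base64, where every 4 characters encode 3 bytes, so (len(...)*3)/4 estimates the decoded size without decoding, and the 5000000 threshold caps screenshots at roughly 5 MB. A small worked example (hypothetical payload, not part of this commit):

    import base64

    b64_png = 'iVBORw0KGgo='                          # hypothetical payload: the 8-byte PNG magic, base64-encoded
    estimate = (len(b64_png) * 3) / 4                 # 9.0, the commit's formula; '=' padding is ignored
    exact = len(base64.standard_b64decode(b64_png))   # 8
    print(estimate, exact)                            # the estimate can overshoot by up to 2 bytes per payload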
 
+    '''
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
         print('failure')
-        print(failure)
-        print(failure.request.meta['item'])
+        #print(failure)
+        print(failure.type)
+        #print(failure.request.meta['item'])
 
         #if isinstance(failure.value, HttpError):
         if failure.check(HttpError):
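A side note on the two LinkExtractors in the new parse() body above: they divide a page's links by domain. deny_domains keeps only off-site links, which are recorded as external, while allow_domains keeps only same-domain links, which are re-queued as further SplashRequests. A self-contained sketch with a hypothetical page (example domains are placeholders):

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor

    html = b'<a href="http://example.onion/next">in</a> <a href="http://other.org/x">out</a>'
    response = HtmlResponse('http://example.onion/', body=html, encoding='utf-8')

    external = LinkExtractor(deny_domains=['example.onion'], unique=True)
    internal = LinkExtractor(allow_domains=['example.onion'], unique=True)
    print([l.url for l in external.extract_links(response)])   # ['http://other.org/x']
    print([l.url for l in internal.extract_links(response)])   # ['http://example.onion/next']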
@@ -209,7 +215,7 @@ class TorSplashCrawler():
             print('TimeoutError')
             print(TimeoutError)
             self.logger.error('TimeoutError on %s', request.url)
-
+    '''
 
     def save_crawled_paste(self, filename, content):
@@ -33,6 +33,7 @@ bootstrap_label = Flask_config.bootstrap_label
 misp_event_url = Flask_config.misp_event_url
 hive_case_url = Flask_config.hive_case_url
 vt_enabled = Flask_config.vt_enabled
+PASTES_FOLDER = Flask_config.PASTES_FOLDER
 SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER
 
 showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
@@ -40,6 +41,14 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
 # ============ FUNCTIONS ============
 
 def showpaste(content_range, requested_path):
+    if PASTES_FOLDER not in requested_path:
+        requested_path = os.path.join(PASTES_FOLDER, requested_path)
+        # remove old full path
+        #requested_path = requested_path.replace(PASTES_FOLDER, '')
+    # escape directory transversal
+    if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER:
+        return 'path transversal detected'
+
     vt_enabled = Flask_config.vt_enabled
 
     paste = Paste.Paste(requested_path)
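The new guard canonicalizes the requested path and refuses anything that resolves outside the pastes folder. A minimal illustration with hypothetical paths (not part of this commit):

    import os

    PASTES_FOLDER = '/opt/ail/PASTES'   # hypothetical value of Flask_config.PASTES_FOLDER
    evil = os.path.join(PASTES_FOLDER, '../../../etc/passwd')

    print(os.path.realpath(evil))       # '/etc/passwd', resolved outside the folder
    rejected = os.path.commonprefix((os.path.realpath(evil), PASTES_FOLDER)) != PASTES_FOLDER
    print(rejected)                     # True -> showpaste() returns 'path transversal detected'

One caveat worth knowing: os.path.commonprefix compares strings character by character, so a sibling directory such as /opt/ail/PASTES_other would still pass the prefix test; comparing against PASTES_FOLDER + os.sep is the usual hardening.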
@@ -173,6 +182,7 @@ def showpaste(content_range, requested_path):
     crawler_metadata = {}
     if 'infoleak:submission="crawler"' in l_tags:
         crawler_metadata['get_metadata'] = True
+        crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
         crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path)
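crawler_metadata['domain'] is read straight from the paste's metadata hash written by the crawler above; pastes crawled before this commit have no 'domain' field, and redis then returns None, which the template has to tolerate. A hedged redis-py sketch (hypothetical connection and key names, not part of this commit):

    import redis

    r_serv_metadata = redis.StrictRedis('localhost', 6382, 0, decode_responses=True)   # hypothetical instance
    r_serv_metadata.hset('paste_metadata:crawled/2018/example', 'domain', 'example.onion')

    print(r_serv_metadata.hget('paste_metadata:crawled/2018/example', 'domain'))   # 'example.onion'
    print(r_serv_metadata.hget('paste_metadata:crawled/2018/example', 'father'))   # None, field absent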
@@ -435,9 +435,13 @@
 
       <table class="table table-hover table-striped">
         <tbody>
+        <tr>
+          <td>Domain</td>
+          <td><a target="_blank" href="{{ url_for('hiddenServices.onion_domain') }}?onion_domain={{ crawler_metadata['domain'] }}" id='onion_domain'>{{ crawler_metadata['domain'] }}</a></td>
+        </tr>
         <tr>
           <td>Father</td>
-          <td>{{ crawler_metadata['paste_father'] }}</td>
+          <td><a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste') }}?paste={{ crawler_metadata['paste_father'] }}" id='paste_father'>{{ crawler_metadata['paste_father'] }}</a></td>
         </tr>
         <tr>
           <td>Source link</td>
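Both table cells now render as links built with Flask's url_for(), so the hrefs follow the blueprints' registered routes instead of hard-coding paths. A minimal sketch of the expansion (hypothetical route standing in for the real blueprint endpoint, not part of this commit):

    from flask import Flask, url_for

    app = Flask(__name__)

    @app.route('/showsavedpaste/')   # stand-in for the showsavedpastes blueprint route
    def showsavedpaste():
        return ''

    with app.test_request_context():
        # mirrors the template: endpoint path plus a query string
        print(url_for('showsavedpaste', paste='paste_id'))   # '/showsavedpaste/?paste=paste_id'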