chg: [Crawler] catch server response

This commit is contained in:
Terrtia 2018-09-17 15:35:06 +02:00
parent 6f0817365a
commit 0c63f2f24f
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 83 additions and 63 deletions

View file

@ -38,6 +38,7 @@ class TorSplashCrawler():
}, },
'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
'HTTPERROR_ALLOW_ALL': True,
'DEPTH_LIMIT': crawler_depth_limit 'DEPTH_LIMIT': crawler_depth_limit
}) })
@ -96,7 +97,7 @@ class TorSplashCrawler():
yield SplashRequest( yield SplashRequest(
self.start_urls, self.start_urls,
self.parse, self.parse,
errback=self.errback_catcher, #errback=self.errback_catcher,
endpoint='render.json', endpoint='render.json',
meta={'father': self.original_paste}, meta={'father': self.original_paste},
args={ 'html': 1, args={ 'html': 1,
@ -109,10 +110,14 @@ class TorSplashCrawler():
def parse(self,response): def parse(self,response):
#print(response.headers) #print(response.headers)
#print(response.status) #print(response.status)
print(' | ')
# # TODO: # FIXME: if response.status == 504:
self.r_cache.setbit(response.url, 0, 1) # down ?
self.r_cache.expire(response.url, 360000) print('504 detected')
#elif response.status in range(400, 600):
elif response.status != 200:
print('other: {}'.format(response.status))
else:
UUID = self.domains[0]+str(uuid.uuid4()) UUID = self.domains[0]+str(uuid.uuid4())
filename_paste = os.path.join(self.crawled_paste_filemame, UUID) filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
@ -170,7 +175,7 @@ class TorSplashCrawler():
yield SplashRequest( yield SplashRequest(
link.url, link.url,
self.parse, self.parse,
errback=self.errback_catcher, #errback=self.errback_catcher,
endpoint='render.json', endpoint='render.json',
meta={'father': relative_filename_paste}, meta={'father': relative_filename_paste},
args={ 'html': 1, args={ 'html': 1,
@ -178,15 +183,16 @@ class TorSplashCrawler():
'render_all': 1, 'render_all': 1,
'har': 1, 'har': 1,
'wait': 10} 'wait': 10}
#errback=self.errback_catcher
) )
'''
def errback_catcher(self, failure): def errback_catcher(self, failure):
# catch all errback failures, # catch all errback failures,
self.logger.error(repr(failure)) self.logger.error(repr(failure))
print('failure') print('failure')
print(failure) #print(failure)
print(failure.request.meta['item']) print(failure.type)
#print(failure.request.meta['item'])
#if isinstance(failure.value, HttpError): #if isinstance(failure.value, HttpError):
if failure.check(HttpError): if failure.check(HttpError):
@ -209,7 +215,7 @@ class TorSplashCrawler():
print('TimeoutError') print('TimeoutError')
print(TimeoutError) print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url) self.logger.error('TimeoutError on %s', request.url)
'''
def save_crawled_paste(self, filename, content): def save_crawled_paste(self, filename, content):

View file

@ -33,6 +33,7 @@ bootstrap_label = Flask_config.bootstrap_label
misp_event_url = Flask_config.misp_event_url misp_event_url = Flask_config.misp_event_url
hive_case_url = Flask_config.hive_case_url hive_case_url = Flask_config.hive_case_url
vt_enabled = Flask_config.vt_enabled vt_enabled = Flask_config.vt_enabled
PASTES_FOLDER = Flask_config.PASTES_FOLDER
SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER
showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
@ -40,6 +41,14 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa
# ============ FUNCTIONS ============ # ============ FUNCTIONS ============
def showpaste(content_range, requested_path): def showpaste(content_range, requested_path):
if PASTES_FOLDER not in requested_path:
requested_path = os.path.join(PASTES_FOLDER, requested_path)
# remove old full path
#requested_path = requested_path.replace(PASTES_FOLDER, '')
# guard against directory traversal
if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER:
return 'path transversal detected'
vt_enabled = Flask_config.vt_enabled vt_enabled = Flask_config.vt_enabled
paste = Paste.Paste(requested_path) paste = Paste.Paste(requested_path)
@ -173,6 +182,7 @@ def showpaste(content_range, requested_path):
crawler_metadata = {} crawler_metadata = {}
if 'infoleak:submission="crawler"' in l_tags: if 'infoleak:submission="crawler"' in l_tags:
crawler_metadata['get_metadata'] = True crawler_metadata['get_metadata'] = True
crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path) crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path)

View file

@ -435,9 +435,13 @@
<table class="table table-hover table-striped"> <table class="table table-hover table-striped">
<tbody> <tbody>
<tr>
<td>Domain</td>
<td><a target="_blank" href="{{ url_for('hiddenServices.onion_domain') }}?onion_domain={{ crawler_metadata['domain'] }}" id='onion_domain'>{{ crawler_metadata['domain'] }}</a></td>
</tr>
<tr> <tr>
<td>Father</td> <td>Father</td>
<td>{{ crawler_metadata['paste_father'] }}</td> <td><a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste') }}?paste={{ crawler_metadata['paste_father'] }}" id='paste_father'>{{ crawler_metadata['paste_father'] }}</a></td>
</tr> </tr>
<tr> <tr>
<td>Source link</td> <td>Source link</td>