fix: [Global Crawler] max filename size

This commit is contained in:
Terrtia 2019-02-12 15:45:58 +01:00
parent 423c7b1455
commit 7a4989ce10
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 21 additions and 13 deletions

View file

@ -23,23 +23,17 @@ Requirements
import base64
import os
import time
import uuid
from pubsublogger import publisher
from Helper import Process
import magic
import io
#import gzip
'''
def gunzip_bytes_obj(bytes_obj):
in_ = io.BytesIO()
in_.write(bytes_obj)
in_.seek(0)
with gzip.GzipFile(fileobj=in_, mode='rb') as fo:
gunzipped_bytes_obj = fo.read()
def rreplace(s, old, new, occurrence):
    """Replace occurrences of ``old`` in ``s`` with ``new``, from the right.

    Args:
        s: String to search.
        old: Substring to replace. Must be non-empty.
        new: Replacement text.
        occurrence: Maximum number of replacements, counted from the end
            of ``s``. It is handed to ``str.rsplit`` as ``maxsplit``, so
            ``0`` replaces nothing and a negative value replaces every
            occurrence.

    Returns:
        The resulting string; ``s`` unchanged when ``old`` is absent.

    Raises:
        ValueError: If ``old`` is empty (raised by ``str.rsplit``).
    """
    # Splitting from the right and re-joining with the replacement swaps
    # only the last ``occurrence`` separators.
    pieces = s.rsplit(old, occurrence)
    return new.join(pieces)
return gunzipped_bytes_obj.decode()'''
if __name__ == '__main__':
publisher.port = 6380
@ -77,6 +71,12 @@ if __name__ == '__main__':
processed_paste = 0
time.sleep(1)
continue
# Shorten over-long paste file names: keep the first 215 characters of
# the last path component and append a uuid4 string (36 characters) plus
# '.gz', giving a 254-character name.
file_name_paste = paste.rsplit('/', 1)[-1]
if len(file_name_paste) > 255:
    new_file_name_paste = file_name_paste[:215] + str(uuid.uuid4()) + '.gz'
    # Only the right-most match is swapped, i.e. the file name itself,
    # even if the same text also appears in a parent directory.
    paste = rreplace(paste, file_name_paste, new_file_name_paste, 1)
# Creating the full filepath
filename = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "pastes"), paste)

View file

@ -198,8 +198,12 @@ if __name__ == "__main__":
print(len(domains_list))
if len(domains_list) > 0:
publisher.warning('{}Detected {} .onion(s);{}'.format(
to_print, len(domains_list),PST.p_path))
# With the crawler activated the detection is logged at info level;
# otherwise it is logged as a warning. The message is the same either way.
log_detection = publisher.info if activate_crawler else publisher.warning
log_detection('{}Detected {} .onion(s);{}'.format(
    to_print, len(domains_list),PST.p_path))
now = datetime.datetime.now()
path = os.path.join('onions', str(now.year).zfill(4),
str(now.month).zfill(2),

View file

@ -126,7 +126,11 @@ class TorSplashCrawler():
print('Connection to proxy refused')
else:
UUID = self.domains[0]+str(uuid.uuid4())
# Avoid over-long file names (a single path component is commonly
# limited to 255 bytes). A uuid4 string is 36 characters and the
# screenshot name appends '.png' (4 more), so at most 215 characters of
# the domain may be kept. Slicing with [-215:] keeps the end of the
# domain and leaves shorter domains untouched, so no separate length
# check is needed.
UUID = self.domains[0][-215:] + str(uuid.uuid4())
filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
relative_filename_paste = os.path.join(self.crawler_path, UUID)
filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')