fix: [Global Crawler] max filename size

This commit is contained in:
Terrtia 2019-02-12 15:45:58 +01:00
parent 423c7b1455
commit 7a4989ce10
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 21 additions and 13 deletions

View file

@@ -23,23 +23,17 @@ Requirements
import base64 import base64
import os import os
import time import time
import uuid
from pubsublogger import publisher from pubsublogger import publisher
from Helper import Process from Helper import Process
import magic import magic
import io
#import gzip
''' def rreplace(s, old, new, occurrence):
def gunzip_bytes_obj(bytes_obj): li = s.rsplit(old, occurrence)
in_ = io.BytesIO() return new.join(li)
in_.write(bytes_obj)
in_.seek(0)
with gzip.GzipFile(fileobj=in_, mode='rb') as fo:
gunzipped_bytes_obj = fo.read()
return gunzipped_bytes_obj.decode()'''
if __name__ == '__main__': if __name__ == '__main__':
publisher.port = 6380 publisher.port = 6380
@@ -77,6 +71,12 @@ if __name__ == '__main__':
processed_paste = 0 processed_paste = 0
time.sleep(1) time.sleep(1)
continue continue
file_name_paste = paste.split('/')[-1]
if len(file_name_paste)>255:
new_file_name_paste = '{}{}.gz'.format(file_name_paste[:215], str(uuid.uuid4()))
paste = rreplace(paste, file_name_paste, new_file_name_paste, 1)
# Creating the full filepath # Creating the full filepath
filename = os.path.join(os.environ['AIL_HOME'], filename = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "pastes"), paste) p.config.get("Directories", "pastes"), paste)

View file

@@ -198,8 +198,12 @@ if __name__ == "__main__":
print(len(domains_list)) print(len(domains_list))
if len(domains_list) > 0: if len(domains_list) > 0:
if not activate_crawler:
publisher.warning('{}Detected {} .onion(s);{}'.format( publisher.warning('{}Detected {} .onion(s);{}'.format(
to_print, len(domains_list),PST.p_path)) to_print, len(domains_list),PST.p_path))
else:
publisher.info('{}Detected {} .onion(s);{}'.format(
to_print, len(domains_list),PST.p_path))
now = datetime.datetime.now() now = datetime.datetime.now()
path = os.path.join('onions', str(now.year).zfill(4), path = os.path.join('onions', str(now.year).zfill(4),
str(now.month).zfill(2), str(now.month).zfill(2),

View file

@@ -126,6 +126,10 @@ class TorSplashCrawler():
print('Connection to proxy refused') print('Connection to proxy refused')
else: else:
#avoid filename too big
if self.domains[0] > 225:
UUID = self.domains[0][-215:]+str(uuid.uuid4())
else
UUID = self.domains[0]+str(uuid.uuid4()) UUID = self.domains[0]+str(uuid.uuid4())
filename_paste = os.path.join(self.crawled_paste_filemame, UUID) filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
relative_filename_paste = os.path.join(self.crawler_path, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID)