From 5872cf9196da2567be49dddcc75ddf6a77e9ef56 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 2 Nov 2018 16:07:27 +0100 Subject: [PATCH 1/4] fix: [Scripts] Remove absolute path --- bin/ApiKey.py | 6 +++--- bin/Attributes.py | 4 ++-- bin/BankAccount.py | 4 ++-- bin/Bitcoin.py | 4 ++-- bin/Categ.py | 10 ++-------- bin/Credential.py | 2 +- bin/CreditCards.py | 6 +++--- bin/Decoder.py | 2 +- bin/DomClassifier.py | 6 +++--- bin/Duplicates.py | 10 +++++----- bin/Global.py | 13 +++++++++---- bin/LibInjection.py | 2 +- bin/Lines.py | 9 +++++---- bin/MISP_The_Hive_feeder.py | 2 +- bin/Mail.py | 2 +- bin/Mixer.py | 8 ++++++-- bin/Onion.py | 14 +++++++------- bin/RegexForTermsFrequency.py | 2 +- bin/Release.py | 4 ++-- bin/SQLInjectionDetection.py | 4 ++-- bin/Tokenize.py | 4 ++-- bin/Web.py | 2 +- bin/packages/Paste.py | 5 ++++- var/www/modules/showpaste/Flask_showpaste.py | 18 ++++++++++-------- 24 files changed, 76 insertions(+), 67 deletions(-) diff --git a/bin/ApiKey.py b/bin/ApiKey.py index faf4b2d9..bab2745c 100755 --- a/bin/ApiKey.py +++ b/bin/ApiKey.py @@ -40,7 +40,7 @@ def search_api_key(message): print('found google api key') print(to_print) publisher.warning('{}Checked {} found Google API Key;{}'.format( - to_print, len(google_api_key), paste.p_path)) + to_print, len(google_api_key), paste.p_rel_path)) msg = 'infoleak:automatic-detection="google-api-key";{}'.format(filename) p.populate_set_out(msg, 'Tags') @@ -49,7 +49,7 @@ def search_api_key(message): print(to_print) total = len(aws_access_key) + len(aws_secret_key) publisher.warning('{}Checked {} found AWS Key;{}'.format( - to_print, total, paste.p_path)) + to_print, total, paste.p_rel_path)) msg = 'infoleak:automatic-detection="aws-key";{}'.format(filename) p.populate_set_out(msg, 'Tags') @@ -86,7 +86,7 @@ if __name__ == "__main__": if message is not None: - search_api_key(message) + search_api_key(message) else: publisher.debug("Script ApiKey is Idling 10s") diff --git a/bin/Attributes.py b/bin/Attributes.py index a29f34b3..74357065 100755 --- a/bin/Attributes.py +++ b/bin/Attributes.py @@ -43,8 +43,8 @@ if __name__ == "__main__": # FIXME why not all saving everything there. PST.save_all_attributes_redis() # FIXME Not used. 
- PST.store.sadd("Pastes_Objects", PST.p_path) + PST.store.sadd("Pastes_Objects", PST.p_rel_path) except IOError: - print("CRC Checksum Failed on :", PST.p_path) + print("CRC Checksum Failed on :", PST.p_rel_path) publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format( PST.p_source, PST.p_date, PST.p_name)) diff --git a/bin/BankAccount.py b/bin/BankAccount.py index 06e86d06..cd58e3c3 100755 --- a/bin/BankAccount.py +++ b/bin/BankAccount.py @@ -67,7 +67,7 @@ def check_all_iban(l_iban, paste, filename): if(nb_valid_iban > 0): to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) publisher.warning('{}Checked found {} IBAN;{}'.format( - to_print, nb_valid_iban, paste.p_path)) + to_print, nb_valid_iban, paste.p_rel_path)) msg = 'infoleak:automatic-detection="iban";{}'.format(filename) p.populate_set_out(msg, 'Tags') @@ -113,7 +113,7 @@ if __name__ == "__main__": try: l_iban = iban_regex.findall(content) except TimeoutException: - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/Bitcoin.py b/bin/Bitcoin.py index 1b7694b7..da1fc22a 100755 --- a/bin/Bitcoin.py +++ b/bin/Bitcoin.py @@ -32,7 +32,7 @@ def decode_base58(bc, length): for char in bc: n = n * 58 + digits58.index(char) return n.to_bytes(length, 'big') - + def check_bc(bc): try: bcbytes = decode_base58(bc, 25) @@ -75,7 +75,7 @@ def search_key(content, message, paste): to_print = 'Bitcoin;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) publisher.warning('{}Detected {} Bitcoin private key;{}'.format( - to_print, len(bitcoin_private_key),paste.p_path)) + to_print, len(bitcoin_private_key),paste.p_rel_path)) if __name__ == "__main__": publisher.port = 6380 diff --git a/bin/Categ.py b/bin/Categ.py index cf78f90f..3ebc42ea 100755 --- a/bin/Categ.py +++ b/bin/Categ.py @@ -89,16 +89,10 @@ if __name__ == "__main__": paste = Paste.Paste(filename) content = paste.get_p_content() - #print('-----------------------------------------------------') - #print(filename) - #print(content) - #print('-----------------------------------------------------') - for categ, pattern in tmp_dict.items(): found = set(re.findall(pattern, content)) if len(found) >= matchingThreshold: - msg = '{} {}'.format(paste.p_path, len(found)) - #msg = " ".join( [paste.p_path, bytes(len(found))] ) + msg = '{} {}'.format(paste.p_rel_path, len(found)) print(msg, categ) p.populate_set_out(msg, categ) @@ -106,4 +100,4 @@ if __name__ == "__main__": publisher.info( 'Categ;{};{};{};Detected {} as {};{}'.format( paste.p_source, paste.p_date, paste.p_name, - len(found), categ, paste.p_path)) + len(found), categ, paste.p_rel_path)) diff --git a/bin/Credential.py b/bin/Credential.py index 7f665227..417b30eb 100755 --- a/bin/Credential.py +++ b/bin/Credential.py @@ -97,7 +97,7 @@ if __name__ == "__main__": if sites_set: message += ' Related websites: {}'.format( (', '.join(sites_set)) ) - to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path) + to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path) print('\n '.join(creds)) diff --git a/bin/CreditCards.py b/bin/CreditCards.py index 260d1345..a7921a6e 100755 --- a/bin/CreditCards.py +++ b/bin/CreditCards.py @@ -77,9 +77,9 @@ if __name__ == "__main__": paste.p_source, paste.p_date, paste.p_name) if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid 
number(s);{}'.format( - to_print, len(creditcard_set), paste.p_path)) + to_print, len(creditcard_set), paste.p_rel_path)) print('{}Checked {} valid number(s);{}'.format( - to_print, len(creditcard_set), paste.p_path)) + to_print, len(creditcard_set), paste.p_rel_path)) #Send to duplicate p.populate_set_out(filename, 'Duplicate') #send to Browse_warning_paste @@ -89,7 +89,7 @@ if __name__ == "__main__": msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename) p.populate_set_out(msg, 'Tags') else: - publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path)) + publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_rel_path)) else: publisher.debug("Script creditcard is idling 1m") time.sleep(10) diff --git a/bin/Decoder.py b/bin/Decoder.py index abbf760b..fa18e5e6 100755 --- a/bin/Decoder.py +++ b/bin/Decoder.py @@ -229,7 +229,7 @@ if __name__ == '__main__': except TimeoutException: encoded_list = [] p.incr_module_timeout_statistic() # add encoder type - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py index aed87a55..1ae5ba13 100755 --- a/bin/DomClassifier.py +++ b/bin/DomClassifier.py @@ -54,14 +54,14 @@ def main(): if localizeddomains: print(localizeddomains) publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( - PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path)) + PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_rel_path)) localizeddomains = c.localizedomain(cc=cc) if localizeddomains: print(localizeddomains) publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( - PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path)) + PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_rel_path)) except IOError: - print("CRC Checksum Failed on :", PST.p_path) + print("CRC Checksum Failed on :", PST.p_rel_path) publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format( PST.p_source, PST.p_date, PST.p_name)) diff --git a/bin/Duplicates.py b/bin/Duplicates.py index 0c24bec1..611368a1 100755 --- a/bin/Duplicates.py +++ b/bin/Duplicates.py @@ -142,17 +142,17 @@ if __name__ == "__main__": paste_date = paste_date paste_date = paste_date if paste_date != None else "No date available" if paste_path != None: - if paste_path != PST.p_path: + if paste_path != PST.p_rel_path: hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date) - print('['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)) + print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + ' and ' + str(paste_path) + ' percentage: ' + str(percent)) except Exception: print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash) # Add paste in DB after checking to prevent its analysis twice # hash_type_i -> index_i AND index_i -> PST.PATH - r_serv1.set(index, PST.p_path) + r_serv1.set(index, PST.p_rel_path) r_serv1.set(index+'_date', PST._get_p_date()) r_serv1.sadd("INDEX", index) # Adding hashes in Redis @@ -180,7 +180,7 @@ if __name__ == "__main__": PST.__setattr__("p_duplicate", dupl) PST.save_attribute_duplicate(dupl) PST.save_others_pastes_attribute_duplicate(dupl) - publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path)) + publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path)) print('{}Detected 
{}'.format(to_print, len(dupl))) print('') @@ -191,5 +191,5 @@ if __name__ == "__main__": except IOError: to_print = 'Duplicate;{};{};{};'.format( PST.p_source, PST.p_date, PST.p_name) - print("CRC Checksum Failed on :", PST.p_path) + print("CRC Checksum Failed on :", PST.p_rel_path) publisher.error('{}CRC Checksum Failed'.format(to_print)) diff --git a/bin/Global.py b/bin/Global.py index 32a3656b..22b4c4e7 100755 --- a/bin/Global.py +++ b/bin/Global.py @@ -51,6 +51,9 @@ if __name__ == '__main__': p = Process(config_section) + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + print(PASTES_FOLDER) + # LOGGING # publisher.info("Feed Script started to receive & publish.") @@ -78,8 +81,9 @@ if __name__ == '__main__': time.sleep(1) continue # Creating the full filepath - filename = os.path.join(os.environ['AIL_HOME'], - p.config.get("Directories", "pastes"), paste) + filename = os.path.join(PASTES_FOLDER, paste) + print(filename) + print(paste) dirname = os.path.dirname(filename) if not os.path.exists(dirname): @@ -102,6 +106,7 @@ if __name__ == '__main__': print(filename) print(type) print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') - ''' - p.populate_set_out(filename) + ''' + + p.populate_set_out(paste) processed_paste+=1 diff --git a/bin/LibInjection.py b/bin/LibInjection.py index 283bba00..5088d9c5 100755 --- a/bin/LibInjection.py +++ b/bin/LibInjection.py @@ -47,7 +47,7 @@ def analyse(url, path): paste = Paste.Paste(path) print("Detected (libinjection) SQL in URL: ") print(urllib.request.unquote(url)) - to_print = 'LibInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_path) + to_print = 'LibInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_rel_path) publisher.warning(to_print) #Send to duplicate p.populate_set_out(path, 'Duplicate') diff --git a/bin/Lines.py b/bin/Lines.py index 8c9f6827..e4187dc7 100755 --- a/bin/Lines.py +++ b/bin/Lines.py @@ -75,10 +75,11 @@ if __name__ == '__main__': PST.save_attribute_redis("p_max_length_line", lines_infos[1]) # FIXME Not used. 
- PST.store.sadd("Pastes_Objects", PST.p_path) + PST.store.sadd("Pastes_Objects", PST.p_rel_path) + print(PST.p_rel_path) if lines_infos[1] < args.max: - p.populate_set_out( PST.p_path , 'LinesShort') + p.populate_set_out( PST.p_rel_path , 'LinesShort') else: - p.populate_set_out( PST.p_path , 'LinesLong') + p.populate_set_out( PST.p_rel_path , 'LinesLong') except IOError: - print("CRC Checksum Error on : ", PST.p_path) + print("CRC Checksum Error on : ", PST.p_rel_path) diff --git a/bin/MISP_The_Hive_feeder.py b/bin/MISP_The_Hive_feeder.py index 0a8f1791..c1ef414d 100755 --- a/bin/MISP_The_Hive_feeder.py +++ b/bin/MISP_The_Hive_feeder.py @@ -180,7 +180,7 @@ if __name__ == "__main__": if flag_the_hive or flag_misp: tag, path = message.split(';') paste = Paste.Paste(path) - source = '/'.join(paste.p_path.split('/')[-6:]) + source = '/'.join(paste.p_rel_path.split('/')[-6:]) full_path = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"), path) diff --git a/bin/Mail.py b/bin/Mail.py index 1f682661..33d8de43 100755 --- a/bin/Mail.py +++ b/bin/Mail.py @@ -78,7 +78,7 @@ if __name__ == "__main__": to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\ format(PST.p_source, PST.p_date, PST.p_name, - MX_values[0], PST.p_path) + MX_values[0], PST.p_rel_path) if MX_values[0] > is_critical: publisher.warning(to_print) #Send to duplicate diff --git a/bin/Mixer.py b/bin/Mixer.py index e1656b8e..e41e8e0d 100755 --- a/bin/Mixer.py +++ b/bin/Mixer.py @@ -81,6 +81,8 @@ if __name__ == '__main__': operation_mode = cfg.getint("Module_Mixer", "operation_mode") ttl_key = cfg.getint("Module_Mixer", "ttl_duplicate") + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + # STATS # processed_paste = 0 processed_paste_per_feeder = {} @@ -103,11 +105,12 @@ if __name__ == '__main__': feeder_name.replace(" ","") if 'import_dir' in feeder_name: feeder_name = feeder_name.split('/')[1] - paste_name = complete_paste except ValueError as e: feeder_name = "unnamed_feeder" - paste_name = complete_paste + + # remove absolute path + paste_name = complete_paste.replace(PASTES_FOLDER, '', 1) # Processed paste processed_paste += 1 @@ -118,6 +121,7 @@ if __name__ == '__main__': processed_paste_per_feeder[feeder_name] = 1 duplicated_paste_per_feeder[feeder_name] = 0 + relay_message = "{0} {1}".format(paste_name, gzip64encoded) #relay_message = b" ".join( [paste_name, gzip64encoded] ) diff --git a/bin/Onion.py b/bin/Onion.py index 1f233fcf..e38f363a 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -164,7 +164,7 @@ if __name__ == "__main__": r_onion.sadd('i2p_domain', domain) r_onion.sadd('i2p_link', url) r_onion.sadd('i2p_domain_crawler_queue', domain) - msg = '{};{}'.format(url,PST.p_path) + msg = '{};{}'.format(url,PST.p_rel_path) r_onion.sadd('i2p_crawler_queue', msg) ''' @@ -178,7 +178,7 @@ if __name__ == "__main__": if len(domains_list) > 0: publisher.warning('{}Detected {} .onion(s);{}'.format( - to_print, len(domains_list),PST.p_path)) + to_print, len(domains_list),PST.p_rel_path)) now = datetime.datetime.now() path = os.path.join('onions', str(now.year).zfill(4), str(now.month).zfill(2), @@ -203,19 +203,19 @@ if __name__ == "__main__": if not r_onion.sismember('onion_domain_crawler_queue', domain): print('send to onion crawler') r_onion.sadd('onion_domain_crawler_queue', domain) - msg = '{};{}'.format(url,PST.p_path) + msg = '{};{}'.format(url,PST.p_rel_path) r_onion.sadd('onion_crawler_queue', msg) #p.populate_set_out(msg, 'Crawler') else: for url in fetch(p, 
r_cache, urls, domains_list, path): - publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) - p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') + publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path)) + p.populate_set_out('onion;{}'.format(PST.p_rel_path), 'alertHandler') - msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) + msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path) p.populate_set_out(msg, 'Tags') else: - publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) + publisher.info('{}Onion related;{}'.format(to_print, PST.p_rel_path)) prec_filename = filename else: diff --git a/bin/RegexForTermsFrequency.py b/bin/RegexForTermsFrequency.py index fae7a03a..4e98edcc 100755 --- a/bin/RegexForTermsFrequency.py +++ b/bin/RegexForTermsFrequency.py @@ -106,7 +106,7 @@ if __name__ == "__main__": try: matched = compiled_regex.search(content) except TimeoutException: - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/Release.py b/bin/Release.py index 43c84b04..d2f18441 100755 --- a/bin/Release.py +++ b/bin/Release.py @@ -54,7 +54,7 @@ if __name__ == "__main__": if len(releases) == 0: continue - to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path) + to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_rel_path) print(to_print) if len(releases) > 30: publisher.warning(to_print) @@ -63,7 +63,7 @@ if __name__ == "__main__": except TimeoutException: p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/SQLInjectionDetection.py b/bin/SQLInjectionDetection.py index f03d7555..9464fd8a 100755 --- a/bin/SQLInjectionDetection.py +++ b/bin/SQLInjectionDetection.py @@ -78,7 +78,7 @@ def analyse(url, path): if (result_path > 1) or (result_query > 1): print("Detected SQL in URL: ") print(urllib.request.unquote(url)) - to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_path) + to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_rel_path) publisher.warning(to_print) #Send to duplicate p.populate_set_out(path, 'Duplicate') @@ -97,7 +97,7 @@ def analyse(url, path): else: print("Potential SQL injection:") print(urllib.request.unquote(url)) - to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Potential SQL injection", paste.p_path) + to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Potential SQL injection", paste.p_rel_path) publisher.info(to_print) diff --git a/bin/Tokenize.py b/bin/Tokenize.py index 698b4fbc..4e13b9ff 100755 --- a/bin/Tokenize.py +++ b/bin/Tokenize.py @@ -57,11 +57,11 @@ if __name__ == "__main__": try: for word, score in paste._get_top_words().items(): if len(word) >= 4: - msg = '{} {} {}'.format(paste.p_path, word, score) + msg = '{} {} {}'.format(paste.p_rel_path, word, score) p.populate_set_out(msg) except TimeoutException: p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: 
signal.alarm(0) diff --git a/bin/Web.py b/bin/Web.py index 3d53e306..7cc96822 100755 --- a/bin/Web.py +++ b/bin/Web.py @@ -153,7 +153,7 @@ if __name__ == "__main__": pprint.pprint(A_values) publisher.info('Url;{};{};{};Checked {} URL;{}'.format( - PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_path)) + PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path)) prec_filename = filename else: diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index d02a92f5..c5dcc0a6 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -101,7 +101,7 @@ class Paste(object): var = self.p_path.split('/') self.p_date = Date(var[-4], var[-3], var[-2]) - self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name) + self.p_date_path = os.path.join(var[-4], var[-3], var[-2], self.p_name) self.p_source = var[-5] self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0]) @@ -304,6 +304,9 @@ class Paste(object): def get_p_rel_path(self): return self.p_rel_path + def get_p_date_path(self): + return self.p_date_path + def save_all_attributes_redis(self, key=None): """ Saving all the attributes in a "Redis-like" Database (Redis, LevelDB) diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index 188af759..c24e3335 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -41,12 +41,10 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa # ============ FUNCTIONS ============ def showpaste(content_range, requested_path): - relative_path = None - if PASTES_FOLDER not in requested_path: - relative_path = requested_path - requested_path = os.path.join(PASTES_FOLDER, requested_path) - # remove old full path - #requested_path = requested_path.replace(PASTES_FOLDER, '') + if PASTES_FOLDER in requested_path: + # remove full path + requested_path = requested_path.replace(PASTES_FOLDER, '', 1) + #requested_path = os.path.join(PASTES_FOLDER, requested_path) # escape directory transversal if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER: return 'path transversal detected' @@ -124,8 +122,12 @@ def showpaste(content_range, requested_path): active_taxonomies = r_serv_tags.smembers('active_taxonomies') l_tags = r_serv_metadata.smembers('tag:'+requested_path) + print(l_tags) if relative_path is not None: - l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) ) + print('union') + print(relative_path) + print(r_serv_metadata.smembers('tag:'+relative_path)) + l_tags = l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) ) #active galaxies active_galaxies = r_serv_tags.smembers('active_galaxies') @@ -189,7 +191,7 @@ def showpaste(content_range, requested_path): crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') - crawler_metadata['screenshot'] = paste.get_p_rel_path() + crawler_metadata['screenshot'] = paste.get_p_date_path() else: crawler_metadata['get_metadata'] = False From 108fdb868e2615d703ed91443bc3bda50fe11811 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 15 Nov 2018 10:39:41 +0100 Subject: [PATCH 2/4] chg: update Overview --- OVERVIEW.md | 10 ++++++++++ bin/SentimentAnalysis.py | 20 +++++++++++++++++--- 2 
files changed, 27 insertions(+), 3 deletions(-) diff --git a/OVERVIEW.md b/OVERVIEW.md index 3d3a62ab..8b324e21 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -46,6 +46,16 @@ ARDB_DB * DB 3 - Trending * DB 4 - Sentiment + ----------------------------------------- SENTIMENT ------------------------------------ + + SET - 'Provider_set' Provider + + KEY - 'UniqID' INT + + SET - provider_timestamp UniqID + + SET - UniqID avg_score + * DB 5 - TermCred * DB 6 - Tags * DB 7 - Metadata diff --git a/bin/SentimentAnalysis.py b/bin/SentimentAnalysis.py index 8442befa..3a014050 100755 --- a/bin/SentimentAnalysis.py +++ b/bin/SentimentAnalysis.py @@ -45,6 +45,13 @@ cfg = configparser.ConfigParser() cfg.read(configfile) sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file") +time_clean_sentiment_db = 60*60 + +def clean_db(): + sevenDays = oneHour*24*7 + dateStart = datetime.datetime.now() + dateStart = dateStart.replace(minute=0, second=0, microsecond=0) + dateStart_timestamp = calendar.timegm(dateStart.timetuple()) def Analyse(message, server): path = message @@ -157,12 +164,19 @@ if __name__ == '__main__': db=p.config.get("ARDB_Sentiment", "db"), decode_responses=True) + time1 = time.time() + while True: message = p.get_from_set() if message is None: - publisher.debug("{} queue is empty, waiting".format(config_section)) - time.sleep(1) - continue + if int(time.time() - time1) > time_clean_sentiment_db: + clean_db() + time1 = time.time() + continue + else: + publisher.debug("{} queue is empty, waiting".format(config_section)) + time.sleep(1) + continue signal.alarm(60) try: Analyse(message, server) From 4e680aabf03750292cb6795c814c4eb80a6b4087 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 20 Nov 2018 14:39:45 +0100 Subject: [PATCH 3/4] chg: [Overview] add doc --- OVERVIEW.md | 18 ++++++++++++++++-- bin/SentimentAnalysis.py | 24 +++++++++--------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/OVERVIEW.md b/OVERVIEW.md index 8b324e21..32eae1d8 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -58,10 +58,18 @@ ARDB_DB * DB 5 - TermCred * DB 6 - Tags -* DB 7 - Metadata -* DB 8 - Statistics + ---------------------------------------------------------------------------------------- + + SET - tag paste* + + ---------------------------------------------------------------------------------------- * DB 7 - Metadata: + ---------------------------------------------------------------------------------------- + + SET - 'tag:' + paste tag + + ---------------------------------------------------------------------------------------- ----------------------------------------- BASE64 ---------------------------------------- HSET - 'metadata_hash:'+hash 'saved_path' saved_path @@ -99,3 +107,9 @@ ARDB_DB GET - 'base64_decoded:'+date nd_decoded GET - 'binary_decoded:'+date nd_decoded + +* DB 8 - Statistics +* DB 9 - Onion: + ---------------------------------------------------------------------------------------- + + diff --git a/bin/SentimentAnalysis.py b/bin/SentimentAnalysis.py index 3a014050..1305fb4f 100755 --- a/bin/SentimentAnalysis.py +++ b/bin/SentimentAnalysis.py @@ -45,13 +45,7 @@ cfg = configparser.ConfigParser() cfg.read(configfile) sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file") -time_clean_sentiment_db = 60*60 - -def clean_db(): - sevenDays = oneHour*24*7 - dateStart = datetime.datetime.now() - dateStart = dateStart.replace(minute=0, second=0, microsecond=0) - dateStart_timestamp = calendar.timegm(dateStart.timetuple()) +#time_clean_sentiment_db = 
60*60 def Analyse(message, server): path = message @@ -169,14 +163,14 @@ if __name__ == '__main__': while True: message = p.get_from_set() if message is None: - if int(time.time() - time1) > time_clean_sentiment_db: - clean_db() - time1 = time.time() - continue - else: - publisher.debug("{} queue is empty, waiting".format(config_section)) - time.sleep(1) - continue + #if int(time.time() - time1) > time_clean_sentiment_db: + # clean_db() + # time1 = time.time() + # continue + #else: + publisher.debug("{} queue is empty, waiting".format(config_section)) + time.sleep(1) + continue signal.alarm(60) try: Analyse(message, server) From 31a8dfe0b39f213dae64115f76d5e6d2e8048807 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 21 Nov 2018 16:45:25 +0100 Subject: [PATCH 4/4] fix: [AIL] use only relative paths pastes duplicates are fixed on the fly --- bin/Global.py | 3 --- bin/Mixer.py | 2 +- bin/packages/HiddenServices.py | 19 +++--------------- bin/packages/Paste.py | 20 +++++++++++-------- var/www/modules/Flask_config.py | 2 +- var/www/modules/Tags/Flask_Tags.py | 1 - .../browsepastes/Flask_browsepastes.py | 3 +++ .../hiddenServices/Flask_hiddenServices.py | 6 +----- .../hiddenServices/templates/showDomain.html | 2 +- var/www/modules/search/Flask_search.py | 7 ++++--- var/www/modules/showpaste/Flask_showpaste.py | 16 +++++++-------- 11 files changed, 33 insertions(+), 48 deletions(-) diff --git a/bin/Global.py b/bin/Global.py index 22b4c4e7..c1e16496 100755 --- a/bin/Global.py +++ b/bin/Global.py @@ -52,7 +52,6 @@ if __name__ == '__main__': p = Process(config_section) PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) - print(PASTES_FOLDER) # LOGGING # publisher.info("Feed Script started to receive & publish.") @@ -82,8 +81,6 @@ if __name__ == '__main__': continue # Creating the full filepath filename = os.path.join(PASTES_FOLDER, paste) - print(filename) - print(paste) dirname = os.path.dirname(filename) if not os.path.exists(dirname): diff --git a/bin/Mixer.py b/bin/Mixer.py index 760a3480..cbb39676 100755 --- a/bin/Mixer.py +++ b/bin/Mixer.py @@ -82,7 +82,7 @@ if __name__ == '__main__': ttl_key = cfg.getint("Module_Mixer", "ttl_duplicate") default_unnamed_feed_name = cfg.get("Module_Mixer", "default_unnamed_feed_name") - PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + '/' # STATS # processed_paste = 0 diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index d515c955..170e1dc3 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -99,11 +99,7 @@ class HiddenServices(object): if father is None: return [] l_crawled_pastes = [] - paste_parent = father.replace(self.paste_directory+'/', '') - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) - ## TODO: # FIXME: remove me - paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) - paste_childrens = paste_childrens | paste_children + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) for children in paste_childrens: if self.domain in children: l_crawled_pastes.append(children) @@ -117,14 +113,9 @@ class HiddenServices(object): set_domain = set() for paste in l_paste: - paste_full = paste.replace(self.paste_directory+'/', '') - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_full)) - ## 
TODO: # FIXME: remove me - paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(paste)) - paste_childrens = paste_childrens | paste_children + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste)) for children in paste_childrens: if not self.domain in children: - print(children) set_domain.add((children.split('.onion')[0]+'.onion').split('/')[-1]) return set_domain @@ -133,11 +124,7 @@ class HiddenServices(object): if father is None: return [] l_crawled_pastes = [] - paste_parent = father.replace(self.paste_directory+'/', '') - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) - ## TODO: # FIXME: remove me - paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) - paste_childrens = paste_childrens | paste_children + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) for children in paste_childrens: if not self.domain in children: l_crawled_pastes.append(children) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index c5dcc0a6..f1521d22 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -82,14 +82,14 @@ class Paste(object): db=cfg.getint("ARDB_Metadata", "db"), decode_responses=True) - PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) - if PASTES_FOLDER not in p_path: + self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + if self.PASTES_FOLDER not in p_path: self.p_rel_path = p_path - p_path = os.path.join(PASTES_FOLDER, p_path) + self.p_path = os.path.join(self.PASTES_FOLDER, p_path) else: - self.p_rel_path = None + self.p_path = p_path + self.p_rel_path = p_path.replace(self.PASTES_FOLDER+'/', '', 1) - self.p_path = p_path self.p_name = os.path.basename(self.p_path) self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2) self.p_mime = magic.from_buffer("test", mime=True) @@ -286,9 +286,13 @@ class Paste(object): return False, var def _get_p_duplicate(self): - self.p_duplicate = self.store_metadata.smembers('dup:'+self.p_path) - if self.p_rel_path is not None: - self.p_duplicate.union( self.store_metadata.smembers('dup:'+self.p_rel_path) ) + p_duplicate = self.store_metadata.smembers('dup:'+self.p_path) + # remove absolute path #fix-db + if p_duplicate: + for duplicate_string in p_duplicate: + self.store_metadata.srem('dup:'+self.p_path, duplicate_string) + self.store_metadata.sadd('dup:'+self.p_rel_path, duplicate_string.replace(self.PASTES_FOLDER+'/', '', 1)) + self.p_duplicate = self.store_metadata.smembers('dup:'+self.p_rel_path) if self.p_duplicate is not None: return list(self.p_duplicate) else: diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 7cc802f0..104a1c25 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -154,7 +154,7 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted') -PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) +PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/' SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs")) diff --git a/var/www/modules/Tags/Flask_Tags.py b/var/www/modules/Tags/Flask_Tags.py index bbc918ed..e79d56fc 100644 --- a/var/www/modules/Tags/Flask_Tags.py 
+++ b/var/www/modules/Tags/Flask_Tags.py @@ -28,7 +28,6 @@ r_serv_statistics = Flask_config.r_serv_statistics max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal bootstrap_label = Flask_config.bootstrap_label -PASTES_FOLDER = Flask_config.PASTES_FOLDER Tags = Blueprint('Tags', __name__, template_folder='templates') diff --git a/var/www/modules/browsepastes/Flask_browsepastes.py b/var/www/modules/browsepastes/Flask_browsepastes.py index eb962ffe..96839d78 100644 --- a/var/www/modules/browsepastes/Flask_browsepastes.py +++ b/var/www/modules/browsepastes/Flask_browsepastes.py @@ -23,6 +23,7 @@ max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal r_serv_metadata = Flask_config.r_serv_metadata bootstrap_label = Flask_config.bootstrap_label +PASTES_FOLDER = Flask_config.PASTES_FOLDER #init all lvlDB servers curYear = datetime.now().year @@ -62,6 +63,7 @@ def event_stream_getImportantPasteByModule(module_name, year): paste_tags = [] for path in all_pastes_list: + path = path.replace(PASTES_FOLDER, '', 1) index += 1 paste = Paste.Paste(path) content = paste.get_p_content() @@ -125,6 +127,7 @@ def importantPasteByModule(): allPastes = getPastebyType(r_serv_db[currentSelectYear], module_name) for path in allPastes[0:10]: + path = path.replace(PASTES_FOLDER, '', 1) all_path.append(path) paste = Paste.Paste(path) content = paste.get_p_content() diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 47ea56f1..ee5d7ee1 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -22,7 +22,6 @@ baseUrl = Flask_config.baseUrl r_serv_onion = Flask_config.r_serv_onion r_serv_metadata = Flask_config.r_serv_metadata bootstrap_label = Flask_config.bootstrap_label -PASTES_FOLDER = Flask_config.PASTES_FOLDER hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') @@ -124,15 +123,13 @@ def onion_domain(): origin_paste_name = h.get_origin_paste_name() origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste))) paste_tags = [] - path_name = [] for path in l_pastes: - path_name.append(path.replace(PASTES_FOLDER+'/', '')) p_tags = r_serv_metadata.smembers('tag:'+path) paste_tags.append(unpack_paste_tags(p_tags)) return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen, l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label, - path_name=path_name, origin_paste_tags=origin_paste_tags, status=status, + origin_paste_tags=origin_paste_tags, status=status, origin_paste=origin_paste, origin_paste_name=origin_paste_name, domain_tags=domain_tags, screenshot=screenshot) @@ -143,7 +140,6 @@ def onion_son(): h = HiddenServices(onion_domain, 'onion') l_pastes = h.get_last_crawled_pastes() l_son = h.get_domain_son(l_pastes) - print(l_son) return 'l_son' # ============= JSON ============== diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index dd6b2056..49f9a5f3 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -105,7 +105,7 @@ {% for path in l_pastes %} - {{ path_name[loop.index0] }} + {{ path }}
{% for tag in paste_tags[loop.index0] %} diff --git a/var/www/modules/search/Flask_search.py b/var/www/modules/search/Flask_search.py index 7f6cd724..7405b1e9 100644 --- a/var/www/modules/search/Flask_search.py +++ b/var/www/modules/search/Flask_search.py @@ -29,7 +29,7 @@ r_serv_metadata = Flask_config.r_serv_metadata max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal bootstrap_label = Flask_config.bootstrap_label - +PASTES_FOLDER = Flask_config.PASTES_FOLDER baseindexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) indexRegister_path = os.path.join(os.environ['AIL_HOME'], @@ -133,8 +133,8 @@ def search(): query = QueryParser("content", ix.schema).parse("".join(q)) results = searcher.search_page(query, 1, pagelen=num_elem_to_get) for x in results: - r.append(x.items()[0][1]) - path = x.items()[0][1] + r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1)) + path = x.items()[0][1].replace(PASTES_FOLDER, '', 1) paste = Paste.Paste(path) content = paste.get_p_content() content_range = max_preview_char if len(content)>max_preview_char else len(content)-1 @@ -208,6 +208,7 @@ def get_more_search_result(): results = searcher.search_page(query, page_offset, num_elem_to_get) for x in results: path = x.items()[0][1] + path = path.replace(PASTES_FOLDER, '', 1) path_array.append(path) paste = Paste.Paste(path) content = paste.get_p_content() diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index c24e3335..970102ca 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -41,12 +41,15 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa # ============ FUNCTIONS ============ def showpaste(content_range, requested_path): - if PASTES_FOLDER in requested_path: + if PASTES_FOLDER not in requested_path: # remove full path + requested_path_full = os.path.join(PASTES_FOLDER, requested_path) + else: + requested_path_full = requested_path requested_path = requested_path.replace(PASTES_FOLDER, '', 1) - #requested_path = os.path.join(PASTES_FOLDER, requested_path) + # escape directory transversal - if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER: + if os.path.commonprefix((requested_path_full,PASTES_FOLDER)) != PASTES_FOLDER: return 'path transversal detected' vt_enabled = Flask_config.vt_enabled @@ -122,12 +125,6 @@ def showpaste(content_range, requested_path): active_taxonomies = r_serv_tags.smembers('active_taxonomies') l_tags = r_serv_metadata.smembers('tag:'+requested_path) - print(l_tags) - if relative_path is not None: - print('union') - print(relative_path) - print(r_serv_metadata.smembers('tag:'+relative_path)) - l_tags = l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) ) #active galaxies active_galaxies = r_serv_tags.smembers('active_galaxies') @@ -280,6 +277,7 @@ def send_file_to_vt(): paste = request.form['paste'] hash = request.form['hash'] + ## TODO: # FIXME: path transversal b64_full_path = os.path.join(os.environ['AIL_HOME'], b64_path) b64_content = '' with open(b64_full_path, 'rb') as f:
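Taken together, the four patches converge on one convention: modules queue and store paths relative to the pastes directory, re-anchor them on PASTES_FOLDER only when touching the filesystem, and migrate stale absolute-path keys on the fly. The sketch below illustrates that convention; it is a minimal standalone example, and every name in it (normalize, full_path, is_safe, fix_dup_key, the store parameter) is an assumption for illustration, not an identifier from the AIL codebase:

#!/usr/bin/env python3
# Illustrative sketch of the relative-path convention these patches introduce.
# normalize/full_path/is_safe/fix_dup_key are assumed helper names; only the
# underlying idioms (replace once, join once, commonprefix guard, lazy key
# migration) come from the patches themselves.

import os

# PATCH 4 appends '/' to the folder so that replace() also strips the separator.
PASTES_FOLDER = os.path.join(os.environ.get('AIL_HOME', os.getcwd()), 'PASTES') + '/'

def normalize(path):
    # Queue/storage form: strip the absolute prefix once, as Mixer.py does.
    # Idempotent: an already-relative path passes through unchanged.
    return path.replace(PASTES_FOLDER, '', 1)

def full_path(rel_path):
    # Filesystem form: re-anchor under the pastes directory, as Global.py does.
    return os.path.join(PASTES_FOLDER, rel_path)

def is_safe(rel_path):
    # Traversal guard in the spirit of Flask_showpaste.py: the resolved
    # absolute path must still live under PASTES_FOLDER.
    real = os.path.realpath(full_path(rel_path))
    return os.path.commonprefix((real, PASTES_FOLDER)) == PASTES_FOLDER

def fix_dup_key(store, abs_path):
    # Lazy on-the-fly migration, like Paste._get_p_duplicate() in PATCH 4:
    # move members of an old absolute-path 'dup:' set onto the relative-path
    # key, then read the merged set back. store is any redis-py-like client.
    rel_path = normalize(abs_path)
    for member in store.smembers('dup:' + abs_path):
        store.srem('dup:' + abs_path, member)
        store.sadd('dup:' + rel_path, normalize(member))
    return store.smembers('dup:' + rel_path)

Because normalize() is idempotent for already-relative paths, consumers can accept either form during the transition, which is what lets the fixed-on-the-fly migration of old keys coexist with writers that already emit relative paths.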