diff --git a/OVERVIEW.md b/OVERVIEW.md index 7524bea7..dc2ce694 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -143,12 +143,30 @@ ARDB_DB * DB 3 - Trending * DB 4 - Sentiment + ----------------------------------------- SENTIMENT ------------------------------------ + + SET - 'Provider_set' Provider + + KEY - 'UniqID' INT + + SET - provider_timestamp UniqID + + SET - UniqID avg_score + * DB 5 - TermCred * DB 6 - Tags -* DB 7 - Metadata -* DB 8 - Statistics + ---------------------------------------------------------------------------------------- + + SET - tag paste* + + ---------------------------------------------------------------------------------------- +* DB 7 - Metadata: + ---------------------------------------------------------------------------------------- + + SET - 'tag:' + paste tag + + ---------------------------------------------------------------------------------------- ----------------------------------------- BASE64 ---------------------------------------- HSET - 'metadata_hash:'+hash 'saved_path' saved_path @@ -185,3 +203,9 @@ ARDB_DB GET - 'base64_decoded:'+date nd_decoded GET - 'binary_decoded:'+date nd_decoded + +* DB 8 - Statistics +* DB 9 - Onion: + ---------------------------------------------------------------------------------------- + + diff --git a/bin/ApiKey.py b/bin/ApiKey.py index 3e0bad42..07325885 100755 --- a/bin/ApiKey.py +++ b/bin/ApiKey.py @@ -40,7 +40,7 @@ def search_api_key(message): print('found google api key') print(to_print) publisher.warning('{}Checked {} found Google API Key;{}'.format( - to_print, len(google_api_key), paste.p_path)) + to_print, len(google_api_key), paste.p_rel_path)) msg = 'infoleak:automatic-detection="google-api-key";{}'.format(filename) p.populate_set_out(msg, 'Tags') @@ -49,7 +49,7 @@ def search_api_key(message): print(to_print) total = len(aws_access_key) + len(aws_secret_key) publisher.warning('{}Checked {} found AWS Key;{}'.format( - to_print, total, paste.p_path)) + to_print, total, paste.p_rel_path)) msg = 'infoleak:automatic-detection="aws-key";{}'.format(filename) p.populate_set_out(msg, 'Tags') diff --git a/bin/Attributes.py b/bin/Attributes.py index a29f34b3..74357065 100755 --- a/bin/Attributes.py +++ b/bin/Attributes.py @@ -43,8 +43,8 @@ if __name__ == "__main__": # FIXME why not all saving everything there. PST.save_all_attributes_redis() # FIXME Not used.
- PST.store.sadd("Pastes_Objects", PST.p_path) + PST.store.sadd("Pastes_Objects", PST.p_rel_path) except IOError: - print("CRC Checksum Failed on :", PST.p_path) + print("CRC Checksum Failed on :", PST.p_rel_path) publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format( PST.p_source, PST.p_date, PST.p_name)) diff --git a/bin/BankAccount.py b/bin/BankAccount.py index 06e86d06..cd58e3c3 100755 --- a/bin/BankAccount.py +++ b/bin/BankAccount.py @@ -67,7 +67,7 @@ def check_all_iban(l_iban, paste, filename): if(nb_valid_iban > 0): to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) publisher.warning('{}Checked found {} IBAN;{}'.format( - to_print, nb_valid_iban, paste.p_path)) + to_print, nb_valid_iban, paste.p_rel_path)) msg = 'infoleak:automatic-detection="iban";{}'.format(filename) p.populate_set_out(msg, 'Tags') @@ -113,7 +113,7 @@ if __name__ == "__main__": try: l_iban = iban_regex.findall(content) except TimeoutException: - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/Bitcoin.py b/bin/Bitcoin.py index acaaa8bd..ff76c5f0 100755 --- a/bin/Bitcoin.py +++ b/bin/Bitcoin.py @@ -73,7 +73,7 @@ def search_key(content, message, paste): to_print = 'Bitcoin;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) publisher.warning('{}Detected {} Bitcoin private key;{}'.format( - to_print, len(bitcoin_private_key),paste.p_path)) + to_print, len(bitcoin_private_key),paste.p_rel_path)) if __name__ == "__main__": publisher.port = 6380 diff --git a/bin/Categ.py b/bin/Categ.py index cf78f90f..3ebc42ea 100755 --- a/bin/Categ.py +++ b/bin/Categ.py @@ -89,16 +89,10 @@ if __name__ == "__main__": paste = Paste.Paste(filename) content = paste.get_p_content() - #print('-----------------------------------------------------') - #print(filename) - #print(content) - #print('-----------------------------------------------------') - for categ, pattern in tmp_dict.items(): found = set(re.findall(pattern, content)) if len(found) >= matchingThreshold: - msg = '{} {}'.format(paste.p_path, len(found)) - #msg = " ".join( [paste.p_path, bytes(len(found))] ) + msg = '{} {}'.format(paste.p_rel_path, len(found)) print(msg, categ) p.populate_set_out(msg, categ) @@ -106,4 +100,4 @@ if __name__ == "__main__": publisher.info( 'Categ;{};{};{};Detected {} as {};{}'.format( paste.p_source, paste.p_date, paste.p_name, - len(found), categ, paste.p_path)) + len(found), categ, paste.p_rel_path)) diff --git a/bin/Credential.py b/bin/Credential.py index 77dc693d..8da84883 100755 --- a/bin/Credential.py +++ b/bin/Credential.py @@ -97,7 +97,7 @@ if __name__ == "__main__": if sites_set: message += ' Related websites: {}'.format( (', '.join(sites_set)) ) - to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path) + to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path) print('\n '.join(creds)) diff --git a/bin/CreditCards.py b/bin/CreditCards.py index 492ec372..0c6bdf3f 100755 --- a/bin/CreditCards.py +++ b/bin/CreditCards.py @@ -77,16 +77,16 @@ if __name__ == "__main__": paste.p_source, paste.p_date, paste.p_name) if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid number(s);{}'.format( - to_print, len(creditcard_set), paste.p_path)) + to_print, len(creditcard_set), paste.p_rel_path)) print('{}Checked {} valid number(s);{}'.format( - to_print, 
len(creditcard_set), paste.p_path)) + to_print, len(creditcard_set), paste.p_rel_path)) #Send to duplicate p.populate_set_out(filename, 'Duplicate') msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename) p.populate_set_out(msg, 'Tags') else: - publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path)) + publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_rel_path)) else: publisher.debug("Script creditcard is idling 1m") time.sleep(10) diff --git a/bin/Decoder.py b/bin/Decoder.py index e6252834..76228dfb 100755 --- a/bin/Decoder.py +++ b/bin/Decoder.py @@ -226,7 +226,7 @@ if __name__ == '__main__': except TimeoutException: encoded_list = [] p.incr_module_timeout_statistic() # add encoder type - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py index aed87a55..1ae5ba13 100755 --- a/bin/DomClassifier.py +++ b/bin/DomClassifier.py @@ -54,14 +54,14 @@ def main(): if localizeddomains: print(localizeddomains) publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( - PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path)) + PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_rel_path)) localizeddomains = c.localizedomain(cc=cc) if localizeddomains: print(localizeddomains) publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format( - PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path)) + PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_rel_path)) except IOError: - print("CRC Checksum Failed on :", PST.p_path) + print("CRC Checksum Failed on :", PST.p_rel_path) publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format( PST.p_source, PST.p_date, PST.p_name)) diff --git a/bin/Duplicates.py b/bin/Duplicates.py index 0c24bec1..611368a1 100755 --- a/bin/Duplicates.py +++ b/bin/Duplicates.py @@ -142,17 +142,17 @@ if __name__ == "__main__": paste_date = paste_date paste_date = paste_date if paste_date != None else "No date available" if paste_path != None: - if paste_path != PST.p_path: + if paste_path != PST.p_rel_path: hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date) - print('['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)) + print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + ' and ' + str(paste_path) + ' percentage: ' + str(percent)) except Exception: print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash) # Add paste in DB after checking to prevent its analysis twice # hash_type_i -> index_i AND index_i -> PST.PATH - r_serv1.set(index, PST.p_path) + r_serv1.set(index, PST.p_rel_path) r_serv1.set(index+'_date', PST._get_p_date()) r_serv1.sadd("INDEX", index) # Adding hashes in Redis @@ -180,7 +180,7 @@ if __name__ == "__main__": PST.__setattr__("p_duplicate", dupl) PST.save_attribute_duplicate(dupl) PST.save_others_pastes_attribute_duplicate(dupl) - publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path)) + publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path)) print('{}Detected {}'.format(to_print, len(dupl))) print('') @@ -191,5 +191,5 @@ if __name__ == "__main__": except IOError: to_print = 'Duplicate;{};{};{};'.format( PST.p_source, PST.p_date, PST.p_name) - print("CRC Checksum Failed on :", PST.p_path) + print("CRC Checksum 
Failed on :", PST.p_rel_path) publisher.error('{}CRC Checksum Failed'.format(to_print)) diff --git a/bin/Global.py b/bin/Global.py index 2e4595eb..6f26ad0e 100755 --- a/bin/Global.py +++ b/bin/Global.py @@ -45,6 +45,8 @@ if __name__ == '__main__': p = Process(config_section) + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + # LOGGING # publisher.info("Feed Script started to receive & publish.") @@ -78,8 +80,7 @@ if __name__ == '__main__': paste = rreplace(paste, file_name_paste, new_file_name_paste, 1) # Creating the full filepath - filename = os.path.join(os.environ['AIL_HOME'], - p.config.get("Directories", "pastes"), paste) + filename = os.path.join(PASTES_FOLDER, paste) dirname = os.path.dirname(filename) if not os.path.exists(dirname): @@ -102,6 +103,7 @@ if __name__ == '__main__': print(filename) print(type) print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') - ''' - p.populate_set_out(filename) + ''' + + p.populate_set_out(paste) processed_paste+=1 diff --git a/bin/LibInjection.py b/bin/LibInjection.py index 85f1aa5c..4b851f21 100755 --- a/bin/LibInjection.py +++ b/bin/LibInjection.py @@ -47,7 +47,7 @@ def analyse(url, path): paste = Paste.Paste(path) print("Detected (libinjection) SQL in URL: ") print(urllib.request.unquote(url)) - to_print = 'LibInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_path) + to_print = 'LibInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_rel_path) publisher.warning(to_print) #Send to duplicate p.populate_set_out(path, 'Duplicate') diff --git a/bin/Lines.py b/bin/Lines.py index 8c9f6827..e4187dc7 100755 --- a/bin/Lines.py +++ b/bin/Lines.py @@ -75,10 +75,11 @@ if __name__ == '__main__': PST.save_attribute_redis("p_max_length_line", lines_infos[1]) # FIXME Not used. 
- PST.store.sadd("Pastes_Objects", PST.p_path) + PST.store.sadd("Pastes_Objects", PST.p_rel_path) + print(PST.p_rel_path) if lines_infos[1] < args.max: - p.populate_set_out( PST.p_path , 'LinesShort') + p.populate_set_out( PST.p_rel_path , 'LinesShort') else: - p.populate_set_out( PST.p_path , 'LinesLong') + p.populate_set_out( PST.p_rel_path , 'LinesLong') except IOError: - print("CRC Checksum Error on : ", PST.p_path) + print("CRC Checksum Error on : ", PST.p_rel_path) diff --git a/bin/Mail.py b/bin/Mail.py index dfd75719..b8311d3c 100755 --- a/bin/Mail.py +++ b/bin/Mail.py @@ -78,7 +78,7 @@ if __name__ == "__main__": to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\ format(PST.p_source, PST.p_date, PST.p_name, - MX_values[0], PST.p_path) + MX_values[0], PST.p_rel_path) if MX_values[0] > is_critical: publisher.warning(to_print) #Send to duplicate diff --git a/bin/Mixer.py b/bin/Mixer.py index e7f9e6de..cbb39676 100755 --- a/bin/Mixer.py +++ b/bin/Mixer.py @@ -82,6 +82,8 @@ if __name__ == '__main__': ttl_key = cfg.getint("Module_Mixer", "ttl_duplicate") default_unnamed_feed_name = cfg.get("Module_Mixer", "default_unnamed_feed_name") + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + '/' + # STATS # processed_paste = 0 processed_paste_per_feeder = {} @@ -104,12 +106,14 @@ if __name__ == '__main__': feeder_name.replace(" ","") if 'import_dir' in feeder_name: feeder_name = feeder_name.split('/')[1] - paste_name = complete_paste except ValueError as e: feeder_name = default_unnamed_feed_name paste_name = complete_paste + # remove absolute path + paste_name = paste_name.replace(PASTES_FOLDER, '', 1) + # Processed paste processed_paste += 1 try: @@ -119,6 +123,7 @@ if __name__ == '__main__': processed_paste_per_feeder[feeder_name] = 1 duplicated_paste_per_feeder[feeder_name] = 0 + relay_message = "{0} {1}".format(paste_name, gzip64encoded) #relay_message = b" ".join( [paste_name, gzip64encoded] ) diff --git a/bin/Onion.py b/bin/Onion.py index dc9bc99f..30b62ba6 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -167,7 +167,7 @@ if __name__ == "__main__": except TimeoutException: encoded_list = [] p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(PST.p_path)) + print ("{0} processing timeout".format(PST.p_rel_path)) continue signal.alarm(0) @@ -185,7 +185,7 @@ if __name__ == "__main__": r_onion.sadd('i2p_domain', domain) r_onion.sadd('i2p_link', url) r_onion.sadd('i2p_domain_crawler_queue', domain) - msg = '{};{}'.format(url,PST.p_path) + msg = '{};{}'.format(url,PST.p_rel_path) r_onion.sadd('i2p_crawler_queue', msg) ''' @@ -200,10 +200,10 @@ if __name__ == "__main__": if not activate_crawler: publisher.warning('{}Detected {} .onion(s);{}'.format( - to_print, len(domains_list),PST.p_path)) + to_print, len(domains_list),PST.p_rel_path)) else: publisher.info('{}Detected {} .onion(s);{}'.format( - to_print, len(domains_list),PST.p_path)) + to_print, len(domains_list),PST.p_rel_path)) now = datetime.datetime.now() path = os.path.join('onions', str(now.year).zfill(4), str(now.month).zfill(2), @@ -232,7 +232,7 @@ if __name__ == "__main__": if not r_onion.sismember('onion_domain_crawler_queue', domain): print('send to onion crawler') r_onion.sadd('onion_domain_crawler_queue', domain) - msg = '{};{}'.format(url,PST.p_path) + msg = '{};{}'.format(url,PST.p_rel_path) if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'): r_onion.sadd('onion_crawler_priority_queue', msg) print('send to priority queue') @@ -242,13 
+242,13 @@ if __name__ == "__main__": else: for url in fetch(p, r_cache, urls, domains_list, path): - publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) + publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path)) # TAG Item - msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) + msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path) p.populate_set_out(msg, 'Tags') else: - publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) + publisher.info('{}Onion related;{}'.format(to_print, PST.p_rel_path)) prec_filename = filename else: diff --git a/bin/RegexForTermsFrequency.py b/bin/RegexForTermsFrequency.py index 0db7f2ee..cd8102c1 100755 --- a/bin/RegexForTermsFrequency.py +++ b/bin/RegexForTermsFrequency.py @@ -108,7 +108,7 @@ if __name__ == "__main__": try: matched = compiled_regex.search(content) except TimeoutException: - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/Release.py b/bin/Release.py index 43c84b04..d2f18441 100755 --- a/bin/Release.py +++ b/bin/Release.py @@ -54,7 +54,7 @@ if __name__ == "__main__": if len(releases) == 0: continue - to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path) + to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_rel_path) print(to_print) if len(releases) > 30: publisher.warning(to_print) @@ -63,7 +63,7 @@ if __name__ == "__main__": except TimeoutException: p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/SQLInjectionDetection.py b/bin/SQLInjectionDetection.py index 6c8b70f7..a5595dc7 100755 --- a/bin/SQLInjectionDetection.py +++ b/bin/SQLInjectionDetection.py @@ -78,7 +78,7 @@ def analyse(url, path): if (result_path > 1) or (result_query > 1): print("Detected SQL in URL: ") print(urllib.request.unquote(url)) - to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_path) + to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Detected SQL in URL", paste.p_rel_path) publisher.warning(to_print) #Send to duplicate p.populate_set_out(path, 'Duplicate') @@ -95,7 +95,7 @@ def analyse(url, path): else: print("Potential SQL injection:") print(urllib.request.unquote(url)) - to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Potential SQL injection", paste.p_path) + to_print = 'SQLInjection;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, "Potential SQL injection", paste.p_rel_path) publisher.info(to_print) diff --git a/bin/SentimentAnalysis.py b/bin/SentimentAnalysis.py index 8442befa..1305fb4f 100755 --- a/bin/SentimentAnalysis.py +++ b/bin/SentimentAnalysis.py @@ -45,6 +45,7 @@ cfg = configparser.ConfigParser() cfg.read(configfile) sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file") +#time_clean_sentiment_db = 60*60 def Analyse(message, server): path = message @@ -157,9 +158,16 @@ if __name__ == '__main__': db=p.config.get("ARDB_Sentiment", "db"), decode_responses=True) + time1 = time.time() + while True: message = p.get_from_set() if message is None: + #if int(time.time() - time1) > time_clean_sentiment_db: 
+ # clean_db() + # time1 = time.time() + # continue + #else: publisher.debug("{} queue is empty, waiting".format(config_section)) time.sleep(1) continue diff --git a/bin/Tokenize.py b/bin/Tokenize.py index 698b4fbc..4e13b9ff 100755 --- a/bin/Tokenize.py +++ b/bin/Tokenize.py @@ -57,11 +57,11 @@ if __name__ == "__main__": try: for word, score in paste._get_top_words().items(): if len(word) >= 4: - msg = '{} {} {}'.format(paste.p_path, word, score) + msg = '{} {} {}'.format(paste.p_rel_path, word, score) p.populate_set_out(msg) except TimeoutException: p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(paste.p_path)) + print ("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) diff --git a/bin/Web.py b/bin/Web.py index 3d53e306..7cc96822 100755 --- a/bin/Web.py +++ b/bin/Web.py @@ -153,7 +153,7 @@ if __name__ == "__main__": pprint.pprint(A_values) publisher.info('Url;{};{};{};Checked {} URL;{}'.format( - PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_path)) + PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path)) prec_filename = filename else: diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index 72cca451..2361f60f 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -158,11 +158,7 @@ class HiddenServices(object): if father is None: return [] l_crawled_pastes = [] - paste_parent = father.replace(self.paste_directory+'/', '') - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) - ## TODO: # FIXME: remove me - paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) - paste_childrens = paste_childrens | paste_children + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) for children in paste_childrens: if self.domain in children: l_crawled_pastes.append(children) @@ -198,14 +194,9 @@ class HiddenServices(object): set_domain = set() for paste in l_paste: - paste_full = paste.replace(self.paste_directory+'/', '') - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_full)) - ## TODO: # FIXME: remove me - paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(paste)) - paste_childrens = paste_childrens | paste_children + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste)) for children in paste_childrens: if not self.domain in children: - print(children) set_domain.add((children.split('.onion')[0]+'.onion').split('/')[-1]) return set_domain @@ -215,11 +206,7 @@ class HiddenServices(object): if father is None: return [] l_crawled_pastes = [] - paste_parent = father.replace(self.paste_directory+'/', '') - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) - ## TODO: # FIXME: remove me - paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) - paste_childrens = paste_childrens | paste_children + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) for children in paste_childrens: if not self.domain in children: l_crawled_pastes.append(children) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 0ab0286c..f8be2f9b 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -82,14 +82,14 @@ class Paste(object): db=cfg.getint("ARDB_Metadata", "db"), decode_responses=True) - PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) - if PASTES_FOLDER 
not in p_path: + self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + if self.PASTES_FOLDER not in p_path: self.p_rel_path = p_path - p_path = os.path.join(PASTES_FOLDER, p_path) + self.p_path = os.path.join(self.PASTES_FOLDER, p_path) else: - self.p_rel_path = None + self.p_path = p_path + self.p_rel_path = p_path.replace(self.PASTES_FOLDER+'/', '', 1) - self.p_path = p_path self.p_name = os.path.basename(self.p_path) self.p_size = round(os.path.getsize(self.p_path)/1024.0, 2) self.p_mime = magic.from_buffer("test", mime=True) @@ -101,7 +101,7 @@ class Paste(object): var = self.p_path.split('/') self.p_date = Date(var[-4], var[-3], var[-2]) - self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name) + self.p_date_path = os.path.join(var[-4], var[-3], var[-2], self.p_name) self.p_source = var[-5] self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0]) @@ -296,9 +296,13 @@ class Paste(object): return False, var def _get_p_duplicate(self): - self.p_duplicate = self.store_metadata.smembers('dup:'+self.p_path) - if self.p_rel_path is not None: - self.p_duplicate.union( self.store_metadata.smembers('dup:'+self.p_rel_path) ) + p_duplicate = self.store_metadata.smembers('dup:'+self.p_path) + # remove absolute path #fix-db + if p_duplicate: + for duplicate_string in p_duplicate: + self.store_metadata.srem('dup:'+self.p_path, duplicate_string) + self.store_metadata.sadd('dup:'+self.p_rel_path, duplicate_string.replace(self.PASTES_FOLDER+'/', '', 1)) + self.p_duplicate = self.store_metadata.smembers('dup:'+self.p_rel_path) if self.p_duplicate is not None: return list(self.p_duplicate) else: @@ -318,6 +322,9 @@ class Paste(object): def get_p_rel_path(self): return self.p_rel_path + def get_p_date_path(self): + return self.p_date_path + def save_all_attributes_redis(self, key=None): """ Saving all the attributes in a "Redis-like" Database (Redis, LevelDB) diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index f516a60b..0f7b0e3f 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -162,8 +162,7 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted') -PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) -PASTES_FOLDERS = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/' +PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/' SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs")) diff --git a/var/www/modules/hashDecoded/Flask_hashDecoded.py b/var/www/modules/hashDecoded/Flask_hashDecoded.py index 8f8e7279..705fc328 100644 --- a/var/www/modules/hashDecoded/Flask_hashDecoded.py +++ b/var/www/modules/hashDecoded/Flask_hashDecoded.py @@ -25,7 +25,7 @@ baseUrl = Flask_config.baseUrl r_serv_metadata = Flask_config.r_serv_metadata vt_enabled = Flask_config.vt_enabled vt_auth = Flask_config.vt_auth -PASTES_FOLDER = Flask_config.PASTES_FOLDERS +PASTES_FOLDER = Flask_config.PASTES_FOLDER hashDecoded = Blueprint('hashDecoded', __name__, template_folder='templates') diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 78406368..bb8ee2b3 100644 --- 
a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -26,7 +26,6 @@ r_cache = Flask_config.r_cache r_serv_onion = Flask_config.r_serv_onion r_serv_metadata = Flask_config.r_serv_metadata bootstrap_label = Flask_config.bootstrap_label -PASTES_FOLDER = Flask_config.PASTES_FOLDER hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') @@ -579,16 +578,14 @@ def show_domain(): origin_paste_name = h.get_origin_paste_name() origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste))) paste_tags = [] - path_name = [] for path in l_pastes: - path_name.append(path.replace(PASTES_FOLDER+'/', '')) p_tags = r_serv_metadata.smembers('tag:'+path) paste_tags.append(unpack_paste_tags(p_tags)) return render_template("showDomain.html", domain=domain, last_check=last_check, first_seen=first_seen, l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label, dict_links=dict_links, - path_name=path_name, origin_paste_tags=origin_paste_tags, status=status, + origin_paste_tags=origin_paste_tags, status=status, origin_paste=origin_paste, origin_paste_name=origin_paste_name, domain_tags=domain_tags, screenshot=screenshot) @@ -599,7 +596,6 @@ def onion_son(): h = HiddenServices(onion_domain, 'onion') l_pastes = h.get_last_crawled_pastes() l_son = h.get_domain_son(l_pastes) - print(l_son) return 'l_son' # ============= JSON ============== diff --git a/var/www/modules/search/Flask_search.py b/var/www/modules/search/Flask_search.py index 7f6cd724..7405b1e9 100644 --- a/var/www/modules/search/Flask_search.py +++ b/var/www/modules/search/Flask_search.py @@ -29,7 +29,7 @@ r_serv_metadata = Flask_config.r_serv_metadata max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal bootstrap_label = Flask_config.bootstrap_label - +PASTES_FOLDER = Flask_config.PASTES_FOLDER baseindexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) indexRegister_path = os.path.join(os.environ['AIL_HOME'], @@ -133,8 +133,8 @@ def search(): query = QueryParser("content", ix.schema).parse("".join(q)) results = searcher.search_page(query, 1, pagelen=num_elem_to_get) for x in results: - r.append(x.items()[0][1]) - path = x.items()[0][1] + r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1)) + path = x.items()[0][1].replace(PASTES_FOLDER, '', 1) paste = Paste.Paste(path) content = paste.get_p_content() content_range = max_preview_char if len(content)>max_preview_char else len(content)-1 @@ -208,6 +208,7 @@ def get_more_search_result(): results = searcher.search_page(query, page_offset, num_elem_to_get) for x in results: path = x.items()[0][1] + path = path.replace(PASTES_FOLDER, '', 1) path_array.append(path) paste = Paste.Paste(path) content = paste.get_p_content() diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index a457615e..2269dffc 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -41,14 +41,15 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') # ============ FUNCTIONS ============ def showpaste(content_range, requested_path): - relative_path = None if PASTES_FOLDER not in requested_path: - relative_path = requested_path - requested_path = os.path.join(PASTES_FOLDER, requested_path) - # remove old full path - #requested_path = requested_path.replace(PASTES_FOLDER, '') + # build the full path + 
requested_path_full = os.path.join(PASTES_FOLDER, requested_path) + else: + requested_path_full = requested_path + requested_path = requested_path.replace(PASTES_FOLDER, '', 1) + # escape directory transversal - if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER: + if os.path.commonprefix((os.path.realpath(requested_path_full),PASTES_FOLDER)) != PASTES_FOLDER: return 'path transversal detected' vt_enabled = Flask_config.vt_enabled @@ -124,8 +125,6 @@ def showpaste(content_range, requested_path): active_taxonomies = r_serv_tags.smembers('active_taxonomies') l_tags = r_serv_metadata.smembers('tag:'+requested_path) - if relative_path is not None: - l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) ) #active galaxies active_galaxies = r_serv_tags.smembers('active_galaxies') @@ -190,7 +189,7 @@ def showpaste(content_range, requested_path): crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') - crawler_metadata['screenshot'] = paste.get_p_rel_path() + crawler_metadata['screenshot'] = paste.get_p_date_path() else: crawler_metadata['get_metadata'] = False @@ -406,6 +405,7 @@ def send_file_to_vt(): paste = request.form['paste'] hash = request.form['hash'] + ## TODO: # FIXME: path traversal b64_full_path = os.path.join(os.environ['AIL_HOME'], b64_path) b64_content = '' with open(b64_full_path, 'rb') as f:
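
The common thread of this patch: modules now exchange paste paths relative to the pastes folder, and Paste.__init__ accepts either form, deriving p_path and p_rel_path from whichever it is given. A minimal standalone sketch of that normalization, assuming the same config layout the patch uses (the helper names here are hypothetical; the real logic lives in Paste.py, Mixer.py and Global.py):

    import os

    # Assumed layout: the folder the patch reads from the config as
    # os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
    PASTES_FOLDER = os.path.join(os.environ.get('AIL_HOME', '/opt/AIL'), 'PASTES')

    def to_rel_path(p_path):
        # Strip the absolute prefix once, including its trailing '/',
        # the way Paste.__init__ computes p_rel_path.
        if p_path.startswith(PASTES_FOLDER):
            return p_path.replace(PASTES_FOLDER + '/', '', 1)
        return p_path

    def to_full_path(p_path):
        # Re-anchor a relative path under the pastes folder.
        if not p_path.startswith(PASTES_FOLDER):
            return os.path.join(PASTES_FOLDER, p_path)
        return p_path

Both helpers are idempotent, so a consumer can apply them defensively without knowing which form an upstream queue delivered.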
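The directory-traversal guard in Flask_showpaste.py (corrected above to join the folder and the requested path in that order, and to realpath the result before comparing) boils down to the following check; the function name is hypothetical:

    import os

    def is_safe_paste_path(requested_path, pastes_folder):
        # Resolve '..' components and symlinks first; without realpath a
        # crafted '../' path would still share the string prefix and slip
        # past the commonprefix test.
        full_path = os.path.realpath(os.path.join(pastes_folder, requested_path))
        return os.path.commonprefix((full_path, pastes_folder)) == pastes_folder

os.path.commonprefix is purely string-based, so pastes_folder should keep its trailing '/' (which Flask_config now appends); otherwise a sibling such as '/opt/AIL/PASTES_backup' would also pass a check against '/opt/AIL/PASTES'. os.path.commonpath is a stricter alternative.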
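Paste._get_p_duplicate (the '#fix-db' hunk) migrates legacy absolute-path 'dup:' keys lazily: each read moves the set's members under the relative key, stripping the folder prefix from the stored values. The same pattern as a standalone sketch, with the ARDB connection details assumed:

    import redis

    def migrate_dup_set(store_metadata, p_path, p_rel_path, pastes_folder):
        # Move members of the absolute-path set to its relative-path twin,
        # rewriting stored values the way Paste._get_p_duplicate does.
        for duplicate_string in store_metadata.smembers('dup:' + p_path):
            store_metadata.srem('dup:' + p_path, duplicate_string)
            store_metadata.sadd('dup:' + p_rel_path,
                                duplicate_string.replace(pastes_folder + '/', '', 1))
        return store_metadata.smembers('dup:' + p_rel_path)

    # usage sketch (host/port/db for ARDB_Metadata are assumptions):
    # store_metadata = redis.StrictRedis(host='localhost', port=6382, db=0,
    #                                    decode_responses=True)

Because the rewrite happens on the read path, the database converges to relative keys over time without a one-shot migration script.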