From c52caebe7c94ce492e0e3b420ad11fd9100ad90b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 2 May 2018 17:07:10 +0200 Subject: [PATCH] improve Duplicate + tlsh + add time out handler + debug + clean --- bin/ApiKey.py | 10 ------ bin/Base64.py | 32 +++++++++++++---- bin/Credential.py | 9 ----- bin/DomClassifier.py | 3 +- bin/Duplicates.py | 8 +++-- bin/Global.py | 12 +++---- bin/Keys.py | 10 ++++++ bin/LAUNCH.sh | 2 +- bin/Mail.py | 3 +- bin/ModuleStats.py | 1 + bin/ModulesInformationV2.py | 4 ++- bin/RegexForTermsFrequency.py | 3 -- bin/Release.py | 36 ++++++++++++++----- bin/packages/Paste.py | 21 +++++++++++ bin/packages/config.cfg.sample | 8 +++++ bin/packages/lib_refine.py | 13 ++++--- bin/packages/lib_words.py | 5 +-- var/www/modules/Flask_config.py | 1 - var/www/modules/showpaste/Flask_showpaste.py | 9 ++--- .../showpaste/templates/show_saved_paste.html | 7 ++-- 20 files changed, 133 insertions(+), 64 deletions(-) diff --git a/bin/ApiKey.py b/bin/ApiKey.py index ff05fb84..7c12751e 100755 --- a/bin/ApiKey.py +++ b/bin/ApiKey.py @@ -32,17 +32,8 @@ def search_api_key(message): aws_secret_key = regex_aws_secret_key.findall(content) google_api_key = regex_google_api_key.findall(content) - print(aws_access_key) - print(aws_secret_key) - print(google_api_key) - if(len(aws_access_key) > 0 or len(aws_secret_key) > 0 or len(google_api_key) > 0): - print('-------------------------------') - print(aws_access_key) - print(aws_secret_key) - print(google_api_key) - to_print = 'ApiKey;{};{};{};'.format( paste.p_source, paste.p_date, paste.p_name) if(len(google_api_key) > 0): @@ -99,5 +90,4 @@ if __name__ == "__main__": else: publisher.debug("Script ApiKey is Idling 10s") - #print('Sleeping') time.sleep(10) diff --git a/bin/Base64.py b/bin/Base64.py index 77cedced..7cfc98a7 100755 --- a/bin/Base64.py +++ b/bin/Base64.py @@ -20,6 +20,16 @@ from hashlib import sha1 import magic import json +import signal + +class TimeoutException(Exception): + pass + +def timeout_handler(signum, frame): + raise TimeoutException + +signal.signal(signal.SIGALRM, timeout_handler) + def search_base64(content, message): find = False @@ -88,6 +98,7 @@ if __name__ == '__main__': # Setup the I/O queues p = Process(config_section) + max_execution_time = p.config.getint("Base64", "max_execution_time") # Sent to the logging a description of the module publisher.info("Base64 started") @@ -105,14 +116,21 @@ if __name__ == '__main__': time.sleep(1) continue - # Do something with the message from the queue - filename = message paste = Paste.Paste(filename) - content = paste.get_p_content() - #print(filename) - search_base64(content,message) + signal.alarm(max_execution_time) + try: + # Do something with the message from the queue + #print(filename) + content = paste.get_p_content() + search_base64(content,message) - # (Optional) Send that thing to the next queue - #p.populate_set_out(something_has_been_done) + # (Optional) Send that thing to the next queue + #p.populate_set_out(something_has_been_done) + + except TimeoutException: + print ("{0} processing timeout".format(paste.p_path)) + continue + else: + signal.alarm(0) diff --git a/bin/Credential.py b/bin/Credential.py index b83b9086..776f75a8 100755 --- a/bin/Credential.py +++ b/bin/Credential.py @@ -74,18 +74,9 @@ if __name__ == "__main__": filepath, count = message.split(' ') - #if count < minTopPassList: - # Less than 5 matches from the top password list, false positive. 
- #print("false positive:", count) - #continue - paste = Paste.Paste(filepath) content = paste.get_p_content() creds = set(re.findall(regex_cred, content)) - print(len(creds)) - print(creds) - print(content) - print('-----') publisher.warning('to_print') diff --git a/bin/DomClassifier.py b/bin/DomClassifier.py index 45fbd486..53d4299d 100755 --- a/bin/DomClassifier.py +++ b/bin/DomClassifier.py @@ -24,10 +24,11 @@ def main(): config_section = 'DomClassifier' p = Process(config_section) + addr_dns = p.config.get("DomClassifier", "dns") publisher.info("""ZMQ DomainClassifier is Running""") - c = DomainClassifier.domainclassifier.Extract(rawtext="") + c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns]) cc = p.config.get("DomClassifier", "cc") cc_tld = p.config.get("DomClassifier", "cc_tld") diff --git a/bin/Duplicates.py b/bin/Duplicates.py index 10f9f4fc..5f7b3bb8 100755 --- a/bin/Duplicates.py +++ b/bin/Duplicates.py @@ -62,7 +62,7 @@ if __name__ == "__main__": while True: try: hash_dico = {} - dupl = [] + dupl = set() dico_range_list = [] x = time.time() @@ -124,6 +124,8 @@ if __name__ == "__main__": percent = 100-ssdeep.compare(dico_hash, paste_hash) else: percent = tlsh.diffxlen(dico_hash, paste_hash) + if percent > 100: + percent = 100 threshold_duplicate = threshold_set[hash_type] if percent < threshold_duplicate: @@ -163,14 +165,16 @@ if __name__ == "__main__": if len(hash_dico) != 0: # paste_tuple = (hash_type, date, paste_path, percent) for dico_hash, paste_tuple in hash_dico.items(): - dupl.append(paste_tuple) + dupl.add(paste_tuple) # Creating the object attribute and save it. to_print = 'Duplicate;{};{};{};'.format( PST.p_source, PST.p_date, PST.p_name) if dupl != []: + dupl = list(dupl) PST.__setattr__("p_duplicate", dupl) PST.save_attribute_redis("p_duplicate", dupl) + PST.save_others_pastes_attribute_duplicate("p_duplicate", dupl) publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path)) print('{}Detected {}'.format(to_print, len(dupl))) diff --git a/bin/Global.py b/bin/Global.py index e952713d..29893df8 100755 --- a/bin/Global.py +++ b/bin/Global.py @@ -29,8 +29,9 @@ from Helper import Process import magic import io -import gzip +#import gzip +''' def gunzip_bytes_obj(bytes_obj): in_ = io.BytesIO() in_.write(bytes_obj) @@ -38,7 +39,7 @@ def gunzip_bytes_obj(bytes_obj): with gzip.GzipFile(fileobj=in_, mode='rb') as fo: gunzipped_bytes_obj = fo.read() - return gunzipped_bytes_obj.decode() + return gunzipped_bytes_obj.decode()''' if __name__ == '__main__': publisher.port = 6380 @@ -80,7 +81,7 @@ if __name__ == '__main__': # Creating the full filepath filename = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"), paste) - + dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) @@ -89,7 +90,7 @@ if __name__ == '__main__': with open(filename, 'wb') as f: f.write(decoded) - try: + '''try: decoded2 = gunzip_bytes_obj(decoded) except: decoded2 ='' @@ -101,8 +102,7 @@ if __name__ == '__main__': print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') print(filename) print(type) - print(decoded2) print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') - + ''' p.populate_set_out(filename) processed_paste+=1 diff --git a/bin/Keys.py 
b/bin/Keys.py
index a8c368ca..565f874e 100755
--- a/bin/Keys.py
+++ b/bin/Keys.py
@@ -27,6 +27,7 @@ def search_key(paste):
     find = False
     if '-----BEGIN PGP MESSAGE-----' in content:
         publisher.warning('{} has a PGP enc message'.format(paste.p_name))
+
         find = True

     if '-----BEGIN CERTIFICATE-----' in content:
@@ -35,18 +36,27 @@

     if '-----BEGIN RSA PRIVATE KEY-----' in content:
         publisher.warning('{} has a RSA private key message'.format(paste.p_name))
+        print('rsa private key message found')
         find = True

     if '-----BEGIN PRIVATE KEY-----' in content:
         publisher.warning('{} has a private key message'.format(paste.p_name))
+        print('private key message found')
         find = True

     if '-----BEGIN ENCRYPTED PRIVATE KEY-----' in content:
         publisher.warning('{} has an encrypted private key message'.format(paste.p_name))
+        print('encrypted private key message found')
         find = True

     if '-----BEGIN OPENSSH PRIVATE KEY-----' in content:
         publisher.warning('{} has an openssh private key message'.format(paste.p_name))
+        print('openssh private key message found')
+        find = True
+
+    if '-----BEGIN OpenVPN Static key V1-----' in content:
+        publisher.warning('{} has an OpenVPN static key message'.format(paste.p_name))
+        print('OpenVPN Static key message found')
         find = True

     if '-----BEGIN DSA PRIVATE KEY-----' in content:
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index c8246555..bdcd87ab 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -130,7 +130,7 @@ function launching_scripts {
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Lines" bash -c 'python3 Lines.py; read x'
     sleep 0.1
-    #screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x'
+    screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x'
     sleep 0.1
     screen -S "Script_AIL" -X screen -t "Categ" bash -c 'python3 Categ.py; read x'
     sleep 0.1
diff --git a/bin/Mail.py b/bin/Mail.py
index b03061a4..847f2f9d 100755
--- a/bin/Mail.py
+++ b/bin/Mail.py
@@ -28,6 +28,7 @@ if __name__ == "__main__":
     config_section = 'Mail'

     p = Process(config_section)
+    addr_dns = p.config.get("Mail", "dns")

     # REDIS #
     r_serv2 = redis.StrictRedis(
@@ -56,7 +57,7 @@ if __name__ == "__main__":

             if prec_filename is None or filename != prec_filename:
                 PST = Paste.Paste(filename)
                 MX_values = lib_refine.checking_MX_record(
-                    r_serv2, PST.get_regex(email_regex))
+                    r_serv2, PST.get_regex(email_regex), addr_dns)

                 if MX_values[0] >= 1:
diff --git a/bin/ModuleStats.py b/bin/ModuleStats.py
index 9c25eb75..13847685 100755
--- a/bin/ModuleStats.py
+++ b/bin/ModuleStats.py
@@ -29,6 +29,7 @@ def get_date_range(num_day):


 def compute_most_posted(server, message):
+    print(message)
     module, num, keyword, paste_date = message.split(';')

     redis_progression_name_set = 'top_'+ module +'_set_' + paste_date
diff --git a/bin/ModulesInformationV2.py b/bin/ModulesInformationV2.py
index 5afe5687..05170a95 100755
--- a/bin/ModulesInformationV2.py
+++ b/bin/ModulesInformationV2.py
@@ -617,7 +617,9 @@ def fetchQueueData():
             for moduleNum in server.smembers(keySet):
                 moduleNum = moduleNum.decode('utf8')
                 value = ( server.get(key + str(moduleNum)) ).decode('utf8')
-                complete_paste_path = ( server.get(key + str(moduleNum) + "_PATH") ).decode('utf8')
+                complete_paste_path = ( server.get(key + str(moduleNum) + "_PATH") )
+                if(complete_paste_path is not None):
+                    complete_paste_path = complete_paste_path.decode('utf8')
                 COMPLETE_PASTE_PATH_PER_PID[moduleNum] = complete_paste_path

                 if value is not None:
diff --git a/bin/RegexForTermsFrequency.py 
b/bin/RegexForTermsFrequency.py index 88df0924..12758219 100755 --- a/bin/RegexForTermsFrequency.py +++ b/bin/RegexForTermsFrequency.py @@ -76,9 +76,6 @@ if __name__ == "__main__": dico_regex, dico_regexname_to_redis = refresh_dicos() print('dico got refreshed') - print(dico_regex) - print(dico_regexname_to_redis) - filename = message temp = filename.split('/') timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) diff --git a/bin/Release.py b/bin/Release.py index 6ba2b577..68496e91 100755 --- a/bin/Release.py +++ b/bin/Release.py @@ -6,6 +6,16 @@ from pubsublogger import publisher from Helper import Process import re +import signal + +class TimeoutException(Exception): + pass + +def timeout_handler(signum, frame): + raise TimeoutException + +signal.signal(signal.SIGALRM, timeout_handler) + ''' This module takes its input from the global module. It applies some regex and publish matched content @@ -16,6 +26,7 @@ if __name__ == "__main__": publisher.channel = "Script" config_section = "Release" p = Process(config_section) + max_execution_time = p.config.getint("Curve", "max_execution_time") publisher.info("Release scripts to find release names") movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+" @@ -35,13 +46,22 @@ if __name__ == "__main__": paste = Paste.Paste(filepath) content = paste.get_p_content() - releases = set(re.findall(regex, content)) - if len(releases) == 0: - continue - to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path) - print(to_print) - if len(releases) > 30: - publisher.warning(to_print) + signal.alarm(max_execution_time) + try: + releases = set(re.findall(regex, content)) + if len(releases) == 0: + continue + + to_print = 'Release;{};{};{};{} releases;{}'.format(paste.p_source, paste.p_date, paste.p_name, len(releases), paste.p_path) + print(to_print) + if len(releases) > 30: + publisher.warning(to_print) + else: + publisher.info(to_print) + + except TimeoutException: + print ("{0} processing timeout".format(paste.p_path)) + continue else: - publisher.info(to_print) + signal.alarm(0) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index e5e58370..87570aab 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -329,6 +329,27 @@ class Paste(object): else: self.store.hset(self.p_path, attr_name, json.dumps(value)) + def save_others_pastes_attribute_duplicate(self, attr_name, list_value): + """ + Save a new duplicate on others pastes + """ + for hash_type, path, percent, date in list_value: + print(hash_type, path, percent, date) + #get json + json_duplicate = self.store.hget(path, attr_name) + #json save on redis + if json_duplicate is not None: + list_duplicate = json.loads(json_duplicate.decode('utf8')) + # add new duplicate + list_duplicate.append([hash_type, self.p_path, percent, date]) + self.store.hset(path, attr_name, json.dumps(list_duplicate)) + + else: + # create the new list + list_duplicate = [[hash_type, self.p_path, percent, date]] + self.store.hset(path, attr_name, json.dumps(list_duplicate)) + + def _get_from_redis(self, r_serv): ans = {} for hash_name, the_hash in self.p_hash: diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 9373cd17..dc8364c9 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -54,6 +54,10 @@ criticalNumberToAlert=8 #Will be considered as false positive if less that X matches from the top password list minTopPassList=5 +[Base64] +path = Base64/ 
+max_execution_time = 60
+
 [Modules_Duplicates]
 #Number of month to look back
 maximum_month_range = 3
@@ -145,6 +149,10 @@ cc_critical = DE
 [DomClassifier]
 cc = DE
 cc_tld = r'\.de$'
+dns = 8.8.8.8
+
+[Mail]
+dns = 8.8.8.8

 # Indexer configuration
 [Indexer]
diff --git a/bin/packages/lib_refine.py b/bin/packages/lib_refine.py
index 6c0bcd9b..fe03e730 100644
--- a/bin/packages/lib_refine.py
+++ b/bin/packages/lib_refine.py
@@ -17,19 +17,18 @@ def is_luhn_valid(card_number):
     return (sum(r[0::2]) + sum(sum(divmod(d*2, 10)) for d in r[1::2])) % 10 == 0


-def checking_MX_record(r_serv, adress_set):
+def checking_MX_record(r_serv, adress_set, addr_dns):
     """Check if emails MX domains are responding.

     :param r_serv: -- Redis connexion database
     :param adress_set: -- (set) This is a set of emails adress
+    :param addr_dns: -- (str) This is a server dns address
     :return: (int) Number of adress with a responding and valid MX domains

     This function will split the email adress and try to resolve their
     domains names: on example@gmail.com it will try to resolve gmail.com

     """
-    print('mails:')
-    print(adress_set)

     #remove duplicate
     adress_set = list(set(adress_set))
@@ -40,7 +39,7 @@
     # Transforming the set into a string
     MXdomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", str(adress_set).lower())
     resolver = dns.resolver.Resolver()
-    resolver.nameservers = ['149.13.33.69']
+    resolver.nameservers = [addr_dns]
     resolver.timeout = 5
     resolver.lifetime = 2
     if MXdomains != []:
@@ -64,21 +63,27 @@

             except dns.resolver.NoNameservers:
                 publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.')
+                print('NoNameserver, No non-broken nameservers are available to answer the query.')

             except dns.resolver.NoAnswer:
                 publisher.debug('NoAnswer, The response did not contain an answer to the question.')
+                print('NoAnswer, The response did not contain an answer to the question.')

             except dns.name.EmptyLabel:
                 publisher.debug('SyntaxError: EmptyLabel')
+                print('SyntaxError: EmptyLabel')

             except dns.resolver.NXDOMAIN:
                 r_serv.setex(MXdomain[1:], 1, timedelta(days=1))
                 publisher.debug('The query name does not exist.')
+                print('The query name does not exist.')

             except dns.name.LabelTooLong:
                 publisher.debug('The Label is too long')
+                print('The Label is too long')

             except dns.resolver.Timeout:
+                print('timeout')
                 r_serv.setex(MXdomain[1:], 1, timedelta(days=1))

             except Exception as e:
diff --git a/bin/packages/lib_words.py b/bin/packages/lib_words.py
index 2101f77d..f58e85db 100644
--- a/bin/packages/lib_words.py
+++ b/bin/packages/lib_words.py
@@ -81,7 +81,7 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month
     to keep the timeline of the curve correct.
""" - threshold = 50 + threshold = 30 first_day = date(year, month, 1) last_day = date(year, month, calendar.monthrange(year, month)[1]) words = [] @@ -135,6 +135,7 @@ def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month): redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2) words = list(server.smembers(redis_set_name)) + words = [x.decode('utf-8') for x in words] headers = ['Date'] + words with open(csvfilename+'.csv', 'w') as f: @@ -153,5 +154,5 @@ def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month): row.append(0) else: # if the word have a value for the day - row.append(value) + row.append(value.decode('utf8')) writer.writerow(row) diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 9473dd92..2801242e 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -62,5 +62,4 @@ r_serv_pasteName = redis.StrictRedis( max_preview_char = int(cfg.get("Flask", "max_preview_char")) # Maximum number of character to display in the tooltip max_preview_modal = int(cfg.get("Flask", "max_preview_modal")) # Maximum number of character to display in the modal -tlsh_to_percent = 1000.0 #Use to display the estimated percentage instead of a raw value DiffMaxLineLength = int(cfg.get("Flask", "DiffMaxLineLength"))#Use to display the estimated percentage instead of a raw value diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index fd9739b8..a2abf2bf 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -20,7 +20,6 @@ cfg = Flask_config.cfg r_serv_pasteName = Flask_config.r_serv_pasteName max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal -tlsh_to_percent = Flask_config.tlsh_to_percent DiffMaxLineLength = Flask_config.DiffMaxLineLength showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') @@ -48,8 +47,10 @@ def showpaste(content_range): for dup_list in p_duplicate_full_list: if dup_list[0] == "tlsh": - dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100) + dup_list[2] = 100 - int(dup_list[2]) else: + print('dup_list') + print(dup_list) dup_list[2] = int(dup_list[2]) #p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True) @@ -64,12 +65,13 @@ def showpaste(content_range): hash_types = [] comp_vals = [] for i in indices: - hash_types.append(p_duplicate_full_list[i][0].encode('utf8')) + hash_types.append(p_duplicate_full_list[i][0]) comp_vals.append(p_duplicate_full_list[i][2]) dup_list_removed.append(i) hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types) comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals) + if len(p_duplicate_full_list[dup_list_index]) > 3: try: date_paste = str(int(p_duplicate_full_list[dup_list_index][3])) @@ -91,7 +93,6 @@ def showpaste(content_range): if content_range != 0: p_content = p_content[0:content_range] - return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list) # ============ ROUTES ============ diff --git 
a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index d1d89dd7..f1e609af 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -69,13 +69,12 @@ {% for dup_path in duplicate_list %} - {{ hashtype_list[i] }} - Similarity: {{ simil_list[i] }}% - {{ date_list[i] }} + {{ hashtype_list[loop.index - 1] }} + Similarity: {{ simil_list[loop.index - 1] }}% + {{ date_list[loop.index - 1] }} {{ dup_path }} - {% set i = i + 1 %} {% endfor %}
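
Note on the per-paste timeout added to Base64.py and Release.py: it is the standard SIGALRM pattern, so other modules can reuse it as-is. Below is a minimal standalone sketch; the wrapper name `process_with_timeout` and its arguments are illustrative and not part of this patch, and `signal.alarm` is Unix-only and must be armed from the main thread.

```python
import signal


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException


# install the handler once, at module start-up
signal.signal(signal.SIGALRM, timeout_handler)


def process_with_timeout(func, content, max_execution_time):
    # arm the alarm before the potentially slow work (decoding, regex, ...)
    signal.alarm(max_execution_time)
    try:
        return func(content)
    except TimeoutException:
        print('processing timeout')
        return None
    finally:
        # always disarm the alarm so it cannot fire during the next message
        signal.alarm(0)
```

The modules themselves disarm the alarm in the `else:` branch of the `try` and `continue` on timeout; the `finally:` used here covers both paths the same way.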
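Note on the tlsh handling in Duplicates.py and Flask_showpaste.py: `tlsh.diffxlen()` returns a distance (0 means identical, larger means more different), whereas `ssdeep.compare()` already returns a 0-100 similarity score, so the tlsh distance is capped at 100 and inverted before it is shown as a similarity percentage. A rough sketch of that conversion, assuming the helper name is illustrative and the inputs are the hex digests produced by `tlsh.hash()` on the raw paste bytes:

```python
import os

import tlsh


def tlsh_similarity_percent(hash_a, hash_b):
    # tlsh.diffxlen() returns a distance score: 0 for identical hashes,
    # growing well past 100 as the contents diverge
    distance = tlsh.diffxlen(hash_a, hash_b)
    # cap the distance at 100 (as Duplicates.py now does) so it fits the
    # same 0-100 scale as ssdeep, then invert it into a similarity value
    return 100 - min(distance, 100)


if __name__ == '__main__':
    h1 = tlsh.hash(os.urandom(512))
    h2 = tlsh.hash(os.urandom(512))
    print(tlsh_similarity_percent(h1, h2))
```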