From 4f6813350b08c805fe9207af3ec8b4181f8685ea Mon Sep 17 00:00:00 2001
From: Mokaddem
Date: Mon, 18 Jul 2016 15:50:41 +0200
Subject: [PATCH] Added two new versions of the duplicate module: one keeps hashes as JSON on disk, the other uses only LevelDB

---
 bin/Duplicate_ssdeep.py        |  58 ++++++------
 bin/Duplicate_ssdeep_v2.py     | 160 +++++++++++++++++++++++++++++++++
 bin/LAUNCH.sh                  |   2 +-
 bin/packages/config.cfg.sample |   7 ++
 4 files changed, 198 insertions(+), 29 deletions(-)
 create mode 100755 bin/Duplicate_ssdeep_v2.py

diff --git a/bin/Duplicate_ssdeep.py b/bin/Duplicate_ssdeep.py
index 916bc0ba..1b173eca 100755
--- a/bin/Duplicate_ssdeep.py
+++ b/bin/Duplicate_ssdeep.py
@@ -20,7 +20,6 @@ import json
 import ssdeep
 from packages import Paste
 from pubsublogger import publisher
-from pybloomfilter import BloomFilter
 
 from Helper import Process
 
@@ -29,10 +28,10 @@ if __name__ == "__main__":
     publisher.channel = "Script"
 
     config_section = 'Duplicates'
-    saved_dico_and_reload = 1 #min
+    save_dico_and_reload = 1 #min
     time_1 = time.time()
-    flag_reload = True
-    flag_to_disk = False
+    flag_reload_from_disk = True
+    flag_write_to_disk = False
 
     p = Process(config_section)
 
@@ -81,18 +80,16 @@ if __name__ == "__main__":
                 filedicopath_today = filedicopath
 
             # Save I/O
-            if time.time() - time_1 > saved_dico_and_reload*60:
-                flag_to_disk = True
+            if time.time() - time_1 > save_dico_and_reload*60:
+                flag_write_to_disk = True
 
             if os.path.exists(filedicopath):
-                if flag_reload == True:
-                    flag_reload = False
+                if flag_reload_from_disk == True:
+                    flag_reload_from_disk = False
                     print 'Reloading'
-                    time_1 = time.time()
                     with open(filedicopath, 'r') as fp:
                         today_dico = json.load(fp)
             else:
-                time_1 = time.time()
                 today_dico = {}
                 with open(filedicopath, 'w') as fp:
                     json.dump(today_dico, fp)
@@ -105,44 +102,47 @@ if __name__ == "__main__":
             r_serv0 = dico_redis[yearly_index]
             r_serv0.incr("current_index")
             index = r_serv0.get("current_index")+str(PST.p_date)
-            # HASHTABLES PER MONTH (because of r_serv1 changing db)
-            r_serv1.set(index, PST.p_path)
-            r_serv1.sadd("INDEX", index)
+
             # For each dico
             opened_dico = []
             for dico in dico_path_set:
                 # Opening dico
                 if dico == filedicopath_today:
                     opened_dico.append([dico, today_dico])
-                with open(dico, 'r') as fp:
-                    opened_dico.append([dico, json.load(fp)])
+                else:
+                    with open(dico, 'r') as fp:
+                        opened_dico.append([dico, json.load(fp)])
 
             #retrieve hash from paste
             paste_hash = PST._get_p_hash()
-            # Adding the hash in Redis
-            r_serv1.set(paste_hash, index)
-            r_serv1.sadd("HASHS", paste_hash)
+
             # Go throught the Database of the dico (of the month)
-            threshold_dup = 10
+            threshold_dup = 99
             for dico_name, dico in opened_dico:
                 for dico_key, dico_hash in dico.items():
                     percent = ssdeep.compare(dico_hash, paste_hash)
                     if percent > threshold_dup:
                         db = dico_name[-6:]
-                        # Go throught the Database of the bloom filter (month)
+                        # Go throught the Database of the dico filter (month)
                         r_serv_dico = dico_redis[db]
 
                         # index of paste
-                        # FIXME Use r_serv_dico and do not consider only 1 server!!
-                        index_current = r_serv1.get(dico_hash)
-                        paste_path = r_serv1.get(index_current)
+                        index_current = r_serv_dico.get(dico_hash)
+                        paste_path = r_serv_dico.get(index_current)
                         if paste_path != None:
                             hash_dico[dico_hash] = (paste_path, percent)
 
-                        print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
-                        print ' '+ PST.p_path[44:] +', '+ paste_path[44:]
+                        #print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
+                        print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
+
+            # Add paste in DB to prevent its analyse twice
+            # HASHTABLES PER MONTH (because of r_serv1 changing db)
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
             ##################### Similarity found #######################
 
             # if there is data in this dictionnary
@@ -168,9 +168,11 @@ if __name__ == "__main__":
             # Adding the hash in the dico of the month
             today_dico[index] = paste_hash
 
-            if flag_to_disk:
-                flag_to_disk = False
-                flag_reload = True
+            if flag_write_to_disk:
+                time_1 = time.time()
+                flag_write_to_disk = False
+                flag_reload_from_disk = True
+                print 'writing'
                 with open(filedicopath, 'w') as fp:
                     json.dump(today_dico, fp)
         except IOError:
diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py
new file mode 100755
index 00000000..35874371
--- /dev/null
+++ b/bin/Duplicate_ssdeep_v2.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+This module is, in short, checking pastes for duplicates.
+
+This one differs from v1 by using only Redis/LevelDB and no JSON files on disk.
+
+Requirements:
+-------------
+
+
+"""
+import redis
+import os
+import time
+from datetime import datetime, timedelta
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+
+from Helper import Process
+
+if __name__ == "__main__":
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+
+    p = Process(config_section)
+
+    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
+    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
+    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
+
+    # REDIS #
+    dico_redis = {}
+    date_today = datetime.today()
+    for year in xrange(2013, date_today.year+1):
+        for month in xrange(0, 13):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+            #print("dup: "+str(year)+str(month).zfill(2)+"\n")
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+            dico_range_list = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Duplicate is idling 10s")
+                time.sleep(10)
+                continue
+
+            # the paste is too small
+            if (PST._get_p_size() < min_paste_size):
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Assign the correct redis connection
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Creating the dico name: yyyymm
+            # Get the date of the range
+            date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
+            num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
+            for diff_month in xrange(0, num_of_month+1):
+                curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
+                to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
+                dico_range_list.append(to_append)
+
+            # Use all dico in range
+            dico_range_list = dico_range_list[0:maximum_month_range]
+
+            # UNIQUE INDEX HASHS TABLE
+            yearly_index = str(date_today.year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            r_serv0.incr("current_index")
+            index = r_serv0.get("current_index")+str(PST.p_date)
+
+            # Open selected dico range
+            opened_dico = []
+            for dico_name in dico_range_list:
+                opened_dico.append([dico_name, dico_redis[dico_name]])
+
+            # retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+
+            # Go through the Database of the dico (of the month)
+            for curr_dico_name, curr_dico_redis in opened_dico:
+                for dico_hash in curr_dico_redis.smembers('HASHS'):
+                    try:
+                        percent = ssdeep.compare(dico_hash, paste_hash)
+                        if percent > threshold_duplicate:
+                            # Go through the Database of the dico filter (month)
+                            r_serv_dico = dico_redis[curr_dico_name]
+
+                            # index of paste
+                            index_current = r_serv_dico.get(dico_hash)
+                            paste_path = r_serv_dico.get(index_current)
+                            if paste_path != None:
+                                hash_dico[dico_hash] = (paste_path, percent)
+
+                            print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+                            #print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
+                    except:
+                        # ssdeep hash not comparable
+                        print 'ssdeep hash not comparable'
+                        publisher.error('ssdeep hash not comparable')
+
+            # Add paste in DB after checking to prevent its analysis twice
+            # hash_i -> index_i AND index_i -> PST.PATH
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+            ##################### Similarity found #######################
+
+            # if there is data in this dictionary
+            if len(hash_dico) != 0:
+                # paste_tuple = (paste_path, percent)
+                for dico_hash, paste_tuple in hash_dico.items():
+                    dupl.append(paste_tuple)
+
+            # Create the object attribute and save it.
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            if dupl != []:
+                PST.__setattr__("p_duplicate", dupl)
+                PST.save_attribute_redis("p_duplicate", dupl)
+                publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                #print '{}Detected {}'.format(to_print, len(dupl))
+
+            y = time.time()
+
+            publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+            #print '{}Processed in {} sec'.format(to_print, y-x)
+
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 86e155b1..d6706e1e 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -105,7 +105,7 @@ function launching_scripts {
     screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
     sleep 0.1
-    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x'
+    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
    sleep 0.1
     screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
     sleep 0.1
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 6d07707c..b5f2c308 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -4,6 +4,13 @@ pastes = PASTES
 wordtrending_csv = var/www/static/csv/wordstrendingdata
 wordsfile = files/wordfile
 
+#### Modules ####
+[Modules_Duplicates]
+#Number of months to look back
+maximum_month_range = 3
+#The ssdeep similarity score above which two pastes are considered duplicates.
+threshold_duplicate = 50
+
 ##### Redis #####
 [Redis_Cache]
 host = localhost
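
Note, as an illustration outside the patch itself: both module versions store two mappings per monthly database, paste_hash -> index and index -> paste_path, plus the HASHS and INDEX sets that make them iterable. The sketch below shows that indirection in isolation; the connection parameters and the helper names register_paste and path_for_hash are placeholders chosen for the example, not identifiers from the patch.

# Minimal sketch of the hash -> index -> path indirection used above; illustration only.
# Assumes a reachable Redis/LevelDB-backed server; host, port and db are placeholders.
import redis

r_month = redis.StrictRedis(host='localhost', port=2016, db=7)  # one DB per month, like dico_redis['201607']

def register_paste(r, index, paste_path, paste_hash):
    r.set(index, paste_path)       # index -> path
    r.sadd("INDEX", index)
    r.set(paste_hash, index)       # hash  -> index
    r.sadd("HASHS", paste_hash)    # iterated by the v2 module via smembers('HASHS')

def path_for_hash(r, stored_hash):
    # Reverse lookup: hash -> index -> original paste path (None if unknown)
    index = r.get(stored_hash)
    return r.get(index) if index is not None else None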
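
Likewise, the core duplicate test in v2 reduces to two steps: derive the yyyymm keys for the last maximum_month_range months, then compare the new paste's ssdeep hash against every stored hash and keep matches above threshold_duplicate. A standalone sketch of that logic, assuming the same ssdeep binding the module imports; the 30.4166666 average month length mirrors the patch, and the example values are the config defaults added above.

# Standalone sketch of the v2 month-range and similarity logic; not part of the patch.
from datetime import datetime, timedelta
import ssdeep

maximum_month_range = 3    # [Modules_Duplicates] maximum_month_range
threshold_duplicate = 50   # [Modules_Duplicates] threshold_duplicate (ssdeep score, 0-100)

def month_keys(today, months):
    # yyyymm keys for the current month and the previous `months` months
    keys = []
    for back in range(months + 1):
        d = today - timedelta(days=back * 30.4166666)
        keys.append(str(d.year) + str(d.month).zfill(2))
    return keys[:months]   # mirrors dico_range_list[0:maximum_month_range]

def best_duplicate(paste_hash, stored_hashes):
    # Return (hash, score) of the closest stored hash above the threshold, or None
    best = None
    for h in stored_hashes:
        score = ssdeep.compare(h, paste_hash)
        if score > threshold_duplicate and (best is None or score > best[1]):
            best = (h, score)
    return best

print(month_keys(datetime.today(), maximum_month_range))  # e.g. ['201607', '201606', '201605']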
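
For the reworked v1 module, the renamed flags implement a simple write-back cache: the per-month dictionary lives in memory, is dumped to JSON every save_dico_and_reload minutes, and is reloaded from disk on the next pass. A rough sketch of that cycle; the file path, the tick helper and the sample arguments are placeholders, not code from the patch.

# Rough sketch of the v1 save/reload cycle; illustration only.
import json
import os
import time

save_dico_and_reload = 1           # minutes between flushes, as in the patch
dico_path = '/tmp/dico_201607'     # placeholder for filedicopath
today_dico = {}
time_1 = time.time()
flag_reload_from_disk = True
flag_write_to_disk = False

def tick(new_index, new_hash):
    # One loop iteration: maybe reload from disk, record the new entry, maybe flush.
    global time_1, flag_reload_from_disk, flag_write_to_disk, today_dico

    if time.time() - time_1 > save_dico_and_reload * 60:
        flag_write_to_disk = True

    if os.path.exists(dico_path) and flag_reload_from_disk:
        flag_reload_from_disk = False
        with open(dico_path, 'r') as fp:
            today_dico = json.load(fp)

    today_dico[new_index] = new_hash

    if flag_write_to_disk:
        time_1 = time.time()
        flag_write_to_disk = False
        flag_reload_from_disk = True
        with open(dico_path, 'w') as fp:
            json.dump(today_dico, fp)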