From 14e9850dd6d1f87a78276dcf317b9f0f3605960b Mon Sep 17 00:00:00 2001
From: Mokaddem
Date: Fri, 15 Jul 2016 16:58:48 +0200
Subject: [PATCH] Added new module for Duplicate paste. Seems working but has some small bug (re-check same paste twice)

---
 bin/Duplicate_ssdeep.py | 195 ++++++++++++++++++++++++++++++++++++++++
 bin/LAUNCH.sh           |   2 +-
 bin/packages/Hash.py    |   6 +-
 3 files changed, 199 insertions(+), 4 deletions(-)
 create mode 100755 bin/Duplicate_ssdeep.py

diff --git a/bin/Duplicate_ssdeep.py b/bin/Duplicate_ssdeep.py
new file mode 100755
index 00000000..916bc0ba
--- /dev/null
+++ b/bin/Duplicate_ssdeep.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+This module looks for near-duplicate pastes: the ssdeep hash of each incoming
+paste is compared with the hashes stored in the monthly "dico" files.
+
+Requirements:
+-------------
+
+* The AIL_HOME environment variable must be set.
+* The configuration must define Redis_Level_DB/host and
+  Directories/dicofilters.
+* One Redis server per year since 2013, listening on the port equal to the year.
+
+"""
+import redis
+import os
+import time
+import datetime
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+from pybloomfilter import BloomFilter
+
+from Helper import Process
+
+if __name__ == "__main__":
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+    saved_dico_and_reload = 1 #min
+    time_1 = time.time()
+    flag_reload = True
+    flag_to_disk = False
+
+    p = Process(config_section)
+
+    # REDIS #
+    # DB OBJECT & HASHS ( DISK )
+    # FIXME increase flexibility
+    # One connection per "yyyymm" key: the port is the year, the db is the month
+    dico_redis = {}
+    for year in xrange(2013, datetime.date.today().year+1):
+        for month in xrange(0, 16):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+            #print("dup: "+str(year)+str(month).zfill(2)+"\n")
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    dicopath = os.path.join(os.environ['AIL_HOME'],
+                            p.config.get("Directories", "dicofilters"))
+
+    # Paths of all the dico files used since the script started
+    dico_path_set = set()
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Attribute is idling 10s")
+                time.sleep(10)
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Select the Redis connection of the paste's month
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Build the dico file name: dicoyyyymm
+            filedicopath = os.path.join(dicopath, 'dico' + PST.p_date.year +
+                                        PST.p_date.month)
+            filedicopath_today = filedicopath
+
+            # Save I/O: only flush the dico to disk every
+            # saved_dico_and_reload minutes
+            if time.time() - time_1 > saved_dico_and_reload*60:
+                flag_to_disk = True
+
+            # NOTE(review): today_dico is kept across iterations, so a paste
+            # from another month reuses it unless a reload is pending -- confirm
+            if os.path.exists(filedicopath):
+                if flag_reload:
+                    flag_reload = False
+                    print 'Reloading'
+                    time_1 = time.time()
+                    with open(filedicopath, 'r') as fp:
+                        today_dico = json.load(fp)
+            else:
+                time_1 = time.time()
+                today_dico = {}
+                with open(filedicopath, 'w') as fp:
+                    json.dump(today_dico, fp)
+
+            # For now, just use monthly dico
+            dico_path_set.add(filedicopath)
+
+            # UNIQUE INDEX HASHS TABLE
+            yearly_index = str(datetime.date.today().year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            r_serv0.incr("current_index")
+            index = r_serv0.get("current_index")+str(PST.p_date)
+            # HASHTABLES PER MONTH (because of r_serv1 changing db)
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # For each dico
+            opened_dico = []
+            for dico in dico_path_set:
+                # Opening dico
+                if dico == filedicopath_today:
+                    # Use the in-memory copy only: also loading this file
+                    # from disk would check the same hashes twice
+                    opened_dico.append([dico, today_dico])
+                else:
+                    with open(dico, 'r') as fp:
+                        opened_dico.append([dico, json.load(fp)])
+
+
+            # Retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+            # Go through the hashes of each opened dico
+            threshold_dup = 10
+            for dico_name, dico in opened_dico:
+                for dico_hash in dico.values():
+                    percent = ssdeep.compare(dico_hash, paste_hash)
+                    if percent > threshold_dup:
+                        # The last 6 characters of the dico path are its
+                        # yyyymm key: query the Redis db of that month
+                        db = dico_name[-6:]
+                        r_serv_dico = dico_redis[db]
+
+                        # index of paste
+                        index_current = r_serv_dico.get(dico_hash)
+                        paste_path = r_serv_dico.get(index_current)
+                        # The prints slice paste_path, keep them guarded
+                        if paste_path is not None:
+                            hash_dico[dico_hash] = (paste_path, percent)
+
+                            print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
+                            print ' '+ PST.p_path[44:] +', '+ paste_path[44:]
+
+            # Adding the hash in Redis, only once the comparison is done:
+            # otherwise an identical hash would resolve to this very paste
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+
+            ##################### Similarity found #######################
+
+            # if there is data in this dictionary
+            if hash_dico:
+                # hash_dico values are (paste_path, percent) tuples
+                dupl.extend(hash_dico.values())
+
+                # Creating the object attribute and save it.
+                to_print = 'Duplicate;{};{};{};'.format(
+                    PST.p_source, PST.p_date, PST.p_name)
+                if dupl:
+                    PST.__setattr__("p_duplicate", dupl)
+                    PST.save_attribute_redis("p_duplicate", dupl)
+                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                    print '{}Detected {}'.format(to_print, len(dupl))
+
+                y = time.time()
+
+                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+
+
+            # Adding the hash in the dico of the month
+            today_dico[index] = paste_hash
+
+            if flag_to_disk:
+                flag_to_disk = False
+                flag_reload = True
+                with open(filedicopath, 'w') as fp:
+                    json.dump(today_dico, fp)
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index fc8c9ff1..86e155b1 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -105,7 +105,7 @@ function launching_scripts {
 
     screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
     sleep 0.1
-    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate.py; read x'
+    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x'
     sleep 0.1
     screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
     sleep 0.1
diff --git a/bin/packages/Hash.py b/bin/packages/Hash.py
index d46abcba..2f34c5c7 100644
--- a/bin/packages/Hash.py
+++ b/bin/packages/Hash.py
@@ -1,7 +1,7 @@
 import hashlib
 import crcmod
 import mmh3
-import simhash
+import ssdeep
 
 
 class Hash(object):
@@ -33,7 +33,7 @@ class Hash(object):
         elif self.name == "murmur":
             hash = mmh3.hash(string)
 
-        elif self.name == "simhash":
-            hash = Simhash(string)
+        elif self.name == "ssdeep":
+            hash = ssdeep.hash(string)
 
         return hash