Added two new versions of the duplicate module.

One saves the hashes as JSON on disk; the other uses only LevelDB.
2025-03-18 20:39:51 +00:00 · 2016-07-18 15:50:41 +02:00 · 2016-07-18 15:50:41 +02:00 · 4f6813350b
commit 4f6813350b
parent 14e9850dd6
4 changed files with 198 additions and 29 deletions
--- a/bin/Duplicate_ssdeep.py
+++ b/bin/Duplicate_ssdeep.py
@ -20,7 +20,6 @@ import json
 import ssdeep
 from packages import Paste
 from pubsublogger import publisher
 from pybloomfilter import BloomFilter
 from Helper import Process
@ -29,10 +28,10 @@ if __name__ == "__main__":
    publisher.channel = "Script"
    config_section = 'Duplicates'
-    saved_dico_and_reload = 1 #min
+    save_dico_and_reload = 1 #min
    time_1 = time.time()
-    flag_reload = True
+    flag_reload_from_disk = True
-    flag_to_disk = False
+    flag_write_to_disk = False
    p = Process(config_section)
@ -81,18 +80,16 @@ if __name__ == "__main__":
            filedicopath_today = filedicopath
            # Save I/O
-            if time.time() - time_1 > saved_dico_and_reload*60:
+            if time.time() - time_1 > save_dico_and_reload*60:
-                flag_to_disk = True
+                flag_write_to_disk = True
            if os.path.exists(filedicopath):
-                if flag_reload == True:
+                if flag_reload_from_disk == True:
-                    flag_reload = False
+                    flag_reload_from_disk = False
                    print 'Reloading'
                    time_1 = time.time()
                    with open(filedicopath, 'r') as fp:
                        today_dico = json.load(fp)
            else:
                time_1 = time.time()
                today_dico = {}
                with open(filedicopath, 'w') as fp:
                    json.dump(today_dico, fp)
@ -105,44 +102,47 @@ if __name__ == "__main__":
            r_serv0 = dico_redis[yearly_index]
            r_serv0.incr("current_index")
            index = r_serv0.get("current_index")+str(PST.p_date)
-            # HASHTABLES PER MONTH (because of r_serv1 changing db)
+            
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)
            # For each dico
            opened_dico = []
            for dico in dico_path_set:
                # Opening dico
                if dico == filedicopath_today:
                    opened_dico.append([dico, today_dico])
-                with open(dico, 'r') as fp:
+                else:
-                    opened_dico.append([dico, json.load(fp)])
+                    with open(dico, 'r') as fp:
                        opened_dico.append([dico, json.load(fp)])
            #retrieve hash from paste
            paste_hash = PST._get_p_hash()
-            # Adding the hash in Redis
+            
            r_serv1.set(paste_hash, index)
            r_serv1.sadd("HASHS", paste_hash)
            # Go throught the Database of the dico (of the month)
-            threshold_dup = 10 
+            threshold_dup = 99 
            for dico_name, dico in opened_dico:
                for dico_key, dico_hash in dico.items():
                    percent = ssdeep.compare(dico_hash, paste_hash)
                    if percent > threshold_dup:
                        db = dico_name[-6:]
-                        # Go throught the Database of the bloom filter (month)
+                        # Go throught the Database of the dico filter (month)
                        r_serv_dico = dico_redis[db]
                        # index of paste
-                        # FIXME Use r_serv_dico and do not consider only 1 server!!
+                        index_current = r_serv_dico.get(dico_hash)
-                        index_current = r_serv1.get(dico_hash)
+                        paste_path = r_serv_dico.get(index_current)
                        paste_path = r_serv1.get(index_current)
                        if paste_path != None:
                            hash_dico[dico_hash] = (paste_path, percent)
-                        print 'comparing: ' + str(dico_hash[:20]) + '  and  ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
+                        #print 'comparing: ' + str(dico_hash[:20]) + '  and  ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
-                        print '   '+ PST.p_path[44:]  +', '+ paste_path[44:]
+                        print '   '+ PST.p_path[44:]  +', '+ paste_path[44:] + ', ' + str(percent)
            # Add paste in DB to prevent its analyse twice
            # HASHTABLES PER MONTH (because of r_serv1 changing db)
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)
            # Adding the hash in Redis
            r_serv1.set(paste_hash, index)
            r_serv1.sadd("HASHS", paste_hash)
    ##################### Similarity found  #######################
            # if there is data in this dictionnary
@ -168,9 +168,11 @@ if __name__ == "__main__":
            # Adding the hash in the dico of the month
            today_dico[index] = paste_hash
-            if flag_to_disk:
+            if flag_write_to_disk:
-                flag_to_disk = False
+                time_1 = time.time()
-                flag_reload = True
+                flag_write_to_disk = False
                flag_reload_from_disk = True
                print 'writing'
                with open(filedicopath, 'w') as fp:
                    json.dump(today_dico, fp)
        except IOError:
--- a/bin/Duplicate_ssdeep_v2.py
+++ b/bin/Duplicate_ssdeep_v2.py
@ -0,0 +1,160 @@
 #!/usr/bin/env python2
 # -*-coding:UTF-8 -*
 """
 The Duplicate module
 ====================
 In short, this huge module checks for duplicate pastes.
 It differs from v1 by using only Redis, with no JSON files on disk.
 Requirements:
 -------------
 """
 import redis
 import os
 import time
 from datetime import datetime, timedelta
 import json
 import ssdeep
 from packages import Paste
 from pubsublogger import publisher
 from Helper import Process
 if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = 'Duplicates'
    p = Process(config_section)
    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate")) 
    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) 
    # REDIS #
    dico_redis = {}
    date_today = datetime.today()
    for year in xrange(2013, date_today.year+1):
        for month in xrange(0, 13):
            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
                host=p.config.get("Redis_Level_DB", "host"), port=year,
                db=month)
 	    #print("dup: "+str(year)+str(month).zfill(2)+"\n")
    # FUNCTIONS #
    publisher.info("Script duplicate started")
    while True:
        try:
            hash_dico = {}
            dupl = []
            dico_range_list = []
            x = time.time()
            message = p.get_from_set()
            if message is not None:
                path = message
                PST = Paste.Paste(path)
            else:
                publisher.debug("Script Attribute is idling 10s")
                time.sleep(10)
                continue
            # the paste is too small
            if (PST._get_p_size() < min_paste_size): 
                continue
            PST._set_p_hash_kind("ssdeep")
            # Assignate the correct redis connexion
            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
            # Creating the dico name: yyyymm
            # Get the date of the range
            date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
            num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
            for diff_month in xrange(0, num_of_month+1):
                curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
                to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
                dico_range_list.append(to_append)
            # Use all dico in range
            dico_range_list = dico_range_list[0:maximum_month_range]
            # UNIQUE INDEX HASHS TABLE
            yearly_index = str(date_today.year)+'00'
            r_serv0 = dico_redis[yearly_index]
            r_serv0.incr("current_index")
            index = r_serv0.get("current_index")+str(PST.p_date)
            # Open selected dico range 
            opened_dico = []
            for dico_name in dico_range_list:
                opened_dico.append([dico_name, dico_redis[dico_name]])
            # retrieve hash from paste
            paste_hash = PST._get_p_hash()
            # Go throught the Database of the dico (of the month)
            for curr_dico_name, curr_dico_redis in opened_dico:
                for dico_hash in curr_dico_redis.smembers('HASHS'):
                    try:
                        percent = ssdeep.compare(dico_hash, paste_hash)
                        if percent > threshold_duplicate:
                            # Go throught the Database of the dico filter (month)
                            r_serv_dico = dico_redis[curr_dico_name]
                            # index of paste
                            index_current = r_serv_dico.get(dico_hash)
                            paste_path = r_serv_dico.get(index_current)
                            if paste_path != None:
                                hash_dico[dico_hash] = (paste_path, percent)
                            print 'comparing: ' + str(PST.p_path[44:]) + '  and  ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
                            #print '   '+ PST.p_path[44:]  +', '+ paste_path[44:] + ', ' + str(percent)
                    except:
                        # ssdeep hash not comparable
                        print 'ssdeep hash not comparable' 
                        publisher.error('ssdeep hash not comparable')
            # Add paste in DB after checking to prevent its analysis twice
            # hash_i -> index_i  AND  index_i -> PST.PATH
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)
            # Adding the hash in Redis
            r_serv1.set(paste_hash, index)
            r_serv1.sadd("HASHS", paste_hash)
    ##################### Similarity found  #######################
            # if there is data in this dictionnary
            if len(hash_dico) != 0:
                # paste_tuple = (paste_path, percent)
                for dico_hash, paste_tuple in hash_dico.items():
                    dupl.append(paste_tuple)
                # Creating the object attribute and save it.
                to_print = 'Duplicate;{};{};{};'.format(
                    PST.p_source, PST.p_date, PST.p_name)
                if dupl != []:
                    PST.__setattr__("p_duplicate", dupl)
                    PST.save_attribute_redis("p_duplicate", dupl)
                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))
                    #print '{}Detected {}'.format(to_print, len(dupl))
                y = time.time()
                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
                #print '{}Processed in {} sec'.format(to_print, y-x)
        except IOError:
            to_print = 'Duplicate;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)
            print "CRC Checksum Failed on :", PST.p_path
            publisher.error('{}CRC Checksum Failed'.format(to_print))
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@ -105,7 +105,7 @@ function launching_scripts {
    screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
    sleep 0.1
-    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x'
+    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
    sleep 0.1
    screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
    sleep 0.1
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@ -4,6 +4,13 @@ pastes = PASTES
 wordtrending_csv = var/www/static/csv/wordstrendingdata
 wordsfile = files/wordfile
 #### Modules #### 
 [Modules_Duplicates]
 #Number of months to look back
 maximum_month_range = 3
 #Similarity score above which two pastes are considered duplicates.
 threshold_duplicate = 50
 ##### Redis #####
 [Redis_Cache]
 host = localhost