From 4f6813350b08c805fe9207af3ec8b4181f8685ea Mon Sep 17 00:00:00 2001
From: Mokaddem
Date: Mon, 18 Jul 2016 15:50:41 +0200
Subject: [PATCH] Added two new versions of the duplicate module: one keeps hashes as JSON on disk, the other uses only LevelDB

---
 bin/Duplicate_ssdeep.py        |  58 ++++++------
 bin/Duplicate_ssdeep_v2.py     | 160 +++++++++++++++++++++++++++++++++
 bin/LAUNCH.sh                  |   2 +-
 bin/packages/config.cfg.sample |   7 ++
 4 files changed, 198 insertions(+), 29 deletions(-)
 create mode 100755 bin/Duplicate_ssdeep_v2.py

diff --git a/bin/Duplicate_ssdeep.py b/bin/Duplicate_ssdeep.py
index 916bc0ba..1b173eca 100755
--- a/bin/Duplicate_ssdeep.py
+++ b/bin/Duplicate_ssdeep.py
@@ -20,7 +20,6 @@ import json
 import ssdeep
 from packages import Paste
 from pubsublogger import publisher
-from pybloomfilter import BloomFilter
 
 from Helper import Process
 
@@ -29,10 +28,10 @@ if __name__ == "__main__":
     publisher.channel = "Script"
 
     config_section = 'Duplicates'
-    saved_dico_and_reload = 1 #min
+    save_dico_and_reload = 1 #min
     time_1 = time.time()
-    flag_reload = True
-    flag_to_disk = False
+    flag_reload_from_disk = True
+    flag_write_to_disk = False
 
     p = Process(config_section)
 
@@ -81,18 +80,16 @@ if __name__ == "__main__":
                 filedicopath_today = filedicopath
 
             # Save I/O
-            if time.time() - time_1 > saved_dico_and_reload*60:
-                flag_to_disk = True
+            if time.time() - time_1 > save_dico_and_reload*60:
+                flag_write_to_disk = True
 
             if os.path.exists(filedicopath):
-                if flag_reload == True:
-                    flag_reload = False
+                if flag_reload_from_disk == True:
+                    flag_reload_from_disk = False
                     print 'Reloading'
-                    time_1 = time.time()
                     with open(filedicopath, 'r') as fp:
                         today_dico = json.load(fp)
             else:
-                time_1 = time.time()
                 today_dico = {}
                 with open(filedicopath, 'w') as fp:
                     json.dump(today_dico, fp)
@@ -105,44 +102,47 @@ if __name__ == "__main__":
             r_serv0 = dico_redis[yearly_index]
             r_serv0.incr("current_index")
             index = r_serv0.get("current_index")+str(PST.p_date)
-            # HASHTABLES PER MONTH (because of r_serv1 changing db)
-            r_serv1.set(index, PST.p_path)
-            r_serv1.sadd("INDEX", index)
+
             # For each dico
             opened_dico = []
             for dico in dico_path_set:
                 # Opening dico
                 if dico == filedicopath_today:
                     opened_dico.append([dico, today_dico])
-                with open(dico, 'r') as fp:
-                    opened_dico.append([dico, json.load(fp)])
+                else:
+                    with open(dico, 'r') as fp:
+                        opened_dico.append([dico, json.load(fp)])
 
             #retrieve hash from paste
             paste_hash = PST._get_p_hash()
-            # Adding the hash in Redis
-            r_serv1.set(paste_hash, index)
-            r_serv1.sadd("HASHS", paste_hash)
+
             # Go throught the Database of the dico (of the month)
-            threshold_dup = 10
+            threshold_dup = 99
             for dico_name, dico in opened_dico:
                 for dico_key, dico_hash in dico.items():
                     percent = ssdeep.compare(dico_hash, paste_hash)
                     if percent > threshold_dup:
                         db = dico_name[-6:]
-                        # Go throught the Database of the bloom filter (month)
+                        # Go throught the Database of the dico filter (month)
                         r_serv_dico = dico_redis[db]
 
                         # index of paste
-                        # FIXME Use r_serv_dico and do not consider only 1 server!!
-                        index_current = r_serv1.get(dico_hash)
-                        paste_path = r_serv1.get(index_current)
+                        index_current = r_serv_dico.get(dico_hash)
+                        paste_path = r_serv_dico.get(index_current)
                         if paste_path != None:
                             hash_dico[dico_hash] = (paste_path, percent)
 
-                        print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
-                        print ' '+ PST.p_path[44:] +', '+ paste_path[44:]
+                        #print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
+                        print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
+
+            # Add paste in DB to prevent its analyse twice
+            # HASHTABLES PER MONTH (because of r_serv1 changing db)
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
             ##################### Similarity found #######################
 
             # if there is data in this dictionnary
@@ -168,9 +168,11 @@ if __name__ == "__main__":
             # Adding the hash in the dico of the month
             today_dico[index] = paste_hash
 
-            if flag_to_disk:
-                flag_to_disk = False
-                flag_reload = True
+            if flag_write_to_disk:
+                time_1 = time.time()
+                flag_write_to_disk = False
+                flag_reload_from_disk = True
+                print 'writing'
                 with open(filedicopath, 'w') as fp:
                     json.dump(today_dico, fp)
         except IOError:
diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py
new file mode 100755
index 00000000..35874371
--- /dev/null
+++ b/bin/Duplicate_ssdeep_v2.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+This module is, in short, checking pastes for duplicates.
+
+This one differs from v1 by using only Redis/LevelDB and no JSON files on disk.
+
+Requirements:
+-------------
+
+
+"""
+import redis
+import os
+import time
+from datetime import datetime, timedelta
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+
+from Helper import Process
+
+if __name__ == "__main__":
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+
+    p = Process(config_section)
+
+    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
+    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
+    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
+
+    # REDIS #
+    dico_redis = {}
+    date_today = datetime.today()
+    for year in xrange(2013, date_today.year+1):
+        for month in xrange(0, 13):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+            #print("dup: "+str(year)+str(month).zfill(2)+"\n")
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+            dico_range_list = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Duplicate is idling 10s")
+                time.sleep(10)
+                continue
+
+            # the paste is too small
+            if (PST._get_p_size() < min_paste_size):
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Assign the correct redis connection
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Creating the dico name: yyyymm
+            # Get the date of the range
+            date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
+            num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
+            for diff_month in xrange(0, num_of_month+1):
+                curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
+                to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
+                dico_range_list.append(to_append)
+
+            # Use all dico in range
+            dico_range_list = dico_range_list[0:maximum_month_range]
+
+            # UNIQUE INDEX HASHS TABLE
+            yearly_index = str(date_today.year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            r_serv0.incr("current_index")
+            index = r_serv0.get("current_index")+str(PST.p_date)
+
+            # Open selected dico range
+            opened_dico = []
+            for dico_name in dico_range_list:
+                opened_dico.append([dico_name, dico_redis[dico_name]])
+
+            # retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+
+            # Go through the Database of the dico (of the month)
+            for curr_dico_name, curr_dico_redis in opened_dico:
+                for dico_hash in curr_dico_redis.smembers('HASHS'):
+                    try:
+                        percent = ssdeep.compare(dico_hash, paste_hash)
+                        if percent > threshold_duplicate:
+                            # Go through the Database of the dico filter (month)
+                            r_serv_dico = dico_redis[curr_dico_name]
+
+                            # index of paste
+                            index_current = r_serv_dico.get(dico_hash)
+                            paste_path = r_serv_dico.get(index_current)
+                            if paste_path != None:
+                                hash_dico[dico_hash] = (paste_path, percent)
+
+                            print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+                            #print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
+                    except:
+                        # ssdeep hash not comparable
+                        print 'ssdeep hash not comparable'
+                        publisher.error('ssdeep hash not comparable')
+
+            # Add paste in DB after checking to prevent its analysis twice
+            # hash_i -> index_i AND index_i -> PST.PATH
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+            ##################### Similarity found #######################
+
+            # if there is data in this dictionary
+            if len(hash_dico) != 0:
+                # paste_tuple = (paste_path, percent)
+                for dico_hash, paste_tuple in hash_dico.items():
+                    dupl.append(paste_tuple)
+
+            # Create the object attribute and save it.
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            if dupl != []:
+                PST.__setattr__("p_duplicate", dupl)
+                PST.save_attribute_redis("p_duplicate", dupl)
+                publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                #print '{}Detected {}'.format(to_print, len(dupl))
+
+            y = time.time()
+
+            publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+            #print '{}Processed in {} sec'.format(to_print, y-x)
+
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 86e155b1..d6706e1e 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -105,7 +105,7 @@ function launching_scripts {
     screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
     sleep 0.1
-    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x'
+    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
    sleep 0.1
     screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
     sleep 0.1
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 6d07707c..b5f2c308 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -4,6 +4,13 @@ pastes = PASTES
 wordtrending_csv = var/www/static/csv/wordstrendingdata
 wordsfile = files/wordfile
 
+#### Modules ####
+[Modules_Duplicates]
+#Number of months to look back
+maximum_month_range = 3
+#The ssdeep similarity score above which two pastes are considered duplicates.
+threshold_duplicate = 50
+
 ##### Redis #####
 [Redis_Cache]
 host = localhost
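
Note, as an illustration outside the patch itself: both module versions store two mappings per monthly database, paste_hash -> index and index -> paste_path, plus the HASHS and INDEX sets that make them iterable. The sketch below shows that indirection in isolation; the connection parameters and the helper names register_paste and path_for_hash are placeholders chosen for the example, not identifiers from the patch.

# Minimal sketch of the hash -> index -> path indirection used above; illustration only.
# Assumes a reachable Redis/LevelDB-backed server; host, port and db are placeholders.
import redis

r_month = redis.StrictRedis(host='localhost', port=2016, db=7)  # one DB per month, like dico_redis['201607']

def register_paste(r, index, paste_path, paste_hash):
    r.set(index, paste_path)       # index -> path
    r.sadd("INDEX", index)
    r.set(paste_hash, index)       # hash  -> index
    r.sadd("HASHS", paste_hash)    # iterated by the v2 module via smembers('HASHS')

def path_for_hash(r, stored_hash):
    # Reverse lookup: hash -> index -> original paste path (None if unknown)
    index = r.get(stored_hash)
    return r.get(index) if index is not None else None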
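
Likewise, the core duplicate test in v2 reduces to two steps: derive the yyyymm keys for the last maximum_month_range months, then compare the new paste's ssdeep hash against every stored hash and keep matches above threshold_duplicate. A standalone sketch of that logic, assuming the same ssdeep binding the module imports; the 30.4166666 average month length mirrors the patch, and the example values are the config defaults added above.

# Standalone sketch of the v2 month-range and similarity logic; not part of the patch.
from datetime import datetime, timedelta
import ssdeep

maximum_month_range = 3    # [Modules_Duplicates] maximum_month_range
threshold_duplicate = 50   # [Modules_Duplicates] threshold_duplicate (ssdeep score, 0-100)

def month_keys(today, months):
    # yyyymm keys for the current month and the previous `months` months
    keys = []
    for back in range(months + 1):
        d = today - timedelta(days=back * 30.4166666)
        keys.append(str(d.year) + str(d.month).zfill(2))
    return keys[:months]   # mirrors dico_range_list[0:maximum_month_range]

def best_duplicate(paste_hash, stored_hashes):
    # Return (hash, score) of the closest stored hash above the threshold, or None
    best = None
    for h in stored_hashes:
        score = ssdeep.compare(h, paste_hash)
        if score > threshold_duplicate and (best is None or score > best[1]):
            best = (h, score)
    return best

print(month_keys(datetime.today(), maximum_month_range))  # e.g. ['201607', '201606', '201605']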
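
For the reworked v1 module, the renamed flags implement a simple write-back cache: the per-month dictionary lives in memory, is dumped to JSON every save_dico_and_reload minutes, and is reloaded from disk on the next pass. A rough sketch of that cycle; the file path, the tick helper and the sample arguments are placeholders, not code from the patch.

# Rough sketch of the v1 save/reload cycle; illustration only.
import json
import os
import time

save_dico_and_reload = 1           # minutes between flushes, as in the patch
dico_path = '/tmp/dico_201607'     # placeholder for filedicopath
today_dico = {}
time_1 = time.time()
flag_reload_from_disk = True
flag_write_to_disk = False

def tick(new_index, new_hash):
    # One loop iteration: maybe reload from disk, record the new entry, maybe flush.
    global time_1, flag_reload_from_disk, flag_write_to_disk, today_dico

    if time.time() - time_1 > save_dico_and_reload * 60:
        flag_write_to_disk = True

    if os.path.exists(dico_path) and flag_reload_from_disk:
        flag_reload_from_disk = False
        with open(dico_path, 'r') as fp:
            today_dico = json.load(fp)

    today_dico[new_index] = new_hash

    if flag_write_to_disk:
        time_1 = time.time()
        flag_write_to_disk = False
        flag_reload_from_disk = True
        with open(dico_path, 'w') as fp:
            json.dump(today_dico, fp)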