Draft: added new duplicate hash comparison - tlsh

Mokaddem 2016-08-04 11:55:38 +02:00
parent 50d2848a40
commit d9316771cd
6 changed files with 125 additions and 79 deletions
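For orientation (not part of the commit): the new duplicate-detection loop treats both hash types as distances, so lower means more similar. ssdeep.compare() returns a similarity in [0, 100], which the new code inverts as 100 - similarity, while tlsh.diffxlen() already returns a distance. Below is a minimal standalone sketch of that comparison, with made-up sample text and illustrative thresholds (the real threshold_set values come from config.cfg):

import ssdeep
import tlsh

# Two made-up pastes that differ only near the end.
# TLSH needs a reasonably long and varied input to produce a digest,
# hence the numbered lines.
paste_a = ''.join('line %d: some ordinary paste content with enough variety\n' % i for i in range(200))
paste_b = ''.join('line %d: some ordinary paste content with enough variety\n' % i for i in range(195)) + 'tampered tail\n' * 5

hashes_a = {'ssdeep': ssdeep.hash(paste_a), 'tlsh': tlsh.hash(paste_a)}
hashes_b = {'ssdeep': ssdeep.hash(paste_b), 'tlsh': tlsh.hash(paste_b)}

# Illustrative per-algorithm thresholds (not the values from config.cfg).
threshold_set = {'ssdeep': 50, 'tlsh': 100}

for hash_type in ('ssdeep', 'tlsh'):
    if hash_type == 'ssdeep':
        # compare() gives a similarity 0-100; invert it into a distance.
        distance = 100 - ssdeep.compare(hashes_a['ssdeep'], hashes_b['ssdeep'])
    else:
        # diffxlen() is already a distance (0 means identical digests).
        distance = tlsh.diffxlen(hashes_a['tlsh'], hashes_b['tlsh'])
    if distance < threshold_set[hash_type]:
        print '[%s] potential duplicate, distance: %s' % (hash_type, distance)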


@@ -22,6 +22,7 @@ import time
from datetime import datetime, timedelta
import json
import ssdeep
+import tlsh
from packages import Paste
from pubsublogger import publisher
@ -36,8 +37,12 @@ if __name__ == "__main__":
p = Process(config_section) p = Process(config_section)
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range")) maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate")) threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
threshold_set = {}
threshold_set['ssdeep'] = threshold_duplicate_ssdeep
threshold_set['tlsh'] = threshold_duplicate_tlsh
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
# REDIS # # REDIS #
dico_redis = {} dico_redis = {}
@@ -47,7 +52,7 @@ if __name__ == "__main__":
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
    host=p.config.get("Redis_Level_DB", "host"), port=year,
    db=month)
#print("dup: "+str(year)+str(month).zfill(2)+"\n")

# FUNCTIONS #
publisher.info("Script duplicate started")
@ -70,10 +75,11 @@ if __name__ == "__main__":
continue continue
# the paste is too small # the paste is too small
if (PST._get_p_size() < min_paste_size): if (PST._get_p_size() < min_paste_size):
continue continue
PST._set_p_hash_kind("ssdeep") PST._set_p_hash_kind("ssdeep")
PST._set_p_hash_kind("tlsh")
# Assignate the correct redis connexion # Assignate the correct redis connexion
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
@ -86,7 +92,7 @@ if __name__ == "__main__":
curr_date_range = date_today - timedelta(days = diff_month*30.4166666) curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2) to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
dico_range_list.append(to_append) dico_range_list.append(to_append)
# Use all dico in range # Use all dico in range
dico_range_list = dico_range_list[0:maximum_month_range] dico_range_list = dico_range_list[0:maximum_month_range]
@@ -95,43 +101,47 @@ if __name__ == "__main__":
r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index")
index = r_serv0.get("current_index")+str(PST.p_date)

# Open selected dico range
opened_dico = []
for dico_name in dico_range_list:
    opened_dico.append([dico_name, dico_redis[dico_name]])

# retrieve hash from paste
-paste_hash = PST._get_p_hash()
+paste_hashes = PST._get_p_hash()

# Go throught the Database of the dico (of the month)
for curr_dico_name, curr_dico_redis in opened_dico:
-    for dico_hash in curr_dico_redis.smembers('HASHS'):
-        try:
-            percent = ssdeep.compare(dico_hash, paste_hash)
-            if percent > threshold_duplicate:
-                # Go throught the Database of the dico filter (month)
-                r_serv_dico = dico_redis[curr_dico_name]
-
-                # index of paste
-                index_current = r_serv_dico.get(dico_hash)
-                paste_path = r_serv_dico.get(index_current)
-                if paste_path != None:
-                    hash_dico[dico_hash] = (paste_path, percent)
-
-                #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
-        except:
-            # ssdeep hash not comparable
-            print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
-            curr_dico_redis.srem('HASHS', dico_hash)
+    for hash_type, paste_hash in paste_hashes.iteritems():
+        for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
+            try:
+                percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash)
+                threshold_duplicate = threshold_set[hash_type]
+                if percent < threshold_duplicate:
+                    # Go throught the Database of the dico filter (month)
+                    r_serv_dico = dico_redis[curr_dico_name]
+
+                    # index of paste
+                    index_current = r_serv_dico.get(dico_hash)
+                    paste_path = r_serv_dico.get(index_current)
+                    if paste_path != None:
+                        hash_dico[dico_hash] = (hash_type, paste_path, percent)
+
+                    print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+            except Exception,e:
+                print str(e)
+                # ssdeep hash not comparable
+                #print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
+                #curr_dico_redis.srem('HASHS', dico_hash)

# Add paste in DB after checking to prevent its analysis twice
# hash_i -> index_i AND index_i -> PST.PATH
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# Adding the hash in Redis
-r_serv1.set(paste_hash, index)
-r_serv1.sadd("HASHS", paste_hash)
+for hash_type, paste_hash in paste_hashes.iteritems():
+    r_serv1.set(paste_hash, index)
+    r_serv1.sadd("HASHS_"+hash_type, paste_hash)

##################### Similarity found #######################
# if there is data in this dictionnary
@@ -153,7 +163,7 @@ if __name__ == "__main__":
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
#print '{}Processed in {} sec'.format(to_print, y-x)
except IOError:
    to_print = 'Duplicate;{};{};{};'.format(
        PST.p_source, PST.p_date, PST.p_name)


@@ -2,6 +2,7 @@ import hashlib
import crcmod
import mmh3
import ssdeep
+import tlsh

class Hash(object):
@@ -36,4 +37,7 @@ class Hash(object):
elif self.name == "ssdeep":
    hash = ssdeep.hash(string)
+elif self.name == "tlsh":
+    hash = tlsh.hash(string)

return hash


@@ -86,8 +86,8 @@ class Paste(object):
self.p_source = var[-5]
self.p_encoding = None
-self.p_hash_kind = None
-self.p_hash = None
+self.p_hash_kind = {}
+self.p_hash = {}
self.p_langage = None
self.p_nb_lines = None
self.p_max_length_line = None
@@ -159,7 +159,7 @@ class Paste(object):
.. seealso:: Hash.py Object to get the available hashs.
"""
-self.p_hash_kind = Hash(hashkind)
+self.p_hash_kind[hashkind] = (Hash(hashkind))

def _get_p_hash(self):
    """
@@ -174,7 +174,8 @@ class Paste(object):
.. seealso:: _set_p_hash_kind("md5")
"""
-self.p_hash = self.p_hash_kind.Calculate(self.get_p_content())
+for hash_name, the_hash in self.p_hash_kind.iteritems():
+    self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content())
return self.p_hash

def _get_p_language(self):
@@ -202,42 +203,6 @@ class Paste(object):
def _get_p_size(self):
    return self.p_size

-def _get_hash_lines(self, min=1, start=1, jump=10):
-    """
-    Returning all the lines of the paste hashed.
-
-    :param min: -- (int) Minimum line length to be hashed.
-    :param start: -- (int) Number the line where to start.
-    :param jump: -- (int) Granularity of the hashing 0 or 1 means no jumps
-    (Maximum Granularity)
-
-    :return: a set([]) of hash.
-
-    .. warning:: Using a set here mean that this function will only return uniq hash.
-    If the paste is composed with 1000 time the same line, this function will return
-    just once the line.
-    This choice was made to avoid a certain redundancy and useless hash checking.
-
-    :Example: PST._get_hash_lines(1, 1, 0)
-
-    .. note:: You need first to "declare which kind of hash you want to use
-    before using this function
-
-    .. seealso:: _set_p_hash_kind("md5")
-
-    """
-    S = set([])
-    f = self.get_p_content_as_file()
-    for num, line in enumerate(f, start):
-        if len(line) >= min:
-            if jump > 1:
-                if (num % jump) == 1:
-                    S.add(self.p_hash_kind.Calculate(line))
-            else:
-                S.add(self.p_hash_kind.Calculate(line))
-    return S

def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
    """
    Returning the percent of similarity with another paste.
@@ -329,7 +294,10 @@ class Paste(object):
self.store.hset(self.p_path, attr_name, json.dumps(value))

def _get_from_redis(self, r_serv):
-    return r_serv.hgetall(self.p_hash)
+    ans = {}
+    for hash_name, the_hash in self.p_hash:
+        ans[hash_name] = r_serv.hgetall(the_hash)
+    return ans

def _get_top_words(self, sort=False):
    """


@@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf
sudo ldconfig
popd

+# tlsh
+test ! -d tlsh && git clone git://github.com/trendmicro/tlsh.git
+pushd tlsh/
+./make
+popd
+
# REDIS LEVEL DB #
test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git
pushd redis-leveldb/
@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/
python setup.py install python setup.py install
popd popd
# Py tlsh
pushd tlsh/py_ext
python setup.py build
python setup.py install
# Download the necessary NLTK corpora # Download the necessary NLTK corpora
HOME=$(pwd) python -m textblob.download_corpora HOME=$(pwd) python -m textblob.download_corpora


@@ -14,6 +14,8 @@ import Paste
from Date import Date

# CONFIG #
+tlsh_to_percent = 1000.0
+
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
    raise Exception('Unable to find the configuration file. \
@@ -74,13 +76,28 @@ def parseStringToList(the_string):
                strList += c
        else:
            the_list = strList.split(',')
-           if len(the_list) == 2:
+           if len(the_list) == 3:
+               elemList = elemList + the_list
+           elif len(the_list) == 2:
                elemList.append(the_list)
            elif len(the_list) > 1:
                elemList.append(the_list[1:])
            strList = ""
    return elemList

+def parseStringToList2(the_string):
+    res = []
+    tab_str = the_string.split('], [')
+    tab_str[0] = tab_str[0][1:]+']'
+    tab_str[len(tab_str)-1] = '['+tab_str[len(tab_str)-1][:-1]
+    res.append(parseStringToList(tab_str[0]))
+    for i in range(1, len(tab_str)-2):
+        tab_str[i] = '['+tab_str[i]+']'
+        res.append(parseStringToList(tab_str[i]))
+    res.append(parseStringToList(tab_str[len(tab_str)-1]))
+    return res
+
def showpaste(content_range):
    requested_path = request.args.get('paste', '')
    paste = Paste.Paste(requested_path)
@@ -93,19 +110,47 @@ def showpaste(content_range):
    p_mime = paste.p_mime
    p_lineinfo = paste.get_lines_info()
    p_content = paste.get_p_content().decode('utf-8', 'ignore')
-   p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
+   p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate())
    p_duplicate_list = []
    p_simil_list = []
+   p_hashtype_list = []
+
    for dup_list in p_duplicate_full_list:
-       path, simil_percent = dup_list
-       p_duplicate_list.append(path)
-       p_simil_list.append(simil_percent)
+       if dup_list[0] == "tlsh":
+           dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
+       else:
+           dup_list[2] = int(dup_list[2])
+
+   p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
+
+   new_dup_list = []
+   dup_list_removed = []
+   for dup_list_index in range(0, len(p_duplicate_full_list)):
+       if dup_list_index in dup_list_removed:
+           continue
+       indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]]
+       hash_types = []
+       comp_vals = []
+       for i in indices:
+           hash_types.append(p_duplicate_full_list[i][0])
+           comp_vals.append(p_duplicate_full_list[i][2])
+           dup_list_removed.append(i)
+       hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
+       comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
+       new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals])
+
+   for dup_list in new_dup_list:
+       hash_type, path, simil_percent = dup_list
+       p_duplicate_list.append(path)
+       p_simil_list.append(simil_percent)
+       p_hashtype_list.append(hash_type)

    if content_range != 0:
        p_content = p_content[0:content_range]

-   return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
+   return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list)

def get_date_range(num_day):
    curr_date = datetime.date.today()
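A worked illustration (not in the commit) of the tlsh_to_percent mapping used in showpaste() above: with tlsh_to_percent = 1000.0, a raw TLSH distance of 150 is rendered as int(((1000.0 - 150) / 1000.0) * 100) = 85, so TLSH matches appear on the same 0-100 scale as ssdeep's native similarity percentage.

tlsh_to_percent = 1000.0

def tlsh_distance_to_percent(distance):
    # Same formula as showpaste(): map a TLSH distance onto a 0-100 "similarity".
    return int(((tlsh_to_percent - float(distance)) / tlsh_to_percent) * 100)

print tlsh_distance_to_percent(0)     # 100 -> identical digests
print tlsh_distance_to_percent(150)   # 85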


@@ -43,16 +43,25 @@
</div>
<div class="panel-body" id="panel-body">
    {% if duplicate_list|length == 0 %}
-       <h4> No Duplicate </h4>
+       <h3> No Duplicate </h3>
    {% else %}
-       <h4> Duplicate list: </h4>
+       <h3> Duplicate list: </h3>
+       <table style="width:100%">
        {% set i = 0 %}
+       <tr>
+           <th style="text-align:left;">Hash type</th><th style="text-align:left;">Paste info</th>
+       </tr>
        {% for dup_path in duplicate_list %}
-           Similarity: {{ simil_list[i] }}% - <a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></br>
+           <tr>
+               <td>{{ hashtype_list[i] }}</td>
+               <td>Similarity: {{ simil_list[i] }}%</td>
+               <td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
+           </tr>
            {% set i = i + 1 %}
        {% endfor %}
+       </table>
    {% endif %}
-   <h4> Content: </h4>
+   <h3> Content: </h3>
    <p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
</div>
</div>