Draft: added new duplicate hash comparison - tlsh

Mokaddem 2016-08-04 11:55:38 +02:00
parent 50d2848a40
commit d9316771cd
6 changed files with 125 additions and 79 deletions
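For orientation (not part of the commit): the new duplicate-detection loop treats both hash types as distances, so lower means more similar. ssdeep.compare() returns a similarity in [0, 100], which the new code inverts as 100 - similarity, while tlsh.diffxlen() already returns a distance. Below is a minimal standalone sketch of that comparison, with made-up sample text and illustrative thresholds (the real threshold_set values come from config.cfg):

import ssdeep
import tlsh

# Two made-up pastes that differ only near the end.
# TLSH needs a reasonably long and varied input to produce a digest,
# hence the numbered lines.
paste_a = ''.join('line %d: some ordinary paste content with enough variety\n' % i for i in range(200))
paste_b = ''.join('line %d: some ordinary paste content with enough variety\n' % i for i in range(195)) + 'tampered tail\n' * 5

hashes_a = {'ssdeep': ssdeep.hash(paste_a), 'tlsh': tlsh.hash(paste_a)}
hashes_b = {'ssdeep': ssdeep.hash(paste_b), 'tlsh': tlsh.hash(paste_b)}

# Illustrative per-algorithm thresholds (not the values from config.cfg).
threshold_set = {'ssdeep': 50, 'tlsh': 100}

for hash_type in ('ssdeep', 'tlsh'):
    if hash_type == 'ssdeep':
        # compare() gives a similarity 0-100; invert it into a distance.
        distance = 100 - ssdeep.compare(hashes_a['ssdeep'], hashes_b['ssdeep'])
    else:
        # diffxlen() is already a distance (0 means identical digests).
        distance = tlsh.diffxlen(hashes_a['tlsh'], hashes_b['tlsh'])
    if distance < threshold_set[hash_type]:
        print '[%s] potential duplicate, distance: %s' % (hash_type, distance)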


@@ -22,6 +22,7 @@ import time
from datetime import datetime, timedelta
import json
import ssdeep
+import tlsh
from packages import Paste
from pubsublogger import publisher
@ -36,8 +37,12 @@ if __name__ == "__main__":
p = Process(config_section) p = Process(config_section)
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range")) maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate")) threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
threshold_set = {}
threshold_set['ssdeep'] = threshold_duplicate_ssdeep
threshold_set['tlsh'] = threshold_duplicate_tlsh
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
# REDIS # # REDIS #
dico_redis = {} dico_redis = {}
@@ -47,7 +52,7 @@ if __name__ == "__main__":
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
    host=p.config.get("Redis_Level_DB", "host"), port=year,
    db=month)
#print("dup: "+str(year)+str(month).zfill(2)+"\n")

# FUNCTIONS #
publisher.info("Script duplicate started")
@ -70,10 +75,11 @@ if __name__ == "__main__":
continue continue
# the paste is too small # the paste is too small
if (PST._get_p_size() < min_paste_size): if (PST._get_p_size() < min_paste_size):
continue continue
PST._set_p_hash_kind("ssdeep") PST._set_p_hash_kind("ssdeep")
PST._set_p_hash_kind("tlsh")
# Assignate the correct redis connexion # Assignate the correct redis connexion
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
@ -86,7 +92,7 @@ if __name__ == "__main__":
curr_date_range = date_today - timedelta(days = diff_month*30.4166666) curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2) to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
dico_range_list.append(to_append) dico_range_list.append(to_append)
# Use all dico in range # Use all dico in range
dico_range_list = dico_range_list[0:maximum_month_range] dico_range_list = dico_range_list[0:maximum_month_range]
@@ -95,43 +101,47 @@ if __name__ == "__main__":
r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index")
index = r_serv0.get("current_index")+str(PST.p_date)

# Open selected dico range
opened_dico = []
for dico_name in dico_range_list:
    opened_dico.append([dico_name, dico_redis[dico_name]])

# retrieve hash from paste
-paste_hash = PST._get_p_hash()
+paste_hashes = PST._get_p_hash()

# Go throught the Database of the dico (of the month)
for curr_dico_name, curr_dico_redis in opened_dico:
-    for dico_hash in curr_dico_redis.smembers('HASHS'):
-        try:
-            percent = ssdeep.compare(dico_hash, paste_hash)
-            if percent > threshold_duplicate:
-                # Go throught the Database of the dico filter (month)
-                r_serv_dico = dico_redis[curr_dico_name]
-
-                # index of paste
-                index_current = r_serv_dico.get(dico_hash)
-                paste_path = r_serv_dico.get(index_current)
-                if paste_path != None:
-                    hash_dico[dico_hash] = (paste_path, percent)
-
-                #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
-        except:
-            # ssdeep hash not comparable
-            print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
-            curr_dico_redis.srem('HASHS', dico_hash)
+    for hash_type, paste_hash in paste_hashes.iteritems():
+        for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
+            try:
+                percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash)
+                threshold_duplicate = threshold_set[hash_type]
+                if percent < threshold_duplicate:
+                    # Go throught the Database of the dico filter (month)
+                    r_serv_dico = dico_redis[curr_dico_name]
+
+                    # index of paste
+                    index_current = r_serv_dico.get(dico_hash)
+                    paste_path = r_serv_dico.get(index_current)
+                    if paste_path != None:
+                        hash_dico[dico_hash] = (hash_type, paste_path, percent)
+
+                    print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+            except Exception,e:
+                print str(e)
+                # ssdeep hash not comparable
+                #print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
+                #curr_dico_redis.srem('HASHS', dico_hash)

# Add paste in DB after checking to prevent its analysis twice
# hash_i -> index_i AND index_i -> PST.PATH
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# Adding the hash in Redis
-r_serv1.set(paste_hash, index)
-r_serv1.sadd("HASHS", paste_hash)
+for hash_type, paste_hash in paste_hashes.iteritems():
+    r_serv1.set(paste_hash, index)
+    r_serv1.sadd("HASHS_"+hash_type, paste_hash)

##################### Similarity found #######################
# if there is data in this dictionnary
@@ -153,7 +163,7 @@ if __name__ == "__main__":
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
#print '{}Processed in {} sec'.format(to_print, y-x)
except IOError:
    to_print = 'Duplicate;{};{};{};'.format(
        PST.p_source, PST.p_date, PST.p_name)


@@ -2,6 +2,7 @@ import hashlib
import crcmod
import mmh3
import ssdeep
+import tlsh

class Hash(object):
@@ -36,4 +37,7 @@ class Hash(object):
elif self.name == "ssdeep":
    hash = ssdeep.hash(string)
+elif self.name == "tlsh":
+    hash = tlsh.hash(string)

return hash


@@ -86,8 +86,8 @@ class Paste(object):
self.p_source = var[-5]
self.p_encoding = None
-self.p_hash_kind = None
-self.p_hash = None
+self.p_hash_kind = {}
+self.p_hash = {}
self.p_langage = None
self.p_nb_lines = None
self.p_max_length_line = None
@@ -159,7 +159,7 @@ class Paste(object):
.. seealso:: Hash.py Object to get the available hashs.
"""
-self.p_hash_kind = Hash(hashkind)
+self.p_hash_kind[hashkind] = (Hash(hashkind))

def _get_p_hash(self):
    """
@@ -174,7 +174,8 @@ class Paste(object):
.. seealso:: _set_p_hash_kind("md5")
"""
-self.p_hash = self.p_hash_kind.Calculate(self.get_p_content())
+for hash_name, the_hash in self.p_hash_kind.iteritems():
+    self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content())
return self.p_hash

def _get_p_language(self):
@@ -202,42 +203,6 @@ class Paste(object):
def _get_p_size(self):
    return self.p_size

-def _get_hash_lines(self, min=1, start=1, jump=10):
-    """
-    Returning all the lines of the paste hashed.
-
-    :param min: -- (int) Minimum line length to be hashed.
-    :param start: -- (int) Number the line where to start.
-    :param jump: -- (int) Granularity of the hashing 0 or 1 means no jumps
-    (Maximum Granularity)
-
-    :return: a set([]) of hash.
-
-    .. warning:: Using a set here mean that this function will only return uniq hash.
-    If the paste is composed with 1000 time the same line, this function will return
-    just once the line.
-    This choice was made to avoid a certain redundancy and useless hash checking.
-
-    :Example: PST._get_hash_lines(1, 1, 0)
-
-    .. note:: You need first to "declare which kind of hash you want to use
-    before using this function
-
-    .. seealso:: _set_p_hash_kind("md5")
-
-    """
-    S = set([])
-    f = self.get_p_content_as_file()
-    for num, line in enumerate(f, start):
-        if len(line) >= min:
-            if jump > 1:
-                if (num % jump) == 1:
-                    S.add(self.p_hash_kind.Calculate(line))
-            else:
-                S.add(self.p_hash_kind.Calculate(line))
-    return S

def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
    """
    Returning the percent of similarity with another paste.
@@ -329,7 +294,10 @@ class Paste(object):
self.store.hset(self.p_path, attr_name, json.dumps(value))

def _get_from_redis(self, r_serv):
-    return r_serv.hgetall(self.p_hash)
+    ans = {}
+    for hash_name, the_hash in self.p_hash:
+        ans[hash_name] = r_serv.hgetall(the_hash)
+    return ans

def _get_top_words(self, sort=False):
    """


@@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf
sudo ldconfig
popd

+# tlsh
+test ! -d tlsh && git clone git://github.com/trendmicro/tlsh.git
+pushd tlsh/
+./make
+popd
+
# REDIS LEVEL DB #
test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git
pushd redis-leveldb/
@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/
python setup.py install python setup.py install
popd popd
# Py tlsh
pushd tlsh/py_ext
python setup.py build
python setup.py install
# Download the necessary NLTK corpora # Download the necessary NLTK corpora
HOME=$(pwd) python -m textblob.download_corpora HOME=$(pwd) python -m textblob.download_corpora


@@ -14,6 +14,8 @@ import Paste
from Date import Date

# CONFIG #
+tlsh_to_percent = 1000.0
+
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
    raise Exception('Unable to find the configuration file. \
@@ -74,13 +76,28 @@ def parseStringToList(the_string):
                strList += c
        else:
            the_list = strList.split(',')
-           if len(the_list) == 2:
+           if len(the_list) == 3:
+               elemList = elemList + the_list
+           elif len(the_list) == 2:
                elemList.append(the_list)
            elif len(the_list) > 1:
                elemList.append(the_list[1:])
            strList = ""
    return elemList

+def parseStringToList2(the_string):
+    res = []
+    tab_str = the_string.split('], [')
+    tab_str[0] = tab_str[0][1:]+']'
+    tab_str[len(tab_str)-1] = '['+tab_str[len(tab_str)-1][:-1]
+    res.append(parseStringToList(tab_str[0]))
+    for i in range(1, len(tab_str)-2):
+        tab_str[i] = '['+tab_str[i]+']'
+        res.append(parseStringToList(tab_str[i]))
+    res.append(parseStringToList(tab_str[len(tab_str)-1]))
+    return res
+
def showpaste(content_range):
    requested_path = request.args.get('paste', '')
    paste = Paste.Paste(requested_path)
@@ -93,19 +110,47 @@ def showpaste(content_range):
    p_mime = paste.p_mime
    p_lineinfo = paste.get_lines_info()
    p_content = paste.get_p_content().decode('utf-8', 'ignore')
-   p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
+   p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate())
    p_duplicate_list = []
    p_simil_list = []
+   p_hashtype_list = []
+
    for dup_list in p_duplicate_full_list:
-       path, simil_percent = dup_list
-       p_duplicate_list.append(path)
-       p_simil_list.append(simil_percent)
+       if dup_list[0] == "tlsh":
+           dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
+       else:
+           dup_list[2] = int(dup_list[2])
+
+   p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
+
+   new_dup_list = []
+   dup_list_removed = []
+   for dup_list_index in range(0, len(p_duplicate_full_list)):
+       if dup_list_index in dup_list_removed:
+           continue
+       indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]]
+       hash_types = []
+       comp_vals = []
+       for i in indices:
+           hash_types.append(p_duplicate_full_list[i][0])
+           comp_vals.append(p_duplicate_full_list[i][2])
+           dup_list_removed.append(i)
+       hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
+       comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
+       new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals])
+
+   for dup_list in new_dup_list:
+       hash_type, path, simil_percent = dup_list
+       p_duplicate_list.append(path)
+       p_simil_list.append(simil_percent)
+       p_hashtype_list.append(hash_type)

    if content_range != 0:
        p_content = p_content[0:content_range]

-   return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
+   return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list)

def get_date_range(num_day):
    curr_date = datetime.date.today()
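A worked illustration (not in the commit) of the tlsh_to_percent mapping used in showpaste() above: with tlsh_to_percent = 1000.0, a raw TLSH distance of 150 is rendered as int(((1000.0 - 150) / 1000.0) * 100) = 85, so TLSH matches appear on the same 0-100 scale as ssdeep's native similarity percentage.

tlsh_to_percent = 1000.0

def tlsh_distance_to_percent(distance):
    # Same formula as showpaste(): map a TLSH distance onto a 0-100 "similarity".
    return int(((tlsh_to_percent - float(distance)) / tlsh_to_percent) * 100)

print tlsh_distance_to_percent(0)     # 100 -> identical digests
print tlsh_distance_to_percent(150)   # 85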


@@ -43,16 +43,25 @@
</div>
<div class="panel-body" id="panel-body">
    {% if duplicate_list|length == 0 %}
-       <h4> No Duplicate </h4>
+       <h3> No Duplicate </h3>
    {% else %}
-       <h4> Duplicate list: </h4>
+       <h3> Duplicate list: </h3>
+       <table style="width:100%">
        {% set i = 0 %}
+       <tr>
+           <th style="text-align:left;">Hash type</th><th style="text-align:left;">Paste info</th>
+       </tr>
        {% for dup_path in duplicate_list %}
-           Similarity: {{ simil_list[i] }}% - <a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></br>
+           <tr>
+               <td>{{ hashtype_list[i] }}</td>
+               <td>Similarity: {{ simil_list[i] }}%</td>
+               <td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
+           </tr>
            {% set i = i + 1 %}
        {% endfor %}
+       </table>
    {% endif %}
-   <h4> Content: </h4>
+   <h3> Content: </h3>
    <p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
</div>
</div>