Draft: added new duplicate hash comparison - tlsh
This commit is contained in:
parent 50d2848a40
commit d9316771cd

6 changed files with 125 additions and 79 deletions
Duplicate-detection module:

@@ -22,6 +22,7 @@ import time
 from datetime import datetime, timedelta
 import json
 import ssdeep
+import tlsh
 from packages import Paste
 from pubsublogger import publisher
 
@@ -36,7 +37,11 @@ if __name__ == "__main__":
     p = Process(config_section)
 
     maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
-    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
+    threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
+    threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
+    threshold_set = {}
+    threshold_set['ssdeep'] = threshold_duplicate_ssdeep
+    threshold_set['tlsh'] = threshold_duplicate_tlsh
     min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
 
     # REDIS #
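The two new thresholds are read from the [Modules_Duplicates] section of packages/config.cfg. A minimal sketch of the entries this hunk assumes (the key names come straight from the diff; the values here are only illustrative, not the project's defaults):

    [Modules_Duplicates]
    maximum_month_range = 3
    threshold_duplicate_ssdeep = 50
    threshold_duplicate_tlsh = 100
    min_paste_size = 0.3

ssdeep and tlsh need separate thresholds because their scores live on different scales, as the comparison loop further down shows.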
@@ -47,7 +52,7 @@ if __name__ == "__main__":
             dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
                 host=p.config.get("Redis_Level_DB", "host"), port=year,
                 db=month)
             #print("dup: "+str(year)+str(month).zfill(2)+"\n")
 
     # FUNCTIONS #
     publisher.info("Script duplicate started")
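For reference, the pool built above keeps one Redis handle per (year, month) pair, keyed "YYYYMM", with the year doubling as the TCP port and the month as the logical database index. A self-contained sketch of the same keying (the host and year range are illustrative):

    import redis

    dico_redis = {}
    for year in range(2013, 2017):
        for month in range(0, 16):
            # e.g. key '201501' -> instance listening on port 2015, logical db 1
            dico_redis[str(year) + str(month).zfill(2)] = redis.StrictRedis(
                host='localhost', port=year, db=month)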
@@ -74,6 +79,7 @@ if __name__ == "__main__":
                 continue
 
             PST._set_p_hash_kind("ssdeep")
+            PST._set_p_hash_kind("tlsh")
 
             # Assignate the correct redis connexion
             r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
@@ -102,36 +108,40 @@ if __name__ == "__main__":
                 opened_dico.append([dico_name, dico_redis[dico_name]])
 
             # retrieve hash from paste
-            paste_hash = PST._get_p_hash()
+            paste_hashes = PST._get_p_hash()
 
             # Go throught the Database of the dico (of the month)
             for curr_dico_name, curr_dico_redis in opened_dico:
-                for dico_hash in curr_dico_redis.smembers('HASHS'):
-                    try:
-                        percent = ssdeep.compare(dico_hash, paste_hash)
-                        if percent > threshold_duplicate:
+                for hash_type, paste_hash in paste_hashes.iteritems():
+                    for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
+                        try:
+                            percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash)
+                            threshold_duplicate = threshold_set[hash_type]
+                            if percent < threshold_duplicate:
                                 # Go throught the Database of the dico filter (month)
                                 r_serv_dico = dico_redis[curr_dico_name]
 
                                 # index of paste
                                 index_current = r_serv_dico.get(dico_hash)
                                 paste_path = r_serv_dico.get(index_current)
                                 if paste_path != None:
-                                    hash_dico[dico_hash] = (paste_path, percent)
+                                    hash_dico[dico_hash] = (hash_type, paste_path, percent)
 
-                        #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
-                    except:
-                        # ssdeep hash not comparable
-                        print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
-                        curr_dico_redis.srem('HASHS', dico_hash)
+                                    print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+                        except Exception,e:
+                            print str(e)
+                            # ssdeep hash not comparable
+                            #print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
+                            #curr_dico_redis.srem('HASHS', dico_hash)
 
             # Add paste in DB after checking to prevent its analysis twice
             # hash_i -> index_i AND index_i -> PST.PATH
             r_serv1.set(index, PST.p_path)
             r_serv1.sadd("INDEX", index)
             # Adding the hash in Redis
-            r_serv1.set(paste_hash, index)
-            r_serv1.sadd("HASHS", paste_hash)
+            for hash_type, paste_hash in paste_hashes.iteritems():
+                r_serv1.set(paste_hash, index)
+                r_serv1.sadd("HASHS_"+hash_type, paste_hash)
             ##################### Similarity found #######################
 
             # if there is data in this dictionnary
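The flip from `percent > threshold_duplicate` to `percent < threshold_duplicate` follows from the two libraries' conventions: ssdeep.compare() returns a similarity from 0 to 100 (higher means more alike), while tlsh.diffxlen() returns a distance (0 for identical inputs, growing without a fixed bound as they diverge). Converting the ssdeep score with `100 - similarity` puts both hash types on a common "lower means more similar" scale. A standalone sketch of that unified distance, using a hypothetical helper name:

    import ssdeep
    import tlsh

    def hash_distance(hash_type, known_hash, new_hash):
        # hypothetical helper mirroring the logic in the loop above
        if hash_type == 'ssdeep':
            # ssdeep similarity in [0, 100] -> flip it into a distance
            return 100 - ssdeep.compare(known_hash, new_hash)
        else:
            # tlsh.diffxlen() already is a distance; 0 means identical
            return tlsh.diffxlen(known_hash, new_hash)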
Hash.py:

@@ -2,6 +2,7 @@ import hashlib
 import crcmod
 import mmh3
 import ssdeep
+import tlsh
 
 
 class Hash(object):
@@ -36,4 +37,7 @@ class Hash(object):
         elif self.name == "ssdeep":
             hash = ssdeep.hash(string)
 
+        elif self.name == "tlsh":
+            hash = tlsh.hash(string)
+
         return hash
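Both branches of Calculate() feed the raw paste content to the respective library. One caveat worth knowing: TLSH requires a minimum amount of input (a few hundred bytes with some byte-level variety; the exact floor depends on the library version), so very short or very uniform pastes may yield an empty digest. A small sketch (the input file is arbitrary):

    import ssdeep
    import tlsh

    content = open('/etc/passwd').read()  # any few-hundred-byte text will do
    print(ssdeep.hash(content))  # ssdeep digest; works on short inputs too
    print(tlsh.hash(content))    # hex digest; may be empty if input is too short/uniform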
Paste.py:

@@ -86,8 +86,8 @@ class Paste(object):
         self.p_source = var[-5]
 
         self.p_encoding = None
-        self.p_hash_kind = None
-        self.p_hash = None
+        self.p_hash_kind = {}
+        self.p_hash = {}
         self.p_langage = None
         self.p_nb_lines = None
         self.p_max_length_line = None
@@ -159,7 +159,7 @@ class Paste(object):
         .. seealso:: Hash.py Object to get the available hashs.
 
         """
-        self.p_hash_kind = Hash(hashkind)
+        self.p_hash_kind[hashkind] = (Hash(hashkind))
 
     def _get_p_hash(self):
         """
@@ -174,7 +174,8 @@ class Paste(object):
         .. seealso:: _set_p_hash_kind("md5")
 
         """
-        self.p_hash = self.p_hash_kind.Calculate(self.get_p_content())
+        for hash_name, the_hash in self.p_hash_kind.iteritems():
+            self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content())
         return self.p_hash
 
     def _get_p_language(self):
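Taken together with the __init__ change above, hashing on Paste now works per hash kind. A usage sketch matching the calls the duplicates module makes (the paste path is invented):

    from packages import Paste

    PST = Paste.Paste('/path/to/some/paste.gz')  # hypothetical path
    PST._set_p_hash_kind("ssdeep")
    PST._set_p_hash_kind("tlsh")
    paste_hashes = PST._get_p_hash()
    # -> {'ssdeep': '<ssdeep digest>', 'tlsh': '<tlsh digest>'}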
@@ -202,42 +203,6 @@ class Paste(object):
     def _get_p_size(self):
         return self.p_size
 
-    def _get_hash_lines(self, min=1, start=1, jump=10):
-        """
-        Returning all the lines of the paste hashed.
-
-        :param min: -- (int) Minimum line length to be hashed.
-        :param start: -- (int) Number the line where to start.
-        :param jump: -- (int) Granularity of the hashing 0 or 1 means no jumps
-        (Maximum Granularity)
-
-        :return: a set([]) of hash.
-
-        .. warning:: Using a set here mean that this function will only return uniq hash.
-
-        If the paste is composed with 1000 time the same line, this function will return
-        just once the line.
-
-        This choice was made to avoid a certain redundancy and useless hash checking.
-
-        :Example: PST._get_hash_lines(1, 1, 0)
-
-        .. note:: You need first to "declare which kind of hash you want to use
-        before using this function
-        .. seealso:: _set_p_hash_kind("md5")
-
-        """
-        S = set([])
-        f = self.get_p_content_as_file()
-        for num, line in enumerate(f, start):
-            if len(line) >= min:
-                if jump > 1:
-                    if (num % jump) == 1:
-                        S.add(self.p_hash_kind.Calculate(line))
-                else:
-                    S.add(self.p_hash_kind.Calculate(line))
-        return S
-
     def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
         """
         Returning the percent of similarity with another paste.
@@ -329,7 +294,10 @@ class Paste(object):
         self.store.hset(self.p_path, attr_name, json.dumps(value))
 
     def _get_from_redis(self, r_serv):
-        return r_serv.hgetall(self.p_hash)
+        ans = {}
+        for hash_name, the_hash in self.p_hash:
+            ans[hash_name] = r_serv.hgetall(the_hash)
+        return ans
 
     def _get_top_words(self, sort=False):
         """
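One thing to flag in this draft hunk: `for hash_name, the_hash in self.p_hash:` iterates only the dict's keys, so unpacking each key into two names will raise ValueError at runtime. The rest of the diff uses iteritems() for the same pattern, which is presumably what is intended here:

    def _get_from_redis(self, r_serv):
        # sketch of the presumably intended loop over (name, digest) pairs
        ans = {}
        for hash_name, the_hash in self.p_hash.iteritems():
            ans[hash_name] = r_serv.hgetall(the_hash)
        return ans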
Installation script:

@@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf
 sudo ldconfig
 popd
 
+# tlsh
+test ! -d tlsh && git clone git://github.com/trendmicro/tlsh.git
+pushd tlsh/
+./make
+popd
+
 # REDIS LEVEL DB #
 test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git
 pushd redis-leveldb/
@@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/
 python setup.py install
 popd
 
+# Py tlsh
+pushd tlsh/py_ext
+python setup.py build
+python setup.py install
 
 # Download the necessary NLTK corpora
 HOME=$(pwd) python -m textblob.download_corpora
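After the py_ext build, the binding is importable as plain `tlsh`, matching the imports in the hunks above. A quick smoke test from a shell (the input file is arbitrary; it just needs to be a few hundred bytes long):

    python -c 'import tlsh; print(tlsh.hash(open("/etc/passwd").read()))'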
Flask web server:

@@ -14,6 +14,8 @@ import Paste
 from Date import Date
 
 # CONFIG #
+tlsh_to_percent = 1000.0
+
 configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
 if not os.path.exists(configfile):
     raise Exception('Unable to find the configuration file. \
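`tlsh_to_percent = 1000.0` is the pivot used further down in showpaste() to fold a TLSH distance onto the 0-100 similarity scale the page already uses for ssdeep: distance 0 maps to 100%, distance 1000 maps to 0% (anything above 1000 goes negative). A worked example of the same arithmetic:

    tlsh_to_percent = 1000.0

    def tlsh_distance_to_percent(distance):
        # e.g. distance 150 -> int(((1000.0 - 150) / 1000.0) * 100) = 85
        return int(((tlsh_to_percent - float(distance)) / tlsh_to_percent) * 100)

    print(tlsh_distance_to_percent(150))  # 85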
@@ -74,13 +76,28 @@ def parseStringToList(the_string):
             strList += c
         else:
             the_list = strList.split(',')
-            if len(the_list) == 2:
+            if len(the_list) == 3:
+                elemList = elemList + the_list
+            elif len(the_list) == 2:
                 elemList.append(the_list)
             elif len(the_list) > 1:
                 elemList.append(the_list[1:])
             strList = ""
     return elemList
 
+
+def parseStringToList2(the_string):
+    res = []
+    tab_str = the_string.split('], [')
+    tab_str[0] = tab_str[0][1:]+']'
+    tab_str[len(tab_str)-1] = '['+tab_str[len(tab_str)-1][:-1]
+    res.append(parseStringToList(tab_str[0]))
+    for i in range(1, len(tab_str)-2):
+        tab_str[i] = '['+tab_str[i]+']'
+        res.append(parseStringToList(tab_str[i]))
+    res.append(parseStringToList(tab_str[len(tab_str)-1]))
+    return res
+
+
 def showpaste(content_range):
     requested_path = request.args.get('paste', '')
     paste = Paste.Paste(requested_path)
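parseStringToList2() undoes the string form of a list of [hash_type, paste_path, score] triples: it splits the outer string on '], [', restores the brackets that the split stripped from each chunk, and hands every chunk back to parseStringToList(). An illustrative round trip (the paths are invented):

    # a stored duplicate string, roughly as _get_p_duplicate() might return it:
    s = "[['ssdeep', '/path/a.gz', '95'], ['tlsh', '/path/a.gz', '150']]"
    # after the split on '], [' and bracket repair, each chunk is parsed on its own:
    #   "['ssdeep', '/path/a.gz', '95']"  -> ['ssdeep', '/path/a.gz', '95']
    #   "['tlsh', '/path/a.gz', '150']"   -> ['tlsh', '/path/a.gz', '150']

One detail to double-check in this draft: with more than two entries, `range(1, len(tab_str)-2)` stops one chunk short of the final append, so the second-to-last entry is silently dropped; `range(1, len(tab_str)-1)` would visit every middle chunk.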
@@ -93,19 +110,47 @@ def showpaste(content_range):
     p_mime = paste.p_mime
     p_lineinfo = paste.get_lines_info()
     p_content = paste.get_p_content().decode('utf-8', 'ignore')
-    p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
+    p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate())
     p_duplicate_list = []
     p_simil_list = []
+    p_hashtype_list = []
 
     for dup_list in p_duplicate_full_list:
-        path, simil_percent = dup_list
+        if dup_list[0] == "tlsh":
+            dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
+        else:
+            dup_list[2] = int(dup_list[2])
+
+    p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
+
+    new_dup_list = []
+    dup_list_removed = []
+    for dup_list_index in range(0, len(p_duplicate_full_list)):
+        if dup_list_index in dup_list_removed:
+            continue
+        indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]]
+        hash_types = []
+        comp_vals = []
+        for i in indices:
+            hash_types.append(p_duplicate_full_list[i][0])
+            comp_vals.append(p_duplicate_full_list[i][2])
+            dup_list_removed.append(i)
+
+        hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
+        comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
+        new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals])
+
+    for dup_list in new_dup_list:
+        hash_type, path, simil_percent = dup_list
         p_duplicate_list.append(path)
         p_simil_list.append(simil_percent)
+        p_hashtype_list.append(hash_type)
 
     if content_range != 0:
         p_content = p_content[0:content_range]
 
-    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
+    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list)
 
 def get_date_range(num_day):
     curr_date = datetime.date.today()
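The new block first normalises every score to a percentage (TLSH distances via tlsh_to_percent, ssdeep values as-is), sorts descending, then merges rows that point at the same paste path so a duplicate caught by both hashers becomes a single row carrying both types and both scores. Roughly, with invented paths:

    p_duplicate_full_list = [['ssdeep', '/path/a.gz', 95],
                             ['tlsh',   '/path/a.gz', 85],
                             ['ssdeep', '/path/b.gz', 60]]
    # after the grouping pass above:
    # new_dup_list == [['[ssdeep, tlsh]', '/path/a.gz', '[95, 85]'],
    #                  ['ssdeep',         '/path/b.gz', '60']]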
show_saved_paste.html template:

@@ -43,16 +43,25 @@
     </div>
     <div class="panel-body" id="panel-body">
         {% if duplicate_list|length == 0 %}
-            <h4> No Duplicate </h4>
+            <h3> No Duplicate </h3>
         {% else %}
-            <h4> Duplicate list: </h4>
+            <h3> Duplicate list: </h3>
+            <table style="width:100%">
             {% set i = 0 %}
+            <tr>
+                <th style="text-align:left;">Hash type</th><th style="text-align:left;">Paste info</th>
+            </tr>
             {% for dup_path in duplicate_list %}
-                Similarity: {{ simil_list[i] }}% - <a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></br>
+                <tr>
+                    <td>{{ hashtype_list[i] }}</td>
+                    <td>Similarity: {{ simil_list[i] }}%</td>
+                    <td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
+                </tr>
                 {% set i = i + 1 %}
             {% endfor %}
+            </table>
         {% endif %}
-        <h4> Content: </h4>
+        <h3> Content: </h3>
         <p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
     </div>
 </div>
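A side note on the counter the table reuses: the `{% set i = 0 %}` / `{% set i = i + 1 %}` pattern predates this change, but under Jinja2's scoping rules an assignment made inside `{% for %}` does not carry over between iterations, so indexing parallel lists is usually done with loop.index0 instead. A sketch of the row written that way:

    {% for dup_path in duplicate_list %}
        <tr>
            <td>{{ hashtype_list[loop.index0] }}</td>
            <td>Similarity: {{ simil_list[loop.index0] }}%</td>
            <td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
        </tr>
    {% endfor %}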