ail-framework/bin/Duplicate_ssdeep.py
Mokaddem 4f6813350b Added two new version of duplicate module.
One with hashes are saved in json on disk
The other with only leveldb
2016-07-18 15:50:41 +02:00

182 lines
6.3 KiB
Python
Executable file

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The Duplicate module
====================
This huge module is, in short term, checking duplicates.
Requirements:
-------------
"""
import redis
import os
import time
import datetime
import json
import ssdeep
from packages import Paste
from pubsublogger import publisher
from Helper import Process
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Duplicates'
save_dico_and_reload = 1 #min
time_1 = time.time()
flag_reload_from_disk = True
flag_write_to_disk = False
p = Process(config_section)
# REDIS #
# DB OBJECT & HASHS ( DISK )
# FIXME increase flexibility
dico_redis = {}
for year in xrange(2013, datetime.date.today().year+1):
for month in xrange(0, 16):
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
host=p.config.get("Redis_Level_DB", "host"), port=year,
db=month)
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
# FUNCTIONS #
publisher.info("Script duplicate started")
dicopath = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "dicofilters"))
dico_path_set = set()
while True:
try:
hash_dico = {}
dupl = []
x = time.time()
message = p.get_from_set()
if message is not None:
path = message
PST = Paste.Paste(path)
else:
publisher.debug("Script Attribute is idling 10s")
time.sleep(10)
continue
PST._set_p_hash_kind("ssdeep")
# Assignate the correct redis connexion
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
# Creating the dicor name: dicoyyyymm
filedicopath = os.path.join(dicopath, 'dico' + PST.p_date.year +
PST.p_date.month)
filedicopath_today = filedicopath
# Save I/O
if time.time() - time_1 > save_dico_and_reload*60:
flag_write_to_disk = True
if os.path.exists(filedicopath):
if flag_reload_from_disk == True:
flag_reload_from_disk = False
print 'Reloading'
with open(filedicopath, 'r') as fp:
today_dico = json.load(fp)
else:
today_dico = {}
with open(filedicopath, 'w') as fp:
json.dump(today_dico, fp)
# For now, just use monthly dico
dico_path_set.add(filedicopath)
# UNIQUE INDEX HASHS TABLE
yearly_index = str(datetime.date.today().year)+'00'
r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index")
index = r_serv0.get("current_index")+str(PST.p_date)
# For each dico
opened_dico = []
for dico in dico_path_set:
# Opening dico
if dico == filedicopath_today:
opened_dico.append([dico, today_dico])
else:
with open(dico, 'r') as fp:
opened_dico.append([dico, json.load(fp)])
#retrieve hash from paste
paste_hash = PST._get_p_hash()
# Go throught the Database of the dico (of the month)
threshold_dup = 99
for dico_name, dico in opened_dico:
for dico_key, dico_hash in dico.items():
percent = ssdeep.compare(dico_hash, paste_hash)
if percent > threshold_dup:
db = dico_name[-6:]
# Go throught the Database of the dico filter (month)
r_serv_dico = dico_redis[db]
# index of paste
index_current = r_serv_dico.get(dico_hash)
paste_path = r_serv_dico.get(index_current)
if paste_path != None:
hash_dico[dico_hash] = (paste_path, percent)
#print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
# Add paste in DB to prevent its analyse twice
# HASHTABLES PER MONTH (because of r_serv1 changing db)
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# Adding the hash in Redis
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS", paste_hash)
##################### Similarity found #######################
# if there is data in this dictionnary
if len(hash_dico) != 0:
for dico_hash, paste_tuple in hash_dico.items():
paste_path, percent = paste_tuple
dupl.append((paste_path, percent))
# Creating the object attribute and save it.
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
if dupl != []:
PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl)
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
print '{}Detected {}'.format(to_print, len(dupl))
y = time.time()
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
# Adding the hash in the dico of the month
today_dico[index] = paste_hash
if flag_write_to_disk:
time_1 = time.time()
flag_write_to_disk = False
flag_reload_from_disk = True
print 'writing'
with open(filedicopath, 'w') as fp:
json.dump(today_dico, fp)
except IOError:
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
print "CRC Checksum Failed on :", PST.p_path
publisher.error('{}CRC Checksum Failed'.format(to_print))