ail-framework/bin/Duplicate_ssdeep_v2.py

162 lines
6 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The Duplicate module
====================
This huge module is, in short term, checking duplicates.
Its input comes from other modules, namely:
Credential, CreditCard, Keys, Mails and Phone
This one differ from v1 by only using redis and not json file stored on disk
Requirements:
-------------
"""
import redis
import os
import time
from datetime import datetime, timedelta
import json
import ssdeep
from packages import Paste
from pubsublogger import publisher
from Helper import Process
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Duplicates'
p = Process(config_section)
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
# REDIS #
dico_redis = {}
date_today = datetime.today()
for year in xrange(2013, date_today.year+1):
for month in xrange(0, 13):
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
host=p.config.get("Redis_Level_DB", "host"), port=year,
db=month)
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
# FUNCTIONS #
publisher.info("Script duplicate started")
while True:
try:
hash_dico = {}
dupl = []
dico_range_list = []
x = time.time()
message = p.get_from_set()
if message is not None:
path = message
PST = Paste.Paste(path)
else:
publisher.debug("Script Attribute is idling 10s")
time.sleep(10)
continue
# the paste is too small
if (PST._get_p_size() < min_paste_size):
continue
PST._set_p_hash_kind("ssdeep")
# Assignate the correct redis connexion
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
# Creating the dico name: yyyymm
# Get the date of the range
date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
for diff_month in xrange(0, num_of_month+1):
curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
dico_range_list.append(to_append)
# Use all dico in range
dico_range_list = dico_range_list[0:maximum_month_range]
# UNIQUE INDEX HASHS TABLE
yearly_index = str(date_today.year)+'00'
r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index")
index = r_serv0.get("current_index")+str(PST.p_date)
# Open selected dico range
opened_dico = []
for dico_name in dico_range_list:
opened_dico.append([dico_name, dico_redis[dico_name]])
# retrieve hash from paste
paste_hash = PST._get_p_hash()
# Go throught the Database of the dico (of the month)
for curr_dico_name, curr_dico_redis in opened_dico:
for dico_hash in curr_dico_redis.smembers('HASHS'):
try:
percent = ssdeep.compare(dico_hash, paste_hash)
if percent > threshold_duplicate:
# Go throught the Database of the dico filter (month)
r_serv_dico = dico_redis[curr_dico_name]
# index of paste
index_current = r_serv_dico.get(dico_hash)
paste_path = r_serv_dico.get(index_current)
if paste_path != None:
hash_dico[dico_hash] = (paste_path, percent)
#print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
except:
# ssdeep hash not comparable
print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
curr_dico_redis.srem('HASHS', dico_hash)
# Add paste in DB after checking to prevent its analysis twice
# hash_i -> index_i AND index_i -> PST.PATH
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# Adding the hash in Redis
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS", paste_hash)
##################### Similarity found #######################
# if there is data in this dictionnary
if len(hash_dico) != 0:
# paste_tuple = (paste_path, percent)
for dico_hash, paste_tuple in hash_dico.items():
dupl.append(paste_tuple)
# Creating the object attribute and save it.
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
if dupl != []:
PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl)
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
print '{}Detected {}'.format(to_print, len(dupl))
y = time.time()
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
#print '{}Processed in {} sec'.format(to_print, y-x)
except IOError:
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)
print "CRC Checksum Failed on :", PST.p_path
publisher.error('{}CRC Checksum Failed'.format(to_print))