Added two new versions of the duplicate module.

One saves the hashes as JSON on disk.
The other uses only LevelDB.
This commit is contained in:
Mokaddem 2016-07-18 15:50:41 +02:00
parent 14e9850dd6
commit 4f6813350b
4 changed files with 198 additions and 29 deletions

View file

@ -20,7 +20,6 @@ import json
import ssdeep import ssdeep
from packages import Paste from packages import Paste
from pubsublogger import publisher from pubsublogger import publisher
from pybloomfilter import BloomFilter
from Helper import Process from Helper import Process
@ -29,10 +28,10 @@ if __name__ == "__main__":
publisher.channel = "Script" publisher.channel = "Script"
config_section = 'Duplicates' config_section = 'Duplicates'
saved_dico_and_reload = 1 #min save_dico_and_reload = 1 #min
time_1 = time.time() time_1 = time.time()
flag_reload = True flag_reload_from_disk = True
flag_to_disk = False flag_write_to_disk = False
p = Process(config_section) p = Process(config_section)
@ -81,18 +80,16 @@ if __name__ == "__main__":
filedicopath_today = filedicopath filedicopath_today = filedicopath
# Save I/O # Save I/O
if time.time() - time_1 > saved_dico_and_reload*60: if time.time() - time_1 > save_dico_and_reload*60:
flag_to_disk = True flag_write_to_disk = True
if os.path.exists(filedicopath): if os.path.exists(filedicopath):
if flag_reload == True: if flag_reload_from_disk == True:
flag_reload = False flag_reload_from_disk = False
print 'Reloading' print 'Reloading'
time_1 = time.time()
with open(filedicopath, 'r') as fp: with open(filedicopath, 'r') as fp:
today_dico = json.load(fp) today_dico = json.load(fp)
else: else:
time_1 = time.time()
today_dico = {} today_dico = {}
with open(filedicopath, 'w') as fp: with open(filedicopath, 'w') as fp:
json.dump(today_dico, fp) json.dump(today_dico, fp)
@ -105,44 +102,47 @@ if __name__ == "__main__":
r_serv0 = dico_redis[yearly_index] r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index") r_serv0.incr("current_index")
index = r_serv0.get("current_index")+str(PST.p_date) index = r_serv0.get("current_index")+str(PST.p_date)
# HASHTABLES PER MONTH (because of r_serv1 changing db)
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# For each dico # For each dico
opened_dico = [] opened_dico = []
for dico in dico_path_set: for dico in dico_path_set:
# Opening dico # Opening dico
if dico == filedicopath_today: if dico == filedicopath_today:
opened_dico.append([dico, today_dico]) opened_dico.append([dico, today_dico])
else:
with open(dico, 'r') as fp: with open(dico, 'r') as fp:
opened_dico.append([dico, json.load(fp)]) opened_dico.append([dico, json.load(fp)])
#retrieve hash from paste #retrieve hash from paste
paste_hash = PST._get_p_hash() paste_hash = PST._get_p_hash()
# Adding the hash in Redis
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS", paste_hash)
# Go throught the Database of the dico (of the month) # Go throught the Database of the dico (of the month)
threshold_dup = 10 threshold_dup = 99
for dico_name, dico in opened_dico: for dico_name, dico in opened_dico:
for dico_key, dico_hash in dico.items(): for dico_key, dico_hash in dico.items():
percent = ssdeep.compare(dico_hash, paste_hash) percent = ssdeep.compare(dico_hash, paste_hash)
if percent > threshold_dup: if percent > threshold_dup:
db = dico_name[-6:] db = dico_name[-6:]
# Go throught the Database of the bloom filter (month) # Go throught the Database of the dico filter (month)
r_serv_dico = dico_redis[db] r_serv_dico = dico_redis[db]
# index of paste # index of paste
# FIXME Use r_serv_dico and do not consider only 1 server!! index_current = r_serv_dico.get(dico_hash)
index_current = r_serv1.get(dico_hash) paste_path = r_serv_dico.get(index_current)
paste_path = r_serv1.get(index_current)
if paste_path != None: if paste_path != None:
hash_dico[dico_hash] = (paste_path, percent) hash_dico[dico_hash] = (paste_path, percent)
print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent) #print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
print ' '+ PST.p_path[44:] +', '+ paste_path[44:] print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
# Add paste in DB to prevent its analyse twice
# HASHTABLES PER MONTH (because of r_serv1 changing db)
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# Adding the hash in Redis
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS", paste_hash)
##################### Similarity found ####################### ##################### Similarity found #######################
# if there is data in this dictionnary # if there is data in this dictionnary
@ -168,9 +168,11 @@ if __name__ == "__main__":
# Adding the hash in the dico of the month # Adding the hash in the dico of the month
today_dico[index] = paste_hash today_dico[index] = paste_hash
if flag_to_disk: if flag_write_to_disk:
flag_to_disk = False time_1 = time.time()
flag_reload = True flag_write_to_disk = False
flag_reload_from_disk = True
print 'writing'
with open(filedicopath, 'w') as fp: with open(filedicopath, 'w') as fp:
json.dump(today_dico, fp) json.dump(today_dico, fp)
except IOError: except IOError:

160
bin/Duplicate_ssdeep_v2.py Executable file
View file

@ -0,0 +1,160 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The Duplicate module
====================
This huge module is, in short, checking for duplicate pastes.
This one differs from v1 by using only redis, not JSON files on disk.
Requirements:
-------------
"""
import redis
import os
import time
from datetime import datetime, timedelta
import json
import ssdeep
from packages import Paste
from pubsublogger import publisher
from Helper import Process
def month_keys_in_range(reference_date, maximum_month_range):
    """Return the 'yyyymm' keys of the months to scan for duplicates.

    The list starts with the month of ``reference_date`` and goes back one
    calendar month at a time, so it holds exactly ``maximum_month_range``
    distinct, consecutive months (an empty list when the range is <= 0).

    Exact calendar arithmetic is used on purpose: stepping back by an
    "average month" of 30.4166666 days repeats a month and skips another
    one when the reference date is close to the end of a month.

    :param reference_date: object exposing integer ``year`` and ``month``
        attributes (e.g. a ``datetime``).
    :param maximum_month_range: number of months to look back, current
        month included.
    :return: list of 'yyyymm' strings, most recent month first.
    """
    # Count months from year 0 so that crossing a year boundary is plain
    # integer arithmetic.
    month_count = reference_date.year * 12 + (reference_date.month - 1)
    keys = []
    for offset in range(maximum_month_range):
        year, month_index = divmod(month_count - offset, 12)
        keys.append('{}{:02d}'.format(year, month_index + 1))
    return keys


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Duplicates'

    p = Process(config_section)

    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
    # The option may be absent from the configuration file; fall back to
    # "no minimum size" instead of failing at start-up.
    # NOTE(review): assumes p.config is a ConfigParser-like object - confirm.
    if p.config.has_option("Modules_Duplicates", "min_paste_size"):
        min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
    else:
        min_paste_size = 0.0

    # REDIS #
    # One connection per 'yyyymm' key; month 00 is the per-year database
    # that holds the "current_index" counter.
    # NOTE(review): the year is used as the port number - presumably one
    # server instance per year; confirm against the deployment scripts.
    dico_redis = {}
    date_today = datetime.today()
    for year in range(2013, date_today.year + 1):
        for month in range(0, 13):
            dico_redis[str(year) + str(month).zfill(2)] = redis.StrictRedis(
                host=p.config.get("Redis_Level_DB", "host"), port=year,
                db=month)

    # FUNCTIONS #
    publisher.info("Script duplicate started")

    while True:
        try:
            # ssdeep hash already stored -> (paste path, similarity percent)
            hash_dico = {}

            x = time.time()

            message = p.get_from_set()
            if message is None:
                publisher.debug("Script Duplicate is idling 10s")
                time.sleep(10)
                continue

            PST = Paste.Paste(message)

            # the paste is too small
            if PST._get_p_size() < min_paste_size:
                continue

            # Built up-front so that every log line below can use it,
            # whether or not a duplicate is found.
            to_print = 'Duplicate;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)

            PST._set_p_hash_kind("ssdeep")

            # Select the redis connection of the paste's own month.
            # NOTE(review): dico_redis keys are 'yyyymm' strings, so this
            # relies on p_date.year / p_date.month being strings that
            # concatenate to that form - confirm against packages.Paste.
            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]

            # Months ('yyyymm') whose stored hashes are compared.
            # NOTE(review): date_today is computed once at start-up, so a
            # long-running process keeps scanning the start-up window.
            dico_range_list = month_keys_in_range(date_today, maximum_month_range)

            # UNIQUE INDEX HASHS TABLE
            # Use the value returned by INCR: a separate GET could observe
            # an increment made by another process in between.
            yearly_index = str(date_today.year) + '00'
            r_serv0 = dico_redis[yearly_index]
            index = str(r_serv0.incr("current_index")) + str(PST.p_date)

            # retrieve hash from paste
            paste_hash = PST._get_p_hash()

            # Go through the database of every month in range
            for curr_dico_name in dico_range_list:
                r_serv_dico = dico_redis[curr_dico_name]
                for dico_hash in r_serv_dico.smembers('HASHS'):
                    # Only the comparison is guarded, so failures of the
                    # redis lookups below are not misreported as hash errors.
                    try:
                        percent = ssdeep.compare(dico_hash, paste_hash)
                    except Exception:
                        # ssdeep hash not comparable
                        print('ssdeep hash not comparable')
                        publisher.error('ssdeep hash not comparable')
                        continue

                    if percent <= threshold_duplicate:
                        continue

                    # hash -> index -> path of the paste already stored
                    index_current = r_serv_dico.get(dico_hash)
                    paste_path = r_serv_dico.get(index_current)
                    if paste_path is not None:
                        hash_dico[dico_hash] = (paste_path, percent)
                        print('comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent))

            # Add paste in DB after checking to prevent its analysis twice
            # hash_i -> index_i  AND  index_i -> PST.PATH
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)
            # Adding the hash in Redis
            r_serv1.set(paste_hash, index)
            r_serv1.sadd("HASHS", paste_hash)

            ##################### Similarity found #######################

            # if there is data in this dictionary
            if hash_dico:
                # list of (paste_path, percent) tuples
                dupl = list(hash_dico.values())

                # Creating the object attribute and save it.
                PST.__setattr__("p_duplicate", dupl)
                PST.save_attribute_redis("p_duplicate", dupl)
                publisher.info('{}Detected {}'.format(to_print, len(dupl)))

            y = time.time()

            publisher.debug('{}Processed in {} sec'.format(to_print, y - x))

        except IOError:
            to_print = 'Duplicate;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)
            print("CRC Checksum Failed on : " + str(PST.p_path))
            publisher.error('{}CRC Checksum Failed'.format(to_print))

View file

@ -105,7 +105,7 @@ function launching_scripts {
screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x' screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
sleep 0.1 sleep 0.1
screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x' screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
sleep 0.1 sleep 0.1
screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x' screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
sleep 0.1 sleep 0.1

View file

@ -4,6 +4,13 @@ pastes = PASTES
wordtrending_csv = var/www/static/csv/wordstrendingdata wordtrending_csv = var/www/static/csv/wordstrendingdata
wordsfile = files/wordfile wordsfile = files/wordfile
#### Modules ####
[Modules_Duplicates]
#Number of months to look back
maximum_month_range = 3
#ssdeep similarity score above which two pastes are considered duplicates.
threshold_duplicate = 50
##### Redis ##### ##### Redis #####
[Redis_Cache] [Redis_Cache]
host = localhost host = localhost