Fix Duplicates: save the list of duplicates on disk and prevent empty hash creation

This commit is contained in:
Terrtia 2018-05-15 23:28:47 +02:00
parent 225fe76c96
commit f66a528bc2
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
6 changed files with 52 additions and 31 deletions

View file

@ -158,6 +158,10 @@ if __name__ == "__main__":
# Adding hashes in Redis # Adding hashes in Redis
for hash_type, paste_hash in paste_hashes.items(): for hash_type, paste_hash in paste_hashes.items():
r_serv1.set(paste_hash, index) r_serv1.set(paste_hash, index)
# bad hash: an empty hash is reported and not added to the "HASHS_"+hash_type set
if paste_hash == '':
print('bad Hash: ' + hash_type)
else:
r_serv1.sadd("HASHS_"+hash_type, paste_hash) r_serv1.sadd("HASHS_"+hash_type, paste_hash)
##################### Similarity found ####################### ##################### Similarity found #######################
@ -174,10 +178,11 @@ if __name__ == "__main__":
if dupl != []: if dupl != []:
dupl = list(dupl) dupl = list(dupl)
PST.__setattr__("p_duplicate", dupl) PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl) PST.save_attribute_duplicate(dupl)
PST.save_others_pastes_attribute_duplicate("p_duplicate", dupl) PST.save_others_pastes_attribute_duplicate(dupl)
publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path)) publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_path))
print('{}Detected {}'.format(to_print, len(dupl))) print('{}Detected {}'.format(to_print, len(dupl)))
print('')
y = time.time() y = time.time()

View file

@ -110,8 +110,6 @@ function launching_scripts {
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "Duplicates" bash -c './Duplicates.py; read x' screen -S "Script_AIL" -X screen -t "Duplicates" bash -c './Duplicates.py; read x'
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "Attributes" bash -c './Attributes.py; read x'
sleep 0.1
screen -S "Script_AIL" -X screen -t "Lines" bash -c './Lines.py; read x' screen -S "Script_AIL" -X screen -t "Lines" bash -c './Lines.py; read x'
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x' screen -S "Script_AIL" -X screen -t "DomClassifier" bash -c './DomClassifier.py; read x'

View file

@ -76,6 +76,11 @@ class Paste(object):
port=cfg.getint("Redis_Data_Merging", "port"), port=cfg.getint("Redis_Data_Merging", "port"),
db=cfg.getint("Redis_Data_Merging", "db"), db=cfg.getint("Redis_Data_Merging", "db"),
decode_responses=True) decode_responses=True)
self.store_duplicate = redis.StrictRedis(
host=cfg.get("ARDB_Metadata", "host"),
port=cfg.getint("ARDB_Metadata", "port"),
db=cfg.getint("ARDB_Metadata", "db"),
decode_responses=True)
self.p_path = p_path self.p_path = p_path
self.p_name = os.path.basename(self.p_path) self.p_name = os.path.basename(self.p_path)
@ -272,9 +277,9 @@ class Paste(object):
return False, var return False, var
def _get_p_duplicate(self): def _get_p_duplicate(self):
self.p_duplicate = self.store.hget(self.p_path, "p_duplicate") self.p_duplicate = self.store_duplicate.smembers('dup:'+self.p_path)
if self.p_duplicate is not None: if self.p_duplicate is not None:
return self.p_duplicate return list(self.p_duplicate)
else: else:
return '[]' return '[]'
@ -323,27 +328,20 @@ class Paste(object):
else: else:
self.store.hset(self.p_path, attr_name, json.dumps(value)) self.store.hset(self.p_path, attr_name, json.dumps(value))
def save_others_pastes_attribute_duplicate(self, attr_name, list_value): def save_attribute_duplicate(self, value):
"""
Save each duplicate entry of this paste in the 'dup:'+p_path set
"""
for tuple in value:
self.store_duplicate.sadd('dup:'+self.p_path, tuple)
def save_others_pastes_attribute_duplicate(self, list_value):
""" """
Save a new duplicate on others pastes Save a new duplicate on others pastes
""" """
for hash_type, path, percent, date in list_value: for hash_type, path, percent, date in list_value:
#get json
json_duplicate = self.store.hget(path, attr_name)
#json save on redis
if json_duplicate is not None:
list_duplicate = (json.loads(json_duplicate))
# avoid duplicate, a paste can be sent by multiple modules
to_add = [hash_type, self.p_path, percent, date] to_add = [hash_type, self.p_path, percent, date]
if to_add not in list_duplicate: self.store_duplicate.sadd('dup:'+path,to_add)
list_duplicate.append(to_add)
self.store.hset(path, attr_name, json.dumps(list_duplicate))
else:
# create the new list
list_duplicate = [[hash_type, self.p_path, percent, date]]
self.store.hset(path, attr_name, json.dumps(list_duplicate))
def _get_from_redis(self, r_serv): def _get_from_redis(self, r_serv):
ans = {} ans = {}

View file

@ -152,6 +152,11 @@ host = localhost
port = 6382 port = 6382
db = 6 db = 6
[ARDB_Metadata]
host = localhost
port = 6382
db = 7
[Url] [Url]
cc_critical = DE cc_critical = DE

View file

@ -65,6 +65,18 @@ r_serv_pasteName = redis.StrictRedis(
db=cfg.getint("Redis_Paste_Name", "db"), db=cfg.getint("Redis_Paste_Name", "db"),
decode_responses=True) decode_responses=True)
r_serv_tags = redis.StrictRedis(
host=cfg.get("ARDB_Tags", "host"),
port=cfg.getint("ARDB_Tags", "port"),
db=cfg.getint("ARDB_Tags", "db"),
decode_responses=True)
r_serv_metadata = redis.StrictRedis(
host=cfg.get("ARDB_Metadata", "host"),
port=cfg.getint("ARDB_Metadata", "port"),
db=cfg.getint("ARDB_Metadata", "db"),
decode_responses=True)
# VARIABLES # # VARIABLES #
max_preview_char = int(cfg.get("Flask", "max_preview_char")) # Maximum number of characters to display in the tooltip max_preview_char = int(cfg.get("Flask", "max_preview_char")) # Maximum number of characters to display in the tooltip
max_preview_modal = int(cfg.get("Flask", "max_preview_modal")) # Maximum number of characters to display in the modal max_preview_modal = int(cfg.get("Flask", "max_preview_modal")) # Maximum number of characters to display in the modal

View file

@ -18,6 +18,7 @@ import Flask_config
app = Flask_config.app app = Flask_config.app
cfg = Flask_config.cfg cfg = Flask_config.cfg
r_serv_pasteName = Flask_config.r_serv_pasteName r_serv_pasteName = Flask_config.r_serv_pasteName
r_serv_metadata = Flask_config.r_serv_metadata
max_preview_char = Flask_config.max_preview_char max_preview_char = Flask_config.max_preview_char
max_preview_modal = Flask_config.max_preview_modal max_preview_modal = Flask_config.max_preview_modal
DiffMaxLineLength = Flask_config.DiffMaxLineLength DiffMaxLineLength = Flask_config.DiffMaxLineLength
@ -38,20 +39,22 @@ def showpaste(content_range):
p_mime = paste.p_mime p_mime = paste.p_mime
p_lineinfo = paste.get_lines_info() p_lineinfo = paste.get_lines_info()
p_content = paste.get_p_content() p_content = paste.get_p_content()
p_duplicate_full_list = json.loads(paste._get_p_duplicate()) p_duplicate_str_full_list = paste._get_p_duplicate()
p_duplicate_full_list = []
p_duplicate_list = [] p_duplicate_list = []
p_simil_list = [] p_simil_list = []
p_date_list = [] p_date_list = []
p_hashtype_list = [] p_hashtype_list = []
for dup_list in p_duplicate_full_list: for dup_list in p_duplicate_str_full_list:
dup_list = dup_list[1:-1].replace('\'', '').replace(' ', '').split(',')
if dup_list[0] == "tlsh": if dup_list[0] == "tlsh":
dup_list[2] = 100 - int(dup_list[2]) dup_list[2] = 100 - int(dup_list[2])
else: else:
print('dup_list')
print(dup_list)
dup_list[2] = int(dup_list[2]) dup_list[2] = int(dup_list[2])
p_duplicate_full_list.append(dup_list)
#p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True) #p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
@ -69,8 +72,8 @@ def showpaste(content_range):
comp_vals.append(p_duplicate_full_list[i][2]) comp_vals.append(p_duplicate_full_list[i][2])
dup_list_removed.append(i) dup_list_removed.append(i)
hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types) #hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals) #comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
if len(p_duplicate_full_list[dup_list_index]) > 3: if len(p_duplicate_full_list[dup_list_index]) > 3:
try: try:
@ -80,7 +83,7 @@ def showpaste(content_range):
date_paste = str(p_duplicate_full_list[dup_list_index][3]) date_paste = str(p_duplicate_full_list[dup_list_index][3])
else: else:
date_paste = "No date available" date_paste = "No date available"
new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste]) new_dup_list.append([hash_types, p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste])
# Create the list to pass to the webpage # Create the list to pass to the webpage
for dup_list in new_dup_list: for dup_list in new_dup_list: