From 60512a30a2f3bab9a84ce11973d125442dcf2c9a Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Mon, 8 Aug 2016 15:28:26 +0200 Subject: [PATCH] Fixed bug in Duplicate (The comparison value was not saved correctly in redis) + Modified the progression detection algo --- bin/Duplicate_ssdeep_v2.py | 1 + bin/WebStats.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py index a2ab55aa..2eedda96 100755 --- a/bin/Duplicate_ssdeep_v2.py +++ b/bin/Duplicate_ssdeep_v2.py @@ -120,6 +120,7 @@ if __name__ == "__main__": percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash) threshold_duplicate = threshold_set[hash_type] if percent < threshold_duplicate: + percent = 100 - percent if hash_type == 'ssdeep' else percent #recover the correct percent value for ssdeep # Go throught the Database of the dico filter (month) r_serv_dico = dico_redis[curr_dico_name] diff --git a/bin/WebStats.py b/bin/WebStats.py index aea75a81..c6507542 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -49,15 +49,24 @@ def compute_progression(server, field_name, num_day, url_parsed): keyword = url_parsed[field_name] if keyword is not None: date_range = get_date_range(num_day) + # check if this keyword is eligible for progression keyword_total_sum = 0 value_list = [] - for date in date_range: + for date in date_range: # get value up to date_range curr_value = server.hget(keyword, date) value_list.append(int(curr_value if curr_value is not None else 0)) keyword_total_sum += int(curr_value) if curr_value is not None else 0 oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division - keyword_increase = value_list[0] / oldest_value + + # The progression is based on the ratio: value[i] / value[i-1] + keyword_increase = 0 + value_list_reversed = value_list[:] + value_list_reversed.reverse() + for i in range(1, len(value_list_reversed)): +
divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1 + keyword_increase += value_list_reversed[i] / divisor + # filter if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase): @@ -66,7 +75,7 @@ def compute_progression(server, field_name, num_day, url_parsed): server.hset(redis_progression_name, keyword, keyword_increase) #update its value elif (server.scard(redis_progression_name_set) < max_set_cardinality): - server.sadd(redis_progression_name_set, keyword) + server.sadd(redis_progression_name_set, keyword) else: #not in the set #Check value for all members