From 56b6659d8be27cc7785df7e186ebfa4dced5f6e0 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Wed, 13 Jul 2016 08:59:48 +0200 Subject: [PATCH 01/14] Commented out get_language because it adds too much overhead --- bin/Attribute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/Attribute.py b/bin/Attribute.py index 46d80858..a7f78696 100755 --- a/bin/Attribute.py +++ b/bin/Attribute.py @@ -51,12 +51,13 @@ if __name__ == "__main__": PST = Paste.Paste(message) else: publisher.debug("Script Attribute is idling 1s") + print 'sleeping' time.sleep(1) continue # FIXME do it directly in the class PST.save_attribute_redis("p_encoding", PST._get_p_encoding()) - PST.save_attribute_redis("p_language", PST._get_p_language()) + #PST.save_attribute_redis("p_language", PST._get_p_language()) # FIXME why not all saving everything there. PST.save_all_attributes_redis() # FIXME Not used. From 594d2def359e055f1c8d94dc17ab33eec295bbe8 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Wed, 13 Jul 2016 15:57:33 +0200 Subject: [PATCH 02/14] In index: Added number of processed pastes chart --- bin/Global.py | 9 +++++ var/www/Flask_server.py | 3 +- var/www/static/js/indexjavascript.js | 60 ++++++++++++++++++++++++++++ var/www/templates/index.html | 15 +++++-- 4 files changed, 83 insertions(+), 4 deletions(-) diff --git a/bin/Global.py b/bin/Global.py index fb44c70b..8b6e482f 100755 --- a/bin/Global.py +++ b/bin/Global.py @@ -31,6 +31,8 @@ from Helper import Process if __name__ == '__main__': publisher.port = 6380 publisher.channel = 'Script' + processed_paste = 0 + time_1 = time.time() config_section = 'Global' @@ -54,6 +56,12 @@ if __name__ == '__main__': continue else: print "Empty Queues: Waiting..." + if int(time.time() - time_1) > 30: + to_print = 'Global; ; ; ;glob Processed {0} paste(s)'.format(processed_paste) + print to_print + publisher.info(to_print) + time_1 = time.time() + processed_paste = 0 time.sleep(1) continue # Creating the full filepath @@ -66,3 +74,4 @@ if __name__ == '__main__': with open(filename, 'wb') as f: f.write(base64.standard_b64decode(gzip64encoded)) p.populate_set_out(filename) + processed_paste+=1 diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 018608f1..36fcfbcc 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -122,7 +122,8 @@ def search(): @app.route("/") def index(): - return render_template("index.html") + default_minute = cfg.get("Flask", "minute_processed_paste") + return render_template("index.html", default_minute = default_minute) @app.route("/monitoring/") diff --git a/var/www/static/js/indexjavascript.js b/var/www/static/js/indexjavascript.js index 7eb7e5c5..ef9bf0c6 100644 --- a/var/www/static/js/indexjavascript.js +++ b/var/www/static/js/indexjavascript.js @@ -1,3 +1,54 @@ +// Plot and update the number of processed pastes +$(function() { + var data = []; + var totalPoints = 60*10; //60s*10m + var curr_max = 0; + + function getData() { + if (data.length > 0){ + curr_max = curr_max == data[0] ? Math.max.apply(null, data) : curr_max; + data = data.slice(1); + } + + while (data.length < totalPoints) { + var y = (typeof window.paste_num_tabvar !== "undefined") ? window.paste_num_tabvar : 0; + curr_max = curr_max < y ? 
y : curr_max; + data.push(y); + } + + // Zip the generated y values with the x values + var res = []; + for (var i = 0; i < data.length; ++i) { + res.push([i, data[i]]) + } + return res; + } + + var updateInterval = 1000; + var options = { + series: { shadowSize: 1 }, + lines: { fill: true, fillColor: { colors: [ { opacity: 1 }, { opacity: 0.1 } ] }}, + yaxis: { min: 0, max: 40 }, + xaxis: { show: false }, + colors: ["#F4A506"], + grid: { + tickColor: "#dddddd", + borderWidth: 0 + }, + }; + var plot = $.plot("#realtimechart", [ getData() ], options); + + function update() { +console.log(curr_max); + plot.setData([getData()]); + plot.getOptions().yaxes[0].max = curr_max; + plot.setupGrid(); + plot.draw(); + setTimeout(update, updateInterval); + } + update(); +}); + function initfunc( csvay, scroot) { window.csv = csvay; window.scroot = scroot; @@ -38,6 +89,13 @@ function create_log_table(obj_json) { var chansplit = obj_json.channel.split('.'); var parsedmess = obj_json.data.split(';'); + if (parsedmess[0] == "Global"){ + var paste_processed = parsedmess[4].split(" ")[2]; + console.log(paste_processed) + window.paste_num_tabvar = paste_processed; + return; + } + if( chansplit[1] == "INFO" ){ tr.className = "info"; } @@ -270,3 +328,5 @@ $(document).ready(function () { } }); + + diff --git a/var/www/templates/index.html b/var/www/templates/index.html index eb34aaf0..551c56b8 100644 --- a/var/www/templates/index.html +++ b/var/www/templates/index.html @@ -14,9 +14,10 @@ - + + - From 60552bca4df0bfad8c095808204fa5c9ee3099ed Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 14 Jul 2016 10:31:47 +0200 Subject: [PATCH 03/14] Fixed a bug in processed_pastes graph --- var/www/static/js/indexjavascript.js | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/var/www/static/js/indexjavascript.js b/var/www/static/js/indexjavascript.js index ef9bf0c6..628ffe86 100644 --- a/var/www/static/js/indexjavascript.js +++ b/var/www/static/js/indexjavascript.js @@ -6,16 +6,16 @@ $(function() { function getData() { if (data.length > 0){ - curr_max = curr_max == data[0] ? Math.max.apply(null, data) : curr_max; - data = data.slice(1); + var data_old = data[0]; + data = data.slice(1); + curr_max = curr_max == data_old ? Math.max.apply(null, data) : curr_max; } while (data.length < totalPoints) { - var y = (typeof window.paste_num_tabvar !== "undefined") ? window.paste_num_tabvar : 0; - curr_max = curr_max < y ? y : curr_max; - data.push(y); + var y = (typeof window.paste_num_tabvar !== "undefined") ? parseInt(window.paste_num_tabvar) : 0; + curr_max = y > curr_max ? 
y : curr_max; + data.push(y); } - // Zip the generated y values with the x values var res = []; for (var i = 0; i < data.length; ++i) { @@ -29,17 +29,15 @@ $(function() { series: { shadowSize: 1 }, lines: { fill: true, fillColor: { colors: [ { opacity: 1 }, { opacity: 0.1 } ] }}, yaxis: { min: 0, max: 40 }, - xaxis: { show: false }, - colors: ["#F4A506"], + colors: ["#a971ff"], grid: { - tickColor: "#dddddd", - borderWidth: 0 + tickColor: "#dddddd", + borderWidth: 0 }, }; var plot = $.plot("#realtimechart", [ getData() ], options); function update() { -console.log(curr_max); plot.setData([getData()]); plot.getOptions().yaxes[0].max = curr_max; plot.setupGrid(); @@ -91,7 +89,6 @@ function create_log_table(obj_json) { if (parsedmess[0] == "Global"){ var paste_processed = parsedmess[4].split(" ")[2]; - console.log(paste_processed) window.paste_num_tabvar = paste_processed; return; } From 0332f23579ae7dee01c7f42db72aff070c8aa019 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Fri, 15 Jul 2016 08:56:16 +0200 Subject: [PATCH 04/14] Added SimHash library --- bin/Duplicate.py | 7 ++++--- bin/packages/Hash.py | 4 ++++ pip_packages_requirement.txt | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/Duplicate.py b/bin/Duplicate.py index a7a41dc1..59610f83 100755 --- a/bin/Duplicate.py +++ b/bin/Duplicate.py @@ -74,9 +74,9 @@ if __name__ == "__main__": # Creating the bloom filter name: bloomyyyymm filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year + PST.p_date.month) - if os.path.exists(filebloompath): bloom = BloomFilter.open(filebloompath) + bloop_path_set.add(filebloompath) else: bloom = BloomFilter(100000000, 0.01, filebloompath) bloop_path_set.add(filebloompath) @@ -94,7 +94,6 @@ if __name__ == "__main__": for bloo in bloop_path_set: # Opening blooms opened_bloom.append(BloomFilter.open(bloo)) - # For each hash of the paste for line_hash in PST._get_hash_lines(min=5, start=1, jump=0): nb_hash_current += 1 @@ -105,7 +104,6 @@ if __name__ == "__main__": r_serv1.sadd("HASHS", line_hash) # Adding the hash in the bloom of the month bloom.add(line_hash) - # Go throught the Database of the bloom filter (of the month) for bloo in opened_bloom: if line_hash in bloo: @@ -148,6 +146,8 @@ if __name__ == "__main__": percentage = round((count/float(nb_hash_current))*100, 2) if percentage >= 50: dupl.append((paste, percentage)) + else: + print 'percentage: ' + str(percentage) # Creating the object attribute and save it. 
to_print = 'Duplicate;{};{};{};'.format( @@ -156,6 +156,7 @@ if __name__ == "__main__": PST.__setattr__("p_duplicate", dupl) PST.save_attribute_redis("p_duplicate", dupl) publisher.info('{}Detected {}'.format(to_print, len(dupl))) + print '{}Detected {}'.format(to_print, len(dupl)) y = time.time() diff --git a/bin/packages/Hash.py b/bin/packages/Hash.py index f8dcac0f..d46abcba 100644 --- a/bin/packages/Hash.py +++ b/bin/packages/Hash.py @@ -1,6 +1,7 @@ import hashlib import crcmod import mmh3 +import simhash class Hash(object): @@ -32,4 +33,7 @@ class Hash(object): elif self.name == "murmur": hash = mmh3.hash(string) + elif self.name == "simhash": + hash = Simhash(string) + return hash diff --git a/pip_packages_requirement.txt b/pip_packages_requirement.txt index 40dcda8e..db2f23c5 100644 --- a/pip_packages_requirement.txt +++ b/pip_packages_requirement.txt @@ -17,6 +17,7 @@ nltk # Hashlib crcmod mmh3 +simhash #Others python-magic From 14e9850dd6d1f87a78276dcf317b9f0f3605960b Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Fri, 15 Jul 2016 16:58:48 +0200 Subject: [PATCH 05/14] Added new module for Duplicate paste. Seems working but has some small bug (re-check same paste twice) --- bin/Duplicate_ssdeep.py | 180 ++++++++++++++++++++++++++++++++++++++++ bin/LAUNCH.sh | 2 +- bin/packages/Hash.py | 6 +- 3 files changed, 184 insertions(+), 4 deletions(-) create mode 100755 bin/Duplicate_ssdeep.py diff --git a/bin/Duplicate_ssdeep.py b/bin/Duplicate_ssdeep.py new file mode 100755 index 00000000..916bc0ba --- /dev/null +++ b/bin/Duplicate_ssdeep.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python2 +# -*-coding:UTF-8 -* + +""" +The Duplicate module +==================== + +This huge module is, in short term, checking duplicates. + +Requirements: +------------- + + +""" +import redis +import os +import time +import datetime +import json +import ssdeep +from packages import Paste +from pubsublogger import publisher +from pybloomfilter import BloomFilter + +from Helper import Process + +if __name__ == "__main__": + publisher.port = 6380 + publisher.channel = "Script" + + config_section = 'Duplicates' + saved_dico_and_reload = 1 #min + time_1 = time.time() + flag_reload = True + flag_to_disk = False + + p = Process(config_section) + + # REDIS # + # DB OBJECT & HASHS ( DISK ) + # FIXME increase flexibility + dico_redis = {} + for year in xrange(2013, datetime.date.today().year+1): + for month in xrange(0, 16): + dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis( + host=p.config.get("Redis_Level_DB", "host"), port=year, + db=month) + #print("dup: "+str(year)+str(month).zfill(2)+"\n") + + # FUNCTIONS # + publisher.info("Script duplicate started") + + dicopath = os.path.join(os.environ['AIL_HOME'], + p.config.get("Directories", "dicofilters")) + + dico_path_set = set() + while True: + try: + hash_dico = {} + dupl = [] + + x = time.time() + + message = p.get_from_set() + if message is not None: + path = message + PST = Paste.Paste(path) + else: + publisher.debug("Script Attribute is idling 10s") + time.sleep(10) + continue + + PST._set_p_hash_kind("ssdeep") + + # Assignate the correct redis connexion + r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] + + # Creating the dicor name: dicoyyyymm + filedicopath = os.path.join(dicopath, 'dico' + PST.p_date.year + + PST.p_date.month) + filedicopath_today = filedicopath + + # Save I/O + if time.time() - time_1 > saved_dico_and_reload*60: + flag_to_disk = True + + if os.path.exists(filedicopath): + if flag_reload == True: + flag_reload = False + print 
'Reloading' + time_1 = time.time() + with open(filedicopath, 'r') as fp: + today_dico = json.load(fp) + else: + time_1 = time.time() + today_dico = {} + with open(filedicopath, 'w') as fp: + json.dump(today_dico, fp) + + # For now, just use monthly dico + dico_path_set.add(filedicopath) + + # UNIQUE INDEX HASHS TABLE + yearly_index = str(datetime.date.today().year)+'00' + r_serv0 = dico_redis[yearly_index] + r_serv0.incr("current_index") + index = r_serv0.get("current_index")+str(PST.p_date) + # HASHTABLES PER MONTH (because of r_serv1 changing db) + r_serv1.set(index, PST.p_path) + r_serv1.sadd("INDEX", index) + # For each dico + opened_dico = [] + for dico in dico_path_set: + # Opening dico + if dico == filedicopath_today: + opened_dico.append([dico, today_dico]) + with open(dico, 'r') as fp: + opened_dico.append([dico, json.load(fp)]) + + + #retrieve hash from paste + paste_hash = PST._get_p_hash() + # Adding the hash in Redis + r_serv1.set(paste_hash, index) + r_serv1.sadd("HASHS", paste_hash) + # Go throught the Database of the dico (of the month) + threshold_dup = 10 + for dico_name, dico in opened_dico: + for dico_key, dico_hash in dico.items(): + percent = ssdeep.compare(dico_hash, paste_hash) + if percent > threshold_dup: + db = dico_name[-6:] + # Go throught the Database of the bloom filter (month) + r_serv_dico = dico_redis[db] + + # index of paste + # FIXME Use r_serv_dico and do not consider only 1 server!! + index_current = r_serv1.get(dico_hash) + paste_path = r_serv1.get(index_current) + if paste_path != None: + hash_dico[dico_hash] = (paste_path, percent) + + print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent) + print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + + ##################### Similarity found ####################### + + # if there is data in this dictionnary + if len(hash_dico) != 0: + for dico_hash, paste_tuple in hash_dico.items(): + paste_path, percent = paste_tuple + dupl.append((paste_path, percent)) + + # Creating the object attribute and save it. 
+ to_print = 'Duplicate;{};{};{};'.format( + PST.p_source, PST.p_date, PST.p_name) + if dupl != []: + PST.__setattr__("p_duplicate", dupl) + PST.save_attribute_redis("p_duplicate", dupl) + publisher.info('{}Detected {}'.format(to_print, len(dupl))) + print '{}Detected {}'.format(to_print, len(dupl)) + + y = time.time() + + publisher.debug('{}Processed in {} sec'.format(to_print, y-x)) + + + # Adding the hash in the dico of the month + today_dico[index] = paste_hash + + if flag_to_disk: + flag_to_disk = False + flag_reload = True + with open(filedicopath, 'w') as fp: + json.dump(today_dico, fp) + except IOError: + to_print = 'Duplicate;{};{};{};'.format( + PST.p_source, PST.p_date, PST.p_name) + print "CRC Checksum Failed on :", PST.p_path + publisher.error('{}CRC Checksum Failed'.format(to_print)) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index fc8c9ff1..86e155b1 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -105,7 +105,7 @@ function launching_scripts { screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x' sleep 0.1 - screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate.py; read x' + screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x' sleep 0.1 screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x' sleep 0.1 diff --git a/bin/packages/Hash.py b/bin/packages/Hash.py index d46abcba..2f34c5c7 100644 --- a/bin/packages/Hash.py +++ b/bin/packages/Hash.py @@ -1,7 +1,7 @@ import hashlib import crcmod import mmh3 -import simhash +import ssdeep class Hash(object): @@ -33,7 +33,7 @@ class Hash(object): elif self.name == "murmur": hash = mmh3.hash(string) - elif self.name == "simhash": - hash = Simhash(string) + elif self.name == "ssdeep": + hash = ssdeep.hash(string) return hash From 4f6813350b08c805fe9207af3ec8b4181f8685ea Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Mon, 18 Jul 2016 15:50:41 +0200 Subject: [PATCH 06/14] Added two new version of duplicate module. 
One with hashes are saved in json on disk The other with only leveldb --- bin/Duplicate_ssdeep.py | 58 ++++++------ bin/Duplicate_ssdeep_v2.py | 160 +++++++++++++++++++++++++++++++++ bin/LAUNCH.sh | 2 +- bin/packages/config.cfg.sample | 7 ++ 4 files changed, 198 insertions(+), 29 deletions(-) create mode 100755 bin/Duplicate_ssdeep_v2.py diff --git a/bin/Duplicate_ssdeep.py b/bin/Duplicate_ssdeep.py index 916bc0ba..1b173eca 100755 --- a/bin/Duplicate_ssdeep.py +++ b/bin/Duplicate_ssdeep.py @@ -20,7 +20,6 @@ import json import ssdeep from packages import Paste from pubsublogger import publisher -from pybloomfilter import BloomFilter from Helper import Process @@ -29,10 +28,10 @@ if __name__ == "__main__": publisher.channel = "Script" config_section = 'Duplicates' - saved_dico_and_reload = 1 #min + save_dico_and_reload = 1 #min time_1 = time.time() - flag_reload = True - flag_to_disk = False + flag_reload_from_disk = True + flag_write_to_disk = False p = Process(config_section) @@ -81,18 +80,16 @@ if __name__ == "__main__": filedicopath_today = filedicopath # Save I/O - if time.time() - time_1 > saved_dico_and_reload*60: - flag_to_disk = True + if time.time() - time_1 > save_dico_and_reload*60: + flag_write_to_disk = True if os.path.exists(filedicopath): - if flag_reload == True: - flag_reload = False + if flag_reload_from_disk == True: + flag_reload_from_disk = False print 'Reloading' - time_1 = time.time() with open(filedicopath, 'r') as fp: today_dico = json.load(fp) else: - time_1 = time.time() today_dico = {} with open(filedicopath, 'w') as fp: json.dump(today_dico, fp) @@ -105,44 +102,47 @@ if __name__ == "__main__": r_serv0 = dico_redis[yearly_index] r_serv0.incr("current_index") index = r_serv0.get("current_index")+str(PST.p_date) - # HASHTABLES PER MONTH (because of r_serv1 changing db) - r_serv1.set(index, PST.p_path) - r_serv1.sadd("INDEX", index) + # For each dico opened_dico = [] for dico in dico_path_set: # Opening dico if dico == filedicopath_today: opened_dico.append([dico, today_dico]) - with open(dico, 'r') as fp: - opened_dico.append([dico, json.load(fp)]) + else: + with open(dico, 'r') as fp: + opened_dico.append([dico, json.load(fp)]) #retrieve hash from paste paste_hash = PST._get_p_hash() - # Adding the hash in Redis - r_serv1.set(paste_hash, index) - r_serv1.sadd("HASHS", paste_hash) + # Go throught the Database of the dico (of the month) - threshold_dup = 10 + threshold_dup = 99 for dico_name, dico in opened_dico: for dico_key, dico_hash in dico.items(): percent = ssdeep.compare(dico_hash, paste_hash) if percent > threshold_dup: db = dico_name[-6:] - # Go throught the Database of the bloom filter (month) + # Go throught the Database of the dico filter (month) r_serv_dico = dico_redis[db] # index of paste - # FIXME Use r_serv_dico and do not consider only 1 server!! 
- index_current = r_serv1.get(dico_hash) - paste_path = r_serv1.get(index_current) + index_current = r_serv_dico.get(dico_hash) + paste_path = r_serv_dico.get(index_current) if paste_path != None: hash_dico[dico_hash] = (paste_path, percent) - print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent) - print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + #print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent) + print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent) + # Add paste in DB to prevent its analyse twice + # HASHTABLES PER MONTH (because of r_serv1 changing db) + r_serv1.set(index, PST.p_path) + r_serv1.sadd("INDEX", index) + # Adding the hash in Redis + r_serv1.set(paste_hash, index) + r_serv1.sadd("HASHS", paste_hash) ##################### Similarity found ####################### # if there is data in this dictionnary @@ -168,9 +168,11 @@ if __name__ == "__main__": # Adding the hash in the dico of the month today_dico[index] = paste_hash - if flag_to_disk: - flag_to_disk = False - flag_reload = True + if flag_write_to_disk: + time_1 = time.time() + flag_write_to_disk = False + flag_reload_from_disk = True + print 'writing' with open(filedicopath, 'w') as fp: json.dump(today_dico, fp) except IOError: diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py new file mode 100755 index 00000000..35874371 --- /dev/null +++ b/bin/Duplicate_ssdeep_v2.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python2 +# -*-coding:UTF-8 -* + +""" +The Duplicate module +==================== + +This huge module is, in short term, checking duplicates. + +This one differ from v1 by only using redis and not json file on disk + +Requirements: +------------- + + +""" +import redis +import os +import time +from datetime import datetime, timedelta +import json +import ssdeep +from packages import Paste +from pubsublogger import publisher + +from Helper import Process + +if __name__ == "__main__": + publisher.port = 6380 + publisher.channel = "Script" + + config_section = 'Duplicates' + + p = Process(config_section) + + maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range")) + threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate")) + min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) + + # REDIS # + dico_redis = {} + date_today = datetime.today() + for year in xrange(2013, date_today.year+1): + for month in xrange(0, 13): + dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis( + host=p.config.get("Redis_Level_DB", "host"), port=year, + db=month) + #print("dup: "+str(year)+str(month).zfill(2)+"\n") + + # FUNCTIONS # + publisher.info("Script duplicate started") + + while True: + try: + hash_dico = {} + dupl = [] + dico_range_list = [] + + x = time.time() + + message = p.get_from_set() + if message is not None: + path = message + PST = Paste.Paste(path) + else: + publisher.debug("Script Attribute is idling 10s") + time.sleep(10) + continue + + # the paste is too small + if (PST._get_p_size() < min_paste_size): + continue + + PST._set_p_hash_kind("ssdeep") + + # Assignate the correct redis connexion + r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] + + # Creating the dico name: yyyymm + # Get the date of the range + date_range = date_today - timedelta(days = maximum_month_range*30.4166666) + num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month) + for 
diff_month in xrange(0, num_of_month+1): + curr_date_range = date_today - timedelta(days = diff_month*30.4166666) + to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2) + dico_range_list.append(to_append) + + # Use all dico in range + dico_range_list = dico_range_list[0:maximum_month_range] + + # UNIQUE INDEX HASHS TABLE + yearly_index = str(date_today.year)+'00' + r_serv0 = dico_redis[yearly_index] + r_serv0.incr("current_index") + index = r_serv0.get("current_index")+str(PST.p_date) + + # Open selected dico range + opened_dico = [] + for dico_name in dico_range_list: + opened_dico.append([dico_name, dico_redis[dico_name]]) + + # retrieve hash from paste + paste_hash = PST._get_p_hash() + + # Go throught the Database of the dico (of the month) + for curr_dico_name, curr_dico_redis in opened_dico: + for dico_hash in curr_dico_redis.smembers('HASHS'): + try: + percent = ssdeep.compare(dico_hash, paste_hash) + if percent > threshold_duplicate: + # Go throught the Database of the dico filter (month) + r_serv_dico = dico_redis[curr_dico_name] + + # index of paste + index_current = r_serv_dico.get(dico_hash) + paste_path = r_serv_dico.get(index_current) + if paste_path != None: + hash_dico[dico_hash] = (paste_path, percent) + + print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent) + #print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent) + except: + # ssdeep hash not comparable + print 'ssdeep hash not comparable' + publisher.error('ssdeep hash not comparable') + + # Add paste in DB after checking to prevent its analysis twice + # hash_i -> index_i AND index_i -> PST.PATH + r_serv1.set(index, PST.p_path) + r_serv1.sadd("INDEX", index) + # Adding the hash in Redis + r_serv1.set(paste_hash, index) + r_serv1.sadd("HASHS", paste_hash) + ##################### Similarity found ####################### + + # if there is data in this dictionnary + if len(hash_dico) != 0: + # paste_tuple = (paste_path, percent) + for dico_hash, paste_tuple in hash_dico.items(): + dupl.append(paste_tuple) + + # Creating the object attribute and save it. 
+ to_print = 'Duplicate;{};{};{};'.format( + PST.p_source, PST.p_date, PST.p_name) + if dupl != []: + PST.__setattr__("p_duplicate", dupl) + PST.save_attribute_redis("p_duplicate", dupl) + publisher.info('{}Detected {}'.format(to_print, len(dupl))) + #print '{}Detected {}'.format(to_print, len(dupl)) + + y = time.time() + + publisher.debug('{}Processed in {} sec'.format(to_print, y-x)) + #print '{}Processed in {} sec'.format(to_print, y-x) + + except IOError: + to_print = 'Duplicate;{};{};{};'.format( + PST.p_source, PST.p_date, PST.p_name) + print "CRC Checksum Failed on :", PST.p_path + publisher.error('{}CRC Checksum Failed'.format(to_print)) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 86e155b1..d6706e1e 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -105,7 +105,7 @@ function launching_scripts { screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x' sleep 0.1 - screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x' + screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x' sleep 0.1 screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x' sleep 0.1 diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 6d07707c..b5f2c308 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -4,6 +4,13 @@ pastes = PASTES wordtrending_csv = var/www/static/csv/wordstrendingdata wordsfile = files/wordfile +#### Modules #### +[Modules_Duplicates] +#Number of month to look back +maximum_month_range = 3 +#The value where two pastes are considerate duplicate. +threshold_duplicate = 50 + ##### Redis ##### [Redis_Cache] host = localhost From 6805ed6488b87b3f887f56fcaf984dc46fed0089 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Mon, 18 Jul 2016 15:52:53 +0200 Subject: [PATCH 07/14] Added default config --- bin/packages/config.cfg.sample | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index b5f2c308..0d2abb79 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -10,6 +10,8 @@ wordsfile = files/wordfile maximum_month_range = 3 #The value where two pastes are considerate duplicate. threshold_duplicate = 50 +#Minimum size of the paste considered +min_paste_size = 0.3 ##### Redis ##### [Redis_Cache] From 996c0e02dea69d49334a58c9a6fc1ae81058fc2d Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Mon, 18 Jul 2016 16:22:33 +0200 Subject: [PATCH 08/14] Duplicate module takes its messages from other modules and no more from Global. 
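The detection modules touched below (Credential, CreditCard, Keys, Mail, Phone) now push the paste path into the Duplicates queue themselves, instead of Duplicates reading every paste straight from Global. A minimal sketch of that hand-off, reusing the Process helper seen in the diffs; the detection test itself is a placeholder, not code from the patch:

from Helper import Process

def looks_interesting(filepath):
    return True        # placeholder for the module's real detection logic

p = Process('Mail')                  # the module's own modules.cfg section
while True:
    filepath = p.get_from_set()      # paste path received from Global
    if filepath is None:
        continue
    if looks_interesting(filepath):
        # "publish = Redis_Duplicate" in modules.cfg routes this path
        # to the Duplicates module's input set
        p.populate_set_out(filepath)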
--- bin/Credential.py | 2 ++ bin/CreditCard.py | 2 ++ bin/Duplicate_ssdeep_v2.py | 9 +++++---- bin/Keys.py | 2 ++ bin/Mail.py | 2 ++ bin/Phone.py | 2 ++ bin/packages/modules.cfg | 7 ++++++- 7 files changed, 21 insertions(+), 5 deletions(-) diff --git a/bin/Credential.py b/bin/Credential.py index 3ac61faf..d81c9ff6 100755 --- a/bin/Credential.py +++ b/bin/Credential.py @@ -50,6 +50,8 @@ if __name__ == "__main__": if len(creds) > critical: print("========> Found more than 10 credentials in this file : {}".format(filepath)) publisher.warning(to_print) + #Send to duplicate + p.populate_set_out(filepath) if sites: print("=======> Probably on : {}".format(', '.join(sites))) else: diff --git a/bin/CreditCard.py b/bin/CreditCard.py index d4660c13..18703f4e 100755 --- a/bin/CreditCard.py +++ b/bin/CreditCard.py @@ -65,6 +65,8 @@ if __name__ == "__main__": if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid number(s)'.format( to_print, len(creditcard_set))) + #Send to duplicate + p.populate_set_out(filename) else: publisher.info('{}CreditCard related'.format(to_print)) else: diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py index 35874371..f6aaca4f 100755 --- a/bin/Duplicate_ssdeep_v2.py +++ b/bin/Duplicate_ssdeep_v2.py @@ -6,8 +6,10 @@ The Duplicate module ==================== This huge module is, in short term, checking duplicates. +Its input comes from other modules, namely: + Credential, CreditCard, Keys, Mails and Phone -This one differ from v1 by only using redis and not json file on disk +This one differ from v1 by only using redis and not json file stored on disk Requirements: ------------- @@ -117,8 +119,7 @@ if __name__ == "__main__": if paste_path != None: hash_dico[dico_hash] = (paste_path, percent) - print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent) - #print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent) + #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent) except: # ssdeep hash not comparable print 'ssdeep hash not comparable' @@ -146,7 +147,7 @@ if __name__ == "__main__": PST.__setattr__("p_duplicate", dupl) PST.save_attribute_redis("p_duplicate", dupl) publisher.info('{}Detected {}'.format(to_print, len(dupl))) - #print '{}Detected {}'.format(to_print, len(dupl)) + print '{}Detected {}'.format(to_print, len(dupl)) y = time.time() diff --git a/bin/Keys.py b/bin/Keys.py index 8058545b..9c44f60a 100755 --- a/bin/Keys.py +++ b/bin/Keys.py @@ -16,6 +16,8 @@ def search_gpg(message): content = paste.get_p_content() if '-----BEGIN PGP MESSAGE-----' in content: publisher.warning('{} has a PGP enc message'.format(paste.p_name)) + #Send to duplicate + p.populate_set_out(message) if __name__ == '__main__': diff --git a/bin/Mail.py b/bin/Mail.py index dd348ba6..964deb19 100755 --- a/bin/Mail.py +++ b/bin/Mail.py @@ -60,6 +60,8 @@ if __name__ == "__main__": MX_values[0]) if MX_values[0] > is_critical: publisher.warning(to_print) + #Send to duplicate + p.populate_set_out(filename) else: publisher.info(to_print) prec_filename = filename diff --git a/bin/Phone.py b/bin/Phone.py index 628f77c2..b53b079c 100755 --- a/bin/Phone.py +++ b/bin/Phone.py @@ -23,6 +23,8 @@ def search_phone(message): if len(results) > 4: print results publisher.warning('{} contains PID (phone numbers)'.format(paste.p_name)) + #Send to duplicate + p.populate_set_out(message) if __name__ == '__main__': # If you wish to use an other port of channel, do not 
forget to run a subscriber accordingly (see launch_logs.sh) diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 9d8d6637..5f087427 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -3,7 +3,7 @@ subscribe = ZMQ_Global publish = Redis_Global [Duplicates] -subscribe = Redis_Global +subscribe = Redis_Duplicate [Indexer] subscribe = Redis_Global @@ -31,9 +31,11 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Re [CreditCards] subscribe = Redis_CreditCards +publish = Redis_Duplicate [Mail] subscribe = Redis_Mail +publish = Redis_Duplicate [Onion] subscribe = Redis_Onion @@ -55,15 +57,18 @@ subscribe = Redis_Global [Credential] subscribe = Redis_Credential +publish = Redis_Duplicate [Cve] subscribe = Redis_Cve [Phone] subscribe = Redis_Global +publish = Redis_Duplicate [SourceCode] subscribe = Redis_SourceCode [Keys] subscribe = Redis_Global +publish = Redis_Duplicate From a6996c0b23e6e9bac1510f8fb71befdb3afb838b Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Tue, 19 Jul 2016 10:48:44 +0200 Subject: [PATCH 09/14] Added related functions and display of duplicated paste in search.py --- bin/packages/Paste.py | 5 +++++ var/www/Flask_server.py | 26 ++++++++++++++++++++++++- var/www/templates/show_saved_paste.html | 12 +++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index bedf36b0..172f0931 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -91,6 +91,7 @@ class Paste(object): self.p_langage = None self.p_nb_lines = None self.p_max_length_line = None + self.p_duplicate = None def get_p_content(self): """ @@ -277,6 +278,10 @@ class Paste(object): return True, var else: return False, var + + def _get_p_duplicate(self): + self.p_duplicate = self.store.hget(self.p_path, "p_duplicate") + return self.p_duplicate if self.p_duplicate is not None else [] def save_all_attributes_redis(self, key=None): """ diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 36fcfbcc..08ea0675 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -58,6 +58,21 @@ def list_len(s): return len(s) app.jinja_env.filters['list_len'] = list_len +def parseStringToList(the_string): + strList = "" + elemList = [] + for c in the_string: + if c != ']': + if c != '[' and c !=' ' and c != '"': + strList += c + else: + the_list = strList.split(',') + if len(the_list) == 2: + elemList.append(the_list) + elif len(the_list) > 1: + elemList.append(the_list[1:]) + strList = "" + return elemList def showpaste(content_range): requested_path = request.args.get('paste', '') @@ -71,10 +86,19 @@ def showpaste(content_range): p_mime = paste.p_mime p_lineinfo = paste.get_lines_info() p_content = paste.get_p_content().decode('utf-8', 'ignore') + p_duplicate_full_list = parseStringToList(paste._get_p_duplicate()) + p_duplicate_list = [] + p_simil_list = [] + + for dup_list in p_duplicate_full_list: + path, simil_percent = dup_list + p_duplicate_list.append(path) + p_simil_list.append(simil_percent) + if content_range != 0: p_content = p_content[0:content_range] - return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content)) + return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, 
initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list) @app.route("/_logs") diff --git a/var/www/templates/show_saved_paste.html b/var/www/templates/show_saved_paste.html index ce68465c..707786d7 100644 --- a/var/www/templates/show_saved_paste.html +++ b/var/www/templates/show_saved_paste.html @@ -42,7 +42,17 @@
-      Content:
+      {% if duplicate_list|length == 0 %}
+      No Duplicate
+      {% else %}
+      Duplicate list:
+      {% set i = 0 %}
+      {% for dup_path in duplicate_list %}
+      Similarity: {{ simil_list[i] }}% - {{ dup_path }}
+      {% set i = i + 1 %}
+      {% endfor %}
+      {% endif %}
+      Content:
       {{ content }}
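For context on the view changes in this patch: p_duplicate is written with save_attribute_redis("p_duplicate", dupl), so Flask gets it back from Redis as one flat string rather than a structured list, and parseStringToList() in Flask_server.py splits it back into (path, percent) pairs for the template. A rough, self-contained illustration; the stored value is made up, and ast.literal_eval only approximates what the helper does:

import ast

stored = '[["2016/07/19/example.gz", 87], ["2016/07/18/other.gz", 95]]'
pairs = ast.literal_eval(stored)       # rough stand-in for parseStringToList(stored)
duplicate_list = [path for path, percent in pairs]
simil_list = [percent for path, percent in pairs]
# show_saved_paste.html then renders "Similarity: <percent>% - <path>" per entry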
From 4bc84a2580a0203977da304722fbae0251dbdf6e Mon Sep 17 00:00:00 2001 From: mokaddem Date: Tue, 19 Jul 2016 16:49:57 +0200 Subject: [PATCH 10/14] Added dependency for flot chart --- var/www/update_thirdparty.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/var/www/update_thirdparty.sh b/var/www/update_thirdparty.sh index 563ee4c4..3c937cbb 100755 --- a/var/www/update_thirdparty.sh +++ b/var/www/update_thirdparty.sh @@ -29,6 +29,10 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/ wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js +#Ressource for graph +wget https://raw.githubusercontent.com/flot/flot/master/jquery.flot.js -O ./static/js/jquery.flot.js +wget https://raw.githubusercontent.com/flot/flot/master/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js + rm -rf ./static/js/plugins mv temp/${filename}/js/* ./static/js/ From 6f4bfeb4ef154bb262658f6e426452e5ec3e99e6 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 21 Jul 2016 14:45:41 +0200 Subject: [PATCH 11/14] restored deleted part of a failed merge-conflict --- var/www/static/js/indexjavascript.js | 42 ++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/var/www/static/js/indexjavascript.js b/var/www/static/js/indexjavascript.js index bd73522a..1219ffc3 100644 --- a/var/www/static/js/indexjavascript.js +++ b/var/www/static/js/indexjavascript.js @@ -6,6 +6,48 @@ $(function() { var curr_max = 0; function getData() { + if (data.length > 0){ + var data_old = data[0]; + data = data.slice(1); + curr_max = curr_max == data_old ? Math.max.apply(null, data) : curr_max; + } + + while (data.length < totalPoints) { + var y = (typeof window.paste_num_tabvar !== "undefined") ? parseInt(window.paste_num_tabvar) : 0; + curr_max = y > curr_max ? 
y : curr_max; + data.push(y); + } + // Zip the generated y values with the x values + var res = []; + for (var i = 0; i < data.length; ++i) { + res.push([i, data[i]]) + } + return res; + } + + var updateInterval = 1000; + var options = { + series: { shadowSize: 1 }, + lines: { fill: true, fillColor: { colors: [ { opacity: 1 }, { opacity: 0.1 } ] }}, + yaxis: { min: 0, max: 40 }, + colors: ["#a971ff"], + grid: { + tickColor: "#dddddd", + borderWidth: 0 + }, + }; + var plot = $.plot("#realtimechart", [ getData() ], options); + + function update() { + plot.setData([getData()]); + plot.getOptions().yaxes[0].max = curr_max; + plot.setupGrid(); + plot.draw(); + setTimeout(update, updateInterval); + } + update(); +}); + function initfunc( csvay, scroot) { window.csv = csvay; window.scroot = scroot; From 9a34a587cc1958a5792517730af41fba95991e57 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 21 Jul 2016 14:53:34 +0200 Subject: [PATCH 12/14] bug global_tabvar seems solved: Caused by a race condition, Just switched two functions execution --- var/www/static/js/indexjavascript.js | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/var/www/static/js/indexjavascript.js b/var/www/static/js/indexjavascript.js index 1219ffc3..7fd463f7 100644 --- a/var/www/static/js/indexjavascript.js +++ b/var/www/static/js/indexjavascript.js @@ -1,3 +1,17 @@ +function initfunc( csvay, scroot) { + window.csv = csvay; + window.scroot = scroot; +}; + +function update_values() { + $SCRIPT_ROOT = window.scroot ; + $.getJSON($SCRIPT_ROOT+"/_stuff", + function(data) { + window.glob_tabvar = data; + }); + }; + + // Plot and update the number of processed pastes $(function() { var data = []; From f125a6211513bf4f1db6997fdaba76df5a13b3e1 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Thu, 21 Jul 2016 15:32:07 +0200 Subject: [PATCH 13/14] Added ssdeep in pip_requirments --- pip_packages_requirement.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pip_packages_requirement.txt b/pip_packages_requirement.txt index db2f23c5..bd734175 100644 --- a/pip_packages_requirement.txt +++ b/pip_packages_requirement.txt @@ -17,7 +17,7 @@ nltk # Hashlib crcmod mmh3 -simhash +ssdeep #Others python-magic From c686f69ca67f3bfea99da68277f8dfd038321709 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Fri, 22 Jul 2016 10:04:58 +0200 Subject: [PATCH 14/14] Clean not compatible ssdeep hash encountered --- bin/Duplicate_ssdeep_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py index f6aaca4f..e8930c02 100755 --- a/bin/Duplicate_ssdeep_v2.py +++ b/bin/Duplicate_ssdeep_v2.py @@ -122,8 +122,8 @@ if __name__ == "__main__": #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent) except: # ssdeep hash not comparable - print 'ssdeep hash not comparable' - publisher.error('ssdeep hash not comparable') + print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash + curr_dico_redis.srem('HASHS', dico_hash) # Add paste in DB after checking to prevent its analysis twice # hash_i -> index_i AND index_i -> PST.PATH
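The last patch turns the bare error print around ssdeep.compare into a cleanup step: a stored hash that can no longer be compared is removed from the month's HASHS set so it does not keep failing on every later paste. A standalone sketch of that pattern; host, port and db are assumed values, and the sample content is made up:

import redis
import ssdeep

r = redis.StrictRedis(host='localhost', port=2016, db=7)   # one month DB, as in dico_redis
paste_hash = ssdeep.hash('some paste content')

for stored_hash in r.smembers('HASHS'):
    try:
        percent = ssdeep.compare(stored_hash, paste_hash)
    except Exception:
        # unreadable or legacy hash: drop it instead of failing on it forever
        print 'ssdeep hash not comparable, cleaning bad hash: ' + stored_hash
        r.srem('HASHS', stored_hash)
        continue
    # ...threshold check and duplicate bookkeeping continue as in Duplicate_ssdeep_v2.py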