diff --git a/bin/Credential.py b/bin/Credential.py
index 3ac61faf..d81c9ff6 100755
--- a/bin/Credential.py
+++ b/bin/Credential.py
@@ -50,6 +50,8 @@ if __name__ == "__main__":
             if len(creds) > critical:
                 print("========> Found more than 10 credentials in this file : {}".format(filepath))
                 publisher.warning(to_print)
+                #Send to duplicate
+                p.populate_set_out(filepath)
             if sites:
                 print("=======> Probably on : {}".format(', '.join(sites)))
         else:
diff --git a/bin/CreditCard.py b/bin/CreditCard.py
index d4660c13..18703f4e 100755
--- a/bin/CreditCard.py
+++ b/bin/CreditCard.py
@@ -65,6 +65,8 @@ if __name__ == "__main__":
             if (len(creditcard_set) > 0):
                 publisher.warning('{}Checked {} valid number(s)'.format(
                     to_print, len(creditcard_set)))
+                #Send to duplicate
+                p.populate_set_out(filename)
             else:
                 publisher.info('{}CreditCard related'.format(to_print))
         else:
diff --git a/bin/Duplicate.py b/bin/Duplicate.py
index a7a41dc1..59610f83 100755
--- a/bin/Duplicate.py
+++ b/bin/Duplicate.py
@@ -74,9 +74,9 @@ if __name__ == "__main__":
             # Creating the bloom filter name: bloomyyyymm
             filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
                                          PST.p_date.month)
-
             if os.path.exists(filebloompath):
                 bloom = BloomFilter.open(filebloompath)
+                bloop_path_set.add(filebloompath)
             else:
                 bloom = BloomFilter(100000000, 0.01, filebloompath)
                 bloop_path_set.add(filebloompath)
@@ -94,7 +94,6 @@ if __name__ == "__main__":
             for bloo in bloop_path_set:
                 # Opening blooms
                 opened_bloom.append(BloomFilter.open(bloo))
-
             # For each hash of the paste
             for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
                 nb_hash_current += 1
@@ -105,7 +104,6 @@ if __name__ == "__main__":
                     r_serv1.sadd("HASHS", line_hash)
                     # Adding the hash in the bloom of the month
                     bloom.add(line_hash)
-
                     # Go throught the Database of the bloom filter (of the month)
                     for bloo in opened_bloom:
                         if line_hash in bloo:
@@ -148,6 +146,8 @@ if __name__ == "__main__":
                 percentage = round((count/float(nb_hash_current))*100, 2)
                 if percentage >= 50:
                     dupl.append((paste, percentage))
+                else:
+                    print 'percentage: ' + str(percentage)
 
             # Creating the object attribute and save it.
             to_print = 'Duplicate;{};{};{};'.format(
@@ -156,6 +156,7 @@ if __name__ == "__main__":
                 PST.__setattr__("p_duplicate", dupl)
                 PST.save_attribute_redis("p_duplicate", dupl)
                 publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                print '{}Detected {}'.format(to_print, len(dupl))
 
             y = time.time()
 
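
The two new modules below replace the per-line bloom-filter matching of Duplicate.py with ssdeep fuzzy hashing: each paste gets a context-triggered piecewise hash, and two pastes are reported as duplicates once the comparison score crosses a threshold. A minimal sketch of the ssdeep calls the new code relies on (the sample strings are invented):

    import ssdeep

    # Two nearly identical payloads; ssdeep.hash() returns a fuzzy-hash string.
    hash_a = ssdeep.hash("user:password dump, site A, batch 1\n" * 50)
    hash_b = ssdeep.hash("user:password dump, site A, batch 1\n" * 49)

    # compare() returns an integer match score from 0 (unrelated) to 100 (identical).
    print(ssdeep.compare(hash_a, hash_b))
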
diff --git a/bin/Duplicate_ssdeep.py b/bin/Duplicate_ssdeep.py
new file mode 100755
index 00000000..1b173eca
--- /dev/null
+++ b/bin/Duplicate_ssdeep.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+This huge module is, in short term, checking duplicates.
+
+Requirements:
+-------------
+
+
+"""
+import redis
+import os
+import time
+import datetime
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+
+from Helper import Process
+
+if __name__ == "__main__":
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+    save_dico_and_reload = 1 #min
+    time_1 = time.time()
+    flag_reload_from_disk = True
+    flag_write_to_disk = False
+
+    p = Process(config_section)
+
+    # REDIS #
+    # DB OBJECT & HASHS ( DISK )
+    # FIXME increase flexibility
+    dico_redis = {}
+    for year in xrange(2013, datetime.date.today().year+1):
+        for month in xrange(0, 16):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+            #print("dup: "+str(year)+str(month).zfill(2)+"\n")
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    dicopath = os.path.join(os.environ['AIL_HOME'],
+                            p.config.get("Directories", "dicofilters"))
+
+    dico_path_set = set()
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Attribute is idling 10s")
+                time.sleep(10)
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Assignate the correct redis connexion
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Creating the dicor name: dicoyyyymm
+            filedicopath = os.path.join(dicopath, 'dico' + PST.p_date.year +
+                                        PST.p_date.month)
+            filedicopath_today = filedicopath
+
+            # Save I/O
+            if time.time() - time_1 > save_dico_and_reload*60:
+                flag_write_to_disk = True
+
+            if os.path.exists(filedicopath):
+                if flag_reload_from_disk == True:
+                    flag_reload_from_disk = False
+                    print 'Reloading'
+                    with open(filedicopath, 'r') as fp:
+                        today_dico = json.load(fp)
+            else:
+                today_dico = {}
+                with open(filedicopath, 'w') as fp:
+                    json.dump(today_dico, fp)
+
+            # For now, just use monthly dico
+            dico_path_set.add(filedicopath)
+
+            # UNIQUE INDEX HASHS TABLE
+            yearly_index = str(datetime.date.today().year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            r_serv0.incr("current_index")
+            index = r_serv0.get("current_index")+str(PST.p_date)
+
+            # For each dico
+            opened_dico = []
+            for dico in dico_path_set:
+                # Opening dico
+                if dico == filedicopath_today:
+                    opened_dico.append([dico, today_dico])
+                else:
+                    with open(dico, 'r') as fp:
+                        opened_dico.append([dico, json.load(fp)])
+
+
+            #retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+
+            # Go throught the Database of the dico (of the month)
+            threshold_dup = 99
+            for dico_name, dico in opened_dico:
+                for dico_key, dico_hash in dico.items():
+                    percent = ssdeep.compare(dico_hash, paste_hash)
+                    if percent > threshold_dup:
+                        db = dico_name[-6:]
+                        # Go throught the Database of the dico filter (month)
+                        r_serv_dico = dico_redis[db]
+
+                        # index of paste
+                        index_current = r_serv_dico.get(dico_hash)
+                        paste_path = r_serv_dico.get(index_current)
+                        if paste_path != None:
+                            hash_dico[dico_hash] = (paste_path, percent)
+
+                        #print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
+                        print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
+
+            # Add paste in DB to prevent its analyse twice
+            # HASHTABLES PER MONTH (because of r_serv1 changing db)
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+            ##################### Similarity found #######################
+
+            # if there is data in this dictionnary
+            if len(hash_dico) != 0:
+                for dico_hash, paste_tuple in hash_dico.items():
+                    paste_path, percent = paste_tuple
+                    dupl.append((paste_path, percent))
+
+            # Creating the object attribute and save it.
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            if dupl != []:
+                PST.__setattr__("p_duplicate", dupl)
+                PST.save_attribute_redis("p_duplicate", dupl)
+                publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                print '{}Detected {}'.format(to_print, len(dupl))
+
+            y = time.time()
+
+            publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+
+
+            # Adding the hash in the dico of the month
+            today_dico[index] = paste_hash
+
+            if flag_write_to_disk:
+                time_1 = time.time()
+                flag_write_to_disk = False
+                flag_reload_from_disk = True
+                print 'writing'
+                with open(filedicopath, 'w') as fp:
+                    json.dump(today_dico, fp)
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
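
Duplicate_ssdeep.py above keeps one JSON "dico" per month on disk and periodically reloads it; Duplicate_ssdeep_v2.py below drops the files and keeps everything in the per-month Redis/LevelDB databases. A hedged sketch of the key layout v2 maintains, with an invented index and paste path:

    import redis

    # One database per month; the port follows the [Redis_Level_DB] sample config.
    r_month = redis.StrictRedis(host='localhost', port=2016, db=5)

    index = '1234' + '2016-05-04'                   # "current_index" counter + paste date (illustrative)
    paste_path = 'pastes/source/2016/05/04/foo.gz'  # hypothetical path
    paste_hash = '<ssdeep digest of the paste>'     # hypothetical digest

    r_month.set(index, paste_path)      # index_i -> path_i
    r_month.sadd("INDEX", index)
    r_month.set(paste_hash, index)      # hash_i -> index_i
    r_month.sadd("HASHS", paste_hash)   # the set scanned for every new paste
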
diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py
new file mode 100755
index 00000000..e8930c02
--- /dev/null
+++ b/bin/Duplicate_ssdeep_v2.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+This huge module is, in short term, checking duplicates.
+Its input comes from other modules, namely:
+    Credential, CreditCard, Keys, Mails and Phone
+
+This one differ from v1 by only using redis and not json file stored on disk
+
+Requirements:
+-------------
+
+
+"""
+import redis
+import os
+import time
+from datetime import datetime, timedelta
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+
+from Helper import Process
+
+if __name__ == "__main__":
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+
+    p = Process(config_section)
+
+    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
+    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
+    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
+
+    # REDIS #
+    dico_redis = {}
+    date_today = datetime.today()
+    for year in xrange(2013, date_today.year+1):
+        for month in xrange(0, 13):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+            #print("dup: "+str(year)+str(month).zfill(2)+"\n")
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+            dico_range_list = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Attribute is idling 10s")
+                time.sleep(10)
+                continue
+
+            # the paste is too small
+            if (PST._get_p_size() < min_paste_size):
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Assignate the correct redis connexion
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Creating the dico name: yyyymm
+            # Get the date of the range
+            date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
+            num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
+            for diff_month in xrange(0, num_of_month+1):
+                curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
+                to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
+                dico_range_list.append(to_append)
+
+            # Use all dico in range
+            dico_range_list = dico_range_list[0:maximum_month_range]
+
+            # UNIQUE INDEX HASHS TABLE
+            yearly_index = str(date_today.year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            r_serv0.incr("current_index")
+            index = r_serv0.get("current_index")+str(PST.p_date)
+
+            # Open selected dico range
+            opened_dico = []
+            for dico_name in dico_range_list:
+                opened_dico.append([dico_name, dico_redis[dico_name]])
+
+            # retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+
+            # Go throught the Database of the dico (of the month)
+            for curr_dico_name, curr_dico_redis in opened_dico:
+                for dico_hash in curr_dico_redis.smembers('HASHS'):
+                    try:
+                        percent = ssdeep.compare(dico_hash, paste_hash)
+                        if percent > threshold_duplicate:
+                            # Go throught the Database of the dico filter (month)
+                            r_serv_dico = dico_redis[curr_dico_name]
+
+                            # index of paste
+                            index_current = r_serv_dico.get(dico_hash)
+                            paste_path = r_serv_dico.get(index_current)
+                            if paste_path != None:
+                                hash_dico[dico_hash] = (paste_path, percent)
+
+                        #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+                    except:
+                        # ssdeep hash not comparable
+                        print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
+                        curr_dico_redis.srem('HASHS', dico_hash)
+
+            # Add paste in DB after checking to prevent its analysis twice
+            # hash_i -> index_i AND index_i -> PST.PATH
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+            ##################### Similarity found #######################
+
+            # if there is data in this dictionnary
+            if len(hash_dico) != 0:
+                # paste_tuple = (paste_path, percent)
+                for dico_hash, paste_tuple in hash_dico.items():
+                    dupl.append(paste_tuple)
+
+            # Creating the object attribute and save it.
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            if dupl != []:
+                PST.__setattr__("p_duplicate", dupl)
+                PST.save_attribute_redis("p_duplicate", dupl)
+                publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                print '{}Detected {}'.format(to_print, len(dupl))
+
+            y = time.time()
+
+            publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+            #print '{}Processed in {} sec'.format(to_print, y-x)
+
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
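
The month window scanned by Duplicate_ssdeep_v2.py comes from the maximum_month_range option: the module steps back in increments of roughly one month (365/12 ≈ 30.4166666 days) and derives a yyyymm bucket name for each step. A small standalone sketch of that arithmetic:

    from datetime import datetime, timedelta

    date_today = datetime.today()
    maximum_month_range = 3  # [Modules_Duplicates] in config.cfg.sample

    date_range = date_today - timedelta(days=maximum_month_range * 30.4166666)
    num_of_month = (date_today.year - date_range.year) * 12 + (date_today.month - date_range.month)

    buckets = []
    for diff_month in range(num_of_month + 1):
        curr = date_today - timedelta(days=diff_month * 30.4166666)
        buckets.append(str(curr.year) + str(curr.month).zfill(2))

    # Only the first maximum_month_range buckets are kept, e.g. ['201605', '201604', '201603'].
    print(buckets[:maximum_month_range])
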
diff --git a/bin/Keys.py b/bin/Keys.py
index 8058545b..9c44f60a 100755
--- a/bin/Keys.py
+++ b/bin/Keys.py
@@ -16,6 +16,8 @@ def search_gpg(message):
     content = paste.get_p_content()
     if '-----BEGIN PGP MESSAGE-----' in content:
         publisher.warning('{} has a PGP enc message'.format(paste.p_name))
+        #Send to duplicate
+        p.populate_set_out(message)
 
 
 if __name__ == '__main__':
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index fc8c9ff1..d6706e1e 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -105,7 +105,7 @@ function launching_scripts {
     screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
     sleep 0.1
-    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate.py; read x'
+    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
     sleep 0.1
     screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
     sleep 0.1
diff --git a/bin/Mail.py b/bin/Mail.py
index dd348ba6..964deb19 100755
--- a/bin/Mail.py
+++ b/bin/Mail.py
@@ -60,6 +60,8 @@ if __name__ == "__main__":
                         MX_values[0])
                 if MX_values[0] > is_critical:
                     publisher.warning(to_print)
+                    #Send to duplicate
+                    p.populate_set_out(filename)
                 else:
                     publisher.info(to_print)
             prec_filename = filename
diff --git a/bin/Phone.py b/bin/Phone.py
index 628f77c2..b53b079c 100755
--- a/bin/Phone.py
+++ b/bin/Phone.py
@@ -23,6 +23,8 @@ def search_phone(message):
     if len(results) > 4:
         print results
         publisher.warning('{} contains PID (phone numbers)'.format(paste.p_name))
+        #Send to duplicate
+        p.populate_set_out(message)
 
 if __name__ == '__main__':
     # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
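
Credential, CreditCard, Keys, Mail and Phone all follow the same producer pattern: when a paste triggers a warning, its path is also forwarded on the module's output queue so the Duplicates module will fingerprint it. A schematic sketch of that producer side (the section name and the detection step are placeholders):

    from Helper import Process

    p = Process('Credential')        # any section that publishes to Redis_Duplicate
    message = p.get_from_set()       # path of the next paste to inspect
    if message is not None:
        # ... module-specific detection decides the paste is interesting ...
        p.populate_set_out(message)  # hand the path over to the Duplicates queue
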
diff --git a/bin/packages/Hash.py b/bin/packages/Hash.py
index f8dcac0f..2f34c5c7 100644
--- a/bin/packages/Hash.py
+++ b/bin/packages/Hash.py
@@ -1,6 +1,7 @@
 import hashlib
 import crcmod
 import mmh3
+import ssdeep
 
 
 class Hash(object):
@@ -32,4 +33,7 @@ class Hash(object):
         elif self.name == "murmur":
             hash = mmh3.hash(string)
 
+        elif self.name == "ssdeep":
+            hash = ssdeep.hash(string)
+
         return hash
diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py
index bedf36b0..172f0931 100755
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@@ -91,6 +91,7 @@ class Paste(object):
         self.p_langage = None
         self.p_nb_lines = None
         self.p_max_length_line = None
+        self.p_duplicate = None
 
     def get_p_content(self):
         """
@@ -277,6 +278,10 @@ class Paste(object):
             return True, var
         else:
             return False, var
+
+    def _get_p_duplicate(self):
+        self.p_duplicate = self.store.hget(self.p_path, "p_duplicate")
+        return self.p_duplicate if self.p_duplicate is not None else []
 
     def save_all_attributes_redis(self, key=None):
         """
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 0f8775b5..f7839aae 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -9,7 +9,7 @@
 protocolstrending_csv = var/www/static/csv/protocolstrendingdata
 protocolsfile = files/protocolsfile
 tldstrending_csv = var/www/static/csv/tldstrendingdata
-tldsfile = AILENV/faup/src/data/mozilla.tlds
+tldsfile = faup/src/data/mozilla.tlds
 domainstrending_csv = var/www/static/csv/domainstrendingdata
 
 
@@ -25,6 +25,16 @@ max_preview_modal = 800
 #Default number of header to display in trending graphs
 default_display = 10
 
+#### Modules ####
+[Modules_Duplicates]
+#Number of month to look back
+maximum_month_range = 3
+#The value where two pastes are considerate duplicate.
+threshold_duplicate = 50
+#Minimum size of the paste considered
+min_paste_size = 0.3
+
+
 ##### Redis #####
 [Redis_Cache]
 host = localhost
@@ -49,7 +59,7 @@ db = 1
 ##### LevelDB #####
 [Redis_Level_DB]
 host = localhost
-port = 2013
+port = 2016
 db = 0
 
 [Redis_Level_DB_Domain]
@@ -59,7 +69,7 @@ db = 3
 
 [Redis_Level_DB_Hashs]
 host = localhost
-port = 2013
+port = 2016
 db = 1
 
 [Url]
diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg
index c5bd23c6..f29d7fae 100644
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@@ -2,8 +2,8 @@
 subscribe = ZMQ_Global
 publish = Redis_Global
 
-#[Duplicates]
-#subscribe = Redis_Global
+[Duplicates]
+subscribe = Redis_Duplicate
 
 [Indexer]
 subscribe = Redis_Global
@@ -31,9 +31,11 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential
 
 [CreditCards]
 subscribe = Redis_CreditCards
+publish = Redis_Duplicate
 
 [Mail]
 subscribe = Redis_Mail
+publish = Redis_Duplicate
 
 [Onion]
 subscribe = Redis_Onion
@@ -55,6 +57,7 @@ subscribe = Redis_Url
 
 [Credential]
 subscribe = Redis_Credential
+publish = Redis_Duplicate
 
 #[Cve]
 #subscribe = Redis_Cve
@@ -62,8 +65,11 @@ subscribe = Redis_Credential
 #[SourceCode]
 #subscribe = Redis_SourceCode
 
-#[Phone]
-#subscribe = Redis_Global
+[Phone]
+subscribe = Redis_Global
+publish = Redis_Duplicate
+
+[Keys]
+subscribe = Redis_Global
+publish = Redis_Duplicate
 
-#[Keys]
-#subscribe = Redis_Global
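
The new [Modules_Duplicates] block is read through the same ConfigParser-backed configuration the modules already use (Duplicate_ssdeep_v2.py fetches it via p.config.get). A minimal standalone sketch, assuming config.cfg mirrors the sample above:

    import ConfigParser

    cfg = ConfigParser.ConfigParser()
    cfg.read('bin/packages/config.cfg')

    maximum_month_range = cfg.getint('Modules_Duplicates', 'maximum_month_range')  # 3
    threshold_duplicate = cfg.getint('Modules_Duplicates', 'threshold_duplicate')  # 50
    min_paste_size = cfg.getfloat('Modules_Duplicates', 'min_paste_size')          # 0.3
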
diff --git a/pip_packages_requirement.txt b/pip_packages_requirement.txt
index 40dcda8e..bd734175 100644
--- a/pip_packages_requirement.txt
+++ b/pip_packages_requirement.txt
@@ -17,6 +17,7 @@ nltk
 # Hashlib
 crcmod
 mmh3
+ssdeep
 
 #Others
 python-magic
diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py
index 36fcfbcc..08ea0675 100755
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@@ -58,6 +58,21 @@ def list_len(s):
     return len(s)
 app.jinja_env.filters['list_len'] = list_len
 
+def parseStringToList(the_string):
+    strList = ""
+    elemList = []
+    for c in the_string:
+        if c != ']':
+            if c != '[' and c !=' ' and c != '"':
+                strList += c
+        else:
+            the_list = strList.split(',')
+            if len(the_list) == 2:
+                elemList.append(the_list)
+            elif len(the_list) > 1:
+                elemList.append(the_list[1:])
+            strList = ""
+    return elemList
 
 def showpaste(content_range):
     requested_path = request.args.get('paste', '')
@@ -71,10 +86,19 @@ def showpaste(content_range):
     p_mime = paste.p_mime
     p_lineinfo = paste.get_lines_info()
     p_content = paste.get_p_content().decode('utf-8', 'ignore')
+    p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
+    p_duplicate_list = []
+    p_simil_list = []
+
+    for dup_list in p_duplicate_full_list:
+        path, simil_percent = dup_list
+        p_duplicate_list.append(path)
+        p_simil_list.append(simil_percent)
+
     if content_range != 0:
         p_content = p_content[0:content_range]
 
-    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content))
+    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
 
 
 @app.route("/_logs")
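
parseStringToList() above rebuilds (path, similarity) pairs from the flat string returned by paste._get_p_duplicate(). The stored format is not shown in this patch; assuming it is a JSON-style rendering of the duplicate list (paths invented), an equivalent parse looks like this:

    import json

    stored = '[["archive/source/2016/05/04/paste1.gz", 85], ["archive/source/2016/05/04/paste2.gz", 62]]'
    pairs = json.loads(stored)
    duplicate_list = [path for path, simil in pairs]   # what the template iterates over
    simil_list = [simil for path, simil in pairs]
    print(duplicate_list, simil_list)
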
diff --git a/var/www/static/js/indexjavascript.js b/var/www/static/js/indexjavascript.js
index cef3bf41..7fd463f7 100644
--- a/var/www/static/js/indexjavascript.js
+++ b/var/www/static/js/indexjavascript.js
@@ -1,3 +1,17 @@
+function initfunc( csvay, scroot) {
+    window.csv = csvay;
+    window.scroot = scroot;
+};
+
+function update_values() {
+    $SCRIPT_ROOT = window.scroot ;
+    $.getJSON($SCRIPT_ROOT+"/_stuff",
+        function(data) {
+            window.glob_tabvar = data;
+        });
+    };
+
+
 // Plot and update the number of processed pastes
 $(function() {
     var data = [];
@@ -25,7 +39,7 @@ $(function() {
         return res;
     }
 
-    var updateInterval = 1000; //1s
+    var updateInterval = 1000;
     var options = {
         series: { shadowSize: 1 },
         lines: { fill: true, fillColor: { colors: [ { opacity: 1 }, { opacity: 0.1 } ] }},
diff --git a/var/www/templates/show_saved_paste.html b/var/www/templates/show_saved_paste.html
index ce68465c..707786d7 100644
--- a/var/www/templates/show_saved_paste.html
+++ b/var/www/templates/show_saved_paste.html
@@ -42,7 +42,17 @@