fix: [module Webstats + BankAccount-Decoder] fix faup return type + remove old Paste library from BankAccount-Decoder #465

Terrtia 2020-02-10 10:31:53 +01:00
parent d8fbd72863
commit f9856a1589
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 37 additions and 33 deletions

View file

@@ -5,7 +5,7 @@
 The BankAccount Module
 ======================
-It apply IBAN regexes on paste content and warn if above a threshold.
+It apply IBAN regexes on item content and warn if above a threshold.
 """
@@ -17,7 +17,7 @@ import re
 import string
 from itertools import chain
-from packages import Paste
+from packages import Item
 from pubsublogger import publisher
 from Helper import Process
@@ -49,7 +49,7 @@ def is_valid_iban(iban):
         return True
     return False
-def check_all_iban(l_iban, paste, filename):
+def check_all_iban(l_iban, obj_id):
     nb_valid_iban = 0
     for iban in l_iban:
         iban = iban[0]+iban[1]+iban[2]
@@ -65,14 +65,14 @@ def check_all_iban(l_iban, paste, filename):
             server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)
     if(nb_valid_iban > 0):
-        to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name)
+        to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id))
         publisher.warning('{}Checked found {} IBAN;{}'.format(
-            to_print, nb_valid_iban, paste.p_rel_path))
+            to_print, nb_valid_iban, obj_id))
-        msg = 'infoleak:automatic-detection="iban";{}'.format(filename)
+        msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
         p.populate_set_out(msg, 'Tags')
         #Send to duplicate
-        p.populate_set_out(filename, 'Duplicate')
+        p.populate_set_out(obj_id, 'Duplicate')
 if __name__ == "__main__":
     publisher.port = 6380
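
For context, this refactor drops the per-message Paste object in favour of the module-level helpers in packages/Item. Below is a minimal sketch of the new lookup pattern, assuming an AIL environment where packages.Item is importable; the message path is hypothetical and only illustrates the shape of the data.

from packages import Item

# Hypothetical absolute path received from the input queue;
# get_item_id() strips the PASTES_FOLDER prefix to get the relative item id.
message = '/opt/ail/PASTES/archive/2020/02/10/example.gz'

obj_id = Item.get_item_id(message)
content = Item.get_item_content(obj_id)   # raw text of the item
source = Item.get_source(obj_id)          # feeder/source part of the id (assumed)
date = Item.get_item_date(obj_id)         # 'YYYYMMDD' string
name = Item.get_basename(obj_id)          # last path component, old paste.p_name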

View file

@@ -17,7 +17,6 @@ import datetime
 from pubsublogger import publisher
 from Helper import Process
-from packages import Paste
 from packages import Item
 import re
@@ -50,11 +49,11 @@ def decode_string(content, message, date, encoded_list, decoder_name, encoded_mi
             save_hash(decoder_name, message, date, decode)
-            #remove encoded from paste content
+            #remove encoded from item content
             content = content.replace(encoded, '', 1)
     if(find):
-        set_out_paste(decoder_name, message)
+        set_out_item(decoder_name, message)
     return content
@@ -72,8 +71,8 @@ def save_hash(decoder_name, message, date, decoded):
     data['estimated type'] = type
     json_data = json.dumps(data)
-    date_paste = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])
+    date_item = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])
-    date_key = date[0:4] + date[4:6] + date[6:8]
+    date_key = date
     serv_metadata.incrby(decoder_name+'_decoded:'+date_key, 1)
     serv_metadata.zincrby('hash_date:'+date_key, hash, 1)
@@ -81,24 +80,24 @@ def save_hash(decoder_name, message, date, decoded):
     # first time we see this hash
     if not serv_metadata.hexists('metadata_hash:'+hash, 'estimated_type'):
-        serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_paste)
+        serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_item)
-        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste)
+        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item)
     else:
-        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste)
+        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item)
-    # first time we see this hash (all encoding) on this paste
+    # first time we see this hash (all encoding) on this item
     if serv_metadata.zscore('nb_seen_hash:'+hash, message) is None:
         serv_metadata.hincrby('metadata_hash:'+hash, 'nb_seen_in_all_pastes', 1)
-        serv_metadata.sadd('hash_paste:'+message, hash) # paste - hash map
+        serv_metadata.sadd('hash_paste:'+message, hash) # item - hash map
         # create hash metadata
         serv_metadata.hset('metadata_hash:'+hash, 'estimated_type', type)
         serv_metadata.sadd('hash_all_type', type)
-    # first time we see this hash encoding on this paste
+    # first time we see this hash encoding on this item
    if serv_metadata.zscore(decoder_name+'_hash:'+hash, message) is None:
         print('first '+decoder_name)
-        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # paste - hash map
+        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # item - hash map
         # create hash metadata
         serv_metadata.sadd('hash_'+ decoder_name +'_all_type', type)
@@ -118,8 +117,8 @@ def save_hash(decoder_name, message, date, decoded):
     serv_metadata.zincrby(decoder_name+'_type:'+type, date_key, 1)
-    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - paste map
+    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - item map
-    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this paste
+    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this item
     # Domain Object
     if Item.is_crawled(message):
@@ -150,7 +149,7 @@ def save_hash_on_disk(decode, type, hash, json_data):
     with open(filename_json, 'w') as f:
         f.write(json_data)
-def set_out_paste(decoder_name, message):
+def set_out_item(decoder_name, message):
     publisher.warning(decoder_name+' decoded')
     #Send to duplicate
     p.populate_set_out(message, 'Duplicate')
@@ -217,12 +216,11 @@ if __name__ == '__main__':
             time.sleep(1)
             continue
-        filename = message
+        obj_id = Item.get_item_id(message)
-        paste = Paste.Paste(filename)
         # Do something with the message from the queue
-        content = paste.get_p_content()
+        content = Item.get_item_content(obj_id)
-        date = str(paste._get_p_date())
+        date = Item.get_item_date(obj_id)
        for decoder in decoder_order: # add threshold and size limit
@@ -233,7 +231,7 @@ if __name__ == '__main__':
             except TimeoutException:
                 encoded_list = []
                 p.incr_module_timeout_statistic() # add encoder type
-                print ("{0} processing timeout".format(paste.p_rel_path))
+                print ("{0} processing timeout".format(obj_id))
                 continue
             else:
                 signal.alarm(0)
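
One detail worth spelling out in the hunk above: date now comes from Item.get_item_date(), which, judging by the slicing used to build date_item, is already a compact 'YYYYMMDD' string. Re-concatenating its slices for date_key is therefore a no-op, which is presumably why it was simplified to date_key = date. A small illustration with a hypothetical date value:

# Hypothetical value, as returned by Item.get_item_date(obj_id)
date = '20200210'

date_item = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])  # '2020/02/10'
date_key_old = date[0:4] + date[4:6] + date[6:8]                # '20200210'
date_key_new = date                                             # identical
assert date_key_old == date_key_new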

View file

@@ -29,7 +29,10 @@ num_day_to_look = 5 # the detection of the progression start
 def analyse(server, field_name, date, url_parsed):
     field = url_parsed[field_name]
     if field is not None:
-        field = field.decode('utf8')
+        try: # faup version
+            field = field.decode()
+        except:
+            pass
         server.hincrby(field, date, 1)
         if field_name == "domain": #save domain in a set for the monthly plot
             domain_set_name = "domain_set_" + date[0:6]
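
The Webstats change works around the differing return types of the faup Python bindings: depending on the pyfaup version, parsed URL fields come back either as bytes (which need decode()) or already as str (where decode() raises AttributeError), hence the try/except above. Below is a minimal sketch of the same guard written with an explicit isinstance check instead of try/except; it assumes pyfaup is installed and the example URL is arbitrary.

from pyfaup.faup import Faup

f = Faup()
f.decode('https://www.example.com/index.html')
url_parsed = f.get()

field = url_parsed['domain']
# Older pyfaup versions return bytes, newer ones return str.
if isinstance(field, bytes):
    field = field.decode()
print(field)  # parsed domain, e.g. 'example.com'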

View file

@@ -32,6 +32,9 @@ def exist_item(item_id):
     else:
         return False
+def get_basename(item_id):
+    return os.path.basename(item_id)
+
 def get_item_id(full_path):
     return full_path.replace(PASTES_FOLDER, '', 1)
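
The new get_basename() helper is a thin wrapper around os.path.basename and gives BankAccount the equivalent of the old paste.p_name. A quick standalone illustration; the item id shown is hypothetical and only meant to show the expected shape (a path relative to PASTES_FOLDER):

import os

def get_basename(item_id):
    return os.path.basename(item_id)

print(get_basename('archive/2020/02/10/example.gz'))  # -> 'example.gz'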