fix: [dynamic update v1.5] make sure updates are executed in the correct order + fix nb_seen_hash dynamic update

This commit is contained in:
Terrtia 2019-04-15 11:01:33 +02:00
parent a3167a740a
commit fc1a04336c
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
3 changed files with 90 additions and 76 deletions

View file

@ -19,7 +19,6 @@ def update_hash_item(has_type):
if r_serv_metadata.exists(base64_key): if r_serv_metadata.exists(base64_key):
new_base64_key = base64_key.replace(PASTES_FOLDER, '', 1) new_base64_key = base64_key.replace(PASTES_FOLDER, '', 1)
res = r_serv_metadata.renamenx(base64_key, new_base64_key) res = r_serv_metadata.renamenx(base64_key, new_base64_key)
print(res)
if res == 0: if res == 0:
print('same key, double name: {}'.format(item_path)) print('same key, double name: {}'.format(item_path))
# fusion # fusion

View file

@ -57,75 +57,77 @@ if __name__ == '__main__':
db=2018, db=2018,
decode_responses=True) decode_responses=True)
print('Updating ARDB_Tags ...') if r_serv.exists('v1.5:onions') and r_serv.exists('v1.5:metadata'):
index = 0
start = time.time()
tags_list = r_serv_tag.smembers('list_tags') print('Updating ARDB_Tags ...')
# create temp tags metadata index = 0
tag_metadata = {} start = time.time()
for tag in tags_list:
tag_metadata[tag] = {}
tag_metadata[tag]['first_seen'] = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'first_seen')
if tag_metadata[tag]['first_seen'] is None:
tag_metadata[tag]['first_seen'] = 99999999
else:
tag_metadata[tag]['first_seen'] = int(tag_metadata[tag]['first_seen'])
tag_metadata[tag]['last_seen'] = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'last_seen') tags_list = r_serv_tag.smembers('list_tags')
if tag_metadata[tag]['last_seen'] is None: # create temp tags metadata
tag_metadata[tag]['last_seen'] = 0 tag_metadata = {}
else: for tag in tags_list:
tag_metadata[tag]['last_seen'] = int(tag_metadata[tag]['last_seen']) tag_metadata[tag] = {}
tag_metadata[tag]['first_seen'] = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'first_seen')
if tag_metadata[tag]['first_seen'] is None:
tag_metadata[tag]['first_seen'] = 99999999
else:
tag_metadata[tag]['first_seen'] = int(tag_metadata[tag]['first_seen'])
for tag in tags_list: tag_metadata[tag]['last_seen'] = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'last_seen')
if tag_metadata[tag]['last_seen'] is None:
tag_metadata[tag]['last_seen'] = 0
else:
tag_metadata[tag]['last_seen'] = int(tag_metadata[tag]['last_seen'])
all_item = r_serv_tag.smembers(tag) for tag in tags_list:
for item_path in all_item:
splitted_item_path = item_path.split('/') all_item = r_serv_tag.smembers(tag)
#print(tag) for item_path in all_item:
#print(item_path) splitted_item_path = item_path.split('/')
try: #print(tag)
item_date = int( ''.join([splitted_item_path[-4], splitted_item_path[-3], splitted_item_path[-2]]) ) #print(item_path)
except IndexError: try:
item_date = int( ''.join([splitted_item_path[-4], splitted_item_path[-3], splitted_item_path[-2]]) )
except IndexError:
r_serv_tag.srem(tag, item_path)
continue
# remove absolute path
new_path = item_path.replace(PASTES_FOLDER, '', 1)
if new_path != item_path:
# save in queue absolute path to remove
r_serv_tag.sadd('maj:v1.5:absolute_path_to_rename', item_path)
# update metadata first_seen
if item_date < tag_metadata[tag]['first_seen']:
tag_metadata[tag]['first_seen'] = item_date
r_serv_tag.hset('tag_metadata:{}'.format(tag), 'first_seen', item_date)
# update metadata last_seen
if item_date > tag_metadata[tag]['last_seen']:
tag_metadata[tag]['last_seen'] = item_date
last_seen_db = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'last_seen')
if last_seen_db:
if item_date > int(last_seen_db):
r_serv_tag.hset('tag_metadata:{}'.format(tag), 'last_seen', item_date)
else:
tag_metadata[tag]['last_seen'] = last_seen_db
r_serv_tag.sadd('{}:{}'.format(tag, item_date), new_path)
r_serv_tag.hincrby('daily_tags:{}'.format(item_date), tag, 1)
# clean db
r_serv_tag.srem(tag, item_path) r_serv_tag.srem(tag, item_path)
continue index = index + 1
# remove absolute path #flush browse importante pastes db
new_path = item_path.replace(PASTES_FOLDER, '', 1) r_important_paste_2018.flushdb()
if new_path != item_path: r_important_paste_2019.flushdb()
# save in queue absolute path to remove
r_serv_tag.sadd('maj:v1.5:absolute_path_to_rename', item_path)
# update metadata first_seen end = time.time()
if item_date < tag_metadata[tag]['first_seen']:
tag_metadata[tag]['first_seen'] = item_date
r_serv_tag.hset('tag_metadata:{}'.format(tag), 'first_seen', item_date)
# update metadata last_seen
if item_date > tag_metadata[tag]['last_seen']:
tag_metadata[tag]['last_seen'] = item_date
last_seen_db = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'last_seen')
if last_seen_db:
if item_date > int(last_seen_db):
r_serv_tag.hset('tag_metadata:{}'.format(tag), 'last_seen', item_date)
else:
tag_metadata[tag]['last_seen'] = last_seen_db
r_serv_tag.sadd('{}:{}'.format(tag, item_date), new_path)
r_serv_tag.hincrby('daily_tags:{}'.format(item_date), tag, 1)
# clean db
r_serv_tag.srem(tag, item_path)
index = index + 1
#flush browse importante pastes db
r_important_paste_2018.flushdb()
r_important_paste_2019.flushdb()
end = time.time()
print('Updating ARDB_Tags Done => {} paths: {} s'.format(index, end - start)) print('Updating ARDB_Tags Done => {} paths: {} s'.format(index, end - start))
r_serv.set('v1.5:tags', 1) r_serv.set('v1.5:tags', 1)

View file

@ -157,13 +157,13 @@ def showpaste(content_range, requested_path):
# item list not updated # item list not updated
if nb_in_file is None: if nb_in_file is None:
l_pastes = r_serv_metadata.zrange('nb_seen_hash:'+hash, 0, -1) l_pastes = r_serv_metadata.zrange('nb_seen_hash:'+hash, 0, -1)
for paste in l_pastes: for paste_name in l_pastes:
# dynamic update # dynamic update
if PASTES_FOLDER in paste: if PASTES_FOLDER in paste_name:
score = r_serv_metadata.zscore('nb_seen_hash:{}'.format(hash), paste) score = r_serv_metadata.zscore('nb_seen_hash:{}'.format(hash), paste_name)
r_serv_metadata.zrem('nb_seen_hash:{}'.format(hash), paste) r_serv_metadata.zrem('nb_seen_hash:{}'.format(hash), paste_name)
paste = paste.replace(PASTES_FOLDER, '', 1) paste_name = paste_name.replace(PASTES_FOLDER, '', 1)
r_serv_metadata.zadd('nb_seen_hash:{}'.format(hash), score, paste) r_serv_metadata.zadd('nb_seen_hash:{}'.format(hash), score, paste_name)
nb_in_file = r_serv_metadata.zscore('nb_seen_hash:'+hash, requested_path) nb_in_file = r_serv_metadata.zscore('nb_seen_hash:'+hash, requested_path)
nb_in_file = int(nb_in_file) nb_in_file = int(nb_in_file)
estimated_type = r_serv_metadata.hget('metadata_hash:'+hash, 'estimated_type') estimated_type = r_serv_metadata.hget('metadata_hash:'+hash, 'estimated_type')
@ -282,7 +282,8 @@ def show_item_min(requested_path , content_range=0):
p_hashtype_list = [] p_hashtype_list = []
l_tags = r_serv_metadata.smembers('tag:'+requested_path) print(requested_path)
l_tags = r_serv_metadata.smembers('tag:'+relative_path)
if relative_path is not None: if relative_path is not None:
l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) ) l_tags.union( r_serv_metadata.smembers('tag:'+relative_path) )
item_info['tags'] = l_tags item_info['tags'] = l_tags
@ -291,10 +292,22 @@ def show_item_min(requested_path , content_range=0):
l_64 = [] l_64 = []
# load hash files # load hash files
if r_serv_metadata.scard('hash_paste:'+requested_path) > 0: if r_serv_metadata.scard('hash_paste:'+relative_path) > 0:
set_b64 = r_serv_metadata.smembers('hash_paste:'+requested_path) set_b64 = r_serv_metadata.smembers('hash_paste:'+relative_path)
for hash in set_b64: for hash in set_b64:
nb_in_file = int(r_serv_metadata.zscore('nb_seen_hash:'+hash, requested_path)) nb_in_file = r_serv_metadata.zscore('nb_seen_hash:'+hash, relative_path)
# item list not updated
if nb_in_file is None:
l_pastes = r_serv_metadata.zrange('nb_seen_hash:'+hash, 0, -1)
for paste_name in l_pastes:
# dynamic update
if PASTES_FOLDER in paste_name:
score = r_serv_metadata.zscore('nb_seen_hash:{}'.format(hash), paste_name)
r_serv_metadata.zrem('nb_seen_hash:{}'.format(hash), paste_name)
paste_name = paste_name.replace(PASTES_FOLDER, '', 1)
r_serv_metadata.zadd('nb_seen_hash:{}'.format(hash), score, paste_name)
nb_in_file = r_serv_metadata.zscore('nb_seen_hash:{}'.format(hash), relative_path)
nb_in_file = int(nb_in_file)
estimated_type = r_serv_metadata.hget('metadata_hash:'+hash, 'estimated_type') estimated_type = r_serv_metadata.hget('metadata_hash:'+hash, 'estimated_type')
file_type = estimated_type.split('/')[0] file_type = estimated_type.split('/')[0]
# set file icon # set file icon
@ -326,9 +339,9 @@ def show_item_min(requested_path , content_range=0):
crawler_metadata = {} crawler_metadata = {}
if 'infoleak:submission="crawler"' in l_tags: if 'infoleak:submission="crawler"' in l_tags:
crawler_metadata['get_metadata'] = True crawler_metadata['get_metadata'] = True
crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'domain')
crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'father')
crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+relative_path,'real_link')
crawler_metadata['screenshot'] = paste.get_p_rel_path() crawler_metadata['screenshot'] = paste.get_p_rel_path()
else: else:
crawler_metadata['get_metadata'] = False crawler_metadata['get_metadata'] = False