From 8a6e72f48717ccc9b2006867c99f11eceba1bd11 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 7 Jul 2020 11:23:23 +0200
Subject: [PATCH] chg: [Item delete] delete father/child link + remove from
 domain tree + delete all child from the same domain

---
 bin/lib/domain_basic.py              |  29 +++++++
 bin/lib/item_basic.py                | 112 ++++++++++++++++++++++++++-
 bin/packages/Item.py                 |  56 +++++++-------
 var/www/blueprints/crawler_splash.py |   2 +-
 4 files changed, 168 insertions(+), 31 deletions(-)
 create mode 100755 bin/lib/domain_basic.py

diff --git a/bin/lib/domain_basic.py b/bin/lib/domain_basic.py
new file mode 100755
index 00000000..5bf24a72
--- /dev/null
+++ b/bin/lib/domain_basic.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python3
+
+"""
+``basic domain lib``
+===================
+
+
+"""
+
+import os
+import sys
+import redis
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
+config_loader = None
+
+def get_domain_type(domain):
+    if str(domain).endswith('.onion'):
+        return 'onion'
+    else:
+        return 'regular'
+
+def delete_domain_item_core(item_id, domain, port):
+    domain_type = get_domain_type(domain)
+    r_serv_onion.zrem('crawler_history_{}:{}:{}'.format(domain_type, domain, port), item_id)
diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py
index 6b606dda..c1005b49 100755
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@@ -3,6 +3,7 @@
 
 import os
 import sys
+import gzip
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 import ConfigLoader
@@ -12,6 +13,7 @@ config_loader = ConfigLoader.ConfigLoader()
 PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
 PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')
 
+r_cache = config_loader.get_redis_conn("Redis_Cache")
 r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
 config_loader = None
 
@@ -43,6 +45,102 @@ def is_crawled(item_id):
 def get_item_domain(item_id):
     return item_id[19:-36]
 
+def get_item_content(item_id):
+    item_full_path = os.path.join(PASTES_FOLDER, item_id)
+    try:
+        item_content = r_cache.get(item_full_path)
+    except UnicodeDecodeError:
+        item_content = None
+    except Exception as e:
+        item_content = None
+    if item_content is None:
+        try:
+            with gzip.open(item_full_path, 'r') as f:
+                item_content = f.read().decode()
+                r_cache.set(item_full_path, item_content)
+                r_cache.expire(item_full_path, 300)
+        except:
+            item_content = ''
+    return str(item_content)
+
+#### TREE CHILD/FATHER ####
+def is_father(item_id):
+    return r_serv_metadata.exists('paste_children:{}'.format(item_id))
+
+def is_children(item_id):
+    return r_serv_metadata.hexists('paste_metadata:{}'.format(item_id), 'father')
+
+def is_root_node(item_id):
+    if is_father(item_id) and not is_children(item_id):
+        return True
+    else:
+        return False
+
+def is_node(item_id):
+    if is_father(item_id) or is_children(item_id):
+        return True
+    else:
+        return False
+
+def is_leaf(item_id):
+    if not is_father(item_id) and is_children(item_id):
+        return True
+    else:
+        return False
+
+def is_domain_root(item_id):
+    if not is_crawled(item_id):
+        return False
+    else:
+        domain = get_item_domain(item_id)
+        item_father = get_item_parent(item_id)
+        if not is_crawled(item_father):
+            return True
+        else:
+            # same domain
+            if get_item_domain(item_father) == domain:
+                return False
+            else:
+                return True
+
+def get_nb_children(item_id):
+    return r_serv_metadata.scard('paste_children:{}'.format(item_id))
+
+
+def get_item_parent(item_id):
+    return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'father')
+
+def get_item_children(item_id):
+    return list(r_serv_metadata.smembers('paste_children:{}'.format(item_id)))
+
+def add_item_parent(item_parent, item_id):
+    return item_basic.add_item_parent(item_parent, item_id)
+
+# # TODO: handle domain last origin in domain lib
+def _delete_node(item_id):
+    # only if item isn't deleted
+    #if is_crawled(item_id):
+    #    r_serv_metadata.hrem('paste_metadata:{}'.format(item_id), 'real_link')
+    for chidren_id in get_item_children(item_id):
+        r_serv_metadata.hdel('paste_metadata:{}'.format(chidren_id), 'father')
+    r_serv_metadata.delete('paste_children:{}'.format(item_id))
+
+    # delete regular
+    # simple if leaf
+
+    # delete item node
+
+def get_all_domain_node_by_item_id(item_id, l_nodes=[]):
+    domain = get_item_domain(item_id)
+    for child_id in get_item_children(item_id):
+        if get_item_domain(child_id) == domain:
+            l_nodes.append(child_id)
+            l_nodes = get_all_domain_node_by_item_id(child_id, l_nodes)
+    return l_nodes
+
+##-- --##
+
+
 def add_item_parent_by_parent_id(parent_type, parent_id, item_id):
     parent_item_id = get_obj_id_item_id(parent_type, parent_id)
     if parent_item_id:
@@ -53,9 +151,9 @@ def add_item_parent(parent_item_id, item_id):
     r_serv_metadata.sadd('paste_children:{}'.format(parent_item_id), item_id)
     return True
 
-def add_map_obj_id_item_id(obj_id, item_id, obj_type):
-    if obj_type == 'twitter_id':
-        r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)
+# TODO:
+# FIXME:
+#### UNKNOW SECTION ####
 
 def get_obj_id_item_id(parent_type, parent_id):
     all_parents_type = ['twitter_id']
@@ -63,3 +161,11 @@ def get_obj_id_item_id(parent_type, parent_id):
         return r_serv_metadata.hget('map:twitter_id:item_id', parent_id)
     else:
         return None
+
+def add_map_obj_id_item_id(obj_id, item_id, obj_type):
+    if obj_type == 'twitter_id':
+        r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)
+
+# delete twitter id
+
+##-- --##
diff --git a/bin/packages/Item.py b/bin/packages/Item.py
index 28a7349b..a551157f 100755
--- a/bin/packages/Item.py
+++ b/bin/packages/Item.py
@@ -3,7 +3,6 @@
 
 import os
 import sys
-import gzip
 import redis
 
 from io import BytesIO
@@ -16,12 +15,15 @@ import Pgp
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 import item_basic
+import domain_basic
 import ConfigLoader
 import Correlate_object
 import Decoded
 import Screenshot
 import telegram
 
+from item_basic import *
+
 config_loader = ConfigLoader.ConfigLoader()
 # get and sanityze PASTE DIRECTORY
 PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
@@ -30,6 +32,7 @@ PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')
 r_cache = config_loader.get_redis_conn("Redis_Cache")
 r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
 screenshot_directory = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "crawled_screenshot"))
+
 config_loader = None
 
 def exist_item(item_id):
@@ -71,22 +74,7 @@ def get_lines_info(item_id, item_content=None):
 
 
 def get_item_content(item_id):
-    item_full_path = os.path.join(PASTES_FOLDER, item_id)
-    try:
-        item_content = r_cache.get(item_full_path)
-    except UnicodeDecodeError:
-        item_content = None
-    except Exception as e:
-        item_content = None
-    if item_content is None:
-        try:
-            with gzip.open(item_full_path, 'r') as f:
-                item_content = f.read().decode()
-                r_cache.set(item_full_path, item_content)
-                r_cache.expire(item_full_path, 300)
-        except:
-            item_content = ''
-    return str(item_content)
+    return item_basic.get_item_content(item_id)
 
 # API
 def get_item(request_dict):
@@ -292,14 +280,8 @@ def get_domain(item_id):
     item_id = item_id[-1]
     return item_id[:-36]
 
-def get_item_parent(item_id):
-    return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'father')
-
-def get_item_children(item_id):
-    return list(r_serv_metadata.smembers('paste_children:{}'.format(item_id)))
-
-def add_item_parent(item_parent, item_id):
-    return item_basic.add_item_parent(item_parent, item_id)
+def get_item_domain_with_port(item_id):
+    return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'domain')
 
 def get_item_link(item_id):
     return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'real_link')
@@ -423,12 +405,32 @@ def delete_item(obj_id):
         else:
             for obj2_id in obj_correlations[correlation]:
                 Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id)
+
+    # delete father/child
+    delete_node(obj_id)
+
+    # delete item metadata
+    r_serv_metadata.delete('paste_metadata:{}'.format(obj_id))
+
     return True
 
-    ### REQUIRE MORE WORK
-    # delete child/son !!!
     ### TODO in inport V2
     # delete from tracked items
     # delete from queue ###
 
     return False
+
+#### ####
+def delete_node(item_id):
+    if is_node(item_id):
+        if is_crawled(item_id):
+            delete_domain_node(item_id)
+        item_basic._delete_node(item_id)
+
+def delete_domain_node(item_id):
+    if is_domain_root(item_id):
+        # remove from domain history
+        domain, port = get_item_domain_with_port(item_id).split(':')
+        domain_basic.delete_domain_item_core(item_id, domain, port)
+    for child_id in get_all_domain_node_by_item_id(item_id):
+        delete_item(child_id)
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index d2d3c65a..7d006c3d 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -115,7 +115,7 @@ def showDomain():
     dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags'])
     dict_domain['history'] = domain.get_domain_history_with_status()
     dict_domain['crawler_history'] = domain.get_domain_items_crawled(items_link=True, epoch=epoch, item_screenshot=True, item_tag=True) # # TODO: handle multiple port
-    if dict_domain['crawler_history']['items']:
+    if dict_domain['crawler_history'].get('items', []):
         dict_domain['crawler_history']['random_item'] = random.choice(dict_domain['crawler_history']['items'])
 
     return render_template("showDomain.html", dict_domain=dict_domain, bootstrap_label=bootstrap_label,