mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-22 22:27:17 +00:00
chg: [Item delete] delete father/child link + remove from domain tree + delete all children from the same domain
This commit is contained in:
parent
1f8650a648
commit
8a6e72f487
4 changed files with 168 additions and 31 deletions
29
bin/lib/domain_basic.py
Executable file
29
bin/lib/domain_basic.py
Executable file
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
"""
|
||||
``basic domain lib``
|
||||
===================
|
||||
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import redis
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||
import ConfigLoader
|
||||
|
||||
# Build the ARDB_Onion connection once, at import time.
config_loader = ConfigLoader.ConfigLoader()
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
# The loader is only needed to create the connection; drop the reference.
config_loader = None
|
||||
|
||||
def get_domain_type(domain):
    """Classify *domain* as ``'onion'`` or ``'regular'``.

    The argument is converted with ``str()`` before the check, so any value
    whose text ends in ``.onion`` is reported as ``'onion'``.
    """
    is_onion = str(domain).endswith('.onion')
    return 'onion' if is_onion else 'regular'
|
||||
|
||||
def delete_domain_item_core(item_id, domain, port):
    """Remove *item_id* from the crawler history sorted set of ``domain:port``.

    The key is ``crawler_history_<type>:<domain>:<port>``, where ``<type>``
    comes from :func:`get_domain_type`.
    """
    history_key = 'crawler_history_{}:{}:{}'.format(get_domain_type(domain), domain, port)
    r_serv_onion.zrem(history_key, item_id)
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
import gzip
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||
import ConfigLoader
|
||||
|
@ -12,6 +13,7 @@ config_loader = ConfigLoader.ConfigLoader()
|
|||
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
|
||||
PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')
|
||||
|
||||
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
||||
config_loader = None
|
||||
|
||||
|
@ -43,6 +45,102 @@ def is_crawled(item_id):
|
|||
def get_item_domain(item_id):
    """Extract the domain part of a crawled item id.

    The id is sliced between a fixed-length prefix and a fixed-length suffix.
    NOTE(review): presumably the 19-character prefix is a
    ``crawled/YYYY/MM/DD/``-style directory and the 36-character suffix is a
    UUID -- confirm against the crawler's naming scheme.
    """
    prefix_len = 19
    suffix_len = 36
    return item_id[prefix_len:-suffix_len]
|
||||
|
||||
def get_item_content(item_id):
    """Return the text content of an item, served from the cache when possible.

    The item path is ``PASTES_FOLDER`` joined with *item_id*. The cache is
    queried first; on a miss (or any cache error) the gzip file is read and
    decoded, then stored in the cache with a 300 second expiry.

    :param item_id: item identifier, relative to ``PASTES_FOLDER``
    :return: the content as ``str``; an empty string if the file cannot be
             read or decoded
    """
    item_full_path = os.path.join(PASTES_FOLDER, item_id)

    # Cache lookup is best effort: any failure (decode error, connection
    # problem, ...) simply falls back to reading the file.
    try:
        item_content = r_cache.get(item_full_path)
    except Exception:
        item_content = None

    if item_content is None:
        try:
            with gzip.open(item_full_path, 'r') as f:
                item_content = f.read().decode()
        except Exception:
            # Missing, corrupt or undecodable file: report empty content.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate.
            return ''

        # Populating the cache must not discard content that was read
        # successfully, so a cache failure here is deliberately ignored.
        try:
            r_cache.set(item_full_path, item_content)
            r_cache.expire(item_full_path, 300)
        except Exception:
            pass

    return str(item_content)
|
||||
|
||||
#### TREE CHILD/FATHER ####
|
||||
def is_father(item_id):
    """Tell whether a ``paste_children:<item_id>`` key exists in the metadata store."""
    children_key = 'paste_children:{}'.format(item_id)
    return r_serv_metadata.exists(children_key)
|
||||
|
||||
def is_children(item_id):
    """Tell whether the item's metadata hash has a ``father`` field."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hexists(metadata_key, 'father')
|
||||
|
||||
def is_root_node(item_id):
    """Return True when the item has children but no father.

    The original signature took no argument although the body used
    ``item_id``, so every call raised ``NameError``; the parameter is now
    declared, matching the sibling ``is_node`` / ``is_leaf`` helpers.

    :param item_id: item identifier
    :return: ``True`` if the item is a father and not a child, else ``False``
    """
    return bool(is_father(item_id) and not is_children(item_id))
|
||||
|
||||
def is_node(item_id):
    """Return True when the item takes part in a tree: it is a father or a child."""
    return bool(is_father(item_id) or is_children(item_id))
|
||||
|
||||
def is_leaf(item_id):
    """Return True when the item has a father but no children of its own."""
    if is_father(item_id):
        return False
    return bool(is_children(item_id))
|
||||
|
||||
def is_domain_root(item_id):
    """Return True when a crawled item is the entry point of its domain.

    An item is a domain root if it is crawled and its father is either not a
    crawled item or belongs to a different domain.
    """
    if not is_crawled(item_id):
        return False

    own_domain = get_item_domain(item_id)
    father_id = get_item_parent(item_id)

    # A father that was not crawled cannot belong to the same domain tree.
    if not is_crawled(father_id):
        return True

    # Crawled father: the item is a root only when the domain changes.
    return get_item_domain(father_id) != own_domain
|
||||
|
||||
def get_nb_children(item_id):
    """Return the cardinality of the ``paste_children:<item_id>`` set."""
    children_key = 'paste_children:{}'.format(item_id)
    return r_serv_metadata.scard(children_key)
|
||||
|
||||
|
||||
def get_item_parent(item_id):
    """Return the ``father`` field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'father')
|
||||
|
||||
def get_item_children(item_id):
    """Return the members of the ``paste_children:<item_id>`` set as a list."""
    children_key = 'paste_children:{}'.format(item_id)
    members = r_serv_metadata.smembers(children_key)
    return list(members)
|
||||
|
||||
def add_item_parent(item_parent, item_id):
    """Delegate to ``item_basic.add_item_parent``.

    NOTE(review): no ``import item_basic`` is visible among this file's
    imports, so the name may be unresolved here. A later hunk header also
    shows another ``add_item_parent(parent_item_id, item_id)`` in this file,
    which would rebind the name if it is defined after this one. This
    wrapper looks like a leftover copy -- confirm and remove or fix.
    """
    return item_basic.add_item_parent(item_parent, item_id)
|
||||
|
||||
# # TODO: handle domain last origin in domain lib
|
||||
def _delete_node(item_id):
    """Detach an item from its children in the father/child tree.

    The ``father`` field is removed from each child's metadata hash, then the
    item's ``paste_children`` set is deleted.
    """
    # TODO (from the original, still disabled): when the item is crawled and
    # not yet deleted, also drop its 'real_link' metadata field.
    children_key = 'paste_children:{}'.format(item_id)
    for child_id in get_item_children(item_id):
        child_metadata_key = 'paste_metadata:{}'.format(child_id)
        r_serv_metadata.hdel(child_metadata_key, 'father')
    r_serv_metadata.delete(children_key)
|
||||
|
||||
# delete regular
|
||||
# simple if leaf
|
||||
|
||||
# delete item node
|
||||
|
||||
def get_all_domain_node_by_item_id(item_id, l_nodes=None):
    """Collect every descendant of *item_id* that stays on the same domain.

    The children tree is walked depth-first. A child is collected, and its
    own children explored, only when its domain equals the domain of the item
    it was reached from.

    :param item_id: item whose same-domain descendants are wanted
    :param l_nodes: optional list to append to (used by the recursion); a
                    fresh list is created when omitted. The original default
                    was a shared ``[]``, which made results leak from one
                    call into the next.
    :return: the list of collected item ids (``l_nodes`` itself when given)
    """
    if l_nodes is None:
        l_nodes = []
    domain = get_item_domain(item_id)
    for child_id in get_item_children(item_id):
        if get_item_domain(child_id) == domain:
            l_nodes.append(child_id)
            # The recursion appends to the same list in place.
            get_all_domain_node_by_item_id(child_id, l_nodes)
    return l_nodes
|
||||
|
||||
##-- --##
|
||||
|
||||
|
||||
def add_item_parent_by_parent_id(parent_type, parent_id, item_id):
|
||||
parent_item_id = get_obj_id_item_id(parent_type, parent_id)
|
||||
if parent_item_id:
|
||||
|
@ -53,9 +151,9 @@ def add_item_parent(parent_item_id, item_id):
|
|||
r_serv_metadata.sadd('paste_children:{}'.format(parent_item_id), item_id)
|
||||
return True
|
||||
|
||||
def add_map_obj_id_item_id(obj_id, item_id, obj_type):
|
||||
if obj_type == 'twitter_id':
|
||||
r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)
|
||||
# TODO:
|
||||
# FIXME:
|
||||
#### UNKNOWN SECTION ####
|
||||
|
||||
def get_obj_id_item_id(parent_type, parent_id):
|
||||
all_parents_type = ['twitter_id']
|
||||
|
@ -63,3 +161,11 @@ def get_obj_id_item_id(parent_type, parent_id):
|
|||
return r_serv_metadata.hget('map:twitter_id:item_id', parent_id)
|
||||
else:
|
||||
return None
|
||||
|
||||
def add_map_obj_id_item_id(obj_id, item_id, obj_type):
    """Record the object-id to item-id mapping for a supported object type.

    Only ``'twitter_id'`` is handled; any other type is ignored.
    """
    if obj_type != 'twitter_id':
        return
    r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)
|
||||
|
||||
# delete twitter id
|
||||
|
||||
##-- --##
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
import gzip
|
||||
import redis
|
||||
|
||||
from io import BytesIO
|
||||
|
@ -16,12 +15,15 @@ import Pgp
|
|||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||
import item_basic
|
||||
import domain_basic
|
||||
import ConfigLoader
|
||||
import Correlate_object
|
||||
import Decoded
|
||||
import Screenshot
|
||||
import telegram
|
||||
|
||||
from item_basic import *
|
||||
|
||||
config_loader = ConfigLoader.ConfigLoader()
|
||||
# get and sanitize PASTE DIRECTORY
|
||||
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
|
||||
|
@ -30,6 +32,7 @@ PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')
|
|||
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
||||
screenshot_directory = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "crawled_screenshot"))
|
||||
|
||||
config_loader = None
|
||||
|
||||
def exist_item(item_id):
|
||||
|
@ -71,22 +74,7 @@ def get_lines_info(item_id, item_content=None):
|
|||
|
||||
|
||||
def get_item_content(item_id):
|
||||
item_full_path = os.path.join(PASTES_FOLDER, item_id)
|
||||
try:
|
||||
item_content = r_cache.get(item_full_path)
|
||||
except UnicodeDecodeError:
|
||||
item_content = None
|
||||
except Exception as e:
|
||||
item_content = None
|
||||
if item_content is None:
|
||||
try:
|
||||
with gzip.open(item_full_path, 'r') as f:
|
||||
item_content = f.read().decode()
|
||||
r_cache.set(item_full_path, item_content)
|
||||
r_cache.expire(item_full_path, 300)
|
||||
except:
|
||||
item_content = ''
|
||||
return str(item_content)
|
||||
return item_basic.get_item_content(item_id)
|
||||
|
||||
# API
|
||||
def get_item(request_dict):
|
||||
|
@ -292,14 +280,8 @@ def get_domain(item_id):
|
|||
item_id = item_id[-1]
|
||||
return item_id[:-36]
|
||||
|
||||
def get_item_parent(item_id):
|
||||
return r_serv_metadata.hget('paste_metadata:{}'.format(item_id), 'father')
|
||||
|
||||
def get_item_children(item_id):
|
||||
return list(r_serv_metadata.smembers('paste_children:{}'.format(item_id)))
|
||||
|
||||
def add_item_parent(item_parent, item_id):
|
||||
return item_basic.add_item_parent(item_parent, item_id)
|
||||
def get_item_domain_with_port(item_id):
    """Return the ``domain`` field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'domain')
|
||||
|
||||
def get_item_link(item_id):
    """Return the ``real_link`` field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'real_link')
|
||||
|
@ -423,12 +405,32 @@ def delete_item(obj_id):
|
|||
else:
|
||||
for obj2_id in obj_correlations[correlation]:
|
||||
Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id)
|
||||
|
||||
# delete father/child
|
||||
delete_node(obj_id)
|
||||
|
||||
# delete item metadata
|
||||
r_serv_metadata.delete('paste_metadata:{}'.format(obj_id))
|
||||
|
||||
return True
|
||||
|
||||
### REQUIRE MORE WORK
|
||||
# delete child/son !!!
|
||||
### TODO in inport V2
|
||||
# delete from tracked items
|
||||
# delete from queue
|
||||
###
|
||||
return False
|
||||
|
||||
#### ####
|
||||
def delete_node(item_id):
    """Remove an item from the father/child tree, if it is part of one.

    Crawled items are first removed from their domain tree; the generic
    father/child links are then deleted.
    """
    if not is_node(item_id):
        return
    if is_crawled(item_id):
        delete_domain_node(item_id)
    item_basic._delete_node(item_id)
|
||||
|
||||
def delete_domain_node(item_id):
    """Remove a crawled item from its domain tree and delete its same-domain descendants.

    NOTE(review): the block nesting was reconstructed from a flattened diff;
    the descendant loop is assumed to run whether or not the item is a
    domain root -- confirm against the repository.
    """
    if is_domain_root(item_id):
        # remove from domain history
        # NOTE(review): assumes the stored value is 'domain:port'; a missing
        # field or a value without exactly one ':' makes this line raise.
        domain, port = get_item_domain_with_port(item_id).split(':')
        domain_basic.delete_domain_item_core(item_id, domain, port)
    # Each call to delete_item() goes back through delete_node() for the child.
    for child_id in get_all_domain_node_by_item_id(item_id):
        delete_item(child_id)
|
||||
|
|
|
@ -115,7 +115,7 @@ def showDomain():
|
|||
dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags'])
|
||||
dict_domain['history'] = domain.get_domain_history_with_status()
|
||||
dict_domain['crawler_history'] = domain.get_domain_items_crawled(items_link=True, epoch=epoch, item_screenshot=True, item_tag=True) # # TODO: handle multiple port
|
||||
if dict_domain['crawler_history']['items']:
|
||||
if dict_domain['crawler_history'].get('items', []):
|
||||
dict_domain['crawler_history']['random_item'] = random.choice(dict_domain['crawler_history']['items'])
|
||||
|
||||
return render_template("showDomain.html", dict_domain=dict_domain, bootstrap_label=bootstrap_label,
|
||||
|
|
Loading…
Reference in a new issue