mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 00:28:22 +00:00
chg: [launcher + modules] add module tests (Onion module)
This commit is contained in:
parent
869be4a493
commit
4896db98a3
7 changed files with 485 additions and 52 deletions
|
@ -78,10 +78,10 @@ function helptext {
|
|||
[-k | --killAll] Kill DB + Scripts
|
||||
[-ks | --killscript] Scripts
|
||||
[-u | --update] Update AIL
|
||||
[-c | --crawler] LAUNCH Crawlers
|
||||
[-f | --launchFeeder] LAUNCH Pystemon feeder
|
||||
[-t | --thirdpartyUpdate] Update Web
|
||||
[-ut | --thirdpartyUpdate] Update Web
|
||||
[-t | --test] Launch Tests
|
||||
[-rp | --resetPassword] Reset Password
|
||||
[-f | --launchFeeder] LAUNCH Pystemon feeder
|
||||
[-m | --menu] Display Advanced Menu
|
||||
[-h | --help] Help
|
||||
"
|
||||
|
@ -234,34 +234,34 @@ function launching_scripts {
|
|||
|
||||
}
|
||||
|
||||
# Launch one Crawler.py worker per configured port, each in its own window of a
# detached "Crawler_AIL" screen session.
#
# The port(s) come from the `splash_port` entry found after the [Crawler] header
# of configs/core.cfg: either a single port or an inclusive "first-last" range.
# When $iscrawler is non-empty nothing is launched and a notice is printed.
function launching_crawler {
    if [[ ! $iscrawler ]]; then
        CONFIG="${AIL_HOME}/configs/core.cfg"
        # third whitespace-separated field of the first splash_port line after [Crawler]
        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")

        # split "first-last" on '-'; a single port yields a one-element array
        IFS='-' read -ra PORTS <<< "$lport"
        first_port=${PORTS[0]}
        if [ "${#PORTS[@]}" -eq 1 ]; then
            last_port=${PORTS[0]}
        else
            last_port=${PORTS[1]}
        fi

        screen -dmS "Crawler_AIL"
        sleep 0.1

        for ((i=first_port;i<=last_port;i++)); do
            # trailing `read x` keeps the screen window open after Crawler.py exits
            screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
            sleep 0.1
        done

        # colour variables are quoted so they are never word-split or glob-expanded
        echo -e "${GREEN}\t* Launching Crawler_AIL scripts${DEFAULT}"
    else
        echo -e "${RED}\t* A screen is already launched${DEFAULT}"
    fi
}
|
||||
# function launching_crawler {
|
||||
# if [[ ! $iscrawler ]]; then
|
||||
# CONFIG=$AIL_HOME/configs/core.cfg
|
||||
# lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
|
||||
#
|
||||
# IFS='-' read -ra PORTS <<< "$lport"
|
||||
# if [ ${#PORTS[@]} -eq 1 ]
|
||||
# then
|
||||
# first_port=${PORTS[0]}
|
||||
# last_port=${PORTS[0]}
|
||||
# else
|
||||
# first_port=${PORTS[0]}
|
||||
# last_port=${PORTS[1]}
|
||||
# fi
|
||||
#
|
||||
# screen -dmS "Crawler_AIL"
|
||||
# sleep 0.1
|
||||
#
|
||||
# for ((i=first_port;i<=last_port;i++)); do
|
||||
# screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
|
||||
# sleep 0.1
|
||||
# done
|
||||
#
|
||||
# echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
||||
# else
|
||||
# echo -e $RED"\t* A screen is already launched"$DEFAULT
|
||||
# fi
|
||||
# }
|
||||
|
||||
function shutting_down_redis {
|
||||
redis_dir=${AIL_HOME}/redis/src/
|
||||
|
@ -490,6 +490,12 @@ function update_thirdparty {
|
|||
fi
|
||||
}
|
||||
|
||||
# Run the nose test suite located in ${AIL_HOME}/tests with python3,
# collecting coverage for the ${AIL_BIN} package.
function launch_tests() {
    tests_dir="${AIL_HOME}/tests"
    bin_dir="${AIL_BIN}"
    # every expansion is quoted so paths containing spaces stay single arguments
    python3 "$(command -v nosetests)" -w "$tests_dir" --with-coverage --cover-package="$bin_dir" -d
}
|
||||
|
||||
function reset_password() {
|
||||
echo -e "\t* Reseting UI admin password..."
|
||||
if checking_ardb && checking_redis; then
|
||||
|
@ -557,9 +563,6 @@ function menu_display {
|
|||
Flask)
|
||||
launch_flask;
|
||||
;;
|
||||
Crawler)
|
||||
launching_crawler;
|
||||
;;
|
||||
Killall)
|
||||
killall;
|
||||
;;
|
||||
|
@ -614,12 +617,12 @@ while [ "$1" != "" ]; do
|
|||
;;
|
||||
-u | --update ) update "--manual";
|
||||
;;
|
||||
-t | --thirdpartyUpdate ) update_thirdparty;
|
||||
-t | --test ) launch_tests;
|
||||
;;
|
||||
-ut | --thirdpartyUpdate ) update_thirdparty;
|
||||
;;
|
||||
-rp | --resetPassword ) reset_password;
|
||||
;;
|
||||
-c | --crawler ) launching_crawler;
|
||||
;;
|
||||
-f | --launchFeeder ) launch_feeder;
|
||||
;;
|
||||
-h | --help ) helptext;
|
||||
|
|
16
bin/Onion.py
16
bin/Onion.py
|
@ -126,11 +126,9 @@ class Onion(AbstractModule):
|
|||
# list of tuples: (url, subdomains, domain)
|
||||
urls_to_crawl = []
|
||||
|
||||
print(message)
|
||||
id, score = message.split()
|
||||
item = Item(id)
|
||||
item_content = item.get_content()
|
||||
item_content = 'http://33333333.kingdom7rv6wkfzn.onion?sdsd=ooooo http://2222222.kingdom7rv6wkfzn.onion'
|
||||
|
||||
# max execution time on regex
|
||||
res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
|
||||
|
@ -145,10 +143,6 @@ class Onion(AbstractModule):
|
|||
domain = url_unpack['domain'].decode().lower()
|
||||
except Exception as e:
|
||||
domain = url_unpack['domain'].lower()
|
||||
print('----')
|
||||
print(url)
|
||||
print(subdomain)
|
||||
print(domain)
|
||||
|
||||
if crawlers.is_valid_onion_domain(domain):
|
||||
urls_to_crawl.append((url, subdomain, domain))
|
||||
|
@ -164,8 +158,10 @@ class Onion(AbstractModule):
|
|||
|
||||
if crawlers.is_crawler_activated():
|
||||
for to_crawl in urls_to_crawl:
|
||||
print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}')
|
||||
crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
|
||||
else:
|
||||
print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
|
||||
self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
|
||||
# keep manual fetcher ????
|
||||
## Manually fetch first page if crawler is disabled
|
||||
|
@ -176,11 +172,3 @@ if __name__ == "__main__":
|
|||
|
||||
module = Onion()
|
||||
module.run()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
##########################
|
||||
|
|
373
bin/lib/ail_objects.py
Executable file
373
bin/lib/ail_objects.py
Executable file
|
@ -0,0 +1,373 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import redis
|
||||
|
||||
from abc import ABC
|
||||
from flask import url_for
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||
import ConfigLoader
|
||||
|
||||
class AbstractObject(ABC):
    """
    Base class shared by all AIL objects.
    """

    # first seen / last seen ??
    # # TODO: - tags
    #         - handle + refactor correlations
    #         - create other objects

    def __init__(self, obj_type, id):
        """ Store the identity of an AIL object

        :param obj_type: object type (item, ...)
        :param id: Object ID
        """
        self.type = obj_type
        self.id = id

    def get_id(self):
        """Return the object ID given at construction."""
        return self.id

    def get_type(self):
        """Return the object type given at construction."""
        return self.type
|
||||
|
||||
|
||||
# Module-level connection named "ARDB_Metadata", obtained through the project ConfigLoader.
config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
# the loader is only needed to build the connection: drop the reference afterwards
config_loader = None
|
||||
|
||||
def is_valid_object_type(object_type):
    '''
    Return True when object_type is one of: domain, item, image, decoded.
    '''
    return object_type in ('domain', 'item', 'image', 'decoded')
|
||||
|
||||
def get_all_objects():
    '''
    Return a new list with the names of all objects
    '''
    object_names = ['domain', 'paste', 'pgp', 'cryptocurrency', 'decoded', 'screenshot']
    return object_names
|
||||
|
||||
def get_all_correlation_names():
    '''
    List every available correlation name (a new list on each call)
    '''
    correlation_names = ['pgp', 'cryptocurrency', 'decoded', 'screenshot']
    return correlation_names
|
||||
|
||||
def get_all_correlation_objects():
    '''
    List every correlated object name (a new list on each call)
    '''
    correlated_objects = ['domain', 'paste']
    return correlated_objects
|
||||
|
||||
def exist_object(object_type, correlation_id, type_id=None):
    '''
    Tell whether an object exists, delegating to the module handling its type.

    :param object_type: domain, paste/item, decoded, pgp, cryptocurrency or screenshot/image
    :param correlation_id: object ID
    :param type_id: sub-type, only forwarded for pgp and cryptocurrency

    :return: the handler's answer, or False when object_type is not handled
    '''
    # NOTE(review): Domain, Item, Decoded, Pgp, Cryptocurrency and Screenshot are not
    # imported in the visible part of this module -- confirm they are in scope.
    if object_type == 'domain':
        return Domain.verify_if_domain_exist(correlation_id)
    if object_type in ('paste', 'item'):
        return Item.exist_item(correlation_id)
    if object_type == 'decoded':
        return Decoded.exist_decoded(correlation_id)
    if object_type == 'pgp':
        return Pgp.pgp._exist_corelation_field(type_id, correlation_id)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency._exist_corelation_field(type_id, correlation_id)
    if object_type in ('screenshot', 'image'):
        return Screenshot.exist_screenshot(correlation_id)
    return False
|
||||
|
||||
def get_obj_date(object_type, object_id):
    '''
    Return the date of an item as an int; None for every other object type.
    '''
    if object_type != "item":
        return None
    return int(Item.get_item_date(object_id))
|
||||
|
||||
# request_type => api or ui
|
||||
def get_object_metadata(object_type, correlation_id, type_id=None):
    '''
    Fetch the metadata of an object from the module handling its type.

    :param object_type: domain, paste/item, decoded, pgp, cryptocurrency or screenshot/image
    :param correlation_id: object ID
    :param type_id: sub-type, only forwarded for pgp and cryptocurrency

    :return: the handler's metadata, or None when object_type is not handled
    '''
    if object_type == 'domain':
        domain = Domain.Domain(correlation_id)
        return domain.get_domain_metadata(tags=True)
    if object_type in ('paste', 'item'):
        item_request = {"id": correlation_id, "date": True, "date_separator": True, "tags": True}
        # keep only the first element of the Item.get_item() result
        return Item.get_item(item_request)[0]
    if object_type == 'decoded':
        return Decoded.get_decoded_metadata(correlation_id, nb_seen=True, size=True, file_type=True, tag=True)
    if object_type == 'pgp':
        return Pgp.pgp.get_metadata(type_id, correlation_id)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency.get_metadata(type_id, correlation_id)
    if object_type in ('screenshot', 'image'):
        return Screenshot.get_metadata(correlation_id)
    return None
|
||||
|
||||
def get_object_correlation(object_type, value, correlation_names=None, correlation_objects=None, requested_correl_type=None):
    '''
    Fetch the correlations of an object from the module handling its type.

    :param object_type: domain, paste/item, decoded, pgp, cryptocurrency or screenshot/image
    :param value: object ID/value
    :param correlation_names: forwarded for domain and paste/item
    :param correlation_objects: forwarded for decoded, pgp, cryptocurrency and screenshot/image
    :param requested_correl_type: sub-type, forwarded for pgp and cryptocurrency

    :return: the handler's result, or an empty dict when object_type is not handled
    '''
    if object_type == 'domain':
        return Domain.get_domain_all_correlation(value, correlation_names=correlation_names)
    if object_type in ('paste', 'item'):
        return Item.get_item_all_correlation(value, correlation_names=correlation_names)
    if object_type == 'decoded':
        return Decoded.get_decoded_correlated_object(value, correlation_objects=correlation_objects)
    if object_type == 'pgp':
        return Pgp.pgp.get_correlation_all_object(requested_correl_type, value, correlation_objects=correlation_objects)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency.get_correlation_all_object(requested_correl_type, value, correlation_objects=correlation_objects)
    if object_type in ('screenshot', 'image'):
        return Screenshot.get_screenshot_correlated_object(value, correlation_objects=correlation_objects)
    return {}
|
||||
|
||||
def get_correlation_node_icon(correlation_name, correlation_type=None, value=None):
    '''
    Used in UI Graph.
    Return the Font Awesome icon and the node style for a given correlation_name.

    :param correlation_name: correlation name
    :type correlation_name: str
    :param correlation_type: correlation type
    :type correlation_type: str, optional
    :param value: object value, looked up for 'decoded', 'domain' and 'paste' nodes
    :type value: str, optional

    :return: a dictionary {icon_class, icon_text, node_color, node_radius}
    :rtype: dict
    '''
    # defaults, returned unchanged for an unknown correlation_name
    icon_class = 'fas'
    icon_text = ''
    node_color = "#332288"
    node_radius = 6

    if correlation_name == "pgp":
        node_color = '#44AA99'
        pgp_icons = {'key': '\uf084', 'name': '\uf507', 'mail': '\uf1fa'}
        # unknown sub-type: code point of the "times" glyph. The literal text 'times'
        # was the only icon_text that was not a glyph code point.
        icon_text = pgp_icons.get(correlation_type, '\uf00d')

    elif correlation_name == 'cryptocurrency':
        node_color = '#DDCC77'
        brand_icons = {'bitcoin': '\uf15a', 'monero': '\uf3d0', 'ethereum': '\uf42e'}
        if correlation_type in brand_icons:
            # these currencies use the brand ('fab') icon class
            icon_class = 'fab'
            icon_text = brand_icons[correlation_type]
        else:
            icon_text = '\uf51e'

    elif correlation_name == 'decoded':
        node_color = '#88CCEE'
        # the passed correlation_type is ignored: the part before '/' of the decoded item type is used
        correlation_type = Decoded.get_decoded_item_type(value).split('/')[0]
        decoded_icons = {'application': '\uf15b', 'audio': '\uf1c7', 'image': '\uf1c5', 'text': '\uf15c'}
        icon_text = decoded_icons.get(correlation_type, '\uf249')

    elif correlation_name == 'screenshot' or correlation_name == 'image':
        node_color = '#E1F5DF'
        icon_text = '\uf03e'

    elif correlation_name == 'domain':
        node_radius = 5
        node_color = '#3DA760'
        if Domain.get_domain_type(value) == 'onion':
            icon_text = '\uf06e'
        else:
            icon_class = 'fab'
            icon_text = '\uf13b'

    elif correlation_name == 'paste':
        node_radius = 5
        # crawled items are drawn in red; others keep the default colour, no icon either way
        if Item.is_crawled(value):
            node_color = 'red'
        else:
            node_color = '#332288'

    return {"icon_class": icon_class, "icon_text": icon_text, "node_color": node_color, "node_radius": node_radius}
|
||||
|
||||
def get_item_url(correlation_name, value, correlation_type=None):
    '''
    Warning: use only in flask

    Build the UI URL of an object; '#' when correlation_name is not handled.
    '''
    if correlation_name in ('pgp', 'cryptocurrency'):
        return url_for('correlation.show_correlation', object_type=correlation_name, type_id=correlation_type, correlation_id=value)
    if correlation_name == 'decoded':
        return url_for('correlation.show_correlation', object_type="decoded", correlation_id=value)
    if correlation_name in ('screenshot', 'image'): ### # TODO: rename me
        # 'image' is linked as object_type "screenshot" too
        return url_for('correlation.show_correlation', object_type="screenshot", correlation_id=value)
    if correlation_name == 'domain':
        return url_for('crawler_splash.showDomain', domain=value)
    if correlation_name in ('item', 'paste'): ### # TODO: remove 'paste'
        return url_for('showsavedpastes.showsavedpaste', paste=value)
    return '#'
|
||||
|
||||
def get_obj_tag_table_keys(object_type):
    '''
    Warning: use only in flask (dynamic templates)

    Return the list of table keys for a domain; None for any other object_type.
    '''
    if object_type != "domain":
        return None
    domain_keys = ['id', 'first_seen', 'last_check', 'status'] # # TODO: add root screenshot
    return domain_keys
|
||||
|
||||
|
||||
def create_graph_links(links_set):
    '''
    Turn an iterable of (source, target) pairs into a list of
    {"source": ..., "target": ...} dictionaries, in iteration order.
    '''
    return [{"source": edge[0], "target": edge[1]} for edge in links_set]
|
||||
|
||||
def create_graph_nodes(nodes_set, root_node_id):
    '''
    Build the list of graph nodes (id, style, text, url) from a set of node IDs.

    :param nodes_set: node IDs formatted as 'correlation_name;correlation_type;value'
    :param root_node_id: ID of the root node, drawn in orange with a larger radius

    :return: a list of node dictionaries {id, style, text, url}
    :rtype: list
    '''
    graph_nodes_list = []
    for node_id in nodes_set:
        # maxsplit=2: exactly three fields, so a value that itself contains ';' stays whole
        correlation_name, correlation_type, value = node_id.split(';', 2)
        dict_node = {
            "id": node_id,
            "style": get_correlation_node_icon(correlation_name, correlation_type, value),
            "text": value,
        }
        if node_id == root_node_id:
            dict_node["style"]["node_color"] = 'orange'
            dict_node["style"]["node_radius"] = 7
        dict_node['url'] = get_item_url(correlation_name, value, correlation_type)
        graph_nodes_list.append(dict_node)
    return graph_nodes_list
|
||||
|
||||
def create_node_id(correlation_name, value, correlation_type=''):
    '''
    Build a node ID: 'correlation_name;correlation_type;value'.
    A correlation_type of None is written as an empty field.
    '''
    type_field = '' if correlation_type is None else correlation_type
    return f'{correlation_name};{type_field};{value}'
|
||||
|
||||
|
||||
|
||||
# # TODO: filter by correlation type => bitcoin, mail, ...
|
||||
def get_graph_node_object_correlation(object_type, root_value, mode, correlation_names, correlation_objects, max_nodes=300, requested_correl_type=None):
|
||||
links = set()
|
||||
nodes = set()
|
||||
|
||||
root_node_id = create_node_id(object_type, root_value, requested_correl_type)
|
||||
nodes.add(root_node_id)
|
||||
|
||||
root_correlation = get_object_correlation(object_type, root_value, correlation_names, correlation_objects, requested_correl_type=requested_correl_type)
|
||||
for correl in root_correlation:
|
||||
if correl in ('pgp', 'cryptocurrency'):
|
||||
for correl_type in root_correlation[correl]:
|
||||
for correl_val in root_correlation[correl][correl_type]:
|
||||
|
||||
# add correlation
|
||||
correl_node_id = create_node_id(correl, correl_val, correl_type)
|
||||
|
||||
if mode=="union":
|
||||
if len(nodes) > max_nodes:
|
||||
break
|
||||
nodes.add(correl_node_id)
|
||||
links.add((root_node_id, correl_node_id))
|
||||
|
||||
# get second correlation
|
||||
res = get_object_correlation(correl, correl_val, correlation_names, correlation_objects, requested_correl_type=correl_type)
|
||||
if res:
|
||||
for corr_obj in res:
|
||||
for correl_key_val in res[corr_obj]:
|
||||
#filter root value
|
||||
if correl_key_val == root_value:
|
||||
continue
|
||||
|
||||
if len(nodes) > max_nodes:
|
||||
break
|
||||
new_corel_1 = create_node_id(corr_obj, correl_key_val)
|
||||
new_corel_2 = create_node_id(correl, correl_val, correl_type)
|
||||
nodes.add(new_corel_1)
|
||||
nodes.add(new_corel_2)
|
||||
links.add((new_corel_1, new_corel_2))
|
||||
|
||||
if mode=="inter":
|
||||
nodes.add(correl_node_id)
|
||||
links.add((root_node_id, correl_node_id))
|
||||
if correl in ('decoded', 'screenshot', 'domain', 'paste'):
|
||||
for correl_val in root_correlation[correl]:
|
||||
|
||||
correl_node_id = create_node_id(correl, correl_val)
|
||||
if mode=="union":
|
||||
if len(nodes) > max_nodes:
|
||||
break
|
||||
nodes.add(correl_node_id)
|
||||
links.add((root_node_id, correl_node_id))
|
||||
|
||||
res = get_object_correlation(correl, correl_val, correlation_names, correlation_objects)
|
||||
if res:
|
||||
for corr_obj in res:
|
||||
if corr_obj in ('decoded', 'domain', 'paste', 'screenshot'):
|
||||
for correl_key_val in res[corr_obj]:
|
||||
#filter root value
|
||||
if correl_key_val == root_value:
|
||||
continue
|
||||
|
||||
if len(nodes) > max_nodes:
|
||||
break
|
||||
new_corel_1 = create_node_id(corr_obj, correl_key_val)
|
||||
new_corel_2 = create_node_id(correl, correl_val)
|
||||
nodes.add(new_corel_1)
|
||||
nodes.add(new_corel_2)
|
||||
links.add((new_corel_1, new_corel_2))
|
||||
|
||||
if mode=="inter":
|
||||
nodes.add(correl_node_id)
|
||||
links.add((root_node_id, correl_node_id))
|
||||
|
||||
if corr_obj in ('pgp', 'cryptocurrency'):
|
||||
for correl_key_type in res[corr_obj]:
|
||||
for correl_key_val in res[corr_obj][correl_key_type]:
|
||||
#filter root value
|
||||
if correl_key_val == root_value:
|
||||
continue
|
||||
|
||||
if len(nodes) > max_nodes:
|
||||
break
|
||||
new_corel_1 = create_node_id(corr_obj, correl_key_val, correl_key_type)
|
||||
new_corel_2 = create_node_id(correl, correl_val)
|
||||
nodes.add(new_corel_1)
|
||||
nodes.add(new_corel_2)
|
||||
links.add((new_corel_1, new_corel_2))
|
||||
|
||||
if mode=="inter":
|
||||
nodes.add(correl_node_id)
|
||||
links.add((root_node_id, correl_node_id))
|
||||
|
||||
|
||||
return {"nodes": create_graph_nodes(nodes, root_node_id), "links": create_graph_links(links)}
|
||||
|
||||
|
||||
def get_obj_global_id(obj_type, obj_id, obj_sub_type=None):
    '''
    Build the global ID of an object: 'type:subtype:id' when a sub-type is given,
    'type:id' otherwise.
    '''
    if obj_sub_type:
        return f'{obj_type}:{obj_sub_type}:{obj_id}'

    # legacy names are only renamed when there is no sub-type
    # # TODO: remove me
    if obj_type == 'paste':
        obj_type = 'item'
    # # TODO: remove me
    elif obj_type == 'screenshot':
        obj_type = 'image'
    return f'{obj_type}:{obj_id}'
|
||||
|
||||
######## API EXPOSED ########
|
||||
def sanitize_object_type(object_type):
    '''
    Return an (error dict, 400) tuple when object_type is not valid, None otherwise.
    '''
    if is_valid_object_type(object_type):
        return None
    error = {'status': 'error', 'reason': 'Incorrect object_type'}
    return (error, 400)
|
||||
######## ########
|
|
@ -843,6 +843,21 @@ def get_all_queues_stats():
|
|||
dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type)
|
||||
return dict_stats
|
||||
|
||||
def is_domain_in_queue(queue_type, domain):
    '''
    Check whether domain is a member of the '<queue_type>_domain_crawler_queue' set.
    '''
    domain_queue_key = f'{queue_type}_domain_crawler_queue'
    return r_serv_onion.sismember(domain_queue_key, domain)
|
||||
|
||||
def is_item_in_queue(queue_type, url, item_id, queue_name=None):
    '''
    Check whether the 'url;item_id' entry is a member of a crawler queue.

    :param queue_type: value substituted into each queue key template
    :param url: url part of the entry
    :param item_id: item ID part of the entry
    :param queue_name: restrict the search to this queue; all queues when None

    :return: True as soon as one queue contains the entry, False otherwise
    :rtype: bool
    '''
    # NOTE(review): the result of get_queue_key_by_name() is iterated like the result
    # of get_all_queues_keys() -- confirm it returns a collection of key templates.
    queues = get_all_queues_keys() if queue_name is None else get_queue_key_by_name(queue_name)
    key = f'{url};{item_id}'
    return any(r_serv_onion.sismember(queue.format(queue_type), key) for queue in queues)
|
||||
|
||||
def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
|
||||
date_month = datetime.now().strftime("%Y%m")
|
||||
date = datetime.now().strftime("%Y%m%d")
|
||||
|
@ -868,6 +883,17 @@ def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
|
|||
r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg)
|
||||
print(f'sent to queue: {subdomain}')
|
||||
|
||||
def queue_test_clean_up(queue_type, domain, item_id):
    '''
    Remove the entries left in the crawler sets by a test run.

    :param queue_type: prefix of the set names (e.g. 'onion')
    :param domain: domain removed from the monthly "up" set and the domain queue
    :param item_id: combined with domain as 'domain;item_id' for the item queues
    '''
    current_month = datetime.now().strftime("%Y%m")
    # sets keyed by domain
    for domain_set in (f'month_{queue_type}_up:{current_month}', f'{queue_type}_domain_crawler_queue'):
        r_serv_onion.srem(domain_set, domain)

    # sets keyed by 'domain;item_id'
    # NOTE(review): is_item_in_queue() looks entries up as 'url;item_id' -- confirm this key matches what gets queued
    queue_entry = f'{domain};{item_id}'
    for item_set in (f'{queue_type}_crawler_discovery_queue', f'{queue_type}_crawler_queue'):
        r_serv_onion.srem(item_set, queue_entry)
|
||||
|
||||
|
||||
def remove_task_from_crawler_queue(queue_name, queue_type, key_to_remove):
    '''
    Remove key_to_remove from the set whose name is queue_name formatted with queue_type.
    '''
    queue_key = queue_name.format(queue_type)
    r_serv_onion.srem(queue_key, key_to_remove)
|
||||
|
||||
|
@ -1417,7 +1443,7 @@ def test_ail_crawlers():
|
|||
|
||||
#### ---- ####
|
||||
|
||||
if __name__ == '__main__':
|
||||
#if __name__ == '__main__':
|
||||
# res = get_splash_manager_version()
|
||||
# res = test_ail_crawlers()
|
||||
# res = is_test_ail_crawlers_successful()
|
||||
|
|
BIN
samples/2021/01/01/keys_certificat_sample.gz
Normal file
BIN
samples/2021/01/01/keys_certificat_sample.gz
Normal file
Binary file not shown.
BIN
samples/2021/01/01/onion.gz
Normal file
BIN
samples/2021/01/01/onion.gz
Normal file
Binary file not shown.
43
tests/test_modules.py
Normal file
43
tests/test_modules.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
|
||||
# Modules Classes
|
||||
from Onion import Onion
|
||||
|
||||
# projects packages
|
||||
import lib.crawlers as crawlers
|
||||
|
||||
class Test_Module_Onion(unittest.TestCase):
    """Run the Onion module on a sample item and check the crawler queues."""

    def setUp(self):
        self.module_obj = Onion()

    def test_module(self):
        """Compute one item; when the crawler is activated, check what got queued."""
        item_id = 'tests/2021/01/01/onion.gz'
        domain_1 = 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion'
        domain_2 = 'www.facebookcorewwwi.onion'

        # start from a clean DB, and clean it again afterwards even when an assertion fails
        crawlers.queue_test_clean_up('onion', domain_1, item_id)
        self.addCleanup(crawlers.queue_test_clean_up, 'onion', domain_1, item_id)

        self.module_obj.compute(f'{item_id} 3')
        if crawlers.is_crawler_activated():
            ## check domain queues
            # all domains queue
            self.assertTrue(crawlers.is_domain_in_queue('onion', domain_1))
            # all url/item queue
            self.assertTrue(crawlers.is_item_in_queue('onion', f'http://{domain_1}', item_id))
            # domain blacklist
            self.assertFalse(crawlers.is_domain_in_queue('onion', domain_2))
            # invalid onion
            self.assertFalse(crawlers.is_domain_in_queue('onion', 'invalid.onion'))
        else:
            # # TODO: check warning logs
            pass
|
Loading…
Reference in a new issue