chg: [launcher + modules] add module tests (Onion module)

This commit is contained in:
Terrtia 2021-05-17 18:03:30 +02:00
parent 869be4a493
commit 4896db98a3
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
7 changed files with 485 additions and 52 deletions

View file

@ -78,10 +78,10 @@ function helptext {
[-k | --killAll] Kill DB + Scripts [-k | --killAll] Kill DB + Scripts
[-ks | --killscript] Scripts [-ks | --killscript] Scripts
[-u | --update] Update AIL [-u | --update] Update AIL
[-c | --crawler] LAUNCH Crawlers [-ut | --thirdpartyUpdate] Update Web
[-f | --launchFeeder] LAUNCH Pystemon feeder [-t | --test] Launch Tests
[-t | --thirdpartyUpdate] Update Web
[-rp | --resetPassword] Reset Password [-rp | --resetPassword] Reset Password
[-f | --launchFeeder] LAUNCH Pystemon feeder
[-m | --menu] Display Advanced Menu [-m | --menu] Display Advanced Menu
[-h | --help] Help [-h | --help] Help
" "
@ -234,34 +234,34 @@ function launching_scripts {
} }
function launching_crawler { # function launching_crawler {
if [[ ! $iscrawler ]]; then # if [[ ! $iscrawler ]]; then
CONFIG=$AIL_HOME/configs/core.cfg # CONFIG=$AIL_HOME/configs/core.cfg
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}") # lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
#
IFS='-' read -ra PORTS <<< "$lport" # IFS='-' read -ra PORTS <<< "$lport"
if [ ${#PORTS[@]} -eq 1 ] # if [ ${#PORTS[@]} -eq 1 ]
then # then
first_port=${PORTS[0]} # first_port=${PORTS[0]}
last_port=${PORTS[0]} # last_port=${PORTS[0]}
else # else
first_port=${PORTS[0]} # first_port=${PORTS[0]}
last_port=${PORTS[1]} # last_port=${PORTS[1]}
fi # fi
#
screen -dmS "Crawler_AIL" # screen -dmS "Crawler_AIL"
sleep 0.1 # sleep 0.1
#
for ((i=first_port;i<=last_port;i++)); do # for ((i=first_port;i<=last_port;i++)); do
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x" # screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
sleep 0.1 # sleep 0.1
done # done
#
echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT # echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
else # else
echo -e $RED"\t* A screen is already launched"$DEFAULT # echo -e $RED"\t* A screen is already launched"$DEFAULT
fi # fi
} # }
function shutting_down_redis { function shutting_down_redis {
redis_dir=${AIL_HOME}/redis/src/ redis_dir=${AIL_HOME}/redis/src/
@ -490,6 +490,12 @@ function update_thirdparty {
fi fi
} }
function launch_tests() {
    # Run the test suite found in $AIL_HOME/tests with nosetests,
    # collecting coverage for the package given by $AIL_BIN.
    tests_dir=${AIL_HOME}/tests
    bin_dir=${AIL_BIN}
    # Expansions are quoted so a path containing whitespace stays one argument.
    python3 "$(which nosetests)" -w "$tests_dir" --with-coverage --cover-package="$bin_dir" -d
}
function reset_password() { function reset_password() {
echo -e "\t* Reseting UI admin password..." echo -e "\t* Reseting UI admin password..."
if checking_ardb && checking_redis; then if checking_ardb && checking_redis; then
@ -557,9 +563,6 @@ function menu_display {
Flask) Flask)
launch_flask; launch_flask;
;; ;;
Crawler)
launching_crawler;
;;
Killall) Killall)
killall; killall;
;; ;;
@ -614,12 +617,12 @@ while [ "$1" != "" ]; do
;; ;;
-u | --update ) update "--manual"; -u | --update ) update "--manual";
;; ;;
-t | --thirdpartyUpdate ) update_thirdparty; -t | --test ) launch_tests;
;;
-ut | --thirdpartyUpdate ) update_thirdparty;
;; ;;
-rp | --resetPassword ) reset_password; -rp | --resetPassword ) reset_password;
;; ;;
-c | --crawler ) launching_crawler;
;;
-f | --launchFeeder ) launch_feeder; -f | --launchFeeder ) launch_feeder;
;; ;;
-h | --help ) helptext; -h | --help ) helptext;

View file

@ -126,11 +126,9 @@ class Onion(AbstractModule):
# list of tuples: (url, subdomains, domain) # list of tuples: (url, subdomains, domain)
urls_to_crawl = [] urls_to_crawl = []
print(message)
id, score = message.split() id, score = message.split()
item = Item(id) item = Item(id)
item_content = item.get_content() item_content = item.get_content()
item_content = 'http://33333333.kingdom7rv6wkfzn.onion?sdsd=ooooo http://2222222.kingdom7rv6wkfzn.onion'
# max execution time on regex # max execution time on regex
res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content) res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
@ -145,10 +143,6 @@ class Onion(AbstractModule):
domain = url_unpack['domain'].decode().lower() domain = url_unpack['domain'].decode().lower()
except Exception as e: except Exception as e:
domain = url_unpack['domain'].lower() domain = url_unpack['domain'].lower()
print('----')
print(url)
print(subdomain)
print(domain)
if crawlers.is_valid_onion_domain(domain): if crawlers.is_valid_onion_domain(domain):
urls_to_crawl.append((url, subdomain, domain)) urls_to_crawl.append((url, subdomain, domain))
@ -164,8 +158,10 @@ class Onion(AbstractModule):
if crawlers.is_crawler_activated(): if crawlers.is_crawler_activated():
for to_crawl in urls_to_crawl: for to_crawl in urls_to_crawl:
print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}')
crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id()) crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
else: else:
print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}') self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
# keep manual fetcher ???? # keep manual fetcher ????
## Manually fetch first page if crawler is disabled ## Manually fetch first page if crawler is disabled
@ -176,11 +172,3 @@ if __name__ == "__main__":
module = Onion() module = Onion()
module.run() module.run()
##########################

373
bin/lib/ail_objects.py Executable file
View file

@ -0,0 +1,373 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import uuid
import redis
from abc import ABC
from flask import url_for
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
class AbstractObject(ABC):
    """
    Base class for the AIL objects: holds an object type and an object ID.
    """

    # first seen last/seen ??
    # # TODO: - tags
    #         - handle + refactor correlations
    #         - creates others objects

    def __init__(self, obj_type, id):
        """ Keep the identity of an AIL object

        :param obj_type: object type (item, ...)
        :param id: Object ID
        """
        self.type, self.id = obj_type, id

    def get_id(self):
        """Return the object ID given to the constructor."""
        return self.id

    def get_type(self):
        """Return the object type given to the constructor."""
        return self.type
# Build the "ARDB_Metadata" redis connection once, when the module is imported.
config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
# The loader is not used after this point: drop the reference.
config_loader = None
def is_valid_object_type(object_type):
    """
    Tell whether object_type is one of the accepted object types.

    :param object_type: object type name to check
    :return: True for 'domain', 'item', 'image' or 'decoded', False otherwise
    :rtype: bool
    """
    accepted_types = ['domain', 'item', 'image', 'decoded']
    return object_type in accepted_types
def get_all_objects():
    '''
    Return a new list with the names of all objects
    '''
    all_objects = [
        'domain',
        'paste',
        'pgp',
        'cryptocurrency',
        'decoded',
        'screenshot',
    ]
    return all_objects
def get_all_correlation_names():
    '''
    Return a new list with the names of all available correlations
    '''
    correlation_names = [
        'pgp',
        'cryptocurrency',
        'decoded',
        'screenshot',
    ]
    return correlation_names
def get_all_correlation_objects():
    '''
    Return a new list with the names of all correlated objects
    '''
    correlated_objects = ['domain', 'paste']
    return correlated_objects
def exist_object(object_type, correlation_id, type_id=None):
    '''
    Check if an object exists, delegating to the check of its object type.

    :param object_type: 'domain', 'paste'/'item', 'decoded', 'pgp', 'cryptocurrency' or 'screenshot'/'image'
    :param correlation_id: ID of the object to look for
    :param type_id: object sub type, only forwarded for 'pgp' and 'cryptocurrency'

    :return: result of the type specific check, False for any other object_type
    '''
    # NOTE(review): Domain, Item, Decoded, Pgp, Cryptocurrency and Screenshot
    # are not among the imports visible at the top of this file - confirm
    # they are brought into scope.
    if object_type == 'domain':
        return Domain.verify_if_domain_exist(correlation_id)
    if object_type in ('paste', 'item'):
        return Item.exist_item(correlation_id)
    if object_type == 'decoded':
        return Decoded.exist_decoded(correlation_id)
    if object_type == 'pgp':
        return Pgp.pgp._exist_corelation_field(type_id, correlation_id)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency._exist_corelation_field(type_id, correlation_id)
    if object_type in ('screenshot', 'image'):
        return Screenshot.exist_screenshot(correlation_id)
    return False
def get_obj_date(object_type, object_id):
    '''
    Return the date of an object as an int.

    Only the "item" object type is handled: None is returned for any other type.
    '''
    if object_type != "item":
        return None
    return int(Item.get_item_date(object_id))
# request_type => api or ui
def get_object_metadata(object_type, correlation_id, type_id=None):
    '''
    Return the metadata of an object, as built by the getter of its object type.

    :param object_type: 'domain', 'paste'/'item', 'decoded', 'pgp', 'cryptocurrency' or 'screenshot'/'image'
    :param correlation_id: ID of the object
    :param type_id: object sub type, only forwarded for 'pgp' and 'cryptocurrency'

    :return: result of the type specific getter, None for any other object_type
    '''
    if object_type == 'domain':
        domain = Domain.Domain(correlation_id)
        return domain.get_domain_metadata(tags=True)
    if object_type in ('paste', 'item'):
        item_options = {"id": correlation_id, "date": True, "date_separator": True, "tags": True}
        # only the first element of the get_item() result is returned
        return Item.get_item(item_options)[0]
    if object_type == 'decoded':
        return Decoded.get_decoded_metadata(correlation_id, nb_seen=True, size=True, file_type=True, tag=True)
    if object_type == 'pgp':
        return Pgp.pgp.get_metadata(type_id, correlation_id)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency.get_metadata(type_id, correlation_id)
    if object_type in ('screenshot', 'image'):
        return Screenshot.get_metadata(correlation_id)
    return None
def get_object_correlation(object_type, value, correlation_names=None, correlation_objects=None, requested_correl_type=None):
    '''
    Return the correlations of an object, as built by the getter of its object type.

    :param object_type: 'domain', 'paste'/'item', 'decoded', 'pgp', 'cryptocurrency' or 'screenshot'/'image'
    :param value: ID of the object
    :param correlation_names: forwarded for 'domain' and 'paste'/'item'
    :param correlation_objects: forwarded for every other handled object type
    :param requested_correl_type: object sub type, forwarded for 'pgp' and 'cryptocurrency'

    :return: result of the type specific getter, an empty dict for any other object_type
    '''
    if object_type == 'domain':
        return Domain.get_domain_all_correlation(value, correlation_names=correlation_names)
    if object_type in ('paste', 'item'):
        return Item.get_item_all_correlation(value, correlation_names=correlation_names)
    if object_type == 'decoded':
        return Decoded.get_decoded_correlated_object(value, correlation_objects=correlation_objects)
    if object_type == 'pgp':
        return Pgp.pgp.get_correlation_all_object(requested_correl_type, value, correlation_objects=correlation_objects)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency.get_correlation_all_object(requested_correl_type, value, correlation_objects=correlation_objects)
    if object_type in ('screenshot', 'image'):
        return Screenshot.get_screenshot_correlated_object(value, correlation_objects=correlation_objects)
    return {}
def get_correlation_node_icon(correlation_name, correlation_type=None, value=None):
    '''
    Used in UI Graph.
    Return the font awesome icon, the color and the radius of a graph node.

    :param correlation_name: correlation name
    :type correlation_name: str
    :param correlation_type: correlation type
    :type correlation_type: str, optional
    :param value: object ID, only read for 'decoded', 'domain' and 'paste' nodes
    :type value: str, optional

    :return: a dictionary {icon_class, icon_text, node_color, node_radius}
    :rtype: dict
    '''
    # default style, returned untouched for an unhandled correlation_name
    style = {"icon_class": 'fas', "icon_text": '', "node_color": "#332288", "node_radius": 6}

    if correlation_name == "pgp":
        style["node_color"] = '#44AA99'
        pgp_icons = {'key': '\uf084', 'name': '\uf507', 'mail': '\uf1fa'}
        style["icon_text"] = pgp_icons.get(correlation_type, 'times')

    elif correlation_name == 'cryptocurrency':
        style["node_color"] = '#DDCC77'
        # currencies with their own icon use the 'fab' icon class
        currency_icons = {'bitcoin': '\uf15a', 'monero': '\uf3d0', 'ethereum': '\uf42e'}
        if correlation_type in currency_icons:
            style["icon_class"] = 'fab'
            style["icon_text"] = currency_icons[correlation_type]
        else:
            style["icon_text"] = '\uf51e'

    elif correlation_name == 'decoded':
        style["node_color"] = '#88CCEE'
        # the icon depends on the part before the first '/' of the decoded item type
        main_type = Decoded.get_decoded_item_type(value).split('/')[0]
        file_icons = {'application': '\uf15b', 'audio': '\uf1c7', 'image': '\uf1c5', 'text': '\uf15c'}
        style["icon_text"] = file_icons.get(main_type, '\uf249')

    elif correlation_name in ('screenshot', 'image'):
        style["node_color"] = '#E1F5DF'
        style["icon_text"] = '\uf03e'

    elif correlation_name == 'domain':
        style["node_radius"] = 5
        style["node_color"] = '#3DA760'
        if Domain.get_domain_type(value) == 'onion':
            style["icon_text"] = '\uf06e'
        else:
            style["icon_class"] = 'fab'
            style["icon_text"] = '\uf13b'

    elif correlation_name == 'paste':
        style["node_radius"] = 5
        style["node_color"] = 'red' if Item.is_crawled(value) else '#332288'

    return style
def get_item_url(correlation_name, value, correlation_type=None):
    '''
    Warning: use only in flask

    Return the UI url of an object, '#' for an unhandled correlation_name.

    :param correlation_name: 'pgp', 'cryptocurrency', 'decoded', 'screenshot'/'image', 'domain' or 'item'/'paste'
    :param value: ID of the object
    :param correlation_type: object sub type, only used for 'pgp' and 'cryptocurrency'
    '''
    correlation_endpoint = 'correlation.show_correlation'

    if correlation_name in ("pgp", 'cryptocurrency'):
        return url_for(correlation_endpoint, object_type=correlation_name, type_id=correlation_type, correlation_id=value)
    if correlation_name == 'decoded':
        return url_for(correlation_endpoint, object_type="decoded", correlation_id=value)
    ### # TODO: rename me
    if correlation_name in ('screenshot', 'image'):
        return url_for(correlation_endpoint, object_type="screenshot", correlation_id=value)
    if correlation_name == 'domain':
        return url_for('crawler_splash.showDomain', domain=value)
    ### # TODO: remove 'paste'
    if correlation_name in ('item', 'paste'):
        return url_for('showsavedpastes.showsavedpaste', paste=value)
    return '#'
def get_obj_tag_table_keys(object_type):
    '''
    Warning: use only in flask (dynamic templates)

    Return the list of table keys of the "domain" object type, None for any other type.
    '''
    if object_type != "domain":
        return None
    # # TODO: add root screenshot
    return ['id', 'first_seen', 'last_check', 'status']
def create_graph_links(links_set):
    '''
    Convert (source, target) pairs into a list of {"source": ..., "target": ...} dicts.

    :param links_set: iterable of links, index 0 is the source and index 1 the target
    :return: one dict per link, in iteration order
    :rtype: list
    '''
    return [{"source": edge[0], "target": edge[1]} for edge in links_set]
def create_graph_nodes(nodes_set, root_node_id):
    '''
    Convert node IDs into the list of node dicts used by the UI graph.

    :param nodes_set: iterable of node IDs formatted as 'correlation_name;correlation_type;value'
    :param root_node_id: node ID of the graph root, drawn in orange with a radius of 7

    :return: one dict {id, style, text, url} per node ID, in iteration order
    :rtype: list
    '''
    graph_nodes_list = []
    for node_id in nodes_set:
        # Only the first two ';' are separators: with maxsplit=2 a value that
        # itself contains ';' is kept whole instead of breaking the unpacking.
        correlation_name, correlation_type, value = node_id.split(';', 2)
        dict_node = {"id": node_id}
        dict_node['style'] = get_correlation_node_icon(correlation_name, correlation_type, value)
        dict_node['text'] = value
        if node_id == root_node_id:
            # highlight the root node
            dict_node["style"]["node_color"] = 'orange'
            dict_node["style"]["node_radius"] = 7
        dict_node['url'] = get_item_url(correlation_name, value, correlation_type)
        graph_nodes_list.append(dict_node)
    return graph_nodes_list
def create_node_id(correlation_name, value, correlation_type=''):
    '''
    Build a graph node ID: 'correlation_name;correlation_type;value'.

    :param correlation_name: correlation name
    :param value: object ID
    :param correlation_type: object sub type, None is written as an empty string
    :rtype: str
    '''
    type_field = '' if correlation_type is None else correlation_type
    return f'{correlation_name};{type_field};{value}'
# # TODO: filter by correlation type => bitcoin, mail, ...
def get_graph_node_object_correlation(object_type, root_value, mode, correlation_names, correlation_objects, max_nodes=300, requested_correl_type=None):
    '''
    Build the nodes and the links of the correlation graph rooted at one object.

    First level nodes are the correlations of the root object, second level
    nodes are the correlations of each first level object.

    :param object_type: type of the root object
    :param root_value: ID of the root object, filtered out of the second level values
    :param mode: "union" links every first level correlation to the root,
                 "inter" links a first level correlation to the root only when
                 it yields a second level value different from root_value
    :param correlation_names: forwarded to get_object_correlation
    :param correlation_objects: forwarded to get_object_correlation
    :param max_nodes: once more than max_nodes nodes are collected, the loop
                      currently adding nodes is left
    :param requested_correl_type: sub type of the root object, used for the
                                  root node ID and the root correlation lookup

    :return: {"nodes": list of node dicts, "links": list of link dicts}
    :rtype: dict
    '''
    links = set()
    nodes = set()

    root_node_id = create_node_id(object_type, root_value, requested_correl_type)
    nodes.add(root_node_id)

    root_correlation = get_object_correlation(object_type, root_value, correlation_names, correlation_objects, requested_correl_type=requested_correl_type)
    for correl in root_correlation:
        # correlations grouped by sub type: {correl: {correl_type: values}}
        if correl in ('pgp', 'cryptocurrency'):
            for correl_type in root_correlation[correl]:
                for correl_val in root_correlation[correl][correl_type]:

                    # add correlation
                    correl_node_id = create_node_id(correl, correl_val, correl_type)

                    if mode=="union":
                        if len(nodes) > max_nodes:
                            break
                        nodes.add(correl_node_id)
                        links.add((root_node_id, correl_node_id))

                    # get second correlation
                    res = get_object_correlation(correl, correl_val, correlation_names, correlation_objects, requested_correl_type=correl_type)
                    if res:
                        for corr_obj in res:
                            for correl_key_val in res[corr_obj]:
                                #filter root value
                                if correl_key_val == root_value:
                                    continue

                                if len(nodes) > max_nodes:
                                    break
                                new_corel_1 = create_node_id(corr_obj, correl_key_val)
                                new_corel_2 = create_node_id(correl, correl_val, correl_type)
                                nodes.add(new_corel_1)
                                nodes.add(new_corel_2)
                                links.add((new_corel_1, new_corel_2))

                                if mode=="inter":
                                    nodes.add(correl_node_id)
                                    links.add((root_node_id, correl_node_id))

        # correlations without sub type: {correl: values}
        if correl in ('decoded', 'screenshot', 'domain', 'paste'):
            for correl_val in root_correlation[correl]:

                correl_node_id = create_node_id(correl, correl_val)
                if mode=="union":
                    if len(nodes) > max_nodes:
                        break
                    nodes.add(correl_node_id)
                    links.add((root_node_id, correl_node_id))

                res = get_object_correlation(correl, correl_val, correlation_names, correlation_objects)
                if res:
                    for corr_obj in res:
                        # second level values without sub type
                        if corr_obj in ('decoded', 'domain', 'paste', 'screenshot'):
                            for correl_key_val in res[corr_obj]:
                                #filter root value
                                if correl_key_val == root_value:
                                    continue

                                if len(nodes) > max_nodes:
                                    break
                                new_corel_1 = create_node_id(corr_obj, correl_key_val)
                                new_corel_2 = create_node_id(correl, correl_val)
                                nodes.add(new_corel_1)
                                nodes.add(new_corel_2)
                                links.add((new_corel_1, new_corel_2))

                                if mode=="inter":
                                    nodes.add(correl_node_id)
                                    links.add((root_node_id, correl_node_id))

                        # second level values grouped by sub type
                        if corr_obj in ('pgp', 'cryptocurrency'):
                            for correl_key_type in res[corr_obj]:
                                for correl_key_val in res[corr_obj][correl_key_type]:
                                    #filter root value
                                    if correl_key_val == root_value:
                                        continue

                                    if len(nodes) > max_nodes:
                                        break
                                    new_corel_1 = create_node_id(corr_obj, correl_key_val, correl_key_type)
                                    new_corel_2 = create_node_id(correl, correl_val)
                                    nodes.add(new_corel_1)
                                    nodes.add(new_corel_2)
                                    links.add((new_corel_1, new_corel_2))

                                    if mode=="inter":
                                        nodes.add(correl_node_id)
                                        links.add((root_node_id, correl_node_id))

    return {"nodes": create_graph_nodes(nodes, root_node_id), "links": create_graph_links(links)}
def get_obj_global_id(obj_type, obj_id, obj_sub_type=None):
    '''
    Build the global ID of an object.

    :param obj_type: object type
    :param obj_id: object ID
    :param obj_sub_type: object sub type, optional

    :return: 'obj_type:obj_sub_type:obj_id' when a sub type is given, else
             'obj_type:obj_id' with 'paste' written as 'item' and 'screenshot'
             written as 'image'
    :rtype: str
    '''
    if obj_sub_type:
        return f'{obj_type}:{obj_sub_type}:{obj_id}'

    # # TODO: remove me
    # old object type names and the name written in their place
    renamed_types = (('paste', 'item'), ('screenshot', 'image'))
    for old_name, new_name in renamed_types:
        if obj_type == old_name:
            return f'{new_name}:{obj_id}'
    return f'{obj_type}:{obj_id}'
######## API EXPOSED ########
def sanitize_object_type(object_type):
    '''
    Validate an object type received by the API.

    :param object_type: object type name to check
    :return: an (error dict, 400) tuple when is_valid_object_type() rejects
             object_type, None otherwise
    '''
    if is_valid_object_type(object_type):
        return None
    error = {'status': 'error', 'reason': 'Incorrect object_type'}
    return (error, 400)
######## ########

View file

@ -843,6 +843,21 @@ def get_all_queues_stats():
dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type) dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type)
return dict_stats return dict_stats
def is_domain_in_queue(queue_type, domain):
    """Check if domain is a member of the '<queue_type>_domain_crawler_queue' set."""
    domain_queue_key = f'{queue_type}_domain_crawler_queue'
    return r_serv_onion.sismember(domain_queue_key, domain)
def is_item_in_queue(queue_type, url, item_id, queue_name=None):
    """Check if '<url>;<item_id>' is a member of a crawler queue.

    :param queue_type: value formatted into each queue key
    :param url: url part of the queue member
    :param item_id: item ID part of the queue member
    :param queue_name: when given, queues come from get_queue_key_by_name(),
                       else from get_all_queues_keys()

    :return: True as soon as one queue holds the member, False otherwise
    :rtype: bool
    """
    # NOTE(review): the result of get_queue_key_by_name() is iterated below -
    # confirm it returns a collection of keys and not a single key string.
    queues = get_all_queues_keys() if queue_name is None else get_queue_key_by_name(queue_name)
    key = f'{url};{item_id}'
    return any(r_serv_onion.sismember(queue.format(queue_type), key) for queue in queues)
def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id): def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
date_month = datetime.now().strftime("%Y%m") date_month = datetime.now().strftime("%Y%m")
date = datetime.now().strftime("%Y%m%d") date = datetime.now().strftime("%Y%m%d")
@ -868,6 +883,17 @@ def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg) r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg)
print(f'sent to queue: {subdomain}') print(f'sent to queue: {subdomain}')
def queue_test_clean_up(queue_type, domain, item_id):
    """Remove the entries a test may have left in the crawler sets.

    :param queue_type: prefix/part of the set keys
    :param domain: member removed from the monthly "up" set and the domain queue
    :param item_id: item ID part of the member removed from the crawler queues
    """
    date_month = datetime.now().strftime("%Y%m")
    # NOTE(review): is_item_in_queue() looks members up as '<url>;<item_id>'
    # while this one is '<domain>;<item_id>' - confirm it matches what the
    # queues actually hold.
    msg = f'{domain};{item_id}'

    # Clean up: (set key, member) pairs, removed in this order
    entries = (
        (f'month_{queue_type}_up:{date_month}', domain),
        (f'{queue_type}_domain_crawler_queue', domain),
        (f'{queue_type}_crawler_discovery_queue', msg),
        (f'{queue_type}_crawler_queue', msg),
    )
    for set_key, member in entries:
        r_serv_onion.srem(set_key, member)
def remove_task_from_crawler_queue(queue_name, queue_type, key_to_remove): def remove_task_from_crawler_queue(queue_name, queue_type, key_to_remove):
r_serv_onion.srem(queue_name.format(queue_type), key_to_remove) r_serv_onion.srem(queue_name.format(queue_type), key_to_remove)
@ -1417,7 +1443,7 @@ def test_ail_crawlers():
#### ---- #### #### ---- ####
if __name__ == '__main__': #if __name__ == '__main__':
# res = get_splash_manager_version() # res = get_splash_manager_version()
# res = test_ail_crawlers() # res = test_ail_crawlers()
# res = is_test_ail_crawlers_successful() # res = is_test_ail_crawlers_successful()

Binary file not shown.

BIN
samples/2021/01/01/onion.gz Normal file

Binary file not shown.

43
tests/test_modules.py Normal file
View file

@ -0,0 +1,43 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import unittest
sys.path.append(os.environ['AIL_BIN'])
# Modules Classes
from Onion import Onion
# projects packages
import lib.crawlers as crawlers
class Test_Module_Onion(unittest.TestCase):
    """Tests of the Onion module."""

    def setUp(self):
        self.module_obj = Onion()

    def test_module(self):
        """Run the module on one item and check what it added to the crawler queues."""
        item_id = 'tests/2021/01/01/onion.gz'
        domain_1 = 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion'
        domain_2 = 'www.facebookcorewwwi.onion'

        # start from clean queues
        crawlers.queue_test_clean_up('onion', domain_1, item_id)
        # clean DB once the test is over, even when an assertion below fails
        self.addCleanup(crawlers.queue_test_clean_up, 'onion', domain_1, item_id)

        self.module_obj.compute(f'{item_id} 3')
        if crawlers.is_crawler_activated():
            ## check domain queues
            # all domains queue
            self.assertTrue(crawlers.is_domain_in_queue('onion', domain_1))
            # all url/item queue
            self.assertTrue(crawlers.is_item_in_queue('onion', f'http://{domain_1}', item_id))
            # domain blacklist
            self.assertFalse(crawlers.is_domain_in_queue('onion', domain_2))
            # invalid onion
            self.assertFalse(crawlers.is_domain_in_queue('onion', 'invalid.onion'))
        else:
            # # TODO: check warning logs
            pass