Merge branch 'dev' into scriptsenhance

This commit is contained in:
Thirion Aurélien 2021-05-14 14:50:42 +02:00 committed by GitHub
commit 8a95bbba87
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 412 additions and 265 deletions

View file

@ -22,254 +22,165 @@ Requirements
"""
import time
from packages import Paste
from pubsublogger import publisher
import datetime
import os
import base64
import subprocess
import redis
import signal
import sys
import re
from pyfaup.faup import Faup
from module.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers
from lib import regex_helper
from packages.Item import Item
from Helper import Process
## Manually fetch first page if crawler is disabled
# import base64
# import subprocess
#
# torclient_host = '127.0.0.1'
# torclient_port = 9050
#
# def fetch(p, r_cache, urls, domains):
# now = datetime.datetime.now()
# path = os.path.join('onions', str(now.year).zfill(4),
# str(now.month).zfill(2),
# str(now.day).zfill(2),
# str(int(time.mktime(now.utctimetuple()))))
# failed = []
# downloaded = []
# print('{} Urls to fetch'.format(len(urls)))
# for url, domain in zip(urls, domains):
# if r_cache.exists(url) or url in failed:
# continue
# to_fetch = base64.standard_b64encode(url.encode('utf8'))
# print('fetching url: {}'.format(to_fetch))
# process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
# stdout=subprocess.PIPE)
# while process.poll() is None:
# time.sleep(1)
#
# if process.returncode == 0:
# r_cache.setbit(url, 0, 1)
# r_cache.expire(url, 360000)
# downloaded.append(url)
# print('downloaded : {}'.format(downloaded))
# '''tempfile = process.stdout.read().strip()
# tempfile = tempfile.decode('utf8')
# #with open(tempfile, 'r') as f:
# filename = path + domain + '.gz'
# fetched = f.read()
# content = base64.standard_b64decode(fetched)
# save_path = os.path.join(os.environ['AIL_HOME'],
# p.config.get("Directories", "pastes"),
# filename)
# dirname = os.path.dirname(save_path)
# if not os.path.exists(dirname):
# os.makedirs(dirname)
# with open(save_path, 'w') as ff:
# ff.write(content)
# p.populate_set_out(save_path, 'Global')
# p.populate_set_out(url, 'ValidOnion')
# p.populate_set_out(fetched, 'FetchedOnion')'''
# yield url
# #os.unlink(tempfile)
# else:
# r_cache.setbit(url, 0, 0)
# r_cache.expire(url, 3600)
# failed.append(url)
# print('Failed at downloading', url)
# print(process.stdout.read())
# print('Failed:', len(failed), 'Downloaded:', len(downloaded))
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
class Onion(AbstractModule):
"""docstring for Onion module."""
signal.signal(signal.SIGALRM, timeout_handler)
def __init__(self):
super(Onion, self).__init__()
def fetch(p, r_cache, urls, domains, path):
failed = []
downloaded = []
print('{} Urls to fetch'.format(len(urls)))
for url, domain in zip(urls, domains):
if r_cache.exists(url) or url in failed:
continue
to_fetch = base64.standard_b64encode(url.encode('utf8'))
print('fetching url: {}'.format(to_fetch))
process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
stdout=subprocess.PIPE)
while process.poll() is None:
time.sleep(1)
config_loader = ConfigLoader()
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
self.r_onion = config_loader.get_redis_conn("ARDB_Onion")
if process.returncode == 0:
r_cache.setbit(url, 0, 1)
r_cache.expire(url, 360000)
downloaded.append(url)
print('downloaded : {}'.format(downloaded))
'''tempfile = process.stdout.read().strip()
tempfile = tempfile.decode('utf8')
#with open(tempfile, 'r') as f:
filename = path + domain + '.gz'
fetched = f.read()
content = base64.standard_b64decode(fetched)
save_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "pastes"),
filename)
dirname = os.path.dirname(save_path)
if not os.path.exists(dirname):
os.makedirs(dirname)
with open(save_path, 'w') as ff:
ff.write(content)
p.populate_set_out(save_path, 'Global')
p.populate_set_out(url, 'ValidOnion')
p.populate_set_out(fetched, 'FetchedOnion')'''
yield url
#os.unlink(tempfile)
self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")
# regex timeout
self.regex_timeout = 30
self.faup = crawlers.get_faup()
self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)
# activate_crawler = p.config.get("Crawler", "activate_crawler")
self.url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
self.i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(self.url_regex)
re.compile(self.i2p_regex)
self.redis_logger.info(f"Module: {self.module_name} Launched")
# TEMP var: SAVE I2P Domain (future I2P crawler)
self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")
def compute(self, message):
# list of tuples: (url, subdomains, domain)
urls_to_crawl = []
print(message)
id, score = message.split()
item = Item(id)
item_content = item.get_content()
item_content = 'http://33333333.kingdom7rv6wkfzn.onion?sdsd=ooooo http://2222222.kingdom7rv6wkfzn.onion'
# max execution time on regex
res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
for x in res:
# String to tuple
x = x[2:-2].replace(" '", "").split("',")
url = x[0]
subdomain = x[4].lower()
self.faup.decode(url)
url_unpack = self.faup.get()
try: ## TODO: # FIXME: check faup version
domain = url_unpack['domain'].decode().lower()
except Exception as e:
domain = url_unpack['domain'].lower()
print('----')
print(url)
print(subdomain)
print(domain)
if crawlers.is_valid_onion_domain(domain):
urls_to_crawl.append((url, subdomain, domain))
to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'
if not urls_to_crawl:
self.redis_logger.info(f'{to_print}Onion related;{item.get_id()}')
return
# TAG Item
msg = f'infoleak:automatic-detection="onion";{item.get_id()}'
self.send_message_to_queue('Tags', msg)
if crawlers.is_crawler_activated():
for to_crawl in urls_to_crawl:
crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
else:
r_cache.setbit(url, 0, 0)
r_cache.expire(url, 3600)
failed.append(url)
print('Failed at downloading', url)
print(process.stdout.read())
print('Failed:', len(failed), 'Downloaded:', len(downloaded))
self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
# keep manual fetcher ????
## Manually fetch first page if crawler is disabled
# for url in fetch(p, r_cache, urls, domains_list):
# publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
torclient_host = '127.0.0.1'
torclient_port = 9050
config_section = 'Onion'
p = Process(config_section)
r_cache = redis.StrictRedis(
host=p.config.get("Redis_Cache", "host"),
port=p.config.getint("Redis_Cache", "port"),
db=p.config.getint("Redis_Cache", "db"),
decode_responses=True)
r_onion = redis.StrictRedis(
host=p.config.get("ARDB_Onion", "host"),
port=p.config.getint("ARDB_Onion", "port"),
db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
# FUNCTIONS #
publisher.info("Script subscribed to channel onion_categ")
# FIXME For retro compatibility
channel = 'onion_categ'
# Getting the first message from redis.
message = p.get_from_set()
prec_filename = None
max_execution_time = p.config.getint("Onion", "max_execution_time")
# send to crawler:
activate_crawler = p.config.get("Crawler", "activate_crawler")
if activate_crawler == 'True':
activate_crawler = True
print('Crawler enabled')
else:
activate_crawler = False
print('Crawler disabled')
faup = Faup()
# Thanks to Faup project for this regex
# https://github.com/stricaud/faup
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(url_regex)
module = Onion()
module.run()
while True:
message = p.get_from_set()
if message is not None:
print(message)
filename, score = message.split()
# "For each new paste"
if prec_filename is None or filename != prec_filename:
domains_list = []
urls = []
PST = Paste.Paste(filename)
# max execution time on regex
signal.alarm(max_execution_time)
try:
for x in PST.get_regex(url_regex):
print(x)
# Extracting url with regex
url, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = x
if '.onion' in url:
print(url)
domains_list.append(domain)
urls.append(url)
except TimeoutException:
encoded_list = []
p.incr_module_timeout_statistic()
print ("{0} processing timeout".format(PST.p_rel_path))
continue
signal.alarm(0)
'''
for x in PST.get_regex(i2p_regex):
# Extracting url with regex
url, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = x
if '.i2p' in url:
print('add i2p')
print(domain)
if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain):
r_onion.sadd('i2p_domain', domain)
r_onion.sadd('i2p_link', url)
r_onion.sadd('i2p_domain_crawler_queue', domain)
msg = '{};{}'.format(url,PST.p_rel_path)
r_onion.sadd('i2p_crawler_queue', msg)
'''
to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
PST.p_name)
print(len(domains_list))
if len(domains_list) > 0:
if not activate_crawler:
publisher.warning('{}Detected {} .onion(s);{}'.format(
to_print, len(domains_list),PST.p_rel_path))
else:
publisher.info('{}Detected {} .onion(s);{}'.format(
to_print, len(domains_list),PST.p_rel_path))
now = datetime.datetime.now()
path = os.path.join('onions', str(now.year).zfill(4),
str(now.month).zfill(2),
str(now.day).zfill(2),
str(int(time.mktime(now.utctimetuple()))))
to_print = 'Onion;{};{};{};'.format(PST.p_source,
PST.p_date,
PST.p_name)
if activate_crawler:
date_month = datetime.datetime.now().strftime("%Y%m")
date = datetime.datetime.now().strftime("%Y%m%d")
for url in urls:
faup.decode(url)
url_unpack = faup.get()
## TODO: # FIXME: remove me
try:
domain = url_unpack['domain'].decode().lower()
except Exception as e:
domain = url_unpack['domain'].lower()
## TODO: blackilst by port ?
# check blacklist
if r_onion.sismember('blacklist_onion', domain):
continue
subdomain = re.findall(url_regex, url)
if len(subdomain) > 0:
subdomain = subdomain[0][4].lower()
else:
continue
# too many subdomain
if len(subdomain.split('.')) > 3:
subdomain = '{}.{}.onion'.format(subdomain[-3], subdomain[-2])
if not r_onion.sismember('month_onion_up:{}'.format(date_month), subdomain) and not r_onion.sismember('onion_down:'+date , subdomain):
if not r_onion.sismember('onion_domain_crawler_queue', subdomain):
print('send to onion crawler')
r_onion.sadd('onion_domain_crawler_queue', subdomain)
msg = '{};{}'.format(url,PST.p_rel_path)
if not r_onion.hexists('onion_metadata:{}'.format(subdomain), 'first_seen'):
r_onion.sadd('onion_crawler_discovery_queue', msg)
print('send to priority queue')
else:
r_onion.sadd('onion_crawler_queue', msg)
# tag if domain was up
if r_onion.sismember('full_onion_up', subdomain):
# TAG Item
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
p.populate_set_out(msg, 'Tags')
else:
for url in fetch(p, r_cache, urls, domains_list, path):
publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))
# TAG Item
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
p.populate_set_out(msg, 'Tags')
else:
publisher.info('{}Onion related;{}'.format(to_print, PST.p_rel_path))
prec_filename = filename
else:
publisher.debug("Script url is Idling 10s")
#print('Sleeping')
time.sleep(10)
##########################

View file

@ -31,6 +31,12 @@ def is_valid_object_type(object_type):
else:
return False
def check_correlation_object(object):
if object in get_all_correlation_objects():
return True
else:
return False
def is_valid_object_subtype(object_type, object_subtype):
if object_type == 'pgp':
return Pgp.pgp.is_valid_obj_subtype(object_subtype)
@ -462,7 +468,7 @@ def sanitise_correlation_names(correlation_names):
def sanitise_correlation_objects(correlation_objects):
'''
correlation_objects ex = 'domain,decoded'
correlation_objects ex = 'domain,paste'
'''
all_correlation_objects = get_all_correlation_objects()
if correlation_objects is None:
@ -478,6 +484,11 @@ def sanitise_correlation_objects(correlation_objects):
return all_correlation_objects
######## API EXPOSED ########
def api_check_correlation_objects(l_object):
for object in l_object:
if not check_correlation_object(object):
return ({"error": f"Invalid Object: {object}"}, 400)
def sanitize_object_type(object_type):
if not is_valid_object_type(object_type):
return ({'status': 'error', 'reason': 'Incorrect object_type'}, 400)

View file

@ -19,7 +19,8 @@ import uuid
import subprocess
from datetime import datetime, timedelta
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from pyfaup.faup import Faup
@ -41,6 +42,7 @@ r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
r_cache = config_loader.get_redis_conn("Redis_Cache")
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes"))
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
config_loader = None
faup = Faup()
@ -76,6 +78,64 @@ def is_valid_onion_domain(domain):
def get_faup():
return faup
# # # # # # # #
# #
# FAVICON #
# #
# # # # # # # #
def get_favicon_from_html(html, domain, url):
favicon_urls = extract_favicon_from_html(html, url)
# add root favicom
if not favicon_urls:
favicon_urls.add(f'{urlparse(url).scheme}://{domain}/favicon.ico')
print(favicon_urls)
return favicon_urls
def extract_favicon_from_html(html, url):
favicon_urls = set()
soup = BeautifulSoup(html, 'html.parser')
set_icons = set()
# If there are multiple <link rel="icon">s, the browser uses their media,
# type, and sizes attributes to select the most appropriate icon.
# If several icons are equally appropriate, the last one is used.
# If the most appropriate icon is later found to be inappropriate,
# for example because it uses an unsupported format,
# the browser proceeds to the next-most appropriate, and so on.
# # DEBUG: /!\ firefox load all favicon ???
# iOS Safari 'apple-touch-icon'
# Safari pinned tabs 'mask-icon'
# Android Chrome 'manifest'
# Edge and IE 12:
# - <meta name="msapplication-TileColor" content="#aaaaaa"> <meta name="theme-color" content="#ffffff">
# - <meta name="msapplication-config" content="/icons/browserconfig.xml">
# desktop browser 'shortcut icon' (older browser), 'icon'
for favicon_tag in ['icon', 'shortcut icon']:
if soup.head:
for icon in soup.head.find_all('link', attrs={'rel': lambda x : x and x.lower() == favicon_tag, 'href': True}):
set_icons.add(icon)
# # TODO: handle base64 favicon
for tag in set_icons:
icon_url = tag.get('href')
if icon_url:
if icon_url.startswith('//'):
icon_url = icon_url.replace('//', '/')
if icon_url.startswith('data:'):
# # TODO: handle base64 favicon
pass
else:
icon_url = urljoin(url, icon_url)
icon_url = urlparse(icon_url, scheme=urlparse(url).scheme).geturl()
favicon_urls.add(icon_url)
return favicon_urls
# # # - - # # #
################################################################################
# # TODO: handle prefix cookies
@ -412,6 +472,13 @@ def api_create_cookie(user_id, cookiejar_uuid, cookie_dict):
#### CRAWLER GLOBAL ####
## TODO: # FIXME: config db, dynamic load
def is_crawler_activated():
return activate_crawler == 'True'
def get_crawler_all_types():
return ['onion', 'regular']
def get_all_spash_crawler_status():
crawler_metadata = []
all_crawlers = r_cache.smembers('all_splash_crawlers')
@ -741,6 +808,83 @@ def api_add_crawled_item(dict_crawled):
create_item_metadata(item_id, domain, 'last_url', port, 'father')
#### CRAWLER QUEUES ####
## queues priority:
# 1 - priority queue
# 2 - discovery queue
# 3 - default queue
##
def get_all_queues_names():
return ['priority', 'discovery', 'default']
def get_all_queues_keys():
return ['{}_crawler_priority_queue', '{}_crawler_discovery_queue', '{}_crawler_queue']
def get_queue_key_by_name(queue_name):
if queue_name == 'priority':
return '{}_crawler_priority_queue'
elif queue_name == 'discovery':
return '{}_crawler_discovery_queue'
else: # default
return '{}_crawler_queue'
def get_stats_elem_to_crawl_by_queue_type(queue_type):
dict_stats = {}
for queue_name in get_all_queues_names():
dict_stats[queue_name] = r_serv_onion.scard(get_queue_key_by_name(queue_name).format(queue_type))
return dict_stats
def get_all_queues_stats():
print(get_all_crawlers_queues_types())
dict_stats = {}
for queue_type in get_crawler_all_types():
dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type)
for queue_type in get_all_splash():
dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type)
return dict_stats
def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
date_month = datetime.now().strftime("%Y%m")
date = datetime.now().strftime("%Y%m%d")
# check blacklist
if r_serv_onion.sismember(f'blacklist_{queue_type}', domain):
return
# too many subdomain # # FIXME: move to crawler module ?
if len(subdomain.split('.')) > 3:
subdomain = f'{subdomain[-3]}.{subdomain[-2]}.{queue_type}'
if not r_serv_onion.sismember(f'month_{queue_type}_up:{date_month}', subdomain) and not r_serv_onion.sismember(f'{queue_type}_down:{date}' , subdomain):
if not r_serv_onion.sismember(f'{queue_type}_domain_crawler_queue', subdomain):
r_serv_onion.sadd(f'{queue_type}_domain_crawler_queue', subdomain)
msg = f'{url};{item_id}'
# First time we see this domain => Add to discovery queue (priority=2)
if not r_serv_onion.hexists(f'{queue_type}_metadata:{subdomain}', 'first_seen'):
r_serv_onion.sadd(f'{queue_type}_crawler_discovery_queue', msg)
print(f'sent to priority queue: {subdomain}')
# Add to default queue (priority=3)
else:
r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg)
print(f'sent to queue: {subdomain}')
def remove_task_from_crawler_queue(queue_name, queue_type, key_to_remove):
r_serv_onion.srem(queue_name.format(queue_type), key_to_remove)
# # TODO: keep auto crawler ?
def clear_crawler_queues():
for queue_key in get_all_queues_keys():
for queue_type in get_crawler_all_types():
r_serv_onion.delete(queue_key.format(queue_type))
###################################################################################
def get_nb_elem_to_crawl_by_type(queue_type): # # TODO: rename me
nb = r_serv_onion.scard('{}_crawler_priority_queue'.format(queue_type))
nb += r_serv_onion.scard('{}_crawler_discovery_queue'.format(queue_type))
nb += r_serv_onion.scard('{}_crawler_queue'.format(queue_type))
return nb
###################################################################################
def get_all_crawlers_queues_types():
all_queues_types = set()
all_splash_name = get_all_crawlers_to_launch_splash_name()
@ -782,9 +926,8 @@ def get_elem_to_crawl_by_queue_type(l_queue_type):
# 2 - discovery queue
# 3 - normal queue
##
all_queue_key = ['{}_crawler_priority_queue', '{}_crawler_discovery_queue', '{}_crawler_queue']
for queue_key in all_queue_key:
for queue_key in get_all_queues_keys():
for queue_type in l_queue_type:
message = r_serv_onion.spop(queue_key.format(queue_type))
if message:
@ -801,12 +944,6 @@ def get_elem_to_crawl_by_queue_type(l_queue_type):
return {'url': url, 'paste': item_id, 'type_service': crawler_type, 'queue_type': queue_type, 'original_message': message}
return None
def get_nb_elem_to_crawl_by_type(queue_type):
nb = r_serv_onion.scard('{}_crawler_priority_queue'.format(queue_type))
nb += r_serv_onion.scard('{}_crawler_discovery_queue'.format(queue_type))
nb += r_serv_onion.scard('{}_crawler_queue'.format(queue_type))
return nb
#### ---- ####
# # # # # # # # # # # #
@ -1281,8 +1418,9 @@ def test_ail_crawlers():
#### ---- ####
if __name__ == '__main__':
res = get_splash_manager_version()
res = test_ail_crawlers()
res = is_test_ail_crawlers_successful()
print(res)
print(get_test_ail_crawlers_message())
# res = get_splash_manager_version()
# res = test_ail_crawlers()
# res = is_test_ail_crawlers_successful()
# print(res)
# print(get_test_ail_crawlers_message())
#print(get_all_queues_stats())

View file

@ -15,7 +15,6 @@ import time
from pubsublogger import publisher
from Helper import Process
class AbstractModule(ABC):
"""
Abstract Module class
@ -41,9 +40,10 @@ class AbstractModule(ABC):
self.redis_logger.port = 6380
# Channel name to publish logs
# # TODO: refactor logging
# If provided could be a namespaced channel like script:<ModuleName>
self.redis_logger.channel = logger_channel
# self.redis_logger.channel = 'script:%s'%(self.module_name)
# Run module endlessly
self.proceed = True
@ -54,6 +54,23 @@ class AbstractModule(ABC):
# Setup the I/O queues
self.process = Process(self.queue_name)
def get_message(self):
"""
Get message from the Redis Queue (QueueIn)
Input message can change between modules
ex: '<item id>'
"""
return self.process.get_from_set()
def send_message_to_queue(self, queue_name, message):
"""
Send message to queue
:param queue_name: queue or module name
:param message: message to send in queue
ex: send_to_queue(item_id, 'Global')
"""
self.process.populate_set_out(message, queue_name)
def run(self):
"""
@ -62,8 +79,8 @@ class AbstractModule(ABC):
# Endless loop processing messages from the input queue
while self.proceed:
# Get one message (paste) from the QueueIn (copy of Redis_Global publish)
message = self.process.get_from_set()
# Get one message (ex:item id) from the Redis Queue (QueueIn)
message = self.get_message()
if message:
try:

View file

@ -132,6 +132,11 @@ class Correlation(object):
stop = start + nb_elem -1
return r_serv_metadata.zrange(f'{self.correlation_name}_all:{subtype}', start, stop)
def paginate_list(self, obj_list, nb_elem=50, page=1):
start = (page - 1) * nb_elem
stop = start + nb_elem
return obj_list[start:stop]
def get_all_correlation_types(self):
'''
Gel all correlation types
@ -437,7 +442,10 @@ class Correlation(object):
return True
######## API EXPOSED ########
######## API EXPOSED ########
def api_check_objs_type(self, l_types):
for obj_type in l_types:
if not self.is_valid_obj_subtype(obj_type):
return ({"error": f"Invalid Type: {obj_type}"}, 400)
######## ########
######## ########

View file

@ -25,6 +25,7 @@ import Decoded
import Screenshot
import Username
from ail_objects import AbstractObject
from item_basic import *
config_loader = ConfigLoader.ConfigLoader()
@ -549,7 +550,42 @@ def delete_domain_node(item_id):
for child_id in get_all_domain_node_by_item_id(item_id):
delete_item(child_id)
class Item(AbstractObject):
"""
AIL Item Object. (strings)
"""
def __init__(self, id):
super(Item, self).__init__('item', id)
def get_date(self, separator=False):
"""
Returns Item date
"""
return item_basic.get_item_date(self.id, add_separator=separator)
def get_source(self):
"""
Returns Item source/feeder name
"""
return item_basic.get_source(self.id)
def get_basename(self):
return os.path.basename(self.id)
def get_content(self):
"""
Returns Item content
"""
return item_basic.get_item_content(self.id)
# if __name__ == '__main__':
#
# item = Item('')
# res = item.get_date(separator=True)
# print(res)
# import Domain
# domain = Domain.Domain('domain.onion')
# for domain_history in domain.get_domain_history():

View file

@ -50,12 +50,18 @@ def is_tags_safe(ltags):
#### Taxonomies - Galaxies ####
def get_taxonomie_from_tag(tag):
return tag.split(':')[0]
try:
return tag.split(':')[0]
except IndexError:
return None
def get_galaxy_from_tag(tag):
galaxy = tag.split(':')[1]
galaxy = galaxy.split('=')[0]
return galaxy
try:
galaxy = tag.split(':')[1]
galaxy = galaxy.split('=')[0]
return galaxy
except IndexError:
return None
def get_active_taxonomies():
return r_serv_tags.smembers('active_taxonomies')
@ -110,6 +116,8 @@ def is_valid_tags_taxonomies_galaxy(list_tags, list_tags_galaxy):
for tag in list_tags:
taxonomie = get_taxonomie_from_tag(tag)
if taxonomie is None:
return False
if taxonomie not in active_taxonomies:
return False
if not is_taxonomie_tag_enabled(taxonomie, tag):
@ -120,6 +128,8 @@ def is_valid_tags_taxonomies_galaxy(list_tags, list_tags_galaxy):
for tag in list_tags_galaxy:
galaxy = get_galaxy_from_tag(tag)
if galaxy is None:
return False
if galaxy not in active_galaxies:
return False
if not is_galaxy_tag_enabled(galaxy, tag):
@ -271,7 +281,7 @@ def update_tag_last_seen(tag, tag_first_seen, tag_last_seen):
if r_serv_tags.scard('{}:{}'.format(tag, tag_last_seen)) > 0:
r_serv_tags.hset('tag_metadata:{}'.format(tag), 'last_seen', tag_last_seen)
else:
# # TODO: # FIXME:
# # TODO: # FIXME:
#tag_last_seen = Date.date_substract_day(str(tag_last_seen))
#update_tag_last_seen(tag, tag_first_seen, tag_last_seen)
pass

View file

@ -77,6 +77,7 @@ minTopPassList=5
max_execution_time = 90
[Onion]
save_i2p = False
max_execution_time = 180
[PgpDump]

View file

@ -34,14 +34,13 @@ def sanitise_nb_max_nodes(nb_max_nodes):
return nb_max_nodes
def get_object_correlation_json(correlation_id, subtype, max_nodes):
max_nodes = sanitise_nb_max_nodes(max_nodes)
object_type = 'cryptocurrency'
max_nodes = sanitise_nb_max_nodes(max_nodes)
# ALL correlations
correlation_names = Correlate_object.sanitise_correlation_names('')
correlation_objects = Correlate_object.sanitise_correlation_objects('')
#correlation_objects = Correlate_object.sanitise_correlation_objects('')
correlation_objects = ['domain']
res = Correlate_object.get_graph_node_object_correlation(object_type, correlation_id, mode, correlation_names,
correlation_objects, requested_correl_type=subtype,
@ -51,23 +50,39 @@ def get_object_correlation_json(correlation_id, subtype, max_nodes):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Trigger backgroud update')
parser.add_argument('-t', '--type', help='Cryptocurrency type (bitcoin, bitcoin-cash, etherum, litecoin, monero, dash, zcash)', type=str, dest='type', required=True, default=None)
parser.add_argument('-p', '--page',help='page number' , type=int, default=1, dest='page')
parser.add_argument('-n', '--nb',help='number of addresses by page' , type=int, default=50, dest='nb_elem')
parser.add_argument('--node' ,help='correlation graph: max number of nodes' , type=int, default=50, dest='max_nodes')
parser.add_argument('-t', '--type', help='Cryptocurrency type (bitcoin, bitcoin-cash, ethereum, litecoin, monero, dash, zcash)', type=str, dest='type', required=True, default=None)
parser.add_argument('-a', '--address', help='Cryptocurrency addresses', type=str, dest='address', default=None, nargs="*")
parser.add_argument('-p', '--page',help='page number, default=1' , type=int, default=1, dest='page')
parser.add_argument('-n', '--nb',help='number of addresses by page, default=50' , type=int, default=50, dest='nb_elem')
parser.add_argument('-fo', '--filter_objects',help='filter correlation by object : domain, paste/item' , type=str, default=[], dest='objects', nargs="*")
parser.add_argument('--node' ,help='correlation graph: max number of nodes, default=50' , type=int, default=50, dest='max_nodes')
args = parser.parse_args()
subtype = args.type
if subtype is None:
parser.print_help()
sys.exit(0)
else:
res = Cryptocurrency.cryptocurrency.api_check_objs_type([args.type])
if res:
print(json.dumps(res[0]))
sys.exit(0)
page = sanitise_int(args.page, 1)
nb_elem = sanitise_int(args.nb_elem, 50)
max_nodes = sanitise_int(args.max_nodes, 300)
if args.objects:
res = Correlate_object.api_check_correlation_objects(args.objects)
if res:
print(json.dumps(res[0]))
sys.exit(0)
dict_json = {}
for address in Cryptocurrency.cryptocurrency.get_all_correlations_by_subtype_pagination(subtype, nb_elem=nb_elem, page=page):
if args.address:
l_addresse = Cryptocurrency.cryptocurrency.paginate_list(args.address, nb_elem=nb_elem, page=page)
else:
l_addresse = Cryptocurrency.cryptocurrency.get_all_correlations_by_subtype_pagination(subtype, nb_elem=nb_elem, page=page)
for address in l_addresse:
dict_json[address] = get_object_correlation_json(address, subtype, max_nodes)
print(json.dumps(dict_json))