From 2a1cd4a00947014f1cfcda148a2220731f54fd0d Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 23 Apr 2019 11:15:34 +0200
Subject: [PATCH] chg: [Onion, crawler config] auto crawler: add config by url, fix onions tagging + filter subdomains

---
 OVERVIEW.md                                 |  7 +++-
 bin/Crawler.py                              | 13 ++++---
 bin/Onion.py                                | 37 ++++++++++++++-----
 .../hiddenServices/Flask_hiddenServices.py  |  8 ++--
 4 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/OVERVIEW.md b/OVERVIEW.md
index 3e325ceb..500aea12 100644
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@@ -167,11 +167,16 @@ Redis and ARDB overview
 | ------ | ------ | ------ |
 | crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
 
-##### Key:
+##### crawler config:
 | Key | Value |
 | ------ | ------ |
 | crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
 
+##### automatic crawler config:
+| Key | Value |
+| ------ | ------ |
+| crawler\_config:**crawler mode**:**service type**:**domain**:**url** | **json config** |
+
 ###### exemple json config:
 ```json
 {
diff --git a/bin/Crawler.py b/bin/Crawler.py
index b6ebdcf8..ed76783e 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -105,9 +105,12 @@ def get_elem_to_crawl(rotation_mode):
 
     return message
 
-def get_crawler_config(redis_server, mode, service_type, domain):
+def get_crawler_config(redis_server, mode, service_type, domain, url=None):
     crawler_options = {}
-    config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
+    if mode=='auto':
+        config = redis_server.get('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url))
+    else:
+        config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
     if config is None:
         config = {}
     else:
@@ -123,7 +126,7 @@ def get_crawler_config(redis_server, mode, service_type, domain):
         redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
     return crawler_options
 
-def load_crawler_config(service_type, domain, paste, date):
+def load_crawler_config(service_type, domain, paste, url, date):
     crawler_config = {}
     crawler_config['splash_url'] = splash_url
     crawler_config['item'] = paste
@@ -134,7 +137,7 @@ def load_crawler_config(service_type, domain, paste, date):
     # Auto and Manual Crawling
     # Auto ################################################# create new entry, next crawling => here or when ended ?
     if paste == 'auto':
-        crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain)
+        crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain, url=url)
         crawler_config['requested'] = True
     # Manual
     elif paste == 'manual':
@@ -342,7 +345,7 @@ if __name__ == '__main__':
             # Update crawler status type
             r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
 
-            crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
+            crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
             # check if default crawler
             if not crawler_config['requested']:
                 # Auto crawl only if service not up this month
diff --git a/bin/Onion.py b/bin/Onion.py
index 30b62ba6..f511714e 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -32,6 +32,8 @@ import redis
 import signal
 import re
 
+from pyfaup.faup import Faup
+
 from Helper import Process
 
 class TimeoutException(Exception):
@@ -132,6 +134,8 @@ if __name__ == "__main__":
         activate_crawler = False
         print('Crawler disabled')
 
+    faup = Faup()
+
     # Thanks to Faup project for this regex
     # https://github.com/stricaud/faup
     url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -218,27 +222,40 @@ if __name__ == "__main__":
 
                 date = datetime.datetime.now().strftime("%Y%m%d")
                 for url in urls:
-                    domain = re.findall(url_regex, url)
-                    if len(domain) > 0:
-                        domain = domain[0][4]
+                    faup.decode(url)
+                    url_unpack = faup.get()
+                    domain = url_unpack['domain'].decode()
+
+                    ## TODO: blacklist by port ?
+                    # check blacklist
+                    if redis_crawler.sismember('blacklist_onion', domain):
+                        continue
+
+                    subdomain = re.findall(url_regex, url)
+                    if len(subdomain) > 0:
+                        subdomain = subdomain[0][4]
                     else:
                         continue
 
                     # too many subdomain
-                    if len(domain.split('.')) > 5:
-                        continue
+                    if len(subdomain.split('.')) > 3:
+                        subdomain = '{}.{}.onion'.format(subdomain.split('.')[-3], subdomain.split('.')[-2])
 
-                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
-                        if not r_onion.sismember('onion_domain_crawler_queue', domain):
+                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), subdomain) and not r_onion.sismember('onion_down:'+date , subdomain):
+                        if not r_onion.sismember('onion_domain_crawler_queue', subdomain):
                             print('send to onion crawler')
-                            r_onion.sadd('onion_domain_crawler_queue', domain)
+                            r_onion.sadd('onion_domain_crawler_queue', subdomain)
                             msg = '{};{}'.format(url,PST.p_rel_path)
-                            if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
+                            if not r_onion.hexists('onion_metadata:{}'.format(subdomain), 'first_seen'):
                                 r_onion.sadd('onion_crawler_priority_queue', msg)
                                 print('send to priority queue')
                             else:
                                 r_onion.sadd('onion_crawler_queue', msg)
-                            #p.populate_set_out(msg, 'Crawler')
+                    # tag if domain was up
+                    if r_onion.sismember('full_onion_up', subdomain):
+                        # TAG Item
+                        msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
+                        p.populate_set_out(msg, 'Tags')
 
             else:
                 for url in fetch(p, r_cache, urls, domains_list, path):
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index dc51446b..0d4426f4 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -194,11 +194,11 @@ def get_crawler_splash_status(type):
 
     return crawler_metadata
 
-def create_crawler_config(mode, service_type, crawler_config, domain):
+def create_crawler_config(mode, service_type, crawler_config, domain, url=None):
     if mode == 'manual':
         r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
     elif mode == 'auto':
-        r_serv_onion.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
+        r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url), json.dumps(crawler_config))
 
 def send_url_to_crawl_in_queue(mode, service_type, url):
     r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))
@@ -212,7 +212,7 @@ def delete_auto_crawler(url):
     # remove from set
     r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
     # remove config
-    r_serv_onion.delete('crawler_config:auto:{}:{}'.format(type, domain))
+    r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(type, domain, url))
     # remove from queue
     r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
     # remove from crawler_auto_queue
@@ -417,7 +417,7 @@ def create_spider_splash():
         mode = 'manual'
         epoch = None
 
-    create_crawler_config(mode, service_type, crawler_config, domain)
+    create_crawler_config(mode, service_type, crawler_config, domain, url=url)
     send_url_to_crawl_in_queue(mode, service_type, url)
 
     return redirect(url_for('hiddenServices.manual'))
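For readers who want to exercise the new key layout outside of AIL, here is a minimal redis-py sketch of the automatic crawler config lifecycle documented in OVERVIEW.md above: one JSON blob stored under crawler\_config:auto:**service type**:**domain**:**url**. The helper names, the local connection settings and the example option values are illustrative assumptions, not AIL's API; inside AIL these operations are performed by create_crawler_config(), get_crawler_config() and delete_auto_crawler() shown in the patch.

```python
import json

import redis

# Assumed local instance for this sketch; AIL reads the ARDB/Redis host, port and db
# for the onion service from its own configuration file.
r_serv_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)


def set_auto_crawler_config(service_type, domain, url, crawler_options):
    # One JSON config per (service type, domain, url), as in create_crawler_config(mode='auto').
    key = 'crawler_config:auto:{}:{}:{}'.format(service_type, domain, url)
    r_serv_onion.set(key, json.dumps(crawler_options))


def get_auto_crawler_config(service_type, domain, url):
    # Mirrors get_crawler_config(..., url=url): a missing key falls back to an empty config.
    config = r_serv_onion.get('crawler_config:auto:{}:{}:{}'.format(service_type, domain, url))
    return json.loads(config) if config else {}


def delete_auto_crawler_config(service_type, domain, url):
    # Mirrors delete_auto_crawler(): the url is now part of the key, so it is needed here too.
    r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(service_type, domain, url))


if __name__ == '__main__':
    # Hypothetical onion domain and url, with illustrative option values.
    domain = 'examplexxxxxxxxxxxxxxxx.onion'
    url = 'http://examplexxxxxxxxxxxxxxxx.onion/login'
    set_auto_crawler_config('onion', domain, url, {'depth_limit': 1, 'closespider_pagecount': 50})
    print(get_auto_crawler_config('onion', domain, url))
    delete_auto_crawler_config('onion', domain, url)
```

Appending the url to the key is what lets two automatic crawl entries on the same onion domain keep separate options; it is also why delete_auto_crawler() now has to rebuild the key with the url in order to remove the right config.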