chg: [Onion, crawler config] auto crawler: add config by url, fix onions tagging + filter subdomains

This commit is contained in:
Terrtia 2019-04-23 11:15:34 +02:00
parent 6fdf7c2123
commit 2a1cd4a009
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 45 additions and 20 deletions

View file

@ -167,11 +167,16 @@ Redis and ARDB overview
| ------ | ------ | ------ |
| crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
##### Key:
##### crawler config:
| Key | Value |
| ------ | ------ |
| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
##### automatic crawler config:
| Key | Value |
| ------ | ------ |
| crawler\_config:**crawler mode**:**service type**:**domain**:**url** | **json config** |
###### example json config:
```json
{

View file

@ -105,8 +105,11 @@ def get_elem_to_crawl(rotation_mode):
return message
def get_crawler_config(redis_server, mode, service_type, domain):
def get_crawler_config(redis_server, mode, service_type, domain, url=None):
crawler_options = {}
if mode=='auto':
config = redis_server.get('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url))
else:
config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
if config is None:
config = {}
@ -123,7 +126,7 @@ def get_crawler_config(redis_server, mode, service_type, domain):
redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
return crawler_options
def load_crawler_config(service_type, domain, paste, date):
def load_crawler_config(service_type, domain, paste, url, date):
crawler_config = {}
crawler_config['splash_url'] = splash_url
crawler_config['item'] = paste
@ -134,7 +137,7 @@ def load_crawler_config(service_type, domain, paste, date):
# Auto and Manual Crawling
# Auto ################################################# create new entry, next crawling => here or when ended ?
if paste == 'auto':
crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain)
crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain, url=url)
crawler_config['requested'] = True
# Manual
elif paste == 'manual':
@ -342,7 +345,7 @@ if __name__ == '__main__':
# Update crawler status type
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
# check if default crawler
if not crawler_config['requested']:
# Auto crawl only if service not up this month

View file

@ -32,6 +32,8 @@ import redis
import signal
import re
from pyfaup.faup import Faup
from Helper import Process
class TimeoutException(Exception):
@ -132,6 +134,8 @@ if __name__ == "__main__":
activate_crawler = False
print('Crawler disabled')
faup = Faup()
# Thanks to Faup project for this regex
# https://github.com/stricaud/faup
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@ -218,27 +222,40 @@ if __name__ == "__main__":
date = datetime.datetime.now().strftime("%Y%m%d")
for url in urls:
domain = re.findall(url_regex, url)
if len(domain) > 0:
domain = domain[0][4]
faup.decode(url)
url_unpack = faup.get()
domain = url_unpack['domain'].decode()
## TODO: blacklist by port ?
# check blacklist
if redis_crawler.sismember('blacklist_onion', domain):
continue
subdomain = re.findall(url_regex, url)
if len(subdomain) > 0:
subdomain = subdomain[0][4]
else:
continue
# too many subdomain
if len(domain.split('.')) > 5:
continue
if len(subdomain.split('.')) > 3:
subdomain = '{}.{}.onion'.format(subdomain[-3], subdomain[-2])
if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
if not r_onion.sismember('onion_domain_crawler_queue', domain):
if not r_onion.sismember('month_onion_up:{}'.format(date_month), subdomain) and not r_onion.sismember('onion_down:'+date , subdomain):
if not r_onion.sismember('onion_domain_crawler_queue', subdomain):
print('send to onion crawler')
r_onion.sadd('onion_domain_crawler_queue', domain)
r_onion.sadd('onion_domain_crawler_queue', subdomain)
msg = '{};{}'.format(url,PST.p_rel_path)
if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
if not r_onion.hexists('onion_metadata:{}'.format(subdomain), 'first_seen'):
r_onion.sadd('onion_crawler_priority_queue', msg)
print('send to priority queue')
else:
r_onion.sadd('onion_crawler_queue', msg)
#p.populate_set_out(msg, 'Crawler')
# tag if domain was up
if r_onion.sismember('full_onion_up', subdomain):
# TAG Item
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
p.populate_set_out(msg, 'Tags')
else:
for url in fetch(p, r_cache, urls, domains_list, path):

View file

@ -194,11 +194,11 @@ def get_crawler_splash_status(type):
return crawler_metadata
def create_crawler_config(mode, service_type, crawler_config, domain):
def create_crawler_config(mode, service_type, crawler_config, domain, url=None):
if mode == 'manual':
r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
elif mode == 'auto':
r_serv_onion.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url), json.dumps(crawler_config))
def send_url_to_crawl_in_queue(mode, service_type, url):
r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))
@ -212,7 +212,7 @@ def delete_auto_crawler(url):
# remove from set
r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
# remove config
r_serv_onion.delete('crawler_config:auto:{}:{}'.format(type, domain))
r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(type, domain, url))
# remove from queue
r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
# remove from crawler_auto_queue
@ -417,7 +417,7 @@ def create_spider_splash():
mode = 'manual'
epoch = None
create_crawler_config(mode, service_type, crawler_config, domain)
create_crawler_config(mode, service_type, crawler_config, domain, url=url)
send_url_to_crawl_in_queue(mode, service_type, url)
return redirect(url_for('hiddenServices.manual'))