mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
chg: [Onion, crawler config] auto crawler: add config by url, fix onions tagging + filter subdomains
This commit is contained in:
parent
6fdf7c2123
commit
2a1cd4a009
4 changed files with 45 additions and 20 deletions
|
@ -167,11 +167,16 @@ Redis and ARDB overview
|
|||
| ------ | ------ | ------ |
|
||||
| crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
|
||||
|
||||
##### Key:
|
||||
##### crawler config:
|
||||
| Key | Value |
|
||||
| ------ | ------ |
|
||||
| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
|
||||
|
||||
##### automatic crawler config:
|
||||
| Key | Value |
|
||||
| ------ | ------ |
|
||||
| crawler\_config:**crawler mode**:**service type**:**domain**:**url** | **json config** |
|
||||
|
||||
###### example json config:
|
||||
```json
|
||||
{
|
||||
|
|
|
@ -105,9 +105,12 @@ def get_elem_to_crawl(rotation_mode):
|
|||
|
||||
return message
|
||||
|
||||
def get_crawler_config(redis_server, mode, service_type, domain):
|
||||
def get_crawler_config(redis_server, mode, service_type, domain, url=None):
|
||||
crawler_options = {}
|
||||
config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
|
||||
if mode=='auto':
|
||||
config = redis_server.get('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url))
|
||||
else:
|
||||
config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
|
||||
if config is None:
|
||||
config = {}
|
||||
else:
|
||||
|
@ -123,7 +126,7 @@ def get_crawler_config(redis_server, mode, service_type, domain):
|
|||
redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
|
||||
return crawler_options
|
||||
|
||||
def load_crawler_config(service_type, domain, paste, date):
|
||||
def load_crawler_config(service_type, domain, paste, url, date):
|
||||
crawler_config = {}
|
||||
crawler_config['splash_url'] = splash_url
|
||||
crawler_config['item'] = paste
|
||||
|
@ -134,7 +137,7 @@ def load_crawler_config(service_type, domain, paste, date):
|
|||
# Auto and Manual Crawling
|
||||
# Auto ################################################# create new entry, next crawling => here or when ended ?
|
||||
if paste == 'auto':
|
||||
crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain)
|
||||
crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain, url=url)
|
||||
crawler_config['requested'] = True
|
||||
# Manual
|
||||
elif paste == 'manual':
|
||||
|
@ -342,7 +345,7 @@ if __name__ == '__main__':
|
|||
# Update crawler status type
|
||||
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
||||
|
||||
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
|
||||
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
|
||||
# check if default crawler
|
||||
if not crawler_config['requested']:
|
||||
# Auto crawl only if service not up this month
|
||||
|
|
37
bin/Onion.py
37
bin/Onion.py
|
@ -32,6 +32,8 @@ import redis
|
|||
import signal
|
||||
import re
|
||||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
from Helper import Process
|
||||
|
||||
class TimeoutException(Exception):
|
||||
|
@ -132,6 +134,8 @@ if __name__ == "__main__":
|
|||
activate_crawler = False
|
||||
print('Crawler disabled')
|
||||
|
||||
faup = Faup()
|
||||
|
||||
# Thanks to Faup project for this regex
|
||||
# https://github.com/stricaud/faup
|
||||
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
|
@ -218,27 +222,40 @@ if __name__ == "__main__":
|
|||
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||
for url in urls:
|
||||
|
||||
domain = re.findall(url_regex, url)
|
||||
if len(domain) > 0:
|
||||
domain = domain[0][4]
|
||||
faup.decode(url)
|
||||
url_unpack = faup.get()
|
||||
domain = url_unpack['domain'].decode()
|
||||
|
||||
## TODO: blacklist by port ?
|
||||
# check blacklist
|
||||
if redis_crawler.sismember('blacklist_onion', domain):
|
||||
continue
|
||||
|
||||
subdomain = re.findall(url_regex, url)
|
||||
if len(subdomain) > 0:
|
||||
subdomain = subdomain[0][4]
|
||||
else:
|
||||
continue
|
||||
|
||||
# too many subdomain
|
||||
if len(domain.split('.')) > 5:
|
||||
continue
|
||||
if len(subdomain.split('.')) > 3:
|
||||
# Collapse deep hosts to the two labels preceding the onion TLD. Index the dot-separated labels, not the characters of the string (which always produced 'i.o.onion').
subdomain = '{}.{}.onion'.format(*subdomain.split('.')[-3:-1])
|
||||
|
||||
if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||
if not r_onion.sismember('onion_domain_crawler_queue', domain):
|
||||
if not r_onion.sismember('month_onion_up:{}'.format(date_month), subdomain) and not r_onion.sismember('onion_down:'+date , subdomain):
|
||||
if not r_onion.sismember('onion_domain_crawler_queue', subdomain):
|
||||
print('send to onion crawler')
|
||||
r_onion.sadd('onion_domain_crawler_queue', domain)
|
||||
r_onion.sadd('onion_domain_crawler_queue', subdomain)
|
||||
msg = '{};{}'.format(url,PST.p_rel_path)
|
||||
if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
|
||||
if not r_onion.hexists('onion_metadata:{}'.format(subdomain), 'first_seen'):
|
||||
r_onion.sadd('onion_crawler_priority_queue', msg)
|
||||
print('send to priority queue')
|
||||
else:
|
||||
r_onion.sadd('onion_crawler_queue', msg)
|
||||
#p.populate_set_out(msg, 'Crawler')
|
||||
# tag if domain was up
|
||||
if r_onion.sismember('full_onion_up', subdomain):
|
||||
# TAG Item
|
||||
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
|
||||
p.populate_set_out(msg, 'Tags')
|
||||
|
||||
else:
|
||||
for url in fetch(p, r_cache, urls, domains_list, path):
|
||||
|
|
|
@ -194,11 +194,11 @@ def get_crawler_splash_status(type):
|
|||
|
||||
return crawler_metadata
|
||||
|
||||
def create_crawler_config(mode, service_type, crawler_config, domain):
|
||||
def create_crawler_config(mode, service_type, crawler_config, domain, url=None):
|
||||
if mode == 'manual':
|
||||
r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
|
||||
elif mode == 'auto':
|
||||
r_serv_onion.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
|
||||
r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url), json.dumps(crawler_config))
|
||||
|
||||
def send_url_to_crawl_in_queue(mode, service_type, url):
|
||||
r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))
|
||||
|
@ -212,7 +212,7 @@ def delete_auto_crawler(url):
|
|||
# remove from set
|
||||
r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
|
||||
# remove config
|
||||
r_serv_onion.delete('crawler_config:auto:{}:{}'.format(type, domain))
|
||||
r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(type, domain, url))
|
||||
# remove from queue
|
||||
r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
|
||||
# remove from crawler_auto_queue
|
||||
|
@ -417,7 +417,7 @@ def create_spider_splash():
|
|||
mode = 'manual'
|
||||
epoch = None
|
||||
|
||||
create_crawler_config(mode, service_type, crawler_config, domain)
|
||||
create_crawler_config(mode, service_type, crawler_config, domain, url=url)
|
||||
send_url_to_crawl_in_queue(mode, service_type, url)
|
||||
|
||||
return redirect(url_for('hiddenServices.manual'))
|
||||
|
|
Loading…
Reference in a new issue