Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-30 01:37:17 +00:00)
chg: [Onion, crawler config] auto crawler: add config by url, fix onions tagging + filter subdomains
Commit: 2a1cd4a009
Parent: 6fdf7c2123
4 changed files with 45 additions and 20 deletions
@@ -167,11 +167,16 @@ Redis and ARDB overview
 | ------ | ------ | ------ |
 | crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
 
-##### Key:
+##### crawler config:
 | Key | Value |
 | ------ | ------ |
 | crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
 
+##### automatic crawler config:
+| Key | Value |
+| ------ | ------ |
+| crawler\_config:**crawler mode**:**service type**:**domain**:**url** | **json config** |
+
 ###### exemple json config:
 ```json
 {
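As a side note, the sketch below (not part of the commit) shows how an automatic crawler config entry keyed by URL could be written and read back with redis-py, following the key layout in the table above. The host, port, database number, domain, URL and option names are placeholder assumptions, not values taken from AIL.

```python
import json

import redis

# Assumed connection settings; AIL normally reads these from its configuration file.
r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

# Placeholder values for one automatic crawler entry.
service_type = 'onion'
domain = 'examplexxxxxxxxxxxx.onion'
url = 'http://examplexxxxxxxxxxxx.onion/login'
crawler_options = {'depth_limit': 1, 'closespider_pagecount': 50}   # example options only

# Key layout from the table above: crawler_config:<crawler mode>:<service type>:<domain>:<url>
key = 'crawler_config:auto:{}:{}:{}'.format(service_type, domain, url)
r_serv_onion.set(key, json.dumps(crawler_options))

# Reading it back, as an auto-mode lookup would:
raw = r_serv_onion.get(key)
config = json.loads(raw) if raw is not None else {}
print(config)
```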
@@ -105,9 +105,12 @@ def get_elem_to_crawl(rotation_mode):
 
     return message
 
-def get_crawler_config(redis_server, mode, service_type, domain):
+def get_crawler_config(redis_server, mode, service_type, domain, url=None):
     crawler_options = {}
-    config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
+    if mode=='auto':
+        config = redis_server.get('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url))
+    else:
+        config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
     if config is None:
         config = {}
     else:
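The hunk above makes get_crawler_config() consult a URL-scoped key in 'auto' mode while keeping the domain-scoped key for every other mode. A minimal sketch of that key selection, with made-up values:

```python
def crawler_config_key(mode, service_type, domain, url=None):
    # Same branch as the modified get_crawler_config(): auto mode appends the URL.
    if mode == 'auto':
        return 'crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url)
    return 'crawler_config:{}:{}:{}'.format(mode, service_type, domain)

# Made-up values.
print(crawler_config_key('manual', 'onion', 'examplexxxxxxxxxxxx.onion'))
print(crawler_config_key('auto', 'onion', 'examplexxxxxxxxxxxx.onion',
                         url='http://examplexxxxxxxxxxxx.onion/login'))
```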
@@ -123,7 +126,7 @@ def get_crawler_config(redis_server, mode, service_type, domain):
         redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
     return crawler_options
 
-def load_crawler_config(service_type, domain, paste, date):
+def load_crawler_config(service_type, domain, paste, url, date):
     crawler_config = {}
     crawler_config['splash_url'] = splash_url
     crawler_config['item'] = paste
@@ -134,7 +137,7 @@ def load_crawler_config(service_type, domain, paste, date):
     # Auto and Manual Crawling
     # Auto ################################################# create new entry, next crawling => here or when ended ?
     if paste == 'auto':
-        crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain)
+        crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain, url=url)
         crawler_config['requested'] = True
     # Manual
     elif paste == 'manual':
@@ -342,7 +345,7 @@ if __name__ == '__main__':
             # Update crawler status type
             r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
 
-            crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
+            crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
             # check if default crawler
             if not crawler_config['requested']:
                 # Auto crawl only if service not up this month
bin/Onion.py (37 changes)
@@ -32,6 +32,8 @@ import redis
 import signal
 import re
 
+from pyfaup.faup import Faup
+
 from Helper import Process
 
 class TimeoutException(Exception):
@@ -132,6 +134,8 @@ if __name__ == "__main__":
         activate_crawler = False
         print('Crawler disabled')
 
+    faup = Faup()
+
     # Thanks to Faup project for this regex
     # https://github.com/stricaud/faup
     url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -218,27 +222,40 @@ if __name__ == "__main__":
                     date = datetime.datetime.now().strftime("%Y%m%d")
                     for url in urls:
 
-                        domain = re.findall(url_regex, url)
-                        if len(domain) > 0:
-                            domain = domain[0][4]
+                        faup.decode(url)
+                        url_unpack = faup.get()
+                        domain = url_unpack['domain'].decode()
+
+                        ## TODO: blackilst by port ?
+                        # check blacklist
+                        if redis_crawler.sismember('blacklist_onion', domain):
+                            continue
+
+                        subdomain = re.findall(url_regex, url)
+                        if len(subdomain) > 0:
+                            subdomain = subdomain[0][4]
                         else:
                             continue
 
                         # too many subdomain
-                        if len(domain.split('.')) > 5:
-                            continue
+                        if len(subdomain.split('.')) > 3:
+                            subdomain = '{}.{}.onion'.format(subdomain[-3], subdomain[-2])
 
-                        if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
-                            if not r_onion.sismember('onion_domain_crawler_queue', domain):
+                        if not r_onion.sismember('month_onion_up:{}'.format(date_month), subdomain) and not r_onion.sismember('onion_down:'+date , subdomain):
+                            if not r_onion.sismember('onion_domain_crawler_queue', subdomain):
                                 print('send to onion crawler')
-                                r_onion.sadd('onion_domain_crawler_queue', domain)
+                                r_onion.sadd('onion_domain_crawler_queue', subdomain)
                                 msg = '{};{}'.format(url,PST.p_rel_path)
-                                if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
+                                if not r_onion.hexists('onion_metadata:{}'.format(subdomain), 'first_seen'):
                                     r_onion.sadd('onion_crawler_priority_queue', msg)
                                     print('send to priority queue')
                                 else:
                                     r_onion.sadd('onion_crawler_queue', msg)
-                                #p.populate_set_out(msg, 'Crawler')
+                                # tag if domain was up
+                                if r_onion.sismember('full_onion_up', subdomain):
+                                    # TAG Item
+                                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
+                                    p.populate_set_out(msg, 'Tags')
 
                 else:
                     for url in fetch(p, r_cache, urls, domains_list, path):
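The hunk above replaces the regex-only domain extraction with pyfaup and filters hosts that carry too many subdomain labels. Below is a standalone sketch of that idea, assuming pyfaup is installed and using made-up onion URLs; note that the module itself derives the host part from its URL regex (match group 4) and collapses over-long hosts instead of skipping them.

```python
from pyfaup.faup import Faup

faup = Faup()

# Made-up onion URLs; in the module they come from the regex matches on the item.
urls = [
    'http://examplexxxxxxxxxxxx.onion/index.html',
    'http://a.b.c.examplexxxxxxxxxxxx.onion/page',
]

for url in urls:
    faup.decode(url)
    url_unpack = faup.get()

    # Depending on the pyfaup version, fields may come back as bytes.
    domain = url_unpack['domain']
    if isinstance(domain, bytes):
        domain = domain.decode()
    host = url_unpack['host']
    if isinstance(host, bytes):
        host = host.decode()

    # domain would feed the blacklist_onion check; the label count mirrors the
    # "too many subdomain" filter in the hunk above.
    if len(host.split('.')) > 3:
        print('too many subdomain labels:', host)
        continue
    print('domain:', domain, '| host:', host)
```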
@@ -194,11 +194,11 @@ def get_crawler_splash_status(type):
 
     return crawler_metadata
 
-def create_crawler_config(mode, service_type, crawler_config, domain):
+def create_crawler_config(mode, service_type, crawler_config, domain, url=None):
     if mode == 'manual':
         r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
     elif mode == 'auto':
-        r_serv_onion.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
+        r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url), json.dumps(crawler_config))
 
 def send_url_to_crawl_in_queue(mode, service_type, url):
     r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))
@@ -212,7 +212,7 @@ def delete_auto_crawler(url):
     # remove from set
     r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
     # remove config
-    r_serv_onion.delete('crawler_config:auto:{}:{}'.format(type, domain))
+    r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(type, domain, url))
     # remove from queue
     r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
    # remove from crawler_auto_queue
@@ -417,7 +417,7 @@ def create_spider_splash():
         mode = 'manual'
         epoch = None
 
-    create_crawler_config(mode, service_type, crawler_config, domain)
+    create_crawler_config(mode, service_type, crawler_config, domain, url=url)
     send_url_to_crawl_in_queue(mode, service_type, url)
 
     return redirect(url_for('hiddenServices.manual'))
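The last three hunks, in the hiddenServices web module, store and later delete the auto-crawler config under the same URL-scoped key. A hypothetical end-to-end sketch of that lifecycle follows; the connection settings and values are assumptions, not AIL defaults.

```python
import json

import redis

# Assumed onion DB connection; AIL reads this from its configuration file.
r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

service_type = 'onion'
domain = 'examplexxxxxxxxxxxx.onion'            # placeholder
url = 'http://examplexxxxxxxxxxxx.onion/'       # placeholder
crawler_config = {'depth_limit': 1}             # example option only

# create_crawler_config(..., url=url) in auto mode stores a URL-scoped key:
r_serv_onion.set('crawler_config:auto:{}:{}:{}'.format(service_type, domain, url),
                 json.dumps(crawler_config))
# send_url_to_crawl_in_queue() queues the URL together with its mode:
r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};auto'.format(url))

# delete_auto_crawler(url) later removes the same URL-scoped key and queue entry:
r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(service_type, domain, url))
r_serv_onion.srem('{}_crawler_priority_queue'.format(service_type), '{};auto'.format(url))
```

Keying the config by URL rather than by domain alone lets several automatic crawls of the same domain keep independent settings, which appears to be what the commit title means by "add config by url".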