#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
    Flask functions and routes for the crawler pages:
    blacklisted domains and automatic crawler management
'''
import datetime
import sys
import os
import time

from pyfaup.faup import Faup
from flask import Flask, render_template, jsonify, request, send_file, Blueprint, redirect, url_for

from Role_Manager import login_admin, login_analyst, login_read_only, no_cache
from flask_login import login_required

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import crawlers
# ============ VARIABLES ============
import Flask_config

app = Flask_config.app
baseUrl = Flask_config.baseUrl
r_cache = Flask_config.r_cache
r_serv_onion = Flask_config.r_serv_onion
bootstrap_label = Flask_config.bootstrap_label

hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates')

faup = Faup()

list_types = ['onion', 'regular']
dic_type_name = {'onion': 'Onion', 'regular': 'Website'}

# ============ FUNCTIONS ============
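
# Return True if 'domain' decodes (via Faup) to a bare domain name:
# a TLD must be present and no scheme, port or query string.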
def is_valid_domain(domain):
    faup.decode(domain)
    domain_unpack = faup.get()
    if domain_unpack['tld'] is not None and domain_unpack['scheme'] is None and domain_unpack['port'] is None and domain_unpack['query_string'] is None:
        return True
    else:
        return False
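
# Classify a domain as 'onion' or 'regular' based on its last label.
# A None domain defaults to 'regular'; a dotted domain name is expected.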
def get_type_domain(domain):
    if domain is None:
        type = 'regular'
    else:
        if domain.rsplit('.', 1)[1] == 'onion':
            type = 'onion'
        else:
            type = 'regular'
    return type
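
# Extract the domain part of a URL with Faup; the try/except below
# handles Faup builds that return the domain as bytes.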
def get_domain_from_url(url):
    faup.decode(url)
    unpack_url = faup.get()
    domain = unpack_url['domain']
    ## TODO: FIXME remove me
    try:
        domain = domain.decode()
    except:
        pass
    return domain
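
# Return the 'last_{type}' Redis list of recently crawled entries
# (each entry is formatted as 'domain;epoch').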
def get_last_domains_crawled(type): # DONE
    return r_serv_onion.lrange('last_{}'.format(type), 0, -1)
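
# Build the metadata displayed for each crawled entry: domain, port,
# epoch, first_seen/last_check dates and UP/DOWN status. In auto_mode,
# entries are URLs from the auto-crawler sets and the epoch is read
# from the 'crawler_auto_queue' sorted set instead.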
def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None, auto_mode=False):
    list_crawled_metadata = []
    for domain_epoch in list_domains_crawled:
        if not auto_mode:
            domain, epoch = domain_epoch.rsplit(';', 1)
        else:
            url = domain_epoch
            domain = domain_epoch
        domain = domain.split(':')
        if len(domain) == 1:
            port = 80
            domain = domain[0]
        else:
            port = domain[1]
            domain = domain[0]
        metadata_domain = {}
        # get Domain type
        if type is None:
            type_domain = get_type_domain(domain)
        else:
            type_domain = type
        if auto_mode:
            metadata_domain['url'] = url
            epoch = r_serv_onion.zscore('crawler_auto_queue', '{};auto;{}'.format(domain, type_domain))
            # domain in priority queue
            if epoch is None:
                epoch = 'In Queue'
            else:
                epoch = datetime.datetime.fromtimestamp(float(epoch)).strftime('%Y-%m-%d %H:%M:%S')

        metadata_domain['domain'] = domain
        if len(domain) > 45:
            domain_name, tld_domain = domain.rsplit('.', 1)
            metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain)
        else:
            metadata_domain['domain_name'] = domain
        metadata_domain['port'] = port
        metadata_domain['epoch'] = epoch
        metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'last_check')
        if metadata_domain['last_check'] is None:
            metadata_domain['last_check'] = '********'
        metadata_domain['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'first_seen')
        if metadata_domain['first_seen'] is None:
            metadata_domain['first_seen'] = '********'
        if r_serv_onion.sismember('{}_up:{}'.format(type_domain, metadata_domain['last_check']), domain):
            metadata_domain['status_text'] = 'UP'
            metadata_domain['status_color'] = 'Green'
            metadata_domain['status_icon'] = 'fa-check-circle'
        else:
            metadata_domain['status_text'] = 'DOWN'
            metadata_domain['status_color'] = 'Red'
            metadata_domain['status_icon'] = 'fa-times-circle'
        list_crawled_metadata.append(metadata_domain)
    return list_crawled_metadata
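
# Remove an auto-crawled URL everywhere it is referenced: the per-type
# URL set, its saved crawler config, the priority queue and the
# 'crawler_auto_queue' scheduling sorted set.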
def delete_auto_crawler(url):
    domain = get_domain_from_url(url)
    type = get_type_domain(domain)
    # remove from set
    r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
    # remove config
    r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(type, domain, url))
    # remove from queue
    r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
    # remove from crawler_auto_queue
    r_serv_onion.zrem('crawler_auto_queue', '{};auto;{}'.format(url, type))
# ============= ROUTES ==============
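
# Paginated view (1000 entries per page) of the 'blacklist_{type}' set.
# The blacklist_domain / unblacklist_domain query parameters carry the
# result code of a previous (un)blacklist action for display.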
@hiddenServices.route("/crawlers/blacklisted_domains", methods=['GET'])
|
2019-05-02 15:31:14 +00:00
|
|
|
@login_required
|
2019-11-20 15:15:08 +00:00
|
|
|
@login_read_only
|
2019-02-28 13:56:14 +00:00
|
|
|
def blacklisted_domains():
|
|
|
|
blacklist_domain = request.args.get('blacklist_domain')
|
|
|
|
unblacklist_domain = request.args.get('unblacklist_domain')
|
|
|
|
type = request.args.get('type')
|
|
|
|
if type in list_types:
|
|
|
|
type_name = dic_type_name[type]
|
|
|
|
if blacklist_domain is not None:
|
|
|
|
blacklist_domain = int(blacklist_domain)
|
|
|
|
if unblacklist_domain is not None:
|
|
|
|
unblacklist_domain = int(unblacklist_domain)
|
|
|
|
try:
|
|
|
|
page = int(request.args.get('page'))
|
|
|
|
except:
|
|
|
|
page = 1
|
|
|
|
if page <= 0:
|
|
|
|
page = 1
|
|
|
|
nb_page_max = r_serv_onion.scard('blacklist_{}'.format(type))/(1000)
|
|
|
|
if isinstance(nb_page_max, float):
|
|
|
|
nb_page_max = int(nb_page_max)+1
|
|
|
|
if page > nb_page_max:
|
|
|
|
page = nb_page_max
|
|
|
|
start = 1000*(page -1)
|
|
|
|
stop = 1000*page
|
|
|
|
|
|
|
|
list_blacklisted = list(r_serv_onion.smembers('blacklist_{}'.format(type)))
|
|
|
|
list_blacklisted_1 = list_blacklisted[start:stop]
|
|
|
|
list_blacklisted_2 = list_blacklisted[stop:stop+1000]
|
|
|
|
return render_template("blacklisted_domains.html", list_blacklisted_1=list_blacklisted_1, list_blacklisted_2=list_blacklisted_2,
|
|
|
|
type=type, type_name=type_name, page=page, nb_page_max=nb_page_max,
|
|
|
|
blacklist_domain=blacklist_domain, unblacklist_domain=unblacklist_domain)
|
|
|
|
else:
|
|
|
|
return 'Incorrect Type'
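
# Add a domain to the 'blacklist_{type}' set, then redirect back to the
# blacklist page: blacklist_domain=1 on success, 2 if already present,
# 0 if the domain failed validation.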
@hiddenServices.route("/crawler/blacklist_domain", methods=['GET'])
|
2019-05-02 15:31:14 +00:00
|
|
|
@login_required
|
2019-06-19 15:02:09 +00:00
|
|
|
@login_analyst
|
2019-02-28 13:56:14 +00:00
|
|
|
def blacklist_domain():
|
|
|
|
domain = request.args.get('domain')
|
|
|
|
type = request.args.get('type')
|
2019-02-19 10:41:45 +00:00
|
|
|
try:
|
|
|
|
page = int(request.args.get('page'))
|
|
|
|
except:
|
|
|
|
page = 1
|
2019-02-28 13:56:14 +00:00
|
|
|
if type in list_types:
|
|
|
|
if is_valid_domain(domain):
|
|
|
|
res = r_serv_onion.sadd('blacklist_{}'.format(type), domain)
|
|
|
|
if page:
|
|
|
|
if res == 0:
|
|
|
|
return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, blacklist_domain=2))
|
|
|
|
else:
|
|
|
|
return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, blacklist_domain=1))
|
|
|
|
else:
|
|
|
|
return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, blacklist_domain=0))
|
2019-02-19 10:41:45 +00:00
|
|
|
else:
|
2019-02-28 13:56:14 +00:00
|
|
|
return 'Incorrect type'
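
# Remove a domain from the 'blacklist_{type}' set, then redirect back:
# unblacklist_domain=1 on success, 2 if it was not blacklisted,
# 0 if the domain failed validation.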
@hiddenServices.route("/crawler/unblacklist_domain", methods=['GET'])
|
2019-05-02 15:31:14 +00:00
|
|
|
@login_required
|
2019-06-19 15:02:09 +00:00
|
|
|
@login_analyst
|
2019-02-28 13:56:14 +00:00
|
|
|
def unblacklist_domain():
|
|
|
|
domain = request.args.get('domain')
|
|
|
|
type = request.args.get('type')
|
2019-02-19 10:41:45 +00:00
|
|
|
try:
|
|
|
|
page = int(request.args.get('page'))
|
|
|
|
except:
|
|
|
|
page = 1
|
2019-02-28 13:56:14 +00:00
|
|
|
if type in list_types:
|
|
|
|
if is_valid_domain(domain):
|
|
|
|
res = r_serv_onion.srem('blacklist_{}'.format(type), domain)
|
|
|
|
if page:
|
|
|
|
if res == 0:
|
|
|
|
return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, unblacklist_domain=2))
|
|
|
|
else:
|
|
|
|
return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, unblacklist_domain=1))
|
|
|
|
else:
|
|
|
|
return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, unblacklist_domain=0))
|
2019-02-19 10:41:45 +00:00
|
|
|
else:
|
2019-02-28 13:56:14 +00:00
|
|
|
return 'Incorrect type'
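
# Overview of the automatic crawler (100 entries per page): the last
# auto-crawled domains plus the queued onion and regular URLs with
# their metadata and the Lacus crawler connection status.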
@hiddenServices.route("/crawlers/auto_crawler", methods=['GET'])
|
2019-05-02 15:31:14 +00:00
|
|
|
@login_required
|
2019-11-20 15:15:08 +00:00
|
|
|
@login_read_only
|
2019-04-18 14:57:51 +00:00
|
|
|
def auto_crawler():
|
|
|
|
nb_element_to_display = 100
|
|
|
|
try:
|
|
|
|
page = int(request.args.get('page'))
|
|
|
|
except:
|
|
|
|
page = 1
|
|
|
|
if page <= 0:
|
|
|
|
page = 1
|
|
|
|
|
|
|
|
nb_auto_onion = r_serv_onion.scard('auto_crawler_url:onion')
|
|
|
|
nb_auto_regular = r_serv_onion.scard('auto_crawler_url:regular')
|
|
|
|
|
|
|
|
if nb_auto_onion > nb_auto_regular:
|
|
|
|
nb_max = nb_auto_onion
|
|
|
|
else:
|
|
|
|
nb_max = nb_auto_regular
|
|
|
|
|
|
|
|
nb_page_max = nb_max/(nb_element_to_display)
|
|
|
|
if isinstance(nb_page_max, float):
|
|
|
|
nb_page_max = int(nb_page_max)+1
|
|
|
|
if page > nb_page_max:
|
|
|
|
page = nb_page_max
|
|
|
|
start = nb_element_to_display*(page -1)
|
|
|
|
stop = nb_element_to_display*page
|
|
|
|
|
|
|
|
last_auto_crawled = get_last_domains_crawled('auto_crawled')
|
|
|
|
last_domains = get_last_crawled_domains_metadata(last_auto_crawled, '')
|
|
|
|
|
|
|
|
if start > nb_auto_onion:
|
|
|
|
auto_crawler_domain_onions = []
|
|
|
|
elif stop > nb_auto_onion:
|
|
|
|
auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:nb_auto_onion]
|
|
|
|
else:
|
|
|
|
auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:stop]
|
|
|
|
|
|
|
|
if start > nb_auto_regular:
|
|
|
|
auto_crawler_domain_regular = []
|
|
|
|
elif stop > nb_auto_regular:
|
|
|
|
auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:nb_auto_regular]
|
|
|
|
else:
|
|
|
|
auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:stop]
|
|
|
|
|
|
|
|
auto_crawler_domain_onions_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_onions, '', type='onion', auto_mode=True)
|
|
|
|
auto_crawler_domain_regular_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_regular, '', type='regular', auto_mode=True)
|
|
|
|
|
|
|
|
return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max,
|
|
|
|
last_domains=last_domains,
|
2022-10-25 14:25:19 +00:00
|
|
|
is_manager_connected=crawlers.get_lacus_connection_metadata(),
|
2019-04-18 14:57:51 +00:00
|
|
|
auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata,
|
|
|
|
auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata)
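
# Delete the auto-crawler entry for the given URL, then redirect back
# to the auto crawler overview.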
@hiddenServices.route("/crawlers/remove_auto_crawler", methods=['GET'])
|
2019-05-02 15:31:14 +00:00
|
|
|
@login_required
|
2019-06-19 15:02:09 +00:00
|
|
|
@login_analyst
|
2019-04-18 14:57:51 +00:00
|
|
|
def remove_auto_crawler():
|
|
|
|
url = request.args.get('url')
|
|
|
|
page = request.args.get('page')
|
|
|
|
|
|
|
|
if url:
|
|
|
|
delete_auto_crawler(url)
|
|
|
|
return redirect(url_for('hiddenServices.auto_crawler', page=page))
# ========= REGISTRATION =========
app.register_blueprint(hiddenServices, url_prefix=baseUrl)