chg: [UI crawler] status/remove auto crawler

This commit is contained in:
Terrtia 2019-04-18 16:57:51 +02:00
parent ecca5b11d0
commit 6fdf7c2123
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
7 changed files with 362 additions and 57 deletions

View file

@ -397,7 +397,9 @@ if __name__ == '__main__':
# add next auto Crawling in queue:
if to_crawl['paste'] == 'auto':
redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
# update list, last auto crawled domains
redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
redis_crawler.ltrim('last_auto_crawled', 0, 9)
else:
print(' Blacklisted Domain')
print()

View file

@ -105,6 +105,12 @@ def get_type_domain(domain):
type = 'regular'
return type
def get_domain_from_url(url):
faup.decode(url)
unpack_url = faup.get()
domain = unpack_url['domain'].decode()
return domain
def get_last_domains_crawled(type):
return r_serv_onion.lrange('last_{}'.format(type), 0 ,-1)
@ -117,10 +123,14 @@ def get_stats_last_crawled_domains(type, date):
statDomains['domains_queue'] += r_serv_onion.scard('{}_crawler_priority_queue'.format(type))
return statDomains
def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None, auto_mode=False):
list_crawled_metadata = []
for domain_epoch in list_domains_crawled:
domain, epoch = domain_epoch.rsplit(';', 1)
if not auto_mode:
domain, epoch = domain_epoch.rsplit(';', 1)
else:
url = domain_epoch
domain = domain_epoch
domain = domain.split(':')
if len(domain) == 1:
port = 80
@ -131,7 +141,17 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
metadata_domain = {}
# get Domain type
if type is None:
type = get_domain_type(domain)
type_domain = get_type_domain(domain)
else:
type_domain = type
if auto_mode:
metadata_domain['url'] = url
epoch = r_serv_onion.zscore('crawler_auto_queue', '{};auto;{}'.format(domain, type_domain))
#domain in priority queue
if epoch is None:
epoch = 'In Queue'
else:
epoch = datetime.datetime.fromtimestamp(float(epoch)).strftime('%Y-%m-%d %H:%M:%S')
metadata_domain['domain'] = domain
if len(domain) > 45:
@ -141,13 +161,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
metadata_domain['domain_name'] = domain
metadata_domain['port'] = port
metadata_domain['epoch'] = epoch
metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'last_check')
if metadata_domain['last_check'] is None:
metadata_domain['last_check'] = '********'
metadata_domain['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'first_seen')
metadata_domain['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'first_seen')
if metadata_domain['first_seen'] is None:
metadata_domain['first_seen'] = '********'
if r_serv_onion.sismember('{}_up:{}'.format(type, metadata_domain['last_check']) , domain):
if r_serv_onion.sismember('{}_up:{}'.format(type_domain, metadata_domain['last_check']) , domain):
metadata_domain['status_text'] = 'UP'
metadata_domain['status_color'] = 'Green'
metadata_domain['status_icon'] = 'fa-check-circle'
@ -186,6 +206,18 @@ def send_url_to_crawl_in_queue(mode, service_type, url):
if mode == 'auto':
r_serv_onion.sadd('auto_crawler_url:{}'.format(service_type), url)
def delete_auto_crawler(url):
domain = get_domain_from_url(url)
type = get_type_domain(domain)
# remove from set
r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
# remove config
r_serv_onion.delete('crawler_config:auto:{}:{}'.format(type, domain))
# remove from queue
r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
# remove from crawler_auto_queue
r_serv_onion.zrem('crawler_auto_queue'.format(type), '{};auto;{}'.format(url, type))
# ============= ROUTES ==============
@hiddenServices.route("/crawlers/", methods=['GET'])
@ -390,6 +422,66 @@ def create_spider_splash():
return redirect(url_for('hiddenServices.manual'))
@hiddenServices.route("/crawlers/auto_crawler", methods=['GET'])
def auto_crawler():
nb_element_to_display = 100
try:
page = int(request.args.get('page'))
except:
page = 1
if page <= 0:
page = 1
nb_auto_onion = r_serv_onion.scard('auto_crawler_url:onion')
nb_auto_regular = r_serv_onion.scard('auto_crawler_url:regular')
if nb_auto_onion > nb_auto_regular:
nb_max = nb_auto_onion
else:
nb_max = nb_auto_regular
nb_page_max = nb_max/(nb_element_to_display)
if isinstance(nb_page_max, float):
nb_page_max = int(nb_page_max)+1
if page > nb_page_max:
page = nb_page_max
start = nb_element_to_display*(page -1)
stop = nb_element_to_display*page
last_auto_crawled = get_last_domains_crawled('auto_crawled')
last_domains = get_last_crawled_domains_metadata(last_auto_crawled, '')
if start > nb_auto_onion:
auto_crawler_domain_onions = []
elif stop > nb_auto_onion:
auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:nb_auto_onion]
else:
auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:stop]
if start > nb_auto_regular:
auto_crawler_domain_regular = []
elif stop > nb_auto_regular:
auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:nb_auto_regular]
else:
auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:stop]
auto_crawler_domain_onions_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_onions, '', type='onion', auto_mode=True)
auto_crawler_domain_regular_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_regular, '', type='regular', auto_mode=True)
return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max,
last_domains=last_domains,
auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata,
auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata)
@hiddenServices.route("/crawlers/remove_auto_crawler", methods=['GET'])
def remove_auto_crawler():
url = request.args.get('url')
page = request.args.get('page')
if url:
delete_auto_crawler(url)
return redirect(url_for('hiddenServices.auto_crawler', page=page))
# # TODO: refractor
@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
def last_crawled_domains_with_stats_json():

View file

@ -0,0 +1,217 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/dataTables.bootstrap4.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/dataTables.bootstrap.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
{%if last_domains%}
<div class="table-responsive mt-1 mb-3 table-hover table-borderless table-striped">
<table class="table">
<thead class="thead-dark">
<tr>
<th>Domain</th>
<th>First Seen</th>
<th>Last Check</th>
<th>Status</th>
</tr>
</thead>
<tbody id="tbody_last_crawled">
{% for metadata_domain in last_domains %}
<tr>
<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
<td>{{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}}</td>
<td>{{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}}</td>
<td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">
<i class="fas {{metadata_domain['status_icon']}} "></i>
{{metadata_domain['status_text']}}
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{%endif%}
<div class="row">
<div class="col-lg-6">
<div class="table-responsive mt-1 table-hover table-borderless table-striped">
<table class="table" id="myTable_1">
<thead class="thead-dark">
<tr>
<th>Onion Url</th>
<th></th>
<th>Next Check</th>
<th></th>
<th></th>
</tr>
</thead>
<tbody id="tbody_last_crawled">
{% for metadata_domain in auto_crawler_domain_onions_metadata %}
<tr>
<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['url'] }}</a></td>
<td><a class="btn btn-outline-danger px-1 py-0" href="{{ url_for('hiddenServices.remove_auto_crawler') }}?url={{ metadata_domain['url'] }}&page={{page}}">
<i class="fas fa-trash-alt"></i></a>
</td>
<td>{{metadata_domain['epoch']}}</td>
<td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">
<i class="fas {{metadata_domain['status_icon']}} "></i>
{{metadata_domain['status_text']}}
</div>
</td>
<td>
<button class="btn btn-outline-secondary px-1 py-0 disabled"><i class="fas fa-pencil-alt"></i></button>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
<div class="col-lg-6">
<div class="table-responsive mt-1 table-hover table-borderless table-striped">
<table class="table" id="myTable_2">
<thead class="thead-dark">
<tr>
<th>Regular Url</th>
<th></th>
<th>Next Check</th>
<th></th>
<th></th>
</tr>
</thead>
<tbody id="tbody_last_crawled">
{% for metadata_domain in auto_crawler_domain_regular_metadata %}
<tr>
<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['url'] }}</a></td>
<td><a class="btn btn-outline-danger px-1 py-0" href="{{ url_for('hiddenServices.remove_auto_crawler') }}?url={{ metadata_domain['url'] }}&page={{page}}">
<i class="fas fa-trash-alt"></i></a>
</td>
<td>{{metadata_domain['epoch']}}</td>
<td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">
<i class="fas {{metadata_domain['status_icon']}} "></i>
{{metadata_domain['status_text']}}
</div>
</td>
<td>
<button class="btn btn-outline-secondary px-1 py-0 disabled"><i class="fas fa-pencil-alt"></i></button>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
<hr class="mt-4">
<div class="d-flex justify-content-center">
<nav aria-label="...">
<ul class="pagination">
<li class="page-item {%if page==1%}disabled{%endif%}">
<a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page-1}}">Previous</a>
</li>
{%if page>3%}
<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page=1">1</a></li>
<li class="page-item disabled"><a class="page-link" aria-disabled="true" href="#">...</a></li>
<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page-1}}">{{page-1}}</a></li>
<li class="page-item active"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page}}">{{page}}</a></li>
{%else%}
{%if page>2%}<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page-2}}">{{page-2}}</a></li>{%endif%}
{%if page>1%}<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page-1}}">{{page-1}}</a></li>{%endif%}
<li class="page-item active"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page}}">{{page}}</a></li>
{%endif%}
{%if nb_page_max-page>3%}
<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page+1}}">{{page+1}}</a></li>
<li class="page-item disabled"><a class="page-link" aria-disabled="true" href="#">...</a></li>
<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{nb_page_max}}">{{nb_page_max}}</a></li>
{%else%}
{%if nb_page_max-page>2%}<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{nb_page_max-2}}">{{nb_page_max-2}}</a></li>{%endif%}
{%if nb_page_max-page>1%}<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{nb_page_max-1}}">{{nb_page_max-1}}</a></li>{%endif%}
{%if nb_page_max-page>0%}<li class="page-item"><a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{nb_page_max}}">{{nb_page_max}}</a></li>{%endif%}
{%endif%}
<li class="page-item {%if page==nb_page_max%}disabled{%endif%}">
<a class="page-link" href="{{ url_for('hiddenServices.auto_crawler') }}?page={{page+1}}" aria-disabled="true">Next</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
</body>
<script>
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_auto_crawler").addClass("active");
table1 = $('#myTable_1').DataTable(
{
//"aLengthMenu": [[5, 10, 15, 20, -1], [5, 10, 15, 20, "All"]],
//"iDisplayLength": 5,
//"order": [[ 0, "desc" ]]
columnDefs: [
{ orderable: false, targets: [-1, -4] }
]
});
table2 = $('#myTable_2').DataTable(
{
//"aLengthMenu": [[5, 10, 15, 20, -1], [5, 10, 15, 20, "All"]],
//"iDisplayLength": 5,
//"order": [[ 0, "desc" ]]
columnDefs: [
{ orderable: false, targets: [-1, -4] }
]
});
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -50,4 +50,18 @@
$(document).ready(function(){
$("#page-Crawler").addClass("active");
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -22,61 +22,45 @@
<div class="container-fluid">
<div class="row">
<div class="col-12 col-md-2 p-0 bg-light border-right">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
<nav class="navbar navbar-expand navbar-light bg-light flex-md-column flex-row align-items-start py-2">
<h5 class="d-flex text-muted w-100">
<span>Splash Crawlers </span>
<a class="ml-auto" href="#">
<i class="fas fa-plus-circle ml-auto"></i>
</a>
</h5>
<ul class="nav flex-md-column flex-row navbar-nav justify-content-between w-100"> <!--nav-pills-->
<li class="nav-item">
<a class="nav-link active" href="#">
<i class="fas fa-search"></i>
<span>Dashboard</span>
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#">
<i class="fas fa-search"></i>
Automatic Splash Crawler
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#">
<i class="fas fa-search"></i>
Manual Splash Crawler
</a>
</li>
</ul>
</nav>
<div >
<pre>
--------------
--------------
</pre>
</div>
</div>
<div >
<pre>
--------------
--------------
</pre>
</div>
</div>
</div>
</body>
<script>
$(document).ready(function(){
$("#page-Crawler").addClass("active");
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>

View file

@ -44,10 +44,6 @@ def get_git_metadata():
dict_git['last_local_tag'] = git_status.get_last_tag_from_local()
dict_git['last_remote_tag'] = git_status.get_last_tag_from_remote()
# # DEBUG:
dict_git['last_local_tag'] = 'v1.3'
dict_git['last_remote_commit'] = '234328439828943843839'
if dict_git['current_commit'] != dict_git['last_remote_commit']:
dict_git['new_git_update_available'] = True
else:

View file

@ -8,7 +8,7 @@
<nav class="navbar navbar-expand navbar-light bg-light flex-md-column flex-row align-items-start py-2" id="nav_menu">
<h5 class="d-flex text-muted w-100">
<span>Splash Crawlers </span>
<a class="ml-auto" href="#">
<a class="ml-auto" href="{{url_for('hiddenServices.manual')}}">
<i class="fas fa-plus-circle ml-auto"></i>
</a>
</h5>
@ -38,7 +38,7 @@
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#" id="nav_auto_crawler">
<a class="nav-link" href="{{url_for('hiddenServices.auto_crawler')}}" id="nav_auto_crawler">
<i class="fas fa-sync"></i>
Automatic Crawler
</a>