From 6fdf7c2123e69343efd145ec945ee43a5766c5b7 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 18 Apr 2019 16:57:51 +0200
Subject: [PATCH] chg: [UI crawler] status/remove auto crawler

---
 bin/Crawler.py                                  |   4 +-
 .../hiddenServices/Flask_hiddenServices.py      | 104 ++++++++-
 .../templates/Crawler_auto.html                 | 217 ++++++++++++++++++
 .../templates/Crawler_dashboard.html            |  14 ++
 .../templates/Crawler_index.html                |  72 +++---
 var/www/modules/settings/Flask_settings.py      |   4 -
 var/www/templates/crawler/menu_sidebar.html     |   4 +-
 7 files changed, 362 insertions(+), 57 deletions(-)
 create mode 100644 var/www/modules/hiddenServices/templates/Crawler_auto.html

diff --git a/bin/Crawler.py b/bin/Crawler.py
index 19de0e2a..b6ebdcf8 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -397,7 +397,9 @@ if __name__ == '__main__':
                 # add next auto Crawling in queue:
                 if to_crawl['paste'] == 'auto':
                     redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
-
+                    # update list, last auto crawled domains
+                    redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
+                    redis_crawler.ltrim('last_auto_crawled', 0, 9)
             else:
                 print('  Blacklisted Domain')
                 print()
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index b0deb6fc..dc51446b 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -105,6 +105,12 @@ def get_type_domain(domain):
         type = 'regular'
     return type
 
+def get_domain_from_url(url):
+    faup.decode(url)
+    unpack_url = faup.get()
+    domain = unpack_url['domain'].decode()
+    return domain
+
 def get_last_domains_crawled(type):
     return r_serv_onion.lrange('last_{}'.format(type), 0 ,-1)
 
@@ -117,10 +123,14 @@ def get_stats_last_crawled_domains(type, date):
         statDomains['domains_queue'] += r_serv_onion.scard('{}_crawler_priority_queue'.format(type))
     return statDomains
 
-def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
+def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None, auto_mode=False):
     list_crawled_metadata = []
     for domain_epoch in list_domains_crawled:
-        domain, epoch = domain_epoch.rsplit(';', 1)
+        if not auto_mode:
+            domain, epoch = domain_epoch.rsplit(';', 1)
+        else:
+            url = domain_epoch
+            domain = domain_epoch
         domain = domain.split(':')
         if len(domain) == 1:
             port = 80
@@ -131,7 +141,17 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
         metadata_domain = {}
         # get Domain type
         if type is None:
-            type = get_domain_type(domain)
+            type_domain = get_type_domain(domain)
+        else:
+            type_domain = type
+        if auto_mode:
+            metadata_domain['url'] = url
+            epoch = r_serv_onion.zscore('crawler_auto_queue', '{};auto;{}'.format(domain, type_domain))
+            #domain in priority queue
+            if epoch is None:
+                epoch = 'In Queue'
+            else:
+                epoch = datetime.datetime.fromtimestamp(float(epoch)).strftime('%Y-%m-%d %H:%M:%S')
 
         metadata_domain['domain'] = domain
         if len(domain) > 45:
@@ -141,13 +161,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
             metadata_domain['domain_name'] = domain
         metadata_domain['port'] = port
         metadata_domain['epoch'] = epoch
-        metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
+        metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'last_check')
         if metadata_domain['last_check'] is None:
             metadata_domain['last_check'] = '********'
-        metadata_domain['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'first_seen')
+        metadata_domain['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'first_seen')
         if metadata_domain['first_seen'] is None:
             metadata_domain['first_seen'] = '********'
-        if r_serv_onion.sismember('{}_up:{}'.format(type, metadata_domain['last_check']) , domain):
+        if r_serv_onion.sismember('{}_up:{}'.format(type_domain, metadata_domain['last_check']) , domain):
             metadata_domain['status_text'] = 'UP'
             metadata_domain['status_color'] = 'Green'
             metadata_domain['status_icon'] = 'fa-check-circle'
@@ -186,6 +206,18 @@ def send_url_to_crawl_in_queue(mode, service_type, url):
     if mode == 'auto':
         r_serv_onion.sadd('auto_crawler_url:{}'.format(service_type), url)
 
+def delete_auto_crawler(url):
+    domain = get_domain_from_url(url)
+    type = get_type_domain(domain)
+    # remove from set
+    r_serv_onion.srem('auto_crawler_url:{}'.format(type), url)
+    # remove config
+    r_serv_onion.delete('crawler_config:auto:{}:{}'.format(type, domain))
+    # remove from queue
+    r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url))
+    # remove from crawler_auto_queue
+    r_serv_onion.zrem('crawler_auto_queue'.format(type), '{};auto;{}'.format(url, type))
+
 # ============= ROUTES ==============
 
 @hiddenServices.route("/crawlers/", methods=['GET'])
@@ -390,6 +422,66 @@ def create_spider_splash():
 
     return redirect(url_for('hiddenServices.manual'))
 
+@hiddenServices.route("/crawlers/auto_crawler", methods=['GET'])
+def auto_crawler():
+    nb_element_to_display = 100
+    try:
+        page = int(request.args.get('page'))
+    except:
+        page = 1
+    if page <= 0:
+        page = 1
+
+    nb_auto_onion = r_serv_onion.scard('auto_crawler_url:onion')
+    nb_auto_regular = r_serv_onion.scard('auto_crawler_url:regular')
+
+    if nb_auto_onion > nb_auto_regular:
+        nb_max = nb_auto_onion
+    else:
+        nb_max = nb_auto_regular
+
+    nb_page_max = nb_max/(nb_element_to_display)
+    if isinstance(nb_page_max, float):
+        nb_page_max = int(nb_page_max)+1
+    if page > nb_page_max:
+        page = nb_page_max
+    start = nb_element_to_display*(page -1)
+    stop = nb_element_to_display*page
+
+    last_auto_crawled = get_last_domains_crawled('auto_crawled')
+    last_domains = get_last_crawled_domains_metadata(last_auto_crawled, '')
+
+    if start > nb_auto_onion:
+        auto_crawler_domain_onions = []
+    elif stop > nb_auto_onion:
+        auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:nb_auto_onion]
+    else:
+        auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:stop]
+
+    if start > nb_auto_regular:
+        auto_crawler_domain_regular = []
+    elif stop > nb_auto_regular:
+        auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:nb_auto_regular]
+    else:
+        auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:stop]
+
+    auto_crawler_domain_onions_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_onions, '', type='onion', auto_mode=True)
+    auto_crawler_domain_regular_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_regular, '', type='regular', auto_mode=True)
+
+    return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max,
+                            last_domains=last_domains,
+                            auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata,
+                            auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata)
+
+@hiddenServices.route("/crawlers/remove_auto_crawler", methods=['GET'])
+def remove_auto_crawler():
+    url = request.args.get('url')
+    page = request.args.get('page')
+
+    if url:
+        delete_auto_crawler(url)
+    return redirect(url_for('hiddenServices.auto_crawler', page=page))
+
 # # TODO: refractor
 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():
diff --git a/var/www/modules/hiddenServices/templates/Crawler_auto.html b/var/www/modules/hiddenServices/templates/Crawler_auto.html
new file mode 100644
index 00000000..977aed1e
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/Crawler_auto.html
@@ -0,0 +1,217 @@
[The 217 added lines of this new template lost their HTML markup in extraction.
 What survives: an "AIL-Framework" page that includes 'nav_bar.html' and
 'crawler/menu_sidebar.html', then renders three tables.
 1. "Domain | First Seen | Last Check | Status" -- wrapped in {%if last_domains%},
    looping {% for metadata_domain in last_domains %} over
    {{ metadata_domain['domain_name'] }},
    {{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}},
    {{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}},
    and {{metadata_domain['status_text']}} (the route also supplies
    status_color and status_icon for the status cell).
 2. "Onion Url | Next Check" -- looping
    {% for metadata_domain in auto_crawler_domain_onions_metadata %} over
    {{ metadata_domain['url'] }}, a removal control (presumably pointing at the
    new /crawlers/remove_auto_crawler endpoint), {{metadata_domain['epoch']}},
    and {{metadata_domain['status_text']}}.
 3. "Regular Url | Next Check" -- identical layout, looping
    {% for metadata_domain in auto_crawler_domain_regular_metadata %}.
 The stripped tail held the page/nb_page_max pagination controls and the
 closing script block.]
diff --git a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html
index 4c4f8a3a..1f27cc3d 100644
--- a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html
+++ b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html
@@ -50,4 +50,18 @@ $(document).ready(function(){
     $("#page-Crawler").addClass("active");
 });
+
+function toggle_sidebar(){
+    if($('#nav_menu').is(':visible')){
+        $('#nav_menu').hide();
+        $('#side_menu').removeClass('border-right')
+        $('#side_menu').removeClass('col-lg-2')
+        $('#core_content').removeClass('col-lg-10')
+    }else{
+        $('#nav_menu').show();
+        $('#side_menu').addClass('border-right')
+        $('#side_menu').addClass('col-lg-2')
+        $('#core_content').addClass('col-lg-10')
+    }
+}
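The auto-crawler bookkeeping above rests on two Redis structures: a sorted set ('crawler_auto_queue') whose score is the epoch of the next scheduled crawl, and a capped list ('last_auto_crawled') holding the ten most recently auto-crawled domains. A minimal sketch of that pattern, not part of the patch -- the standalone client and the helper names are illustrative, while the key layouts come straight from the diff:

```python
import time
import redis

# Illustrative client; AIL wires its own connection up from config files.
r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

def schedule_recrawl(message, type_service, delay_seconds):
    # Sorted set: member -> epoch of the next crawl. Crawler.py stores
    # '{original_message};{type_service}' scored with now + configured delay.
    member = '{};{}'.format(message, type_service)
    r.zadd('crawler_auto_queue', {member: int(time.time()) + delay_seconds})  # redis-py 3.x mapping form

def record_auto_crawled(domain, port, epoch):
    # Capped list: LPUSH the newest entry, then LTRIM to keep only ten,
    # mirroring the lpush/ltrim pair added to Crawler.py.
    r.lpush('last_auto_crawled', '{}:{};{}'.format(domain, port, epoch))
    r.ltrim('last_auto_crawled', 0, 9)

def next_check(domain, type_domain):
    # The UI's "Next Check" column: ZSCORE returns the scheduled epoch;
    # a missing member means the URL already moved to the priority queue.
    epoch = r.zscore('crawler_auto_queue', '{};auto;{}'.format(domain, type_domain))
    return 'In Queue' if epoch is None else time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))
```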
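One caveat in the auto_crawler route above: under Python 3, nb_max/nb_element_to_display is always a float, so the isinstance check always fires and an exact multiple (say 200 URLs at 100 per page) yields a trailing empty page. A ceiling-based computation, sketched here under the same variable names, sidesteps the edge case:

```python
import math

def page_bounds(nb_max, page, nb_element_to_display=100):
    # True page count via ceiling division; at least one page even when empty.
    nb_page_max = max(1, math.ceil(nb_max / nb_element_to_display))
    page = min(max(page, 1), nb_page_max)          # clamp the requested page
    start = nb_element_to_display * (page - 1)
    stop = nb_element_to_display * page            # slices clamp past the end
    return page, nb_page_max, start, stop
```

Since Python slices already clamp to the sequence length, the route's `elif stop > nb_auto_onion` branches could also collapse into a single `[start:stop]` slice.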
diff --git a/var/www/modules/hiddenServices/templates/Crawler_index.html b/var/www/modules/hiddenServices/templates/Crawler_index.html
index faccf26a..fea345a3 100644
--- a/var/www/modules/hiddenServices/templates/Crawler_index.html
+++ b/var/www/modules/hiddenServices/templates/Crawler_index.html
@@ -22,61 +22,45 @@
[Hunk markup largely lost in extraction. The surviving fragments show 61 lines of
 old layout replaced by 45 new ones that rebase the page on the shared
 {% include 'crawler/menu_sidebar.html' %} block; both the removed and the added
 lines carry the same pre-formatted crawler output area, whose "--------------"
 separators are all that survives of it.]
diff --git a/var/www/modules/settings/Flask_settings.py b/var/www/modules/settings/Flask_settings.py
index 2d2d2be9..f8600f58 100644
--- a/var/www/modules/settings/Flask_settings.py
+++ b/var/www/modules/settings/Flask_settings.py
@@ -44,10 +44,6 @@ def get_git_metadata():
     dict_git['last_local_tag'] = git_status.get_last_tag_from_local()
     dict_git['last_remote_tag'] = git_status.get_last_tag_from_remote()
 
-    # # DEBUG:
-    dict_git['last_local_tag'] = 'v1.3'
-    dict_git['last_remote_commit'] = '234328439828943843839'
-
     if dict_git['current_commit'] != dict_git['last_remote_commit']:
         dict_git['new_git_update_available'] = True
     else:
diff --git a/var/www/templates/crawler/menu_sidebar.html b/var/www/templates/crawler/menu_sidebar.html
index 7cfc61f0..2065bab9 100644
--- a/var/www/templates/crawler/menu_sidebar.html
+++ b/var/www/templates/crawler/menu_sidebar.html
@@ -8,7 +8,7 @@
[Hunk markup lost in extraction; per the diffstat it swaps two sidebar lines,
 presumably pointing the automatic-crawler menu entry at the new
 hiddenServices.auto_crawler endpoint.]
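Retiring a URL via delete_auto_crawler above has to touch four keys. A condensed, self-contained sketch of the same teardown -- the Faup usage matches the get_domain_from_url helper added in this patch, while the client setup and the get_type_domain stand-in are illustrative. Note that the patch's 'crawler_auto_queue'.format(type) is a no-op format, since the string carries no placeholder; plain 'crawler_auto_queue' is equivalent:

```python
import redis
from pyfaup.faup import Faup

r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
faup = Faup()

def get_domain_from_url(url):
    # Same extraction as the patch's helper: faup exposes URL parts as bytes.
    faup.decode(url)
    return faup.get()['domain'].decode()

def get_type_domain(domain):
    # Stand-in for the patch's helper: onion addresses vs. everything else.
    return 'onion' if domain.endswith('.onion') else 'regular'

def delete_auto_crawler(url):
    domain = get_domain_from_url(url)
    type_domain = get_type_domain(domain)
    # 1. unregister the URL from the per-type auto-crawler set
    r.srem('auto_crawler_url:{}'.format(type_domain), url)
    # 2. drop its stored crawler configuration
    r.delete('crawler_config:auto:{}:{}'.format(type_domain, domain))
    # 3. pull it from the priority queue if it was already promoted
    r.srem('{}_crawler_priority_queue'.format(type_domain), '{};auto'.format(url))
    # 4. drop the pending reschedule entry from the sorted set
    r.zrem('crawler_auto_queue', '{};auto;{}'.format(url, type_domain))
```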