From bb301a870c822354f697db5e818bdca9b6f29bc2 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 29 Jan 2019 12:00:14 +0100
Subject: [PATCH] fix: [Crawler] fix onion blacklist + add crawler info

---
 bin/Crawler.py                                     | 25 +++++++++++++++--
 var/www/modules/Flask_config.py                    |  6 ++++
 .../hiddenServices/Flask_hiddenServices.py         | 17 ++++++++++-
 .../templates/hiddenServices.html                  | 28 ++++++++++++++++++-
 4 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/bin/Crawler.py b/bin/Crawler.py
index 0f69cfe6..5b8dd6f9 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -10,6 +10,8 @@ import time
 import subprocess
 import requests
 
+from pyfaup.faup import Faup
+
 sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher
@@ -22,6 +24,9 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
 
 def crawl_onion(url, domain, date, date_month, message):
 
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+
     #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
     super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
     if super_father is None:
@@ -43,13 +48,15 @@ def crawl_onion(url, domain, date, date_month, message):
                 print('--------------------------------------')
                 print('         \033[91m DOCKER SPLASH DOWN\033[0m')
                 print('          {} DOWN'.format(splash_url))
-                exit(1)
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
+                nb_retry = 0
 
             print('         \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
             print('          Retry({}) in 10 seconds'.format(nb_retry))
             time.sleep(10)
 
     if r.status_code == 200:
+        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
         process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
                                    stdout=subprocess.PIPE)
         while process.poll() is None:
@@ -67,6 +74,7 @@ def crawl_onion(url, domain, date, date_month, message):
                 print('')
                 print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
                 print('------------------------------------------------------------------------')
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
                 exit(-2)
         else:
             print(process.stdout.read())
@@ -76,6 +84,7 @@ def crawl_onion(url, domain, date, date_month, message):
         print('--------------------------------------')
         print('         \033[91m DOCKER SPLASH DOWN\033[0m')
         print('          {} DOWN'.format(splash_url))
+        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
         exit(1)
 
 
@@ -119,6 +128,7 @@ if __name__ == '__main__':
     print('splash url: {}'.format(splash_url))
 
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+    faup = Faup()
 
     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
 
@@ -140,6 +150,10 @@ if __name__ == '__main__':
         db=p.config.getint("ARDB_Onion", "db"),
         decode_responses=True)
 
+    r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+
     # load domains blacklist
     try:
         with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
@@ -180,7 +194,10 @@ if __name__ == '__main__':
            print('domain: {}'.format(domain))
            print('domain_url: {}'.format(domain_url))
 
-           if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
+           faup.decode(domain)
+           onion_domain = faup.get()['domain'].decode()
+
+           if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
 
                date = datetime.datetime.now().strftime("%Y%m%d")
                date_month = datetime.datetime.now().strftime("%Y%m")
@@ -243,6 +260,10 @@ if __name__ == '__main__':
                    r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                    r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
+                   #update crawler status
+                   r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+                   r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+
                else:
                    continue
        else:
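Why the second sismember call: faup.get()['domain'] reduces a host like foo.bar.xxxxxxxx.onion to its registered domain xxxxxxxx.onion, so a blacklist entry for the bare onion address also blocks every subdomain of it. A minimal standalone sketch of the same check follows; the localhost Redis location and the populated 'blacklist_onion' set are assumptions for illustration, only the key pattern mirrors Crawler.py:

#!/usr/bin/env python3
# Sketch of the blacklist check above, outside AIL.
# Assumed: a local Redis and a pre-filled 'blacklist_onion' set,
# mirroring the 'blacklist_{type_hidden_service}' key used in Crawler.py.
import redis
from pyfaup.faup import Faup

faup = Faup()
r_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

def is_blacklisted(domain, type_hidden_service='onion'):
    blacklist_key = 'blacklist_{}'.format(type_hidden_service)
    # faup returns the registered domain as bytes:
    # 'foo.bar.xxxxxxxx.onion' -> b'xxxxxxxx.onion'
    faup.decode(domain)
    onion_domain = faup.get()['domain'].decode()
    return r_onion.sismember(blacklist_key, domain) or \
           r_onion.sismember(blacklist_key, onion_domain)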
diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py
index 7cc802f0..d26b2363 100644
--- a/var/www/modules/Flask_config.py
+++ b/var/www/modules/Flask_config.py
@@ -30,6 +30,12 @@ r_serv = redis.StrictRedis(
     db=cfg.getint("Redis_Queues", "db"),
     decode_responses=True)
 
+r_cache = redis.StrictRedis(
+    host=cfg.get("Redis_Cache", "host"),
+    port=cfg.getint("Redis_Cache", "port"),
+    db=cfg.getint("Redis_Cache", "db"),
+    decode_responses=True)
+
 r_serv_log = redis.StrictRedis(
     host=cfg.get("Redis_Log", "host"),
     port=cfg.getint("Redis_Log", "port"),
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 5b53370a..e532a250 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -19,6 +19,7 @@ import Flask_config
 app = Flask_config.app
 cfg = Flask_config.cfg
 baseUrl = Flask_config.baseUrl
+r_cache = Flask_config.r_cache
 r_serv_onion = Flask_config.r_serv_onion
 r_serv_metadata = Flask_config.r_serv_metadata
 bootstrap_label = Flask_config.bootstrap_label
@@ -102,8 +103,22 @@ def hiddenServices_page():
             metadata_onion['status_icon'] = 'fa-times-circle'
         list_onion.append(metadata_onion)
 
+    crawler_metadata = []
+    all_onion_crawler = r_cache.smembers('all_crawler:onion')
+    for crawler in all_onion_crawler:
+        crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
+        started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
+        status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
+        crawler_info = '{} - {}'.format(crawler, started_time)
+        if status_info == 'Waiting' or status_info == 'Crawling':
+            status = True
+        else:
+            status = False
+        crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
+
     date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
-    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, date_from=date_string, date_to=date_string)
+    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
+                            crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
 
 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():
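The web view relies on a small Redis contract written by the crawler: each instance registers its Splash port in the all_crawler:<type> set and keeps a metadata_crawler:<port> hash holding status, started_time and, while busy, crawling_domain. A throwaway inspector that reads the contract the same way hiddenServices_page() does; the host, port and db below stand in for the Redis_Cache settings and are assumptions:

#!/usr/bin/env python3
# Sketch: dump crawler status hashes as the Flask endpoint reads them.
import redis

r_cache = redis.StrictRedis(host='localhost', port=6379, db=1, decode_responses=True)

for crawler in r_cache.smembers('all_crawler:onion'):
    meta = r_cache.hgetall('metadata_crawler:{}'.format(crawler))
    # 'Waiting' and 'Crawling' are the two healthy states set by Crawler.py
    up = meta.get('status') in ('Waiting', 'Crawling')
    print('splash {} [{}] status={} domain={}'.format(
        crawler, 'UP' if up else 'DOWN',
        meta.get('status'), meta.get('crawling_domain', '-')))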
diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html
index 88a42d0d..fa6fa0ec 100644
--- a/var/www/modules/hiddenServices/templates/hiddenServices.html
+++ b/var/www/modules/hiddenServices/templates/hiddenServices.html
@@ -142,7 +142,6 @@
-
             Domains Crawled Today
@@ -203,6 +202,33 @@
 
+
+    {%if crawler_metadata%}
+    <div class="panel panel-info" style="margin-top:30px;">
+      <div class="panel-heading">
+        Crawlers Status
+      </div>
+
+      <table class="table table-hover table-striped">
+        <tbody>
+          {% for crawler in crawler_metadata %}
+          <tr>
+            <td>
+              {{crawler['crawler_info']}}
+            </td>
+            <td>
+              {{crawler['crawling_domain']}}
+            </td>
+            <td style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};">
+              {{crawler['status_info']}}
+            </td>
+          </tr>
+          {% endfor %}
+        </tbody>
+      </table>
+    </div>
+    {%endif%}
+
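To see the new Crawlers Status panel populated without launching a crawler, the cache can be seeded by hand. Ports and timestamps below are illustrative; only the key names match the patch:

#!/usr/bin/env python3
# Sketch: seed fake crawler entries so the panel renders rows.
import datetime
import redis

r_cache = redis.StrictRedis(host='localhost', port=6379, db=1, decode_responses=True)

started_time = datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")
for splash_port in (8050, 8051):   # hypothetical Splash ports
    r_cache.sadd('all_crawler:onion', splash_port)
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', started_time)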