fix: [Crawler] fix onion blacklist + add crawler info

Terrtia 2019-01-29 12:00:14 +01:00
parent 1a1fda4c47
commit bb301a870c
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 72 additions and 4 deletions

View file

@@ -10,6 +10,8 @@ import time
 import subprocess
 import requests
 
+from pyfaup.faup import Faup
+
 sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher
@@ -22,6 +24,9 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
 
 def crawl_onion(url, domain, date, date_month, message):
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+
     #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
     super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
     if super_father is None:
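For reference, the new bookkeeping follows a simple convention: each crawler registers its Splash port in a set and publishes its state in a per-port hash. A minimal sketch of how a consumer could read that back, assuming a local Redis cache instance (the helper below is illustrative, not part of the commit):

import redis

# Key layout introduced by this commit (names from the diff):
#   all_crawler:<type>             -> set of Splash ports, one entry per crawler
#   metadata_crawler:<splash_port> -> hash with fields: status, started_time, crawling_domain
r_cache = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

def get_crawler_metadata(splash_port):
    # Read back the fields the crawler writes; mirrors what the Flask view below does.
    key = 'metadata_crawler:{}'.format(splash_port)
    return {
        'status': r_cache.hget(key, 'status'),
        'started_time': r_cache.hget(key, 'started_time'),
        'crawling_domain': r_cache.hget(key, 'crawling_domain'),
    }

for port in r_cache.smembers('all_crawler:onion'):
    print(port, get_crawler_metadata(port))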
@@ -43,13 +48,15 @@ def crawl_onion(url, domain, date, date_month, message):
                print('--------------------------------------')
                print('         \033[91m DOCKER SPLASH DOWN\033[0m')
                print('          {} DOWN'.format(splash_url))
-               exit(1)
+               r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
+               nb_retry = 0
 
            print('         \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
            print('          Retry({}) in 10 seconds'.format(nb_retry))
            time.sleep(10)
 
    if r.status_code == 200:
+       r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
                                   stdout=subprocess.PIPE)
        while process.poll() is None:
@@ -67,6 +74,7 @@ def crawl_onion(url, domain, date, date_month, message):
            print('')
            print('            PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
            print('------------------------------------------------------------------------')
+           r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
            exit(-2)
        else:
            print(process.stdout.read())
@@ -76,6 +84,7 @@ def crawl_onion(url, domain, date, date_month, message):
        print('--------------------------------------')
        print('         \033[91m DOCKER SPLASH DOWN\033[0m')
        print('          {} DOWN'.format(splash_url))
+       r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
        exit(1)
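The hunks above all sit inside the existing Splash health-check retry loop, which the diff only shows in fragments. A condensed, hypothetical rendering of that pattern, with the key names taken from the diff and the loop scaffolding assumed from the surrounding module:

import time
import requests

def wait_for_splash(splash_url, splash_port, r_cache, max_retry=6, delay=10):
    # Poll the Splash endpoint until it answers; after max_retry consecutive
    # failures, advertise 'SPLASH DOWN' in the crawler's metadata hash and
    # keep retrying. Returns the successful response.
    nb_retry = 0
    while True:
        try:
            return requests.get(splash_url, timeout=30.0)
        except requests.exceptions.RequestException:
            nb_retry += 1
            if nb_retry == max_retry:
                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
                nb_retry = 0  # reset the counter so the loop keeps retrying
            print('Retry({}) in {} seconds'.format(nb_retry, delay))
            time.sleep(delay)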
@@ -119,6 +128,7 @@ if __name__ == '__main__':
    print('splash url: {}'.format(splash_url))
 
    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+   faup = Faup()
 
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
@@ -140,6 +150,10 @@ if __name__ == '__main__':
                            db=p.config.getint("ARDB_Onion", "db"),
                            decode_responses=True)
 
+   r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
+   r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+   r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+
    # load domains blacklist
    try:
        with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
@@ -180,7 +194,10 @@ if __name__ == '__main__':
            print('domain: {}'.format(domain))
            print('domain_url: {}'.format(domain_url))
 
-           if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
+           faup.decode(domain)
+           onion_domain = faup.get()['domain'].decode()
+
+           if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
 
                date = datetime.datetime.now().strftime("%Y%m%d")
                date_month = datetime.datetime.now().strftime("%Y%m")
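This hunk is the blacklist fix itself: previously only the exact host was tested, so with abcdef.onion blacklisted, sub.abcdef.onion would still be crawled. Decoding with pyfaup yields the registered domain, and both names are now checked. A minimal sketch of that idea, assuming pyfaup is installed and behaves as the diff relies on (the helper and example domain are illustrative):

from pyfaup.faup import Faup

faup = Faup()

def blacklist_candidates(host):
    # Names to test against the blacklist: the host as given plus the
    # registered domain pyfaup extracts from it, so that blacklisting
    # 'abcdef.onion' also blocks 'sub.abcdef.onion'.
    faup.decode(host)
    domain = faup.get()['domain']
    if isinstance(domain, bytes):  # pyfaup returns bytes in some versions
        domain = domain.decode()
    return {host, domain}

print(blacklist_candidates('sub.abcdef.onion'))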
@@ -243,6 +260,10 @@ if __name__ == '__main__':
                    r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                    r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
+                   # update crawler status
+                   r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+                   r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+
                else:
                    continue
            else:
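Taken together, the Crawler.py changes give each crawler a small observable state machine. The status labels below are the ones written in the hunks above; the helper bundling the scattered hset/hdel calls is hypothetical:

# Status values written by Crawler.py in this commit:
#   'Waiting'     - crawler registered and idle (at startup, and after each domain)
#   'Crawling'    - Splash answered 200 and tor_crawler.py is running
#   'SPLASH DOWN' - the health check failed repeatedly
#   'Error'       - the crawl subprocess hit a proxy error

def set_crawler_status(r_cache, splash_port, status, crawling_domain=None):
    # Hypothetical helper bundling the hset/hdel calls scattered through the diff.
    key = 'metadata_crawler:{}'.format(splash_port)
    r_cache.hset(key, 'status', status)
    if crawling_domain is None:
        r_cache.hdel(key, 'crawling_domain')
    else:
        r_cache.hset(key, 'crawling_domain', crawling_domain)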

View file

@@ -30,6 +30,12 @@ r_serv = redis.StrictRedis(
    db=cfg.getint("Redis_Queues", "db"),
    decode_responses=True)
 
+r_cache = redis.StrictRedis(
+   host=cfg.get("Redis_Cache", "host"),
+   port=cfg.getint("Redis_Cache", "port"),
+   db=cfg.getint("Redis_Cache", "db"),
+   decode_responses=True)
+
 r_serv_log = redis.StrictRedis(
    host=cfg.get("Redis_Log", "host"),
    port=cfg.getint("Redis_Log", "port"),

View file

@@ -19,6 +19,7 @@ import Flask_config
 app = Flask_config.app
 cfg = Flask_config.cfg
 baseUrl = Flask_config.baseUrl
+r_cache = Flask_config.r_cache
 r_serv_onion = Flask_config.r_serv_onion
 r_serv_metadata = Flask_config.r_serv_metadata
 bootstrap_label = Flask_config.bootstrap_label
@@ -102,8 +103,22 @@ def hiddenServices_page():
            metadata_onion['status_icon'] = 'fa-times-circle'
        list_onion.append(metadata_onion)
 
+   crawler_metadata = []
+   all_onion_crawler = r_cache.smembers('all_crawler:onion')
+   for crawler in all_onion_crawler:
+       crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
+       started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
+       status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
+       crawler_info = '{} - {}'.format(crawler, started_time)
+       if status_info == 'Waiting' or status_info == 'Crawling':
+           status = True
+       else:
+           status = False
+       crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
+
    date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
-   return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, date_from=date_string, date_to=date_string)
+   return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
+                          crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
 
 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():
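The view now hands the template a crawler_metadata list; each entry has the shape sketched below (field names from the diff, values purely illustrative), with status folding 'Waiting'/'Crawling' into the boolean that drives the green/red icon in the template:

# Shape of one crawler_metadata entry handed to hiddenServices.html:
example_entry = {
    'crawler_info': '8050 - 2019/01/29 - 12:00.14',  # splash port + started_time
    'crawling_domain': 'abcdef.onion',
    'status_info': 'Crawling',
    'status': True,  # True for 'Waiting'/'Crawling'; drives the green check icon
}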

View file

@@ -142,7 +142,6 @@
            </div>
        </div>
-
        <div class="panel panel-info">
            <div class="panel-heading">
                <i class="fa fa-eye-slash"></i> Domains Crawled Today
@@ -203,6 +202,33 @@
                </tbody>
            </table>
        </div>
+
+       {%if crawler_metadata%}
+       <div class="panel panel-info">
+           <div class="panel-heading">
+               Crawlers Status
+           </div>
+           <table class="table table-hover table-striped">
+               <tbody>
+               {% for crawler in crawler_metadata %}
+               <tr>
+                   <td>
+                       <i class="fa fa-{%if crawler['status']%}check{%else%}times{%endif%}-circle fa-2x" style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};"></i> {{crawler['crawler_info']}}
+                   </td>
+                   <td>
+                       {{crawler['crawling_domain']}}
+                   </td>
+                   <td style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};">
+                       {{crawler['status_info']}}
+                   </td>
+               </tr>
+               {% endfor %}
+               </tbody>
+           </table>
+       </div>
+       {%endif%}
    </div>
 </div>