fix: [Crawler] fix onion blacklist + add crawler info

This commit is contained in:
Terrtia 2019-01-29 12:00:14 +01:00
parent 1a1fda4c47
commit bb301a870c
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 72 additions and 4 deletions

View file

@ -10,6 +10,8 @@ import time
import subprocess
import requests
from pyfaup.faup import Faup
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
from pubsublogger import publisher
@ -22,6 +24,9 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
def crawl_onion(url, domain, date, date_month, message):
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
if super_father is None:
@ -43,13 +48,15 @@ def crawl_onion(url, domain, date, date_month, message):
print('--------------------------------------')
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url))
exit(1)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
nb_retry == 0
print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
print(' Retry({}) in 10 seconds'.format(nb_retry))
time.sleep(10)
if r.status_code == 200:
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
stdout=subprocess.PIPE)
while process.poll() is None:
@ -67,6 +74,7 @@ def crawl_onion(url, domain, date, date_month, message):
print('')
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
print('------------------------------------------------------------------------')
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
exit(-2)
else:
print(process.stdout.read())
@ -76,6 +84,7 @@ def crawl_onion(url, domain, date, date_month, message):
print('--------------------------------------')
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url))
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
exit(1)
@ -119,6 +128,7 @@ if __name__ == '__main__':
print('splash url: {}'.format(splash_url))
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
faup = Faup()
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
@ -140,6 +150,10 @@ if __name__ == '__main__':
db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
# load domains blacklist
try:
with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
@ -180,7 +194,10 @@ if __name__ == '__main__':
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
faup.decode(domain)
onion_domain=faup.get()['domain'].decode()
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
date = datetime.datetime.now().strftime("%Y%m%d")
date_month = datetime.datetime.now().strftime("%Y%m")
@ -243,6 +260,10 @@ if __name__ == '__main__':
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
#update crawler status
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hrem('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
else:
continue
else:

View file

@ -30,6 +30,12 @@ r_serv = redis.StrictRedis(
db=cfg.getint("Redis_Queues", "db"),
decode_responses=True)
r_cache = redis.StrictRedis(
host=cfg.get("Redis_Cache", "host"),
port=cfg.getint("Redis_Cache", "port"),
db=cfg.getint("Redis_Cache", "db"),
decode_responses=True)
r_serv_log = redis.StrictRedis(
host=cfg.get("Redis_Log", "host"),
port=cfg.getint("Redis_Log", "port"),

View file

@ -19,6 +19,7 @@ import Flask_config
app = Flask_config.app
cfg = Flask_config.cfg
baseUrl = Flask_config.baseUrl
r_cache = Flask_config.r_cache
r_serv_onion = Flask_config.r_serv_onion
r_serv_metadata = Flask_config.r_serv_metadata
bootstrap_label = Flask_config.bootstrap_label
@ -102,8 +103,22 @@ def hiddenServices_page():
metadata_onion['status_icon'] = 'fa-times-circle'
list_onion.append(metadata_onion)
crawler_metadata=[]
all_onion_crawler = r_cache.smembers('all_crawler:onion')
for crawler in all_onion_crawler:
crawling_domain = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
started_time = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'started_time')
status_info = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'status')
crawler_info = '{} - {}'.format(crawler, started_time)
if status_info=='Waiting' or status_info=='Crawling':
status=True
else:
status=False
crawler_metadata.append({'crawler_info': crawler, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, date_from=date_string, date_to=date_string)
return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
def last_crawled_domains_with_stats_json():

View file

@ -142,7 +142,6 @@
</div>
</div>
<div class="panel panel-info">
<div class="panel-heading">
<i class="fa fa-eye-slash"></i> Domains Crawled Today
@ -203,6 +202,33 @@
</tbody>
</table>
</div>
{%if crawler_metadata%}
<div class="panel panel-info">
<div class="panel-heading">
Crawlers Status
</div>
<table class="table table-hover table-striped">
<tbody>
{% for crawler in crawler_metadata %}
<tr>
<td>
<i class="fa fa-{%if crawler['status']%}check{%else%}times{%endif%}-circle fa-2x" style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};"></i> {{crawler['crawler_info']}}
</td>
<td>
{{crawler['crawling_domain']}}
</td>
<td style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};">
{{crawler['status_info']}}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{%endif%}
</div>
</div>