Mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-10 08:38:28 +00:00

fix: [Crawler] fix onion blacklist + add crawler info

parent 1a1fda4c47
commit bb301a870c

4 changed files with 72 additions and 4 deletions
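The crawler info added in this commit lives in the Redis cache: each crawler process registers its Splash port in the set all_crawler:<type> and reports its state in the hash metadata_crawler:<port> (fields status, started_time, crawling_domain), which the hiddenServices page now reads. A minimal read-side sketch, assuming a locally reachable Redis cache (connection values are placeholders; the key and field names come from the diff below):

```python
import redis

# Placeholder connection values; AIL builds r_cache from its Redis_Cache config section.
r_cache = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

# One registered Splash port per crawler, one metadata hash per port (see diff below).
for splash_port in r_cache.smembers('all_crawler:onion'):
    meta = r_cache.hgetall('metadata_crawler:{}'.format(splash_port))
    # status is one of 'Waiting', 'Crawling', 'Error' or 'SPLASH DOWN'
    print(splash_port, meta.get('status'), meta.get('started_time'), meta.get('crawling_domain'))
```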
@@ -10,6 +10,8 @@ import time
 import subprocess
 import requests
 
+from pyfaup.faup import Faup
+
 sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher

@@ -22,6 +24,9 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
 
 def crawl_onion(url, domain, date, date_month, message):
 
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+
     #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
     super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
     if super_father is None:

@@ -43,13 +48,15 @@ def crawl_onion(url, domain, date, date_month, message):
                 print('--------------------------------------')
                 print(' \033[91m DOCKER SPLASH DOWN\033[0m')
                 print(' {} DOWN'.format(splash_url))
-                exit(1)
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
+                nb_retry == 0
 
             print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
             print(' Retry({}) in 10 seconds'.format(nb_retry))
             time.sleep(10)
 
     if r.status_code == 200:
+        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
         process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
                                    stdout=subprocess.PIPE)
         while process.poll() is None:

@@ -67,6 +74,7 @@ def crawl_onion(url, domain, date, date_month, message):
                 print('')
                 print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
                 print('------------------------------------------------------------------------')
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
                 exit(-2)
         else:
             print(process.stdout.read())

@@ -76,6 +84,7 @@ def crawl_onion(url, domain, date, date_month, message):
         print('--------------------------------------')
         print(' \033[91m DOCKER SPLASH DOWN\033[0m')
         print(' {} DOWN'.format(splash_url))
+        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
         exit(1)
 
 

@@ -119,6 +128,7 @@ if __name__ == '__main__':
     print('splash url: {}'.format(splash_url))
 
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+    faup = Faup()
 
     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
 

@@ -140,6 +150,10 @@ if __name__ == '__main__':
                                 db=p.config.getint("ARDB_Onion", "db"),
                                 decode_responses=True)
 
+    r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+
     # load domains blacklist
     try:
         with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:

@@ -180,7 +194,10 @@ if __name__ == '__main__':
             print('domain: {}'.format(domain))
             print('domain_url: {}'.format(domain_url))
 
-            if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
+            faup.decode(domain)
+            onion_domain=faup.get()['domain'].decode()
+
+            if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
 
                 date = datetime.datetime.now().strftime("%Y%m%d")
                 date_month = datetime.datetime.now().strftime("%Y%m")

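The blacklist fix in the hunk above now matches both the exact host and the registered domain extracted with Faup, so a blacklisted onion can no longer be re-crawled through a subdomain URL. A standalone sketch of the same check, assuming pyfaup is installed; the Redis connection values and the example domain are placeholders:

```python
import redis
from pyfaup.faup import Faup

# Placeholder connection; AIL builds r_onion from its ARDB_Onion config section.
r_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
faup = Faup()

def is_blacklisted(domain, type_hidden_service='onion'):
    # Extract the registered domain, e.g. 'xxxxxxxxxxxxxxxx.onion' from 'sub.xxxxxxxxxxxxxxxx.onion'.
    faup.decode(domain)
    onion_domain = faup.get()['domain'].decode()
    blacklist = 'blacklist_{}'.format(type_hidden_service)
    return r_onion.sismember(blacklist, domain) or r_onion.sismember(blacklist, onion_domain)
```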
@@ -243,6 +260,10 @@ if __name__ == '__main__':
                 r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                 r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
+                #update crawler status
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+                r_cache.hrem('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+
             else:
                 continue
         else:

@@ -30,6 +30,12 @@ r_serv = redis.StrictRedis(
     db=cfg.getint("Redis_Queues", "db"),
     decode_responses=True)
 
+r_cache = redis.StrictRedis(
+    host=cfg.get("Redis_Cache", "host"),
+    port=cfg.getint("Redis_Cache", "port"),
+    db=cfg.getint("Redis_Cache", "db"),
+    decode_responses=True)
+
 r_serv_log = redis.StrictRedis(
     host=cfg.get("Redis_Log", "host"),
     port=cfg.getint("Redis_Log", "port"),

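The new r_cache handle above expects a Redis_Cache section in the AIL configuration, read with the same cfg.get/cfg.getint pattern as the existing Redis handles. A minimal sketch of that pattern with configparser; the host, port and db values shown are assumptions, not the project defaults:

```python
import configparser
import redis

# Assumed config content; in AIL the values come from its own config file.
cfg = configparser.ConfigParser()
cfg.read_string("""
[Redis_Cache]
host = localhost
port = 6379
db = 0
""")

r_cache = redis.StrictRedis(
    host=cfg.get("Redis_Cache", "host"),
    port=cfg.getint("Redis_Cache", "port"),
    db=cfg.getint("Redis_Cache", "db"),
    decode_responses=True)
```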
@@ -19,6 +19,7 @@ import Flask_config
 app = Flask_config.app
 cfg = Flask_config.cfg
 baseUrl = Flask_config.baseUrl
+r_cache = Flask_config.r_cache
 r_serv_onion = Flask_config.r_serv_onion
 r_serv_metadata = Flask_config.r_serv_metadata
 bootstrap_label = Flask_config.bootstrap_label

@@ -102,8 +103,22 @@ def hiddenServices_page():
             metadata_onion['status_icon'] = 'fa-times-circle'
         list_onion.append(metadata_onion)
 
+    crawler_metadata=[]
+    all_onion_crawler = r_cache.smembers('all_crawler:onion')
+    for crawler in all_onion_crawler:
+        crawling_domain = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+        started_time = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'started_time')
+        status_info = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'status')
+        crawler_info = '{} - {}'.format(crawler, started_time)
+        if status_info=='Waiting' or status_info=='Crawling':
+            status=True
+        else:
+            status=False
+        crawler_metadata.append({'crawler_info': crawler, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
+
     date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
-    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, date_from=date_string, date_to=date_string)
+    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
+                            crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
 
 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():

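Each entry appended to crawler_metadata in the hunk above is a plain dict that the new template block below renders, one table row per crawler. An illustrative entry, with made-up values:

```python
# Illustrative only: shape of one crawler_metadata entry built by hiddenServices_page().
example_entry = {
    'crawler_info': '8050',              # crawler identifier taken from the all_crawler:onion set
    'crawling_domain': 'example.onion',  # hypothetical domain currently being crawled
    'status_info': 'Crawling',           # raw status string from the metadata_crawler hash
    'status': True                       # True for 'Waiting'/'Crawling', False otherwise
}
```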
@@ -142,7 +142,6 @@
         </div>
       </div>
 
-
       <div class="panel panel-info">
         <div class="panel-heading">
           <i class="fa fa-eye-slash"></i> Domains Crawled Today

@@ -203,6 +202,33 @@
               </tbody>
             </table>
           </div>
 
+          {%if crawler_metadata%}
+          <div class="panel panel-info">
+            <div class="panel-heading">
+              Crawlers Status
+            </div>
+
+            <table class="table table-hover table-striped">
+              <tbody>
+                {% for crawler in crawler_metadata %}
+                <tr>
+                  <td>
+                    <i class="fa fa-{%if crawler['status']%}check{%else%}times{%endif%}-circle fa-2x" style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};"></i> {{crawler['crawler_info']}}
+                  </td>
+                  <td>
+                    {{crawler['crawling_domain']}}
+                  </td>
+                  <td style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};">
+                    {{crawler['status_info']}}
+                  </td>
+                </tr>
+                {% endfor %}
+              </tbody>
+            </table>
+          </div>
+          {%endif%}
+
         </div>
 
       </div>