chg: [crawler] add test + relaunch crawlers + major fixes

This commit is contained in:
Terrtia 2021-03-29 20:27:20 +02:00
parent 8754350d39
commit c0be210d2c
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
7 changed files with 379 additions and 152 deletions

View file

@ -29,7 +29,8 @@ if __name__ == '__main__':
is_manager_connected = crawlers.reload_splash_and_proxies_list()
print(is_manager_connected)
if is_manager_connected:
crawlers.relaunch_crawlers()
if crawlers.test_ail_crawlers():
crawlers.relaunch_crawlers()
last_check = int(time.time())
while True:
@ -45,7 +46,8 @@ if __name__ == '__main__':
is_manager_connected = crawlers.reload_splash_and_proxies_list()
if is_manager_connected:
print('reload proxies and splash list')
crawlers.relaunch_crawlers()
if crawlers.test_ail_crawlers():
crawlers.relaunch_crawlers()
session_uuid = current_session_uuid
if not is_manager_connected:
print('Error, Can\'t connect to Splash manager')

View file

@ -16,6 +16,8 @@ import sys
import time
import uuid
import subprocess
from datetime import datetime, timedelta
from urllib.parse import urlparse
@ -25,6 +27,9 @@ from pyfaup.faup import Faup
import requests
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import git_status
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
@ -429,6 +434,19 @@ def get_splash_crawler_status(spash_url):
status=False
return {'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status, 'type': crawler_type}
def set_current_crawler_status(splash_url, status, started_time=False, crawled_domain=None, crawler_type=None):
    """Write the current state of one Splash crawler into the cache.

    All fields go into the ``metadata_crawler:<splash_url>`` hash of ``r_cache``.

    :param splash_url: value used to build the hash key
    :param status: stored in the 'status' field (e.g. 'Waiting', 'Error', ...)
    :param started_time: when truthy, 'started_time' is set to the current local time
    :param crawled_domain: when truthy, stored in the 'crawling_domain' field
    :param crawler_type: when truthy, stored in the 'type' field
    """
    # TODO: get crawler type if None
    metadata_key = 'metadata_crawler:{}'.format(splash_url)

    r_cache.hset(metadata_key, 'status', status)

    if started_time:
        launch_time = datetime.now().strftime("%Y/%m/%d - %H:%M.%S")
        r_cache.hset(metadata_key, 'started_time', launch_time)

    # Optional fields are only written when a truthy value was supplied.
    optional_fields = (('type', crawler_type), ('crawling_domain', crawled_domain))
    for field_name, field_value in optional_fields:
        if field_value:
            r_cache.hset(metadata_key, field_name, field_value)

    #r_cache.sadd('all_splash_crawlers', splash_url) # # TODO: add me in fct: create_ail_crawler
def get_stats_last_crawled_domains(crawler_types, date):
statDomains = {}
for crawler_type in crawler_types:
@ -1014,6 +1032,20 @@ def get_all_splash_by_proxy(proxy_name, r_list=False):
else:
return []
def get_all_splash_name_by_crawler_type(crawler_type):
    """Return the list of Splash names whose crawler type equals ``crawler_type``.

    Iterates over get_all_splash() and keeps each name for which
    get_splash_crawler_type() matches.
    """
    return [name for name in get_all_splash()
            if get_splash_crawler_type(name) == crawler_type]
def get_all_splash_url_by_crawler_type(crawler_type):
    """Return every Splash URL belonging to a Splash of the given crawler type.

    The result is a flat list: the URLs of each matching Splash, in the order
    the Splash names are returned by get_all_splash_name_by_crawler_type().
    """
    return [url
            for name in get_all_splash_name_by_crawler_type(crawler_type)
            for url in get_splash_all_url(name, r_list=True)]
def delete_all_splash_containers():
for splash_name in get_all_splash():
delete_splash_container(splash_name)
@ -1140,7 +1172,106 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
screen.create_screen(screen_name)
screen.launch_uniq_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options, kill_previous_windows=True)
def is_test_ail_crawlers_successful():
    """Return True when the stored 'success' field of the last crawler test is the string 'True'."""
    stored_flag = r_serv_onion.hget('crawler:tor:test', 'success')
    return stored_flag == 'True'
def get_test_ail_crawlers_message():
    """Return the 'message' field stored with the last crawler test result."""
    last_message = r_serv_onion.hget('crawler:tor:test', 'message')
    return last_message
def save_test_ail_crawlers_result(test_success, message):
    """Persist the outcome of the crawler test in the 'crawler:tor:test' hash.

    :param test_success: any value; its truthiness is stored as 'True' or 'False'
    :param message: text stored in the 'message' field
    """
    # Store the flag as an explicit string: redis-py >= 3.0 refuses bool values
    # (DataError), and the reader compares the stored field against 'True'.
    success_flag = str(bool(test_success))
    r_serv_onion.hset('crawler:tor:test', 'success', success_flag)
    r_serv_onion.hset('crawler:tor:test', 'message', message)
def test_ail_crawlers():
    """Run a test crawl of a known onion page through the first Tor Splash.

    The outcome is persisted with save_test_ail_crawlers_result() and the
    status of the Splash used is updated with set_current_crawler_status().

    :return: False when the Splash manager or the Splash itself is
             unreachable, when no Tor Splash exists, when the crawler process
             exits with a non-zero code, or when its standard output contains
             'Connection to proxy refused'; True otherwise
    """
    # # TODO: test regular domain
    if not ping_splash_manager():
        manager_url = get_splash_manager_url()
        error_message = f'Error: Can\'t connect to AIL Splash Manager, http://{manager_url}'
        print(error_message)
        save_test_ail_crawlers_result(False, error_message)
        return False

    tor_splash_urls = get_all_splash_url_by_crawler_type('tor')
    if not tor_splash_urls:
        error_message = 'Error: No Tor Splash Launched'
        print(error_message)
        save_test_ail_crawlers_result(False, error_message)
        return False
    splash_url = tor_splash_urls[0]

    commit_id = git_status.get_last_commit_id_from_local()
    crawler_options = {'html': True,
                       'har': False,
                       'png': False,
                       'depth_limit': 0,
                       'closespider_pagecount': 100,
                       'cookiejar_uuid': None,
                       'user_agent': commit_id + '-AIL SPLASH CRAWLER'}
    # Take a single timestamp so day and month cannot disagree around midnight.
    now = datetime.now()
    date = {'date_day': now.strftime("%Y%m%d"),
            'date_month': now.strftime("%Y%m"),
            'epoch': int(time.time())}
    crawler_config = {'splash_url': f'http://{splash_url}',
                      'service_type': 'onion',
                      'url': 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion',
                      'domain': 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion',
                      'port': 80,
                      'original_item': None,
                      'item': None,
                      'crawler_options': crawler_options,
                      'date': date,
                      'requested': 'test'}

    ## CHECK IF SPLASH AVAILABLE ##
    try:
        requests.get(f'http://{splash_url}', timeout=30.0)
    except Exception:
        # Any failure to reach the Splash endpoint is reported as a failed test.
        error_message = f'Error: Can\'t connect to Splash Docker, http://{splash_url}'
        print(error_message)
        save_test_ail_crawlers_result(False, error_message)
        return False
    ## -- ##

    ## LAUNCH CRAWLER, TEST MODE ##
    set_current_crawler_status(splash_url, 'CRAWLER TEST', started_time=True, crawled_domain='TEST DOMAIN', crawler_type='onion')
    UUID = str(uuid.uuid4())
    r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))

    tor_crawler_script = os.path.join(os.environ['AIL_BIN'], 'torcrawler/tor_crawler.py')
    process = subprocess.Popen(["python", tor_crawler_script, UUID],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting for the child, so a chatty
    # crawler cannot block on a full pipe buffer; each stream is read exactly once.
    raw_stdout, raw_stderr = process.communicate()
    output = raw_stdout.decode(errors='replace')
    stderr = raw_stderr.decode(errors='replace')

    if process.returncode != 0:
        # ERROR: the crawler process itself failed
        error = f'-stderr-\n{stderr}\n-stdout-\n{output}'
        print(error)
        save_test_ail_crawlers_result(False, error)
        set_current_crawler_status(splash_url, 'Error')
        return False

    # Scrapy-Splash ERRORS
    # NOTE(review): as before, stderr output alone records a failed result but
    # does not stop the function from returning True below - confirm intent.
    if stderr:
        print(f'stderr: {stderr}')
        save_test_ail_crawlers_result(False, f'Error: {stderr}')
        set_current_crawler_status(splash_url, 'Error')

    # error: splash:Connection to proxy refused
    if 'Connection to proxy refused' in output:
        print('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url))
        save_test_ail_crawlers_result(False, 'SPASH, PROXY DOWN OR BAD CONFIGURATION')
        set_current_crawler_status(splash_url, 'Error')
        return False

    set_current_crawler_status(splash_url, 'Waiting')
    return True
## -- ##
#### ---- ####
@ -1151,5 +1282,7 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
if __name__ == '__main__':
    # Manual check: query the Splash manager version, run the crawler test,
    # then print the stored test flag followed by the stored test message.
    get_splash_manager_version()
    #res = restart_splash_docker('127.0.0.1:8050', 'default_splash_tor')
    test_ail_crawlers()
    test_passed = is_test_ail_crawlers_successful()
    print(test_passed)
    print(get_test_ail_crawlers_message())

View file

@ -81,7 +81,7 @@ function main(splash, args)
html = splash:html(),
png = splash:png{render_all=true},
cookies = splash:get_cookies(),
last_url = splash:url()
last_url = splash:url(),
}
end
"""
@ -174,35 +174,54 @@ class TorSplashCrawler():
def parse(self,response):
#print(response.headers)
#print(response.status)
#print(response.meta)
#print(response.data) # # TODO: handle lua script error
#{'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
#'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
#'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
#'error': 400, 'description': 'Error happened while executing Lua script'}
if response.status == 504:
# no response
#print('504 detected')
pass
# LUA ERROR # # TODO: print/display errors
# LUA ERROR # # TODO: logs errors
elif 'error' in response.data:
if(response.data['error'] == 'network99'):
## splash restart ##
error_retry = request.meta.get('error_retry', 0)
error_retry = response.meta.get('error_retry', 0)
if error_retry < 3:
error_retry += 1
url= request.meta['current_url']
father = request.meta['father']
url = response.data['last_url']
father = response.meta['father']
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10)
if 'cookies' in response.data:
all_cookies = response.data['cookies'] # # TODO: use initial cookie ?????
else:
all_cookies = []
l_cookies = self.build_request_arg(all_cookies)
yield SplashRequest(
url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
cache_args=['lua_source'],
dont_filter=True,
meta={'father': father, 'current_url': url, 'error_retry': error_retry},
args=self.build_request_arg(response.cookiejar)
args=l_cookies
)
else:
if self.requested_mode == 'test':
crawlers.save_test_ail_crawlers_result(False, 'Connection to proxy refused')
print('Connection to proxy refused')
elif response.data['error'] == 'network3':
if self.requested_mode == 'test':
crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
print('HostNotFoundError: the remote host name was not found (invalid hostname)')
else:
if self.requested_mode == 'test':
crawlers.save_test_ail_crawlers_result(False, response.data['error'])
print(response.data['error'])
elif response.status != 200:
@ -213,6 +232,17 @@ class TorSplashCrawler():
#elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
# pass # ignore response
else:
## TEST MODE ##
if self.requested_mode == 'test':
if 'It works!' in response.data['html']:
print(response.data['html'])
#print('success')
crawlers.save_test_ail_crawlers_result(True, 'It works!')
else:
print('TEST ERROR')
crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR')
return
## -- ##
item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
self.save_crawled_item(item_id, response.data['html'])

View file

@ -45,5 +45,9 @@ if __name__ == '__main__':
redis_cache.delete('crawler_request:{}'.format(uuid))
crawler = TorSplashCrawler(splash_url, crawler_options)
crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
try:
crawler = TorSplashCrawler(splash_url, crawler_options)
crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
except Exception as e:
print(e)
print(e, file=sys.stderr)

View file

@ -1,4 +1,5 @@
pymisp
d4-pyclient
thehive4py

View file

@ -504,18 +504,22 @@ def crawler_cookiejar_cookie_json_add_post():
def crawler_splash_setings():
all_proxies = crawlers.get_all_proxies_metadata()
all_splash = crawlers.get_all_splash_crawler_metadata()
nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
splash_manager_url = crawlers.get_splash_manager_url()
api_key = crawlers.get_hidden_splash_api_key()
is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True)
nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
crawler_full_config = Config_DB.get_full_config_by_section('crawler')
is_crawler_working = crawlers.is_test_ail_crawlers_successful()
crawler_error_mess = crawlers.get_test_ail_crawlers_message()
return render_template("settings_splash_crawler.html",
is_manager_connected=is_manager_connected,
splash_manager_url=splash_manager_url, api_key=api_key,
nb_crawlers_to_launch=nb_crawlers_to_launch,
all_splash=all_splash, all_proxies=all_proxies,
nb_crawlers_to_launch=nb_crawlers_to_launch,
is_crawler_working=is_crawler_working,
crawler_error_mess=crawler_error_mess,
crawler_full_config=crawler_full_config)
@crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST'])
@ -555,4 +559,18 @@ def crawler_splash_setings_crawlers_to_lauch():
return render_template("settings_edit_crawlers_to_launch.html",
nb_crawlers_to_launch=nb_crawlers_to_launch)
@crawler_splash.route('/crawler/settings/test_crawler', methods=['GET'])
@login_required
@login_admin
def crawler_splash_setings_test_crawler():
    """Run the crawler test, then redirect to the Splash crawler settings view."""
    crawlers.test_ail_crawlers()
    settings_endpoint = 'crawler_splash.crawler_splash_setings'
    return redirect(url_for(settings_endpoint))
@crawler_splash.route('/crawler/settings/relaunch_crawler', methods=['GET'])
@login_required
@login_admin
def crawler_splash_setings_relaunch_crawler():
    """Relaunch the crawlers, then redirect to the Splash crawler settings view."""
    crawlers.relaunch_crawlers()
    settings_endpoint = 'crawler_splash.crawler_splash_setings'
    return redirect(url_for(settings_endpoint))
## - - ##

View file

@ -90,11 +90,165 @@
</div>
</div>
</div>
</div>
<div {%if not is_manager_connected%}class="hidden"{%endif%}>
<div class="card border-secondary mb-4">
<div class="card-body text-dark">
<h5 class="card-title">All Splash Crawlers:</h5>
<table class="table table-striped">
<thead class="bg-info text-white">
<th>
Splash name
</th>
<th>
Proxy
</th>
<th>
Crawler type
</th>
<th>
Description
</th>
<th></th>
</thead>
<tbody>
{% for splash_name in all_splash %}
<tr>
<td>
{{splash_name}}
</td>
<td>
{{all_splash[splash_name]['proxy']}}
</td>
<td>
{%if all_splash[splash_name]['type']=='tor'%}
<i class="fas fa-user-secret"></i>
{%else%}
<i class="fab fa-html5"></i>
{%endif%}
{{all_splash[splash_name]['type']}}
</td>
<td>
{{all_splash[splash_name]['description']}}
</td>
<td>
<div class="d-flex justify-content-end">
<!-- <button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button> -->
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
<div class="card border-secondary">
<div class="card-body text-dark">
<h5 class="card-title">All Proxies:</h5>
<table class="table table-striped">
<thead class="bg-info text-white">
<th>
Proxy name
</th>
<th>
Host
</th>
<th>
Port
</th>
<th>
Type
</th>
<th>
Crawler Type
</th>
<th>
Description
</th>
<th></th>
</thead>
<tbody>
{% for proxy_name in all_proxies %}
<tr>
<td>
{{proxy_name}}
</td>
<td>
{{all_proxies[proxy_name]['host']}}
</td>
<td>
{{all_proxies[proxy_name]['port']}}
</td>
<td>
{{all_proxies[proxy_name]['type']}}
</td>
<td>
{%if all_proxies[proxy_name]['crawler_type']=='tor'%}
<i class="fas fa-user-secret"></i>
{%else%}
<i class="fab fa-html5"></i>
{%endif%}
{{all_proxies[proxy_name]['crawler_type']}}
</td>
<td>
{{all_proxies[proxy_name]['description']}}
</td>
<td>
<div class="d-flex justify-content-end">
<!-- <button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button> -->
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
<div {%if not is_manager_connected%}class="hidden"{%endif%}>
<div class="card mb-3 mt-1">
<div class="card-header bg-dark text-white">
<span class="badge badge-pill badge-light flex-row-reverse float-right">
{% if is_crawler_working %}
<div style="color:Green;">
<i class="fas fa-check-circle fa-2x"></i>
{{crawler_error_mess}}
</div>
{% else %}
<div style="color:Red;">
<i class="fas fa-times-circle fa-2x"></i>
Error
</div>
{% endif %}
</span>
<h4>Crawlers</h4>
</div>
<div class="card-body">
<div class="card border-secondary mb-4">
<pre class="bg-dark text-white">
----------------------------
- TOR CRAWLER TEST OUTPUT: -
----------------------------
{{crawler_error_mess}}
</pre>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_test_crawler') }}">
<button type="button" class="btn btn-primary">
ReRun Test <i class="fas fa-rocket"></i>
</button>
</a>
<div class="card border-secondary my-4">
<div class="card-body text-dark">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
@ -112,175 +266,60 @@
Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
</button>
</a>
</div>
</div>
<div class="card border-secondary mb-4">
<div class="card-body text-dark">
<h5 class="card-title">All Splash Crawlers:</h5>
<table class="table table-striped">
<thead class="bg-info text-white">
<th>
Splash name
</th>
<th>
Proxy
</th>
<th>
Crawler type
</th>
<th>
Description
</th>
<th></th>
</thead>
<tbody>
{% for splash_name in all_splash %}
<tr>
<td>
{{splash_name}}
</td>
<td>
{{all_splash[splash_name]['proxy']}}
</td>
<td>
{%if all_splash[splash_name]['type']=='tor'%}
<i class="fas fa-user-secret"></i>
{%else%}
<i class="fab fa-html5">
{%endif%}
{{all_splash[splash_name]['type']}}
</td>
<td>
{{all_splash[splash_name]['description']}}
</td>
<td>
<div class="d-flex justify-content-end">
<button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button>
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_relaunch_crawler') }}">
<button type="button" class="btn btn-danger">
ReLaunch Crawlers <i class="fas fa-redo"></i>
</button>
</a>
</div>
</div>
<div class="card border-secondary">
<div class="card-body text-dark">
<h5 class="card-title">All Proxies:</h5>
<table class="table table-striped">
<h5 class="card-title">Crawlers Settings:</h5>
<table class="table table-striped table-hover">
<thead class="bg-info text-white">
<th>
Proxy name
</th>
<th>
Host
</th>
<th>
Port
</th>
<th>
Type
</th>
<th>
Crawler Type
Key
</th>
<th>
Description
</th>
<th>
Value
</th>
<th></th>
</thead>
<tbody>
{% for proxy_name in all_proxies %}
{% for config_field in crawler_full_config %}
<tr>
<td>
{{proxy_name}}
{{config_field}}
</td>
<td>
{{all_proxies[proxy_name]['host']}}
{{crawler_full_config[config_field]['info']}}
</td>
<td>
{{all_proxies[proxy_name]['port']}}
</td>
<td>
{{all_proxies[proxy_name]['type']}}
</td>
<td>
{%if all_proxies[proxy_name]['crawler_type']=='tor'%}
<i class="fas fa-user-secret"></i>
{%else%}
<i class="fab fa-html5">
{%endif%}
{{all_proxies[proxy_name]['crawler_type']}}
</td>
<td>
{{all_proxies[proxy_name]['description']}}
{{crawler_full_config[config_field]['value']}}
</td>
<td>
<div class="d-flex justify-content-end">
<button class="btn btn-outline-dark px-1 py-0">
<!-- <button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button>
</button> -->
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
<div class="card mb-3 mt-1">
<div class="card-header bg-dark text-white">
<h4>Crawlers Settings</h4>
</div>
<div class="card-body">
<table class="table table-striped table-hover">
<thead class="bg-info text-white">
<th>
Key
</th>
<th>
Description
</th>
<th>
Value
</th>
<th></th>
</thead>
<tbody>
{% for config_field in crawler_full_config %}
<tr>
<td>
{{config_field}}
</td>
<td>
{{crawler_full_config[config_field]['info']}}
</td>
<td>
{{crawler_full_config[config_field]['value']}}
</td>
<td>
<div class="d-flex justify-content-end">
<button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button>
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</div>