mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
chg: [crawler] add test + relaunch crawlers + major fixes
This commit is contained in:
parent
8754350d39
commit
c0be210d2c
7 changed files with 379 additions and 152 deletions
|
@ -29,6 +29,7 @@ if __name__ == '__main__':
|
|||
is_manager_connected = crawlers.reload_splash_and_proxies_list()
|
||||
print(is_manager_connected)
|
||||
if is_manager_connected:
|
||||
if crawlers.test_ail_crawlers():
|
||||
crawlers.relaunch_crawlers()
|
||||
last_check = int(time.time())
|
||||
|
||||
|
@ -45,6 +46,7 @@ if __name__ == '__main__':
|
|||
is_manager_connected = crawlers.reload_splash_and_proxies_list()
|
||||
if is_manager_connected:
|
||||
print('reload proxies and splash list')
|
||||
if crawlers.test_ail_crawlers():
|
||||
crawlers.relaunch_crawlers()
|
||||
session_uuid = current_session_uuid
|
||||
if not is_manager_connected:
|
||||
|
|
|
@ -16,6 +16,8 @@ import sys
|
|||
import time
|
||||
import uuid
|
||||
|
||||
import subprocess
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
@ -25,6 +27,9 @@ from pyfaup.faup import Faup
|
|||
import requests
|
||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
|
||||
import git_status
|
||||
|
||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||
import ConfigLoader
|
||||
|
||||
|
@ -429,6 +434,19 @@ def get_splash_crawler_status(spash_url):
|
|||
status=False
|
||||
return {'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status, 'type': crawler_type}
|
||||
|
||||
def set_current_crawler_status(splash_url, status, started_time=False, crawled_domain=None, crawler_type=None):
    """Update the cached metadata of the crawler bound to *splash_url*.

    All fields are written to the ``metadata_crawler:<splash_url>`` hash in
    ``r_cache``.

    :param splash_url: Splash address used to build the hash key.
    :param status: status label to store (e.g. 'Waiting', 'Error', ...).
    :param started_time: when truthy, store the current local time as
        ``started_time`` (format ``%Y/%m/%d - %H:%M.%S``).
    :param crawled_domain: when truthy, stored as ``crawling_domain``.
    :param crawler_type: when truthy, stored as ``type``.
    """
    # TODO: get crawler type if None
    metadata_key = 'metadata_crawler:{}'.format(splash_url)

    r_cache.hset(metadata_key, 'status', status)

    if started_time:
        launch_time = datetime.now().strftime("%Y/%m/%d - %H:%M.%S")
        r_cache.hset(metadata_key, 'started_time', launch_time)

    # Optional fields are only written when a truthy value was supplied.
    optional_fields = (('type', crawler_type), ('crawling_domain', crawled_domain))
    for field_name, field_value in optional_fields:
        if field_value:
            r_cache.hset(metadata_key, field_name, field_value)
|
||||
|
||||
#r_cache.sadd('all_splash_crawlers', splash_url) # # TODO: add me in fct: create_ail_crawler
|
||||
|
||||
def get_stats_last_crawled_domains(crawler_types, date):
|
||||
statDomains = {}
|
||||
for crawler_type in crawler_types:
|
||||
|
@ -1014,6 +1032,20 @@ def get_all_splash_by_proxy(proxy_name, r_list=False):
|
|||
else:
|
||||
return []
|
||||
|
||||
def get_all_splash_name_by_crawler_type(crawler_type):
    """Return the names of all Splash whose crawler type equals *crawler_type*.

    :param crawler_type: value compared against ``get_splash_crawler_type(name)``.
    :return: list of matching Splash names, in ``get_all_splash()`` order.
    """
    return [name for name in get_all_splash()
            if get_splash_crawler_type(name) == crawler_type]
|
||||
|
||||
def get_all_splash_url_by_crawler_type(crawler_type):
    """Return every Splash URL belonging to a Splash of the given crawler type.

    :param crawler_type: crawler type used to select the Splash names.
    :return: flat list of URLs, grouped by Splash name in lookup order.
    """
    urls = []
    for name in get_all_splash_name_by_crawler_type(crawler_type):
        urls.extend(get_splash_all_url(name, r_list=True))
    return urls
|
||||
|
||||
def delete_all_splash_containers():
|
||||
for splash_name in get_all_splash():
|
||||
delete_splash_container(splash_name)
|
||||
|
@ -1140,7 +1172,106 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
|
|||
screen.create_screen(screen_name)
|
||||
screen.launch_uniq_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options, kill_previous_windows=True)
|
||||
|
||||
def is_test_ail_crawlers_successful():
    """Return True when the last stored crawler test result is a success.

    Reads the ``success`` field of the ``crawler:tor:test`` hash and compares
    it with the string ``'True'``.
    """
    # NOTE(review): comparing with a str presumably relies on r_serv_onion
    # decoding responses - confirm against the connection settings.
    stored_result = r_serv_onion.hget('crawler:tor:test', 'success')
    return stored_result == 'True'
|
||||
|
||||
def get_test_ail_crawlers_message():
    """Return the message stored with the last crawler test result.

    The value is the ``message`` field of the ``crawler:tor:test`` hash.
    """
    test_message = r_serv_onion.hget('crawler:tor:test', 'message')
    return test_message
|
||||
|
||||
def save_test_ail_crawlers_result(test_success, message):
    """Persist the outcome of a crawler test in the ``crawler:tor:test`` hash.

    :param test_success: truthy when the test succeeded; stored as the
        string ``'True'`` or ``'False'``.
    :param message: human-readable result or error message.
    """
    # Store an explicit string: redis-py >= 3.0 refuses bool values
    # (DataError), and is_test_ail_crawlers_successful() compares the stored
    # field with the string 'True'.
    success_flag = str(bool(test_success))
    r_serv_onion.hset('crawler:tor:test', 'success', success_flag)
    r_serv_onion.hset('crawler:tor:test', 'message', message)
|
||||
|
||||
def _fail_crawler_test(error_message):
    """Print and persist a failed crawler test result; always return False."""
    print(error_message)
    save_test_ail_crawlers_result(False, error_message)
    return False


def test_ail_crawlers():
    """Run an end-to-end test crawl through the first available Tor Splash.

    Steps:
      1. check that the AIL Splash Manager answers to a ping;
      2. pick the first Splash URL of crawler type 'tor';
      3. check that this Splash answers over HTTP;
      4. store a test crawl request in ``r_cache`` and run ``tor_crawler.py``
         on it in a subprocess.

    Every failure is saved with ``save_test_ail_crawlers_result(False, ...)``.

    :return: True when the crawler process exited with code 0, wrote nothing
        on stderr and did not report 'Connection to proxy refused';
        False otherwise.
    """
    # # TODO: test regular domain
    if not ping_splash_manager():
        manager_url = get_splash_manager_url()
        return _fail_crawler_test(f'Error: Can\'t connect to AIL Splash Manager, http://{manager_url}')

    tor_splash_urls = get_all_splash_url_by_crawler_type('tor')
    if not tor_splash_urls:
        return _fail_crawler_test('Error: No Tor Splash Launched')
    splash_url = tor_splash_urls[0]

    commit_id = git_status.get_last_commit_id_from_local()
    crawler_options = {'html': True,
                       'har': False,
                       'png': False,
                       'depth_limit': 0,
                       'closespider_pagecount': 100,
                       'cookiejar_uuid': None,
                       'user_agent': commit_id + '-AIL SPLASH CRAWLER'}
    now = datetime.now()
    date = {'date_day': now.strftime("%Y%m%d"),
            'date_month': now.strftime("%Y%m"),
            'epoch': int(time.time())}
    crawler_config = {'splash_url': f'http://{splash_url}',
                      'service_type': 'onion',
                      'url': 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion',
                      'domain': 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion',
                      'port': 80,
                      'original_item': None,
                      'item': None,
                      'crawler_options': crawler_options,
                      'date': date,
                      'requested': 'test'}

    ## CHECK IF SPLASH AVAILABLE ##
    try:
        requests.get(f'http://{splash_url}', timeout=30.0)
    except requests.exceptions.RequestException:
        return _fail_crawler_test(f'Error: Can\'t connect to Splash Docker, http://{splash_url}')
    ## -- ##

    ## LAUNCH CRAWLER, TEST MODE ##
    set_current_crawler_status(splash_url, 'CRAWLER TEST', started_time=True, crawled_domain='TEST DOMAIN', crawler_type='onion')
    UUID = str(uuid.uuid4())
    r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))

    tor_crawler_script = os.path.join(os.environ['AIL_BIN'], 'torcrawler/tor_crawler.py')
    process = subprocess.Popen(["python", tor_crawler_script, UUID],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting: polling with undrained
    # pipes can deadlock once the child fills the OS pipe buffer.
    raw_stdout, raw_stderr = process.communicate()
    output = raw_stdout.decode(errors='replace')
    stderr = raw_stderr.decode(errors='replace')

    if process.returncode != 0:
        # Crawler process crashed: report both streams as a failure.
        set_current_crawler_status(splash_url, 'Error')
        return _fail_crawler_test(f'-stderr-\n{stderr}\n-stdout-\n{output}')

    # error: splash:Connection to proxy refused
    if 'Connection to proxy refused' in output:
        print('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url))
        save_test_ail_crawlers_result(False, 'SPASH, PROXY DOWN OR BAD CONFIGURATION')
        set_current_crawler_status(splash_url, 'Error')
        return False

    # Scrapy-Splash ERRORS
    if stderr:
        print(f'stderr: {stderr}')
        save_test_ail_crawlers_result(False, f'Error: {stderr}')
        set_current_crawler_status(splash_url, 'Error')
        return False

    set_current_crawler_status(splash_url, 'Waiting')
    return True
|
||||
## -- ##
|
||||
|
||||
#### ---- ####
|
||||
|
@ -1151,5 +1282,7 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
|
|||
|
||||
if __name__ == '__main__':
    # Manual check: query the Splash Manager, run the crawler test, then
    # display the stored test result and its message.
    get_splash_manager_version()
    test_ail_crawlers()
    print(is_test_ail_crawlers_successful())
    print(get_test_ail_crawlers_message())
|
||||
|
|
|
@ -81,7 +81,7 @@ function main(splash, args)
|
|||
html = splash:html(),
|
||||
png = splash:png{render_all=true},
|
||||
cookies = splash:get_cookies(),
|
||||
last_url = splash:url()
|
||||
last_url = splash:url(),
|
||||
}
|
||||
end
|
||||
"""
|
||||
|
@ -174,35 +174,54 @@ class TorSplashCrawler():
|
|||
def parse(self,response):
|
||||
#print(response.headers)
|
||||
#print(response.status)
|
||||
#print(response.meta)
|
||||
#print(response.data) # # TODO: handle lua script error
|
||||
#{'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
|
||||
#'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
|
||||
#'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
|
||||
#'error': 400, 'description': 'Error happened while executing Lua script'}
|
||||
if response.status == 504:
|
||||
# no response
|
||||
#print('504 detected')
|
||||
pass
|
||||
|
||||
# LUA ERROR # # TODO: print/display errors
|
||||
# LUA ERROR # # TODO: logs errors
|
||||
elif 'error' in response.data:
|
||||
if(response.data['error'] == 'network99'):
|
||||
## splash restart ##
|
||||
error_retry = request.meta.get('error_retry', 0)
|
||||
error_retry = response.meta.get('error_retry', 0)
|
||||
if error_retry < 3:
|
||||
error_retry += 1
|
||||
url= request.meta['current_url']
|
||||
father = request.meta['father']
|
||||
url = response.data['last_url']
|
||||
father = response.meta['father']
|
||||
|
||||
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
|
||||
time.sleep(10)
|
||||
if 'cookies' in response.data:
|
||||
all_cookies = response.data['cookies'] # # TODO: use initial cookie ?????
|
||||
else:
|
||||
all_cookies = []
|
||||
l_cookies = self.build_request_arg(all_cookies)
|
||||
yield SplashRequest(
|
||||
url,
|
||||
self.parse,
|
||||
errback=self.errback_catcher,
|
||||
endpoint='execute',
|
||||
cache_args=['lua_source'],
|
||||
dont_filter=True,
|
||||
meta={'father': father, 'current_url': url, 'error_retry': error_retry},
|
||||
args=self.build_request_arg(response.cookiejar)
|
||||
args=l_cookies
|
||||
)
|
||||
else:
|
||||
if self.requested_mode == 'test':
|
||||
crawlers.save_test_ail_crawlers_result(False, 'Connection to proxy refused')
|
||||
print('Connection to proxy refused')
|
||||
elif response.data['error'] == 'network3':
|
||||
if self.requested_mode == 'test':
|
||||
crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
|
||||
print('HostNotFoundError: the remote host name was not found (invalid hostname)')
|
||||
else:
|
||||
if self.requested_mode == 'test':
|
||||
crawlers.save_test_ail_crawlers_result(False, response.data['error'])
|
||||
print(response.data['error'])
|
||||
|
||||
elif response.status != 200:
|
||||
|
@ -213,6 +232,17 @@ class TorSplashCrawler():
|
|||
#elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
|
||||
# pass # ignore response
|
||||
else:
|
||||
## TEST MODE ##
|
||||
if self.requested_mode == 'test':
|
||||
if 'It works!' in response.data['html']:
|
||||
print(response.data['html'])
|
||||
#print('success')
|
||||
crawlers.save_test_ail_crawlers_result(True, 'It works!')
|
||||
else:
|
||||
print('TEST ERROR')
|
||||
crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR')
|
||||
return
|
||||
## -- ##
|
||||
|
||||
item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
|
||||
self.save_crawled_item(item_id, response.data['html'])
|
||||
|
|
|
@ -45,5 +45,9 @@ if __name__ == '__main__':
|
|||
|
||||
redis_cache.delete('crawler_request:{}'.format(uuid))
|
||||
|
||||
try:
|
||||
crawler = TorSplashCrawler(splash_url, crawler_options)
|
||||
crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print(e, file=sys.stderr)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
pymisp
|
||||
d4-pyclient
|
||||
|
||||
thehive4py
|
||||
|
||||
|
|
|
@ -504,18 +504,22 @@ def crawler_cookiejar_cookie_json_add_post():
|
|||
def crawler_splash_setings():
    """Render the Splash crawler settings page.

    Collects proxy and Splash metadata, the Splash Manager connection state,
    the number of crawlers to launch, the 'crawler' configuration section and
    the last crawler test result, then passes them to
    ``settings_splash_crawler.html``.
    """
    all_proxies = crawlers.get_all_proxies_metadata()
    all_splash = crawlers.get_all_splash_crawler_metadata()

    splash_manager_url = crawlers.get_splash_manager_url()
    api_key = crawlers.get_hidden_splash_api_key()
    is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True)

    # Fetched once and passed once: a repeated keyword argument in the
    # render_template() call is a SyntaxError.
    nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
    crawler_full_config = Config_DB.get_full_config_by_section('crawler')
    is_crawler_working = crawlers.is_test_ail_crawlers_successful()
    crawler_error_mess = crawlers.get_test_ail_crawlers_message()

    return render_template("settings_splash_crawler.html",
                           is_manager_connected=is_manager_connected,
                           splash_manager_url=splash_manager_url, api_key=api_key,
                           all_splash=all_splash, all_proxies=all_proxies,
                           nb_crawlers_to_launch=nb_crawlers_to_launch,
                           is_crawler_working=is_crawler_working,
                           crawler_error_mess=crawler_error_mess,
                           crawler_full_config=crawler_full_config)
|
||||
|
||||
@crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST'])
|
||||
|
@ -555,4 +559,18 @@ def crawler_splash_setings_crawlers_to_lauch():
|
|||
return render_template("settings_edit_crawlers_to_launch.html",
|
||||
nb_crawlers_to_launch=nb_crawlers_to_launch)
|
||||
|
||||
@crawler_splash.route('/crawler/settings/test_crawler', methods=['GET'])
@login_required
@login_admin
def crawler_splash_setings_test_crawler():
    """Run the crawler test, then redirect to the Splash settings page."""
    # NOTE(review): this GET route triggers an action (launches a test crawl);
    # consider whether it should be a POST - confirm with the UI.
    crawlers.test_ail_crawlers()
    settings_page = url_for('crawler_splash.crawler_splash_setings')
    return redirect(settings_page)
|
||||
|
||||
@crawler_splash.route('/crawler/settings/relaunch_crawler', methods=['GET'])
@login_required
@login_admin
def crawler_splash_setings_relaunch_crawler():
    """Relaunch the crawlers, then redirect to the Splash settings page."""
    # NOTE(review): this GET route triggers an action (relaunches crawlers);
    # consider whether it should be a POST - confirm with the UI.
    crawlers.relaunch_crawlers()
    settings_page = url_for('crawler_splash.crawler_splash_setings')
    return redirect(settings_page)
|
||||
|
||||
## - - ##
|
||||
|
|
|
@ -94,27 +94,6 @@
|
|||
|
||||
<div {%if not is_manager_connected%}class="hidden"{%endif%}>
|
||||
|
||||
<div class="card border-secondary mb-4">
|
||||
<div class="card-body text-dark">
|
||||
<h5 class="card-title">Number of Crawlers to Launch:</h5>
|
||||
<table class="table table-sm">
|
||||
<tbody>
|
||||
{%for crawler in nb_crawlers_to_launch%}
|
||||
<tr>
|
||||
<td>{{crawler}}</td>
|
||||
<td>{{nb_crawlers_to_launch[crawler]}}</td>
|
||||
</tr>
|
||||
{%endfor%}
|
||||
</tbody>
|
||||
</table>
|
||||
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}">
|
||||
<button type="button" class="btn btn-info">
|
||||
Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
|
||||
</button>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card border-secondary mb-4">
|
||||
<div class="card-body text-dark">
|
||||
<h5 class="card-title">All Splash Crawlers:</h5>
|
||||
|
@ -156,9 +135,9 @@
|
|||
</td>
|
||||
<td>
|
||||
<div class="d-flex justify-content-end">
|
||||
<button class="btn btn-outline-dark px-1 py-0">
|
||||
<!-- <button class="btn btn-outline-dark px-1 py-0">
|
||||
<i class="fas fa-pencil-alt"></i>
|
||||
</button>
|
||||
</button> -->
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
|
@ -221,9 +200,9 @@
|
|||
</td>
|
||||
<td>
|
||||
<div class="d-flex justify-content-end">
|
||||
<button class="btn btn-outline-dark px-1 py-0">
|
||||
<!-- <button class="btn btn-outline-dark px-1 py-0">
|
||||
<i class="fas fa-pencil-alt"></i>
|
||||
</button>
|
||||
</button> -->
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
|
@ -233,15 +212,72 @@
|
|||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card mb-3 mt-1">
|
||||
<div class="card-header bg-dark text-white">
|
||||
<h4>Crawlers Settings</h4>
|
||||
<span class="badge badge-pill badge-light flex-row-reverse float-right">
|
||||
{% if is_crawler_working %}
|
||||
<div style="color:Green;">
|
||||
<i class="fas fa-check-circle fa-2x"></i>
|
||||
{{crawler_error_mess}}
|
||||
</div>
|
||||
{% else %}
|
||||
<div style="color:Red;">
|
||||
<i class="fas fa-times-circle fa-2x"></i>
|
||||
Error
|
||||
</div>
|
||||
{% endif %}
|
||||
</span>
|
||||
<h4>Crawlers</h4>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
|
||||
<pre class="bg-dark text-white">
|
||||
----------------------------
|
||||
- TOR CRAWLER TEST OUTPUT: -
|
||||
----------------------------
|
||||
|
||||
{{crawler_error_mess}}
|
||||
</pre>
|
||||
<a href="{{ url_for('crawler_splash.crawler_splash_setings_test_crawler') }}">
|
||||
<button type="button" class="btn btn-primary">
|
||||
ReRun Test <i class="fas fa-rocket"></i>
|
||||
</button>
|
||||
</a>
|
||||
|
||||
<div class="card border-secondary my-4">
|
||||
<div class="card-body text-dark">
|
||||
<h5 class="card-title">Number of Crawlers to Launch:</h5>
|
||||
<table class="table table-sm">
|
||||
<tbody>
|
||||
{%for crawler in nb_crawlers_to_launch%}
|
||||
<tr>
|
||||
<td>{{crawler}}</td>
|
||||
<td>{{nb_crawlers_to_launch[crawler]}}</td>
|
||||
</tr>
|
||||
{%endfor%}
|
||||
</tbody>
|
||||
</table>
|
||||
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}">
|
||||
<button type="button" class="btn btn-info">
|
||||
Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
|
||||
</button>
|
||||
</a>
|
||||
<a href="{{ url_for('crawler_splash.crawler_splash_setings_relaunch_crawler') }}">
|
||||
<button type="button" class="btn btn-danger">
|
||||
ReLaunch Crawlers <i class="fas fa-redo"></i>
|
||||
</button>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card border-secondary">
|
||||
<div class="card-body text-dark">
|
||||
<h5 class="card-title">Crawlers Settings:</h5>
|
||||
|
||||
<table class="table table-striped table-hover">
|
||||
<thead class="bg-info text-white">
|
||||
<th>
|
||||
|
@ -269,9 +305,9 @@
|
|||
</td>
|
||||
<td>
|
||||
<div class="d-flex justify-content-end">
|
||||
<button class="btn btn-outline-dark px-1 py-0">
|
||||
<!-- <button class="btn btn-outline-dark px-1 py-0">
|
||||
<i class="fas fa-pencil-alt"></i>
|
||||
</button>
|
||||
</button> -->
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
|
@ -284,6 +320,9 @@
|
|||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue