#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""Launch the configured Splash crawlers and follow the Splash manager.

At start-up the script pings the Splash manager, reloads the Splash/proxy
lists and launches the crawlers declared in the ``Splash_Crawlers`` section
of ``crawlers.cfg``.  It then polls the manager periodically and relaunches
the crawlers whenever the manager reports a session UUID that differs from
the last one for which the lists were reloaded successfully.
"""

import os
import sys
import time

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import ConfigLoader
import crawlers

config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None

config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
# Sequence of (splash_name, nb_crawlers) pairs read from 'Splash_Crawlers'.
crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
config_loader = None

import screen

# Seconds between two Splash manager connectivity/session checks.
MANAGER_CHECK_INTERVAL = 60
# Seconds to sleep between two iterations of the main loop when idle.
IDLE_SLEEP = 5

# # TODO: launch me in core screen
# # TODO: check if already launched in tor screen


def launch_crawlers():
    """Launch the configured number of crawlers for each Splash name.

    For every ``(splash_name, nb_crawlers)`` entry in ``crawlers_to_launch``
    the available Splash URLs are fetched and one crawler is launched per
    URL, up to ``nb_crawlers``.  When fewer URLs are available than
    requested, a message is printed and only the available ones are used.
    """
    for crawler_splash in crawlers_to_launch:
        splash_name = crawler_splash[0]
        nb_crawlers = int(crawler_splash[1])

        all_crawler_urls = crawlers.get_splash_all_url(splash_name, r_list=True)
        nb_available = len(all_crawler_urls)
        if nb_crawlers > nb_available:
            print("Error, can't launch all Splash Dockers")
            print('Please launch {} additional {} Dockers'.format(
                nb_crawlers - nb_available, splash_name))
            nb_crawlers = nb_available

        # NOTE(review): this reset runs once per configured Splash entry; if
        # it also clears the status of crawlers launched by a previous
        # iteration it may need to be hoisted out of the loop -- confirm
        # against the crawlers module.
        crawlers.reset_all_spash_crawler_status()

        # Clamp at 0 so a negative configured count launches nothing
        # (a bare negative slice bound would select URLs from the end).
        for splash_url in all_crawler_urls[:max(nb_crawlers, 0)]:
            print(splash_url)
            crawlers.launch_ail_splash_crawler(
                splash_url, script_options=str(splash_url))


def main():
    """Start the crawlers, then poll the Splash manager forever.

    ``session_uuid`` holds the manager session for which the Splash/proxy
    lists were last reloaded successfully (``None`` if that never happened),
    so a failed reload is retried at the next periodic check.
    """
    session_uuid = None

    is_manager_connected = crawlers.ping_splash_manager()
    if not is_manager_connected:
        print("Error, Can't connect to Splash manager")
    else:
        print('Splash manager connected')
        current_session_uuid = crawlers.get_splash_manager_session_uuid()
        is_manager_connected = crawlers.reload_splash_and_proxies_list()
        print(is_manager_connected)
        if is_manager_connected:
            launch_crawlers()
            # Only remember the session once the reload succeeded; otherwise
            # the polling loop would see an unchanged session and never retry.
            session_uuid = current_session_uuid

    # Monotonic clock: the interval is immune to wall-clock adjustments.
    last_check = time.monotonic()

    while True:
        if time.monotonic() - last_check <= MANAGER_CHECK_INTERVAL:
            time.sleep(IDLE_SLEEP)
            continue

        # # TODO: avoid multiple ping
        # check if manager is connected
        is_manager_connected = crawlers.is_splash_manager_connected()
        current_session_uuid = crawlers.get_splash_manager_session_uuid()

        # reload proxy and splash list when the manager session changed
        if current_session_uuid and current_session_uuid != session_uuid:
            is_manager_connected = crawlers.reload_splash_and_proxies_list()
            if is_manager_connected:
                print('reload proxies and splash list')
                launch_crawlers()
                session_uuid = current_session_uuid

        if not is_manager_connected:
            print("Error, Can't connect to Splash manager")
        last_check = time.monotonic()

        # # TODO: periodically refresh the splash and proxy lists
        # # TODO: kill/launch new crawler / crawler manager,
        # #       check if already launched
        # # TODO: catch reload request


# # TODO: handle multiple splash_manager
if __name__ == '__main__':
    main()