mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-27 00:07:16 +00:00
chg: [crawler proxy] interact with the splash manager API
(get list of proxies + splash containers + launch them)
This commit is contained in:
parent
5d7488f74a
commit
3fd200be43
2 changed files with 192 additions and 4 deletions
|
@ -15,8 +15,8 @@ import configparser
|
||||||
|
|
||||||
# Get Config file
|
# Get Config file
|
||||||
config_dir = os.path.join(os.environ['AIL_HOME'], 'configs')
|
config_dir = os.path.join(os.environ['AIL_HOME'], 'configs')
|
||||||
config_file = os.path.join(config_dir, 'core.cfg')
|
default_config_file = os.path.join(config_dir, 'core.cfg')
|
||||||
if not os.path.exists(config_file):
|
if not os.path.exists(default_config_file):
|
||||||
raise Exception('Unable to find the configuration file. \
|
raise Exception('Unable to find the configuration file. \
|
||||||
Did you set environment variables? \
|
Did you set environment variables? \
|
||||||
Or activate the virtualenv.')
|
Or activate the virtualenv.')
|
||||||
|
@ -28,9 +28,12 @@ if not os.path.exists(config_file):
|
||||||
class ConfigLoader(object):
|
class ConfigLoader(object):
|
||||||
"""docstring for Config_Loader."""
|
"""docstring for Config_Loader."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, config_file=None):
|
||||||
self.cfg = configparser.ConfigParser()
|
self.cfg = configparser.ConfigParser()
|
||||||
self.cfg.read(config_file)
|
if config_file:
|
||||||
|
self.cfg.read(os.path.join(config_dir, config_file))
|
||||||
|
else:
|
||||||
|
self.cfg.read(default_config_file)
|
||||||
|
|
||||||
def get_redis_conn(self, redis_name, decode_responses=True): ## TODO: verify redis name
|
def get_redis_conn(self, redis_name, decode_responses=True): ## TODO: verify redis name
|
||||||
return redis.StrictRedis( host=self.cfg.get(redis_name, "host"),
|
return redis.StrictRedis( host=self.cfg.get(redis_name, "host"),
|
||||||
|
|
|
@ -20,9 +20,15 @@ from urllib.parse import urlparse
|
||||||
|
|
||||||
from pyfaup.faup import Faup
|
from pyfaup.faup import Faup
|
||||||
|
|
||||||
|
# interact with splash_crawler API
|
||||||
|
import requests
|
||||||
|
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||||
import ConfigLoader
|
import ConfigLoader
|
||||||
|
|
||||||
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'core/'))
|
||||||
|
import screen
|
||||||
|
|
||||||
config_loader = ConfigLoader.ConfigLoader()
|
config_loader = ConfigLoader.ConfigLoader()
|
||||||
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
||||||
|
@ -30,6 +36,12 @@ r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
|
||||||
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
r_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||||
config_loader = None
|
config_loader = None
|
||||||
|
|
||||||
|
# Load the crawler configuration: read the Splash Manager base URL and API key
# from 'crawlers.cfg' once, at import time, into module-level globals.
# The loader reference is reset to None afterwards, as is done for the core
# configuration loader earlier in this module.
|
||||||
|
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
|
||||||
|
splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
|
||||||
|
splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
|
||||||
|
config_loader = None
|
||||||
|
|
||||||
faup = Faup()
|
faup = Faup()
|
||||||
|
|
||||||
def generate_uuid():
|
def generate_uuid():
|
||||||
|
@ -530,3 +542,176 @@ def save_har(har_dir, item_id, har_content):
|
||||||
filename = os.path.join(har_dir, item_id + '.json')
|
filename = os.path.join(har_dir, item_id + '.json')
|
||||||
with open(filename, 'w') as f:
|
with open(filename, 'w') as f:
|
||||||
f.write(json.dumps(har_content))
|
f.write(json.dumps(har_content))
|
||||||
|
|
||||||
|
|
||||||
|
#### SPLASH MANAGER ####
|
||||||
|
def get_splash_manager_url(reload=False): # TODO: add config reload
    """Return the Splash Manager base URL held in the module-level global.

    `reload` is accepted for future use but is currently ignored: the value
    loaded when the module was imported is always returned.
    """
    return splash_manager_url
|
||||||
|
|
||||||
|
def get_splash_api_key(reload=False): # TODO: add config reload
    """Return the Splash Manager API key held in the module-level global.

    `reload` is accepted for future use but is currently ignored: the value
    loaded when the module was imported is always returned.
    """
    return splash_api_key
|
||||||
|
|
||||||
|
def get_splash_url_from_manager_url(splash_manager_url, splash_port):
    """Build the plain-HTTP URL of a Splash instance hosted next to the manager.

    The host is taken from `splash_manager_url` and combined with
    `splash_port`; the manager's own scheme, port, credentials and path are
    discarded.

    Args:
        splash_manager_url: URL of the Splash Manager, e.g. 'https://host:7002'.
        splash_port: port of the Splash instance (int or str).

    Returns:
        A string of the form 'http://<host>:<splash_port>'.
    """
    # `hostname` (unlike splitting netloc on ':') copes with IPv6 literals and
    # with 'user:password@host' netlocs. It is None when the URL has no netloc.
    # Note that `hostname` is always returned lower-cased.
    host = urlparse(splash_manager_url).hostname or ''
    if ':' in host:
        # IPv6 literals must be bracketed again before a port is appended
        host = '[{}]'.format(host)
    return 'http://{}:{}'.format(host, splash_port)
|
||||||
|
|
||||||
|
## API ##
|
||||||
|
def ping_splash_manager():
    """Check that the Splash Manager API answers and accepts our API key.

    Returns:
        True when GET '<manager url>/api/v1/ping' answers with HTTP 200.
        False when it answers with any other status (the response body is
        printed) or when the request could not be completed at all.
    """
    try:
        # verify=False: TLS certificate verification is disabled for this call
        req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()),
                           headers={"Authorization": get_splash_api_key()},
                           verify=False)
    except requests.exceptions.RequestException as e:
        # An unreachable manager is a failed ping, not a crash of the caller
        print(e)
        return False

    if req.status_code == 200:
        return True

    try:
        print(req.json())
    except ValueError:
        # Error responses are not guaranteed to carry a JSON body
        print(req.text)
    return False
|
||||||
|
|
||||||
|
def get_all_splash_manager_containers_name():
    """Fetch the description of all Splash containers from the Splash Manager.

    Returns:
        The decoded JSON body of GET '<manager url>/api/v1/get/splash/name/all'
        when the manager answers with HTTP 200; None otherwise (the error body
        is printed).
    """
    # verify=False: TLS certificate verification is disabled for this call
    req = requests.get('{}/api/v1/get/splash/name/all'.format(get_splash_manager_url()),
                       headers={"Authorization": get_splash_api_key()},
                       verify=False)
    if req.status_code == 200:
        return req.json()

    try:
        print(req.json())
    except ValueError:
        # Error responses are not guaranteed to carry a JSON body
        print(req.text)
    return None
|
||||||
|
|
||||||
|
def get_all_splash_manager_proxies():
    """Fetch the description of all proxies from the Splash Manager.

    Returns:
        The decoded JSON body of GET '<manager url>/api/v1/get/proxies/all'
        when the manager answers with HTTP 200; None otherwise (the error body
        is printed).
    """
    # verify=False: TLS certificate verification is disabled for this call
    req = requests.get('{}/api/v1/get/proxies/all'.format(get_splash_manager_url()),
                       headers={"Authorization": get_splash_api_key()},
                       verify=False)
    if req.status_code == 200:
        return req.json()

    try:
        print(req.json())
    except ValueError:
        # Error responses are not guaranteed to carry a JSON body
        print(req.text)
    return None
|
||||||
|
## -- ##
|
||||||
|
|
||||||
|
## SPLASH ##
|
||||||
|
def get_all_splash(r_list=False):
    """Return the names stored in the 'all_splash' set.

    An empty list is returned when the set is empty or missing. Otherwise the
    members are returned as a list when `r_list` is true, else as the set
    handed back by the Redis client.
    """
    members = r_serv_onion.smembers('all_splash')
    if not members:
        return []
    return list(members) if r_list else members
|
||||||
|
|
||||||
|
def get_splash_proxy(splash_name):
    """Return the 'proxy' field of the 'splash:metadata:<splash_name>' hash."""
    metadata_key = 'splash:metadata:{}'.format(splash_name)
    return r_serv_onion.hget(metadata_key, 'proxy')
|
||||||
|
|
||||||
|
def get_splash_all_url(splash_name, r_list=False):
    """Return the URLs stored in the 'splash:url:<splash_name>' set.

    An empty list is returned when the set is empty or missing. Otherwise the
    members are returned as a list when `r_list` is true, else as the set
    handed back by the Redis client.
    """
    urls = r_serv_onion.smembers('splash:url:{}'.format(splash_name))
    if not urls:
        return []
    return list(urls) if r_list else urls
|
||||||
|
|
||||||
|
def get_splash_name_by_url(splash_url):
    """Return the value stored under 'splash:map:url:name:<splash_url>'."""
    map_key = 'splash:map:url:name:{}'.format(splash_url)
    return r_serv_onion.get(map_key)
|
||||||
|
|
||||||
|
def get_splash_crawler_type(splash_name):
    """Return the 'crawler_type' field of the 'splash:metadata:<splash_name>' hash."""
    metadata_key = 'splash:metadata:{}'.format(splash_name)
    return r_serv_onion.hget(metadata_key, 'crawler_type')
|
||||||
|
|
||||||
|
def get_all_splash_by_proxy(proxy_name, r_list=False):
    """Return the names stored in the 'proxy:splash:<proxy_name>' set.

    Args:
        proxy_name: name of the proxy.
        r_list: when true, return the members as a list instead of the set
            handed back by the Redis client. This parameter was previously
            missing although the body referenced it, which raised a NameError
            whenever the set was non-empty.

    Returns:
        An empty list when the set is empty or missing, otherwise the members
        (list or set, depending on `r_list`).
    """
    res = r_serv_onion.smembers('proxy:splash:{}'.format(proxy_name))
    if not res:
        return []
    return list(res) if r_list else res
|
||||||
|
|
||||||
|
def delete_all_splash_containers():
    """Call delete_splash_container() for every name returned by get_all_splash()."""
    for name in get_all_splash():
        delete_splash_container(name)
|
||||||
|
|
||||||
|
def delete_splash_container(splash_name):
    """Remove every onion-DB entry recorded for one Splash container.

    Cleans up, in order: the container's membership in its proxy's
    'proxy:splash:<proxy>' set, its 'splash:metadata:<name>' hash, each
    'splash:map:url:name:<url>' key together with the matching member of
    'splash:url:<name>', and finally its membership in 'all_splash'.
    """
    # The proxy name lives in the metadata hash: read it before deleting the hash
    proxy_name = get_splash_proxy(splash_name)
    r_serv_onion.srem('proxy:splash:{}'.format(proxy_name), splash_name)
    r_serv_onion.delete('splash:metadata:{}'.format(splash_name))

    for splash_url in get_splash_all_url(splash_name):
        # DEL takes key names only: the container name must not be passed here,
        # otherwise an unrelated key called <splash_name> would be deleted too.
        r_serv_onion.delete('splash:map:url:name:{}'.format(splash_url))
        r_serv_onion.srem('splash:url:{}'.format(splash_name), splash_url)
    r_serv_onion.srem('all_splash', splash_name)
|
||||||
|
## -- ##
|
||||||
|
|
||||||
|
## PROXY ##
|
||||||
|
def get_all_proxies(r_list=False):
    """Return the names stored in the 'all_proxy' set, always as a list.

    `r_list` is accepted but has no effect: the result is a list whether or
    not the set has members.
    """
    proxies = r_serv_onion.smembers('all_proxy')
    return list(proxies) if proxies else []
|
||||||
|
|
||||||
|
def delete_all_proxies():
    """Call delete_proxy() for every name returned by get_all_proxies()."""
    for name in get_all_proxies():
        delete_proxy(name)
|
||||||
|
|
||||||
|
def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy)
    """Remove a proxy's metadata hash and its membership in 'all_proxy'.

    A message is printed when Splash containers are still attached to the
    proxy, but the deletion is carried out regardless.
    """
    if get_all_splash_by_proxy(proxy_name):
        print('error, a splash container is using this proxy')
    metadata_key = 'proxy:metadata:{}'.format(proxy_name)
    r_serv_onion.delete(metadata_key)
    r_serv_onion.srem('all_proxy', proxy_name)
|
||||||
|
## -- ##
|
||||||
|
|
||||||
|
## LOADER ##
|
||||||
|
def load_all_splash_containers():
    """Mirror the Splash containers reported by the Splash Manager into the onion DB.

    For every container this records:
      - its name in the 'all_splash' set and in 'proxy:splash:<proxy name>';
      - 'crawler_type', 'proxy' and (when present) 'description' in the
        'splash:metadata:<name>' hash;
      - one URL per port in the 'splash:url:<name>' set, plus the reverse
        mapping 'splash:map:url:name:<url>' -> name.

    Containers without a proxy are filed under the pseudo proxy 'no_proxy'
    with crawler type 'web'. Nothing is written when the manager returns no
    containers or the request failed.
    """
    all_splash_containers_name = get_all_splash_manager_containers_name()
    if not all_splash_containers_name:
        # The getter returns None when the manager did not answer with HTTP 200
        return

    # Loop invariant: every Splash URL is derived from the same manager URL
    manager_url = get_splash_manager_url()

    for splash_name, container in all_splash_containers_name.items():
        r_serv_onion.sadd('all_splash', splash_name)

        proxy = container['proxy']
        if not proxy:
            proxy = {'name': 'no_proxy', 'crawler_type': 'web'}

        r_serv_onion.sadd('proxy:splash:{}'.format(proxy['name']), splash_name)

        metadata_key = 'splash:metadata:{}'.format(splash_name)
        r_serv_onion.hset(metadata_key, 'crawler_type', proxy['crawler_type'])
        r_serv_onion.hset(metadata_key, 'proxy', proxy['name'])
        description = container.get('description', None)
        if description:
            r_serv_onion.hset(metadata_key, 'description', description)

        for port in container['ports']:
            splash_url = get_splash_url_from_manager_url(manager_url, port)
            r_serv_onion.sadd('splash:url:{}'.format(splash_name), splash_url)
            r_serv_onion.set('splash:map:url:name:{}'.format(splash_url), splash_name)
|
||||||
|
|
||||||
|
def load_all_proxy():
    """Mirror the proxies reported by the Splash Manager into the onion DB.

    For every proxy this records its name in the 'all_proxy' set and stores
    'host', 'port', 'type', 'crawler_type' and (when present) 'description'
    in the 'proxy:metadata:<name>' hash. Nothing is written when the manager
    returns no proxies or the request failed.
    """
    all_proxies = get_all_splash_manager_proxies()
    if not all_proxies:
        # The getter returns None when the manager did not answer with HTTP 200
        return

    for proxy_name, proxy_dict in all_proxies.items():
        # Register the proxy in the index read by get_all_proxies(); without
        # this, delete_all_proxies() never finds the metadata written below.
        r_serv_onion.sadd('all_proxy', proxy_name)

        metadata_key = 'proxy:metadata:{}'.format(proxy_name)
        for field in ('host', 'port', 'type', 'crawler_type'):
            r_serv_onion.hset(metadata_key, field, proxy_dict[field])
        description = proxy_dict.get('description', None)
        if description:
            r_serv_onion.hset(metadata_key, 'description', description)
|
||||||
|
|
||||||
|
def init_splash_list_db():
    """Rebuild the Splash container and proxy entries of the onion DB.

    Existing entries are always deleted first; they are reloaded from the
    Splash Manager only when ping_splash_manager() succeeds.
    """
    delete_all_splash_containers()
    delete_all_proxies()
    if not ping_splash_manager():
        return
    load_all_splash_containers()
    load_all_proxy()
    # # TODO: kill crawler screen ?
|
||||||
|
## -- ##
|
||||||
|
|
||||||
|
## SPLASH CONTROLLER ##
|
||||||
|
def launch_ail_splash_crawler(splash_url, script_options=''):
    """Launch Crawler.py for `splash_url` through the `screen` helper module.

    Creates the 'Crawler_AIL' screen via screen.create_screen(), then hands
    the script name, its location (AIL_BIN), the project directory (AIL_HOME)
    and `script_options` to screen.launch_windows_script().
    """
    crawler_screen = 'Crawler_AIL'
    # Read both environment variables up front so a missing one fails before
    # any screen is created
    project_dir = os.environ['AIL_HOME']
    bin_dir = os.environ['AIL_BIN']
    screen.create_screen(crawler_screen)
    screen.launch_windows_script(crawler_screen, splash_url, project_dir, bin_dir, 'Crawler.py', script_options=script_options)
|
||||||
|
|
||||||
|
|
||||||
|
## -- ##
|
||||||
|
|
||||||
|
#### ---- ####
|
||||||
|
|
||||||
|
#### CRAWLER PROXY ####
|
||||||
|
|
||||||
|
#### ---- ####
|
||||||
|
|
Loading…
Reference in a new issue