chg: [crawler proxy] interact with the splash manager API

(get list of proxies + splash containers + launch them)
Terrtia 2020-05-22 15:41:05 +02:00
parent 5d7488f74a
commit 3fd200be43
2 changed files with 192 additions and 4 deletions
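
The second file starts reading a dedicated crawler configuration file. A minimal sketch of the [Splash_Manager] section it expects to find in configs/crawlers.cfg follows; the section and key names come from the diff below, the values are placeholders:

    [Splash_Manager]
    splash_url = https://your-splash-manager.example:7001
    api_key = <your Splash manager API key>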


@@ -15,8 +15,8 @@ import configparser
# Get Config file
config_dir = os.path.join(os.environ['AIL_HOME'], 'configs')
-config_file = os.path.join(config_dir, 'core.cfg')
+default_config_file = os.path.join(config_dir, 'core.cfg')
-if not os.path.exists(config_file):
+if not os.path.exists(default_config_file):
    raise Exception('Unable to find the configuration file. \
                    Did you set environment variables? \
                    Or activate the virtualenv.')
@@ -28,9 +28,12 @@ if not os.path.exists(config_file):
class ConfigLoader(object):
    """docstring for Config_Loader."""

-    def __init__(self):
+    def __init__(self, config_file=None):
        self.cfg = configparser.ConfigParser()
-        self.cfg.read(config_file)
+        if config_file:
+            self.cfg.read(os.path.join(config_dir, config_file))
+        else:
+            self.cfg.read(default_config_file)

    def get_redis_conn(self, redis_name, decode_responses=True): ## TODO: verify redis name
        return redis.StrictRedis( host=self.cfg.get(redis_name, "host"),
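
For context, a minimal usage sketch of the extended loader (assuming AIL_BIN/lib is on sys.path, as in the crawler module below): the new optional config_file argument is resolved relative to the same configs/ directory, and the loader falls back to core.cfg when it is omitted.

    import ConfigLoader

    # default behaviour: load configs/core.cfg
    core_config = ConfigLoader.ConfigLoader()

    # new behaviour: load another file from the same configs/ directory
    crawler_config = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')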


@@ -20,9 +20,15 @@ from urllib.parse import urlparse
from pyfaup.faup import Faup

+# interact with splash_crawler API
+import requests
+requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'core/'))
+import screen

config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
@@ -30,6 +36,12 @@ r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None

+# load crawler config
+config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
+splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
+splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
+config_loader = None

faup = Faup()

def generate_uuid():
@@ -530,3 +542,176 @@ def save_har(har_dir, item_id, har_content):
    filename = os.path.join(har_dir, item_id + '.json')
    with open(filename, 'w') as f:
        f.write(json.dumps(har_content))
#### SPLASH MANAGER ####

def get_splash_manager_url(reload=False): # TODO: add config reload
    return splash_manager_url

def get_splash_api_key(reload=False): # TODO: add config reload
    return splash_api_key

def get_splash_url_from_manager_url(splash_manager_url, splash_port):
    url = urlparse(splash_manager_url)
    host = url.netloc.split(':', 1)[0]
    return 'http://{}:{}'.format(host, splash_port)
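
# For illustration (hypothetical addresses): a manager reachable at
# 'https://172.17.0.1:7001' that exposes a splash container on port 8050
# maps to the local endpoint 'http://172.17.0.1:8050':
#   get_splash_url_from_manager_url('https://172.17.0.1:7001', 8050)
#   # -> 'http://172.17.0.1:8050'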
## API ##
def ping_splash_manager():
    req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
    if req.status_code == 200:
        return True
    else:
        print(req.json())
        return False

def get_all_splash_manager_containers_name():
    req = requests.get('{}/api/v1/get/splash/name/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
    if req.status_code == 200:
        return req.json()
    else:
        print(req.json())

def get_all_splash_manager_proxies():
    req = requests.get('{}/api/v1/get/proxies/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
    if req.status_code == 200:
        return req.json()
    else:
        print(req.json())
## -- ##
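
# A sketch of the JSON the manager endpoints are expected to return, inferred
# from load_all_splash_containers()/load_all_proxy() below; the container,
# proxy and host names here are hypothetical:
#
#   GET /api/v1/get/splash/name/all ->
#       {"my_splash_tor": {"proxy": {"name": "default_tor", "crawler_type": "tor"},
#                          "description": "tor splash container",
#                          "ports": [8050, 8051]}}
#
#   GET /api/v1/get/proxies/all ->
#       {"default_tor": {"host": "localhost", "port": 9050, "type": "SOCKS5",
#                        "crawler_type": "tor", "description": "local tor proxy"}}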
## SPLASH ##
def get_all_splash(r_list=False):
    res = r_serv_onion.smembers('all_splash')
    if res:
        if r_list:
            return list(res)
        else:
            return res
    else:
        return []

def get_splash_proxy(splash_name):
    return r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'proxy')

def get_splash_all_url(splash_name, r_list=False):
    res = r_serv_onion.smembers('splash:url:{}'.format(splash_name))
    if res:
        if r_list:
            return list(res)
        else:
            return res
    else:
        return []

def get_splash_name_by_url(splash_url):
    return r_serv_onion.get('splash:map:url:name:{}'.format(splash_url))

def get_splash_crawler_type(splash_name):
    return r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'crawler_type')
def get_all_splash_by_proxy(proxy_name, r_list=False):
    res = r_serv_onion.smembers('proxy:splash:{}'.format(proxy_name))
    if res:
        if r_list:
            return list(res)
        else:
            return res
    else:
        return []
def delete_all_splash_containers():
    for splash_name in get_all_splash():
        delete_splash_container(splash_name)

def delete_splash_container(splash_name):
    r_serv_onion.srem('proxy:splash:{}'.format(get_splash_proxy(splash_name)), splash_name)
    r_serv_onion.delete('splash:metadata:{}'.format(splash_name))
    for splash_url in get_splash_all_url(splash_name):
        r_serv_onion.delete('splash:map:url:name:{}'.format(splash_url), splash_name)
        r_serv_onion.srem('splash:url:{}'.format(splash_name), splash_url)
    r_serv_onion.srem('all_splash', splash_name)
## -- ##
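
# A few example reads of the per-splash state (names are hypothetical; the
# keys themselves are filled in by the loaders below):
#   splash_name = get_splash_name_by_url('http://172.17.0.1:8050')
#   get_splash_crawler_type(splash_name)            # e.g. 'tor' or 'web'
#   get_splash_proxy(splash_name)                   # e.g. 'default_tor' or 'no_proxy'
#   get_all_splash_by_proxy('default_tor', r_list=True)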
## PROXY ##
def get_all_proxies(r_list=False):
    res = r_serv_onion.smembers('all_proxy')
    if res:
        return list(res)
    else:
        return []

def delete_all_proxies():
    for proxy_name in get_all_proxies():
        delete_proxy(proxy_name)

def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy)
    proxy_splash = get_all_splash_by_proxy(proxy_name)
    if proxy_splash:
        print('error, a splash container is using this proxy')
    r_serv_onion.delete('proxy:metadata:{}'.format(proxy_name))
    r_serv_onion.srem('all_proxy', proxy_name)
## -- ##
## LOADER ##
def load_all_splash_containers():
    all_splash_containers_name = get_all_splash_manager_containers_name()
    for splash_name in all_splash_containers_name:
        r_serv_onion.sadd('all_splash', splash_name)

        proxy = all_splash_containers_name[splash_name]['proxy']
        if not proxy:
            proxy = {'name': 'no_proxy', 'crawler_type': 'web'}

        r_serv_onion.sadd('proxy:splash:{}'.format(proxy['name']), splash_name)
        r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'crawler_type', proxy['crawler_type'])
        r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'proxy', proxy['name'])

        description = all_splash_containers_name[splash_name].get('description', None)
        if description:
            r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'description', description)

        for port in all_splash_containers_name[splash_name]['ports']:
            splash_url = get_splash_url_from_manager_url(get_splash_manager_url(), port)
            r_serv_onion.sadd('splash:url:{}'.format(splash_name), splash_url)
            r_serv_onion.set('splash:map:url:name:{}'.format(splash_url), splash_name)
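
# After load_all_splash_containers(), the Redis state for one hypothetical
# container 'my_splash_tor' behind proxy 'default_tor', exposed on port 8050,
# would look like:
#   all_splash                                  -> {'my_splash_tor'}
#   proxy:splash:default_tor                    -> {'my_splash_tor'}
#   splash:metadata:my_splash_tor               -> {'proxy': 'default_tor', 'crawler_type': 'tor', ...}
#   splash:url:my_splash_tor                    -> {'http://172.17.0.1:8050'}
#   splash:map:url:name:http://172.17.0.1:8050  -> 'my_splash_tor'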
def load_all_proxy():
    all_proxies = get_all_splash_manager_proxies()
    for proxy_name in all_proxies:
        proxy_dict = all_proxies[proxy_name]
        r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'host', proxy_dict['host'])
        r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'port', proxy_dict['port'])
        r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'type', proxy_dict['type'])
        r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'crawler_type', proxy_dict['crawler_type'])
        description = all_proxies[proxy_name].get('description', None)
        if description:
            r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)

def init_splash_list_db():
    delete_all_splash_containers()
    delete_all_proxies()
    if ping_splash_manager():
        load_all_splash_containers()
        load_all_proxy()
    # # TODO: kill crawler screen ?
## -- ##
## SPLASH CONTROLLER ##
def launch_ail_splash_crawler(splash_url, script_options=''):
    screen_name = 'Crawler_AIL'
    dir_project = os.environ['AIL_HOME']
    script_location = os.path.join(os.environ['AIL_BIN'])
    script_name = 'Crawler.py'
    screen.create_screen(screen_name)
    screen.launch_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options)
## -- ##

#### ---- ####
#### CRAWLER PROXY ####
#### ---- ####
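
Taken together, a minimal end-to-end sketch of how these helpers could be combined; the call sequence is an assumption, only the function names come from the code above:

    # sync the local Redis state with the splash manager, then start one
    # AIL crawler screen window per available splash endpoint
    if ping_splash_manager():
        init_splash_list_db()
        for splash_name in get_all_splash(r_list=True):
            for splash_url in get_splash_all_url(splash_name, r_list=True):
                launch_ail_splash_crawler(splash_url)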