mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
chg: [crawler manager] get all splash dockers, proxies and launch all crawlers
This commit is contained in:
parent
9828ffdbc8
commit
41cacf7129
6 changed files with 158 additions and 29 deletions
27
OVERVIEW.md
27
OVERVIEW.md
|
@ -420,6 +420,33 @@ Supported cryptocurrency:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Splash containers and proxies:
|
||||||
|
| SET - Key | Value |
|
||||||
|
| ------ | ------ |
|
||||||
|
| all_proxy | **proxy name** |
|
||||||
|
| all_splash | **splash name** |
|
||||||
|
|
||||||
|
| HSET - Key | Field | Value |
|
||||||
|
| ------ | ------ | ------ |
|
||||||
|
| proxy:metadata:**proxy name** | host | **host** |
|
||||||
|
| proxy:metadata:**proxy name** | port | **port** |
|
||||||
|
| proxy:metadata:**proxy name** | type | **type** |
|
||||||
|
| proxy:metadata:**proxy name** | crawler_type | **crawler_type** |
|
||||||
|
| proxy:metadata:**proxy name** | description | **proxy description** |
|
||||||
|
| | | |
|
||||||
|
| splash:metadata:**splash name** | description | **splash description** |
|
||||||
|
| splash:metadata:**splash name** | crawler_type | **crawler_type** |
|
||||||
|
| splash:metadata:**splash name** | proxy | **splash proxy (None if null)** |
|
||||||
|
|
||||||
|
| SET - Key | Value |
|
||||||
|
| ------ | ------ |
|
||||||
|
| splash:url:**container name** | **splash url** |
|
||||||
|
| proxy:splash:**proxy name** | **container name** |
|
||||||
|
|
||||||
|
| Key | Value |
|
||||||
|
| ------ | ------ |
|
||||||
|
| splash:map:url:name:**splash url** | **container name** |
|
||||||
|
|
||||||
##### CRAWLER QUEUES:
|
##### CRAWLER QUEUES:
|
||||||
| SET - Key | Value |
|
| SET - Key | Value |
|
||||||
| ------ | ------ |
|
| ------ | ------ |
|
||||||
|
|
|
@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
||||||
crawler_config['port'] = port
|
crawler_config['port'] = port
|
||||||
print('Launching Crawler: {}'.format(url))
|
print('Launching Crawler: {}'.format(url))
|
||||||
|
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain)
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||||
|
|
||||||
retry = True
|
retry = True
|
||||||
nb_retry = 0
|
nb_retry = 0
|
||||||
|
@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
||||||
print('--------------------------------------')
|
print('--------------------------------------')
|
||||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||||
print(' {} DOWN'.format(splash_url))
|
print(' {} DOWN'.format(splash_url))
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN')
|
||||||
nb_retry == 0
|
nb_retry == 0
|
||||||
|
|
||||||
print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
|
print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
|
||||||
|
@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
|
||||||
# save config in cash
|
# save config in cash
|
||||||
UUID = str(uuid.uuid4())
|
UUID = str(uuid.uuid4())
|
||||||
r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
|
r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
|
||||||
|
@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
||||||
print('')
|
print('')
|
||||||
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
|
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
|
||||||
print('------------------------------------------------------------------------')
|
print('------------------------------------------------------------------------')
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error')
|
||||||
exit(-2)
|
exit(-2)
|
||||||
else:
|
else:
|
||||||
print(process.stdout.read())
|
print(process.stdout.read())
|
||||||
|
@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
||||||
print('--------------------------------------')
|
print('--------------------------------------')
|
||||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||||
print(' {} DOWN'.format(splash_url))
|
print(' {} DOWN'.format(splash_url))
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
# check external links (full_crawl)
|
# check external links (full_crawl)
|
||||||
|
@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain):
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2 and len(sys.argv) != 3:
|
||||||
print('usage:', 'Crawler.py', 'splash_port')
|
print('usage:', 'Crawler.py', 'splash_port')
|
||||||
|
print('usage:', 'Crawler.py', 'splash_name', 'splash_url')
|
||||||
exit(1)
|
exit(1)
|
||||||
##################################################
|
##################################################
|
||||||
#mode = sys.argv[1]
|
|
||||||
splash_port = sys.argv[1]
|
|
||||||
|
|
||||||
rotation_mode = deque(['onion', 'regular'])
|
|
||||||
default_proto_map = {'http': 80, 'https': 443}
|
|
||||||
######################################################## add ftp ???
|
|
||||||
|
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
publisher.channel = "Script"
|
publisher.channel = "Script"
|
||||||
|
@ -323,9 +318,20 @@ if __name__ == '__main__':
|
||||||
# Setup the I/O queues
|
# Setup the I/O queues
|
||||||
p = Process(config_section)
|
p = Process(config_section)
|
||||||
|
|
||||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
|
if len(sys.argv) == 2:
|
||||||
|
splash_port = sys.argv[1]
|
||||||
|
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
|
||||||
|
else:
|
||||||
|
splash_name = sys.argv[1]
|
||||||
|
splash_url = sys.argv[2]
|
||||||
|
print(splash_name)
|
||||||
|
|
||||||
print('splash url: {}'.format(splash_url))
|
print('splash url: {}'.format(splash_url))
|
||||||
|
|
||||||
|
rotation_mode = deque(['onion', 'regular'])
|
||||||
|
default_proto_map = {'http': 80, 'https': 443}
|
||||||
|
######################################################## add ftp ???
|
||||||
|
|
||||||
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
|
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
|
||||||
|
|
||||||
r_serv_metadata = redis.StrictRedis(
|
r_serv_metadata = redis.StrictRedis(
|
||||||
|
@ -372,9 +378,9 @@ if __name__ == '__main__':
|
||||||
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
|
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
|
||||||
|
|
||||||
# Track launched crawler
|
# Track launched crawler
|
||||||
r_cache.sadd('all_crawler', splash_port)
|
r_cache.sadd('all_crawler', splash_url)
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||||
|
|
||||||
# update hardcoded blacklist
|
# update hardcoded blacklist
|
||||||
load_blacklist('onion')
|
load_blacklist('onion')
|
||||||
|
@ -408,7 +414,7 @@ if __name__ == '__main__':
|
||||||
'epoch': int(time.time())}
|
'epoch': int(time.time())}
|
||||||
|
|
||||||
# Update crawler status type
|
# Update crawler status type
|
||||||
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
|
||||||
|
|
||||||
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
|
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
|
||||||
# check if default crawler
|
# check if default crawler
|
||||||
|
@ -456,11 +462,11 @@ if __name__ == '__main__':
|
||||||
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
|
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
|
||||||
|
|
||||||
#update crawler status
|
#update crawler status
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
|
||||||
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
|
||||||
|
|
||||||
# Update crawler status type
|
# Update crawler status type
|
||||||
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
|
||||||
|
|
||||||
# add next auto Crawling in queue:
|
# add next auto Crawling in queue:
|
||||||
if to_crawl['paste'] == 'auto':
|
if to_crawl['paste'] == 'auto':
|
||||||
|
|
47
bin/core/Crawler_manager.py
Executable file
47
bin/core/Crawler_manager.py
Executable file
|
@ -0,0 +1,47 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
|
||||||
|
import ConfigLoader
|
||||||
|
import crawlers
|
||||||
|
|
||||||
|
config_loader = ConfigLoader.ConfigLoader()
|
||||||
|
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
|
||||||
|
config_loader = None
|
||||||
|
|
||||||
|
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
|
||||||
|
SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
|
||||||
|
api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
|
||||||
|
crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
|
||||||
|
config_loader = None
|
||||||
|
|
||||||
|
import screen
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Launch the configured number of AIL crawlers for each Splash name
    # listed in the 'Splash_Crawlers' section of crawlers.cfg.

    # Stop here when the Splash manager does not answer: carrying on would
    # launch crawlers from whatever Splash URLs are already stored.
    if not crawlers.ping_splash_manager():
        print('Error, Can\'t connect to Splash manager')
        sys.exit(1)

    crawlers.reload_splash_and_proxies_list()

    # # TODO: handle multiple splash_manager

    # Each entry is a (splash name, number of crawlers to launch) pair.
    for splash_name, nb_requested in crawlers_to_launch:
        nb_crawlers = int(nb_requested)

        all_crawler_urls = crawlers.get_splash_all_url(splash_name, r_list=True)
        if nb_crawlers > len(all_crawler_urls):
            # Fewer Splash URLs are known than crawlers requested:
            # report the shortfall and launch only as many as exist.
            print('Error, can\'t launch all Splash Dockers')
            print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
            nb_crawlers = len(all_crawler_urls)

        # max() keeps a negative configured count from turning into a
        # "all but the last N" slice; it launches nothing instead.
        for splash_url in all_crawler_urls[:max(nb_crawlers, 0)]:
            print(splash_url)

            # NOTE(review): the first argument is hard-coded; confirm whether
            # it should come from configuration or be the Splash URL itself.
            crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url))
|
|
@ -4,6 +4,7 @@
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
all_screen_name = set()
|
all_screen_name = set()
|
||||||
|
|
||||||
|
@ -16,8 +17,11 @@ def is_screen_install():
|
||||||
print(p.stderr)
|
print(p.stderr)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def exist_screen(screen_name):
|
def exist_screen(screen_name, with_sudoer=False):
|
||||||
cmd_1 = ['screen', '-ls']
|
if with_sudoer:
|
||||||
|
cmd_1 = ['sudo', 'screen', '-ls']
|
||||||
|
else:
|
||||||
|
cmd_1 = ['screen', '-ls']
|
||||||
cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
|
cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
|
||||||
p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
|
p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
|
||||||
p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
|
p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
|
||||||
|
@ -27,6 +31,28 @@ def exist_screen(screen_name):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_screen_pid(screen_name, with_sudoer=False):
    """Return the PIDs of the screen sessions whose name starts with *screen_name*.

    Runs ``screen -ls`` (through ``sudo`` when *with_sudoer* is true) and
    collects every ``<pid>.<screen_name>`` entry found in its output.

    :param screen_name: session name to look for (matched as a prefix)
    :param with_sudoer: list the sessions through ``sudo``
    :return: list of PIDs as strings, empty when no session matches
    """
    if with_sudoer:
        cmd = ['sudo', 'screen', '-ls']
    else:
        cmd = ['screen', '-ls']
    p = subprocess.run(cmd, stdout=subprocess.PIPE)

    # Escape both the separator dot and the name so that regex
    # metacharacters in a session name are matched literally.
    regex_pid_screen_name = rb'([0-9]+)\.' + re.escape(screen_name.encode())
    return [pid.decode() for pid in re.findall(regex_pid_screen_name, p.stdout)]
|
||||||
|
|
||||||
def create_screen(screen_name):
|
def create_screen(screen_name):
|
||||||
if not exist_screen(screen_name):
|
if not exist_screen(screen_name):
|
||||||
cmd = ['screen', '-dmS', screen_name]
|
cmd = ['screen', '-dmS', screen_name]
|
||||||
|
@ -38,6 +64,18 @@ def create_screen(screen_name):
|
||||||
print(p.stderr)
|
print(p.stderr)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def kill_screen(screen_name, with_sudoer=False):
    """Send ``kill`` to every screen session matching *screen_name*.

    :param screen_name: session name passed to ``get_screen_pid``
    :param with_sudoer: list and kill the sessions through ``sudo``
    :return: True when at least one matching session was found (whether or
             not every ``kill`` succeeded), False when none was found
    """
    # Resolve the PIDs once, so the check and the loop see the same list.
    all_pids = get_screen_pid(screen_name, with_sudoer=with_sudoer)
    if not all_pids:
        return False

    for pid in all_pids:
        # Sessions listed through sudo must also be signalled through sudo.
        cmd = ['sudo', 'kill', pid] if with_sudoer else ['kill', pid]
        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p.stderr:
            print(p.stderr)
        else:
            print('{} killed'.format(pid))
    return True
|
||||||
|
|
||||||
# # TODO: add check if len(window_name) == 20
|
# # TODO: add check if len(window_name) == 20
|
||||||
# use: screen -S 'pid.screen_name' -p %window_id% -Q title
|
# use: screen -S 'pid.screen_name' -p %window_id% -Q title
|
||||||
# if len(windows_name) > 20 (truncated by default)
|
# if len(windows_name) > 20 (truncated by default)
|
||||||
|
@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False):
|
||||||
print(p.stderr)
|
print(p.stderr)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
res = get_screen_windows_list('Script_AIL')
|
res = kill_screen('Docker_Splash', with_sudoer=True)
|
||||||
print(res)
|
print(res)
|
||||||
|
|
|
@ -55,3 +55,12 @@ class ConfigLoader(object):
|
||||||
|
|
||||||
def has_section(self, section):
|
def has_section(self, section):
|
||||||
return self.cfg.has_section(section)
|
return self.cfg.has_section(section)
|
||||||
|
|
||||||
|
def get_all_keys_values_from_section(self, section):
    """Return every option of *section* as a list of ``(key, value)`` tuples.

    :param section: name of the configuration section to read
    :return: list of ``(key, value)`` tuples in the order the section yields
             its keys; an empty list when the section does not exist
    """
    if section not in self.cfg:
        return []
    return [(key_name, self.cfg.get(section, key_name)) for key_name in self.cfg[section]]
|
||||||
|
|
|
@ -38,8 +38,8 @@ config_loader = None
|
||||||
|
|
||||||
# load crawler config
|
# load crawler config
|
||||||
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
|
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
|
||||||
#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
|
splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
|
||||||
#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
|
splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
|
||||||
config_loader = None
|
config_loader = None
|
||||||
|
|
||||||
faup = Faup()
|
faup = Faup()
|
||||||
|
@ -691,11 +691,13 @@ def load_all_proxy():
|
||||||
if description:
|
if description:
|
||||||
r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)
|
r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)
|
||||||
|
|
||||||
def init_splash_list_db():
|
def reload_splash_and_proxies_list():
|
||||||
delete_all_splash_containers()
|
|
||||||
delete_all_proxies()
|
|
||||||
if ping_splash_manager():
|
if ping_splash_manager():
|
||||||
|
# LOAD SPLASH containers
|
||||||
|
delete_all_splash_containers()
|
||||||
load_all_splash_containers()
|
load_all_splash_containers()
|
||||||
|
# LOAD PROXIES containers
|
||||||
|
delete_all_proxies()
|
||||||
load_all_proxy()
|
load_all_proxy()
|
||||||
# # TODO: kill crawler screen ?
|
# # TODO: kill crawler screen ?
|
||||||
## -- ##
|
## -- ##
|
||||||
|
|
Loading…
Reference in a new issue