chg: [crawler manager] get all splash dockers, proxies and launch all crawlers

This commit is contained in:
Terrtia 2020-06-09 18:33:41 +02:00
parent 9828ffdbc8
commit 41cacf7129
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
6 changed files with 158 additions and 29 deletions

View file

@ -420,6 +420,33 @@ Supported cryptocurrency:
}
```
### Splash containers and proxies:
| SET - Key | Value |
| ------ | ------ |
| all_proxy | **proxy name** |
| all_splash | **splash name** |

| HSET - Key | Field | Value |
| ------ | ------ | ------ |
| proxy:metadata:**proxy name** | host | **host** |
| proxy:metadata:**proxy name** | port | **port** |
| proxy:metadata:**proxy name** | type | **type** |
| proxy:metadata:**proxy name** | crawler_type | **crawler_type** |
| proxy:metadata:**proxy name** | description | **proxy description** |
| | | |
| splash:metadata:**splash name** | description | **splash description** |
| splash:metadata:**splash name** | crawler_type | **crawler_type** |
| splash:metadata:**splash name** | proxy | **splash proxy (None if null)** |

| SET - Key | Value |
| ------ | ------ |
| splash:url:**container name** | **splash url** |
| proxy:splash:**proxy name** | **container name** |

| Key | Value |
| ------ | ------ |
| splash:map:url:name:**splash url** | **container name** |
##### CRAWLER QUEUES:
| SET - Key | Value |
| ------ | ------ |

View file

@ -224,8 +224,8 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
crawler_config['port'] = port
print('Launching Crawler: {}'.format(url))
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain)
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
retry = True
nb_retry = 0
@ -243,7 +243,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
print('--------------------------------------')
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url))
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN')
nb_retry == 0
print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
@ -251,7 +251,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
time.sleep(10)
if r.status_code == 200:
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
# save config in cache
UUID = str(uuid.uuid4())
r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
@ -273,7 +273,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
print('')
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
print('------------------------------------------------------------------------')
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error')
exit(-2)
else:
print(process.stdout.read())
@ -283,7 +283,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
print('--------------------------------------')
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url))
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling')
exit(1)
# check external links (full_crawl)
@ -304,16 +304,11 @@ def search_potential_source_domain(type_service, domain):
if __name__ == '__main__':
if len(sys.argv) != 2:
if len(sys.argv) != 2 and len(sys.argv) != 3:
print('usage:', 'Crawler.py', 'splash_port')
print('usage:', 'Crawler.py', 'splash_name', 'splash_url')
exit(1)
##################################################
#mode = sys.argv[1]
splash_port = sys.argv[1]
rotation_mode = deque(['onion', 'regular'])
default_proto_map = {'http': 80, 'https': 443}
######################################################## add ftp ???
publisher.port = 6380
publisher.channel = "Script"
@ -323,9 +318,20 @@ if __name__ == '__main__':
# Setup the I/O queues
p = Process(config_section)
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
if len(sys.argv) == 2:
splash_port = sys.argv[1]
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
else:
splash_name = sys.argv[1]
splash_url = sys.argv[2]
print(splash_name)
print('splash url: {}'.format(splash_url))
rotation_mode = deque(['onion', 'regular'])
default_proto_map = {'http': 80, 'https': 443}
######################################################## add ftp ???
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
r_serv_metadata = redis.StrictRedis(
@ -372,9 +378,9 @@ if __name__ == '__main__':
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
# Track launched crawler
r_cache.sadd('all_crawler', splash_port)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
r_cache.sadd('all_crawler', splash_url)
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
# update hardcoded blacklist
load_blacklist('onion')
@ -408,7 +414,7 @@ if __name__ == '__main__':
'epoch': int(time.time())}
# Update crawler status type
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
# check if default crawler
@ -456,11 +462,11 @@ if __name__ == '__main__':
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
#update crawler status
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
# Update crawler status type
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
# add next auto Crawling in queue:
if to_crawl['paste'] == 'auto':

47
bin/core/Crawler_manager.py Executable file
View file

@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import os
import sys
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import ConfigLoader
import crawlers
# Default configuration (no config_file given): used here only to obtain the
# "ARDB_Metadata" Redis connection.
config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
# Drop the reference to the loader once the needed values have been read.
config_loader = None
# Crawler-specific configuration, read from 'crawlers.cfg'.
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
# 'Splash_Manager' section: manager URL and API key.
SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
# Every (key, value) pair of the 'Splash_Crawlers' section; the launcher below
# reads each pair as (splash name, number of crawlers to launch).
crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
config_loader = None
import screen
if __name__ == '__main__':
    # Launch the configured number of crawlers for every Splash name listed
    # in the 'Splash_Crawlers' section of crawlers.cfg.

    # Report an unreachable Splash manager explicitly.
    # NOTE(review): as before, execution continues after a failed ping, so
    # the crawlers below are launched from whatever URLs are already stored.
    # Confirm whether the script should stop here instead.
    if not crawlers.ping_splash_manager():
        print('Error, Can\'t connect to Splash manager')

    crawlers.reload_splash_and_proxies_list()

    # # TODO: handle multiple splash_manager

    for splash_name, nb_crawlers_config in crawlers_to_launch:
        # Configuration values are strings; int() raises ValueError on a
        # malformed crawler count.
        nb_crawlers = int(nb_crawlers_config)

        all_crawler_urls = crawlers.get_splash_all_url(splash_name, r_list=True)
        nb_available = len(all_crawler_urls)
        if nb_crawlers > nb_available:
            # Fewer Splash containers are known than requested: launch as
            # many crawlers as possible and tell the operator what is missing.
            print('Error, can\'t launch all Splash Dockers')
            print('Please launch {} additional {} Dockers'.format( nb_crawlers - nb_available, splash_name))
            nb_crawlers = nb_available

        # max(..., 0) keeps a negative count from being read as a
        # "from the end" slice bound; it launches nothing, like range() did.
        for splash_url in all_crawler_urls[:max(nb_crawlers, 0)]:
            print(splash_url)

            # NOTE(review): the first argument is a hard-coded URL rather than
            # splash_url -- confirm this is intended.
            crawlers.launch_ail_splash_crawler('http://127.0.0.1:8054', script_options='{} {}'.format(splash_name, splash_url))

View file

@ -4,6 +4,7 @@
import os
import subprocess
import sys
import re
all_screen_name = set()
@ -16,8 +17,11 @@ def is_screen_install():
print(p.stderr)
return False
def exist_screen(screen_name):
cmd_1 = ['screen', '-ls']
def exist_screen(screen_name, with_sudoer=False):
if with_sudoer:
cmd_1 = ['sudo', 'screen', '-ls']
else:
cmd_1 = ['screen', '-ls']
cmd_2 = ['egrep', '[0-9]+.{}'.format(screen_name)]
p1 = subprocess.Popen(cmd_1, stdout=subprocess.PIPE)
p2 = subprocess.Popen(cmd_2, stdin=p1.stdout, stdout=subprocess.PIPE)
@ -27,6 +31,28 @@ def exist_screen(screen_name):
return True
return False
def get_screen_pid(screen_name, with_sudoer=False):
    """Return the PIDs of every screen session named exactly *screen_name*.

    The session list is read from ``screen -ls`` (run through ``sudo`` when
    *with_sudoer* is true) and parsed for ``<pid>.<screen_name>`` entries.

    :param screen_name: name of the screen session(s) to look up
    :param with_sudoer: list the sessions with ``sudo screen -ls``
    :return: list of PIDs as strings, empty if no session matches
    """
    cmd = ['sudo', 'screen', '-ls'] if with_sudoer else ['screen', '-ls']
    # The exit status is deliberately not checked: only the listing matters.
    p = subprocess.run(cmd, stdout=subprocess.PIPE)

    # The name is escaped and anchored on both sides so that regex
    # metacharacters are taken literally and 'name' does not also match
    # 'name_2' or 'Xname'.
    regex_pid_screen_name = re.compile(
        rb'(?:^|\s)([0-9]+)\.' + re.escape(screen_name.encode()) + rb'(?=\s|$)',
        re.MULTILINE,
    )
    return [pid.decode() for pid in regex_pid_screen_name.findall(p.stdout)]
def create_screen(screen_name):
if not exist_screen(screen_name):
cmd = ['screen', '-dmS', screen_name]
@ -38,6 +64,18 @@ def create_screen(screen_name):
print(p.stderr)
return False
def kill_screen(screen_name, with_sudoer=False):
    """Send ``kill`` to every screen session named *screen_name*.

    :param screen_name: name of the screen session(s) to kill
    :param with_sudoer: look the sessions up with ``sudo screen -ls``
    :return: True if at least one matching session was found, False otherwise
    """
    # Look the PIDs up once, so the existence check and the kill loop work on
    # the same snapshot and ``screen -ls`` is not spawned twice.
    all_pids = get_screen_pid(screen_name, with_sudoer=with_sudoer)
    if not all_pids:
        return False

    for pid in all_pids:
        # NOTE(review): kill runs without sudo even when with_sudoer is set --
        # confirm the caller has permission to signal these processes.
        p = subprocess.run(['kill', pid], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if p.stderr:
            print(p.stderr)
        else:
            print('{} killed'.format(pid))
    return True
# # TODO: add check if len(window_name) == 20
# use: screen -S 'pid.screen_name' -p %window_id% -Q title
# if len(windows_name) > 20 (truncated by default)
@ -70,5 +108,5 @@ def kill_screen_window(screen_name, window_id, force=False):
print(p.stderr)
if __name__ == '__main__':
res = get_screen_windows_list('Script_AIL')
res = kill_screen('Docker_Splash', with_sudoer=True)
print(res)

View file

@ -55,3 +55,12 @@ class ConfigLoader(object):
def has_section(self, section):
return self.cfg.has_section(section)
def get_all_keys_values_from_section(self, section):
    """Return every option of *section* as a list of ``(key, value)`` tuples.

    An empty list is returned when the section is not in the configuration.
    """
    if section not in self.cfg:
        return []
    return [(option, self.cfg.get(section, option)) for option in self.cfg[section]]

View file

@ -38,8 +38,8 @@ config_loader = None
# load crawler config
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
#splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
#splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
config_loader = None
faup = Faup()
@ -691,11 +691,13 @@ def load_all_proxy():
if description:
r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description)
def init_splash_list_db():
delete_all_splash_containers()
delete_all_proxies()
def reload_splash_and_proxies_list():
    """Replace the stored Splash container and proxy lists with fresh ones.

    Nothing is deleted or loaded unless the Splash manager answers the ping.
    """
    if not ping_splash_manager():
        return

    # Splash containers: clear the stored entries, then load them again.
    delete_all_splash_containers()
    load_all_splash_containers()

    # Proxies: same clear-then-load sequence.
    delete_all_proxies()
    load_all_proxy()
    # # TODO: kill crawler screen ?
## -- ##