fix: [Crawler] detect splash connection to proxy error

This commit is contained in:
Terrtia 2018-09-27 15:43:03 +02:00
parent 04b9d9fc1d
commit e357dce59b
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 43 additions and 22 deletions

View file

@ -102,20 +102,23 @@ Crawler
--------------------- ---------------------
In AIL, you can crawl hidden services. In AIL, you can crawl hidden services.
two types of configuration [explanation for what]: There are two types of installation. You can install a *local* or a *remote* Splash server. If you install a local Splash server, the Splash and AIL host are the same.
1) use local Splash dockers (use the same host for Splash servers and AIL)
2) use remote Splash servers
- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost splash server is used) Install/Configure and launch all crawler scripts:
- (Splash host) Setup your tor proxy[is already installed]:
- *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost splash server is used, or use the ``-y`` option)
- *(Splash host)* Install/Setup your tor proxy:
- Install the tor proxy: ``sudo apt-get install tor -y``
(The tor proxy is installed by default in AIL. If you use the same host for the Splash server, you don't need to install it)
- Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` - Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16``
(for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform) (for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform)
- Restart the tor proxy: ``sudo service tor restart`` - Restart the tor proxy: ``sudo service tor restart``
- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]`` - *(Splash host)* Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
all the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status.
- (AIL host) Edit the ``/bin/packages/config.cfg`` file: - *(AIL host)* Edit the ``/bin/packages/config.cfg`` file:
- In the crawler section, set ``activate_crawler`` to ``True`` - In the crawler section, set ``activate_crawler`` to ``True``
- Change the IP address of Splash servers if needed (remote only) - Change the IP address of Splash servers if needed (remote only)
- Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. Those port numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052). - Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. Those port numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).

View file

@ -18,6 +18,12 @@ from pubsublogger import publisher
def signal_handler(sig, frame): def signal_handler(sig, frame):
sys.exit(0) sys.exit(0)
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
# send this msg back in the queue
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
def crawl_onion(url, domain, date, date_month, message): def crawl_onion(url, domain, date, date_month, message):
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
@ -30,15 +36,11 @@ def crawl_onion(url, domain, date, date_month, message):
except Exception: except Exception:
## FIXME: # TODO: relaunch docker or send error message ## FIXME: # TODO: relaunch docker or send error message
# send this msg back in the queue on_error_send_message_back_in_queue(type_hidden_service, domain, message)
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
print('--------------------------------------') print('--------------------------------------')
print(' \033[91m DOCKER SPLASH DOWN\033[0m') print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url)) print(' {} DOWN'.format(splash_url))
exit(0) exit(1)
if r.status_code == 200: if r.status_code == 200:
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father], process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
@ -47,15 +49,26 @@ def crawl_onion(url, domain, date, date_month, message):
time.sleep(1) time.sleep(1)
if process.returncode == 0: if process.returncode == 0:
# onion up output = process.stdout.read().decode()
print(process.stdout.read()) print(output)
# error: splash:Connection to proxy refused
if 'Connection to proxy refused' in output:
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
print('------------------------------------------------------------------------')
print(' \033[91m SPLASH: Connection to proxy refused')
print('')
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
print('------------------------------------------------------------------------')
exit(-2)
else: else:
print(process.stdout.read()) print(process.stdout.read())
exit(0) exit(-1)
else: else:
## FIXME: # TODO: relaunch docker on_error_send_message_back_in_queue(type_hidden_service, domain, message)
exit(0) print('--------------------------------------')
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url))
exit(1)
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -110,12 +110,16 @@ class TorSplashCrawler():
def parse(self,response): def parse(self,response):
#print(response.headers) #print(response.headers)
#print(response.status) #print(response.status)
print(' | ')
if response.status == 504: if response.status == 504:
# down ? # down ?
print('504 detected') print('504 detected')
elif response.status != 200: elif response.status != 200:
print('other: {}'.format(response.status)) #print('other: {}'.format(response.status))
#print(error_log)
#detect connection to proxy refused
error_log = (json.loads(response.body.decode()))
if(error_log['info']['text'] == 'Connection to proxy refused'):
print('Connection to proxy refused')
else: else:
UUID = self.domains[0]+str(uuid.uuid4()) UUID = self.domains[0]+str(uuid.uuid4())

View file

@ -39,4 +39,5 @@ for ((i=0;i<=$((${n} - 1));i++)); do
port_number=$((${p} + $i)) port_number=$((${p} + $i))
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
sleep 0.1 sleep 0.1
echo " Splash server launched on port $port_number"
done done