mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-27 00:07:16 +00:00
fix: [Crawler] detect splash connection to proxy error
This commit is contained in:
parent
04b9d9fc1d
commit
e357dce59b
4 changed files with 43 additions and 22 deletions
19
HOWTO.md
19
HOWTO.md
|
@ -102,20 +102,23 @@ Crawler
|
||||||
---------------------
|
---------------------
|
||||||
In AIL, you can crawl hidden services.
|
In AIL, you can crawl hidden services.
|
||||||
|
|
||||||
two types of configutation [explaination for what]:
|
There are two types of installation: you can install a *local* or a *remote* Splash server. If you install a local Splash server, the Splash and AIL hosts are the same.
|
||||||
1) use local Splash dockers (use the same host for Splash servers and AIL)
|
|
||||||
2) use remote Splash servers
|
|
||||||
|
|
||||||
- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirement (type ``y`` if a localhost splah server is used)
|
Install/Configure and launch all crawler scripts:
|
||||||
- (Splash host) Setup your tor proxy[is already installed]:
|
|
||||||
|
- *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used, or use the ``-y`` option)
|
||||||
|
|
||||||
|
- *(Splash host)* Install/Setup your tor proxy:
|
||||||
|
- Install the tor proxy: ``sudo apt-get install tor -y``
|
||||||
|
(The tor proxy is installed by default in AIL. If you use the same host for the Splash server, you don't need to install it)
|
||||||
- Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16``
|
- Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16``
|
||||||
(for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform)
|
(for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform)
|
||||||
- Restart the tor proxy: ``sudo service tor restart``
|
- Restart the tor proxy: ``sudo service tor restart``
|
||||||
|
|
||||||
- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
|
- *(Splash host)* Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
|
||||||
all the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status.
|
All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status.
|
||||||
|
|
||||||
- (AIL host) Edit the ``/bin/packages/config.cfg`` file:
|
- *(AIL host)* Edit the ``/bin/packages/config.cfg`` file:
|
||||||
- In the crawler section, set ``activate_crawler`` to ``True``
|
- In the crawler section, set ``activate_crawler`` to ``True``
|
||||||
- Change the IP address of Splash servers if needed (remote only)
|
- Change the IP address of Splash servers if needed (remote only)
|
||||||
- Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. These port numbers should be given as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).
|
- Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. These port numbers should be given as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).
|
||||||
|
|
|
@ -18,6 +18,12 @@ from pubsublogger import publisher
|
||||||
def signal_handler(sig, frame):
|
def signal_handler(sig, frame):
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
|
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
||||||
|
# send this msg back in the queue
|
||||||
|
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
||||||
|
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||||
|
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
|
||||||
|
|
||||||
def crawl_onion(url, domain, date, date_month, message):
|
def crawl_onion(url, domain, date, date_month, message):
|
||||||
|
|
||||||
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
|
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||||
|
@ -30,15 +36,11 @@ def crawl_onion(url, domain, date, date_month, message):
|
||||||
except Exception:
|
except Exception:
|
||||||
## FIXME: # TODO: relaunch docker or send error message
|
## FIXME: # TODO: relaunch docker or send error message
|
||||||
|
|
||||||
# send this msg back in the queue
|
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
|
||||||
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
|
||||||
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
|
||||||
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
|
|
||||||
|
|
||||||
print('--------------------------------------')
|
print('--------------------------------------')
|
||||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||||
print(' {} DOWN'.format(splash_url))
|
print(' {} DOWN'.format(splash_url))
|
||||||
exit(0)
|
exit(1)
|
||||||
|
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
|
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
|
||||||
|
@ -47,15 +49,26 @@ def crawl_onion(url, domain, date, date_month, message):
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
if process.returncode == 0:
|
if process.returncode == 0:
|
||||||
# onion up
|
output = process.stdout.read().decode()
|
||||||
print(process.stdout.read())
|
print(output)
|
||||||
|
# error: splash:Connection to proxy refused
|
||||||
|
if 'Connection to proxy refused' in output:
|
||||||
|
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
|
||||||
|
print('------------------------------------------------------------------------')
|
||||||
|
print(' \033[91m SPLASH: Connection to proxy refused')
|
||||||
|
print('')
|
||||||
|
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
|
||||||
|
print('------------------------------------------------------------------------')
|
||||||
|
exit(-2)
|
||||||
else:
|
else:
|
||||||
print(process.stdout.read())
|
print(process.stdout.read())
|
||||||
exit(0)
|
exit(-1)
|
||||||
else:
|
else:
|
||||||
## FIXME: # TODO: relaunch docker
|
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
|
||||||
exit(0)
|
print('--------------------------------------')
|
||||||
|
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||||
|
print(' {} DOWN'.format(splash_url))
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -110,12 +110,16 @@ class TorSplashCrawler():
|
||||||
def parse(self,response):
|
def parse(self,response):
|
||||||
#print(response.headers)
|
#print(response.headers)
|
||||||
#print(response.status)
|
#print(response.status)
|
||||||
print(' | ')
|
|
||||||
if response.status == 504:
|
if response.status == 504:
|
||||||
# down ?
|
# down ?
|
||||||
print('504 detected')
|
print('504 detected')
|
||||||
elif response.status != 200:
|
elif response.status != 200:
|
||||||
print('other: {}'.format(response.status))
|
#print('other: {}'.format(response.status))
|
||||||
|
#print(error_log)
|
||||||
|
#detect connection to proxy refused
|
||||||
|
error_log = (json.loads(response.body.decode()))
|
||||||
|
if(error_log['info']['text'] == 'Connection to proxy refused'):
|
||||||
|
print('Connection to proxy refused')
|
||||||
else:
|
else:
|
||||||
|
|
||||||
UUID = self.domains[0]+str(uuid.uuid4())
|
UUID = self.domains[0]+str(uuid.uuid4())
|
||||||
|
|
|
@ -39,4 +39,5 @@ for ((i=0;i<=$((${n} - 1));i++)); do
|
||||||
port_number=$((${p} + $i))
|
port_number=$((${p} + $i))
|
||||||
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
|
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
|
echo " Splash server launched on port $port_number"
|
||||||
done
|
done
|
||||||
|
|
Loading…
Reference in a new issue