mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-27 00:07:16 +00:00
fix: [Crawler] detect splash connection to proxy error
This commit is contained in:
parent
04b9d9fc1d
commit
e357dce59b
4 changed files with 43 additions and 22 deletions
19
HOWTO.md
19
HOWTO.md
|
@ -102,20 +102,23 @@ Crawler
|
|||
---------------------
|
||||
In AIL, you can crawl hidden services.
|
||||
|
||||
Two types of configuration [explanation for what]:
|
||||
1) use local Splash dockers (use the same host for Splash servers and AIL)
|
||||
2) use remote Splash servers
|
||||
There are two types of installation: you can install a *local* or a *remote* Splash server. If you install a local Splash server, the Splash and AIL hosts are the same.
|
||||
|
||||
- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used)
|
||||
- (Splash host) Setup your tor proxy[is already installed]:
|
||||
Install/Configure and launch all crawler scripts:
|
||||
|
||||
- *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used, or use the ``-y`` option)
|
||||
|
||||
- *(Splash host)* Install/Setup your tor proxy:
|
||||
- Install the tor proxy: ``sudo apt-get install tor -y``
|
||||
(The tor proxy is installed by default in AIL. If you use the same host for the Splash server, you don't need to install it)
|
||||
- Add the following line to ``/etc/tor/torrc``: ``SOCKSPolicy accept 172.17.0.0/16``
|
||||
(for a Linux Docker, the localhost IP is 172.17.0.1; this should be adapted for other platforms)
|
||||
- Restart the tor proxy: ``sudo service tor restart``
|
||||
|
||||
- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
|
||||
all the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status.
|
||||
- *(Splash host)* Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]``
|
||||
All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status.
|
||||
|
||||
- (AIL host) Edit the ``/bin/packages/config.cfg`` file:
|
||||
- *(AIL host)* Edit the ``/bin/packages/config.cfg`` file:
|
||||
- In the crawler section, set ``activate_crawler`` to ``True``
|
||||
- Change the IP address of Splash servers if needed (remote only)
|
||||
- Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. These port numbers should be given as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052).
|
||||
|
|
|
@ -18,6 +18,12 @@ from pubsublogger import publisher
|
|||
def signal_handler(sig, frame):
|
||||
sys.exit(0)
|
||||
|
||||
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
||||
# Send this message back into the crawler queue after a failed crawl attempt.
# Called from the error paths of crawl_onion with the service type, the domain and the
# original queue message.
# If `domain` is not yet a member of '<type_hidden_service>_domain_crawler_queue', it is
# added there; `message` is added to '<type_hidden_service>_crawler_queue'.
# r_onion is defined outside this view -- presumably a Redis connection, since only the
# set-style calls sismember/sadd are used on it; confirm.
# NOTE(review): indentation is lost in this diff view -- confirm whether the `message`
# sadd below is inside the `if` or runs unconditionally.
|
||||
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
||||
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
|
||||
|
||||
def crawl_onion(url, domain, date, date_month, message):
|
||||
|
||||
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||
|
@ -30,15 +36,11 @@ def crawl_onion(url, domain, date, date_month, message):
|
|||
except Exception:
|
||||
## FIXME: # TODO: relaunch docker or send error message
|
||||
|
||||
# send this msg back in the queue
|
||||
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
||||
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
|
||||
|
||||
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
|
||||
print('--------------------------------------')
|
||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||
print(' {} DOWN'.format(splash_url))
|
||||
exit(0)
|
||||
exit(1)
|
||||
|
||||
if r.status_code == 200:
|
||||
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
|
||||
|
@ -47,15 +49,26 @@ def crawl_onion(url, domain, date, date_month, message):
|
|||
time.sleep(1)
|
||||
|
||||
if process.returncode == 0:
|
||||
# onion up
|
||||
print(process.stdout.read())
|
||||
|
||||
output = process.stdout.read().decode()
|
||||
print(output)
|
||||
# error: splash:Connection to proxy refused
|
||||
if 'Connection to proxy refused' in output:
|
||||
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
|
||||
print('------------------------------------------------------------------------')
|
||||
print(' \033[91m SPLASH: Connection to proxy refused')
|
||||
print('')
|
||||
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
|
||||
print('------------------------------------------------------------------------')
|
||||
exit(-2)
|
||||
else:
|
||||
print(process.stdout.read())
|
||||
exit(0)
|
||||
exit(-1)
|
||||
else:
|
||||
## FIXME: # TODO: relaunch docker
|
||||
exit(0)
|
||||
on_error_send_message_back_in_queue(type_hidden_service, domain, message)
|
||||
print('--------------------------------------')
|
||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||
print(' {} DOWN'.format(splash_url))
|
||||
exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -110,12 +110,16 @@ class TorSplashCrawler():
|
|||
def parse(self,response):
|
||||
#print(response.headers)
|
||||
#print(response.status)
|
||||
print(' | ')
|
||||
if response.status == 504:
|
||||
# down ?
|
||||
print('504 detected')
|
||||
elif response.status != 200:
|
||||
print('other: {}'.format(response.status))
|
||||
#print('other: {}'.format(response.status))
|
||||
#print(error_log)
|
||||
#detect connection to proxy refused
|
||||
error_log = (json.loads(response.body.decode()))
|
||||
if(error_log['info']['text'] == 'Connection to proxy refused'):
|
||||
print('Connection to proxy refused')
|
||||
else:
|
||||
|
||||
UUID = self.domains[0]+str(uuid.uuid4())
|
||||
|
|
|
@ -39,4 +39,5 @@ for ((i=0;i<=$((${n} - 1));i++)); do
|
|||
port_number=$((${p} + $i))
|
||||
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
|
||||
sleep 0.1
|
||||
echo " Splash server launched on port $port_number"
|
||||
done
|
||||
|
|
Loading…
Reference in a new issue