fix: [Crawler] Restart Splash on failure, limit unbound in-memory cache (maxrss)

Terrtia 2019-01-04 15:51:08 +01:00
parent 0358b0cf58
commit b3b75ccbea
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 46 additions and 11 deletions

TorSplashCrawler.py

@@ -10,10 +10,12 @@ import datetime
 import base64
 import redis
 import json
+import time

 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
+from twisted.web._newclient import ResponseNeverReceived

 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
@@ -39,6 +41,7 @@ class TorSplashCrawler():
             'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
+            'RETRY_TIMES': 2,
             'DEPTH_LIMIT': crawler_depth_limit
             })
@@ -97,7 +100,7 @@ class TorSplashCrawler():
             yield SplashRequest(
                 self.start_urls,
                 self.parse,
-                #errback=self.errback_catcher,
+                errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
                 args={ 'html': 1,
@@ -174,7 +177,7 @@ class TorSplashCrawler():
                     yield SplashRequest(
                         link.url,
                         self.parse,
-                        #errback=self.errback_catcher,
+                        errback=self.errback_catcher,
                         endpoint='render.json',
                         meta={'father': relative_filename_paste},
                         args={ 'html': 1,
@@ -184,17 +187,39 @@ class TorSplashCrawler():
                                'wait': 10}
                     )

-        '''
         def errback_catcher(self, failure):
             # catch all errback failures,
             self.logger.error(repr(failure))

+            if failure.check(ResponseNeverReceived):
+                request = failure.request
+                url = request.meta['splash']['args']['url']
+                father = request.meta['father']
+
+                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
+                time.sleep(10)
+                yield SplashRequest(
+                    url,
+                    self.parse,
+                    errback=self.errback_catcher,
+                    endpoint='render.json',
+                    meta={'father': father},
+                    args={ 'html': 1,
+                           'png': 1,
+                           'render_all': 1,
+                           'har': 1,
+                           'wait': 10}
+                )
+
+            else:
+                print('failure')
+                #print(failure)
+                print(failure.type)
+                #print(failure.request.meta['item'])
+
+            '''
             #if isinstance(failure.value, HttpError):
-            if failure.check(HttpError):
+            elif failure.check(HttpError):
                 # you can get the response
                 response = failure.value.response
                 print('HttpError')
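
Note: the pattern the new errback implements is sketched below as a standalone spider, outside the AIL codebase. ResponseNeverReceived is what Twisted raises when the Splash container dies mid-render; the errback catches it and re-issues the SplashRequest after a pause. The spider name, target URL, and reduced args are illustrative only, and time.sleep() blocks the Twisted reactor for the full 10 seconds, which is tolerable here only because each crawler process renders one page at a time.

    # Standalone sketch of the retry-on-errback pattern (illustrative names;
    # assumes scrapy, scrapy-splash and a Splash server listening on :8050).
    import time

    from scrapy import Spider
    from scrapy_splash import SplashRequest
    from twisted.web._newclient import ResponseNeverReceived

    class RetrySketchSpider(Spider):
        name = 'retry_sketch'

        def start_requests(self):
            yield SplashRequest('http://example.com', self.parse,
                                errback=self.errback_catcher,
                                endpoint='render.json',
                                args={'html': 1, 'wait': 10})

        def parse(self, response):
            self.logger.info('rendered %s (%d bytes)', response.url, len(response.body))

        def errback_catcher(self, failure):
            self.logger.error(repr(failure))
            if failure.check(ResponseNeverReceived):
                # scrapy-splash keeps the original URL in the splash args
                url = failure.request.meta['splash']['args']['url']
                time.sleep(10)  # give Docker time to bring Splash back up
                yield SplashRequest(url, self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='render.json',
                                    args={'html': 1, 'wait': 10})
            else:
                self.logger.error('unhandled failure type: %s', failure.type)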

launch_splash_crawler.sh

@@ -5,12 +5,15 @@ usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n
           echo "    -p: number of the first splash server port number. This number is incremented for the others splash server";
           echo "    -n: number of splash servers to start";
           echo "";
+          echo "    -options:";
+          echo "    -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)";
+          echo "";
           echo "example:";
           echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
           exit 1;
 }

-while getopts ":p:f:n:" o; do
+while getopts ":p:f:n:u:" o; do
     case "${o}" in
         p)
             p=${OPTARG}
@@ -21,6 +24,9 @@ while getopts ":p:f:n:" o; do
         n)
             n=${OPTARG}
             ;;
+        u)
+            u=${OPTARG}
+            ;;
         *)
             usage
             ;;
@ -28,6 +34,10 @@ while getopts ":p:f:n:" o; do
done
shift $((OPTIND-1))
if [ -z "${u}" ]; then
u=3000;
fi
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
usage;
fi
@@ -52,7 +62,7 @@ sleep 0.1
 for ((i=0;i<=$((${n} - 1));i++)); do
     port_number=$((${p} + $i))
-    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x'
     sleep 0.1
     printf "$GREEN Splash server launched on port $port_number$DEFAULT\n"
 done
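
Note: the two docker run changes work as a pair. --maxrss makes Splash exit once its resident memory passes the limit (the -u value, 3000 Mb by default), and --restart=always has Docker bring the container straight back up, so a full in-memory cache now costs a restart instead of unbounded growth; e.g. sudo ./launch_splash_crawler.sh -f <proxy-profiles dir> -p 8050 -n 3 -u 2000 caps each server at 2000 Mb. A rough way to watch this from outside is Splash's /_debug endpoint, which reports the process RSS; the sketch below polls each server (the host and port range are assumptions matching the loop above, not taken from the commit):

    # Rough monitor for the Splash servers started by the loop above; assumes
    # they answer on consecutive ports from 8050 and that the /_debug endpoint
    # is reachable (it returns JSON that includes the process 'maxrss').
    import requests

    def splash_memory_report(first_port=8050, count=3, host='127.0.0.1'):
        for port in range(first_port, first_port + count):
            url = 'http://{}:{}/_debug'.format(host, port)
            try:
                info = requests.get(url, timeout=5).json()
                print('splash :{} -> maxrss={}'.format(port, info.get('maxrss')))
            except requests.RequestException as e:
                print('splash :{} unreachable ({})'.format(port, e))

    if __name__ == '__main__':
        splash_memory_report()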