fix: [Crawler] Restart Splash on failure, limit unbound in memory cache (maxrss)

Terrtia 2019-01-04 15:51:08 +01:00
parent 0358b0cf58
commit b3b75ccbea
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 46 additions and 11 deletions

TorSplashCrawler.py

@@ -10,10 +10,12 @@ import datetime
 import base64
 import redis
 import json
+import time

 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
+from twisted.web._newclient import ResponseNeverReceived

 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
@@ -39,6 +41,7 @@ class TorSplashCrawler():
             'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
+            'RETRY_TIMES': 2,
             'DEPTH_LIMIT': crawler_depth_limit
             })
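
Context for the new RETRY_TIMES setting: Scrapy's built-in RetryMiddleware re-schedules a failed request that many extra times before giving up and invoking the errback. A minimal sketch (not part of the commit) of how such a settings dict is consumed, with an illustrative stand-in where the diff uses a variable:

    # Sketch only; values are illustrative, not the committed configuration.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({
        'RETRY_TIMES': 2,             # RetryMiddleware retries a failed request up to 2 extra times
        'HTTPERROR_ALLOW_ALL': True,  # hand non-2xx responses to the callback instead of filtering them
        'DEPTH_LIMIT': 1,             # stand-in for crawler_depth_limit, read from the AIL config
    })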
@@ -97,7 +100,7 @@ class TorSplashCrawler():
         yield SplashRequest(
             self.start_urls,
             self.parse,
-            #errback=self.errback_catcher,
+            errback=self.errback_catcher,
             endpoint='render.json',
             meta={'father': self.original_paste},
             args={ 'html': 1,
@@ -174,7 +177,7 @@ class TorSplashCrawler():
         yield SplashRequest(
             link.url,
             self.parse,
-            #errback=self.errback_catcher,
+            errback=self.errback_catcher,
             endpoint='render.json',
             meta={'father': relative_filename_paste},
             args={ 'html': 1,
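
The two hunks above wire the previously commented-out errback into both SplashRequest call sites. For reference, Splash's render.json endpoint returns a JSON body whose keys match the args sent, and scrapy-splash exposes the decoded body as response.data, roughly:

    # Sketch only, not part of the commit: reading a render.json response.
    def parse(self, response):
        html = response.data['html']     # rendered DOM, requested via 'html': 1
        png_b64 = response.data['png']   # base64-encoded screenshot, via 'png': 1
        har = response.data['har']       # HAR log of network activity, via 'har': 1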
@@ -184,17 +187,39 @@ class TorSplashCrawler():
                    'wait': 10}
             )

-    '''
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
-        print('failure')
-        #print(failure)
-        print(failure.type)
-        #print(failure.request.meta['item'])
+
+        if failure.check(ResponseNeverReceived):
+            request = failure.request
+            url = request.meta['splash']['args']['url']
+            father = request.meta['father']
+
+            self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
+            time.sleep(10)
+            yield SplashRequest(
+                url,
+                self.parse,
+                errback=self.errback_catcher,
+                endpoint='render.json',
+                meta={'father': father},
+                args={ 'html': 1,
+                       'png': 1,
+                       'render_all': 1,
+                       'har': 1,
+                       'wait': 10}
+            )
+
+        else:
+            print('failure')
+            #print(failure)
+            print(failure.type)
+            #print(failure.request.meta['item'])
+
+        '''
         #if isinstance(failure.value, HttpError):
-        if failure.check(HttpError):
+        elif failure.check(HttpError):
             # you can get the response
             response = failure.value.response
             print('HttpError')
@ -214,7 +239,7 @@ class TorSplashCrawler():
print('TimeoutError') print('TimeoutError')
print(TimeoutError) print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url) self.logger.error('TimeoutError on %s', request.url)
''' '''
def save_crawled_paste(self, filename, content): def save_crawled_paste(self, filename, content):
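
The rewritten errback_catcher leans on two Scrapy/Twisted behaviours: the errback receives a twisted Failure whose .request attribute is the Scrapy request that failed, and whatever a generator errback yields is scheduled like callback output. A stripped-down sketch of that contract (the commit rebuilds a full SplashRequest by hand; Request.replace() here is a simplification):

    # Sketch only, not the committed code.
    from twisted.web._newclient import ResponseNeverReceived

    def errback_catcher(failure):  # a method on the spider in the real code
        if failure.check(ResponseNeverReceived):
            # Splash dropped the connection, e.g. while restarting after --maxrss;
            # yielding from an errback re-queues the request like a callback would.
            yield failure.request.replace(dont_filter=True)

One caveat worth noting: the committed time.sleep(10) blocks the Twisted reactor, so the whole crawler pauses during the 10-second backoff.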

launch_splash_crawler.sh

@@ -5,12 +5,15 @@ usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n
     echo "    -p: number of the first splash server port number. This number is incremented for the others splash server";
     echo "    -n: number of splash servers to start";
     echo "";
+    echo "    -options:";
+    echo "    -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)";
+    echo "";
     echo "example:";
     echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
     exit 1;
 }

-while getopts ":p:f:n:" o; do
+while getopts ":p:f:n:u:" o; do
     case "${o}" in
         p)
             p=${OPTARG}
@@ -21,6 +24,9 @@ while getopts ":p:f:n:" o; do
         n)
             n=${OPTARG}
             ;;
+        u)
+            u=${OPTARG}
+            ;;
         *)
             usage
             ;;
@@ -28,6 +34,10 @@ while getopts ":p:f:n:" o; do
 done
 shift $((OPTIND-1))

+if [ -z "${u}" ]; then
+    u=3000;
+fi
+
 if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
     usage;
 fi
@@ -52,7 +62,7 @@ sleep 0.1

 for ((i=0;i<=$((${n} - 1));i++)); do
     port_number=$((${p} + $i))
-    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x'
     sleep 0.1
     printf "$GREEN    Splash server launched on port $port_number$DEFAULT\n"
 done
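
Taken together: Splash's --maxrss option makes the Splash process exit once its resident set size passes the given number of megabytes, and the added -d / --restart=always Docker flags bring the container straight back up, which is the restart-on-failure behaviour named in the commit title. A hypothetical invocation using the new option (path is a placeholder):

    # Three Splash containers on ports 8050-8052, each recycled at ~2000 Mb RSS.
    sudo ./launch_splash_crawler.sh -f /path/to/proxy-profiles/ -p 8050 -n 3 -u 2000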