From b3b75ccbeac8615eef720358299f5de959af4b07 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 4 Jan 2019 15:51:08 +0100
Subject: [PATCH] fix: [Crawler] Restart Splash on failure, limit unbound in
 memory cache (maxrss)

---
 bin/torcrawler/TorSplashCrawler.py      | 43 +++++++++++++++++++------
 bin/torcrawler/launch_splash_crawler.sh | 14 ++++++--
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 47486dd9..056dd44e 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -10,10 +10,12 @@ import datetime
 import base64
 import redis
 import json
+import time
 
 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
+from twisted.web._newclient import ResponseNeverReceived
 
 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
@@ -39,6 +41,7 @@ class TorSplashCrawler():
             'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
+            'RETRY_TIMES': 2,
             'DEPTH_LIMIT': crawler_depth_limit
             })
 
@@ -97,7 +100,7 @@ class TorSplashCrawler():
             yield SplashRequest(
                 self.start_urls,
                 self.parse,
-                #errback=self.errback_catcher,
+                errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
                 args={  'html': 1,
@@ -174,7 +177,7 @@ class TorSplashCrawler():
                         yield SplashRequest(
                             link.url,
                             self.parse,
-                            #errback=self.errback_catcher,
+                            errback=self.errback_catcher,
                             endpoint='render.json',
                             meta={'father': relative_filename_paste},
                             args={  'html': 1,
@@ -184,17 +187,39 @@ class TorSplashCrawler():
                                     'wait': 10}
                         )
 
-        '''
         def errback_catcher(self, failure):
             # catch all errback failures,
             self.logger.error(repr(failure))
-            print('failure')
-            #print(failure)
-            print(failure.type)
-            #print(failure.request.meta['item'])
+
+            if failure.check(ResponseNeverReceived):
+                request = failure.request
+                url = request.meta['splash']['args']['url']
+                father = request.meta['father']
+
+                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
+                time.sleep(10)
+                yield SplashRequest(
+                    url,
+                    self.parse,
+                    errback=self.errback_catcher,
+                    endpoint='render.json',
+                    meta={'father': father},
+                    args={  'html': 1,
+                            'png': 1,
+                            'render_all': 1,
+                            'har': 1,
+                            'wait': 10}
+                )
+
+            else:
+                print('failure')
+                #print(failure)
+                print(failure.type)
+                #print(failure.request.meta['item'])
+            '''
 
             #if isinstance(failure.value, HttpError):
-            if failure.check(HttpError):
+            elif failure.check(HttpError):
                 # you can get the response
                 response = failure.value.response
                 print('HttpError')
@@ -214,7 +239,7 @@ class TorSplashCrawler():
             print('TimeoutError')
             print(TimeoutError)
             self.logger.error('TimeoutError on %s', request.url)
-        '''
+            '''
 
         def save_crawled_paste(self, filename, content):
 
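Review note on the Python change: 'RETRY_TIMES': 2 and the re-enabled errback work together. ResponseNeverReceived subclasses Twisted's ResponseFailed, which Scrapy's stock RetryMiddleware treats as retryable, so a dropped Splash connection is resent transparently before errback_catcher ever sees the failure; the errback is the last line of defense once those retries are exhausted. A minimal sketch of that relationship (not part of the patch, assuming a Scrapy release contemporary with it, where EXCEPTIONS_TO_RETRY is a class attribute):

    # Sanity-check why RETRY_TIMES already covers dropped Splash
    # connections before the errback fires.
    from twisted.web._newclient import ResponseFailed, ResponseNeverReceived
    from scrapy.downloadermiddlewares.retry import RetryMiddleware

    # ResponseNeverReceived is a ResponseFailed, and ResponseFailed is in the
    # default retryable set, so RetryMiddleware resends such requests
    # RETRY_TIMES times before the failure reaches errback_catcher.
    print(issubclass(ResponseNeverReceived, ResponseFailed))      # True
    print(ResponseFailed in RetryMiddleware.EXCEPTIONS_TO_RETRY)  # True

Two cautions on the errback itself: time.sleep(10) runs on the Twisted reactor thread, so every in-flight request in the crawler stalls for those 10 seconds; and the re-yielded SplashRequest may be dropped by the SplashAwareDupeFilter, since an identical fingerprint was already scheduled, unless dont_filter=True is added.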
diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh
index e47efc36..5c7f21ee 100755
--- a/bin/torcrawler/launch_splash_crawler.sh
+++ b/bin/torcrawler/launch_splash_crawler.sh
@@ -5,12 +5,15 @@ usage() { echo "Usage: sudo $0 [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]";
           echo "    -p: number of the first splash server port number. This number is incremented for the others splash server";
           echo "    -n: number of splash servers to start";
           echo "";
+          echo "    -options:";
+          echo "    -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)";
+          echo "";
           echo "example:";
           echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
           exit 1;
         }
 
-while getopts ":p:f:n:" o; do
+while getopts ":p:f:n:u:" o; do
     case "${o}" in
         p)
             p=${OPTARG}
@@ -21,6 +24,9 @@ while getopts ":p:f:n:" o; do
         n)
             n=${OPTARG}
             ;;
+        u)
+            u=${OPTARG}
+            ;;
         *)
             usage
             ;;
@@ -28,6 +34,10 @@ while getopts ":p:f:n:" o; do
 done
 shift $((OPTIND-1))
 
+if [ -z "${u}" ]; then
+    u=3000;
+fi
+
 if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
     usage;
 fi
@@ -52,7 +62,7 @@ sleep 0.1
 
 for ((i=0;i<=$((${n} - 1));i++)); do
     port_number=$((${p} + $i))
-    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x'
     sleep 0.1
     printf "$GREEN    Splash server launched on port $port_number$DEFAULT\n"
 done
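Usage note: -u is optional and defaults to 3000. An example invocation, reusing the illustrative path from the script's own usage text, that starts three Splash servers on ports 8050-8052 with the in-memory cache capped at 2000 Mb:

    sudo ./launch_splash_crawler.sh \
        -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ \
        -p 8050 -n 3 -u 2000

The two new docker flags work as a pair: Splash's --maxrss makes the process exit once its resident memory passes the limit, and --restart=always lets Docker bring the container straight back up, so a Splash instance whose cache has grown unbounded recycles itself instead of running into the hard 4.5G --memory cap.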