fix: [Crawler] Restart Splash on failure, limit unbound in memory cache (maxrss)

Terrtia 2019-01-04 15:51:08 +01:00
parent 0358b0cf58
commit b3b75ccbea
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 46 additions and 11 deletions

TorSplashCrawler.py

@@ -10,10 +10,12 @@ import datetime
 import base64
 import redis
 import json
+import time

 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
+from twisted.web._newclient import ResponseNeverReceived

 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
@@ -39,6 +41,7 @@ class TorSplashCrawler():
             'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
+            'RETRY_TIMES': 2,
             'DEPTH_LIMIT': crawler_depth_limit
             })
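
Context for the new RETRY_TIMES setting: Scrapy's built-in RetryMiddleware re-schedules a failed request that many extra times before giving up and invoking the errback. A minimal sketch (not part of the commit) of how such a settings dict is consumed, with an illustrative stand-in where the diff uses a variable:

    # Sketch only; values are illustrative, not the committed configuration.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({
        'RETRY_TIMES': 2,             # RetryMiddleware retries a failed request up to 2 extra times
        'HTTPERROR_ALLOW_ALL': True,  # hand non-2xx responses to the callback instead of filtering them
        'DEPTH_LIMIT': 1,             # stand-in for crawler_depth_limit, read from the AIL config
    })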
@@ -97,7 +100,7 @@ class TorSplashCrawler():
         yield SplashRequest(
             self.start_urls,
             self.parse,
-            #errback=self.errback_catcher,
+            errback=self.errback_catcher,
             endpoint='render.json',
             meta={'father': self.original_paste},
             args={ 'html': 1,
@@ -174,7 +177,7 @@ class TorSplashCrawler():
         yield SplashRequest(
             link.url,
             self.parse,
-            #errback=self.errback_catcher,
+            errback=self.errback_catcher,
             endpoint='render.json',
             meta={'father': relative_filename_paste},
             args={ 'html': 1,
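
The two hunks above wire the previously commented-out errback into both SplashRequest call sites. For reference, Splash's render.json endpoint returns a JSON body whose keys match the args sent, and scrapy-splash exposes the decoded body as response.data, roughly:

    # Sketch only, not part of the commit: reading a render.json response.
    def parse(self, response):
        html = response.data['html']     # rendered DOM, requested via 'html': 1
        png_b64 = response.data['png']   # base64-encoded screenshot, via 'png': 1
        har = response.data['har']       # HAR log of network activity, via 'har': 1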
@@ -184,17 +187,39 @@ class TorSplashCrawler():
                    'wait': 10}
             )

-    '''
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
-        print('failure')
-        #print(failure)
-        print(failure.type)
-        #print(failure.request.meta['item'])
+
+        if failure.check(ResponseNeverReceived):
+            request = failure.request
+            url = request.meta['splash']['args']['url']
+            father = request.meta['father']
+
+            self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
+            time.sleep(10)
+            yield SplashRequest(
+                url,
+                self.parse,
+                errback=self.errback_catcher,
+                endpoint='render.json',
+                meta={'father': father},
+                args={ 'html': 1,
+                       'png': 1,
+                       'render_all': 1,
+                       'har': 1,
+                       'wait': 10}
+            )
+
+        else:
+            print('failure')
+            #print(failure)
+            print(failure.type)
+            #print(failure.request.meta['item'])
+
+        '''
         #if isinstance(failure.value, HttpError):
-        if failure.check(HttpError):
+        elif failure.check(HttpError):
             # you can get the response
             response = failure.value.response
             print('HttpError')
@ -214,7 +239,7 @@ class TorSplashCrawler():
print('TimeoutError') print('TimeoutError')
print(TimeoutError) print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url) self.logger.error('TimeoutError on %s', request.url)
''' '''
def save_crawled_paste(self, filename, content): def save_crawled_paste(self, filename, content):
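
The rewritten errback_catcher leans on two Scrapy/Twisted behaviours: the errback receives a twisted Failure whose .request attribute is the Scrapy request that failed, and whatever a generator errback yields is scheduled like callback output. A stripped-down sketch of that contract (the commit rebuilds a full SplashRequest by hand; Request.replace() here is a simplification):

    # Sketch only, not the committed code.
    from twisted.web._newclient import ResponseNeverReceived

    def errback_catcher(failure):  # a method on the spider in the real code
        if failure.check(ResponseNeverReceived):
            # Splash dropped the connection, e.g. while restarting after --maxrss;
            # yielding from an errback re-queues the request like a callback would.
            yield failure.request.replace(dont_filter=True)

One caveat worth noting: the committed time.sleep(10) blocks the Twisted reactor, so the whole crawler pauses during the 10-second backoff.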

launch_splash_crawler.sh

@@ -5,12 +5,15 @@ usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n
     echo "    -p: number of the first splash server port number. This number is incremented for the others splash server";
     echo "    -n: number of splash servers to start";
     echo "";
+    echo "    -options:";
+    echo "    -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)";
+    echo "";
     echo "example:";
     echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
     exit 1;
 }

-while getopts ":p:f:n:" o; do
+while getopts ":p:f:n:u:" o; do
     case "${o}" in
         p)
             p=${OPTARG}
@@ -21,6 +24,9 @@ while getopts ":p:f:n:" o; do
         n)
             n=${OPTARG}
             ;;
+        u)
+            u=${OPTARG}
+            ;;
         *)
             usage
             ;;
@@ -28,6 +34,10 @@ while getopts ":p:f:n:" o; do
 done
 shift $((OPTIND-1))

+if [ -z "${u}" ]; then
+    u=3000;
+fi
+
 if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
     usage;
 fi
@@ -52,7 +62,7 @@ sleep 0.1

 for ((i=0;i<=$((${n} - 1));i++)); do
     port_number=$((${p} + $i))
-    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x'
     sleep 0.1
     printf "$GREEN    Splash server launched on port $port_number$DEFAULT\n"
 done
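
Taken together: Splash's --maxrss option makes the Splash process exit once its resident set size passes the given number of megabytes, and the added -d / --restart=always Docker flags bring the container straight back up, which is the restart-on-failure behaviour named in the commit title. A hypothetical invocation using the new option (path is a placeholder):

    # Three Splash containers on ports 8050-8052, each recycled at ~2000 Mb RSS.
    sudo ./launch_splash_crawler.sh -f /path/to/proxy-profiles/ -p 8050 -n 3 -u 2000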