mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-10 08:38:28 +00:00

fix: [Crawler] Restart Splash on failure, limit unbound in memory cache (maxrss)

parent: 0358b0cf58
commit: b3b75ccbea

2 changed files with 46 additions and 11 deletions

In short: TorSplashCrawler.py gains a working errback that retries a request when Splash never answers (typically while the container restarts), plus a RETRY_TIMES cap, and launch_splash_crawler.sh gains a -u option that passes Splash's --maxrss limit and runs the containers with --restart=always so an over-limit Splash is recycled automatically.
TorSplashCrawler.py:

@@ -10,10 +10,12 @@ import datetime
 import base64
 import redis
 import json
+import time

 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
+from twisted.web._newclient import ResponseNeverReceived

 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
@@ -39,6 +41,7 @@ class TorSplashCrawler():
             'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
+            'RETRY_TIMES': 2,
             'DEPTH_LIMIT': crawler_depth_limit
             })

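For context, RETRY_TIMES is a stock Scrapy setting consumed by the built-in RetryMiddleware: it caps how many times a failed download is re-scheduled before the failure is handed to the request's errback. A minimal sketch of the same setting in an ordinary spider (spider name and URL are illustrative, not from this commit):

    import scrapy

    class RetryDemoSpider(scrapy.Spider):
        name = 'retry_demo'                   # illustrative name
        start_urls = ['http://example.com/']  # illustrative URL
        custom_settings = {
            'RETRY_TIMES': 2,             # re-schedule a failed request at most twice
            'HTTPERROR_ALLOW_ALL': True,  # pass non-2xx responses to the callback
        }

        def parse(self, response):
            self.logger.info('fetched %s (%d)', response.url, response.status)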
@@ -97,7 +100,7 @@ class TorSplashCrawler():
             yield SplashRequest(
                 self.start_urls,
                 self.parse,
-                #errback=self.errback_catcher,
+                errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
                 args={ 'html': 1,
@@ -174,7 +177,7 @@ class TorSplashCrawler():
                 yield SplashRequest(
                     link.url,
                     self.parse,
-                    #errback=self.errback_catcher,
+                    errback=self.errback_catcher,
                     endpoint='render.json',
                     meta={'father': relative_filename_paste},
                     args={ 'html': 1,
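Both SplashRequest call sites now pass errback=self.errback_catcher instead of leaving it commented out, so download-level failures reach the handler added below. As a refresher on the mechanism, a minimal sketch (spider name and URL are illustrative, not from this commit):

    import scrapy

    class ErrbackDemoSpider(scrapy.Spider):
        name = 'errback_demo'  # illustrative

        def start_requests(self):
            # The errback is invoked with a twisted.python.failure.Failure
            # whenever the download itself fails (DNS error, timeout,
            # dropped connection, ...), instead of the callback.
            yield scrapy.Request('http://example.com/', callback=self.parse,
                                 errback=self.on_error)

        def parse(self, response):
            self.logger.info('got %s', response.url)

        def on_error(self, failure):
            self.logger.error(repr(failure))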
@@ -184,17 +187,39 @@ class TorSplashCrawler():
                        'wait': 10}
                 )

-    '''
     def errback_catcher(self, failure):
         # catch all errback failures,
         self.logger.error(repr(failure))
-        print('failure')
-        #print(failure)
-        print(failure.type)
-        #print(failure.request.meta['item'])

+        if failure.check(ResponseNeverReceived):
+            request = failure.request
+            url = request.meta['splash']['args']['url']
+            father = request.meta['father']
+
+            self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
+            time.sleep(10)
+            yield SplashRequest(
+                url,
+                self.parse,
+                errback=self.errback_catcher,
+                endpoint='render.json',
+                meta={'father': father},
+                args={ 'html': 1,
+                       'png': 1,
+                       'render_all': 1,
+                       'har': 1,
+                       'wait': 10}
+            )
+
+        else:
+            print('failure')
+            #print(failure)
+            print(failure.type)
+            #print(failure.request.meta['item'])
+
+        '''
         #if isinstance(failure.value, HttpError):
-        if failure.check(HttpError):
+        elif failure.check(HttpError):
             # you can get the response
             response = failure.value.response
             print('HttpError')
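Two notes on the new handler. Scrapy treats whatever an errback yields like callback output, so yielding a fresh SplashRequest effectively re-queues the page once Splash is back. Also, time.sleep(10) pauses the whole Twisted reactor, not just this request; that is a coarse but simple way to wait out a Splash restart. Condensed to its essentials, the pattern is (RETRY_DELAY is an illustrative name, not from the commit):

    import time
    from scrapy_splash import SplashRequest
    from twisted.web._newclient import ResponseNeverReceived

    RETRY_DELAY = 10  # seconds; illustrative constant

    def errback_catcher(self, failure):
        # ResponseNeverReceived is what Twisted reports when the connection
        # closes before any response arrives, e.g. while Splash restarts
        # after exceeding --maxrss.
        if failure.check(ResponseNeverReceived):
            url = failure.request.meta['splash']['args']['url']
            time.sleep(RETRY_DELAY)  # blocks the reactor: simple but coarse
            yield SplashRequest(url, self.parse,
                                errback=self.errback_catcher,
                                endpoint='render.json',
                                meta={'father': failure.request.meta['father']},
                                args={'html': 1, 'wait': 10})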
@@ -214,7 +239,7 @@ class TorSplashCrawler():
             print('TimeoutError')
             print(TimeoutError)
             self.logger.error('TimeoutError on %s', request.url)
-    '''
+        '''

     def save_crawled_paste(self, filename, content):

launch_splash_crawler.sh:

@@ -5,12 +5,15 @@ usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n
   echo "    -p: number of the first splash server port number. This number is incremented for the others splash server";
   echo "    -n: number of splash servers to start";
   echo "";
+  echo "    -options:";
+  echo "    -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)";
+  echo "";
   echo "example:";
   echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
   exit 1;
 }

-while getopts ":p:f:n:" o; do
+while getopts ":p:f:n:u:" o; do
     case "${o}" in
         p)
             p=${OPTARG}
@@ -21,6 +24,9 @@ while getopts ":p:f:n:" o; do
         n)
             n=${OPTARG}
             ;;
+        u)
+            u=${OPTARG}
+            ;;
         *)
             usage
             ;;
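A quick reading of the getopts change: in the spec string ":p:f:n:u:", the trailing colon after u means -u expects a value (delivered via $OPTARG), and the leading colon selects silent error handling so unknown options fall through to the *) branch. A minimal sketch of just the new option:

    # leading ':' = silent error reporting; 'u:' = -u takes an argument
    while getopts ":u:" o; do
        case "${o}" in
            u) u=${OPTARG} ;;
            *) usage ;;
        esac
    done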
@@ -28,6 +34,10 @@ while getopts ":p:f:n:" o; do
 done
 shift $((OPTIND-1))

+if [ -z "${u}" ]; then
+    u=3000;
+fi
+
 if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
     usage;
 fi
@@ -52,7 +62,7 @@ sleep 0.1

 for ((i=0;i<=$((${n} - 1));i++)); do
     port_number=$((${p} + $i))
-    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x'
     sleep 0.1
     printf "$GREEN Splash server launched on port $port_number$DEFAULT\n"
 done
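How the pieces fit together: Splash's --maxrss flag makes the Splash process exit once its resident memory exceeds the given number of megabytes, Docker's --restart=always then brings the container straight back up, and -d detaches docker run so each screen window's command returns immediately (the trailing read x keeps the window open). An illustrative invocation using the new -u option (the path is the script's own example; 2000 is an example value):

    sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3 -u 2000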