chg: [Crawler] add default crawler config + update default user_agent

This commit is contained in:
Terrtia 2019-07-24 10:18:10 +02:00
parent d9279823d5
commit 09ecc4d93f
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 22 additions and 4 deletions

View file

@@ -341,13 +341,27 @@ if __name__ == '__main__':
faup = Faup() faup = Faup()
# get HAR files
default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har")
if default_crawler_har:
default_crawler_har = 1
else:
default_crawler_har = 0
# get PNG files
default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png")
if default_crawler_png:
default_crawler_png = 1
else:
default_crawler_png = 0
# Default crawler options # Default crawler options
default_crawler_config = {'html': 1, default_crawler_config = {'html': 1,
'har': 1, 'har': default_crawler_har,
'png': 1, 'png': default_crawler_png,
'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"), 'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
'closespider_pagecount': 50, 'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"),
'user_agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'} 'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
# Track launched crawler # Track launched crawler
r_cache.sadd('all_crawler', splash_port) r_cache.sadd('all_crawler', splash_port)

View file

@@ -252,5 +252,9 @@ db = 0
[Crawler] [Crawler]
activate_crawler = False activate_crawler = False
crawler_depth_limit = 1 crawler_depth_limit = 1
default_crawler_har = True
default_crawler_png = True
default_crawler_closespider_pagecount = 50
default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0
splash_url = http://127.0.0.1 splash_url = http://127.0.0.1
splash_port = 8050-8052 splash_port = 8050-8052