From 09ecc4d93facb135fdfa7218706f594698de8efa Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 24 Jul 2019 10:18:10 +0200
Subject: [PATCH] chg: [Crawler] add default crawler config + update default user_agent

---
Reviewer notes (this section sits between the "---" marker and the diff and
is not part of the commit message):

* bin/Crawler.py: the default 'har' and 'png' crawler options are now read
  from the [Crawler] config section with getboolean() and converted to the
  integers 1/0 before being stored in default_crawler_config, next to the
  hard-coded 'html': 1 entry. 'closespider_pagecount' and 'user_agent' are
  read from the same section instead of being hard-coded.
* The conversion rebinds default_crawler_har / default_crawler_png from the
  boolean returned by getboolean() to an integer through an if/else; the
  resulting dictionary values are 1 when the option is true and 0 otherwise.
* The four new options are read without a fallback argument, and only
  config.cfg.sample gains the keys in this patch.
  NOTE(review): an already deployed config.cfg that lacks these keys
  presumably makes the crawler fail at start-up - confirm against the
  object behind p.config and mention the new keys in the upgrade notes.
* The sample default user agent is Firefox 60.0; the previously hard-coded
  value in Crawler.py was Firefox 24.0.

 bin/Crawler.py                 | 22 ++++++++++++++++++----
 bin/packages/config.cfg.sample |  4 ++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/bin/Crawler.py b/bin/Crawler.py
index fa0a796d..848d2b67 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -341,13 +341,27 @@ if __name__ == '__main__':
 
     faup = Faup()
 
+    # get HAR files
+    default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har")
+    if default_crawler_har:
+        default_crawler_har = 1
+    else:
+        default_crawler_har = 0
+
+    # get PNG files
+    default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png")
+    if default_crawler_png:
+        default_crawler_png = 1
+    else:
+        default_crawler_png = 0
+
     # Default crawler options
     default_crawler_config = {'html': 1,
-                              'har': 1,
-                              'png': 1,
+                              'har': default_crawler_har,
+                              'png': default_crawler_png,
                               'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
-                              'closespider_pagecount': 50,
-                              'user_agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}
+                              'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"),
+                              'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
 
     # Track launched crawler
     r_cache.sadd('all_crawler', splash_port)
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index ea0ea55c..b6f26231 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -252,5 +252,9 @@ db = 0
 [Crawler]
 activate_crawler = False
 crawler_depth_limit = 1
+default_crawler_har = True
+default_crawler_png = True
+default_crawler_closespider_pagecount = 50
+default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0
 splash_url = http://127.0.0.1
 splash_port = 8050-8052