From c1b34bd99c7d42d2dc8edf07f296d67da52f28e1 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 29 Jan 2019 15:38:00 +0100 Subject: [PATCH] fix: [Crawler] limit max crawled pages --- bin/Crawler.py | 11 +++++------ bin/torcrawler/TorSplashCrawler.py | 1 + .../hiddenServices/templates/hiddenServices.html | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index d8eda8a7..c69c1de5 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -205,6 +205,11 @@ if __name__ == '__main__': date_month = datetime.datetime.now().strftime("%Y%m") if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain): + # last_father + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) + + # last check + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) crawl_onion(url, domain, date, date_month, message) if url != domain_url: @@ -226,12 +231,6 @@ if __name__ == '__main__': msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste) p.populate_set_out(msg, 'Tags') - # last check - r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) - - # last_father - r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) - # add onion screenshot history # add crawled days if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date: diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 056dd44e..1b77c6ef 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -42,6 +42,7 @@ class TorSplashCrawler(): 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'HTTPERROR_ALLOW_ALL': True, 'RETRY_TIMES': 2, + 'CLOSESPIDER_PAGECOUNT': 1000, 'DEPTH_LIMIT': crawler_depth_limit }) diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html index 54d1d763..58b5937f 100644 --- a/var/www/modules/hiddenServices/templates/hiddenServices.html +++ b/var/www/modules/hiddenServices/templates/hiddenServices.html @@ -426,7 +426,7 @@ function refresh_list_crawled(){ newCell.innerHTML = ""+crawler['crawler_info']+""; newCell = newRow.insertCell(1); - newCell.innerHTML = ""+crawler['crawling_domain']+""; + newCell.innerHTML = ""+crawler['crawling_domain']+""; newCell = newRow.insertCell(2); newCell.innerHTML = "
"+crawler['status_info']+"
";