fix: [Crawler] change max pages crawled

This commit is contained in:
Terrtia 2019-01-29 17:04:45 +01:00
parent 6c7086f4eb
commit 92d192238b
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 5 additions and 5 deletions

View file

@@ -209,16 +209,16 @@ if __name__ == '__main__':
 date_month = datetime.datetime.now().strftime("%Y%m")
 if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
+# first seen
+if not r_onion.hexists('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen'):
+r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
 # last_father
 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
 # last check
 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
-# first seen
-if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
-r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
 crawl_onion(url, domain, date, date_month, message)
 if url != domain_url:
 print(url)

View file

@@ -42,7 +42,7 @@ class TorSplashCrawler():
 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
 'HTTPERROR_ALLOW_ALL': True,
 'RETRY_TIMES': 2,
-'CLOSESPIDER_PAGECOUNT': 1000,
+'CLOSESPIDER_PAGECOUNT': 50,
 'DEPTH_LIMIT': crawler_depth_limit
 })