fix: [Crawler] lower max pages crawled to 50, fix first_seen check

2025-01-31 14:36:15 +00:00 · 2019-01-29 17:04:45 +01:00 · 2019-01-29 17:04:45 +01:00 · 92d192238b
commit 92d192238b
parent 6c7086f4eb
2 changed files with 5 additions and 5 deletions
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@ -209,16 +209,16 @@ if __name__ == '__main__':
                    date_month = datetime.datetime.now().strftime("%Y%m")

                    if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
+                        # first seen
+                        if not r_onion.hexists('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen'):
+                            r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
+
                        # last_father
                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)

                        # last check
                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

-                        # first seen
-                        if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
-                            r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
-
                        crawl_onion(url, domain, date, date_month, message)
                        if url != domain_url:
                            print(url)
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@ -42,7 +42,7 @@ class TorSplashCrawler():
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': 1000,
+            'CLOSESPIDER_PAGECOUNT': 50,
            'DEPTH_LIMIT': crawler_depth_limit
            })