From 05079c143c562b74abe1e95e018bbe9e7cdc7d89 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Tue, 14 May 2019 16:06:20 +0200 Subject: [PATCH 1/4] Fix #314: replace characters on Redis encoding errors. Fall back to the local file on other errors. --- bin/packages/Paste.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index f8be2f9b..884bd5ee 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -70,6 +70,7 @@ class Paste(object): host=cfg.get("Redis_Queues", "host"), port=cfg.getint("Redis_Queues", "port"), db=cfg.getint("Redis_Queues", "db"), + encoding_errors='replace', decode_responses=True) self.store = redis.StrictRedis( host=cfg.get("Redis_Data_Merging", "host"), @@ -125,7 +126,13 @@ class Paste(object): """ - paste = self.cache.get(self.p_path) + try: + paste = self.cache.get(self.p_path) + except Exception as e: + print("ERROR in: " + self.p_path) + print(e) + paste = None + if paste is None: try: with gzip.open(self.p_path, 'r') as f: From 6092f482e6037a76b28b067ebe4be425772e2ef3 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Wed, 15 May 2019 09:57:18 +0200 Subject: [PATCH 2/4] Fix crawler rotation Before this, the crawler processed all prioritized onions first and only then started on the prioritized regular ones. --- bin/Crawler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index d8e6a430..31f2d594 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -12,6 +12,7 @@ import time import subprocess import requests +from collections import deque from pyfaup.faup import Faup sys.path.append(os.environ['AIL_BIN']) @@ -303,7 +304,7 @@ if __name__ == '__main__': #mode = sys.argv[1] splash_port = sys.argv[1] - rotation_mode = ['onion', 'regular'] + rotation_mode = deque(["onion", "regular"]) default_proto_map = {'http': 80, 'https': 443} ######################################################## add ftp ??? 
@@ -361,6 +362,7 @@ if __name__ == '__main__': update_auto_crawler() + rotation_mode.rotate() to_crawl = get_elem_to_crawl(rotation_mode) if to_crawl: url_data = unpack_url(to_crawl['url']) From 7765ab92e0d2a171cadc30b48f6c0560b1e908c2 Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Wed, 15 May 2019 10:00:51 +0200 Subject: [PATCH 3/4] Oops, use single quotes :) --- bin/Crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 31f2d594..e5864059 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -304,7 +304,7 @@ if __name__ == '__main__': #mode = sys.argv[1] splash_port = sys.argv[1] - rotation_mode = deque(["onion", "regular"]) + rotation_mode = deque(['onion', 'regular']) default_proto_map = {'http': 80, 'https': 443} ######################################################## add ftp ??? From f61d830678b636ab777534a1fe39a51f8c54d8dd Mon Sep 17 00:00:00 2001 From: kovacsbalu Date: Thu, 16 May 2019 14:24:03 +0200 Subject: [PATCH 4/4] Use the default encoding error handling from redis. --- bin/packages/Paste.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 884bd5ee..1087880b 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -70,7 +70,6 @@ class Paste(object): host=cfg.get("Redis_Queues", "host"), port=cfg.getint("Redis_Queues", "port"), db=cfg.getint("Redis_Queues", "db"), - encoding_errors='replace', decode_responses=True) self.store = redis.StrictRedis( host=cfg.get("Redis_Data_Merging", "host"), @@ -128,6 +127,8 @@ class Paste(object): try: paste = self.cache.get(self.p_path) + except UnicodeDecodeError: + paste = None except Exception as e: print("ERROR in: " + self.p_path) print(e)