ail-framework/bin/torcrawler/tor_crawler.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
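
"""Launch a Splash crawl for a queued AIL crawler request.

Usage: tor_crawler.py <uuid>

The uuid identifies a 'crawler_request:<uuid>' JSON entry in the Redis
cache; depending on the url's TLD, the request is handed to the I2P or
the Tor/web Splash crawler.
"""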
import os
import sys
import json
from TorSplashCrawler import TorSplashCrawler
from I2pSplashCrawler import I2pSplashCrawler
from pyfaup.faup import Faup
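
# make AIL's internal libraries (ConfigLoader, crawlers) importable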
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
import crawlers

if __name__ == '__main__':

    if len(sys.argv) != 2:
        print('usage:', 'tor_crawler.py', 'uuid')
        sys.exit(1)

    faup = Faup()
    config_loader = ConfigLoader.ConfigLoader()
    redis_cache = config_loader.get_redis_conn("Redis_Cache")
    # release the config loader, only the Redis connection is needed
    config_loader = None

    # get crawler config key (the request uuid passed on the command line)
    uuid = sys.argv[1]

    # fetch the crawler request pushed to the Redis cache
    crawler_request = redis_cache.get('crawler_request:{}'.format(uuid))
    if crawler_request is None:
        print('Error: no crawler_request found for uuid {}'.format(uuid), file=sys.stderr)
        sys.exit(1)
    crawler_json = json.loads(crawler_request)

    splash_url = crawler_json['splash_url']
    service_type = crawler_json['service_type']
    url = crawler_json['url']
    domain = crawler_json['domain']
    port = crawler_json['port']
    original_item = crawler_json['item']
    crawler_options = crawler_json['crawler_options']
    date = crawler_json['date']
    requested_mode = crawler_json['requested']

    # load the cookies attached to this request, if a cookiejar was provided
    if crawler_options['cookiejar_uuid']:
        cookies = crawlers.load_crawler_cookies(crawler_options['cookiejar_uuid'], domain, crawler_type=service_type)
    else:
        cookies = []

    # the request has been consumed, remove it from the cache
    redis_cache.delete('crawler_request:{}'.format(uuid))

    # pick the crawler to launch based on the url's TLD (i2p vs tor/web)
    faup.decode(url)
    unpack_url = faup.get()
    try:
        # pyfaup returns bytes or str depending on the version
        tld = unpack_url['tld'].decode()
    except AttributeError:
        tld = unpack_url['tld']

    if tld == "i2p":
        try:
            crawler = I2pSplashCrawler(splash_url, crawler_options)
            crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
        except Exception as e:
            # report the failure on stdout (for the logs) and on stderr
            print(e)
            print(e, file=sys.stderr)
    else:
        try:
            crawler = TorSplashCrawler(splash_url, crawler_options)
            crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
        except Exception as e:
            print(e)
            print(e, file=sys.stderr)