2018-08-09 15:42:21 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
2019-02-25 15:38:50 +00:00
|
|
|
import json
|
|
|
|
import redis
|
2018-08-09 15:42:21 +00:00
|
|
|
import configparser
|
|
|
|
from TorSplashCrawler import TorSplashCrawler
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run one crawl whose parameters were stored as a JSON
    # document in the Redis cache under 'crawler_request:<uuid>'.
    #
    # Usage: tor_crawler.py <uuid>

    if len(sys.argv) != 2:
        print('usage:', 'tor_crawler.py', 'uuid')
        # Use sys.exit rather than the bare exit() helper: the latter is
        # injected by the `site` module and is not guaranteed to exist.
        sys.exit(1)

    # Locate the configuration file relative to $AIL_BIN. An unset variable is
    # reported with the same helpful message as a missing file instead of a
    # bare KeyError.
    ail_bin = os.environ.get('AIL_BIN')
    configfile = None
    if ail_bin is not None:
        configfile = os.path.join(ail_bin, 'packages/config.cfg')
    if configfile is None or not os.path.exists(configfile):
        # Adjacent string literals (instead of backslash continuations inside
        # one literal) keep source indentation out of the message text.
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    redis_cache = redis.StrictRedis(
        host=cfg.get("Redis_Cache", "host"),
        port=cfg.getint("Redis_Cache", "port"),
        db=cfg.getint("Redis_Cache", "db"),
        decode_responses=True)

    # get crawler config key
    uuid = sys.argv[1]
    request_key = 'crawler_request:{}'.format(uuid)

    # get configs
    raw_request = redis_cache.get(request_key)
    if raw_request is None:
        # The key is absent (never written, already consumed, or expired):
        # fail with a clear message instead of a TypeError from json.loads.
        sys.exit('crawler request not found in cache: {}'.format(request_key))
    crawler_json = json.loads(raw_request)

    # Read every field before deleting the request, so a malformed request
    # (missing key -> KeyError) is left in the cache for inspection.
    splash_url = crawler_json['splash_url']
    service_type = crawler_json['service_type']
    url = crawler_json['url']
    domain = crawler_json['domain']
    port = crawler_json['port']
    original_item = crawler_json['item']
    crawler_options = crawler_json['crawler_options']
    date = crawler_json['date']

    # The request is single-use: remove it once it has been fully read.
    redis_cache.delete(request_key)

    crawler = TorSplashCrawler(splash_url, crawler_options)
    crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item)
|