mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
76 lines
2.1 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*-coding:UTF-8 -*
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import redis
|
|
from TorSplashCrawler import TorSplashCrawler
|
|
from I2pSplashCrawler import I2pSplashCrawler
|
|
|
|
from pyfaup.faup import Faup
|
|
|
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
|
import ConfigLoader
|
|
import crawlers
|
|
|
|
if __name__ == '__main__':
    # Entry point: launch a single crawl job previously queued in the Redis
    # cache under 'crawler_request:<uuid>'. The uuid is the only CLI argument.
    if len(sys.argv) != 2:
        print('usage:', 'tor_crawler.py', 'uuid')
        exit(1)

    faup = Faup()

    # Open the Redis cache connection, then drop the loader reference.
    config_loader = ConfigLoader.ConfigLoader()
    redis_cache = config_loader.get_redis_conn("Redis_Cache")
    config_loader = None

    # get crawler config key
    uuid = sys.argv[1]

    # get configs (the request payload is a JSON blob stored by the queuing side)
    crawler_json = json.loads(redis_cache.get('crawler_request:{}'.format(uuid)))

    splash_url = crawler_json['splash_url']
    service_type = crawler_json['service_type']
    url = crawler_json['url']
    domain = crawler_json['domain']
    port = crawler_json['port']
    original_item = crawler_json['item']
    crawler_options = crawler_json['crawler_options']
    date = crawler_json['date']
    requested_mode = crawler_json['requested']

    # Load cookies only when a cookiejar was requested.
    # .get() instead of [] so a request without the key falls through to no cookies
    # rather than raising KeyError.
    if crawler_options.get('cookiejar_uuid'):
        cookies = crawlers.load_crawler_cookies(crawler_options['cookiejar_uuid'], domain, crawler_type=service_type)
    else:
        cookies = []

    # The request is consumed: remove it from the cache before crawling.
    redis_cache.delete('crawler_request:{}'.format(uuid))

    # get crawler_mode: parse the URL to extract its TLD
    faup.decode(url)
    unpack_url = faup.get()

    # Depending on the pyfaup version, 'tld' may come back as bytes or str
    # (or None for a bare host). Decode only when it is actually bytes —
    # the original bare `except:` hid this distinction and swallowed every error.
    tld = unpack_url['tld']
    if isinstance(tld, bytes):
        tld = tld.decode()

    # Dispatch on TLD: .i2p domains go through the I2P crawler, everything
    # else through the Tor crawler. Both share the same constructor and
    # crawl() signature, so select the class and run one code path.
    if tld == "i2p":
        crawler_class = I2pSplashCrawler
    else:
        crawler_class = TorSplashCrawler

    try:
        crawler = crawler_class(splash_url, crawler_options)
        crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
    except Exception as e:
        # Report the failure on both stdout and stderr (matches original behavior).
        print(e)
        print(e, file=sys.stderr)