diff --git a/bin/Crawler.py b/bin/Crawler.py
index c11144f9..14fd0b3a 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -4,6 +4,8 @@
 import os
 import sys
 import re
+import uuid
+import json
 import redis
 import datetime
 import time
@@ -30,19 +32,39 @@ def load_blacklist(service_type):
 
 # Extract info form url (url, domain, domain url, ...)
 def unpack_url(url):
+    to_crawl = {}
     faup.decode(url)
     url_unpack = faup.get()
-    domain = url_unpack['domain'].decode()
+    to_crawl['domain'] = url_unpack['domain'].decode()
+
     if url_unpack['scheme'] is None:
-        to_crawl['scheme'] = url_unpack['scheme']
-        to_crawl['url']= 'http://{}'.format(url)
-        to_crawl['domain_url'] = 'http://{}'.format(domain)
+        to_crawl['scheme'] = 'http'
     else:
-        to_crawl['scheme'] = url_unpack['scheme']
-        to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url)
-        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], domain)
-    to_crawl['port'] = url_unpack['port']
-    to_crawl['tld'] = url_unpack['tld'].deocode()
+        scheme = url_unpack['scheme'].decode()
+        if scheme in default_proto_map:
+            to_crawl['scheme'] = scheme
+        else:
+            to_crawl['scheme'] = 'http'
+
+    if url_unpack['port'] is None:
+        to_crawl['port'] = default_proto_map[to_crawl['scheme']]
+    else:
+        port = url_unpack['port'].decode()
+        # Verify port number #################### make function to verify/correct port number
+        try:
+            int(port)
+        # Invalid port Number
+        except Exception as e:
+            port = default_proto_map[to_crawl['scheme']]
+        to_crawl['port'] = port
+
+    if url_unpack['query_string'] is None:
+        to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
+    else:
+        to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode())
+    to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], to_crawl['domain'], to_crawl['port'])
+
+    to_crawl['tld'] = url_unpack['tld'].decode()
     return to_crawl
 
 # get url, paste and service_type to crawl
@@ -70,20 +92,45 @@ def get_elem_to_crawl(rotation_mode):
             url, paste = splitted
             if paste:
                 paste = paste.replace(PASTES_FOLDER+'/', '')
-        else:
-            url = message
-            paste = 'requested'
 
         message = {'url': url, 'paste': paste, 'type_service': domain_service_type, 'original_message': message}
 
     return message
 
-def load_crawler_config(service_type, domain, paste):
+def get_crawler_config(redis_server, mode, service_type, domain):
+    crawler_options = {}
+    config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
+    if config is None:
+        config = {}
+    else:
+        config = json.loads(config)
+    for option in default_crawler_config:
+        if option in config:
+            crawler_options[option] = config[option]
+        else:
+            crawler_options[option] = default_crawler_config[option]
+    return crawler_options
+
+def load_crawler_config(service_type, domain, paste, date):
+    crawler_config = {}
+    crawler_config['splash_url'] = splash_url
+    crawler_config['item'] = paste
+    crawler_config['service_type'] = service_type
+    crawler_config['domain'] = domain
+    crawler_config['date'] = date
+
     # Auto and Manual Crawling
-    if paste is None:
+    # Auto ################################################# create new entry, next crawling => here or when ended ?
+    if paste == 'auto':
+        crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain)
+        crawler_config['requested'] = True
+    # Manual
+    elif paste == 'manual':
+        crawler_config['crawler_options'] = get_crawler_config(r_cache, 'manual', service_type, domain)
         crawler_config['requested'] = True
     # default crawler
     else:
+        crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'default', service_type, domain)
         crawler_config['requested'] = False
     return crawler_config
 
@@ -108,17 +155,13 @@ def on_error_send_message_back_in_queue(type_service, domain, message):
     redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
     redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)
 
-##########################################################################################################<
 def crawl_onion(url, domain, message, crawler_config):
+    crawler_config['url'] = url
     print('Launching Crawler: {}'.format(url))
 
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
-    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
-    if super_father is None:
-        super_father=paste
-
     retry = True
     nb_retry = 0
     while retry:
@@ -144,7 +187,11 @@ def crawl_onion(url, domain, message, crawler_config):
 
     if r.status_code == 200:
         r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
-        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_service, url, domain, paste, super_father],
+        # save config in cash
+        UUID = str(uuid.uuid4())
+        r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
+
+        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', UUID],
                                    stdout=subprocess.PIPE)
         while process.poll() is None:
             time.sleep(1)
@@ -193,16 +240,15 @@ def search_potential_source_domain(type_service, domain):
 if __name__ == '__main__':
 
     if len(sys.argv) != 3:
-        print('usage:', 'Crawler.py', 'type_service (onion or i2p or regular)', 'splash_port')
+        print('usage:', 'Crawler.py', 'mode', 'splash_port')
         exit(1)
 ##################################################
-    type_service = sys.argv[1]
+    #mode = sys.argv[1]
     splash_port = sys.argv[2]
 
     rotation_mode = ['onion', 'regular']
-
-    default_port = ['http': 80, 'https': 443]
-
+    default_proto_map = {'http': 80, 'https': 443}
+######################################################## add ftp ???
###################################################################
 
     # TODO: port
     publisher.port = 6380
@@ -216,8 +262,6 @@ if __name__ == '__main__':
     splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     print('splash url: {}'.format(splash_url))
 
-    faup = Faup()
-
     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
 
     r_serv_metadata = redis.StrictRedis(
@@ -238,6 +282,16 @@ if __name__ == '__main__':
         db=p.config.getint("ARDB_Onion", "db"),
         decode_responses=True)
 
+    faup = Faup()
+
+    # Default crawler options
+    default_crawler_config = {'html': 1,
+                              'har': 1,
+                              'png': 1,
+                              'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
+                              'closespider_pagecount': 50,
+                              'user_agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}
+
     # Track launched crawler
     r_cache.sadd('all_crawler', splash_port)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
@@ -250,8 +304,12 @@ if __name__ == '__main__':
     while True:
 
         to_crawl = get_elem_to_crawl(rotation_mode)
-        if to_crawl_dict:
+        if to_crawl:
+            print(to_crawl)
+            print(to_crawl['url'])
             url_data = unpack_url(to_crawl['url'])
+            print('url')
+            print(url_data)
             # remove domain from queue
             redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain'])
 
@@ -265,20 +323,21 @@ if __name__ == '__main__':
             print('domain_url: {}'.format(url_data['domain_url']))
 
             # Check blacklist
-            if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain'])):
-                date = {'date_day'= datetime.datetime.now().strftime("%Y%m%d"),
-                        'date_month'= datetime.datetime.now().strftime("%Y%m"),
-                        'epoch'= int(time.time())}
+            if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
+                date = {'date_day': datetime.datetime.now().strftime("%Y%m%d"),
+                        'date_month': datetime.datetime.now().strftime("%Y%m"),
+                        'epoch': int(time.time())}
 
-                crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'])
+                crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
+                print(crawler_config)
                 # check if default crawler
-                if not crawler_config['requested']:
-                    # Auto crawl only if service not up this month
-                    if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
-                        continue
+                #if not crawler_config['requested']:
+                #    # Auto crawl only if service not up this month
+                #    if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
+                #        continue
 
-                set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'], crawler_config)
+                set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'])
 
                 #### CRAWLER ####
 
@@ -287,14 +346,14 @@ if __name__ == '__main__':
 
 ######################################################crawler strategy
                     # CRAWL domain
-                    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])
+                    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
 
                 # Default Crawler
                 else:
                     # CRAWL domain
-                    crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'])
-                    if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
-                        crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])
+                    crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+                    #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
+                    #    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])
 
 ################################################### handle port
 
@@ -304,34 +363,34 @@ if __name__ == '__main__':
                 #### ####
 
-            # Save last_status day (DOWN)
-            if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
-                redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['day']), url_data['domain'])
+                # Save last_status day (DOWN)
+                if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
+                    redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])
 
-            # if domain was UP at least one time
-            if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
-                # add crawler history (if domain is down)
-                if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
-                    # Domain is down
-                    redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
+                # if domain was UP at least one time
+                if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
+                    # add crawler history (if domain is down)
+                    if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
+                        # Domain is down
+                        redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
 
                 ############################
                 # extract page content
                 ############################
 
-            # update list, last crawled domains
-            redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain'])
-            redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
+                # update list, last crawled domains
+                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain'])
+                redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
 
-            #update crawler status
-            r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
-            r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
-        else:
-            print(' Blacklisted Domain')
-            print()
-            print()
+                #update crawler status
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+                r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+                time.sleep(60)
             else:
-                continue
+                print(' Blacklisted Domain')
+                print()
+                print()
+
+        else:
             time.sleep(1)
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 92e2e57c..911285f8 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -76,10 +76,12 @@ class HiddenServices(object):
             pass
 
     def get_origin_paste_name(self):
-        origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
-        if origin_paste is None:
+        origin_item = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
+        if origin_item is None:
             return ''
-        return origin_paste.replace(self.paste_directory+'/', '')
+        elif origin_item == 'auto' or origin_item == 'manual':
+            return origin_item
+        return origin_item.replace(self.paste_directory+'/', '')
 
     def get_domain_tags(self, update=False):
         if not update:
@@ -88,18 +90,36 @@ class HiddenServices(object):
             self.get_last_crawled_pastes()
         return self.tags
 
-    def update_domain_tags(self, children):
-        p_tags = self.r_serv_metadata.smembers('tag:'+children)
+    def update_domain_tags(self, item):
+        if self.r_serv_metadata.exists('tag:{}'.format(item)):
+            p_tags = self.r_serv_metadata.smembers('tag:{}'.format(item))
+        # update path here
+        else:
+            # need to remove it
+            if self.paste_directory in item:
+                p_tags = self.r_serv_metadata.smembers('tag:{}'.format(item.replace(self.paste_directory+'/', '')))
+            # need to remove it
+            else:
+                p_tags = self.r_serv_metadata.smembers('tag:{}'.format(os.path.join(self.paste_directory, item)))
+        print(p_tags)
         for tag in p_tags:
             self.tags[tag] = self.tags.get(tag, 0) + 1
 
     #todo use the right paste
     def get_last_crawled_pastes(self):
-        paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
-        #paste_parent = paste_parent.replace(self.paste_directory, '')[1:]
-        return self.get_all_pastes_domain(paste_parent)
+        paste_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)[0]
+        return self.get_all_pastes_domain(paste_root)
 
-    def get_all_pastes_domain(self, father):
+    def get_all_pastes_domain(self, root_item):
+        if root_item is None:
+            return []
+        l_crawled_pastes = []
+        l_crawled_pastes = self.get_item_crawled_children(root_item)
+        l_crawled_pastes.append(root_item)
+        self.update_domain_tags(root_item)
+        return l_crawled_pastes
+
+    def get_item_crawled_children(self, father):
         if father is None:
             return []
         l_crawled_pastes = []
@@ -112,9 +132,10 @@ class HiddenServices(object):
             if self.domain in children:
                 l_crawled_pastes.append(children)
                 self.update_domain_tags(children)
-                l_crawled_pastes.extend(self.get_all_pastes_domain(children))
+                l_crawled_pastes.extend(self.get_item_crawled_children(children))
         return l_crawled_pastes
 
+    # experimental
     def get_domain_son(self, l_paste):
         if l_paste is None:
             return None
@@ -133,6 +154,7 @@ class HiddenServices(object):
 
         return set_domain
 
+    '''
     def get_all_domain_son(self, father):
         if father is None:
             return []
@@ -149,6 +171,7 @@ class HiddenServices(object):
             l_crawled_pastes.extend(self.get_all_domain_son(children))
 
         return l_crawled_pastes
+    '''
 
     def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1):
         l_screenshot_paste = []
@@ -176,6 +199,7 @@ class HiddenServices(object):
             l_crawled_pastes = []
         return l_crawled_pastes
 
+    '''
     def get_last_crawled_pastes_fileSearch(self):
 
         last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check')
@@ -185,3 +209,4 @@ class HiddenServices(object):
         pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
         l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f]
         return l_crawled_pastes
+    '''
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 9b3ee389..3800c0bc 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -46,23 +46,23 @@ class TorSplashCrawler():
             'DEPTH_LIMIT': crawler_options['depth_limit']
             })
 
-    def crawl(self, type, crawler_options, url, domain, original_paste, super_father):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+    def crawl(self, type, crawler_options, date, url, domain, original_item):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item)
         self.process.start()
 
     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'
 
-        def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs):
+        def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
             self.type = type
-            self.original_paste = original_paste
-            self.super_father = super_father
+            self.original_item = original_item
             self.root_key = None
             self.start_urls = url
             self.domains = [domain]
-            date = datetime.datetime.now().strftime("%Y/%m/%d")
-            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
-            self.date_month = datetime.datetime.now().strftime("%Y%m")
+            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
+            self.full_date = date['date_day']
+            self.date_month = date['date_month']
+            self.date_epoch = int(date['epoch'])
             self.arg_crawler = {
               'html': crawler_options['html'],
               'wait': 10,
@@ -97,12 +97,12 @@ class TorSplashCrawler():
                 db=self.p.config.getint("ARDB_Onion", "db"),
                 decode_responses=True)
 
-            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )
+            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
 
             self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
-                            self.p.config.get("Directories", "crawled"), date )
+                            self.p.config.get("Directories", "crawled"), date_str )
 
-            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )
+            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
 
     def start_requests(self):
         yield SplashRequest(
@@ -110,7 +110,7 @@ class TorSplashCrawler():
                 self.parse,
                 errback=self.errback_catcher,
                 endpoint='render.json',
-                meta={'father': self.original_paste, 'root_key': None},
+                meta={'father': self.original_item, 'root_key': None},
                 args=self.arg_crawler
             )
 
@@ -153,7 +153,7 @@ class TorSplashCrawler():
             if self.root_key is None:
                 self.root_key = relative_filename_paste
                 # Create/Update crawler history
-                self.r_serv_onion.zadd('crawler_history_{}:{}'.format(type_service, domain), int(date['epoch']), self.root_key)
+                self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key)
 
             #create paste metadata
             self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 7331115b..480bbe34 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -3,16 +3,15 @@
 import os
 import sys
+import json
+import redis
 import configparser
 from TorSplashCrawler import TorSplashCrawler
 
-tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
-default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
-
 if __name__ == '__main__':
 
-    if len(sys.argv) != 7:
-        print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father')
+    if len(sys.argv) != 2:
+        print('usage:', 'tor_crawler.py', 'uuid')
         exit(1)
 
     configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -24,36 +23,27 @@ if __name__ == '__main__':
     cfg = configparser.ConfigParser()
     cfg.read(configfile)
 
-    splash_url = sys.argv[1]
-    type = sys.argv[2]
+    redis_cache = redis.StrictRedis(
+        host=cfg.get("Redis_Cache", "host"),
+        port=cfg.getint("Redis_Cache", "port"),
+        db=cfg.getint("Redis_Cache", "db"),
+        decode_responses=True)
 
-    url = sys.argv[3]
-    domain = sys.argv[4]
-    paste = sys.argv[5]
-    super_father = sys.argv[6]
+    # get crawler config key
+    uuid = sys.argv[1]
 
-    if crawler_options is None:
-        crawler_options = default_crawler_options
+    # get configs
+    crawler_json = json.loads(redis_cache.get('crawler_request:{}'.format(uuid)))
+    splash_url = crawler_json['splash_url']
+    service_type = crawler_json['service_type']
+    url = crawler_json['url']
+    domain = crawler_json['domain']
+    original_item = crawler_json['item']
+    crawler_options = crawler_json['crawler_options']
+    date = crawler_json['date']
-
-
-    redis_crawler.exists('crawler_option_manual:{}:{}'.format(service_type, domain)):
-crawler_config['mode_name'] = 'auto'
-    crawler_config['requested'] = True
-
-
-
-
-
-
-
-
-
-
-
-    crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
-    crawler_options['user_agent'] = tor_browser_agent
+    redis_cache.delete('crawler_request:{}'.format(uuid))
 
     crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(type, crawler_options, url, domain, paste, super_father)
+    crawler.crawl(service_type, crawler_options, date, url, domain, original_item)
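
For readers following the change: the patch replaces the long tor_crawler.py argument list with a single UUID that points at a JSON blob stored in the Redis cache. The sketch below shows that hand-off in isolation, under the assumption of a locally reachable Redis instance standing in for the "Redis_Cache" server of config.cfg; the key name and the config fields mirror the diff, while the connection parameters and the concrete example values are illustrative only.

#!/usr/bin/env python3
# Minimal, self-contained round-trip of the crawler_request:<uuid> hand-off.
import json
import uuid

import redis

# assumption: local Redis stands in for the "Redis_Cache" server from config.cfg
r_cache = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

# Producer side (Crawler.py): serialise the per-crawl config under a one-off key
# and pass only the UUID to the child process on its command line.
crawler_config = {
    'splash_url': 'http://127.0.0.1:8050',   # example value
    'service_type': 'onion',
    'url': 'http://example.onion:80',
    'domain': 'example.onion',
    'item': 'manual',
    'date': {'date_day': '20190101', 'date_month': '201901', 'epoch': 1546300800},
    'crawler_options': {'html': 1, 'har': 1, 'png': 1, 'depth_limit': 1,
                        'closespider_pagecount': 50, 'user_agent': 'Mozilla/5.0 ...'},
}
request_uuid = str(uuid.uuid4())
r_cache.set('crawler_request:{}'.format(request_uuid), json.dumps(crawler_config))

# Consumer side (tor_crawler.py): load the config back from the UUID, then delete
# the key so each crawl request is single-use.
loaded = json.loads(r_cache.get('crawler_request:{}'.format(request_uuid)))
r_cache.delete('crawler_request:{}'.format(request_uuid))
assert loaded == crawler_config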