diff --git a/bin/Crawler.py b/bin/Crawler.py
index 721a4415..1f61a50a 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -79,7 +79,7 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
     r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
     r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)

-def crawl_onion(url, domain, date, date_month, message):
+def crawl_onion(url, domain, date, date_month, message, mode):

     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
@@ -166,7 +166,8 @@ if __name__ == '__main__':
     publisher.info("Script Crawler started")

     # load domains blacklist
-    load_type_blacklist(type_hidden_service)
+    load_type_blacklist('onions')
+    load_type_blacklist('regular')

     splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     print('splash url: {}'.format(splash_url))
@@ -180,16 +181,15 @@ if __name__ == '__main__':
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))

-
     while True:

         if mode == 'automatic':
             # Priority Queue - Recovering the streamed message informations.
             message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))

-
+            # Recovering the streamed message informations.
             if message is None:
-                # Recovering the streamed message informations.
                 message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+
         else:
             pass

@@ -244,16 +244,16 @@ if __name__ == '__main__':
                 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

                 # Launch Scrapy-Splash Crawler
-                crawl_onion(url, domain, date, date_month, message)
+                crawl_onion(url, domain, date, date_month, message, mode)
                 # Crawl Domain
                 if url != domain_url:
                     #Crawl Domain with port number
                     if port is not None:
                         print('{}:{}'.format(domain_url, port))
-                        crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
+                        crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message, mode)
                     #Crawl without port number
                     print(domain_url)
-                    crawl_onion(domain_url, domain, date, date_month, message)
+                    crawl_onion(domain_url, domain, date, date_month, message, mode)

                 # update last check
                 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@@ -293,14 +293,9 @@ if __name__ == '__main__':
                     r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
                     print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

-                    # update list, last crawled sites
-                    r_onion.lpush('last_{}'.format(type_hidden_service), domain)
-                    r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
-                # manual
-                else:
-                    # update list, last crawled sites
-                    r_onion.lpush('last_crawled_manual', domain)
-                    r_onion.ltrim('last_crawled_manual', 0, 15)
+                # update list, last crawled sites
+                r_onion.lpush('last_{}'.format(type_hidden_service), domain)
+                r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)

                 #update crawler status
                 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index b5a5c1f9..42d9a6af 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -28,10 +28,10 @@ from Helper import Process

 class TorSplashCrawler():

-    def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
+    def __init__(self, splash_url, crawler_options):
         self.process = CrawlerProcess({'LOG_ENABLED': False})
         self.crawler = Crawler(self.TorSplashSpider, {
-            'USER_AGENT': user_agent,
+            'USER_AGENT': crawler_options['user_agent'],
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@@ -42,18 +42,18 @@ class TorSplashCrawler():
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
-            'DEPTH_LIMIT': crawler_depth_limit
+            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
+            'DEPTH_LIMIT': crawler_options['depth_limit']
             })

-    def crawl(self, type, url, domain, original_paste, super_father):
-        self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+    def crawl(self, type, crawler_options, url, domain, original_paste, super_father):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
         self.process.start()

     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'

-        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
+        def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs):
             self.type = type
             self.original_paste = original_paste
             self.super_father = super_father
@@ -63,6 +63,12 @@ class TorSplashCrawler():
             self.full_date = datetime.datetime.now().strftime("%Y%m%d")
             self.date_month = datetime.datetime.now().strftime("%Y%m")

+            self.arg_crawler = {  'html': crawler_options['html'],
+                                  'wait': 10,
+                                  'render_all': 1,
+                                  'har': crawler_options['har'],
+                                  'png': crawler_options['png']}
+
             config_section = 'Crawler'
             self.p = Process(config_section)

@@ -104,11 +110,7 @@ class TorSplashCrawler():
                 errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
-                args={  'html': 1,
-                        'wait': 10,
-                        'render_all': 1,
-                        'har': 1,
-                        'png': 1}
+                args=self.arg_crawler
             )

         def parse(self,response):
@@ -131,6 +133,7 @@ class TorSplashCrawler():

                 relative_filename_paste = os.path.join(self.crawler_path, UUID)
                 filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

+                # # TODO: modify me
                 # save new paste on disk
                 if self.save_crawled_paste(filename_paste, response.data['html']):
@@ -158,14 +161,16 @@ class TorSplashCrawler():
                     if not os.path.exists(dirname):
                         os.makedirs(dirname)

-                    size_screenshot = (len(response.data['png'])*3) /4
+                    if 'png' in response.data:
+                        size_screenshot = (len(response.data['png'])*3) /4

-                    if size_screenshot < 5000000: #bytes
-                        with open(filename_screenshot, 'wb') as f:
-                            f.write(base64.standard_b64decode(response.data['png'].encode()))
+                        if size_screenshot < 5000000: #bytes
+                            with open(filename_screenshot, 'wb') as f:
+                                f.write(base64.standard_b64decode(response.data['png'].encode()))

-                    with open(filename_screenshot+'har.txt', 'wb') as f:
-                        f.write(json.dumps(response.data['har']).encode())
+                    if 'har' in response.data:
+                        with open(filename_screenshot+'har.txt', 'wb') as f:
+                            f.write(json.dumps(response.data['har']).encode())

                     # save external links in set
                     #lext = LinkExtractor(deny_domains=self.domains, unique=True)
@@ -181,11 +186,7 @@
                         errback=self.errback_catcher,
                         endpoint='render.json',
                         meta={'father': relative_filename_paste},
-                        args={  'html': 1,
-                                'png': 1,
-                                'render_all': 1,
-                                'har': 1,
-                                'wait': 10}
+                        args=self.arg_crawler
                     )

         def errback_catcher(self, failure):
@@ -205,11 +206,7 @@
                     errback=self.errback_catcher,
                     endpoint='render.json',
                     meta={'father': father},
-                    args={  'html': 1,
-                            'png': 1,
-                            'render_all': 1,
-                            'har': 1,
-                            'wait': 10}
+                    args=self.arg_crawler
                 )

             else:
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 99bda837..e8a7d96b 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -6,6 +6,9 @@ import sys
 import configparser
 from TorSplashCrawler import TorSplashCrawler

+tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
+default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
+
 if __name__ == '__main__':

     if len(sys.argv) != 7:
@@ -23,17 +26,17 @@ if __name__ == '__main__':
     splash_url = sys.argv[1]
     type = sys.argv[2]

-    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")

     url = sys.argv[3]
     domain = sys.argv[4]
     paste = sys.argv[5]
     super_father = sys.argv[6]
+
+    if crawler_options is None:
+        crawler_options = default_crawler_options

-    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
-    user_agent = tor_browser_agent
+    crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
+    crawler_options['user_agent'] = tor_browser_agent

-    closespider_pagecount = 50
-
-    crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
-    crawler.crawl(type, url, domain, paste, super_father)
+    crawler = TorSplashCrawler(splash_url, crawler_options)
+    crawler.crawl(type, crawler_options, url, domain, paste, super_father)
diff --git a/var/www/modules/Tags/templates/Tags.html b/var/www/modules/Tags/templates/Tags.html
index 5af8dcf6..0628b4f2 100644
--- a/var/www/modules/Tags/templates/Tags.html
+++ b/var/www/modules/Tags/templates/Tags.html
@@ -2,30 +2,27 @@
-
-
-    Tags - AIL
-
-
-
-
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
-    {% include 'navbar.html' %}
+    {% include 'nav_bar.html' %}
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 8d4508de..484056e0 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -142,6 +142,10 @@ def get_crawler_splash_status(mode, type):
 def hiddenServices_page_test():
     return render_template("Crawler_index.html")

+@hiddenServices.route("/crawlers/manual", methods=['GET'])
+def manual():
+    return render_template("Crawler_Splash_manual.html")
+
 @hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
 def crawler_splash_onion():
     last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
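
For reference, a minimal sketch (not part of the patch) of how the refactored entry point passes the new crawler_options dict into TorSplashCrawler after this change. The Splash URL, onion URL, paste and parent identifiers below are placeholder values, and depth_limit would normally be read from the crawler_depth_limit config option rather than hard-coded:

# illustrative sketch only -- placeholder values are assumptions, not part of the patch
from TorSplashCrawler import TorSplashCrawler

tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
crawler_options['depth_limit'] = 1                  # normally cfg.getint("Crawler", "crawler_depth_limit")
crawler_options['user_agent'] = tor_browser_agent

# constructor and crawl() signatures as introduced by this patch
crawler = TorSplashCrawler('http://127.0.0.1:8050', crawler_options)   # placeholder Splash URL
crawler.crawl('onion', crawler_options, 'http://example.onion', 'example.onion',
              'paste_id', 'super_father_id')                           # placeholder identifiers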