From 8b1c10b38c4fd5f3a34fe8ee9f52061e78fb01ec Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 9 Aug 2018 17:42:21 +0200 Subject: [PATCH 01/28] chg: [Onion] add onion splash crawler --- bin/Crawler.py | 92 ++++++++++++++ bin/Onion.py | 8 +- bin/packages/config.cfg.sample | 12 ++ bin/packages/modules.cfg | 7 +- bin/torcrawler/TorSplashCrawler.py | 165 ++++++++++++++++++++++++++ bin/torcrawler/tor_crawler.py | 33 ++++++ etc/splash/proxy-profiles/default.ini | 4 + 7 files changed, 319 insertions(+), 2 deletions(-) create mode 100755 bin/Crawler.py create mode 100644 bin/torcrawler/TorSplashCrawler.py create mode 100755 bin/torcrawler/tor_crawler.py create mode 100644 etc/splash/proxy-profiles/default.ini diff --git a/bin/Crawler.py b/bin/Crawler.py new file mode 100755 index 00000000..92d43a81 --- /dev/null +++ b/bin/Crawler.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import redis +import datetime +import time +import subprocess + +sys.path.append(os.environ['AIL_BIN']) +from Helper import Process +from pubsublogger import publisher + + +def signal_handler(sig, frame): + sys.exit(0) + +if __name__ == '__main__': + + publisher.port = 6380 + publisher.channel = "Script" + + publisher.info("Script Crawler started") + + config_section = 'Crawler' + + # Setup the I/O queues + p = Process(config_section) + + splash_url = p.config.get("Crawler", "splash_url") + http_proxy = p.config.get("Crawler", "http_proxy") + crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") + + #signal.signal(signal.SIGINT, signal_handler) + + r_serv_metadata = redis.StrictRedis( + host=p.config.get("ARDB_Metadata", "host"), + port=p.config.getint("ARDB_Metadata", "port"), + db=p.config.getint("ARDB_Metadata", "db"), + decode_responses=True) + + r_cache = redis.StrictRedis( + host=p.config.get("Redis_Cache", "host"), + port=p.config.getint("Redis_Cache", "port"), + db=p.config.getint("Redis_Cache", "db"), + decode_responses=True) + + r_onion = redis.StrictRedis( + host=p.config.get("ARDB_Onion", "host"), + port=p.config.getint("ARDB_Onion", "port"), + db=p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + + while True: + + message = p.get_from_set() + # Recovering the streamed message informations. + if message is not None: + splitted = message.split(';') + if len(splitted) == 2: + url, paste = splitted + + print(url) + + if not r_cache.exists(url): + super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') + if super_father is None: + super_father=paste + + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father], + stdout=subprocess.PIPE) + while process.poll() is None: + time.sleep(1) + + date = datetime.datetime.now().strftime("%Y%m%d") + print(date) + url_domain = url.replace('http://', '') + if process.returncode == 0: + if r_serv_metadata.exists('paste_children:'+paste): + msg = 'infoleak:automatic-detection="onion";{}'.format(paste) + p.populate_set_out(msg, 'Tags') + + r_onion.sadd('onion_up:'+date , url_domain) + else: + r_onion.sadd('onion_down:'+date , url_domain) + print(process.stdout.read()) + + else: + continue + else: + time.sleep(1) diff --git a/bin/Onion.py b/bin/Onion.py index 277f1c71..dbedf1e1 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -21,7 +21,6 @@ Requirements *Need the ZMQ_Sub_Onion_Q Module running to be able to work properly. 
""" -import pprint import time from packages import Paste from pubsublogger import publisher @@ -123,6 +122,7 @@ if __name__ == "__main__": PST = Paste.Paste(filename) for x in PST.get_regex(url_regex): + print(x) # Extracting url with regex url, s, credential, subdomain, domain, host, port, \ resource_path, query_string, f1, f2, f3, f4 = x @@ -149,12 +149,18 @@ if __name__ == "__main__": to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name) + ''' for url in fetch(p, r_cache, urls, domains_list, path): publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) p.populate_set_out(msg, 'Tags') + ''' + for url in urls: + msg = '{};{}'.format(url,PST.p_path) + print('send to crawler') + p.populate_set_out(msg, 'Crawler') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 2ed662c1..62ea0887 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -3,6 +3,8 @@ bloomfilters = Blooms dicofilters = Dicos pastes = PASTES base64 = BASE64 +crawled = crawled +crawled_screenshot = CRAWLED_SCREENSHOT wordtrending_csv = var/www/static/csv/wordstrendingdata wordsfile = files/wordfile @@ -171,6 +173,11 @@ host = localhost port = 6382 db = 8 +[ARDB_Onion] +host = localhost +port = 6382 +db = 9 + [Url] cc_critical = DE @@ -215,3 +222,8 @@ channel = FetchedOnion host = localhost port = 6381 db = 0 + +[Crawler] +crawler_depth_limit = 1 +splash_url = http://127.0.0.1:8050 +http_proxy = http://127.0.0.1:9050 diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 452850f7..d8acf2dc 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags [Onion] subscribe = Redis_Onion -publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags +publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler #publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler [DumpValidOnion] @@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags [submit_paste] subscribe = Redis publish = Redis_Mixer + +[Crawler] +subscribe = Redis_Crawler +publish = Redis_Mixer,Redis_Tags + diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py new file mode 100644 index 00000000..ace36056 --- /dev/null +++ b/bin/torcrawler/TorSplashCrawler.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import gzip +import base64 +import uuid +import datetime +import base64 +import redis +from urllib.parse import urlparse + +from scrapy import Spider +from scrapy.linkextractors import LinkExtractor +from scrapy.crawler import CrawlerProcess, Crawler + +from twisted.internet import reactor + +from scrapy_splash import SplashRequest + +sys.path.append(os.environ['AIL_BIN']) +from Helper import Process + +class TorSplashCrawler(): + + def __init__(self, splash_url, http_proxy, crawler_depth_limit): + self.process = CrawlerProcess({'LOG_ENABLED': False}) + self.crawler = Crawler(self.TorSplashSpider, { + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0', + 'SPLASH_URL': splash_url, + 'HTTP_PROXY': http_proxy, + 'ROBOTSTXT_OBEY': False, + 'DOWNLOADER_MIDDLEWARES': 
{'scrapy_splash.SplashCookiesMiddleware': 723, + 'scrapy_splash.SplashMiddleware': 725, + 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, + }, + 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, + 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', + 'DEPTH_LIMIT': crawler_depth_limit + }) + + def crawl(self, url, original_paste, super_father): + self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father) + self.process.start() + + class TorSplashSpider(Spider): + name = 'TorSplashSpider' + + def __init__(self, url, original_paste, super_father, *args, **kwargs): + self.original_paste = original_paste + self.super_father = super_father + self.start_urls = url + self.domains = [urlparse(url).netloc] + date = datetime.datetime.now().strftime("%Y/%m/%d") + + config_section = 'Crawler' + self.p = Process(config_section) + + self.r_cache = redis.StrictRedis( + host=self.p.config.get("Redis_Cache", "host"), + port=self.p.config.getint("Redis_Cache", "port"), + db=self.p.config.getint("Redis_Cache", "db"), + decode_responses=True) + + self.r_serv_log_submit = redis.StrictRedis( + host=self.p.config.get("Redis_Log_submit", "host"), + port=self.p.config.getint("Redis_Log_submit", "port"), + db=self.p.config.getint("Redis_Log_submit", "db"), + decode_responses=True) + + self.r_serv_metadata = redis.StrictRedis( + host=self.p.config.get("ARDB_Metadata", "host"), + port=self.p.config.getint("ARDB_Metadata", "port"), + db=self.p.config.getint("ARDB_Metadata", "db"), + decode_responses=True) + + self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), + self.p.config.get("Directories", "crawled"), date ) + + self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date ) + + def start_requests(self): + yield SplashRequest( + self.start_urls, + self.parse, + endpoint='render.json', + meta={'parent': self.original_paste}, + args={ 'html': 1, + 'wait': 10, + 'render_all': 1, + 'png': 1} + ) + + def parse(self,response): + print(response.headers) + print(response.status) + + self.r_cache.setbit(response.url, 0, 1) + self.r_cache.expire(response.url, 360000) + + UUID = self.domains[0]+str(uuid.uuid4()) + filename_paste = os.path.join(self.crawled_paste_filemame, UUID) + filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') + + # save new paste on disk + if self.save_crawled_paste(filename_paste, response.data['html']): + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent']) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) + + self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste) + + dirname = os.path.dirname(filename_screenshot) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(filename_screenshot, 'wb') as f: + f.write(base64.standard_b64decode(response.data['png'].encode())) + + # save external links in set + lext = LinkExtractor(deny_domains=self.domains, unique=True) + for link in lext.extract_links(response): + self.r_serv_metadata.sadd('paste_crawler:filename_paste', link) + + #le = LinkExtractor(unique=True) + le = LinkExtractor(allow_domains=self.domains, unique=True) + for link in le.extract_links(response): + self.r_cache.setbit(link, 
0, 0) + self.r_cache.expire(link, 360000) + yield SplashRequest( + link.url, + self.parse, + endpoint='render.json', + meta={'parent': UUID}, + args={ 'html': 1, + 'png': 1, + 'render_all': 1, + 'wait': 10} + ) + + def save_crawled_paste(self, filename, content): + + print(filename) + if os.path.isfile(filename): + print('File: {} already exist in submitted pastes'.format(filename)) + return False + + try: + gzipencoded = gzip.compress(content.encode()) + gzip64encoded = base64.standard_b64encode(gzipencoded).decode() + except: + print("file error: {}".format(filename)) + return False + + # send paste to Global + relay_message = "{0} {1}".format(filename, gzip64encoded) + self.p.populate_set_out(relay_message, 'Mixer') + + # increase nb of paste by feeder name + self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) + + # tag crawled paste + msg = 'infoleak:submission="crawler";{}'.format(filename) + self.p.populate_set_out(msg, 'Tags') + return True diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py new file mode 100755 index 00000000..3085f213 --- /dev/null +++ b/bin/torcrawler/tor_crawler.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import configparser +from TorSplashCrawler import TorSplashCrawler + +if __name__ == '__main__': + + if len(sys.argv) != 4: + print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father') + exit(1) + + configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') + if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? \ + Or activate the virtualenv.') + + cfg = configparser.ConfigParser() + cfg.read(configfile) + + splash_url = cfg.get("Crawler", "splash_url") + http_proxy = cfg.get("Crawler", "http_proxy") + crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit") + + url = sys.argv[1] + paste = sys.argv[2] + super_father = sys.argv[3] + + crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit) + crawler.crawl(url, paste, super_father) diff --git a/etc/splash/proxy-profiles/default.ini b/etc/splash/proxy-profiles/default.ini new file mode 100644 index 00000000..91208135 --- /dev/null +++ b/etc/splash/proxy-profiles/default.ini @@ -0,0 +1,4 @@ +[proxy] +host=localhost +port=9050 +type=SOCKS5 From 765208943344fb56bfad652d79aecdac83a38364 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 13 Aug 2018 09:23:14 +0200 Subject: [PATCH 02/28] chg: [Onion] change onion regex, fix crawler --- bin/Crawler.py | 65 +++++++++++++++++++++--------- bin/Onion.py | 2 +- bin/torcrawler/TorSplashCrawler.py | 38 +++++++++++++---- bin/torcrawler/tor_crawler.py | 11 ++--- 4 files changed, 82 insertions(+), 34 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 92d43a81..a8292b74 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -3,6 +3,7 @@ import os import sys +import re import redis import datetime import time @@ -16,6 +17,33 @@ from pubsublogger import publisher def signal_handler(sig, frame): sys.exit(0) +def crawl_onion(url, domain): + date = datetime.datetime.now().strftime("%Y%m%d") + + if not r_onion.sismember('onion_up:'+date , domain): + super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') + if super_father is None: + super_father=paste + + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father], + stdout=subprocess.PIPE) + while process.poll() is None: + time.sleep(1) + + if 
process.returncode == 0: + if r_serv_metadata.exists('paste_children:'+paste): + msg = 'infoleak:automatic-detection="onion";{}'.format(paste) + p.populate_set_out(msg, 'Tags') + print(process.stdout.read()) + + r_onion.sadd('onion_up:'+date , domain) + r_onion.sadd('onion_up_link:'+date , url) + else: + r_onion.sadd('onion_down:'+date , domain) + r_onion.sadd('onion_down_link:'+date , url) + print(process.stdout.read()) + + if __name__ == '__main__': publisher.port = 6380 @@ -52,6 +80,9 @@ if __name__ == '__main__': db=p.config.getint("ARDB_Onion", "db"), decode_responses=True) + url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_regex) + while True: message = p.get_from_set() @@ -61,30 +92,24 @@ if __name__ == '__main__': if len(splitted) == 2: url, paste = splitted - print(url) + url_list = re.findall(url_regex, url)[0] + if url_list[1] == '': + url= 'http://{}'.format(url) - if not r_cache.exists(url): - super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') - if super_father is None: - super_father=paste + link, s, credential, subdomain, domain, host, port, \ + resource_path, query_string, f1, f2, f3, f4 = url_list + domain = url_list[4] - process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father], - stdout=subprocess.PIPE) - while process.poll() is None: - time.sleep(1) + domain_url = 'http://{}'.format(domain) - date = datetime.datetime.now().strftime("%Y%m%d") - print(date) - url_domain = url.replace('http://', '') - if process.returncode == 0: - if r_serv_metadata.exists('paste_children:'+paste): - msg = 'infoleak:automatic-detection="onion";{}'.format(paste) - p.populate_set_out(msg, 'Tags') + print('------------------START ONIOM CRAWLER------------------') + print('url: {}'.format(url)) + print('domain: {}'.format(domain)) + print('domain_url: {}'.format(domain_url)) - r_onion.sadd('onion_up:'+date , url_domain) - else: - r_onion.sadd('onion_down:'+date , url_domain) - print(process.stdout.read()) + crawl_onion(url, domain) + if url != domain_url: + crawl_onion(domain_url, domain) else: continue diff --git a/bin/Onion.py b/bin/Onion.py index dbedf1e1..1e2dff32 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -108,7 +108,7 @@ if __name__ == "__main__": # Thanks to Faup project for this regex # https://github.com/stricaud/faup - url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + url_regex = 
"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" while True: if message is not None: diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index ace36056..63839799 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -9,14 +9,11 @@ import uuid import datetime import base64 import redis -from urllib.parse import urlparse from scrapy import Spider from scrapy.linkextractors import LinkExtractor from scrapy.crawler import CrawlerProcess, Crawler -from twisted.internet import reactor - from scrapy_splash import SplashRequest sys.path.append(os.environ['AIL_BIN']) @@ -40,19 +37,20 @@ class TorSplashCrawler(): 'DEPTH_LIMIT': crawler_depth_limit }) - def crawl(self, url, original_paste, super_father): - self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father) + def crawl(self, url, domain, original_paste, super_father): + self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father) self.process.start() class TorSplashSpider(Spider): name = 'TorSplashSpider' - def __init__(self, url, original_paste, super_father, *args, **kwargs): + def __init__(self, url, domain,original_paste, super_father, *args, **kwargs): self.original_paste = original_paste self.super_father = super_father self.start_urls = url - self.domains = [urlparse(url).netloc] + self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") + self.full_date = datetime.datetime.now().strftime("%Y%m%d") config_section = 'Crawler' self.p = Process(config_section) @@ -75,6 +73,12 @@ class TorSplashCrawler(): db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) + self.r_serv_onion = redis.StrictRedis( + host=self.p.config.get("ARDB_Onion", "host"), + port=self.p.config.getint("ARDB_Onion", "port"), + db=self.p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date ) @@ -96,6 +100,7 @@ class TorSplashCrawler(): print(response.headers) print(response.status) + # # TODO: # FIXME: self.r_cache.setbit(response.url, 0, 1) self.r_cache.expire(response.url, 360000) @@ -105,8 +110,19 @@ class TorSplashCrawler(): # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): + + # create onion metadata + if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])): + self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date) + self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date) + + # add onion screenshot history + self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date) + + #create paste metadata self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent']) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', 
self.domains[0]) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste) @@ -114,6 +130,13 @@ class TorSplashCrawler(): dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) + + print(sys.getsizeof(response.data['png'])) + print(sys.getsizeof(response.data['html'])) + print(self.domains[0]) + + + with open(filename_screenshot, 'wb') as f: f.write(base64.standard_b64decode(response.data['png'].encode())) @@ -140,7 +163,6 @@ class TorSplashCrawler(): def save_crawled_paste(self, filename, content): - print(filename) if os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format(filename)) return False diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py index 3085f213..57a77e76 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler if __name__ == '__main__': - if len(sys.argv) != 4: - print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father') + if len(sys.argv) != 5: + print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father') exit(1) configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') @@ -26,8 +26,9 @@ if __name__ == '__main__': crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit") url = sys.argv[1] - paste = sys.argv[2] - super_father = sys.argv[3] + domain = sys.argv[2] + paste = sys.argv[3] + super_father = sys.argv[4] crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit) - crawler.crawl(url, paste, super_father) + crawler.crawl(url, domain, paste, super_father) From ed559d9f4a9d2b2cd36918c6559fe49fe4fbf140 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 16 Aug 2018 17:24:39 +0200 Subject: [PATCH 03/28] chg: [Showpaste] add screenshot + improve onion db --- .gitignore | 1 + bin/Crawler.py | 88 ++++++++++++++----- bin/packages/Paste.py | 4 + bin/torcrawler/TorSplashCrawler.py | 67 ++++++++++---- var/www/modules/Flask_config.py | 8 ++ var/www/modules/showpaste/Flask_showpaste.py | 20 ++++- .../showpaste/templates/show_saved_paste.html | 36 ++++++++ 7 files changed, 186 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index e74906ae..b5755ee6 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ var/www/submitted # Local config bin/packages/config.cfg configs/keys +files # installed files nltk_data/ diff --git a/bin/Crawler.py b/bin/Crawler.py index a8292b74..df1e0117 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -8,6 +8,7 @@ import redis import datetime import time import subprocess +import requests sys.path.append(os.environ['AIL_BIN']) from Helper import Process @@ -17,31 +18,40 @@ from pubsublogger import publisher def signal_handler(sig, frame): sys.exit(0) -def crawl_onion(url, domain): - date = datetime.datetime.now().strftime("%Y%m%d") +def crawl_onion(url, domain, date): - if not r_onion.sismember('onion_up:'+date , domain): + if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain): + #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') if super_father is None: super_father=paste - process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father], - stdout=subprocess.PIPE) 
- while process.poll() is None: - time.sleep(1) + try: + r = requests.get(splash_url , timeout=0.010) + except Exception: + ## FIXME: # TODO: relaunch docker + exit(0) - if process.returncode == 0: - if r_serv_metadata.exists('paste_children:'+paste): - msg = 'infoleak:automatic-detection="onion";{}'.format(paste) - p.populate_set_out(msg, 'Tags') - print(process.stdout.read()) + if r.status_code == 200: + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father], + stdout=subprocess.PIPE) + while process.poll() is None: + time.sleep(1) - r_onion.sadd('onion_up:'+date , domain) - r_onion.sadd('onion_up_link:'+date , url) + if process.returncode == 0: + if r_serv_metadata.exists('paste_children:'+paste): + msg = 'infoleak:automatic-detection="onion";{}'.format(paste) + p.populate_set_out(msg, 'Tags') + + print(process.stdout.read()) + + else: + r_onion.sadd('onion_down:'+date , domain) + r_onion.sadd('onion_down_link:'+date , url) + print(process.stdout.read()) else: - r_onion.sadd('onion_down:'+date , domain) - r_onion.sadd('onion_down_link:'+date , url) - print(process.stdout.read()) + ## FIXME: # TODO: relaunch docker + exit(0) if __name__ == '__main__': @@ -102,15 +112,51 @@ if __name__ == '__main__': domain_url = 'http://{}'.format(domain) - print('------------------START ONIOM CRAWLER------------------') + print('------------------START ONION CRAWLER------------------') print('url: {}'.format(url)) print('domain: {}'.format(domain)) print('domain_url: {}'.format(domain_url)) - crawl_onion(url, domain) - if url != domain_url: - crawl_onion(domain_url, domain) + if not r_onion.sismember('banned_onion', domain): + date = datetime.datetime.now().strftime("%Y%m%d") + + crawl_onion(url, domain, date) + if url != domain_url: + crawl_onion(domain_url, domain, date) + + # save dowm onion + if not r_onion.sismember('onion_up:'+date , domain): + r_onion.sadd('onion_down:'+date , domain) + r_onion.sadd('onion_down_link:'+date , url) + r_onion.hincrby('onion_link_down', url, 1) + if not r_onion.exists('onion_metadata:{}'.format(domain)): + r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date) + r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date) + else: + r_onion.hincrby('onion_link_up', url, 1) + + # last check + r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date) + + # check external onions links (full_scrawl) + external_domains = set() + for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)): + print(link) + external_domain = re.findall(url_regex, link) + print(external_domain) + if len(external_domain) > 0: + external_domain = external_domain[0][4] + else: + continue + print(external_domain) + # # TODO: add i2p + if '.onion' in external_domain and external_domain != domain: + external_domains.add(external_domain) + if len(external_domains) >= 10: + r_onion.sadd('onion_potential_source', domain) + r_onion.delete('domain_onion_external_links:{}'.format(domain)) + print(r_onion.smembers('domain_onion_external_links:{}'.format(domain))) else: continue else: diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index d1e3f0d3..45ed1ed2 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -94,6 +94,7 @@ class Paste(object): var = self.p_path.split('/') self.p_date = Date(var[-4], var[-3], var[-2]) + self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name) self.p_source = var[-5] self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', 
''), var[-1].split('.gz')[0]) @@ -291,6 +292,9 @@ class Paste(object): else: return '[]' + def get_p_rel_path(self): + return self.p_rel_path + def save_all_attributes_redis(self, key=None): """ Saving all the attributes in a "Redis-like" Database (Redis, LevelDB) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 63839799..3d392b93 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -10,6 +10,10 @@ import datetime import base64 import redis +from scrapy.spidermiddlewares.httperror import HttpError +from twisted.internet.error import DNSLookupError +from twisted.internet.error import TimeoutError + from scrapy import Spider from scrapy.linkextractors import LinkExtractor from scrapy.crawler import CrawlerProcess, Crawler @@ -79,6 +83,8 @@ class TorSplashCrawler(): db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) + self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date ) + self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date ) @@ -89,7 +95,7 @@ class TorSplashCrawler(): self.start_urls, self.parse, endpoint='render.json', - meta={'parent': self.original_paste}, + meta={'father': self.original_paste}, args={ 'html': 1, 'wait': 10, 'render_all': 1, @@ -106,44 +112,47 @@ class TorSplashCrawler(): UUID = self.domains[0]+str(uuid.uuid4()) filename_paste = os.path.join(self.crawled_paste_filemame, UUID) + relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): + self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0]) + self.r_serv_onion.sadd('full_onion_up', self.domains[0]) + # create onion metadata - if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])): - self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date) - self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date) + if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])): + self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date) + self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date) # add onion screenshot history - self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date) + self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date) #create paste metadata self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) - self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent']) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father']) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0]) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) - self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste) + self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) - print(sys.getsizeof(response.data['png'])) - print(sys.getsizeof(response.data['html'])) - 
print(self.domains[0]) + size_screenshot = (len(response.data['png'])*3) /4 + print(size_screenshot) - - - with open(filename_screenshot, 'wb') as f: - f.write(base64.standard_b64decode(response.data['png'].encode())) + if size_screenshot < 5000000: #bytes + with open(filename_screenshot, 'wb') as f: + f.write(base64.standard_b64decode(response.data['png'].encode())) # save external links in set lext = LinkExtractor(deny_domains=self.domains, unique=True) for link in lext.extract_links(response): - self.r_serv_metadata.sadd('paste_crawler:filename_paste', link) + self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url) + self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url) #le = LinkExtractor(unique=True) le = LinkExtractor(allow_domains=self.domains, unique=True) @@ -154,12 +163,38 @@ class TorSplashCrawler(): link.url, self.parse, endpoint='render.json', - meta={'parent': UUID}, + meta={'father': relative_filename_paste}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'wait': 10} + #errback=self.errback_catcher ) + ''' + def errback_catcher(self, failure): + # catch all errback failures, + self.logger.error(repr(failure)) + + #if isinstance(failure.value, HttpError): + if failure.check(HttpError): + # you can get the response + response = failure.value.response + print('HttpError') + self.logger.error('HttpError on %s', response.url) + + #elif isinstance(failure.value, DNSLookupError): + elif failure.check(DNSLookupError): + # this is the original request + request = failure.request + print(DNSLookupError) + self.logger.error('DNSLookupError on %s', request.url) + + #elif isinstance(failure.value, TimeoutError): + elif failure.check(TimeoutError): + request = failure.request + print(TimeoutError) + self.logger.error('TimeoutError on %s', request.url) + ''' def save_crawled_paste(self, filename, content): diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 2c3e736a..5424ccc8 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -96,6 +96,12 @@ r_serv_statistics = redis.StrictRedis( db=cfg.getint("ARDB_Statistics", "db"), decode_responses=True) +r_serv_onion = redis.StrictRedis( + host=cfg.get("ARDB_Onion", "host"), + port=cfg.getint("ARDB_Onion", "port"), + db=cfg.getint("ARDB_Onion", "db"), + decode_responses=True) + sys.path.append('../../configs/keys') # MISP # @@ -144,4 +150,6 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted') +SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs")) diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index cc70527c..6fa5a983 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -5,9 +5,10 @@ Flask functions and routes for the trending modules page ''' import redis +import os import json import flask -from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response +from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory import difflib import ssdeep @@ -22,12 +23,14 @@ r_serv_pasteName = Flask_config.r_serv_pasteName r_serv_metadata = Flask_config.r_serv_metadata r_serv_tags = Flask_config.r_serv_tags r_serv_statistics = 
Flask_config.r_serv_statistics +r_serv_onion = Flask_config.r_serv_onion max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal DiffMaxLineLength = Flask_config.DiffMaxLineLength bootstrap_label = Flask_config.bootstrap_label misp_event_url = Flask_config.misp_event_url hive_case_url = Flask_config.hive_case_url +SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') @@ -130,6 +133,16 @@ def showpaste(content_range): list_tags.append( (tag, automatic, tag_status_tp, tag_status_fp) ) + crawler_metadata = {} + if 'infoleak:submission="crawler"' in l_tags: + crawler_metadata['get_metadata'] = True + crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') + crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') + crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path) + crawler_metadata['screenshot'] = paste.get_p_rel_path() + else: + crawler_metadata['get_metadata'] = False + if Flask_config.pymisp is False: misp = False else: @@ -157,6 +170,7 @@ def showpaste(content_range): hive_url = hive_case_url.replace('id_here', hive_case) return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list, + crawler_metadata=crawler_metadata, misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url) # ============ ROUTES ============ @@ -202,5 +216,9 @@ def showDiff(): the_html = htmlD.make_file(lines1, lines2) return the_html +@showsavedpastes.route('/screenshot/') +def screenshot(filename): + return send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True) + # ========= REGISTRATION ========= app.register_blueprint(showsavedpastes) diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index cb3f8b68..866f64c1 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -373,6 +373,42 @@ {% endif %} + + {% if crawler_metadata['get_metadata'] %} +
+        Graph
+          Father           {{ crawler_metadata['paste_father'] }}
+          Source link      {{ crawler_metadata['real_link'] }}
+          External links   {{ crawler_metadata['external_links'] }}
+      {% endif %}

Content:

[Raw content]

{{ content }}

From e9580d6775981a6a7eeea882bd96ce77ea59cb32 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 21 Aug 2018 15:54:53 +0200 Subject: [PATCH 04/28] chg: [Crawler] change BDD, save i2p links --- bin/Crawler.py | 140 +++++++------ bin/Onion.py | 45 ++++- bin/packages/HiddenServices.py | 79 ++++++++ bin/torcrawler/TorSplashCrawler.py | 2 + files/Onion | 1 + .../hiddenServices/Flask_hiddenServices.py | 99 +++++++++ .../templates/header_hiddenServices.html | 1 + .../templates/hiddenServices.html | 188 ++++++++++++++++++ .../hiddenServices/templates/showDomain.html | 76 +++++++ 9 files changed, 567 insertions(+), 64 deletions(-) create mode 100755 bin/packages/HiddenServices.py create mode 100644 var/www/modules/hiddenServices/Flask_hiddenServices.py create mode 100644 var/www/modules/hiddenServices/templates/header_hiddenServices.html create mode 100644 var/www/modules/hiddenServices/templates/hiddenServices.html create mode 100644 var/www/modules/hiddenServices/templates/showDomain.html diff --git a/bin/Crawler.py b/bin/Crawler.py index df1e0117..2e617959 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -18,40 +18,41 @@ from pubsublogger import publisher def signal_handler(sig, frame): sys.exit(0) -def crawl_onion(url, domain, date): +def crawl_onion(url, domain, date, date_month): - if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain): #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): - super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') - if super_father is None: - super_father=paste + super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') + if super_father is None: + super_father=paste - try: - r = requests.get(splash_url , timeout=0.010) - except Exception: - ## FIXME: # TODO: relaunch docker - exit(0) + try: + r = requests.get(splash_url , timeout=30.0) + except Exception: + ## FIXME: # TODO: relaunch docker or send error message + print('--------------------------------------') + print(' DOCKER SPLASH DOWN') + exit(0) - if r.status_code == 200: - process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father], - stdout=subprocess.PIPE) - while process.poll() is None: - time.sleep(1) + if r.status_code == 200: + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father], + stdout=subprocess.PIPE) + while process.poll() is None: + time.sleep(1) - if process.returncode == 0: - if r_serv_metadata.exists('paste_children:'+paste): - msg = 'infoleak:automatic-detection="onion";{}'.format(paste) - p.populate_set_out(msg, 'Tags') + if process.returncode == 0: + if r_serv_metadata.exists('paste_children:'+paste): + msg = 'infoleak:automatic-detection="onion";{}'.format(paste) + p.populate_set_out(msg, 'Tags') - print(process.stdout.read()) + print(process.stdout.read()) - else: - r_onion.sadd('onion_down:'+date , domain) - r_onion.sadd('onion_down_link:'+date , url) - print(process.stdout.read()) else: - ## FIXME: # TODO: relaunch docker - exit(0) + r_onion.sadd('onion_down:'+date , domain) + r_onion.sadd('onion_down_link:'+date , url) + print(process.stdout.read()) + else: + ## FIXME: # TODO: relaunch docker + exit(0) if __name__ == '__main__': @@ -97,11 +98,23 @@ if __name__ == '__main__': message = p.get_from_set() # Recovering the streamed message informations. 
+ #message = r_onion.spop('mess_onion') + print(message) + + if message is None: + print('get ardb message') + message = r_onion.spop('mess_onion') + if message is not None: + splitted = message.split(';') if len(splitted) == 2: url, paste = splitted + if not '.onion' in url: + print('not onion') + continue + url_list = re.findall(url_regex, url)[0] if url_list[1] == '': url= 'http://{}'.format(url) @@ -117,46 +130,55 @@ if __name__ == '__main__': print('domain: {}'.format(domain)) print('domain_url: {}'.format(domain_url)) + '''if not r_onion.sismember('full_onion_up', domain): + r_onion.sadd('mess_onion', message) + print('added ..............')''' + + if not r_onion.sismember('banned_onion', domain): date = datetime.datetime.now().strftime("%Y%m%d") + date_month = datetime.datetime.now().strftime("%Y%m") - crawl_onion(url, domain, date) - if url != domain_url: - crawl_onion(domain_url, domain, date) + if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): - # save dowm onion - if not r_onion.sismember('onion_up:'+date , domain): - r_onion.sadd('onion_down:'+date , domain) - r_onion.sadd('onion_down_link:'+date , url) - r_onion.hincrby('onion_link_down', url, 1) - if not r_onion.exists('onion_metadata:{}'.format(domain)): - r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date) - r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date) - else: - r_onion.hincrby('onion_link_up', url, 1) + crawl_onion(url, domain, date, date_month) + if url != domain_url: + crawl_onion(domain_url, domain, date, date_month) - # last check - r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date) - - # check external onions links (full_scrawl) - external_domains = set() - for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)): - print(link) - external_domain = re.findall(url_regex, link) - print(external_domain) - if len(external_domain) > 0: - external_domain = external_domain[0][4] + # save down onion + if not r_onion.sismember('onion_up:'+date , domain): + r_onion.sadd('onion_down:'+date , domain) + r_onion.sadd('onion_down_link:'+date , url) + r_onion.hincrby('onion_link_down', url, 1) + if not r_onion.exists('onion_metadata:{}'.format(domain)): + r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date) + r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date) else: - continue - print(external_domain) - # # TODO: add i2p - if '.onion' in external_domain and external_domain != domain: - external_domains.add(external_domain) - if len(external_domains) >= 10: - r_onion.sadd('onion_potential_source', domain) - r_onion.delete('domain_onion_external_links:{}'.format(domain)) - print(r_onion.smembers('domain_onion_external_links:{}'.format(domain))) + r_onion.hincrby('onion_link_up', url, 1) + + # last check + r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date) + + # check external onions links (full_scrawl) + external_domains = set() + for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)): + external_domain = re.findall(url_regex, link) + if len(external_domain) > 0: + external_domain = external_domain[0][4] + else: + continue + # # TODO: add i2p + if '.onion' in external_domain and external_domain != domain: + external_domains.add(external_domain) + if len(external_domains) >= 10: + r_onion.sadd('onion_potential_source', domain) + r_onion.delete('domain_onion_external_links:{}'.format(domain)) + 
print(r_onion.smembers('domain_onion_external_links:{}'.format(domain))) + + r_onion.lpush('last_onions', domain) + r_onion.ltrim('last_onions', 0, 15) + else: continue else: diff --git a/bin/Onion.py b/bin/Onion.py index 1e2dff32..23a81755 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -29,6 +29,7 @@ import os import base64 import subprocess import redis +import re from Helper import Process @@ -96,6 +97,12 @@ if __name__ == "__main__": db=p.config.getint("Redis_Cache", "db"), decode_responses=True) + r_onion = redis.StrictRedis( + host=p.config.get("ARDB_Onion", "host"), + port=p.config.getint("ARDB_Onion", "port"), + db=p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + # FUNCTIONS # publisher.info("Script subscribed to channel onion_categ") @@ -109,6 +116,9 @@ if __name__ == "__main__": # Thanks to Faup project for this regex # https://github.com/stricaud/faup url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_regex) + while True: if message is not None: @@ -127,8 +137,22 @@ if __name__ == "__main__": url, s, credential, subdomain, domain, host, port, \ resource_path, query_string, f1, f2, f3, f4 = x - domains_list.append(domain) - urls.append(url) + if '.onion' in url: + print(url) + domains_list.append(domain) + urls.append(url) + + for x in PST.get_regex(i2p_regex): + # Extracting url with regex + url, s, credential, subdomain, domain, host, port, \ + resource_path, query_string, f1, f2, f3, f4 = x + + if '.i2p' in url: + print('add i2p') + print(domain) + if not r_onion.sismember('i2p_domain', domain): + r_onion.sadd('i2p_domain', domain) + r_onion.sadd('i2p_link', url) # Saving the list of extracted onion domains. 
PST.__setattr__(channel, domains_list) @@ -157,10 +181,21 @@ if __name__ == "__main__": msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) p.populate_set_out(msg, 'Tags') ''' + + date_month = datetime.datetime.now().strftime("%Y%m") + date = datetime.datetime.now().strftime("%Y%m%d") for url in urls: - msg = '{};{}'.format(url,PST.p_path) - print('send to crawler') - p.populate_set_out(msg, 'Crawler') + + domain = re.findall(url_regex, url) + if len(domain) > 0: + domain = domain[0][4] + else: + continue + + if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): + msg = '{};{}'.format(url,PST.p_path) + print('send to crawler') + p.populate_set_out(msg, 'Crawler') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py new file mode 100755 index 00000000..48f514fc --- /dev/null +++ b/bin/packages/HiddenServices.py @@ -0,0 +1,79 @@ +#!/usr/bin/python3 + +""" +The ``hiddenServices Class`` +=================== + +Use it to create an object from an existing paste or other random file. + +Conditions to fulfill to be able to use this class correctly: +------------------------------------------------------------- + +1/ The paste need to be saved on disk somewhere (have an accessible path) +2/ The paste need to be gziped. +3/ The filepath need to look like something like this: + /directory/source/year/month/day/paste.gz + +""" + +import os +import gzip +import redis + +import configparser +import sys +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) +from Date import Date + +class HiddenServices(object): + """ + This class representing a hiddenServices as an object. + When created, the object will have by default some "main attributes" + + :Example: + + PST = HiddenServices("xxxxxxxx.onion", "onion") + + """ + + def __init__(self, domain, type): + + configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') + if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? 
\ + Or activate the virtualenv.') + + cfg = configparser.ConfigParser() + cfg.read(configfile) + self.r_serv_onion = redis.StrictRedis( + host=cfg.get("ARDB_Onion", "host"), + port=cfg.getint("ARDB_Onion", "port"), + db=cfg.getint("ARDB_Onion", "db"), + decode_responses=True) + + self.domain = domain + self.type = type + + if type == 'onion': + self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"), cfg.get("Directories", "crawled")) + self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + elif type == 'i2p': + self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + else: + ## TODO: # FIXME: add error + pass + + + def get_last_crawled_pastes(self): + + last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') + return self.get_crawled_pastes_by_date(last_check) + + def get_crawled_pastes_by_date(self, date): + pastes_path = os.path.join(self.paste_directory, date[0:4], date[4:6], date[6:8]) + l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f] + print(len(l_crawled_pastes)) + print(l_crawled_pastes) + return l_crawled_pastes diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 3d392b93..c5280329 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -55,6 +55,7 @@ class TorSplashCrawler(): self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") + self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) @@ -120,6 +121,7 @@ class TorSplashCrawler(): self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0]) self.r_serv_onion.sadd('full_onion_up', self.domains[0]) + self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])): diff --git a/files/Onion b/files/Onion index 5c9980e2..69fcf878 100644 --- a/files/Onion +++ b/files/Onion @@ -1 +1,2 @@ onion +i2p diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py new file mode 100644 index 00000000..04740a93 --- /dev/null +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Flask functions and routes for the trending modules page +''' +import redis +import datetime +from flask import Flask, render_template, jsonify, request, Blueprint + +import HiddenServices +from Date import Date + +# ============ VARIABLES ============ +import Flask_config + +app = Flask_config.app +cfg = Flask_config.cfg +r_serv_onion = Flask_config.r_serv_onion + +hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') + +# ============ FUNCTIONS ============ +def one(): + return 1 + +def get_date_range(num_day): + curr_date = datetime.date.today() + date = Date( '{}{}{}'.format(str(curr_date.year), str(curr_date.month).zfill(2), str(curr_date.day).zfill(2)) ) + date_list = [] + + for i in range(0, num_day): + date_list.append(date.substract_day(i)) + + return list(reversed(date_list)) + +def get_onion_status(domain, date): + if 
r_serv_onion.sismember('onion_up:'+date , domain): + return True + else: + return False +# ============= ROUTES ============== + +@hiddenServices.route("/hiddenServices/", methods=['GET']) +def hiddenServices_page(): + last_onions = r_serv_onion.lrange('last_onions', 0 ,-1) + list_onion = [] + + for onion in last_onions: + metadata_onion = {} + metadata_onion['domain'] = onion + metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check') + metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen') + if get_onion_status(onion, metadata_onion['last_check']): + metadata_onion['status_text'] = 'UP' + metadata_onion['status_color'] = 'Green' + metadata_onion['status_icon'] = 'fa-check-circle' + else: + metadata_onion['status_text'] = 'DOWN' + metadata_onion['status_color'] = 'Red' + metadata_onion['status_icon'] = 'fa-times-circle' + list_onion.append(metadata_onion) + + return render_template("hiddenServices.html", last_onions=list_onion) + +@hiddenServices.route("/hiddenServices/onion_domain", methods=['GET']) +def onion_domain(): + onion_domain = request.args.get('onion_domain') + if onion_domain is None or not r_serv_onion.exists('onion_metadata:{}'.format(onion_domain)): + pass + # # TODO: FIXME return 404 + + last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check') + first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen') + date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain)) + + return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen) + +# ============= JSON ============== +@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) +def domain_crawled_7days_json(): + type = 'onion' + ## TODO: # FIXME: 404 error + + date_range = get_date_range(7) + json_domain_stats = [] + #try: + for date in date_range: + nb_domain_up = r_serv_onion.scard('{}_up:{}'.format(type, date)) + nb_domain_down = r_serv_onion.scard('{}_up:{}'.format(type, date)) + date = date[0:4] + '-' + date[4:6] + '-' + date[6:8] + json_domain_stats.append({ 'date': date, 'value': int( nb_domain_up ), 'nb_domain_down': int( nb_domain_down )}) + #except: + #return jsonify() + + return jsonify(json_domain_stats) + +# ========= REGISTRATION ========= +app.register_blueprint(hiddenServices) diff --git a/var/www/modules/hiddenServices/templates/header_hiddenServices.html b/var/www/modules/hiddenServices/templates/header_hiddenServices.html new file mode 100644 index 00000000..5c77963c --- /dev/null +++ b/var/www/modules/hiddenServices/templates/header_hiddenServices.html @@ -0,0 +1 @@ +
  • hidden Services
  • diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html new file mode 100644 index 00000000..bbc66ace --- /dev/null +++ b/var/www/modules/hiddenServices/templates/hiddenServices.html @@ -0,0 +1,188 @@ + + + + + + + + Hidden Service - AIL + + + + + + + + + + + + + + + {% include 'navbar.html' %} + +
+        ONION
+          Domain | First Seen | Last Check | Status
+          {% for metadata_onion in last_onions %}
+            {{ metadata_onion['domain'] }} |
+            {{ '{}/{}/{}'.format(metadata_onion['first_seen'][0:4], metadata_onion['first_seen'][4:6], metadata_onion['first_seen'][6:8]) }} |
+            {{ '{}/{}/{}'.format(metadata_onion['last_check'][0:4], metadata_onion['last_check'][4:6], metadata_onion['last_check'][6:8]) }} |
+            {{ metadata_onion['status_text'] }}
+          {% endfor %}
    + + + + + + + + diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html new file mode 100644 index 00000000..18cd79be --- /dev/null +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -0,0 +1,76 @@ + + + + + + + + Show Domain - AIL + + + + + + + + + + + + + + + + + {% include 'navbar.html' %} + +
    + +
    + +
    +
    +
    +
    + Graph +
    + + + + + + + + + + + + + + + + +
    Domain{{ domain }}
    First Seen{{ first_seen }}
    Last Check{{ last_check }}
    +
    +
    +
    + +
    + +
    + +
    + +
    + + + + + + + From 7e24943537ede802dbdbd887db2b32a30751f90a Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 24 Aug 2018 10:13:56 +0200 Subject: [PATCH 05/28] chg: [Crawler] crawler accept all kind of domains --- bin/Crawler.py | 97 +++++++++++-------- bin/Onion.py | 14 ++- bin/torcrawler/TorSplashCrawler.py | 39 +++++--- bin/torcrawler/tor_crawler.py | 19 ++-- .../hiddenServices/Flask_hiddenServices.py | 6 +- .../hiddenServices/templates/showDomain.html | 6 ++ 6 files changed, 112 insertions(+), 69 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 2e617959..240ae2a3 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -34,21 +34,21 @@ def crawl_onion(url, domain, date, date_month): exit(0) if r.status_code == 200: - process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father], + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father], stdout=subprocess.PIPE) while process.poll() is None: time.sleep(1) if process.returncode == 0: if r_serv_metadata.exists('paste_children:'+paste): - msg = 'infoleak:automatic-detection="onion";{}'.format(paste) + msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste) p.populate_set_out(msg, 'Tags') print(process.stdout.read()) else: - r_onion.sadd('onion_down:'+date , domain) - r_onion.sadd('onion_down_link:'+date , url) + r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain) + r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url) print(process.stdout.read()) else: ## FIXME: # TODO: relaunch docker @@ -67,8 +67,28 @@ if __name__ == '__main__': # Setup the I/O queues p = Process(config_section) - splash_url = p.config.get("Crawler", "splash_url") - http_proxy = p.config.get("Crawler", "http_proxy") + url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_onion) + url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_i2p) + + type_hidden_service = 'onion' + if type_hidden_service == 'onion': + regex_hidden_service = url_onion + splash_url = p.config.get("Crawler", "splash_url_onion") + http_proxy = p.config.get("Crawler", "http_proxy_onion") + elif type_hidden_service == 'i2p': + regex_hidden_service = url_i2p + splash_url = p.config.get("Crawler", "splash_url_i2p") + http_proxy = p.config.get("Crawler", "http_proxy_i2p") + elif type_hidden_service == 'regular': + regex_hidden_service = url_i2p + splash_url = p.config.get("Crawler", "splash_url_onion") + http_proxy = p.config.get("Crawler", "http_proxy_onion") + else: + print('incorrect crawler type: {}'.format(type_hidden_service)) + exit(0) + crawler_depth_limit = p.config.getint("Crawler", 
"crawler_depth_limit") #signal.signal(signal.SIGINT, signal_handler) @@ -91,93 +111,94 @@ if __name__ == '__main__': db=p.config.getint("ARDB_Onion", "db"), decode_responses=True) - url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" - re.compile(url_regex) - while True: - message = p.get_from_set() # Recovering the streamed message informations. - #message = r_onion.spop('mess_onion') - print(message) + message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) + #message='https://www.myip.com/;/home/aurelien/git/python3/AIL-framework/PASTES/crawled/2018/08/10/onionsnjajzkhm5g.onion49eac19d-d71b-48b5-bc55-9a3c63e5b1e2' + # # FIXME: remove if message is None: print('get ardb message') message = r_onion.spop('mess_onion') + print(message) + if message is not None: splitted = message.split(';') if len(splitted) == 2: url, paste = splitted + if not '.onion' in url: print('not onion') continue - url_list = re.findall(url_regex, url)[0] + url_list = re.findall(regex_hidden_service, url)[0] if url_list[1] == '': url= 'http://{}'.format(url) link, s, credential, subdomain, domain, host, port, \ resource_path, query_string, f1, f2, f3, f4 = url_list domain = url_list[4] + r_onion.srem('onion_domain_crawler_queue', domain) + #domain = 'myip.com' domain_url = 'http://{}'.format(domain) - print('------------------START ONION CRAWLER------------------') + print('------------------START CRAWLER------------------') + print(type_hidden_service) + print('-------------------------------------------------') print('url: {}'.format(url)) print('domain: {}'.format(domain)) print('domain_url: {}'.format(domain_url)) - '''if not r_onion.sismember('full_onion_up', domain): - r_onion.sadd('mess_onion', message) - print('added ..............')''' - - - if not r_onion.sismember('banned_onion', domain): + if not r_onion.sismember('banned_{}'.format(type_hidden_service), domain): date = datetime.datetime.now().strftime("%Y%m%d") date_month = datetime.datetime.now().strftime("%Y%m") - if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): + if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain): crawl_onion(url, domain, date, date_month) if url != domain_url: crawl_onion(domain_url, domain, date, date_month) # save down onion - if not r_onion.sismember('onion_up:'+date , domain): - r_onion.sadd('onion_down:'+date , domain) - r_onion.sadd('onion_down_link:'+date , url) - r_onion.hincrby('onion_link_down', url, 1) - if not r_onion.exists('onion_metadata:{}'.format(domain)): - r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date) - r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date) + if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain): + r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain) + r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url) + r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1) + if not 
r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)): + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date) + r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date) else: - r_onion.hincrby('onion_link_up', url, 1) + r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1) # last check - r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date) + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) # check external onions links (full_scrawl) external_domains = set() - for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)): - external_domain = re.findall(url_regex, link) + for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)): + external_domain = re.findall(url_onion, link) + external_domain.extend(re.findall(url_i2p, link)) if len(external_domain) > 0: external_domain = external_domain[0][4] else: continue - # # TODO: add i2p if '.onion' in external_domain and external_domain != domain: external_domains.add(external_domain) + elif '.i2p' in external_domain and external_domain != domain: + external_domains.add(external_domain) if len(external_domains) >= 10: - r_onion.sadd('onion_potential_source', domain) - r_onion.delete('domain_onion_external_links:{}'.format(domain)) - print(r_onion.smembers('domain_onion_external_links:{}'.format(domain))) + r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain) + r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain)) + print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain))) - r_onion.lpush('last_onions', domain) - r_onion.ltrim('last_onions', 0, 15) + r_onion.lpush('last_{}'.format(type_hidden_service), domain) + r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15) else: continue diff --git a/bin/Onion.py b/bin/Onion.py index 23a81755..d77c010f 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -150,9 +150,12 @@ if __name__ == "__main__": if '.i2p' in url: print('add i2p') print(domain) - if not r_onion.sismember('i2p_domain', domain): + if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain): r_onion.sadd('i2p_domain', domain) r_onion.sadd('i2p_link', url) + r_onion.sadd('i2p_domain_crawler_queue', domain) + msg = '{};{}'.format(url,PST.p_path) + r_onion.sadd('i2p_crawler_queue', msg) # Saving the list of extracted onion domains. 
PST.__setattr__(channel, domains_list) @@ -193,9 +196,12 @@ if __name__ == "__main__": continue if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): - msg = '{};{}'.format(url,PST.p_path) - print('send to crawler') - p.populate_set_out(msg, 'Crawler') + if not r_onion.sismember('onion_domain_crawler_queue', domain): + print('send to onion crawler') + r_onion.sadd('onion_domain_crawler_queue', domain) + msg = '{};{}'.format(url,PST.p_path) + r_onion.sadd('onion_crawler_queue', msg) + #p.populate_set_out(msg, 'Crawler') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index c5280329..135ad0a7 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -9,6 +9,7 @@ import uuid import datetime import base64 import redis +import json from scrapy.spidermiddlewares.httperror import HttpError from twisted.internet.error import DNSLookupError @@ -30,7 +31,6 @@ class TorSplashCrawler(): self.crawler = Crawler(self.TorSplashSpider, { 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0', 'SPLASH_URL': splash_url, - 'HTTP_PROXY': http_proxy, 'ROBOTSTXT_OBEY': False, 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, 'scrapy_splash.SplashMiddleware': 725, @@ -41,14 +41,15 @@ class TorSplashCrawler(): 'DEPTH_LIMIT': crawler_depth_limit }) - def crawl(self, url, domain, original_paste, super_father): - self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father) + def crawl(self, type, url, domain, original_paste, super_father): + self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father) self.process.start() class TorSplashSpider(Spider): name = 'TorSplashSpider' - def __init__(self, url, domain,original_paste, super_father, *args, **kwargs): + def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs): + self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url @@ -100,12 +101,13 @@ class TorSplashCrawler(): args={ 'html': 1, 'wait': 10, 'render_all': 1, + 'har': 1, 'png': 1} ) def parse(self,response): - print(response.headers) - print(response.status) + #print(response.headers) + #print(response.status) # # TODO: # FIXME: self.r_cache.setbit(response.url, 0, 1) @@ -119,17 +121,18 @@ class TorSplashCrawler(): # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): - self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0]) - self.r_serv_onion.sadd('full_onion_up', self.domains[0]) - self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0]) + self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) + self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) + self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata - if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])): - self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date) - self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date) + if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): + 
self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) + self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) + self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste) # add onion screenshot history - self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date) + self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date) #create paste metadata self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) @@ -144,17 +147,20 @@ class TorSplashCrawler(): os.makedirs(dirname) size_screenshot = (len(response.data['png'])*3) /4 - print(size_screenshot) if size_screenshot < 5000000: #bytes with open(filename_screenshot, 'wb') as f: f.write(base64.standard_b64decode(response.data['png'].encode())) + #interest = response.data['har']['log']['entries'][0]['response']['header'][0] + with open(filename_screenshot+'har.txt', 'wb') as f: + f.write(json.dumps(response.data['har']).encode()) + # save external links in set lext = LinkExtractor(deny_domains=self.domains, unique=True) for link in lext.extract_links(response): - self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url) - self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url) + self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) + self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) #le = LinkExtractor(unique=True) le = LinkExtractor(allow_domains=self.domains, unique=True) @@ -169,6 +175,7 @@ class TorSplashCrawler(): args={ 'html': 1, 'png': 1, 'render_all': 1, + 'har': 1, 'wait': 10} #errback=self.errback_catcher ) diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py index 57a77e76..99eb18c8 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler if __name__ == '__main__': - if len(sys.argv) != 5: - print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father') + if len(sys.argv) != 8: + print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father') exit(1) configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') @@ -21,14 +21,15 @@ if __name__ == '__main__': cfg = configparser.ConfigParser() cfg.read(configfile) - splash_url = cfg.get("Crawler", "splash_url") - http_proxy = cfg.get("Crawler", "http_proxy") + splash_url = sys.argv[1] + http_proxy = sys.argv[2] + type = sys.argv[3] crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit") - url = sys.argv[1] - domain = sys.argv[2] - paste = sys.argv[3] - super_father = sys.argv[4] + url = sys.argv[4] + domain = sys.argv[5] + paste = sys.argv[6] + super_father = sys.argv[7] crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit) - crawler.crawl(url, domain, paste, super_father) + crawler.crawl(type, url, domain, paste, super_father) diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 04740a93..6d01bbbb 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -43,7 +43,7 @@ def get_onion_status(domain, date): 
@hiddenServices.route("/hiddenServices/", methods=['GET']) def hiddenServices_page(): - last_onions = r_serv_onion.lrange('last_onions', 0 ,-1) + last_onions = r_serv_onion.lrange('last_onion', 0 ,-1) list_onion = [] for onion in last_onions: @@ -72,9 +72,11 @@ def onion_domain(): last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check') first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen') + domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent') date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain)) - return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen) + return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen, + domain_paste=domain_paste) # ============= JSON ============== @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 18cd79be..88942c73 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -49,6 +49,12 @@ Last Check {{ last_check }} + + Origin Paste + + {{ domain_paste }} + +
    From ced0b1e350e85228a5a6ec9a645047d98f5d14e1 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 24 Aug 2018 10:24:03 +0200 Subject: [PATCH 06/28] chg: [I2P] add default config --- bin/Crawler.py | 6 ------ bin/packages/config.cfg.sample | 6 ++++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 240ae2a3..ab74c64b 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -115,22 +115,17 @@ if __name__ == '__main__': # Recovering the streamed message informations. message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) - #message='https://www.myip.com/;/home/aurelien/git/python3/AIL-framework/PASTES/crawled/2018/08/10/onionsnjajzkhm5g.onion49eac19d-d71b-48b5-bc55-9a3c63e5b1e2' # # FIXME: remove if message is None: print('get ardb message') message = r_onion.spop('mess_onion') - print(message) - if message is not None: splitted = message.split(';') if len(splitted) == 2: url, paste = splitted - - if not '.onion' in url: print('not onion') continue @@ -143,7 +138,6 @@ if __name__ == '__main__': resource_path, query_string, f1, f2, f3, f4 = url_list domain = url_list[4] r_onion.srem('onion_domain_crawler_queue', domain) - #domain = 'myip.com' domain_url = 'http://{}'.format(domain) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 62ea0887..2ca156d4 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -225,5 +225,7 @@ db = 0 [Crawler] crawler_depth_limit = 1 -splash_url = http://127.0.0.1:8050 -http_proxy = http://127.0.0.1:9050 +splash_url_onion = http://127.0.0.1:8050 +splash_url_i2p = http://127.0.0.1:8050 +http_proxy_onion = http://127.0.0.1:9050 +http_proxy_i2p = http://127.0.0.1:9050 From d42dd118a4572d107d8c8e09b5be11ac1f417b53 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 27 Aug 2018 11:02:39 +0200 Subject: [PATCH 07/28] chg: [Domain crawled] add random screenshot --- bin/packages/HiddenServices.py | 67 ++++++++++++++++--- .../hiddenServices/Flask_hiddenServices.py | 10 ++- .../hiddenServices/templates/showDomain.html | 2 +- 3 files changed, 68 insertions(+), 11 deletions(-) diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index 48f514fc..9f4e9302 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -19,6 +19,7 @@ Conditions to fulfill to be able to use this class correctly: import os import gzip import redis +import random import configparser import sys @@ -52,11 +53,19 @@ class HiddenServices(object): db=cfg.getint("ARDB_Onion", "db"), decode_responses=True) + self.r_serv_metadata = redis.StrictRedis( + host=cfg.get("ARDB_Metadata", "host"), + port=cfg.getint("ARDB_Metadata", "port"), + db=cfg.getint("ARDB_Metadata", "db"), + decode_responses=True) + self.domain = domain self.type = type if type == 'onion': - self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"), cfg.get("Directories", "crawled")) + self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled")) + self.paste_crawled_directory_name = cfg.get("Directories", "crawled") self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) elif type == 'i2p': self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) @@ -65,15 +74,57 @@ class HiddenServices(object): ## TODO: # 
FIXME: add error pass - + #todo use the right paste def get_last_crawled_pastes(self): + paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') + #paste_parent = paste_parent.replace(self.paste_directory, '')[1:] + return self.get_all_pastes_domain(paste_parent) - last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') - return self.get_crawled_pastes_by_date(last_check) + def get_all_pastes_domain(self, father): + l_crawled_pastes = [] + paste_parent = father.replace(self.paste_directory, '')[1:] + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) + ## TODO: # FIXME: remove me + if not paste_childrens: + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) + for children in paste_childrens: + if self.domain in children: + l_crawled_pastes.append(children) + l_crawled_pastes.extend(self.get_all_pastes_domain(children)) + return l_crawled_pastes + + def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1): + l_screenshot_paste = [] + for paste in l_crawled_pastes: + ## FIXME: # TODO: remove me + paste= paste.replace(self.paste_directory, '')[1:] + + paste = paste.replace(self.paste_crawled_directory_name, '') + if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ): + l_screenshot_paste.append(paste[1:]) + + if len(l_screenshot_paste) > num_screenshot: + l_random_screenshot = [] + for index in random.sample( range(0, len(l_screenshot_paste)), num_screenshot ): + l_random_screenshot.append(l_screenshot_paste[index]) + return l_random_screenshot + else: + return l_screenshot_paste def get_crawled_pastes_by_date(self, date): - pastes_path = os.path.join(self.paste_directory, date[0:4], date[4:6], date[6:8]) - l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f] - print(len(l_crawled_pastes)) - print(l_crawled_pastes) + + pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8]) + paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') + + l_crawled_pastes = [] + return l_crawled_pastes + + def get_last_crawled_pastes_fileSearch(self): + + last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') + return self.get_crawled_pastes_by_date_fileSearch(last_check) + + def get_crawled_pastes_by_date_fileSearch(self, date): + pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8]) + l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f] return l_crawled_pastes diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 6d01bbbb..7969aae8 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -6,10 +6,12 @@ ''' import redis import datetime +import sys +import os from flask import Flask, render_template, jsonify, request, Blueprint -import HiddenServices from Date import Date +from HiddenServices import HiddenServices # ============ VARIABLES ============ import Flask_config @@ -75,8 +77,12 @@ def onion_domain(): domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent') date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain)) + h = HiddenServices(onion_domain, 'onion') + l_pastes = h.get_last_crawled_pastes() + screenshot = 
h.get_domain_random_screenshot(l_pastes)[0] + return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen, - domain_paste=domain_paste) + domain_paste=domain_paste, screenshot=screenshot) # ============= JSON ============== @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 88942c73..3f5b8736 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -62,7 +62,7 @@
    - +
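The random screenshot added in the patch above is chosen from the pastes crawled for the domain: HiddenServices walks the paste_children tree, keeps the entries that have a PNG on disk, and samples one of them. A compact sketch of that selection step, with the directory argument as an assumption and random.sample standing in for the index loop used in the patch:

    import os
    import random

    def pick_random_screenshots(l_crawled_pastes, screenshot_directory, num_screenshot=1):
        # Keep only crawled pastes that have a matching .png, then return at
        # most num_screenshot of them, chosen at random.
        l_screenshot_paste = [paste for paste in l_crawled_pastes
                              if os.path.isfile(os.path.join(screenshot_directory, paste + '.png'))]
        if len(l_screenshot_paste) > num_screenshot:
            return random.sample(l_screenshot_paste, num_screenshot)
        return l_screenshot_paste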
    From 40772a5732002081ea7b19cef2447b008a8e4675 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 27 Aug 2018 11:30:19 +0200 Subject: [PATCH 08/28] fix: merge --- var/www/modules/showpaste/Flask_showpaste.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index 39e2283e..40240591 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -9,11 +9,7 @@ import os import json import os import flask -<<<<<<< HEAD -from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory -======= -from flask import Flask, render_template, jsonify, request, Blueprint, make_response, redirect, url_for, Response, send_from_directory ->>>>>>> master +from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory, redirect, url_for import difflib import ssdeep From ca982e13e1b2adf21e4814efcdcbfca1c89ba2a0 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 27 Aug 2018 14:34:08 +0200 Subject: [PATCH 09/28] chg: [Crawled Domain] show crawled pastes by domain --- bin/packages/HiddenServices.py | 4 +- var/www/modules/Flask_config.py | 1 + .../hiddenServices/Flask_hiddenServices.py | 32 +++++++++++- .../hiddenServices/templates/showDomain.html | 51 +++++++++++++++++-- 4 files changed, 80 insertions(+), 8 deletions(-) diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index 9f4e9302..5143553b 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -85,8 +85,8 @@ class HiddenServices(object): paste_parent = father.replace(self.paste_directory, '')[1:] paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) ## TODO: # FIXME: remove me - if not paste_childrens: - paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) + paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) + paste_childrens = paste_childrens | paste_children for children in paste_childrens: if self.domain in children: l_crawled_pastes.append(children) diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 34e630f2..07a6a3f0 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -150,6 +150,7 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted') +PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs")) diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 7969aae8..2c0c7e4a 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -19,6 +19,9 @@ import Flask_config app = Flask_config.app cfg = Flask_config.cfg r_serv_onion = Flask_config.r_serv_onion +r_serv_metadata = Flask_config.r_serv_metadata +bootstrap_label = Flask_config.bootstrap_label +PASTES_FOLDER = Flask_config.PASTES_FOLDER hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') @@ -79,9 +82,36 @@ def onion_domain(): h = HiddenServices(onion_domain, 'onion') 
l_pastes = h.get_last_crawled_pastes() - screenshot = h.get_domain_random_screenshot(l_pastes)[0] + screenshot = h.get_domain_random_screenshot(l_pastes) + if screenshot: + screenshot = screenshot[0] + else: + screenshot = 'None' + + paste_tags = [] + path_name = [] + for path in l_pastes: + path_name.append(path.replace(PASTES_FOLDER, '')) + p_tags = r_serv_metadata.smembers('tag:'+path) + l_tags = [] + for tag in p_tags: + complete_tag = tag + tag = tag.split('=') + if len(tag) > 1: + if tag[1] != '': + tag = tag[1][1:-1] + # no value + else: + tag = tag[0][1:-1] + # use for custom tags + else: + tag = tag[0] + l_tags.append( (tag, complete_tag) ) + paste_tags.append(l_tags) return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen, + l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label, + path_name=path_name, domain_paste=domain_paste, screenshot=screenshot) # ============= JSON ============== diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 3f5b8736..29aa821c 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -15,9 +15,15 @@ - - - + + + + @@ -28,8 +34,9 @@
    -
    +
    +
    Graph @@ -57,11 +64,38 @@ +
    + + + + + + + + + + {% for path in l_pastes %} + + + + {% endfor %} + + +
    Crawled Pastes
    {{ path_name[loop.index0] }} +
    + {% for tag in paste_tags[loop.index0] %} + + {{ tag[0] }} + + {% endfor %} +
    +
    +
    -
    +
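The tag labels rendered in the crawled-pastes table above come from the route earlier in this patch, which shortens each machine tag to its quoted value before passing it to the template. A small sketch of that shortening, assuming tags follow the usual taxonomy:predicate="value" form; the helper name is illustrative:

    def split_tag(complete_tag):
        # Return (short_label, complete_tag): the quoted value when the tag
        # carries one, otherwise the tag itself.
        tag = complete_tag.split('=')
        if len(tag) > 1:
            tag = tag[1][1:-1] if tag[1] != '' else tag[0][1:-1]
        else:
            tag = tag[0]
        return (tag, complete_tag)

    # split_tag('infoleak:automatic-detection="onion"')
    # -> ('onion', 'infoleak:automatic-detection="onion"')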
    @@ -74,6 +108,13 @@ $(document).ready(function(){ activePage = "page-hiddenServices" $("#"+activePage).addClass("active"); + table = $('#myTable_').DataTable( + { + "aLengthMenu": [[5, 10, 15, 20, -1], [5, 10, 15, 20, "All"]], + "iDisplayLength": 5, + "order": [[ 0, "desc" ]] + } + ); }); From 6f0817365acc0891537c80355f250d9b5d28a9c8 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 12 Sep 2018 09:55:49 +0200 Subject: [PATCH 10/28] chg: [Crawler UI] display domain information --- .gitignore | 3 +- bin/Bitcoin.py | 1 + bin/Crawler.py | 42 +++++++++- bin/packages/HiddenServices.py | 22 ++++- bin/torcrawler/TorSplashCrawler.py | 18 +++-- pip3_packages_requirement.txt | 3 + .../hiddenServices/Flask_hiddenServices.py | 55 ++++++++----- .../hiddenServices/templates/showDomain.html | 80 ++++++++++++------- 8 files changed, 164 insertions(+), 60 deletions(-) diff --git a/.gitignore b/.gitignore index b5755ee6..6973080f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,9 +11,10 @@ ardb faup tlsh Blooms -LEVEL_DB_DATA PASTES +CRAWLED_SCREENSHOT BASE64 +HASHS DATA_ARDB indexdir/ logs/ diff --git a/bin/Bitcoin.py b/bin/Bitcoin.py index 5ec2199f..1b7694b7 100755 --- a/bin/Bitcoin.py +++ b/bin/Bitcoin.py @@ -32,6 +32,7 @@ def decode_base58(bc, length): for char in bc: n = n * 58 + digits58.index(char) return n.to_bytes(length, 'big') + def check_bc(bc): try: bcbytes = decode_base58(bc, 25) diff --git a/bin/Crawler.py b/bin/Crawler.py index ab74c64b..3660aa41 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -57,6 +57,12 @@ def crawl_onion(url, domain, date, date_month): if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)') + exit(1) + + type_hidden_service = sys.argv[1] + publisher.port = 6380 publisher.channel = "Script" @@ -72,7 +78,6 @@ if __name__ == '__main__': url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" re.compile(url_i2p) - type_hidden_service = 'onion' if type_hidden_service == 'onion': regex_hidden_service = url_onion splash_url = p.config.get("Crawler", "splash_url_onion") @@ -89,8 +94,12 @@ if __name__ == '__main__': print('incorrect crawler type: {}'.format(type_hidden_service)) exit(0) + print(type_hidden_service) + crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + #signal.signal(signal.SIGINT, signal_handler) r_serv_metadata = redis.StrictRedis( @@ -113,8 +122,10 @@ if __name__ == '__main__': while True: - # Recovering the streamed message informations. + # Recovering the streamed message informations. 
http://eepsites.i2p message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) + #message = 'http://i2pwiki.i2p;test' + #message = 'http://i2host.i2p;test' # # FIXME: remove if message is None: @@ -122,13 +133,19 @@ if __name__ == '__main__': message = r_onion.spop('mess_onion') if message is not None: + print(message) splitted = message.split(';') if len(splitted) == 2: url, paste = splitted + paste = paste.replace(PASTES_FOLDER+'/', '') + print(paste) + ''' if not '.onion' in url: print('not onion') continue + ''' + url_list = re.findall(regex_hidden_service, url)[0] if url_list[1] == '': @@ -137,7 +154,7 @@ if __name__ == '__main__': link, s, credential, subdomain, domain, host, port, \ resource_path, query_string, f1, f2, f3, f4 = url_list domain = url_list[4] - r_onion.srem('onion_domain_crawler_queue', domain) + r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain) domain_url = 'http://{}'.format(domain) @@ -157,6 +174,8 @@ if __name__ == '__main__': crawl_onion(url, domain, date, date_month) if url != domain_url: + print(url) + print(domain_url) crawl_onion(domain_url, domain, date, date_month) # save down onion @@ -173,6 +192,17 @@ if __name__ == '__main__': # last check r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) + # last_father + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) + + # add onion screenshot history + # add crawled days + if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date: + r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date) + # add crawled history by date + r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here + + # check external onions links (full_scrawl) external_domains = set() for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)): @@ -194,6 +224,12 @@ if __name__ == '__main__': r_onion.lpush('last_{}'.format(type_hidden_service), domain) r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15) + #send all crawled domain past + msg = domain + p.populate_set_out(msg, 'DomainSubject') + + #time.sleep(30) + else: continue else: diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index 5143553b..ca07bfd2 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -61,6 +61,7 @@ class HiddenServices(object): self.domain = domain self.type = type + self.tags = {} if type == 'onion': self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) @@ -74,6 +75,20 @@ class HiddenServices(object): ## TODO: # FIXME: add error pass + def get_origin_paste_name(self): + origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') + if origin_paste is None: + return '' + return origin_paste.replace(self.paste_directory+'/', '') + + def get_domain_tags(self): + return self.tags + + def update_domain_tags(self, children): + p_tags = self.r_serv_metadata.smembers('tag:'+children) + for tag in p_tags: + self.tags[tag] = self.tags.get(tag, 0) + 1 + #todo use the right paste def get_last_crawled_pastes(self): paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') @@ -81,8 +96,10 @@ class HiddenServices(object): return self.get_all_pastes_domain(paste_parent) def get_all_pastes_domain(self, father): + if father is None: + return [] l_crawled_pastes = [] - paste_parent = 
father.replace(self.paste_directory, '')[1:] + paste_parent = father.replace(self.paste_directory+'/', '') paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) ## TODO: # FIXME: remove me paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) @@ -90,6 +107,7 @@ class HiddenServices(object): for children in paste_childrens: if self.domain in children: l_crawled_pastes.append(children) + self.update_domain_tags(children) l_crawled_pastes.extend(self.get_all_pastes_domain(children)) return l_crawled_pastes @@ -97,7 +115,7 @@ class HiddenServices(object): l_screenshot_paste = [] for paste in l_crawled_pastes: ## FIXME: # TODO: remove me - paste= paste.replace(self.paste_directory, '')[1:] + paste= paste.replace(self.paste_directory+'/', '') paste = paste.replace(self.paste_crawled_directory_name, '') if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ): diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 135ad0a7..ffbc5da9 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -96,6 +96,7 @@ class TorSplashCrawler(): yield SplashRequest( self.start_urls, self.parse, + errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_paste}, args={ 'html': 1, @@ -121,6 +122,9 @@ class TorSplashCrawler(): # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): + # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? + self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) + self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) @@ -129,10 +133,6 @@ class TorSplashCrawler(): if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste) - - # add onion screenshot history - self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date) #create paste metadata self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) @@ -170,6 +170,7 @@ class TorSplashCrawler(): yield SplashRequest( link.url, self.parse, + errback=self.errback_catcher, endpoint='render.json', meta={'father': relative_filename_paste}, args={ 'html': 1, @@ -179,10 +180,13 @@ class TorSplashCrawler(): 'wait': 10} #errback=self.errback_catcher ) - ''' + def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) + print('failure') + print(failure) + print(failure.request.meta['item']) #if isinstance(failure.value, HttpError): if failure.check(HttpError): @@ -196,14 +200,16 @@ class TorSplashCrawler(): # this is the original request request = failure.request print(DNSLookupError) + print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request + print('TimeoutError') print(TimeoutError) 
self.logger.error('TimeoutError on %s', request.url) - ''' + def save_crawled_paste(self, filename, content): diff --git a/pip3_packages_requirement.txt b/pip3_packages_requirement.txt index 53ec97e7..ddf60626 100644 --- a/pip3_packages_requirement.txt +++ b/pip3_packages_requirement.txt @@ -58,6 +58,9 @@ pycountry # To fetch Onion urls PySocks +#extract subject +newspaper3k + # decompress files sflock diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 2c0c7e4a..5e63374b 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -39,6 +39,23 @@ def get_date_range(num_day): return list(reversed(date_list)) +def unpack_paste_tags(p_tags): + l_tags = [] + for tag in p_tags: + complete_tag = tag + tag = tag.split('=') + if len(tag) > 1: + if tag[1] != '': + tag = tag[1][1:-1] + # no value + else: + tag = tag[0][1:-1] + # use for custom tags + else: + tag = tag[0] + l_tags.append( (tag, complete_tag) ) + return l_tags + def get_onion_status(domain, date): if r_serv_onion.sismember('onion_up:'+date , domain): return True @@ -76,43 +93,39 @@ def onion_domain(): # # TODO: FIXME return 404 last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check') + last_check = '{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8]) first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen') - domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent') - date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain)) + first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8]) + origin_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent') h = HiddenServices(onion_domain, 'onion') l_pastes = h.get_last_crawled_pastes() + if l_pastes: + status = True + else: + status = False screenshot = h.get_domain_random_screenshot(l_pastes) if screenshot: screenshot = screenshot[0] else: screenshot = 'None' + domain_tags = h.get_domain_tags() + + origin_paste_name = h.get_origin_paste_name() + origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste))) paste_tags = [] path_name = [] for path in l_pastes: - path_name.append(path.replace(PASTES_FOLDER, '')) + path_name.append(path.replace(PASTES_FOLDER+'/', '')) p_tags = r_serv_metadata.smembers('tag:'+path) - l_tags = [] - for tag in p_tags: - complete_tag = tag - tag = tag.split('=') - if len(tag) > 1: - if tag[1] != '': - tag = tag[1][1:-1] - # no value - else: - tag = tag[0][1:-1] - # use for custom tags - else: - tag = tag[0] - l_tags.append( (tag, complete_tag) ) - paste_tags.append(l_tags) + paste_tags.append(unpack_paste_tags(p_tags)) return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen, - l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label, - path_name=path_name, - domain_paste=domain_paste, screenshot=screenshot) + l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label, + path_name=path_name, origin_paste_tags=origin_paste_tags, status=status, + origin_paste=origin_paste, origin_paste_name=origin_paste_name, + domain_tags=domain_tags, screenshot=screenshot) # ============= JSON ============== @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) diff --git 
a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 29aa821c..b89388aa 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -36,35 +36,61 @@
    -
    -
    +
    - Graph -
    + {% if status %} +
    + + UP +
    + {% else %} +
    + + DOWN +
    + {% endif %} +

    {{ domain }} :

    +
      +
    • - - - - - - - - - - - - - - - - - - - -
      Domain{{ domain }}
      First Seen{{ first_seen }}
      Last Check{{ last_check }}
      Origin Paste - {{ domain_paste }} -
      -
    + + + + + + + + + + + + + +
    First SeenLast Check
    {{ first_seen }}{{ last_check }}
    + + +
  • + Origin Paste: {{ origin_paste_name }} +
    + {% for tag in origin_paste_tags %} + + {{ tag[0] }} + + {% endfor %} +
    +
    +
  • + +
    +
    +
    + {% for tag in domain_tags %} + + {{ tag }} {{ domain_tags[tag] }} + + {% endfor %} +
    +
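The domain tag summary shown above is filled by HiddenServices.get_all_pastes_domain(), which follows the paste_children sets from the crawl's parent paste and counts the tags of every child paste belonging to the domain. A simplified sketch of that traversal, written iteratively; the key names mirror the patch but the function itself is illustrative:

    def count_domain_tags(r_serv_metadata, domain, root_paste):
        # Walk paste_children:<paste> from the crawl's parent paste and count
        # the tags of every crawled child that belongs to this domain.
        tags = {}
        stack = [root_paste]
        while stack:
            father = stack.pop()
            for child in r_serv_metadata.smembers('paste_children:{}'.format(father)):
                if domain in child:
                    for tag in r_serv_metadata.smembers('tag:{}'.format(child)):
                        tags[tag] = tags.get(tag, 0) + 1
                    stack.append(child)
        return tags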
    From 0c63f2f24f399a2337dc3e3389128646744e2c91 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 17 Sep 2018 15:35:06 +0200 Subject: [PATCH 11/28] chg: [Crawler] catch server response --- bin/torcrawler/TorSplashCrawler.py | 130 +++++++++--------- var/www/modules/showpaste/Flask_showpaste.py | 10 ++ .../showpaste/templates/show_saved_paste.html | 6 +- 3 files changed, 83 insertions(+), 63 deletions(-) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index ffbc5da9..6673436b 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -38,6 +38,7 @@ class TorSplashCrawler(): }, 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', + 'HTTPERROR_ALLOW_ALL': True, 'DEPTH_LIMIT': crawler_depth_limit }) @@ -96,7 +97,7 @@ class TorSplashCrawler(): yield SplashRequest( self.start_urls, self.parse, - errback=self.errback_catcher, + #errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_paste}, args={ 'html': 1, @@ -109,84 +110,89 @@ class TorSplashCrawler(): def parse(self,response): #print(response.headers) #print(response.status) + print(' | ') + if response.status == 504: + # down ? + print('504 detected') + #elif response.status in in range(400, 600): + elif response.status != 200: + print('other: {}'.format(response.status)) + else: - # # TODO: # FIXME: - self.r_cache.setbit(response.url, 0, 1) - self.r_cache.expire(response.url, 360000) + UUID = self.domains[0]+str(uuid.uuid4()) + filename_paste = os.path.join(self.crawled_paste_filemame, UUID) + relative_filename_paste = os.path.join(self.crawler_path, UUID) + filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') - UUID = self.domains[0]+str(uuid.uuid4()) - filename_paste = os.path.join(self.crawled_paste_filemame, UUID) - relative_filename_paste = os.path.join(self.crawler_path, UUID) - filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') + # save new paste on disk + if self.save_crawled_paste(filename_paste, response.data['html']): - # save new paste on disk - if self.save_crawled_paste(filename_paste, response.data['html']): + # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? + self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) - # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
- self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) + self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) + self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) + self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) - self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) - self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) - self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) + # create onion metadata + if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): + self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) + self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) - # create onion metadata - if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) - self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) + #create paste metadata + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father']) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0]) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) - #create paste metadata - self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) - self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father']) - self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0]) - self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) + self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) - self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) + dirname = os.path.dirname(filename_screenshot) + if not os.path.exists(dirname): + os.makedirs(dirname) - dirname = os.path.dirname(filename_screenshot) - if not os.path.exists(dirname): - os.makedirs(dirname) + size_screenshot = (len(response.data['png'])*3) /4 - size_screenshot = (len(response.data['png'])*3) /4 + if size_screenshot < 5000000: #bytes + with open(filename_screenshot, 'wb') as f: + f.write(base64.standard_b64decode(response.data['png'].encode())) - if size_screenshot < 5000000: #bytes - with open(filename_screenshot, 'wb') as f: - f.write(base64.standard_b64decode(response.data['png'].encode())) + #interest = response.data['har']['log']['entries'][0]['response']['header'][0] + with open(filename_screenshot+'har.txt', 'wb') as f: + f.write(json.dumps(response.data['har']).encode()) - #interest = response.data['har']['log']['entries'][0]['response']['header'][0] - with open(filename_screenshot+'har.txt', 'wb') as f: - f.write(json.dumps(response.data['har']).encode()) + # save external links in set + lext = LinkExtractor(deny_domains=self.domains, unique=True) + for link in lext.extract_links(response): + self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) + self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, 
filename_paste), link.url) - # save external links in set - lext = LinkExtractor(deny_domains=self.domains, unique=True) - for link in lext.extract_links(response): - self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) - self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) - - #le = LinkExtractor(unique=True) - le = LinkExtractor(allow_domains=self.domains, unique=True) - for link in le.extract_links(response): - self.r_cache.setbit(link, 0, 0) - self.r_cache.expire(link, 360000) - yield SplashRequest( - link.url, - self.parse, - errback=self.errback_catcher, - endpoint='render.json', - meta={'father': relative_filename_paste}, - args={ 'html': 1, - 'png': 1, - 'render_all': 1, - 'har': 1, - 'wait': 10} - #errback=self.errback_catcher - ) + #le = LinkExtractor(unique=True) + le = LinkExtractor(allow_domains=self.domains, unique=True) + for link in le.extract_links(response): + self.r_cache.setbit(link, 0, 0) + self.r_cache.expire(link, 360000) + yield SplashRequest( + link.url, + self.parse, + #errback=self.errback_catcher, + endpoint='render.json', + meta={'father': relative_filename_paste}, + args={ 'html': 1, + 'png': 1, + 'render_all': 1, + 'har': 1, + 'wait': 10} + ) + ''' def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) print('failure') - print(failure) - print(failure.request.meta['item']) + #print(failure) + print(failure.type) + #print(failure.request.meta['item']) #if isinstance(failure.value, HttpError): if failure.check(HttpError): @@ -209,7 +215,7 @@ class TorSplashCrawler(): print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) - + ''' def save_crawled_paste(self, filename, content): diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index 40240591..e2780e2a 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -33,6 +33,7 @@ bootstrap_label = Flask_config.bootstrap_label misp_event_url = Flask_config.misp_event_url hive_case_url = Flask_config.hive_case_url vt_enabled = Flask_config.vt_enabled +PASTES_FOLDER = Flask_config.PASTES_FOLDER SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') @@ -40,6 +41,14 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa # ============ FUNCTIONS ============ def showpaste(content_range, requested_path): + if PASTES_FOLDER not in requested_path: + requested_path = os.path.join(PASTES_FOLDER, requested_path) + # remove old full path + #requested_path = requested_path.replace(PASTES_FOLDER, '') + # escape directory transversal + if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER: + return 'path transversal detected' + vt_enabled = Flask_config.vt_enabled paste = Paste.Paste(requested_path) @@ -173,6 +182,7 @@ def showpaste(content_range, requested_path): crawler_metadata = {} if 'infoleak:submission="crawler"' in l_tags: crawler_metadata['get_metadata'] = True + crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') crawler_metadata['external_links'] 
=r_serv_metadata.scard('paste_onion_external_links:'+requested_path) diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index 54ea99b5..6f0ccccc 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -435,9 +435,13 @@
    + + + + - + From f5b648d72a6aad836a36ec4cdc89f156d15eeb93 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 18 Sep 2018 11:03:40 +0200 Subject: [PATCH 12/28] pixelate paste screenshot --- bin/Crawler.py | 3 +- .../showpaste/templates/show_saved_paste.html | 63 ++++++++++++++++++- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 3660aa41..3e6e89aa 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -47,9 +47,8 @@ def crawl_onion(url, domain, date, date_month): print(process.stdout.read()) else: - r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain) - r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url) print(process.stdout.read()) + exit(0) else: ## FIXME: # TODO: relaunch docker exit(0) diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index 6f0ccccc..8db7cabd 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -423,11 +423,27 @@ {% if crawler_metadata['get_metadata'] %}
    - +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +
    +
    Graph @@ -574,5 +590,50 @@ {% endfor %} + From ce63d81878b762fb09345bcb94f9dc6c484025cc Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 18 Sep 2018 11:51:21 +0200 Subject: [PATCH 13/28] chg: [UI] pixelate crawled screenshot by default --- .../hiddenServices/templates/showDomain.html | 63 ++++++++++++++++++- .../showpaste/templates/show_saved_paste.html | 39 ++++++------ 2 files changed, 81 insertions(+), 21 deletions(-) diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index b89388aa..30b078fb 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -122,7 +122,22 @@
    - +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +
    @@ -144,6 +159,52 @@ }); + + diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index 8db7cabd..99b92a3f 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -351,7 +351,6 @@
    {% if duplicate_list|length == 0 %} -

    No Duplicate

    {% else %}

    Duplicate list:

    Domain{{ crawler_metadata['domain'] }}
    Father{{ crawler_metadata['paste_father'] }}{{ crawler_metadata['paste_father'] }}
    Source link
    @@ -422,28 +421,9 @@ {% if crawler_metadata['get_metadata'] %}
    -
    -
    -
    -
    -
    - -
    -
    - -
    -
    -
    -
    - -
    -
    Graph @@ -472,6 +452,25 @@
    + +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + +
    {% endif %} From 5b31b6e853fa70a5bd9a4bfc45cff8bb1891f8bf Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 18 Sep 2018 16:20:13 +0200 Subject: [PATCH 14/28] fix: [Crawler] save domain to crawl on splash error --- bin/Crawler.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 3e6e89aa..aeaf3ab3 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -18,7 +18,7 @@ from pubsublogger import publisher def signal_handler(sig, frame): sys.exit(0) -def crawl_onion(url, domain, date, date_month): +def crawl_onion(url, domain, date, date_month, message): #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') @@ -29,6 +29,12 @@ def crawl_onion(url, domain, date, date_month): r = requests.get(splash_url , timeout=30.0) except Exception: ## FIXME: # TODO: relaunch docker or send error message + + # send this msg back in the queue + if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): + r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) + r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) + print('--------------------------------------') print(' DOCKER SPLASH DOWN') exit(0) @@ -171,11 +177,11 @@ if __name__ == '__main__': if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain): - crawl_onion(url, domain, date, date_month) + crawl_onion(url, domain, date, date_month, message) if url != domain_url: print(url) print(domain_url) - crawl_onion(domain_url, domain, date, date_month) + crawl_onion(domain_url, domain, date, date_month, message) # save down onion if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain): From 912f2352ab3dec7bd91c3b92960332442fceca90 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 18 Sep 2018 16:58:49 +0200 Subject: [PATCH 15/28] fix: [crawled screenshot] fix img error --- .../hiddenServices/templates/showDomain.html | 17 +++++++++-------- .../showpaste/templates/show_saved_paste.html | 17 +++++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 30b078fb..655dbe31 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -166,16 +166,12 @@ ctx.webkitImageSmoothingEnabled = false; ctx.imageSmoothingEnabled = false; - /// wait until image is actually available img.onload = pixelate; + img.addEventListener("error", img_error); var draw_img = false; - img.onError = "img.onerror=null;img.src='{{ url_for('static', filename='image/AIL.png') }}';" - /// some image, we are not struck with CORS restrictions as we - /// do not use pixel buffer to pixelate, so any image will do img.src = "{{ url_for('showsavedpastes.screenshot', filename=screenshot) }}"; - /// MAIN function function pixelate() { /// use slider value @@ -195,14 +191,19 @@ /// draw original image to the scaled size ctx.drawImage(img, 0, 0, w, h); - /// then draw that scaled image thumb back to fill canvas - /// As smoothing is off the result will be pixelated + /// pixelated ctx.drawImage(canvas, 0, 0, w, h, 0, 0, canvas.width, canvas.height); } - /// event listeneners for slider and button 
blocks.addEventListener('change', pixelate, false); + + function img_error() { + img.onerror=null; + img.src="{{ url_for('static', filename='image/AIL.png') }}"; + blocks.value = 50; + pixelate; + } diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index 99b92a3f..72add804 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -596,16 +596,12 @@ var ctx = canvas.getContext('2d'), img = new Image(); ctx.webkitImageSmoothingEnabled = false; ctx.imageSmoothingEnabled = false; -/// wait until image is actually available img.onload = pixelate; +img.addEventListener("error", img_error); var draw_img = false; -img.onError = "img.onerror=null;img.src='{{ url_for('static', filename='image/AIL.png') }}';" -/// some image, we are not struck with CORS restrictions as we -/// do not use pixel buffer to pixelate, so any image will do img.src = "{{ url_for('showsavedpastes.screenshot', filename=crawler_metadata['screenshot']) }}"; -/// MAIN function function pixelate() { /// use slider value @@ -625,13 +621,18 @@ function pixelate() { /// draw original image to the scaled size ctx.drawImage(img, 0, 0, w, h); - /// then draw that scaled image thumb back to fill canvas - /// As smoothing is off the result will be pixelated + /// pixelated ctx.drawImage(canvas, 0, 0, w, h, 0, 0, canvas.width, canvas.height); } -/// event listeneners for slider and button +function img_error() { + img.onerror=null; + img.src="{{ url_for('static', filename='image/AIL.png') }}"; + blocks.value = 50; + pixelate; +} + blocks.addEventListener('change', pixelate, false); From 6edc1ddbeb3b13c56f1131c8a296a80602dc5056 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 21 Sep 2018 10:34:06 +0200 Subject: [PATCH 16/28] chg: [HiddenServices] get domain related to other domains --- bin/packages/HiddenServices.py | 35 +++++++++++++++++++ .../hiddenServices/Flask_hiddenServices.py | 12 ++++++- .../hiddenServices/templates/showDomain.html | 1 + 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index ca07bfd2..d515c955 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -111,6 +111,41 @@ class HiddenServices(object): l_crawled_pastes.extend(self.get_all_pastes_domain(children)) return l_crawled_pastes + def get_domain_son(self, l_paste): + if l_paste is None: + return None + + set_domain = set() + for paste in l_paste: + paste_full = paste.replace(self.paste_directory+'/', '') + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_full)) + ## TODO: # FIXME: remove me + paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(paste)) + paste_childrens = paste_childrens | paste_children + for children in paste_childrens: + if not self.domain in children: + print(children) + set_domain.add((children.split('.onion')[0]+'.onion').split('/')[-1]) + + return set_domain + + def get_all_domain_son(self, father): + if father is None: + return [] + l_crawled_pastes = [] + paste_parent = father.replace(self.paste_directory+'/', '') + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) + ## TODO: # FIXME: remove me + paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) + paste_childrens = paste_childrens | paste_children + for children in paste_childrens: + if not self.domain 
in children: + l_crawled_pastes.append(children) + #self.update_domain_tags(children) + l_crawled_pastes.extend(self.get_all_domain_son(children)) + + return l_crawled_pastes + def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1): l_screenshot_paste = [] for paste in l_crawled_pastes: diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 5e63374b..9613a7e5 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -89,7 +89,7 @@ def hiddenServices_page(): def onion_domain(): onion_domain = request.args.get('onion_domain') if onion_domain is None or not r_serv_onion.exists('onion_metadata:{}'.format(onion_domain)): - pass + return '404' # # TODO: FIXME return 404 last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check') @@ -127,6 +127,16 @@ def onion_domain(): origin_paste=origin_paste, origin_paste_name=origin_paste_name, domain_tags=domain_tags, screenshot=screenshot) +@hiddenServices.route("/hiddenServices/onion_son", methods=['GET']) +def onion_son(): + onion_domain = request.args.get('onion_domain') + + h = HiddenServices(onion_domain, 'onion') + l_pastes = h.get_last_crawled_pastes() + l_son = h.get_domain_son(l_pastes) + print(l_son) + return 'l_son' + # ============= JSON ============== @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) def domain_crawled_7days_json(): diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 655dbe31..1d58c4ba 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -81,6 +81,7 @@
    +
    From 50c81773e91f9718a63aabacc3a14c19bc960e15 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 24 Sep 2018 16:23:14 +0200 Subject: [PATCH 17/28] chg: [Crawler] add launcher and install --- bin/Crawler.py | 42 +++++++------- bin/LAUNCH.sh | 30 +++++++++- bin/Onion.py | 56 +++++++++++-------- bin/packages/config.cfg.sample | 7 +-- bin/torcrawler/TorSplashCrawler.py | 17 ++---- bin/torcrawler/launch_splash_crawler.sh | 38 +++++++++++++ bin/torcrawler/tor_crawler.py | 18 +++--- .../etc/splash/proxy-profiles/default.ini | 4 ++ crawler_hidden_services_install.sh | 10 ++++ crawler_requirements.txt | 2 + var/www/Flask_server.py | 5 ++ 11 files changed, 160 insertions(+), 69 deletions(-) create mode 100755 bin/torcrawler/launch_splash_crawler.sh create mode 100644 configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini create mode 100644 crawler_hidden_services_install.sh create mode 100644 crawler_requirements.txt diff --git a/bin/Crawler.py b/bin/Crawler.py index aeaf3ab3..1fdf0601 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -40,16 +40,13 @@ def crawl_onion(url, domain, date, date_month, message): exit(0) if r.status_code == 200: - process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father], + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father], stdout=subprocess.PIPE) while process.poll() is None: time.sleep(1) if process.returncode == 0: - if r_serv_metadata.exists('paste_children:'+paste): - msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste) - p.populate_set_out(msg, 'Tags') - + # onion up print(process.stdout.read()) else: @@ -59,14 +56,19 @@ def crawl_onion(url, domain, date, date_month, message): ## FIXME: # TODO: relaunch docker exit(0) + time.sleep(60) + if __name__ == '__main__': - if len(sys.argv) != 2: - print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)') + if len(sys.argv) != 3: + print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port') + print(sys.argv[1]) + print(sys.argv[2]) exit(1) type_hidden_service = sys.argv[1] + splash_port = sys.argv[2] publisher.port = 6380 publisher.channel = "Script" @@ -85,21 +87,19 @@ if __name__ == '__main__': if type_hidden_service == 'onion': regex_hidden_service = url_onion - splash_url = p.config.get("Crawler", "splash_url_onion") - http_proxy = p.config.get("Crawler", "http_proxy_onion") + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port) elif type_hidden_service == 'i2p': regex_hidden_service = url_i2p - splash_url = p.config.get("Crawler", "splash_url_i2p") - http_proxy = p.config.get("Crawler", "http_proxy_i2p") + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port) elif type_hidden_service == 'regular': regex_hidden_service = url_i2p - splash_url = p.config.get("Crawler", "splash_url_onion") - http_proxy = p.config.get("Crawler", "http_proxy_onion") + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port) else: print('incorrect crawler type: {}'.format(type_hidden_service)) exit(0) print(type_hidden_service) + print(splash_url) crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") @@ -129,8 +129,6 @@ if __name__ == '__main__': # Recovering the streamed message informations. 
http://eepsites.i2p message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) - #message = 'http://i2pwiki.i2p;test' - #message = 'http://i2host.i2p;test' # # FIXME: remove if message is None: @@ -186,13 +184,16 @@ if __name__ == '__main__': # save down onion if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain): r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain) - r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url) - r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1) + #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url) + #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1) if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)): r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date) r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date) else: - r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1) + #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1) + if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste): + msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste) + p.populate_set_out(msg, 'Tags') # last check r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) @@ -226,12 +227,13 @@ if __name__ == '__main__': r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain)) print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain))) + # update list, last crawled onions r_onion.lpush('last_{}'.format(type_hidden_service), domain) r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15) #send all crawled domain past - msg = domain - p.populate_set_out(msg, 'DomainSubject') + #msg = domain + #p.populate_set_out(msg, 'DomainSubject') #time.sleep(30) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index c3bfd8cf..9da28a81 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -27,6 +27,7 @@ islogged=`screen -ls | egrep '[0-9]+.Logging_AIL' | cut -d. -f1` isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1` isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1` isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1` +iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. 
-f1` function helptext { echo -e $YELLOW" @@ -198,6 +199,26 @@ function launching_scripts { } +function launching_crawler { + CONFIG=$AIL_BIN/packages/config.cfg + lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") + echo $lport + + IFS='-' read -ra PORTS <<< "$lport" + first_port=${PORTS[0]} + last_port=${PORTS[1]} + + screen -dmS "Crawler_AIL" + sleep 0.1 + + for ((i=first_port;i<=last_port;i++)); do + screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' + sleep 0.1 + done + + echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT +} + function shutting_down_redis { redis_dir=${AIL_HOME}/redis/src/ bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN' @@ -406,6 +427,9 @@ function launch_all { Flask) launch_flask; ;; + Crawler) + launching_crawler; + ;; Killall) killall; ;; @@ -427,13 +451,13 @@ function launch_all { while [ "$1" != "" ]; do case $1 in - -l | --launchAuto ) launch_all "automatic"; + -l | --launchAuto ) launch_all "automatic"; launching_crawler ;; -k | --killAll ) killall; ;; - -c | --configUpdate ) checking_configuration "manual"; + -t | --thirdpartyUpdate ) update_thirdparty; ;; - -t | --thirdpartyUpdate ) update_thirdparty; + -c | --crawler ) launching_crawler; ;; -h | --help ) helptext; exit diff --git a/bin/Onion.py b/bin/Onion.py index d77c010f..1f233fcf 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -113,6 +113,15 @@ if __name__ == "__main__": message = p.get_from_set() prec_filename = None + # send to crawler: + activate_crawler = p.config.get("Crawler", "activate_crawler") + if activate_crawler == 'True': + activate_crawler = True + print('Crawler enabled') + else: + activate_crawler = False + print('Crawler disabled') + # Thanks to Faup project for this regex # https://github.com/stricaud/faup url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" @@ -142,6 +151,7 @@ if __name__ == "__main__": domains_list.append(domain) urls.append(url) + ''' for x in PST.get_regex(i2p_regex): # Extracting url with regex url, s, credential, subdomain, domain, host, port, \ @@ -156,6 +166,7 @@ if __name__ == "__main__": r_onion.sadd('i2p_domain_crawler_queue', domain) msg = '{};{}'.format(url,PST.p_path) r_onion.sadd('i2p_crawler_queue', msg) + ''' # Saving the list of extracted onion domains. 
PST.__setattr__(channel, domains_list) @@ -176,32 +187,33 @@ if __name__ == "__main__": to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name) - ''' - for url in fetch(p, r_cache, urls, domains_list, path): - publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) - p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') - msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) - p.populate_set_out(msg, 'Tags') - ''' + if activate_crawler: + date_month = datetime.datetime.now().strftime("%Y%m") + date = datetime.datetime.now().strftime("%Y%m%d") + for url in urls: - date_month = datetime.datetime.now().strftime("%Y%m") - date = datetime.datetime.now().strftime("%Y%m%d") - for url in urls: + domain = re.findall(url_regex, url) + if len(domain) > 0: + domain = domain[0][4] + else: + continue - domain = re.findall(url_regex, url) - if len(domain) > 0: - domain = domain[0][4] - else: - continue + if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): + if not r_onion.sismember('onion_domain_crawler_queue', domain): + print('send to onion crawler') + r_onion.sadd('onion_domain_crawler_queue', domain) + msg = '{};{}'.format(url,PST.p_path) + r_onion.sadd('onion_crawler_queue', msg) + #p.populate_set_out(msg, 'Crawler') - if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): - if not r_onion.sismember('onion_domain_crawler_queue', domain): - print('send to onion crawler') - r_onion.sadd('onion_domain_crawler_queue', domain) - msg = '{};{}'.format(url,PST.p_path) - r_onion.sadd('onion_crawler_queue', msg) - #p.populate_set_out(msg, 'Crawler') + else: + for url in fetch(p, r_cache, urls, domains_list, path): + publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) + p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') + + msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) + p.populate_set_out(msg, 'Tags') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 85566654..5bb83d21 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -235,8 +235,7 @@ port = 6381 db = 0 [Crawler] +activate_crawler = True crawler_depth_limit = 1 -splash_url_onion = http://127.0.0.1:8050 -splash_url_i2p = http://127.0.0.1:8050 -http_proxy_onion = http://127.0.0.1:9050 -http_proxy_i2p = http://127.0.0.1:9050 +splash_url_onion = http://127.0.0.1 +splash_onion_port = 8050-8050 diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 6673436b..2c217474 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -26,7 +26,7 @@ from Helper import Process class TorSplashCrawler(): - def __init__(self, splash_url, http_proxy, crawler_depth_limit): + def __init__(self, splash_url, crawler_depth_limit): self.process = CrawlerProcess({'LOG_ENABLED': False}) self.crawler = Crawler(self.TorSplashSpider, { 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0', @@ -114,7 +114,6 @@ class TorSplashCrawler(): if response.status == 504: # down ? 
print('504 detected') - #elif response.status in in range(400, 600): elif response.status != 200: print('other: {}'.format(response.status)) else: @@ -128,7 +127,7 @@ class TorSplashCrawler(): if self.save_crawled_paste(filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? - self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) + #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) @@ -157,21 +156,17 @@ class TorSplashCrawler(): with open(filename_screenshot, 'wb') as f: f.write(base64.standard_b64decode(response.data['png'].encode())) - #interest = response.data['har']['log']['entries'][0]['response']['header'][0] with open(filename_screenshot+'har.txt', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set - lext = LinkExtractor(deny_domains=self.domains, unique=True) - for link in lext.extract_links(response): - self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) - self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) + #lext = LinkExtractor(deny_domains=self.domains, unique=True) + #for link in lext.extract_links(response): + # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) + # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) - #le = LinkExtractor(unique=True) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): - self.r_cache.setbit(link, 0, 0) - self.r_cache.expire(link, 360000) yield SplashRequest( link.url, self.parse, diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh new file mode 100755 index 00000000..562c2eb4 --- /dev/null +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +#usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; exit 1; } + +while getopts ":p:f:n:" o; do + case "${o}" in + p) + p=${OPTARG} + ;; + f) + f=${OPTARG} + ;; + n) + n=${OPTARG} + ;; + *) + usage + ;; + esac +done +shift $((OPTIND-1)) + +if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then + #usage + echo "usage" +fi + +first_port=$p +echo "usage0" +screen -dmS "Docker_Splash" +echo "usage1" +sleep 0.1 + +for ((i=0;i<=$((${n} - 1));i++)); do + port_number=$((${p} + $i)) + screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' + sleep 0.1 +done diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py index 99eb18c8..7881177c 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -8,8 +8,9 @@ from TorSplashCrawler import TorSplashCrawler if __name__ == '__main__': - if len(sys.argv) != 8: - print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father') + if len(sys.argv) != 7: + print(sys.argv) + print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father') exit(1) configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') @@ -22,14 +23,13 @@ if __name__ == '__main__': 
cfg.read(configfile) splash_url = sys.argv[1] - http_proxy = sys.argv[2] - type = sys.argv[3] + type = sys.argv[2] crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit") - url = sys.argv[4] - domain = sys.argv[5] - paste = sys.argv[6] - super_father = sys.argv[7] + url = sys.argv[3] + domain = sys.argv[4] + paste = sys.argv[5] + super_father = sys.argv[6] - crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit) + crawler = TorSplashCrawler(splash_url, crawler_depth_limit) crawler.crawl(type, url, domain, paste, super_father) diff --git a/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini new file mode 100644 index 00000000..63217c2a --- /dev/null +++ b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini @@ -0,0 +1,4 @@ +[proxy] +host=172.17.0.1 +port=9050 +type=SOCKS5 diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh new file mode 100644 index 00000000..2747ddb6 --- /dev/null +++ b/crawler_hidden_services_install.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# install docker +sudo apt install docker.io + +# pull splah docker +sudo docker pull scrapinghub/splash + +. ./AILENV/bin/activate +pip3 install -U -r pip3_packages_requirement.txt diff --git a/crawler_requirements.txt b/crawler_requirements.txt new file mode 100644 index 00000000..b0c096ac --- /dev/null +++ b/crawler_requirements.txt @@ -0,0 +1,2 @@ +scrapy +scrapy-splash diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 068bee65..9b7a93be 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -44,6 +44,11 @@ except IOError: f = open('templates/ignored_modules.txt', 'w') f.close() +activate_crawler = cfg.get("Crawler", "activate_crawler") +if activate_crawler != 'True': + toIgnoreModule.add('hiddenServices') + +print(toIgnoreModule) # Dynamically import routes and functions from modules # Also, prepare header.html From 8eca0e0778b8e37bf13d32df69e179c5695f4b39 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 24 Sep 2018 16:24:30 +0200 Subject: [PATCH 18/28] fix: [Crawler] clean --- bin/Crawler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 1fdf0601..6773fd37 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -56,8 +56,6 @@ def crawl_onion(url, domain, date, date_month, message): ## FIXME: # TODO: relaunch docker exit(0) - time.sleep(60) - if __name__ == '__main__': From 874824a5898ba350a662901ff3f167d636804761 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 24 Sep 2018 16:28:55 +0200 Subject: [PATCH 19/28] fix: [Crawler] clean --- bin/Crawler.py | 6 ++---- bin/torcrawler/tor_crawler.py | 1 - var/www/Flask_server.py | 2 -- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 6773fd37..fff85daf 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -61,8 +61,6 @@ if __name__ == '__main__': if len(sys.argv) != 3: print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port') - print(sys.argv[1]) - print(sys.argv[2]) exit(1) type_hidden_service = sys.argv[1] @@ -96,8 +94,8 @@ if __name__ == '__main__': print('incorrect crawler type: {}'.format(type_hidden_service)) exit(0) - print(type_hidden_service) - print(splash_url) + print('crawler type: {}'.format(type_hidden_service)) + print('splash url: {}'.format(splash_url)) crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") diff --git a/bin/torcrawler/tor_crawler.py 
b/bin/torcrawler/tor_crawler.py index 7881177c..58e8331b 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -9,7 +9,6 @@ from TorSplashCrawler import TorSplashCrawler if __name__ == '__main__': if len(sys.argv) != 7: - print(sys.argv) print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father') exit(1) diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 9b7a93be..b67c46ff 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -48,8 +48,6 @@ activate_crawler = cfg.get("Crawler", "activate_crawler") if activate_crawler != 'True': toIgnoreModule.add('hiddenServices') -print(toIgnoreModule) - # Dynamically import routes and functions from modules # Also, prepare header.html to_add_to_header_dico = {} From c49e871ba85b1d7464943c5a21b918b7a7cc3f79 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 26 Sep 2018 16:34:27 +0200 Subject: [PATCH 20/28] chg: [crawler] add infos --- bin/Crawler.py | 12 +++++------- bin/torcrawler/launch_splash_crawler.sh | 15 ++++++++++----- crawler_hidden_services_install.sh | 9 +++++++-- 3 files changed, 22 insertions(+), 14 deletions(-) mode change 100644 => 100755 crawler_hidden_services_install.sh diff --git a/bin/Crawler.py b/bin/Crawler.py index fff85daf..9642436c 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -36,7 +36,8 @@ def crawl_onion(url, domain, date, date_month, message): r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) print('--------------------------------------') - print(' DOCKER SPLASH DOWN') + print(' \033[91m DOCKER SPLASH DOWN\033[0m') + print(' {} DOWN'.format(splash_url)) exit(0) if r.status_code == 200: @@ -94,7 +95,6 @@ if __name__ == '__main__': print('incorrect crawler type: {}'.format(type_hidden_service)) exit(0) - print('crawler type: {}'.format(type_hidden_service)) print('splash url: {}'.format(splash_url)) crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") @@ -132,13 +132,11 @@ if __name__ == '__main__': message = r_onion.spop('mess_onion') if message is not None: - print(message) splitted = message.split(';') if len(splitted) == 2: url, paste = splitted paste = paste.replace(PASTES_FOLDER+'/', '') - print(paste) ''' if not '.onion' in url: print('not onion') @@ -157,9 +155,9 @@ if __name__ == '__main__': domain_url = 'http://{}'.format(domain) - print('------------------START CRAWLER------------------') - print(type_hidden_service) - print('-------------------------------------------------') + print('\033[92m------------------START CRAWLER------------------\033[0m') + print('crawler type: {}'.format(type_hidden_service)) + print('\033[92m-------------------------------------------------\033[0m') print('url: {}'.format(url)) print('domain: {}'.format(domain)) print('domain_url: {}'.format(domain_url)) diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh index 562c2eb4..37963e93 100755 --- a/bin/torcrawler/launch_splash_crawler.sh +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -1,6 +1,14 @@ #!/bin/bash -#usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; exit 1; } +usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; + echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)" + echo " -p: number of the first splash server port number. 
This number is incremented for the others splash server" + echo " -n: number of splash servers to start" + echo "" + echo "example:" + echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3" + exit 1; + } while getopts ":p:f:n:" o; do case "${o}" in @@ -25,14 +33,11 @@ if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then echo "usage" fi -first_port=$p -echo "usage0" screen -dmS "Docker_Splash" -echo "usage1" sleep 0.1 for ((i=0;i<=$((${n} - 1));i++)); do port_number=$((${p} + $i)) - screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' + screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' sleep 0.1 done diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh old mode 100644 new mode 100755 index 2747ddb6..3cdb96bd --- a/crawler_hidden_services_install.sh +++ b/crawler_hidden_services_install.sh @@ -1,7 +1,12 @@ #!/bin/bash -# install docker -sudo apt install docker.io +read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r +echo # (optional) move to a new line +if [[ $REPLY =~ ^[Yy]$ ]] +then + # install docker + sudo apt install docker.io +fi # pull splah docker sudo docker pull scrapinghub/splash From 04b9d9fc1d65d1758200e2a9ac1a52db9ee88569 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 27 Sep 2018 11:14:29 +0200 Subject: [PATCH 21/28] chg: [Crawler] add docs --- HOWTO.md | 28 +++++++++++ bin/LAUNCH.sh | 33 +++++++------ bin/packages/config.cfg.sample | 2 +- bin/torcrawler/launch_splash_crawler.sh | 15 +++--- crawler_hidden_services_install.sh | 63 ++++++++++++++++++++----- 5 files changed, 107 insertions(+), 34 deletions(-) diff --git a/HOWTO.md b/HOWTO.md index 1a66402b..50fad074 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -96,3 +96,31 @@ In AIL, you can track terms, set of terms and even regexes without creating a de - You can track a term by simply putting it in the box. - You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\` - You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email address having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`. + + +Crawler +--------------------- +In AIL, you can crawl hidden services. 
+ +two types of configutation [explaination for what]: + 1) use local Splash dockers (use the same host for Splash servers and AIL) + 2) use remote Splash servers + +- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirement (type ``y`` if a localhost splah server is used) +- (Splash host) Setup your tor proxy[is already installed]: + - Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` + (for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform) + - Restart the tor proxy: ``sudo service tor restart`` + +- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` + all the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. + +- (AIL host) Edit the ``/bin/packages/config.cfg`` file: + - In the crawler section, set ``activate_crawler`` to ``True`` + - Change the IP address of Splash servers if needed (remote only) + - Set ``splash_onion_port`` according to your Splash servers port numbers who are using the tor proxy. those ports numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for 8050,8051,8052 ports). + +- (AIL host) launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c`` + + + diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index a7c0631d..3b17a4a6 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -201,23 +201,28 @@ function launching_scripts { } function launching_crawler { - CONFIG=$AIL_BIN/packages/config.cfg - lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") - echo $lport + CONFIG=$AIL_BIN/packages/config.cfg + lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") - IFS='-' read -ra PORTS <<< "$lport" - first_port=${PORTS[0]} - last_port=${PORTS[1]} + IFS='-' read -ra PORTS <<< "$lport" + if [ ${#PORTS[@]} -eq 1 ] + then + first_port=${PORTS[0]} + last_port=${PORTS[0]} + else + first_port=${PORTS[0]} + last_port=${PORTS[1]} + fi - screen -dmS "Crawler_AIL" - sleep 0.1 + screen -dmS "Crawler_AIL" + sleep 0.1 - for ((i=first_port;i<=last_port;i++)); do - screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' - sleep 0.1 - done + for ((i=first_port;i<=last_port;i++)); do + screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' + sleep 0.1 + done - echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT + echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT } function shutting_down_redis { @@ -465,7 +470,7 @@ function launch_all { while [ "$1" != "" ]; do case $1 in - -l | --launchAuto ) launch_all "automatic"; launching_crawler + -l | --launchAuto ) launch_all "automatic"; ;; -k | --killAll ) killall; ;; diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 64b1f7f6..c30fa071 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -240,4 +240,4 @@ db = 0 activate_crawler = True crawler_depth_limit = 1 splash_url_onion = http://127.0.0.1 -splash_onion_port = 8050-8050 +splash_onion_port = 8050-8052 diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh index 37963e93..e78656ab 100755 --- a/bin/torcrawler/launch_splash_crawler.sh +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -1,12 
+1,12 @@ #!/bin/bash usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; - echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)" - echo " -p: number of the first splash server port number. This number is incremented for the others splash server" - echo " -n: number of splash servers to start" - echo "" - echo "example:" - echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3" + echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"; + echo " -p: number of the first splash server port number. This number is incremented for the others splash server"; + echo " -n: number of splash servers to start"; + echo ""; + echo "example:"; + echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"; exit 1; } @@ -29,8 +29,7 @@ done shift $((OPTIND-1)) if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then - #usage - echo "usage" + usage; fi screen -dmS "Docker_Splash" diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh index 3cdb96bd..3fbccb74 100755 --- a/crawler_hidden_services_install.sh +++ b/crawler_hidden_services_install.sh @@ -1,15 +1,56 @@ #!/bin/bash -read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r -echo # (optional) move to a new line -if [[ $REPLY =~ ^[Yy]$ ]] -then +install_docker() { # install docker - sudo apt install docker.io + sudo apt install docker.io; + + # pull splah docker + sudo docker pull scrapinghub/splash; +} + +install_python_requirement() { + . ./AILENV/bin/activate; + pip3 install -U -r crawler_requirements.txt; +} + +install_all() { + read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r + echo # (optional) move to a new line + if [[ $REPLY =~ ^[Yy]$ ]] + then + install_docker; + fi + install_python_requirement; +} + +usage() { + echo "Usage: crawler_hidden_services_install.sh [-y | -n]" 1>&2; + echo " -y: install docker" + echo " -n: don't install docker" + echo "" + echo "example:" + echo "crawler_hidden_services_install.sh -y" + exit 1; +} + +if [[ $1 == "" ]]; then + install_all; + exit; +else + key="$1" + case $key in + "") + install_all; + ;; + -y|--yes) + install_docker; + install_python_requirement; + ;; + -n|--no) + install_python_requirement; + ;; + *) # unknown option + usage; + ;; + esac fi - -# pull splah docker -sudo docker pull scrapinghub/splash - -. ./AILENV/bin/activate -pip3 install -U -r pip3_packages_requirement.txt From e357dce59b0824ed09eeb1c5b376fc1d0ed6e56b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 27 Sep 2018 15:43:03 +0200 Subject: [PATCH 22/28] fix: [Crawler] detect splash connection to proxy error --- HOWTO.md | 19 +++++++------ bin/Crawler.py | 37 +++++++++++++++++-------- bin/torcrawler/TorSplashCrawler.py | 8 ++++-- bin/torcrawler/launch_splash_crawler.sh | 1 + 4 files changed, 43 insertions(+), 22 deletions(-) diff --git a/HOWTO.md b/HOWTO.md index 50fad074..5b5bc92a 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -102,20 +102,23 @@ Crawler --------------------- In AIL, you can crawl hidden services. -two types of configutation [explaination for what]: - 1) use local Splash dockers (use the same host for Splash servers and AIL) - 2) use remote Splash servers +There is two type of installation. You can install a *local* or a *remote* Splash server. 
If you install a local Splash server, the Splash and AIL host are the same. -- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirement (type ``y`` if a localhost splah server is used) -- (Splash host) Setup your tor proxy[is already installed]: +Install/Configure and launch all crawler scripts: + +- *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirement (type ``y`` if a localhost splah server is used or use ``-y`` option) + +- *(Splash host)* Install/Setup your tor proxy: + - Install the tor proxy: ``sudo apt-get install tor -y`` + (The tor proxy is installed by default in AIL. If you use the same host for the Splash server, you don't need to intall it) - Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` (for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform) - Restart the tor proxy: ``sudo service tor restart`` -- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` - all the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. +- *(Splash host)* Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` + All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. -- (AIL host) Edit the ``/bin/packages/config.cfg`` file: +- *(AIL host)* Edit the ``/bin/packages/config.cfg`` file: - In the crawler section, set ``activate_crawler`` to ``True`` - Change the IP address of Splash servers if needed (remote only) - Set ``splash_onion_port`` according to your Splash servers port numbers who are using the tor proxy. those ports numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for 8050,8051,8052 ports). 
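The configuration described above ties three pieces together: ``splash_onion_port`` in ``config.cfg`` (a single port or a port range), ``launch_splash_crawler.sh`` (one Splash docker per port) and ``LAUNCH.sh`` (one ``Crawler.py`` process per port). The short Python sketch below illustrates, outside of AIL, how such a port value can be expanded and how each Splash instance can be probed before crawling; it is a standalone illustration under those assumptions, not part of the patch, and the helper names are invented for the example.

```
#!/usr/bin/env python3
# Standalone sketch: expand the splash_onion_port value and probe each Splash
# docker, mirroring what LAUNCH.sh (awk on the port range) and Crawler.py
# (requests.get on the splash URL) do. Helper names are illustrative only.

import configparser
import requests

def expand_port_range(port_value):
    # "8050" -> [8050]; "8050-8052" -> [8050, 8051, 8052]
    if '-' in port_value:
        first, last = port_value.split('-', 1)
        return list(range(int(first), int(last) + 1))
    return [int(port_value)]

def check_splash(base_url, port, timeout=30.0):
    splash_url = '{}:{}'.format(base_url, port)
    try:
        r = requests.get(splash_url, timeout=timeout)
    except requests.exceptions.RequestException:
        return splash_url, False
    return splash_url, r.status_code == 200

if __name__ == '__main__':
    cfg = configparser.ConfigParser()
    cfg.read('bin/packages/config.cfg')  # adjust to the location of your config.cfg
    base_url = cfg.get('Crawler', 'splash_url_onion')                   # e.g. http://127.0.0.1
    ports = expand_port_range(cfg.get('Crawler', 'splash_onion_port'))  # e.g. 8050-8052
    for port in ports:
        url, up = check_splash(base_url, port)
        print('{} {}'.format(url, 'UP' if up else 'DOWN'))
```

``Crawler.py`` itself performs an equivalent ``requests.get(splash_url, timeout=30.0)`` check and, as the next hunk shows, puts the domain and its message back into the crawler queue when the Splash docker or the tor proxy is unreachable.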
diff --git a/bin/Crawler.py b/bin/Crawler.py index 9642436c..30d3ffb2 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -18,6 +18,12 @@ from pubsublogger import publisher def signal_handler(sig, frame): sys.exit(0) +def on_error_send_message_back_in_queue(type_hidden_service, domain, message): + # send this msg back in the queue + if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): + r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) + r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) + def crawl_onion(url, domain, date, date_month, message): #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): @@ -30,15 +36,11 @@ def crawl_onion(url, domain, date, date_month, message): except Exception: ## FIXME: # TODO: relaunch docker or send error message - # send this msg back in the queue - if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): - r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) - r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) - + on_error_send_message_back_in_queue(type_hidden_service, domain, message) print('--------------------------------------') print(' \033[91m DOCKER SPLASH DOWN\033[0m') print(' {} DOWN'.format(splash_url)) - exit(0) + exit(1) if r.status_code == 200: process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father], @@ -47,15 +49,26 @@ def crawl_onion(url, domain, date, date_month, message): time.sleep(1) if process.returncode == 0: - # onion up - print(process.stdout.read()) - + output = process.stdout.read().decode() + print(output) + # error: splash:Connection to proxy refused + if 'Connection to proxy refused' in output: + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + print('------------------------------------------------------------------------') + print(' \033[91m SPLASH: Connection to proxy refused') + print('') + print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url)) + print('------------------------------------------------------------------------') + exit(-2) else: print(process.stdout.read()) - exit(0) + exit(-1) else: - ## FIXME: # TODO: relaunch docker - exit(0) + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + print('--------------------------------------') + print(' \033[91m DOCKER SPLASH DOWN\033[0m') + print(' {} DOWN'.format(splash_url)) + exit(1) if __name__ == '__main__': diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 2c217474..59060ba3 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -110,12 +110,16 @@ class TorSplashCrawler(): def parse(self,response): #print(response.headers) #print(response.status) - print(' | ') if response.status == 504: # down ? 
print('504 detected') elif response.status != 200: - print('other: {}'.format(response.status)) + #print('other: {}'.format(response.status)) + #print(error_log) + #detect connection to proxy refused + error_log = (json.loads(response.body.decode())) + if(error_log['info']['text'] == 'Connection to proxy refused'): + print('Connection to proxy refused') else: UUID = self.domains[0]+str(uuid.uuid4()) diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh index e78656ab..5f3f9020 100755 --- a/bin/torcrawler/launch_splash_crawler.sh +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -39,4 +39,5 @@ for ((i=0;i<=$((${n} - 1));i++)); do port_number=$((${p} + $i)) screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' sleep 0.1 + echo " Splash server launched on port $port_number" done From ecb28571519748e4528198f4bb8e3395cd840d6b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 27 Sep 2018 16:47:48 +0200 Subject: [PATCH 23/28] chg: [Crawler] css + limit splash RAM --- bin/LAUNCH.sh | 40 ++++++++++--------- bin/torcrawler/TorSplashCrawler.py | 2 +- bin/torcrawler/launch_splash_crawler.sh | 2 +- .../templates/hiddenServices.html | 1 + .../hiddenServices/templates/showDomain.html | 1 + var/www/modules/showpaste/Flask_showpaste.py | 1 - .../showpaste/templates/show_saved_paste.html | 8 +--- 7 files changed, 28 insertions(+), 27 deletions(-) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 3b17a4a6..684af83b 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -201,28 +201,32 @@ function launching_scripts { } function launching_crawler { - CONFIG=$AIL_BIN/packages/config.cfg - lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") + if [[ ! $iscrawler ]]; then + CONFIG=$AIL_BIN/packages/config.cfg + lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") - IFS='-' read -ra PORTS <<< "$lport" - if [ ${#PORTS[@]} -eq 1 ] - then - first_port=${PORTS[0]} - last_port=${PORTS[0]} - else - first_port=${PORTS[0]} - last_port=${PORTS[1]} - fi + IFS='-' read -ra PORTS <<< "$lport" + if [ ${#PORTS[@]} -eq 1 ] + then + first_port=${PORTS[0]} + last_port=${PORTS[0]} + else + first_port=${PORTS[0]} + last_port=${PORTS[1]} + fi - screen -dmS "Crawler_AIL" - sleep 0.1 - - for ((i=first_port;i<=last_port;i++)); do - screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' + screen -dmS "Crawler_AIL" sleep 0.1 - done - echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT + for ((i=first_port;i<=last_port;i++)); do + screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' + sleep 0.1 + done + + echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT + else + echo -e $RED"\t* A screen is already launched"$DEFAULT + fi } function shutting_down_redis { diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 59060ba3..47486dd9 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -114,7 +114,7 @@ class TorSplashCrawler(): # down ? 
print('504 detected') elif response.status != 200: - #print('other: {}'.format(response.status)) + print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh index 5f3f9020..412022c1 100755 --- a/bin/torcrawler/launch_splash_crawler.sh +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -37,7 +37,7 @@ sleep 0.1 for ((i=0;i<=$((${n} - 1));i++)); do port_number=$((${p} + $i)) - screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' + screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' sleep 0.1 echo " Splash server launched on port $port_number" done diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html index bbc66ace..1784aa72 100644 --- a/var/www/modules/hiddenServices/templates/hiddenServices.html +++ b/var/www/modules/hiddenServices/templates/hiddenServices.html @@ -6,6 +6,7 @@ Hidden Service - AIL + diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 1d58c4ba..50b3c631 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -6,6 +6,7 @@ Show Domain - AIL + diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index e4fc3cfd..f79239a3 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -186,7 +186,6 @@ def showpaste(content_range, requested_path): crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') - crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path) crawler_metadata['screenshot'] = paste.get_p_rel_path() else: crawler_metadata['get_metadata'] = False diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index 72add804..440c82e2 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -424,9 +424,9 @@
    -
    +
    - Graph + Crawled Paste
    @@ -443,10 +443,6 @@ - - - -
    Source link {{ crawler_metadata['real_link'] }}
    External links{{ crawler_metadata['external_links'] }}
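With the ``External links`` row and its ``scard`` lookup removed, the show-paste view only needs the ``domain``, ``father`` and ``real_link`` fields stored under ``paste_metadata:<paste>`` (plus the screenshot path) for a paste tagged ``infoleak:submission="crawler"``. The sketch below shows that lookup as a standalone helper; the ARDB connection parameters and the function name are assumptions made for the example, not code from the patch.

```
#!/usr/bin/env python3
# Standalone sketch of the crawler-metadata lookup kept by this patch:
# only domain, father and real_link are read from the paste_metadata hash.
# Connection parameters and the helper name are illustrative assumptions.

import redis

r_serv_metadata = redis.StrictRedis(host='localhost', port=6382, db=8,
                                    decode_responses=True)

def get_crawler_metadata(paste_path):
    key = 'paste_metadata:' + paste_path
    return {
        'domain': r_serv_metadata.hget(key, 'domain'),
        'paste_father': r_serv_metadata.hget(key, 'father'),
        'real_link': r_serv_metadata.hget(key, 'real_link'),
    }

if __name__ == '__main__':
    # hypothetical path of a crawled paste, for illustration only
    print(get_crawler_metadata('crawled/2018/09/18/example.onion/some-uuid'))
```

Keeping the per-paste crawler metadata in a single hash means the view needs only three ``HGET`` calls per crawled paste, and dropping the external-links counter removes the extra ``SCARD`` on ``paste_onion_external_links`` as well.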
    From 7734ed6632fd4d72b01883e11f3fe10840319df2 Mon Sep 17 00:00:00 2001 From: Sami Mokaddem Date: Fri, 28 Sep 2018 10:44:57 +0200 Subject: [PATCH 24/28] Update HOWTO.md --- HOWTO.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HOWTO.md b/HOWTO.md index 5b5bc92a..afbd326f 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -84,9 +84,9 @@ You can navigate into the interface by using arrow keys. In order to perform an To change list, you can press the key. -Also, you can quickly stop or start modules by clicking on the or symbol respectively. These are located in the _Action_ column. +Also, you can quickly stop or start modules by clicking on the ```` or ```` symbol respectively. These are located in the _Action_ column. -Finally, you can quit this program by pressing either or +Finally, you can quit this program by pressing either ```` or ````. Terms frequency usage From 5f18f6946205fef50bd1afd6d9b346f2b0a7898e Mon Sep 17 00:00:00 2001 From: Sami Mokaddem Date: Fri, 28 Sep 2018 11:32:08 +0200 Subject: [PATCH 25/28] Update HOWTO.md --- HOWTO.md | 54 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/HOWTO.md b/HOWTO.md index afbd326f..6e8d09b2 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -102,28 +102,46 @@ Crawler --------------------- In AIL, you can crawl hidden services. -There is two type of installation. You can install a *local* or a *remote* Splash server. If you install a local Splash server, the Splash and AIL host are the same. +There are two types of installation. You can install a *local* or a *remote* Splash server. +``(Splash host) = the server running the splash service`` +``(AIL host) = the server running AIL`` -Install/Configure and launch all crawler scripts: +### Installation/Configuration -- *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirement (type ``y`` if a localhost splah server is used or use ``-y`` option) +1. *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost splah server is used or use the ``-y`` option) -- *(Splash host)* Install/Setup your tor proxy: - - Install the tor proxy: ``sudo apt-get install tor -y`` - (The tor proxy is installed by default in AIL. If you use the same host for the Splash server, you don't need to intall it) - - Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` - (for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform) - - Restart the tor proxy: ``sudo service tor restart`` +2. *(Splash host)* To install and setup your tor proxy: + - Install the tor proxy: ``sudo apt-get install tor -y`` + (Not required if ``Splah host == AIL host`` - The tor proxy is installed by default in AIL) + - Add the following line ``SOCKSPolicy accept 172.17.0.0/16`` in ``/etc/tor/torrc`` + (for a linux docker, the localhost IP is *172.17.0.1*; Should be adapted for other platform) + - Restart the tor proxy: ``sudo service tor restart`` -- *(Splash host)* Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` - All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. 
- -- *(AIL host)* Edit the ``/bin/packages/config.cfg`` file: - - In the crawler section, set ``activate_crawler`` to ``True`` - - Change the IP address of Splash servers if needed (remote only) - - Set ``splash_onion_port`` according to your Splash servers port numbers who are using the tor proxy. those ports numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for 8050,8051,8052 ports). - -- (AIL host) launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c`` +3. *(AIL host)* Edit the ``/bin/packages/config.cfg`` file: + - In the crawler section, set ``activate_crawler`` to ``True`` + - Change the IP address of Splash servers if needed (remote only) + - Set ``splash_onion_port`` according to your Splash servers port numbers that will be used. + those ports numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for 8050,8051,8052 ports). +### Starting the scripts +- *(Splash host)* Launch all Splash servers with: +```sudo ./bin/torcrawler/launch_splash_crawler.sh -f -p -n ``` +With ```` and ```` matching those specified at ``splash_onion_port`` in the configuration file of point 3 (``/bin/packages/config.cfg``) + +All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. + +- (AIL host) launch all AIL crawler scripts using: +```./bin/LAUNCH.sh -c``` + + +### TL;DR - Local setup +#### Installation +- ```crawler_hidden_services_install.sh -y``` +- Add the following line in ``SOCKSPolicy accept 172.17.0.0/16`` in ``/etc/tor/torrc`` +- ```sudo service tor restart``` +- set activate_crawler to True in ``/bin/packages/config.cfg`` +#### Start +- ```sudo ./bin/torcrawler/launch_splash_crawler.sh -f $AIL_HOME/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 1";``` +- ```./bin/LAUNCH.sh -c``` From 82e6df4b94d7d5c5ee6f819d78da4574ce37eafd Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 28 Sep 2018 15:23:27 +0200 Subject: [PATCH 26/28] chg: [Crawler] domains stats + logs + clean --- bin/Crawler.py | 29 ++---------- pip3_packages_requirement.txt | 3 -- .../hiddenServices/Flask_hiddenServices.py | 9 +++- .../templates/hiddenServices.html | 45 ++++++++++++++++++- var/www/modules/showpaste/Flask_showpaste.py | 1 - 5 files changed, 55 insertions(+), 32 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index 30d3ffb2..9ebff043 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -14,10 +14,6 @@ sys.path.append(os.environ['AIL_BIN']) from Helper import Process from pubsublogger import publisher - -def signal_handler(sig, frame): - sys.exit(0) - def on_error_send_message_back_in_queue(type_hidden_service, domain, message): # send this msg back in the queue if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): @@ -34,9 +30,10 @@ def crawl_onion(url, domain, date, date_month, message): try: r = requests.get(splash_url , timeout=30.0) except Exception: - ## FIXME: # TODO: relaunch docker or send error message + # TODO: relaunch docker or send error message on_error_send_message_back_in_queue(type_hidden_service, domain, message) + publisher.error('{} SPASH DOWN'.format(splash_url)) print('--------------------------------------') print(' \033[91m DOCKER SPLASH DOWN\033[0m') print(' {} DOWN'.format(splash_url)) @@ -54,6 +51,7 @@ def crawl_onion(url, domain, date, date_month, message): # error: splash:Connection to proxy refused if 'Connection to proxy refused' 
in output: on_error_send_message_back_in_queue(type_hidden_service, domain, message) + publisher.error('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url)) print('------------------------------------------------------------------------') print(' \033[91m SPLASH: Connection to proxy refused') print('') @@ -114,8 +112,6 @@ if __name__ == '__main__': PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) - #signal.signal(signal.SIGINT, signal_handler) - r_serv_metadata = redis.StrictRedis( host=p.config.get("ARDB_Metadata", "host"), port=p.config.getint("ARDB_Metadata", "port"), @@ -136,26 +132,15 @@ if __name__ == '__main__': while True: - # Recovering the streamed message informations. http://eepsites.i2p + # Recovering the streamed message informations. message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) - # # FIXME: remove - if message is None: - print('get ardb message') - message = r_onion.spop('mess_onion') - if message is not None: splitted = message.split(';') if len(splitted) == 2: url, paste = splitted paste = paste.replace(PASTES_FOLDER+'/', '') - ''' - if not '.onion' in url: - print('not onion') - continue - ''' - url_list = re.findall(regex_hidden_service, url)[0] if url_list[1] == '': @@ -238,12 +223,6 @@ if __name__ == '__main__': r_onion.lpush('last_{}'.format(type_hidden_service), domain) r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15) - #send all crawled domain past - #msg = domain - #p.populate_set_out(msg, 'DomainSubject') - - #time.sleep(30) - else: continue else: diff --git a/pip3_packages_requirement.txt b/pip3_packages_requirement.txt index cc1d0543..dd447d5c 100644 --- a/pip3_packages_requirement.txt +++ b/pip3_packages_requirement.txt @@ -58,9 +58,6 @@ pycountry # To fetch Onion urls PySocks -#extract subject -newspaper3k - # decompress files sflock diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 9613a7e5..64c23f65 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -68,6 +68,13 @@ def hiddenServices_page(): last_onions = r_serv_onion.lrange('last_onion', 0 ,-1) list_onion = [] + now = datetime.datetime.now() + date = '{}{}{}'.format(now.strftime("%Y"), now.strftime("%m"), now.strftime("%d")) + statDomains = {} + statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date)) + statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date)) + statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down'] + for onion in last_onions: metadata_onion = {} metadata_onion['domain'] = onion @@ -83,7 +90,7 @@ def hiddenServices_page(): metadata_onion['status_icon'] = 'fa-times-circle' list_onion.append(metadata_onion) - return render_template("hiddenServices.html", last_onions=list_onion) + return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains) @hiddenServices.route("/hiddenServices/onion_domain", methods=['GET']) def onion_domain(): diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html index 1784aa72..292a70d9 100644 --- a/var/www/modules/hiddenServices/templates/hiddenServices.html +++ b/var/www/modules/hiddenServices/templates/hiddenServices.html @@ -80,10 +80,50 @@
    - +
    +
    +
    + Domains Crawled Today +
    + + + + + + + + + + + + + + + + +
    +
    + + Domains UP +
    +
    +
    + {{ statDomains['domains_up'] }} +
    +
    +
    + + Domains DOWN +
    +
    +
    + {{ statDomains['domains_down'] }} +
    +
    + Crawled Domains {{ statDomains['total'] }}
    +
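The "Domains Crawled Today" panel above renders the statDomains values computed in Flask_hiddenServices.py earlier in this same patch. The following is a minimal standalone sketch of that computation, assuming a reachable ARDB/Redis instance with placeholder connection parameters (the real module reads host, port and db from config.cfg) and assuming the crawler records each checked domain in the date-keyed onion_up:/onion_down: sets.

```python
import datetime
import redis

# Placeholder connection values for the sketch; not the ones read from config.cfg.
r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=9,
                                 decode_responses=True)

def get_daily_domain_stats(date=None):
    # Count today's crawled domains from the date-keyed sets filled by the crawler.
    if date is None:
        date = datetime.datetime.now().strftime("%Y%m%d")
    stats = {}
    stats['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date))
    stats['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date))
    stats['total'] = stats['domains_up'] + stats['domains_down']
    return stats
```

Two SCARD calls per day keep the page cheap even as the number of crawled domains grows, since set cardinality lookups are O(1) in Redis.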
    @@ -125,7 +165,8 @@ function create_line_chart(id, url){ var line = d3.line() .x(function(d) { return x(d.date); - }).y(function(d) { + }) + .y(function(d) { return y(d.value); }); var svg_line = d3.select('#'+id).append('svg') diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index f79239a3..4912e7b0 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -5,7 +5,6 @@ Flask functions and routes for the trending modules page ''' import redis -import os import json import os import flask From b3a6dc8487490dbe7406fdb131b139a7a07e2529 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 28 Sep 2018 15:42:06 +0200 Subject: [PATCH 27/28] fix: [Crawler] remove hardcoded url --- var/www/modules/hiddenServices/Flask_hiddenServices.py | 3 ++- var/www/modules/hiddenServices/templates/hiddenServices.html | 2 +- var/www/modules/hiddenServices/templates/showDomain.html | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 64c23f65..c504d1d3 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -18,6 +18,7 @@ import Flask_config app = Flask_config.app cfg = Flask_config.cfg +baseUrl = Flask_config.baseUrl r_serv_onion = Flask_config.r_serv_onion r_serv_metadata = Flask_config.r_serv_metadata bootstrap_label = Flask_config.bootstrap_label @@ -164,4 +165,4 @@ def domain_crawled_7days_json(): return jsonify(json_domain_stats) # ========= REGISTRATION ========= -app.register_blueprint(hiddenServices) +app.register_blueprint(hiddenServices, url_prefix=baseUrl) diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html index 292a70d9..1839c5ef 100644 --- a/var/www/modules/hiddenServices/templates/hiddenServices.html +++ b/var/www/modules/hiddenServices/templates/hiddenServices.html @@ -136,7 +136,7 @@ $(document).ready(function(){ activePage = "page-hiddenServices" $("#"+activePage).addClass("active"); - all_graph.line_chart = create_line_chart('graph_line', '/hiddenServices/domain_crawled_7days_json?type=onion'); + all_graph.line_chart = create_line_chart('graph_line', "{{ url_for('hiddenServices.domain_crawled_7days_json') }}?type=onion"); }); $(window).on("resize", function() { all_graph.onResize(); diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html index 50b3c631..dd6b2056 100644 --- a/var/www/modules/hiddenServices/templates/showDomain.html +++ b/var/www/modules/hiddenServices/templates/showDomain.html @@ -16,8 +16,8 @@ - - + +
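The last commit shown here ([PATCH 27/28]) removes hardcoded '/hiddenServices/...' paths in favour of url_for() and registers the blueprint under a configurable base URL. A minimal sketch of that pattern follows; the '/ail' prefix and the stubbed JSON endpoint are placeholders (baseUrl comes from Flask_config, and the real view presumably aggregates the last seven days of crawl results from ARDB).

```python
from flask import Flask, Blueprint, jsonify, url_for

app = Flask(__name__)
baseUrl = '/ail'  # placeholder; in AIL this value is taken from Flask_config

hiddenServices = Blueprint('hiddenServices', __name__)

@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
def domain_crawled_7days_json():
    # Stub payload; the real endpoint returns per-day up/down counts.
    return jsonify([])

# Registering with url_prefix lets templates resolve the path with url_for()
# instead of hardcoding '/hiddenServices/...'.
app.register_blueprint(hiddenServices, url_prefix=baseUrl)

with app.test_request_context():
    print(url_for('hiddenServices.domain_crawled_7days_json'))
    # -> /ail/hiddenServices/domain_crawled_7days_json
```

Because the prefix is applied at registration time, the url_for() call added to hiddenServices.html picks up the correct path automatically, whatever baseUrl is set to.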