diff --git a/bin/Crawler.py b/bin/Crawler.py
new file mode 100755
index 00000000..92d43a81
--- /dev/null
+++ b/bin/Crawler.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import redis
+import datetime
+import time
+import subprocess
+
+sys.path.append(os.environ['AIL_BIN'])
+from Helper import Process
+from pubsublogger import publisher
+
+
+def signal_handler(sig, frame):
+    sys.exit(0)
+
+if __name__ == '__main__':
+
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    publisher.info("Script Crawler started")
+
+    config_section = 'Crawler'
+
+    # Setup the I/O queues
+    p = Process(config_section)
+
+    splash_url = p.config.get("Crawler", "splash_url")
+    http_proxy = p.config.get("Crawler", "http_proxy")
+    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+
+    #signal.signal(signal.SIGINT, signal_handler)
+
+    r_serv_metadata = redis.StrictRedis(
+        host=p.config.get("ARDB_Metadata", "host"),
+        port=p.config.getint("ARDB_Metadata", "port"),
+        db=p.config.getint("ARDB_Metadata", "db"),
+        decode_responses=True)
+
+    r_cache = redis.StrictRedis(
+        host=p.config.get("Redis_Cache", "host"),
+        port=p.config.getint("Redis_Cache", "port"),
+        db=p.config.getint("Redis_Cache", "db"),
+        decode_responses=True)
+
+    r_onion = redis.StrictRedis(
+        host=p.config.get("ARDB_Onion", "host"),
+        port=p.config.getint("ARDB_Onion", "port"),
+        db=p.config.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
+    while True:
+
+        message = p.get_from_set()
+        # Recover the streamed message information.
+        if message is not None:
+            splitted = message.split(';')
+            if len(splitted) == 2:
+                url, paste = splitted
+
+                print(url)
+
+                if not r_cache.exists(url):
+                    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
+                    if super_father is None:
+                        super_father = paste
+
+                    process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
+                                               stdout=subprocess.PIPE)
+                    while process.poll() is None:
+                        time.sleep(1)
+
+                    date = datetime.datetime.now().strftime("%Y%m%d")
+                    print(date)
+                    url_domain = url.replace('http://', '')
+                    if process.returncode == 0:
+                        if r_serv_metadata.exists('paste_children:'+paste):
+                            msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+                            p.populate_set_out(msg, 'Tags')
+
+                        r_onion.sadd('onion_up:'+date, url_domain)
+                    else:
+                        r_onion.sadd('onion_down:'+date, url_domain)
+                        print(process.stdout.read())
+
+            else:
+                continue
+        else:
+            time.sleep(1)
diff --git a/bin/Onion.py b/bin/Onion.py
index 277f1c71..dbedf1e1 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -21,7 +21,6 @@ Requirements
 *Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.
 
""" -import pprint import time from packages import Paste from pubsublogger import publisher @@ -123,6 +122,7 @@ if __name__ == "__main__": PST = Paste.Paste(filename) for x in PST.get_regex(url_regex): + print(x) # Extracting url with regex url, s, credential, subdomain, domain, host, port, \ resource_path, query_string, f1, f2, f3, f4 = x @@ -149,12 +149,18 @@ if __name__ == "__main__": to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name) + ''' for url in fetch(p, r_cache, urls, domains_list, path): publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) p.populate_set_out(msg, 'Tags') + ''' + for url in urls: + msg = '{};{}'.format(url,PST.p_path) + print('send to crawler') + p.populate_set_out(msg, 'Crawler') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 2ed662c1..62ea0887 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -3,6 +3,8 @@ bloomfilters = Blooms dicofilters = Dicos pastes = PASTES base64 = BASE64 +crawled = crawled +crawled_screenshot = CRAWLED_SCREENSHOT wordtrending_csv = var/www/static/csv/wordstrendingdata wordsfile = files/wordfile @@ -171,6 +173,11 @@ host = localhost port = 6382 db = 8 +[ARDB_Onion] +host = localhost +port = 6382 +db = 9 + [Url] cc_critical = DE @@ -215,3 +222,8 @@ channel = FetchedOnion host = localhost port = 6381 db = 0 + +[Crawler] +crawler_depth_limit = 1 +splash_url = http://127.0.0.1:8050 +http_proxy = http://127.0.0.1:9050 diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 452850f7..d8acf2dc 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags [Onion] subscribe = Redis_Onion -publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags +publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler #publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler [DumpValidOnion] @@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags [submit_paste] subscribe = Redis publish = Redis_Mixer + +[Crawler] +subscribe = Redis_Crawler +publish = Redis_Mixer,Redis_Tags + diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py new file mode 100644 index 00000000..ace36056 --- /dev/null +++ b/bin/torcrawler/TorSplashCrawler.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import gzip +import base64 +import uuid +import datetime +import base64 +import redis +from urllib.parse import urlparse + +from scrapy import Spider +from scrapy.linkextractors import LinkExtractor +from scrapy.crawler import CrawlerProcess, Crawler + +from twisted.internet import reactor + +from scrapy_splash import SplashRequest + +sys.path.append(os.environ['AIL_BIN']) +from Helper import Process + +class TorSplashCrawler(): + + def __init__(self, splash_url, http_proxy, crawler_depth_limit): + self.process = CrawlerProcess({'LOG_ENABLED': False}) + self.crawler = Crawler(self.TorSplashSpider, { + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0', + 'SPLASH_URL': splash_url, + 'HTTP_PROXY': http_proxy, + 'ROBOTSTXT_OBEY': False, + 'DOWNLOADER_MIDDLEWARES': 
+            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
+                                       'scrapy_splash.SplashMiddleware': 725,
+                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+                                       },
+            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
+            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
+            'DEPTH_LIMIT': crawler_depth_limit
+            })
+
+    def crawl(self, url, original_paste, super_father):
+        self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
+        self.process.start()
+
+    class TorSplashSpider(Spider):
+        name = 'TorSplashSpider'
+
+        def __init__(self, url, original_paste, super_father, *args, **kwargs):
+            self.original_paste = original_paste
+            self.super_father = super_father
+            self.start_urls = url
+            self.domains = [urlparse(url).netloc]
+            date = datetime.datetime.now().strftime("%Y/%m/%d")
+
+            config_section = 'Crawler'
+            self.p = Process(config_section)
+
+            self.r_cache = redis.StrictRedis(
+                host=self.p.config.get("Redis_Cache", "host"),
+                port=self.p.config.getint("Redis_Cache", "port"),
+                db=self.p.config.getint("Redis_Cache", "db"),
+                decode_responses=True)
+
+            self.r_serv_log_submit = redis.StrictRedis(
+                host=self.p.config.get("Redis_Log_submit", "host"),
+                port=self.p.config.getint("Redis_Log_submit", "port"),
+                db=self.p.config.getint("Redis_Log_submit", "db"),
+                decode_responses=True)
+
+            self.r_serv_metadata = redis.StrictRedis(
+                host=self.p.config.get("ARDB_Metadata", "host"),
+                port=self.p.config.getint("ARDB_Metadata", "port"),
+                db=self.p.config.getint("ARDB_Metadata", "db"),
+                decode_responses=True)
+
+            self.crawled_paste_filename = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
+                                                       self.p.config.get("Directories", "crawled"), date)
+
+            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date)
+
+        def start_requests(self):
+            yield SplashRequest(
+                self.start_urls,
+                self.parse,
+                endpoint='render.json',
+                meta={'parent': self.original_paste},
+                args={ 'html': 1,
+                       'wait': 10,
+                       'render_all': 1,
+                       'png': 1}
+                )
+
+        def parse(self, response):
+            print(response.headers)
+            print(response.status)
+
+            self.r_cache.setbit(response.url, 0, 1)
+            self.r_cache.expire(response.url, 360000)
+
+            UUID = self.domains[0]+str(uuid.uuid4())
+            filename_paste = os.path.join(self.crawled_paste_filename, UUID)
+            filename_screenshot = os.path.join(self.crawled_screenshot, UUID + '.png')
+
+            # save new paste on disk
+            if self.save_crawled_paste(filename_paste, response.data['html']):
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+
+                self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
+
+                dirname = os.path.dirname(filename_screenshot)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                with open(filename_screenshot, 'wb') as f:
+                    f.write(base64.standard_b64decode(response.data['png'].encode()))
+
+                # save external links in set
+                lext = LinkExtractor(deny_domains=self.domains, unique=True)
+                for link in lext.extract_links(response):
+                    self.r_serv_metadata.sadd('paste_crawler:'+filename_paste, link.url)
+
+                #le = LinkExtractor(unique=True)
+                le = LinkExtractor(allow_domains=self.domains, unique=True)
+                for link in le.extract_links(response):
+                    self.r_cache.setbit(link.url, 0, 0)
+                    self.r_cache.expire(link.url, 360000)
+                    yield SplashRequest(
+                        link.url,
+                        self.parse,
+                        endpoint='render.json',
+                        meta={'parent': UUID},
+                        args={ 'html': 1,
+                               'png': 1,
+                               'render_all': 1,
+                               'wait': 10}
+                        )
+
+        def save_crawled_paste(self, filename, content):
+
+            print(filename)
+            if os.path.isfile(filename):
+                print('File: {} already exists in submitted pastes'.format(filename))
+                return False
+
+            try:
+                gzipencoded = gzip.compress(content.encode())
+                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
+            except:
+                print("file error: {}".format(filename))
+                return False
+
+            # send paste to Global
+            relay_message = "{0} {1}".format(filename, gzip64encoded)
+            self.p.populate_set_out(relay_message, 'Mixer')
+
+            # increase nb of paste by feeder name
+            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
+
+            # tag crawled paste
+            msg = 'infoleak:submission="crawler";{}'.format(filename)
+            self.p.populate_set_out(msg, 'Tags')
+            return True
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
new file mode 100755
index 00000000..3085f213
--- /dev/null
+++ b/bin/torcrawler/tor_crawler.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import configparser
+from TorSplashCrawler import TorSplashCrawler
+
+if __name__ == '__main__':
+
+    if len(sys.argv) != 4:
+        print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
+        exit(1)
+
+    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+    if not os.path.exists(configfile):
+        raise Exception('Unable to find the configuration file. \
+                        Did you set the environment variables \
+                        and activate the virtualenv?')
+
+    cfg = configparser.ConfigParser()
+    cfg.read(configfile)
+
+    splash_url = cfg.get("Crawler", "splash_url")
+    http_proxy = cfg.get("Crawler", "http_proxy")
+    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
+
+    url = sys.argv[1]
+    paste = sys.argv[2]
+    super_father = sys.argv[3]
+
+    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
+    crawler.crawl(url, paste, super_father)
diff --git a/etc/splash/proxy-profiles/default.ini b/etc/splash/proxy-profiles/default.ini
new file mode 100644
index 00000000..91208135
--- /dev/null
+++ b/etc/splash/proxy-profiles/default.ini
@@ -0,0 +1,4 @@
+[proxy]
+host=localhost
+port=9050
+type=SOCKS5
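
Testing note (not part of the patch): the new queue can be exercised by hand without waiting for the Onion module to fire. The minimal sketch below pushes a single url;paste message to the Crawler module, assuming the Helper.Process API and the message format used by Onion.py in this diff, an AIL_BIN environment variable pointing at bin/, and placeholder values for the URL and the originating paste path.

#!/usr/bin/env python3
# Hypothetical test snippet, not part of this patch: enqueue one onion URL for
# bin/Crawler.py, mirroring what Onion.py now publishes on Redis_Crawler.
import os
import sys

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

p = Process('Onion')  # the Onion module is the producer declared in modules.cfg
# Message format consumed by bin/Crawler.py: "<url>;<originating paste path>"
msg = '{};{}'.format('http://example.onion', 'path/to/originating_paste')  # placeholders
p.populate_set_out(msg, 'Crawler')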