diff --git a/.gitignore b/.gitignore
index e74906ae..b5755ee6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ var/www/submitted
 # Local config
 bin/packages/config.cfg
 configs/keys
+files
 
 # installed files
 nltk_data/
diff --git a/bin/Crawler.py b/bin/Crawler.py
index a8292b74..df1e0117 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -8,6 +8,7 @@ import redis
 import datetime
 import time
 import subprocess
+import requests
 
 sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
@@ -17,31 +18,40 @@ from pubsublogger import publisher
 
 def signal_handler(sig, frame):
     sys.exit(0)
 
-def crawl_onion(url, domain):
-    date = datetime.datetime.now().strftime("%Y%m%d")
+def crawl_onion(url, domain, date):
 
-    if not r_onion.sismember('onion_up:'+date , domain):
+    if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain):
+    #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
         super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
         if super_father is None:
             super_father=paste
 
-        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
-                                   stdout=subprocess.PIPE)
-        while process.poll() is None:
-            time.sleep(1)
+        try:
+            r = requests.get(splash_url , timeout=0.010)
+        except Exception:
+            ## FIXME: # TODO: relaunch docker
+            exit(0)
 
-        if process.returncode == 0:
-            if r_serv_metadata.exists('paste_children:'+paste):
-                msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
-                p.populate_set_out(msg, 'Tags')
-            print(process.stdout.read())
+        if r.status_code == 200:
+            process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+                                       stdout=subprocess.PIPE)
+            while process.poll() is None:
+                time.sleep(1)
 
-            r_onion.sadd('onion_up:'+date , domain)
-            r_onion.sadd('onion_up_link:'+date , url)
+            if process.returncode == 0:
+                if r_serv_metadata.exists('paste_children:'+paste):
+                    msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+                    p.populate_set_out(msg, 'Tags')
+
+                print(process.stdout.read())
+
+            else:
+                r_onion.sadd('onion_down:'+date , domain)
+                r_onion.sadd('onion_down_link:'+date , url)
+                print(process.stdout.read())
         else:
-            r_onion.sadd('onion_down:'+date , domain)
-            r_onion.sadd('onion_down_link:'+date , url)
-            print(process.stdout.read())
+            ## FIXME: # TODO: relaunch docker
+            exit(0)
 
 
 if __name__ == '__main__':
@@ -102,15 +112,51 @@ if __name__ == '__main__':
 
                 domain_url = 'http://{}'.format(domain)
 
-                print('------------------START ONIOM CRAWLER------------------')
+                print('------------------START ONION CRAWLER------------------')
                 print('url: {}'.format(url))
                 print('domain: {}'.format(domain))
                 print('domain_url: {}'.format(domain_url))
 
-                crawl_onion(url, domain)
-                if url != domain_url:
-                    crawl_onion(domain_url, domain)
+                if not r_onion.sismember('banned_onion', domain):
 
+                    date = datetime.datetime.now().strftime("%Y%m%d")
+
+                    crawl_onion(url, domain, date)
+                    if url != domain_url:
+                        crawl_onion(domain_url, domain, date)
+
+                    # save down onion
+                    if not r_onion.sismember('onion_up:'+date , domain):
+                        r_onion.sadd('onion_down:'+date , domain)
+                        r_onion.sadd('onion_down_link:'+date , url)
+                        r_onion.hincrby('onion_link_down', url, 1)
+                        if not r_onion.exists('onion_metadata:{}'.format(domain)):
+                            r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
+                            r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
+                    else:
+                        r_onion.hincrby('onion_link_up', url, 1)
+
+                    # last check
+                    r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+
+                    # check external onion links (full_scrawl)
+                    external_domains = set()
+                    for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
+                        print(link)
+                        external_domain = re.findall(url_regex, link)
+                        print(external_domain)
+                        if len(external_domain) > 0:
+                            external_domain = external_domain[0][4]
+                        else:
+                            continue
+                        print(external_domain)
+                        # # TODO: add i2p
+                        if '.onion' in external_domain and external_domain != domain:
+                            external_domains.add(external_domain)
+                    if len(external_domains) >= 10:
+                        r_onion.sadd('onion_potential_source', domain)
+                    r_onion.delete('domain_onion_external_links:{}'.format(domain))
+                    print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
             else:
                 continue
         else:
diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py
index d1e3f0d3..45ed1ed2 100755
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@@ -94,6 +94,7 @@ class Paste(object):
 
         var = self.p_path.split('/')
         self.p_date = Date(var[-4], var[-3], var[-2])
+        self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
         self.p_source = var[-5]
         self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])
 
@@ -291,6 +292,9 @@ class Paste(object):
         else:
             return '[]'
 
+    def get_p_rel_path(self):
+        return self.p_rel_path
+
     def save_all_attributes_redis(self, key=None):
         """
         Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)
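For context, a standalone sketch (not part of the patch; the path is made up) of what the new `Paste.p_rel_path` attribute ends up holding, following the `source/year/month/day/name` layout already assumed by `Paste.p_date` above:

```python
import os

# Hypothetical crawled-paste path under the AIL PASTES directory.
p_path = '/opt/AIL/PASTES/crawled/2018/08/10/example.onion-uuid4'
p_name = os.path.basename(p_path)

# Same expression as in Paste.__init__: year/month/day/name.
var = p_path.split('/')
p_rel_path = os.path.join(var[-4], var[-3], var[-2], p_name)

print(p_rel_path)  # 2018/08/10/example.onion-uuid4
```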
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 63839799..3d392b93 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -10,6 +10,10 @@ import datetime
 import base64
 import redis
 
+from scrapy.spidermiddlewares.httperror import HttpError
+from twisted.internet.error import DNSLookupError
+from twisted.internet.error import TimeoutError
+
 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
 from scrapy.crawler import CrawlerProcess, Crawler
@@ -79,6 +83,8 @@ class TorSplashCrawler():
                 db=self.p.config.getint("ARDB_Onion", "db"),
                 decode_responses=True)
 
+            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )
+
             self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                             self.p.config.get("Directories", "crawled"), date )
 
@@ -89,7 +95,7 @@ class TorSplashCrawler():
                 self.start_urls,
                 self.parse,
                 endpoint='render.json',
-                meta={'parent': self.original_paste},
+                meta={'father': self.original_paste},
                 args={  'html': 1,
                         'wait': 10,
                         'render_all': 1,
@@ -106,44 +112,47 @@ class TorSplashCrawler():
 
                 UUID = self.domains[0]+str(uuid.uuid4())
                 filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                 filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
 
                 # save new paste on disk
                 if self.save_crawled_paste(filename_paste, response.data['html']):
 
+                    self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
+                    self.r_serv_onion.sadd('full_onion_up', self.domains[0])
+
                     # create onion metadata
-                    if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])):
-                        self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date)
-                        self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date)
+                    if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
+                        self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
+                        self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
 
                     # add onion screenshot history
-                    self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date)
+                    self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
 
                     #create paste metadata
                     self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
-                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
+                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
                     self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
                     self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
 
-                    self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
+                    self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
 
                     dirname = os.path.dirname(filename_screenshot)
                     if not os.path.exists(dirname):
                         os.makedirs(dirname)
 
-                    print(sys.getsizeof(response.data['png']))
-                    print(sys.getsizeof(response.data['html']))
-                    print(self.domains[0])
+                    size_screenshot = (len(response.data['png'])*3) /4
+                    print(size_screenshot)
 
-
-
-                    with open(filename_screenshot, 'wb') as f:
-                        f.write(base64.standard_b64decode(response.data['png'].encode()))
+                    if size_screenshot < 5000000: #bytes
+                        with open(filename_screenshot, 'wb') as f:
+                            f.write(base64.standard_b64decode(response.data['png'].encode()))
 
                     # save external links in set
                     lext = LinkExtractor(deny_domains=self.domains, unique=True)
                     for link in lext.extract_links(response):
-                        self.r_serv_metadata.sadd('paste_crawler:filename_paste', link)
+                        self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
+                        self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
 
                     #le = LinkExtractor(unique=True)
                     le = LinkExtractor(allow_domains=self.domains, unique=True)
@@ -154,12 +163,38 @@ class TorSplashCrawler():
                             link.url,
                             self.parse,
                             endpoint='render.json',
-                            meta={'parent': UUID},
+                            meta={'father': relative_filename_paste},
                             args={  'html': 1,
                                     'png': 1,
                                     'render_all': 1,
                                     'wait': 10}
+                            #errback=self.errback_catcher
                         )
 
+        '''
+        def errback_catcher(self, failure):
+            # catch all errback failures,
+            self.logger.error(repr(failure))
+
+            #if isinstance(failure.value, HttpError):
+            if failure.check(HttpError):
+                # you can get the response
+                response = failure.value.response
+                print('HttpError')
+                self.logger.error('HttpError on %s', response.url)
+
+            #elif isinstance(failure.value, DNSLookupError):
+            elif failure.check(DNSLookupError):
+                # this is the original request
+                request = failure.request
+                print(DNSLookupError)
+                self.logger.error('DNSLookupError on %s', request.url)
+
+            #elif isinstance(failure.value, TimeoutError):
+            elif failure.check(TimeoutError):
+                request = failure.request
+                print(TimeoutError)
+                self.logger.error('TimeoutError on %s', request.url)
+        '''
 
         def save_crawled_paste(self, filename, content):
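One note on the screenshot guard added above: base64 encodes every 3 bytes of binary as 4 characters, so `(len(b64) * 3) / 4` approximates the decoded size in bytes without actually decoding the PNG. A quick standalone check of that arithmetic (sizes are invented):

```python
import base64

raw = b'\x00' * 6000000                        # pretend 6 MB PNG payload
b64 = base64.standard_b64encode(raw).decode()

size_screenshot = (len(b64) * 3) / 4           # same formula as the patch
print(size_screenshot)                         # 6000000.0 -> above the 5000000 limit, so it would be skipped
```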
port=cfg.getint("ARDB_Onion", "port"), + db=cfg.getint("ARDB_Onion", "db"), + decode_responses=True) + sys.path.append('../../configs/keys') # MISP # @@ -144,4 +150,6 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted') +SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs")) diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index cc70527c..6fa5a983 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -5,9 +5,10 @@ Flask functions and routes for the trending modules page ''' import redis +import os import json import flask -from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response +from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory import difflib import ssdeep @@ -22,12 +23,14 @@ r_serv_pasteName = Flask_config.r_serv_pasteName r_serv_metadata = Flask_config.r_serv_metadata r_serv_tags = Flask_config.r_serv_tags r_serv_statistics = Flask_config.r_serv_statistics +r_serv_onion = Flask_config.r_serv_onion max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal DiffMaxLineLength = Flask_config.DiffMaxLineLength bootstrap_label = Flask_config.bootstrap_label misp_event_url = Flask_config.misp_event_url hive_case_url = Flask_config.hive_case_url +SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') @@ -130,6 +133,16 @@ def showpaste(content_range): list_tags.append( (tag, automatic, tag_status_tp, tag_status_fp) ) + crawler_metadata = {} + if 'infoleak:submission="crawler"' in l_tags: + crawler_metadata['get_metadata'] = True + crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') + crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') + crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path) + crawler_metadata['screenshot'] = paste.get_p_rel_path() + else: + crawler_metadata['get_metadata'] = False + if Flask_config.pymisp is False: misp = False else: @@ -157,6 +170,7 @@ def showpaste(content_range): hive_url = hive_case_url.replace('id_here', hive_case) return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list, + crawler_metadata=crawler_metadata, misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url) # ============ ROUTES ============ @@ -202,5 +216,9 @@ def showDiff(): the_html = htmlD.make_file(lines1, lines2) return the_html +@showsavedpastes.route('/screenshot/') +def screenshot(filename): + return send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True) + # ========= REGISTRATION ========= app.register_blueprint(showsavedpastes) diff --git 
a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index cb3f8b68..866f64c1 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -373,6 +373,42 @@ {% endif %} + + {% if crawler_metadata['get_metadata'] %} +
+      <div class="row">
+        <div class="col-md-7">
+          <div class="panel panel-default">
+            <div class="panel-heading">Graph</div>
+            <table class="table table-hover table-striped">
+              <tbody>
+                <tr>
+                  <td>Father</td><td>{{ crawler_metadata['paste_father'] }}</td>
+                </tr>
+                <tr>
+                  <td>Source link</td><td>{{ crawler_metadata['real_link'] }}</td>
+                </tr>
+                <tr>
+                  <td>External links</td><td>{{ crawler_metadata['external_links'] }}</td>
+                </tr>
+              </tbody>
+            </table>
+          </div>
+        </div>
+        <div class="col-md-5">
+          <img src="{{ url_for('showsavedpastes.screenshot', filename=crawler_metadata['screenshot']) }}" class="img-responsive" />
+        </div>
+      </div>
+    {% endif %}
+

     Content:
 
     [Raw content]
 
     {{ content }}
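To tie the Flask side and the template together, here is a rough illustration of the `crawler_metadata` dict handed to `show_saved_paste.html` and of the file the new screenshot route would serve. The keys mirror the ones set in `Flask_showpaste.py`, while all values and paths are invented for the example:

```python
import os

# Hypothetical values; keys match those populated in showpaste().
crawler_metadata = {
    'get_metadata': True,
    'paste_father': 'crawled/2018/08/10/parent-paste',    # paste_metadata 'father' field
    'real_link': 'http://example.onion/',                 # URL actually fetched by the crawler
    'external_links': 3,                                  # scard('paste_onion_external_links:...')
    'screenshot': '2018/08/10/example.onion-uuid4',       # paste.get_p_rel_path()
}

# The template requests /screenshot/<p_rel_path>; the route appends '.png'
# and serves the file from SCREENSHOT_FOLDER (assumed location shown here).
SCREENSHOT_FOLDER = '/opt/AIL/crawled_screenshot'
print(os.path.join(SCREENSHOT_FOLDER, crawler_metadata['screenshot'] + '.png'))
```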