chg: [Onion] add onion splash crawler

2024-11-22 22:27:17 +00:00 · 2018-08-09 17:42:21 +02:00 · 2018-08-09 17:42:21 +02:00 · 8b1c10b38c
commit 8b1c10b38c
parent 54cc4f3723
7 changed files with 319 additions and 2 deletions
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@ -0,0 +1,92 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 import os
 import sys
 import redis
 import datetime
 import time
 import subprocess
 sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher
 def signal_handler(sig, frame):
    """Stop the process with exit status 0.

    Has the (signal number, stack frame) signature of a signal handler;
    both arguments are ignored.
    """
    raise SystemExit(0)
 if __name__ == '__main__':
    # Crawler module entry point: consume "url;paste" messages from the
    # module queue, run tor_crawler.py on every URL that is not already in
    # the cache, and record per day whether the crawl succeeded.
    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script Crawler started")
    config_section = 'Crawler'
    # Setup the I/O queues
    p = Process(config_section)
    # These options are read but not used below (tor_crawler.py reads them
    # again itself); reading them here makes a missing option fail at startup.
    splash_url = p.config.get("Crawler", "splash_url")
    http_proxy = p.config.get("Crawler", "http_proxy")
    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
    #signal.signal(signal.SIGINT, signal_handler)
    r_serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)
    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)
    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)
    while True:
        message = p.get_from_set()
        if message is None:
            # Nothing queued: wait a second before polling again.
            time.sleep(1)
            continue
        # A usable message is exactly "<url>;<paste>"; anything else is dropped.
        splitted = message.split(';')
        if len(splitted) != 2:
            continue
        url, paste = splitted
        print(url)
        # Skip URLs that already have a key in the cache.
        if r_cache.exists(url):
            continue
        # A paste without a recorded super_father is its own super_father.
        super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
        if super_father is None:
            super_father = paste
        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
                                   stdout=subprocess.PIPE)
        # communicate() drains stdout while waiting for the child to exit.
        # Polling without reading the pipe would block both processes once
        # the child has written more than the OS pipe buffer can hold.
        crawler_output, _ = process.communicate()
        date = datetime.datetime.now().strftime("%Y%m%d")
        print(date)
        url_domain = url.replace('http://', '')
        if process.returncode == 0:
            # Tag the source paste only if the crawl attached children to it.
            if r_serv_metadata.exists('paste_children:'+paste):
                msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
                p.populate_set_out(msg, 'Tags')
            r_onion.sadd('onion_up:'+date, url_domain)
        else:
            r_onion.sadd('onion_down:'+date, url_domain)
            # Show what the failed crawler printed, to help debugging.
            print(crawler_output)
--- a/bin/Onion.py
+++ b/bin/Onion.py
@ -21,7 +21,6 @@ Requirements
 *Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.
 """
 import pprint
 import time
 from packages import Paste
 from pubsublogger import publisher
@ -123,6 +122,7 @@ if __name__ == "__main__":
                PST = Paste.Paste(filename)
                for x in PST.get_regex(url_regex):
                    print(x)
                    # Extracting url with regex
                    url, s, credential, subdomain, domain, host, port, \
                        resource_path, query_string, f1, f2, f3, f4 = x
@ -149,12 +149,18 @@ if __name__ == "__main__":
                    to_print = 'Onion;{};{};{};'.format(PST.p_source,
                                                        PST.p_date,
                                                        PST.p_name)
                    '''
                    for url in fetch(p, r_cache, urls, domains_list, path):
                        publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
                        p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
                        msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
                        p.populate_set_out(msg, 'Tags')
                    '''
                    for url in urls:
                        msg = '{};{}'.format(url,PST.p_path)
                        print('send to crawler')
                        p.populate_set_out(msg, 'Crawler')
                else:
                    publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@ -3,6 +3,8 @@ bloomfilters = Blooms
 dicofilters = Dicos
 pastes = PASTES
 base64 = BASE64
 crawled = crawled
 crawled_screenshot = CRAWLED_SCREENSHOT
 wordtrending_csv = var/www/static/csv/wordstrendingdata
 wordsfile = files/wordfile
@ -171,6 +173,11 @@ host = localhost
 port = 6382
 db = 8
 [ARDB_Onion]
 host = localhost
 port = 6382
 db = 9
 [Url]
 cc_critical = DE
@ -215,3 +222,8 @@ channel = FetchedOnion
 host = localhost
 port = 6381
 db = 0
 [Crawler]
 crawler_depth_limit = 1
 splash_url = http://127.0.0.1:8050
 http_proxy = http://127.0.0.1:9050
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags
 [Onion]
 subscribe = Redis_Onion
-publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags
+publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler
 #publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler
 [DumpValidOnion]
@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags
 [submit_paste]
 subscribe = Redis
 publish = Redis_Mixer
 [Crawler]
 subscribe = Redis_Crawler
 publish = Redis_Mixer,Redis_Tags
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@ -0,0 +1,165 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 import os
 import sys
 import gzip
 import base64
 import uuid
 import datetime
 import base64
 import redis
 from urllib.parse import urlparse
 from scrapy import Spider
 from scrapy.linkextractors import LinkExtractor
 from scrapy.crawler import CrawlerProcess, Crawler
 from twisted.internet import reactor
 from scrapy_splash import SplashRequest
 sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 class TorSplashCrawler():
    """Crawl one site through Splash and submit every rendered page as a paste.

    Wraps a Scrapy CrawlerProcess and a Crawler bound to the nested
    TorSplashSpider.
    """

    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
        """Create the Scrapy process and the configured crawler.

        splash_url          -- value of the SPLASH_URL setting
        http_proxy          -- value of the HTTP_PROXY setting
        crawler_depth_limit -- value of the DEPTH_LIMIT setting
        """
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'SPLASH_URL': splash_url,
            'HTTP_PROXY': http_proxy,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'DEPTH_LIMIT': crawler_depth_limit
            })

    def crawl(self, url, original_paste, super_father):
        """Crawl `url` and block until the Scrapy process has finished.

        The three arguments are forwarded to TorSplashSpider.__init__.
        """
        self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        """Spider that renders pages with Splash and stores HTML and screenshot."""

        name = 'TorSplashSpider'

        def __init__(self, url, original_paste, super_father, *args, **kwargs):
            """Store the crawl parameters and open the redis connections.

            url            -- first URL to request
            original_paste -- paste recorded as 'father' of the first page
            super_father   -- value recorded as 'super_father' of every page
            """
            # Let the Scrapy base class run its own initialisation first.
            super().__init__(*args, **kwargs)
            self.original_paste = original_paste
            self.super_father = super_father
            # A single URL string (not a list): start_requests() below is
            # overridden and passes it straight to SplashRequest.
            self.start_urls = url
            self.domains = [urlparse(url).netloc]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            config_section = 'Crawler'
            self.p = Process(config_section)
            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)
            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)
            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)
            # Dated directories (YYYY/MM/DD) for crawled pastes and screenshots.
            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date )
            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )

        def start_requests(self):
            """Yield the Splash request for the start URL."""
            yield SplashRequest(
                self.start_urls,
                self.parse,
                endpoint='render.json',
                meta={'parent': self.original_paste},
                args={  'html': 1,
                        'wait': 10,
                        'render_all': 1,
                        'png': 1}
            )

        def parse(self,response):
            """Store one rendered page and yield requests for same-domain links.

            Nothing is stored or yielded when save_crawled_paste() returns
            False for the page.
            """
            print(response.headers)
            print(response.status)
            # Mark this URL in the cache with an expiry of 360000 seconds.
            self.r_cache.setbit(response.url, 0, 1)
            self.r_cache.expire(response.url, 360000)
            UUID = self.domains[0]+str(uuid.uuid4())
            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
            # save new paste on disk
            if self.save_crawled_paste(filename_paste, response.data['html']):
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
                self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
                # exist_ok avoids the check-then-create race of a separate
                # os.path.exists() test.
                dirname = os.path.dirname(filename_screenshot)
                os.makedirs(dirname, exist_ok=True)
                with open(filename_screenshot, 'wb') as f:
                    f.write(base64.standard_b64decode(response.data['png'].encode()))
                # save external links in a set keyed by this page's paste name;
                # redis needs the URL string, not the scrapy Link object.
                lext = LinkExtractor(deny_domains=self.domains, unique=True)
                for link in lext.extract_links(response):
                    self.r_serv_metadata.sadd('paste_crawler:'+filename_paste, link.url)
                #le = LinkExtractor(unique=True)
                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    # Create a cache entry (bit 0 cleared) for the link's URL.
                    self.r_cache.setbit(link.url, 0, 0)
                    self.r_cache.expire(link.url, 360000)
                    # NOTE(review): child requests carry the bare UUID as
                    # 'parent' while the first request carries a paste path;
                    # confirm whether filename_paste was intended here.
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        endpoint='render.json',
                        meta={'parent': UUID},
                        args={  'html': 1,
                                'png': 1,
                                'render_all': 1,
                                'wait': 10}
                    )

        def save_crawled_paste(self, filename, content):
            """Send a crawled page to the Mixer queue and tag it.

            filename -- paste name to submit the page under
            content  -- page HTML as a string

            Returns True when the page was queued, False when `filename`
            already exists on disk or the content could not be encoded.
            """
            print(filename)
            if os.path.isfile(filename):
                print('File: {} already exist in submitted pastes'.format(filename))
                return False
            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except Exception:
                # Not a bare except: Ctrl-C / SystemExit must still propagate.
                print("file error: {}".format(filename))
                return False
            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')
            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@ -0,0 +1,33 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 import os
 import sys
 import configparser
 from TorSplashCrawler import TorSplashCrawler
 if __name__ == '__main__':
    # Command-line entry point: tor_crawler.py <url> <paste> <super_father>.
    # Reads the [Crawler] section of config.cfg and crawls the given URL.
    if len(sys.argv) != 4:
        print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to be defined.
        sys.exit(1)
    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        # Adjacent literals keep the message free of source indentation
        # (a backslash continuation inside the string would embed it).
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')
    cfg = configparser.ConfigParser()
    cfg.read(configfile)
    splash_url = cfg.get("Crawler", "splash_url")
    http_proxy = cfg.get("Crawler", "http_proxy")
    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
    url = sys.argv[1]
    paste = sys.argv[2]
    super_father = sys.argv[3]
    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
    crawler.crawl(url, paste, super_father)
--- a/etc/splash/proxy-profiles/default.ini
+++ b/etc/splash/proxy-profiles/default.ini
@ -0,0 +1,4 @@
 [proxy]
 host=localhost
 port=9050
 type=SOCKS5