add: [crawler] i2p splash crawler

2024-11-22 22:27:17 +00:00 · 2022-06-30 17:33:00 +02:00 · 2022-06-30 17:33:00 +02:00 · f5ac98bcf0
commit f5ac98bcf0
parent 51a48a4c1a
3 changed files with 640 additions and 6 deletions
--- a/bin/Update.py
+++ b/bin/Update.py
@ -431,7 +431,7 @@ if __name__ == "__main__":
                for upper_tag in list_upper_tags_remote:
                    print('    {}{}{}: {}'.format(TERMINAL_BLUE, upper_tag[0], TERMINAL_DEFAULT, upper_tag[1]))
            print()
-            update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
+            #update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)

        else:
            print('Please, commit your changes or stash them before you can update AIL')
--- a/bin/torcrawler/I2pSplashCrawler.py
+++ b/bin/torcrawler/I2pSplashCrawler.py
@ -0,0 +1,611 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import redis
+import json
+import time
+
+import requests
+from bs4 import BeautifulSoup
+
+from hashlib import sha256
+
+from twisted.web._newclient import ResponseNeverReceived
+
+from scrapy import Spider
+from scrapy.linkextractors import LinkExtractor
+from scrapy.crawler import CrawlerProcess, Crawler
+
+from scrapy_splash import SplashRequest
+
+sys.path.append(os.environ['AIL_BIN'])
+from Helper import Process
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import ConfigLoader
+import Screenshot
+import crawlers
+
+script_cookie = """
+function main(splash, args)
+    -- Default values
+    splash.js_enabled = true
+    splash.private_mode_enabled = true
+    splash.images_enabled = true
+    splash.webgl_enabled = true
+    splash.media_source_enabled = true
+
+    -- Force enable things
+    splash.plugins_enabled = true
+    splash.request_body_enabled = true
+    splash.response_body_enabled = true
+
+    splash.indexeddb_enabled = true
+    splash.html5_media_enabled = true
+    splash.http2_enabled = true
+
+    -- User Agent
+    splash:set_user_agent(args.user_agent)
+
+    -- User defined
+    splash.resource_timeout = args.resource_timeout
+    splash.timeout = args.timeout
+
+    -- Allow to pass cookies
+    splash:init_cookies(args.cookies)
+
+    -- Run
+    ok, reason = splash:go{args.url}
+    if not ok and not reason:find("http") then
+        return {
+            error = reason,
+            last_url = splash:url()
+        }
+    end
+    if reason == "http504" then
+        splash:set_result_status_code(504)
+        return ''
+    end
+
+    splash:wait{args.wait}
+    -- Page instrumentation
+    -- splash.scroll_position = {y=1000}
+    -- splash:wait{args.wait}
+    -- Response
+    return {
+        har = splash:har(),
+        html = splash:html(),
+        png = splash:png{render_all=true},
+        cookies = splash:get_cookies(),
+        last_url = splash:url(),
+    }
+end
+"""
+
+class I2pSplashCrawler():
+
+    def __init__(self, splash_url, crawler_options):
+        self.process = CrawlerProcess({'LOG_ENABLED': True})
+        self.crawler = Crawler(self.I2pSplashSpider, {
+            'USER_AGENT': crawler_options['user_agent'], # /!\ overwritten by lua script
+            'SPLASH_URL': f"{splash_url}/render.html",
+            'ROBOTSTXT_OBEY': False,
+            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
+                                       'scrapy_splash.SplashMiddleware': 725,
+                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+                                       'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
+                                       },
+            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
+            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
+            'HTTPERROR_ALLOW_ALL': True,
+            'RETRY_TIMES': 2,
+            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
+            'DEPTH_LIMIT': crawler_options['depth_limit'],
+            'SPLASH_COOKIES_DEBUG': False
+            })
+
+    def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
+        i2p = self.I2pSplashSpider(splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+        i2p.notbob(url, self.process, self.crawler)
+    
+
+    class I2pSplashSpider(Spider):
+        name = 'I2pSplashSpider'
+
+        def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
+            self.splash_url = splash_url
+            self.domain_type = type
+            self.requested_mode = requested_mode
+            self.original_item = original_item
+            self.root_key = None
+            self.start_urls = url
+            self.domains = [domain]
+            self.port = str(port)
+            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
+            self.full_date = date['date_day']
+            self.date_month = date['date_month']
+            self.date_epoch = int(date['epoch'])
+            self.crawler_options = crawler_options
+            self.date = date
+
+            self.user_agent = crawler_options['user_agent']
+            self.png = crawler_options['png']
+            self.har = crawler_options['har']
+            self.cookies = cookies
+
+            config_section = 'Crawler'
+            self.p = Process(config_section)
+            self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
+
+            config_loader = ConfigLoader.ConfigLoader()
+            self.har_dir = os.path.join(config_loader.get_files_directory('har') , date_str )
+            config_loader = None
+
+            self.r_serv_log_submit = redis.StrictRedis(
+                host=self.p.config.get("Redis_Log_submit", "host"),
+                port=self.p.config.getint("Redis_Log_submit", "port"),
+                db=self.p.config.getint("Redis_Log_submit", "db"),
+                decode_responses=True)
+
+            self.root_key = None
+
+
+        def build_request_arg(self, cookies):
+            return {'wait': 10,
+                    'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
+                    'timeout': 90,
+                    'user_agent': self.user_agent,
+                    'cookies': cookies,
+                    'lua_source': script_cookie
+                }
+
+        def start_requests(self):
+            url = self.process_url(self.start_urls)
+            url = f"http://{url}"
+            print(f"request url: {url}")
+            l_cookies = self.build_request_arg(self.cookies)
+            yield SplashRequest(
+                url,
+                self.parse,
+                errback=self.errback_catcher,
+                endpoint='execute',
+                meta={'father': self.original_item, 'current_url': url},
+                args=l_cookies
+            )
+            
+
+        # # TODO: remove duplicate and anchor
+        def parse(self,response):
+            if response.status == 504:
+                # no response
+                pass
+
+            # LUA ERROR # # TODO: logs errors
+            elif 'error' in response.data:
+                if(response.data['error'] == 'network99'):
+                    ## splash restart ##
+                    error_retry = response.meta.get('error_retry', 0)
+                    if error_retry < 3:
+                        error_retry += 1
+                        url = response.data['last_url']
+                        father = response.meta['father']
+
+                        self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
+                        time.sleep(10)
+                        if 'cookies' in response.data:
+                            all_cookies = response.data['cookies'] # # TODO:  use initial cookie ?????
+                        else:
+                            all_cookies = []
+                        l_cookies = self.build_request_arg(all_cookies)
+                        yield SplashRequest(
+                            url,
+                            self.parse,
+                            errback=self.errback_catcher,
+                            endpoint='execute',
+                            dont_filter=True,
+                            meta={'father': father, 'current_url': url, 'error_retry': error_retry},
+                            args=l_cookies
+                        )
+                    else:
+                        if self.requested_mode == 'test':
+                            crawlers.save_test_ail_crawlers_result(False, 'Connection to proxy refused')
+                        print('Connection to proxy refused')
+                elif response.data['error'] == 'network3':
+                    if self.requested_mode == 'test':
+                        crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
+                    print('HostNotFoundError: the remote host name was not found (invalid hostname)')
+                else:
+                    if self.requested_mode == 'test':
+                        crawlers.save_test_ail_crawlers_result(False, response.data['error'])
+                    print(response.data['error'])
+
+            elif response.status != 200:
+                print('other response: {}'.format(response.status))
+                # detect connection to proxy refused
+                error_log = (json.loads(response.body.decode()))
+                print(error_log)
+            else:
+                ## TEST MODE ##
+                if self.requested_mode == 'test':
+                    if 'It works!' in response.data['html']:
+                        crawlers.save_test_ail_crawlers_result(True, 'It works!')
+                    else:
+                        print('TEST ERROR')
+                        crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR')
+                    return
+                ## -- ##
+
+                item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
+                self.save_crawled_item(item_id, response.data['html'])
+                crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
+
+                if self.root_key is None:
+                    self.root_key = item_id
+                    crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
+                    crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
+
+                if 'cookies' in response.data:
+                    all_cookies = response.data['cookies']
+                else:
+                    all_cookies = []
+
+                # SCREENSHOT
+                if 'png' in response.data and self.png:
+                    sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
+                    if sha256_string:
+                        Screenshot.save_item_relationship(sha256_string, item_id)
+                        Screenshot.save_domain_relationship(sha256_string, self.domains[0])
+                # HAR
+                if 'har' in response.data and self.har:
+                    crawlers.save_har(self.har_dir, item_id, response.data['har'])
+
+                le = LinkExtractor(allow_domains=self.domains, unique=True)
+                for link in le.extract_links(response):
+                    l_cookies = self.build_request_arg(all_cookies)
+                    yield SplashRequest(
+                        link.url,
+                        self.parse,
+                        errback=self.errback_catcher,
+                        endpoint='execute',
+                        meta={'father': item_id, 'current_url': link.url},
+                        args=l_cookies
+                    )
+
+        def errback_catcher(self, failure):
+            # catch all errback failures,
+            self.logger.error(repr(failure))
+
+            if failure.check(ResponseNeverReceived):
+                ## DEBUG ##
+                self.logger.error(failure.request)
+                if failure.value.response:
+                    self.logger.error(failure.value.response)
+                ## ----- ##
+
+                # Extract request metadata
+                url = failure.request.meta['current_url']
+                father = failure.request.meta['father']
+                l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies'])
+
+                # Check if Splash restarted
+                if not crawlers.is_splash_reachable(self.splash_url):
+                    self.logger.error('Splash, ResponseNeverReceived for %s, retry in 30s ...', url)
+                    time.sleep(30)
+
+                yield SplashRequest(
+                    url,
+                    self.parse,
+                    errback=self.errback_catcher,
+                    endpoint='execute',
+                    meta={'father': father, 'current_url': url},
+                    args=l_cookies
+                )
+
+            else:
+                self.logger.error(failure.type)
+                self.logger.error(failure.getErrorMessage())
+
+        def save_crawled_item(self, item_id, item_content):
+            gzip64encoded = crawlers.save_crawled_item(item_id, item_content)
+
+            # Send item to queue
+            # send paste to Global
+            relay_message = "{0} {1}".format(item_id, gzip64encoded)
+            self.p.populate_set_out(relay_message, 'Mixer')
+
+            # increase nb of paste by feeder name
+            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
+
+            # tag crawled paste
+            msg = 'infoleak:submission="crawler";{}'.format(item_id)
+            self.p.populate_set_out(msg, 'Tags')
+
+
+        def notbob(self, website, process, crawler, reload=False):
+            print(f"Splash_url: {self.splash_url}")
+            website = self.process_url(website)
+            print("\t" + website)
+            if reload:
+                print("reload Notbob")
+                url = f"http://{website}"
+            else:
+                print("Notbob")
+                url = f"http://notbob.i2p/cgi-bin/jump.cgi?q={website}"
+            try:
+                r = requests.get(f"{self.splash_url}/render.html", params={'url': url, 'wait': 2})
+            except Exception as e:
+                print("notbob error")
+                print(e)
+
+            soup = BeautifulSoup(r.content, "html.parser")
+            html = soup.find_all(id="jump", limit=1)
+            dead = soup.find_all(id="dead", limit=1)
+
+            # Find
+            if html:
+                #Jump
+                meta = soup.find_all("meta", limit=1)
+
+                urlJump = meta[0].get("content").split("url=")[1]
+                urlJump = urlJump[1:-1]
+                try:
+                    r = requests.get(f"{self.splash_url}/render.html", params={'url': urlJump, 'wait': 2})
+                except Exception as e:
+                    print("notbob error")
+                    print(e)
+
+                soup2 = BeautifulSoup(r.content, "html.parser")
+                title = soup2.find_all('title', limit=1)
+                if title:
+                    t = str(title[0])
+                    t = t[7:]
+                    t = t[:-8]
+
+                    if t == "Information: New Host Name":
+                        self.notbob(website, process, crawler, reload=True)
+                    elif t == "Website Unreachable":
+                        print("Not find with Notbob")
+                        self.i2pjump(website, process, crawler)
+                    elif t == "Warning: Destination Key Conflict":
+                        link = soup2.find_all("a", href=True)
+                        for l in link:
+                            if l.get_text() == f'Destination for {website} in address book':
+                                self.regular_request(l["href"], process, crawler)
+                    else:
+                        print(t)
+                        print("notbob")
+                        try:
+                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                            process.start()
+                        except Exception as e:
+                            print("notbob error process")
+                            print(e)
+
+                else:
+                    print("Not find with Notbob")
+                    self.i2pjump(website, process, crawler)
+            # Not find, try an other jump server
+            else:
+                if not dead:
+                    title = soup.find_all('title', limit=1)
+                    if title:
+                        t = str(title[0])
+                        t = t[7:]
+                        t = t[:-8]
+
+                        if t == "Information: New Host Name":
+                            self.notbob(website, process, crawler, reload=True)
+                        elif t == "Website Unreachable":
+                            print("Not find with Notbob")
+                            self.i2pjump(website, process, crawler)
+                        elif t == "Warning: Destination Key Conflict":
+                            link = soup.find_all("a", href=True)
+                            for l in link:
+                                if l.get_text() == f'Destination for {website} in address book':
+                                    self.regular_request(l["href"], process, crawler)
+                        else:
+                            print(t)
+                            print("notbob2")
+                            try:
+                                process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                                process.start()
+                            except Exception as e:
+                                print("notbob error process")
+                                print(e)
+                    else:
+                        print("Not find with Notbob")
+                        self.i2pjump(website, process, crawler)
+                else:
+                    print("Not find with Notbob")
+                    self.i2pjump(website, process, crawler)
+
+
+        def i2pjump(self, website, process, crawler, reload=False):
+            print(website)
+            if reload:
+                print("reload i2pjump")
+                url = f"http://{website}"
+            else:
+                print("i2pjump")
+                url = f"http://i2pjump.i2p/jump/{website}"
+            try:
+                r = requests.get(f"{self.splash_url}/render.html", params={'url': url, 'wait': 2})
+            except Exception as e:
+                print("i2pjump error")
+                print(e)
+
+            soup = BeautifulSoup(r.content, "html.parser")
+
+            title = soup.find_all('title', limit=1)
+            if title:
+                t = str(title[0])
+                t = t[7:]
+                t = t[:-8]
+                if t == "Information: New Host Name":
+                    self.i2pjump(website, process, crawler, reload=True)
+                elif t == "Website Unreachable":
+                    print("Not find with i2pjump")
+                    self.statsi2p(website, process, crawler)
+                elif t == "Warning: Destination Key Conflict":
+                    link = soup.find_all("a", href=True)
+                    for l in link:
+                        if l.get_text() == f'Destination for {website} in address book':
+                            self.regular_request(l["href"], process, crawler)
+                else:
+                    print(t)
+                    print("i2pjump")
+                    try:
+                        process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                        process.start()
+                    except Exception as e:
+                        print("i2pjump error process")
+                        print(e)
+            else:
+                if "was not found in index" in r.text:
+                    print("Not find with i2pjump")
+                    self.statsi2p(website, process, crawler)
+                else:
+                    print("don't know the error i2pjump")
+                    self.statsi2p(website, process, crawler)
+
+
+        def statsi2p(self, website, process, crawler, reload=False):
+            if reload:
+                print("reload statsi2p")
+                url = f"http://{website}"
+            else:
+                print("statsi2p")
+                url = f"http://stats.i2p/cgi-bin/jump.cgi?a={website}"
+            try:
+                r = requests.get(f"{self.splash_url}/render.html", params={'url': url, 'wait': 2})
+            except Exception as e:
+                print("stati2p error")
+                print(e)
+
+            soup = BeautifulSoup(r.content, "html.parser")
+
+            if not reload:
+                meta = soup.find_all("meta", limit=1)
+                # Success
+                if meta:
+                    urlJump = meta[0].get("content").split("url=")[1]
+                    urlJump = urlJump[0:-1]
+
+                    try:
+                        r = requests.get(f"{self.splash_url}/render.html", params={'url': urlJump, 'wait': 2})
+                    except Exception as e:
+                        print("stati2p error")
+                        print(e)
+                    soup2 = BeautifulSoup(r.content, "html.parser")
+                    title = soup2.find_all('title', limit=1)
+
+                    if title:
+                        t = str(title[0])
+                        t = t[7:]
+                        t = t[:-8]
+
+                        if t == "Information: New Host Name":
+                            self.statsi2p(website, process, crawler, reload=True)
+                        elif t == "Website Unreachable":
+                            print("Not find with stati2p")
+                            self.regular_request(website, process, crawler)
+                        elif t == "Warning: Destination Key Conflict":
+                            link = soup2.find_all("a", href=True)
+                            for l in link:
+                                if l.get_text() == f'Destination for {website} in address book':
+                                    self.regular_request(l["href"], process, crawler)
+                        else:
+                            print(t)
+                            print("stati2p")
+                            try:
+                                process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                                process.start()
+                            except Exception as e:
+                                print("stati2p error process")
+                                print(e)
+                    else:
+                        print("Not find with stati2p")
+                        self.regular_request(website, process, crawler)
+                else:
+                    print("Not find with stati2p")
+                    self.regular_request(website, process, crawler)
+            else:
+                title = soup.find_all('title', limit=1)
+                if title:
+                    t = str(title[0])
+                    t = t[7:]
+                    t = t[:-8]
+
+                    if t == "Information: New Host Name":
+                        self.statsi2p(website, process, crawler, reload=True)
+                    elif t == "Website Unreachable":
+                        print("Not find with stati2p")
+                        self.regular_request(website, process, crawler)
+                    elif t == "Warning: Destination Key Conflict":
+                        link = soup.find_all("a", href=True)
+                        for l in link:
+                            if l.get_text() == f'Destination for {website} in address book':
+                                self.regular_request(l["href"], process, crawler)
+                    else:
+                        print(t)
+                        print("stati2p")
+                        try:
+                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                            process.start()
+                        except Exception as e:
+                            print("stati2p error process")
+                            print(e)
+                else:
+                    print("Not find with stati2p")
+                    self.regular_request(website, process, crawler)
+
+
+        def regular_request(self, website, process, crawler, reload=False):
+            print(website)
+            if reload:
+                print("reload regular_request")
+            else:
+                print("regular_request")
+
+            try:
+                r = requests.get(f"{self.splash_url}/render.html", params={'url': website, 'wait': 2})
+            except Exception as e:
+                print("regular request error")
+                print(e)
+
+            soup = BeautifulSoup(r.content, "html.parser")
+
+            title = soup.find_all('title', limit=1)
+            if title:
+                t = str(title[0])
+                t = t[7:]
+                t = t[:-8]
+                if t == "Information: New Host Name":
+                    self.regular_request(website, process, crawler, reload=True)
+                elif t == "Website Unreachable":
+                    print("Not find with regular request")
+                    print("Exit...\n\n")
+                    crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
+                else:
+                    print(t)
+                    print("regular")
+                    try:
+                        process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                        process.start()
+                    except Exception as e:
+                        print("regular request error process")
+                        print(e)
+            else:
+                print("Not find with regular request")
+                print("Exit...\n\n")
+                crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
+
+        def process_url(self, url):
+            if "http://" == url[0:7]:
+                url = url[7:]
+            if url[-1] == "/":
+                url = url[:-1]
+            return url
+
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@ -6,6 +6,9 @@ import sys
 import json
 import redis
 from TorSplashCrawler import TorSplashCrawler
+from I2pSplashCrawler import I2pSplashCrawler
+
+from pyfaup.faup import Faup

 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 import ConfigLoader
@ -18,6 +21,8 @@ if __name__ == '__main__':
        exit(1)


+    faup = Faup()
+
    config_loader = ConfigLoader.ConfigLoader()
    redis_cache = config_loader.get_redis_conn("Redis_Cache")
    config_loader = None
@ -45,6 +50,24 @@ if __name__ == '__main__':

    redis_cache.delete('crawler_request:{}'.format(uuid))

+
+    # get crawler_mode
+    faup.decode(url)
+    unpack_url = faup.get()
+
+    try:
+        tld = unpack_url['tld'].decode()
+    except:
+        tld = unpack_url['tld']
+
+    if tld == "i2p":
+        try:
+            crawler = I2pSplashCrawler(splash_url, crawler_options)
+            crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+        except Exception as e:
+            print(e)
+            print(e, file=sys.stderr)
+    else:
        try:
            crawler = TorSplashCrawler(splash_url, crawler_options)
            crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)