chg: [Crawler] add bootstrap4 src + refactor crawler

2025-01-31 06:26:14 +00:00 · 2019-02-05 17:16:44 +01:00 · 2019-02-05 17:16:44 +01:00 · 516238025f
commit 516238025f
parent 423c7b1455
6 changed files with 106 additions and 57 deletions
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@ -16,6 +16,24 @@ sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher

+def decode_val(value):
+    """Return value.decode(), or None when value is None.
+
+    Applied further down to the fields of the dict returned by faup.get()
+    (url, port, scheme, domain), any of which is treated as possibly None.
+    """
+    if value is not None:
+        value = value.decode()
+    return value
+
+def load_type_blacklist(type_service):
+    """Reload the 'blacklist_<type_service>' set in r_onion from its text file.
+
+    The file is $AIL_BIN/torcrawler/blacklist_<type_service>.txt, one entry
+    per line. The existing set is deleted and refilled only once the file
+    has been opened successfully. Loading is best effort: any failure is
+    reported on stdout and the function returns without raising.
+    """
+    set_name = 'blacklist_{}'.format(type_service)
+    try:
+        # Path components must stay relative: os.path.join() throws away
+        # everything before a component that starts with '/', which made the
+        # previous code look for '/torcrawler/...' at the filesystem root.
+        blacklist_file = os.path.join(os.environ['AIL_BIN'], 'torcrawler',
+                                      'blacklist_{}.txt'.format(type_service))
+        with open(blacklist_file, 'r') as f:
+            # # TODO: # FIXME:  remove this
+            r_onion.delete(set_name)
+            for line in f.read().splitlines():
+                r_onion.sadd(set_name, line)
+    except Exception as e:
+        # a missing or unreadable blacklist must not prevent the crawler from starting
+        print('{} not loaded: {}'.format(set_name, e))
+
+
 def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
    # send this msg back in the queue
    if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
@ -91,12 +109,16 @@ def crawl_onion(url, domain, date, date_month, message):
 if __name__ == '__main__':

    if len(sys.argv) != 3:
-        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        #print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
        exit(1)

-    type_hidden_service = sys.argv[1]
+    mode = sys.argv[1]
    splash_port = sys.argv[2]

+    if mode == 'automatic':
+        type_hidden_service = 'onion'
+
    publisher.port = 6380
    publisher.channel = "Script"

@ -107,6 +129,16 @@ if __name__ == '__main__':
    # Setup the I/O queues
    p = Process(config_section)

+    accepted_services = ['onion', 'regular']
+
+    dic_regex = {}
+    dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(dic_regex['onion'])
+    dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(dic_regex['i2p'])
+    dic_regex['regular'] = dic_regex['i2p']
+
+
    url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
    re.compile(url_onion)
    url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@ -114,17 +146,15 @@ if __name__ == '__main__':

    if type_hidden_service == 'onion':
        regex_hidden_service = url_onion
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"),  splash_port)
    elif type_hidden_service == 'i2p':
        regex_hidden_service = url_i2p
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"),  splash_port)
    elif type_hidden_service == 'regular':
        regex_hidden_service = url_i2p
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"),  splash_port)
    else:
        print('incorrect crawler type: {}'.format(type_hidden_service))
        exit(0)

+    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"),  splash_port)
    print('splash url: {}'.format(splash_url))

    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
@ -150,19 +180,13 @@ if __name__ == '__main__':
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

+    # Crawler status
    r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))

    # load domains blacklist
-    try:
-        with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
-            r_onion.delete('blacklist_{}'.format(type_hidden_service))
-            lines = f.read().splitlines()
-            for line in lines:
-                r_onion.sadd('blacklist_{}'.format(type_hidden_service), line)
-    except Exception:
-        pass
+    load_type_blacklist(type_hidden_service)

    while True:

@ -180,17 +204,24 @@ if __name__ == '__main__':
                url, paste = splitted
                paste = paste.replace(PASTES_FOLDER+'/', '')

-                url_list = re.findall(regex_hidden_service, url)[0]
-                if url_list[1] == '':
+                # extract data from url
+                faup.decode(url)
+                url_unpack = faup.get()
+                url = decode_val(url_unpack['url'])
+                port = decode_val(url_unpack['port'])
+                scheme = decode_val(url_unpack['scheme'])
+                domain = decode_val(url_unpack['domain'])
+                host = decode_val(url_unpack['domain'])
+
+                # Add Scheme to url
+                if scheme is None:
                    url= 'http://{}'.format(url)
-
-                link, s, credential, subdomain, domain, host, port, \
-                    resource_path, query_string, f1, f2, f3, f4 = url_list
-                domain = url_list[4]
-                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
-
                domain_url = 'http://{}'.format(domain)

+
+                # remove url to crawl from queue
+                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
+
                print()
                print()
                print('\033[92m------------------START CRAWLER------------------\033[0m')
@ -200,10 +231,7 @@ if __name__ == '__main__':
                print('domain:      {}'.format(domain))
                print('domain_url:  {}'.format(domain_url))

-                faup.decode(domain)
-                onion_domain=faup.get()['domain'].decode()
-
-                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
+                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):

                    date = datetime.datetime.now().strftime("%Y%m%d")
                    date_month = datetime.datetime.now().strftime("%Y%m")
@ -219,17 +247,24 @@ if __name__ == '__main__':
                        # last check
                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

+                        # Launch Scrapy-Splash Crawler
                        crawl_onion(url, domain, date, date_month, message)
+                        # Crawl Domain
                        if url != domain_url:
-                            print(url)
+                            #Crawl Domain with port number
+                            if port is not None:
+                                print('{}:{}'.format(domain_url, port))
+                                crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
+                            #Crawl without port number
                            print(domain_url)
                            crawl_onion(domain_url, domain, date, date_month, message)

+                        # update last check
+                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
+
                        # save down onion
                        if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
                            r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
-                            #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
-                            #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
                        else:
                            #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
                            if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
@ -241,28 +276,28 @@ if __name__ == '__main__':
                        if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
                            r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
                            # add crawled history by date
-                        r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
+                        r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)

+                        if mode == 'automatic':
+                            # check external onions links (full_crawl)
+                            external_domains = set()
+                            for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
+                                external_domain = re.findall(dic_regex[type_hidden_service], link)
+                                external_domain.extend(re.findall(url_i2p, link))
+                                if len(external_domain) > 0:
+                                    external_domain = external_domain[0][4]
+                                else:
+                                    continue
+                                if '.onion' in external_domain and external_domain != domain:
+                                    external_domains.add(external_domain)
+                                elif '.i2p' in external_domain and external_domain != domain:
+                                    external_domains.add(external_domain)
+                            if len(external_domains) >= 10:
+                                r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
+                            r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
+                            print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

-                        # check external onions links (full_scrawl)
-                        external_domains = set()
-                        for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
-                            external_domain = re.findall(url_onion, link)
-                            external_domain.extend(re.findall(url_i2p, link))
-                            if len(external_domain) > 0:
-                                external_domain = external_domain[0][4]
-                            else:
-                                continue
-                            if '.onion' in external_domain and external_domain != domain:
-                                external_domains.add(external_domain)
-                            elif '.i2p' in external_domain and external_domain != domain:
-                                external_domains.add(external_domain)
-                        if len(external_domains) >= 10:
-                            r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
-                        r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
-                        print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
-
-                        # update list, last crawled onions
+                        # update list, last crawled sites
                        r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                        r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)

@ -270,7 +305,7 @@ if __name__ == '__main__':
                        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
                        r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
                else:
-                    print('                 Blacklisted Onion')
+                    print('                 Blacklisted Site')
                    print()
                    print()

--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@ -221,7 +221,7 @@ function launching_scripts {
 function launching_crawler {
    if [[ ! $iscrawler ]]; then
        CONFIG=$AIL_BIN/packages/config.cfg
-        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
+        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")

        IFS='-' read -ra PORTS <<< "$lport"
        if [ ${#PORTS[@]} -eq 1 ]
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@ -249,5 +249,5 @@ db = 0
 [Crawler]
 activate_crawler = False
 crawler_depth_limit = 1
-splash_url_onion = http://127.0.0.1
-splash_onion_port = 8050-8052
+splash_url = http://127.0.0.1
+splash_port = 8050-8052
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@ -28,10 +28,10 @@ from Helper import Process

 class TorSplashCrawler():

-    def __init__(self, splash_url, crawler_depth_limit):
+    def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
-            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
+            'USER_AGENT': user_agent,
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@ -42,7 +42,7 @@ class TorSplashCrawler():
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': 50,
+            'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
            'DEPTH_LIMIT': crawler_depth_limit
            })

--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@ -30,5 +30,10 @@ if __name__ == '__main__':
    paste = sys.argv[5]
    super_father = sys.argv[6]

-    crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
+    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
+    user_agent = tor_browser_agent
+
+    closespider_pagecount = 50
+
+    crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
    crawler.crawl(type, url, domain, paste, super_father)
--- a/var/www/update_thirdparty.sh
+++ b/var/www/update_thirdparty.sh
@ -5,12 +5,14 @@ set -e
 wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js

 SBADMIN_VERSION='3.3.7'
+BOOTSTRAP_VERSION='4.2.1'
 FONT_AWESOME_VERSION='4.7.0'
 D3_JS_VERSION='5.5.0'

 rm -rf temp
 mkdir temp

+wget https://github.com/twbs/bootstrap/releases/download/v${BOOTSTRAP_VERSION}/bootstrap-${BOOTSTRAP_VERSION}-dist.zip -O temp/bootstrap${BOOTSTRAP_VERSION}.zip
 wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
 wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
 wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
@ -20,7 +22,7 @@ wget https://github.com/d3/d3/releases/download/v${D3_JS_VERSION}/d3.zip -O  tem
 wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
 wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip

-
+unzip temp/bootstrap${BOOTSTRAP_VERSION}.zip -d temp/
 unzip temp/${SBADMIN_VERSION}.zip -d temp/
 unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
 unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
@ -29,6 +31,10 @@ unzip temp/d3_${D3_JS_VERSION}.zip -d temp/
 unzip temp/moment_2.22.2.zip -d temp/
 unzip temp/daterangepicker_v0.18.0.zip -d temp/

+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/js/bootstrap.min.js ./static/js/
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css ./static/css/
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css.map ./static/css/
+
 mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
 mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
 mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
@ -59,6 +65,9 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/
 wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
 wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js

+wget https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css -O ./static/css/dataTables.bootstrap4.min.css
+wget https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js -O ./static/js/dataTables.bootstrap4.min.js
+
 #Ressource for graph
 wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
 wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js