From 516238025f8a54b3897b93b6b7e7b7562b159b78 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 5 Feb 2019 17:16:44 +0100
Subject: [PATCH] chg: [Crawler] add bootstrap4 src + refactor crawler

---
 bin/Crawler.py                     | 133 ++++++++++++++++++-----------
 bin/LAUNCH.sh                      |   2 +-
 bin/packages/config.cfg.sample     |   4 +-
 bin/torcrawler/TorSplashCrawler.py |   6 +-
 bin/torcrawler/tor_crawler.py      |   7 +-
 var/www/update_thirdparty.sh       |  11 ++-
 6 files changed, 106 insertions(+), 57 deletions(-)

diff --git a/bin/Crawler.py b/bin/Crawler.py
index e6b61a99..e1591d55 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -16,6 +16,24 @@ sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher

+def decode_val(value):
+    if value is not None:
+        value = value.decode()
+    return value
+
+def load_type_blacklist(type_service):
+    # load domains blacklist
+    try:
+        with open(os.path.join(os.environ['AIL_BIN'],'/torcrawler/blacklist_{}.txt'.format(type_service)), 'r') as f:
+            # # TODO: # FIXME: remove this
+            r_onion.delete('blacklist_{}'.format(type_service))
+            lines = f.read().splitlines()
+            for line in lines:
+                r_onion.sadd('blacklist_{}'.format(type_service), line)
+    except Exception:
+        pass
+
+
 def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
     # send this msg back in the queue
     if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
@@ -91,12 +109,16 @@ def crawl_onion(url, domain, date, date_month, message):
 if __name__ == '__main__':

     if len(sys.argv) != 3:
-        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        #print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
         exit(1)

-    type_hidden_service = sys.argv[1]
+    mode = sys.argv[1]
     splash_port = sys.argv[2]

+    if mode == 'automatic':
+        type_hidden_service = 'onion'
+
     publisher.port = 6380
     publisher.channel = "Script"
@@ -107,6 +129,16 @@ if __name__ == '__main__':
     # Setup the I/O queues
     p = Process(config_section)

+    accepted_services = ['onion', 'regular']
+
+    dic_regex = {}
+    dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(dic_regex['onion'])
+    dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(dic_regex['i2p'])
+    dic_regex['regular'] = dic_regex['i2p']
+
     url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
     re.compile(url_onion)
     url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -114,17 +146,15 @@ if __name__ == '__main__':

     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     elif type_hidden_service == 'i2p':
         regex_hidden_service = url_i2p
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
     elif type_hidden_service == 'regular':
         regex_hidden_service = url_i2p
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     else:
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)

+    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     print('splash url: {}'.format(splash_url))

     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
@@ -150,19 +180,13 @@ if __name__ == '__main__':
                         db=p.config.getint("ARDB_Onion", "db"),
                         decode_responses=True)

+    # Crawler status
     r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))

     # load domains blacklist
-    try:
-        with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
-            r_onion.delete('blacklist_{}'.format(type_hidden_service))
-            lines = f.read().splitlines()
-            for line in lines:
-                r_onion.sadd('blacklist_{}'.format(type_hidden_service), line)
-    except Exception:
-        pass
+    load_type_blacklist(type_hidden_service)

     while True:
@@ -180,17 +204,24 @@ if __name__ == '__main__':
                 url, paste = splitted
                 paste = paste.replace(PASTES_FOLDER+'/', '')

-                url_list = re.findall(regex_hidden_service, url)[0]
-                if url_list[1] == '':
+                # extract data from url
+                faup.decode(url)
+                url_unpack = faup.get()
+                url = decode_val(url_unpack['url'])
+                port = decode_val(url_unpack['port'])
+                scheme = decode_val(url_unpack['scheme'])
+                domain = decode_val(url_unpack['domain'])
+                host = decode_val(url_unpack['domain'])
+
+                # Add Scheme to url
+                if scheme is None:
                     url= 'http://{}'.format(url)
-
-                link, s, credential, subdomain, domain, host, port, \
-                    resource_path, query_string, f1, f2, f3, f4 = url_list
-                domain = url_list[4]
-                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
-
                 domain_url = 'http://{}'.format(domain)

+                # remove url to crawl from queue
+                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
+
                 print()
                 print()
                 print('\033[92m------------------START CRAWLER------------------\033[0m')
@@ -200,10 +231,7 @@
                 print('domain: {}'.format(domain))
                 print('domain_url: {}'.format(domain_url))

-                faup.decode(domain)
-                onion_domain=faup.get()['domain'].decode()
-
-                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
+                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):

                     date = datetime.datetime.now().strftime("%Y%m%d")
                     date_month = datetime.datetime.now().strftime("%Y%m")
@@ -219,17 +247,24 @@ if __name__ == '__main__':
                     # last check
                     r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

+                    # Launch Scrapy-Splash Crawler
                     crawl_onion(url, domain, date, date_month, message)

+                    # Crawl Domain
                     if url != domain_url:
-                        print(url)
+                        #Crawl Domain with port number
+                        if port is not None:
+                            print('{}:{}'.format(domain_url, port))
+                            crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
+                        #Crawl without port number
                         print(domain_url)
                         crawl_onion(domain_url, domain, date, date_month, message)

+                    # update last check
+                    r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
+
                     # save down onion
                     if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
                         r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
-                        #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
-                        #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
                     else:
                         #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
                         if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
@@ -241,28 +276,28 @@ if __name__ == '__main__':
                     if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
                         r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
                     # add crawled history by date
-                    r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
+                    r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)
+
+                    if mode == 'automatic':
+                        # check external onions links (full_crawl)
+                        external_domains = set()
+                        for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
+                            external_domain = re.findall(dic_regex[type_hidden_service], link)
+                            external_domain.extend(re.findall(url_i2p, link))
+                            if len(external_domain) > 0:
+                                external_domain = external_domain[0][4]
+                            else:
+                                continue
+                            if '.onion' in external_domain and external_domain != domain:
+                                external_domains.add(external_domain)
+                            elif '.i2p' in external_domain and external_domain != domain:
+                                external_domains.add(external_domain)
+                        if len(external_domains) >= 10:
+                            r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
+                        r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
+                        print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

-                    # check external onions links (full_scrawl)
-                    external_domains = set()
-                    for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
-                        external_domain = re.findall(url_onion, link)
-                        external_domain.extend(re.findall(url_i2p, link))
-                        if len(external_domain) > 0:
-                            external_domain = external_domain[0][4]
-                        else:
-                            continue
-                        if '.onion' in external_domain and external_domain != domain:
-                            external_domains.add(external_domain)
-                        elif '.i2p' in external_domain and external_domain != domain:
-                            external_domains.add(external_domain)
-                    if len(external_domains) >= 10:
-                        r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
-                    r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
-                    print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
-
-                    # update list, last crawled onions
+                    # update list, last crawled sites
                     r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                     r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
@@ -270,7 +305,7 @@ if __name__ == '__main__':
                     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
                     r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
                 else:
-                    print(' Blacklisted Onion')
+                    print(' Blacklisted Site')
                     print()
                     print()
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 549c0425..dd5a0517 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -221,7 +221,7 @@ function launching_scripts {
 function launching_crawler {
     if [[ ! $iscrawler ]]; then
         CONFIG=$AIL_BIN/packages/config.cfg
-        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
+        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")

         IFS='-' read -ra PORTS <<< "$lport"
         if [ ${#PORTS[@]} -eq 1 ]
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index ace656cc..f9483476 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -249,5 +249,5 @@ db = 0
 [Crawler]
 activate_crawler = False
 crawler_depth_limit = 1
-splash_url_onion = http://127.0.0.1
-splash_onion_port = 8050-8052
+splash_url = http://127.0.0.1
+splash_port = 8050-8052
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 99a4f3b3..b5a5c1f9 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -28,10 +28,10 @@ from Helper import Process

 class TorSplashCrawler():

-    def __init__(self, splash_url, crawler_depth_limit):
+    def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
         self.process = CrawlerProcess({'LOG_ENABLED': False})
         self.crawler = Crawler(self.TorSplashSpider, {
-            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
+            'USER_AGENT': user_agent,
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@@ -42,7 +42,7 @@ class TorSplashCrawler():
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': 50,
+            'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
             'DEPTH_LIMIT': crawler_depth_limit
             })
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 58e8331b..99bda837 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -30,5 +30,10 @@ if __name__ == '__main__':
     paste = sys.argv[5]
     super_father = sys.argv[6]

-    crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
+    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
+    user_agent = tor_browser_agent
+
+    closespider_pagecount = 50
+
+    crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
     crawler.crawl(type, url, domain, paste, super_father)
diff --git a/var/www/update_thirdparty.sh b/var/www/update_thirdparty.sh
index 258fa7ca..01a73136 100755
--- a/var/www/update_thirdparty.sh
+++ b/var/www/update_thirdparty.sh
@@ -5,12 +5,14 @@ set -e
 wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js

 SBADMIN_VERSION='3.3.7'
+BOOTSTRAP_VERSION='4.2.1'
 FONT_AWESOME_VERSION='4.7.0'
 D3_JS_VERSION='5.5.0'

 rm -rf temp
 mkdir temp

+wget https://github.com/twbs/bootstrap/releases/download/v${BOOTSTRAP_VERSION}/bootstrap-${BOOTSTRAP_VERSION}-dist.zip -O temp/bootstrap${BOOTSTRAP_VERSION}.zip
 wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
 wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
 wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
@@ -20,7 +22,7 @@ wget https://github.com/d3/d3/releases/download/v${D3_JS_VERSION}/d3.zip -O tem
 wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
 wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip

-
+unzip temp/bootstrap${BOOTSTRAP_VERSION}.zip -d temp/
 unzip temp/${SBADMIN_VERSION}.zip -d temp/
 unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
 unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
@@ -29,6 +31,10 @@ unzip temp/d3_${D3_JS_VERSION}.zip -d temp/
 unzip temp/moment_2.22.2.zip -d temp/
 unzip temp/daterangepicker_v0.18.0.zip -d temp/

+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/js/bootstrap.min.js ./static/js/
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css ./static/css/
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css.map ./static/css/
+
 mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
 mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
 mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
@@ -59,6 +65,9 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/
 wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
 wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js

+wget https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css -O ./static/css/dataTables.bootstrap4.min.css
+wget https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js -O ./static/js/dataTables.bootstrap4.min.js
+
 #Ressource for graph
 wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
 wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js
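
For reference, the URL handling introduced in bin/Crawler.py replaces the old regex tuple-unpacking with pyfaup. Below is a minimal standalone sketch of that pattern, assuming the pyfaup bindings used by AIL are installed; the defensive decode_val() variant, the example URL and the module-level Faup() instance are illustrative only, not part of the patch.

    #!/usr/bin/env python3
    # Minimal sketch of the faup-based URL unpacking added to bin/Crawler.py
    from pyfaup.faup import Faup

    faup = Faup()

    def decode_val(value):
        # faup.get() may return bytes; decode them, like the decode_val() helper in the patch
        if isinstance(value, bytes):
            value = value.decode()
        return value

    def unpack_url(url):
        faup.decode(url)
        url_unpack = faup.get()
        scheme = decode_val(url_unpack['scheme'])
        domain = decode_val(url_unpack['domain'])
        port = decode_val(url_unpack['port'])
        # add a scheme when the queued link has none, as the crawler now does
        if scheme is None:
            url = 'http://{}'.format(url)
        return url, domain, port

    if __name__ == '__main__':
        # illustrative input only
        print(unpack_url('foo.onion/index.html'))

With the new entry point, the script is started as ./Crawler.py <mode> <splash_port>, where mode is manual or automatic (automatic currently implies the onion service type) and splash_port is one of the splash_port values declared in the [Crawler] section of config.cfg.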
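
TorSplashCrawler is likewise constructed with the user agent and page-count limit passed in rather than hard-coded. A caller wires it up roughly as bin/torcrawler/tor_crawler.py now does; the sketch below uses the config.cfg.sample defaults and assumes bin/torcrawler is importable, both illustrative assumptions.

    # Sketch: passing the newly parameterised crawler settings
    from TorSplashCrawler import TorSplashCrawler  # assumption: bin/torcrawler is on the Python path

    splash_url = 'http://127.0.0.1:8050'  # splash_url plus one splash_port from config.cfg
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'

    # depth limit 1 and CLOSESPIDER_PAGECOUNT 50 match the previously hard-coded values
    crawler = TorSplashCrawler(splash_url, 1, user_agent, 50)
    # crawler.crawl(type, url, domain, paste, super_father) is then invoked as before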