diff --git a/bin/Crawler.py b/bin/Crawler.py
index e6b61a99..e1591d55 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -16,6 +16,24 @@ sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher
 
+def decode_val(value):
+    if value is not None:
+        value = value.decode()
+    return value
+
+def load_type_blacklist(type_service):
+    # load domains blacklist
+    try:
+        with open(os.path.join(os.environ['AIL_BIN'], 'torcrawler/blacklist_{}.txt'.format(type_service)), 'r') as f:
+            # # TODO: # FIXME: remove this
+            r_onion.delete('blacklist_{}'.format(type_service))
+            lines = f.read().splitlines()
+            for line in lines:
+                r_onion.sadd('blacklist_{}'.format(type_service), line)
+    except Exception:
+        pass
+
+
 def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
     # send this msg back in the queue
     if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
@@ -91,12 +109,16 @@ if __name__ == '__main__':
 
     if len(sys.argv) != 3:
-        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        #print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
         exit(1)
 
-    type_hidden_service = sys.argv[1]
+    mode = sys.argv[1]
     splash_port = sys.argv[2]
 
+    if mode == 'automatic':
+        type_hidden_service = 'onion'
+
     publisher.port = 6380
     publisher.channel = "Script"
@@ -107,6 +129,16 @@ if __name__ == '__main__':
     # Setup the I/O queues
     p = Process(config_section)
 
+    accepted_services = ['onion', 'regular']
+
+    dic_regex = {}
+    dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(dic_regex['onion'])
+    dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(dic_regex['i2p'])
+    dic_regex['regular'] = dic_regex['i2p']
+
     url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
     re.compile(url_onion)
     url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -114,17 +146,15 @@ if __name__ == '__main__':
 
     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     elif type_hidden_service == 'i2p':
         regex_hidden_service = url_i2p
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
     elif type_hidden_service == 'regular':
         regex_hidden_service = url_i2p
-        splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     else:
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
+    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url"), splash_port)
     print('splash url: {}'.format(splash_url))
 
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
@@ -150,19 +180,13 @@ if __name__ == '__main__':
                           db=p.config.getint("ARDB_Onion", "db"),
                           decode_responses=True)
 
+    # Crawler status
     r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
     # load domains blacklist
-    try:
-        with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
-            r_onion.delete('blacklist_{}'.format(type_hidden_service))
-            lines = f.read().splitlines()
-            for line in lines:
-                r_onion.sadd('blacklist_{}'.format(type_hidden_service), line)
-    except Exception:
-        pass
+    load_type_blacklist(type_hidden_service)
 
     while True:
@@ -180,17 +204,24 @@ if __name__ == '__main__':
                 url, paste = splitted
                 paste = paste.replace(PASTES_FOLDER+'/', '')
 
-                url_list = re.findall(regex_hidden_service, url)[0]
-                if url_list[1] == '':
+                # extract data from url
+                faup.decode(url)
+                url_unpack = faup.get()
+                url = decode_val(url_unpack['url'])
+                port = decode_val(url_unpack['port'])
+                scheme = decode_val(url_unpack['scheme'])
+                domain = decode_val(url_unpack['domain'])
+                host = decode_val(url_unpack['domain'])
+
+                # Add Scheme to url
+                if scheme is None:
                     url= 'http://{}'.format(url)
-
-                link, s, credential, subdomain, domain, host, port, \
-                    resource_path, query_string, f1, f2, f3, f4 = url_list
-                domain = url_list[4]
-                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
-
-                domain_url = 'http://{}'.format(domain)
+                domain_url = 'http://{}'.format(domain)
+
+                # remove url to crawl from queue
+                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
 
                 print()
                 print()
                 print('\033[92m------------------START CRAWLER------------------\033[0m')
@@ -200,10 +231,7 @@ if __name__ == '__main__':
                 print('domain: {}'.format(domain))
                 print('domain_url: {}'.format(domain_url))
 
-                faup.decode(domain)
-                onion_domain=faup.get()['domain'].decode()
-
-                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
+                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
 
                     date = datetime.datetime.now().strftime("%Y%m%d")
                     date_month = datetime.datetime.now().strftime("%Y%m")
@@ -219,17 +247,24 @@ if __name__ == '__main__':
                     # last check
                     r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
 
+                    # Launch Scrapy-Splash Crawler
                    crawl_onion(url, domain, date, date_month, message)
+                    # Crawl Domain
                     if url != domain_url:
-                        print(url)
+                        #Crawl Domain with port number
+                        if port is not None:
+                            print('{}:{}'.format(domain_url, port))
+                            crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
+                        #Crawl without port number
                         print(domain_url)
                         crawl_onion(domain_url, domain, date, date_month, message)
 
+                    # update last check
+                    r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
+
                     # save down onion
                     if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
                         r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
-                        #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
-                        #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
                     else:
                         #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
                         if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
@@ -241,28 +276,28 @@ if __name__ == '__main__':
                         if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
                             r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
                         # add crawled history by date
-                        r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
+                        r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)
+
+                        if mode == 'automatic':
+                            # check external onions links (full_crawl)
+                            external_domains = set()
+                            for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
+                                external_domain = re.findall(dic_regex[type_hidden_service], link)
+                                external_domain.extend(re.findall(url_i2p, link))
+                                if len(external_domain) > 0:
+                                    external_domain = external_domain[0][4]
+                                else:
+                                    continue
+                                if '.onion' in external_domain and external_domain != domain:
+                                    external_domains.add(external_domain)
+                                elif '.i2p' in external_domain and external_domain != domain:
+                                    external_domains.add(external_domain)
+                            if len(external_domains) >= 10:
+                                r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
+                            r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
+                            print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
 
-                        # check external onions links (full_scrawl)
-                        external_domains = set()
-                        for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
-                            external_domain = re.findall(url_onion, link)
-                            external_domain.extend(re.findall(url_i2p, link))
-                            if len(external_domain) > 0:
-                                external_domain = external_domain[0][4]
-                            else:
-                                continue
-                            if '.onion' in external_domain and external_domain != domain:
-                                external_domains.add(external_domain)
-                            elif '.i2p' in external_domain and external_domain != domain:
-                                external_domains.add(external_domain)
-                        if len(external_domains) >= 10:
-                            r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
-                        r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
-                        print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
-
-                        # update list, last crawled onions
+                        # update list, last crawled sites
                         r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                         r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
@@ -270,7 +305,7 @@ if __name__ == '__main__':
                 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
                 r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
             else:
-                print(' Blacklisted Onion')
+                print(' Blacklisted Site')
 
             print()
             print()
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index 549c0425..dd5a0517 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -221,7 +221,7 @@ function launching_scripts {
 function launching_crawler {
     if [[ ! $iscrawler ]]; then
         CONFIG=$AIL_BIN/packages/config.cfg
-        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
+        lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
 
         IFS='-' read -ra PORTS <<< "$lport"
         if [ ${#PORTS[@]} -eq 1 ]
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index ace656cc..f9483476 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -249,5 +249,5 @@ db = 0
 [Crawler]
 activate_crawler = False
 crawler_depth_limit = 1
-splash_url_onion = http://127.0.0.1
-splash_onion_port = 8050-8052
+splash_url = http://127.0.0.1
+splash_port = 8050-8052
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 99a4f3b3..b5a5c1f9 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -28,10 +28,10 @@ from Helper import Process
 
 class TorSplashCrawler():
 
-    def __init__(self, splash_url, crawler_depth_limit):
+    def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
         self.process = CrawlerProcess({'LOG_ENABLED': False})
         self.crawler = Crawler(self.TorSplashSpider, {
-            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
+            'USER_AGENT': user_agent,
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@@ -42,7 +42,7 @@ class TorSplashCrawler():
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': 50,
+            'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
             'DEPTH_LIMIT': crawler_depth_limit
             })
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 58e8331b..99bda837 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -30,5 +30,10 @@ if __name__ == '__main__':
     paste = sys.argv[5]
     super_father = sys.argv[6]
 
-    crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
+    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
+    user_agent = tor_browser_agent
+
+    closespider_pagecount = 50
+
+    crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
     crawler.crawl(type, url, domain, paste, super_father)
diff --git a/var/www/update_thirdparty.sh b/var/www/update_thirdparty.sh
index 258fa7ca..01a73136 100755
--- a/var/www/update_thirdparty.sh
+++ b/var/www/update_thirdparty.sh
@@ -5,12 +5,14 @@ set -e
 wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js
 
 SBADMIN_VERSION='3.3.7'
+BOOTSTRAP_VERSION='4.2.1'
 FONT_AWESOME_VERSION='4.7.0'
 D3_JS_VERSION='5.5.0'
 
 rm -rf temp
 mkdir temp
 
+wget https://github.com/twbs/bootstrap/releases/download/v${BOOTSTRAP_VERSION}/bootstrap-${BOOTSTRAP_VERSION}-dist.zip -O temp/bootstrap${BOOTSTRAP_VERSION}.zip
 wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
 wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
 wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
@@ -20,7 +22,7 @@ wget https://github.com/d3/d3/releases/download/v${D3_JS_VERSION}/d3.zip -O tem
 wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
 wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip
 
-
+unzip temp/bootstrap${BOOTSTRAP_VERSION}.zip -d temp/
 unzip temp/${SBADMIN_VERSION}.zip -d temp/
 unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
 unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
@@ -29,6 +31,10 @@ unzip temp/d3_${D3_JS_VERSION}.zip -d temp/
 unzip temp/moment_2.22.2.zip -d temp/
 unzip temp/daterangepicker_v0.18.0.zip -d temp/
 
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/js/bootstrap.min.js ./static/js/
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css ./static/css/
+mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css.map ./static/css/
+
 mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
 mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
 mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
@@ -59,6 +65,9 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/
 wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
 wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js
 
+wget https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css -O ./static/css/dataTables.bootstrap4.min.css
+wget https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js -O ./static/js/dataTables.bootstrap4.min.js
+
 #Ressource for graph
 wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
 wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js
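
Note on the faup-based URL handling in bin/Crawler.py above: the following is a minimal, standalone sketch (not part of the patch) of how the refactored crawler unpacks a URL with pyfaup instead of the old hand-written regex, and why decode_val() exists (faup returns most fields as bytes, or None when a field is absent). The .onion address is made up, and exact field types can vary with the installed pyfaup version.

#!/usr/bin/env python3
# Minimal sketch, not part of the patch: mirrors the faup-based URL unpacking
# added to bin/Crawler.py. Assumes the pyfaup bindings used by AIL are installed;
# the example .onion address is hypothetical.
from pyfaup.faup import Faup

def decode_val(value):
    # faup returns bytes for most fields, or None when the field is absent
    if value is not None:
        value = value.decode()
    return value

faup = Faup()
faup.decode('http://examplexxxxxxxxxx.onion:8080/index.html')
url_unpack = faup.get()

url    = decode_val(url_unpack['url'])
scheme = decode_val(url_unpack['scheme'])
domain = decode_val(url_unpack['domain'])
port   = decode_val(url_unpack['port'])  # the patch decodes port the same way; adjust if your pyfaup returns an int here

# Same defaulting logic as the patch: assume http:// when no scheme is given,
# and keep the port so the domain can also be crawled on that port.
if scheme is None:
    url = 'http://{}'.format(url)
domain_url = 'http://{}'.format(domain)
if port is not None:
    print('{}:{}'.format(domain_url, port))
print(url, domain, domain_url)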