diff --git a/.gitignore b/.gitignore index c4bd48c9..2d276111 100644 --- a/.gitignore +++ b/.gitignore @@ -11,9 +11,10 @@ ardb faup tlsh Blooms -LEVEL_DB_DATA PASTES +CRAWLED_SCREENSHOT BASE64 +HASHS DATA_ARDB indexdir/ logs/ @@ -33,6 +34,7 @@ var/www/submitted bin/packages/config.cfg bin/packages/config.cfg.backup configs/keys +files # installed files nltk_data/ diff --git a/HOWTO.md b/HOWTO.md index 1a66402b..6e8d09b2 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -84,9 +84,9 @@ You can navigate into the interface by using arrow keys. In order to perform an To change list, you can press the key. -Also, you can quickly stop or start modules by clicking on the or symbol respectively. These are located in the _Action_ column. +Also, you can quickly stop or start modules by clicking on the ```` or ```` symbol respectively. These are located in the _Action_ column. -Finally, you can quit this program by pressing either or +Finally, you can quit this program by pressing either ```` or ````. Terms frequency usage @@ -96,3 +96,52 @@ In AIL, you can track terms, set of terms and even regexes without creating a de - You can track a term by simply putting it in the box. - You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\` - You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email address having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`. + + +Crawler +--------------------- +In AIL, you can crawl hidden services. + +There are two types of installation. You can install a *local* or a *remote* Splash server. +``(Splash host) = the server running the splash service`` +``(AIL host) = the server running AIL`` + +### Installation/Configuration + +1. *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost splah server is used or use the ``-y`` option) + +2. *(Splash host)* To install and setup your tor proxy: + - Install the tor proxy: ``sudo apt-get install tor -y`` + (Not required if ``Splah host == AIL host`` - The tor proxy is installed by default in AIL) + - Add the following line ``SOCKSPolicy accept 172.17.0.0/16`` in ``/etc/tor/torrc`` + (for a linux docker, the localhost IP is *172.17.0.1*; Should be adapted for other platform) + - Restart the tor proxy: ``sudo service tor restart`` + +3. *(AIL host)* Edit the ``/bin/packages/config.cfg`` file: + - In the crawler section, set ``activate_crawler`` to ``True`` + - Change the IP address of Splash servers if needed (remote only) + - Set ``splash_onion_port`` according to your Splash servers port numbers that will be used. + those ports numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for 8050,8051,8052 ports). + + +### Starting the scripts + +- *(Splash host)* Launch all Splash servers with: +```sudo ./bin/torcrawler/launch_splash_crawler.sh -f -p -n ``` +With ```` and ```` matching those specified at ``splash_onion_port`` in the configuration file of point 3 (``/bin/packages/config.cfg``) + +All Splash dockers are launched inside the ``Docker_Splash`` screen. 
You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. + +- (AIL host) launch all AIL crawler scripts using: +```./bin/LAUNCH.sh -c``` + + +### TL;DR - Local setup +#### Installation +- ```crawler_hidden_services_install.sh -y``` +- Add the following line in ``SOCKSPolicy accept 172.17.0.0/16`` in ``/etc/tor/torrc`` +- ```sudo service tor restart``` +- set activate_crawler to True in ``/bin/packages/config.cfg`` +#### Start +- ```sudo ./bin/torcrawler/launch_splash_crawler.sh -f $AIL_HOME/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 1";``` +- ```./bin/LAUNCH.sh -c``` diff --git a/bin/Bitcoin.py b/bin/Bitcoin.py index 5ec2199f..1b7694b7 100755 --- a/bin/Bitcoin.py +++ b/bin/Bitcoin.py @@ -32,6 +32,7 @@ def decode_base58(bc, length): for char in bc: n = n * 58 + digits58.index(char) return n.to_bytes(length, 'big') + def check_bc(bc): try: bcbytes = decode_base58(bc, 25) diff --git a/bin/Crawler.py b/bin/Crawler.py new file mode 100755 index 00000000..99917c49 --- /dev/null +++ b/bin/Crawler.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import re +import redis +import datetime +import time +import subprocess +import requests + +sys.path.append(os.environ['AIL_BIN']) +from Helper import Process +from pubsublogger import publisher + +def on_error_send_message_back_in_queue(type_hidden_service, domain, message): + # send this msg back in the queue + if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): + r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) + r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) + +def crawl_onion(url, domain, date, date_month, message): + + #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): + super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') + if super_father is None: + super_father=paste + + try: + r = requests.get(splash_url , timeout=30.0) + except Exception: + # TODO: relaunch docker or send error message + + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + publisher.error('{} SPASH DOWN'.format(splash_url)) + print('--------------------------------------') + print(' \033[91m DOCKER SPLASH DOWN\033[0m') + print(' {} DOWN'.format(splash_url)) + exit(1) + + if r.status_code == 200: + process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father], + stdout=subprocess.PIPE) + while process.poll() is None: + time.sleep(1) + + if process.returncode == 0: + output = process.stdout.read().decode() + print(output) + # error: splash:Connection to proxy refused + if 'Connection to proxy refused' in output: + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + publisher.error('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url)) + print('------------------------------------------------------------------------') + print(' \033[91m SPLASH: Connection to proxy refused') + print('') + print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url)) + print('------------------------------------------------------------------------') + exit(-2) + else: + print(process.stdout.read()) + exit(-1) + else: + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + print('--------------------------------------') + print(' \033[91m DOCKER SPLASH 
DOWN\033[0m') + print(' {} DOWN'.format(splash_url)) + exit(1) + + +if __name__ == '__main__': + + if len(sys.argv) != 3: + print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port') + exit(1) + + type_hidden_service = sys.argv[1] + splash_port = sys.argv[2] + + publisher.port = 6380 + publisher.channel = "Script" + + publisher.info("Script Crawler started") + + config_section = 'Crawler' + + # Setup the I/O queues + p = Process(config_section) + + url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_onion) + url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_i2p) + + if type_hidden_service == 'onion': + regex_hidden_service = url_onion + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port) + elif type_hidden_service == 'i2p': + regex_hidden_service = url_i2p + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port) + elif type_hidden_service == 'regular': + regex_hidden_service = url_i2p + splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port) + else: + print('incorrect crawler type: {}'.format(type_hidden_service)) + exit(0) + + print('splash url: {}'.format(splash_url)) + + crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") + + PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) + + r_serv_metadata = redis.StrictRedis( + host=p.config.get("ARDB_Metadata", "host"), + port=p.config.getint("ARDB_Metadata", "port"), + db=p.config.getint("ARDB_Metadata", "db"), + decode_responses=True) + + r_cache = redis.StrictRedis( + host=p.config.get("Redis_Cache", "host"), + port=p.config.getint("Redis_Cache", "port"), + db=p.config.getint("Redis_Cache", "db"), + decode_responses=True) + + r_onion = redis.StrictRedis( + host=p.config.get("ARDB_Onion", "host"), + port=p.config.getint("ARDB_Onion", "port"), + db=p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + + # load domains blacklist + try: + with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f: + r_onion.delete('blacklist_{}'.format(type_hidden_service)) + lines = f.read().splitlines() + for line in lines: + r_onion.sadd('blacklist_{}'.format(type_hidden_service), line) + except Exception: + pass + + while True: + + # Recovering the streamed message informations. 
+ message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service)) + + if message is not None: + + splitted = message.split(';') + if len(splitted) == 2: + url, paste = splitted + paste = paste.replace(PASTES_FOLDER+'/', '') + + url_list = re.findall(regex_hidden_service, url)[0] + if url_list[1] == '': + url= 'http://{}'.format(url) + + link, s, credential, subdomain, domain, host, port, \ + resource_path, query_string, f1, f2, f3, f4 = url_list + domain = url_list[4] + r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain) + + domain_url = 'http://{}'.format(domain) + + print('\033[92m------------------START CRAWLER------------------\033[0m') + print('crawler type: {}'.format(type_hidden_service)) + print('\033[92m-------------------------------------------------\033[0m') + print('url: {}'.format(url)) + print('domain: {}'.format(domain)) + print('domain_url: {}'.format(domain_url)) + + if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain): + + date = datetime.datetime.now().strftime("%Y%m%d") + date_month = datetime.datetime.now().strftime("%Y%m") + + if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain): + + crawl_onion(url, domain, date, date_month, message) + if url != domain_url: + print(url) + print(domain_url) + crawl_onion(domain_url, domain, date, date_month, message) + + # save down onion + if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain): + r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain) + #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url) + #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1) + if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)): + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date) + r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date) + else: + #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1) + if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste): + msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste) + p.populate_set_out(msg, 'Tags') + + # last check + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) + + # last_father + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) + + # add onion screenshot history + # add crawled days + if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date: + r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date) + # add crawled history by date + r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here + + + # check external onions links (full_scrawl) + external_domains = set() + for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)): + external_domain = re.findall(url_onion, link) + external_domain.extend(re.findall(url_i2p, link)) + if len(external_domain) > 0: + external_domain = external_domain[0][4] + else: + continue + if '.onion' in external_domain and external_domain != domain: + external_domains.add(external_domain) + elif '.i2p' in external_domain and external_domain != domain: + external_domains.add(external_domain) + if len(external_domains) >= 
10: + r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain) + r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain)) + print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain))) + + # update list, last crawled onions + r_onion.lpush('last_{}'.format(type_hidden_service), domain) + r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15) + + else: + continue + else: + time.sleep(1) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 5cffe413..684af83b 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -27,6 +27,7 @@ islogged=`screen -ls | egrep '[0-9]+.Logging_AIL' | cut -d. -f1` isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1` isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1` isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1` +iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. -f1` isfeeded=`screen -ls | egrep '[0-9]+.Feeder_Pystemon' | cut -d. -f1` function helptext { @@ -199,6 +200,35 @@ function launching_scripts { } +function launching_crawler { + if [[ ! $iscrawler ]]; then + CONFIG=$AIL_BIN/packages/config.cfg + lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") + + IFS='-' read -ra PORTS <<< "$lport" + if [ ${#PORTS[@]} -eq 1 ] + then + first_port=${PORTS[0]} + last_port=${PORTS[0]} + else + first_port=${PORTS[0]} + last_port=${PORTS[1]} + fi + + screen -dmS "Crawler_AIL" + sleep 0.1 + + for ((i=first_port;i<=last_port;i++)); do + screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' + sleep 0.1 + done + + echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT + else + echo -e $RED"\t* A screen is already launched"$DEFAULT + fi +} + function shutting_down_redis { redis_dir=${AIL_HOME}/redis/src/ bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN' @@ -420,6 +450,9 @@ function launch_all { Flask) launch_flask; ;; + Crawler) + launching_crawler; + ;; Killall) killall; ;; @@ -445,9 +478,9 @@ while [ "$1" != "" ]; do ;; -k | --killAll ) killall; ;; - -c | --configUpdate ) checking_configuration "manual"; + -t | --thirdpartyUpdate ) update_thirdparty; ;; - -t | --thirdpartyUpdate ) update_thirdparty; + -c | --crawler ) launching_crawler; ;; -f | --launchFeeder ) launch_feeder; ;; diff --git a/bin/Onion.py b/bin/Onion.py index 277f1c71..1f233fcf 100755 --- a/bin/Onion.py +++ b/bin/Onion.py @@ -21,7 +21,6 @@ Requirements *Need the ZMQ_Sub_Onion_Q Module running to be able to work properly. 
""" -import pprint import time from packages import Paste from pubsublogger import publisher @@ -30,6 +29,7 @@ import os import base64 import subprocess import redis +import re from Helper import Process @@ -97,6 +97,12 @@ if __name__ == "__main__": db=p.config.getint("Redis_Cache", "db"), decode_responses=True) + r_onion = redis.StrictRedis( + host=p.config.get("ARDB_Onion", "host"), + port=p.config.getint("ARDB_Onion", "port"), + db=p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + # FUNCTIONS # publisher.info("Script subscribed to channel onion_categ") @@ -107,9 +113,21 @@ if __name__ == "__main__": message = p.get_from_set() prec_filename = None + # send to crawler: + activate_crawler = p.config.get("Crawler", "activate_crawler") + if activate_crawler == 'True': + activate_crawler = True + print('Crawler enabled') + else: + activate_crawler = False + print('Crawler disabled') + # Thanks to Faup project for this regex # https://github.com/stricaud/faup - url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(url_regex) + while True: if message is not None: @@ -123,12 +141,32 @@ if __name__ == "__main__": PST = Paste.Paste(filename) for x in PST.get_regex(url_regex): + print(x) # Extracting url with regex url, s, credential, subdomain, domain, host, port, \ resource_path, query_string, f1, f2, f3, f4 = x - domains_list.append(domain) - urls.append(url) + if '.onion' in url: + print(url) + domains_list.append(domain) + urls.append(url) + + ''' + for x in PST.get_regex(i2p_regex): + # Extracting url with regex + url, s, credential, subdomain, domain, host, port, \ + resource_path, query_string, f1, f2, f3, f4 = x + + if '.i2p' in url: + print('add i2p') + print(domain) + if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain): + r_onion.sadd('i2p_domain', domain) + r_onion.sadd('i2p_link', url) + r_onion.sadd('i2p_domain_crawler_queue', domain) + msg = '{};{}'.format(url,PST.p_path) + r_onion.sadd('i2p_crawler_queue', msg) + ''' # Saving the list of extracted onion domains. 
PST.__setattr__(channel, domains_list) @@ -149,12 +187,33 @@ if __name__ == "__main__": to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name) - for url in fetch(p, r_cache, urls, domains_list, path): - publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) - p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') - msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) - p.populate_set_out(msg, 'Tags') + if activate_crawler: + date_month = datetime.datetime.now().strftime("%Y%m") + date = datetime.datetime.now().strftime("%Y%m%d") + for url in urls: + + domain = re.findall(url_regex, url) + if len(domain) > 0: + domain = domain[0][4] + else: + continue + + if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): + if not r_onion.sismember('onion_domain_crawler_queue', domain): + print('send to onion crawler') + r_onion.sadd('onion_domain_crawler_queue', domain) + msg = '{};{}'.format(url,PST.p_path) + r_onion.sadd('onion_crawler_queue', msg) + #p.populate_set_out(msg, 'Crawler') + + else: + for url in fetch(p, r_cache, urls, domains_list, path): + publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) + p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') + + msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) + p.populate_set_out(msg, 'Tags') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py new file mode 100755 index 00000000..d515c955 --- /dev/null +++ b/bin/packages/HiddenServices.py @@ -0,0 +1,183 @@ +#!/usr/bin/python3 + +""" +The ``hiddenServices Class`` +=================== + +Use it to create an object from an existing paste or other random file. + +Conditions to fulfill to be able to use this class correctly: +------------------------------------------------------------- + +1/ The paste need to be saved on disk somewhere (have an accessible path) +2/ The paste need to be gziped. +3/ The filepath need to look like something like this: + /directory/source/year/month/day/paste.gz + +""" + +import os +import gzip +import redis +import random + +import configparser +import sys +sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) +from Date import Date + +class HiddenServices(object): + """ + This class representing a hiddenServices as an object. + When created, the object will have by default some "main attributes" + + :Example: + + PST = HiddenServices("xxxxxxxx.onion", "onion") + + """ + + def __init__(self, domain, type): + + configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') + if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? 
\ + Or activate the virtualenv.') + + cfg = configparser.ConfigParser() + cfg.read(configfile) + self.r_serv_onion = redis.StrictRedis( + host=cfg.get("ARDB_Onion", "host"), + port=cfg.getint("ARDB_Onion", "port"), + db=cfg.getint("ARDB_Onion", "db"), + decode_responses=True) + + self.r_serv_metadata = redis.StrictRedis( + host=cfg.get("ARDB_Metadata", "host"), + port=cfg.getint("ARDB_Metadata", "port"), + db=cfg.getint("ARDB_Metadata", "db"), + decode_responses=True) + + self.domain = domain + self.type = type + self.tags = {} + + if type == 'onion': + self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled")) + self.paste_crawled_directory_name = cfg.get("Directories", "crawled") + self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + elif type == 'i2p': + self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) + else: + ## TODO: # FIXME: add error + pass + + def get_origin_paste_name(self): + origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') + if origin_paste is None: + return '' + return origin_paste.replace(self.paste_directory+'/', '') + + def get_domain_tags(self): + return self.tags + + def update_domain_tags(self, children): + p_tags = self.r_serv_metadata.smembers('tag:'+children) + for tag in p_tags: + self.tags[tag] = self.tags.get(tag, 0) + 1 + + #todo use the right paste + def get_last_crawled_pastes(self): + paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent') + #paste_parent = paste_parent.replace(self.paste_directory, '')[1:] + return self.get_all_pastes_domain(paste_parent) + + def get_all_pastes_domain(self, father): + if father is None: + return [] + l_crawled_pastes = [] + paste_parent = father.replace(self.paste_directory+'/', '') + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) + ## TODO: # FIXME: remove me + paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father)) + paste_childrens = paste_childrens | paste_children + for children in paste_childrens: + if self.domain in children: + l_crawled_pastes.append(children) + self.update_domain_tags(children) + l_crawled_pastes.extend(self.get_all_pastes_domain(children)) + return l_crawled_pastes + + def get_domain_son(self, l_paste): + if l_paste is None: + return None + + set_domain = set() + for paste in l_paste: + paste_full = paste.replace(self.paste_directory+'/', '') + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_full)) + ## TODO: # FIXME: remove me + paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(paste)) + paste_childrens = paste_childrens | paste_children + for children in paste_childrens: + if not self.domain in children: + print(children) + set_domain.add((children.split('.onion')[0]+'.onion').split('/')[-1]) + + return set_domain + + def get_all_domain_son(self, father): + if father is None: + return [] + l_crawled_pastes = [] + paste_parent = father.replace(self.paste_directory+'/', '') + paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent)) + ## TODO: # FIXME: remove me + paste_children = 
self.r_serv_metadata.smembers('paste_children:{}'.format(father)) + paste_childrens = paste_childrens | paste_children + for children in paste_childrens: + if not self.domain in children: + l_crawled_pastes.append(children) + #self.update_domain_tags(children) + l_crawled_pastes.extend(self.get_all_domain_son(children)) + + return l_crawled_pastes + + def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1): + l_screenshot_paste = [] + for paste in l_crawled_pastes: + ## FIXME: # TODO: remove me + paste= paste.replace(self.paste_directory+'/', '') + + paste = paste.replace(self.paste_crawled_directory_name, '') + if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ): + l_screenshot_paste.append(paste[1:]) + + if len(l_screenshot_paste) > num_screenshot: + l_random_screenshot = [] + for index in random.sample( range(0, len(l_screenshot_paste)), num_screenshot ): + l_random_screenshot.append(l_screenshot_paste[index]) + return l_random_screenshot + else: + return l_screenshot_paste + + def get_crawled_pastes_by_date(self, date): + + pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8]) + paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') + + l_crawled_pastes = [] + return l_crawled_pastes + + def get_last_crawled_pastes_fileSearch(self): + + last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check') + return self.get_crawled_pastes_by_date_fileSearch(last_check) + + def get_crawled_pastes_by_date_fileSearch(self, date): + pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8]) + l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f] + return l_crawled_pastes diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 6942cb31..524a7665 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -94,6 +94,7 @@ class Paste(object): var = self.p_path.split('/') self.p_date = Date(var[-4], var[-3], var[-2]) + self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name) self.p_source = var[-5] self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0]) @@ -291,6 +292,9 @@ class Paste(object): else: return '[]' + def get_p_rel_path(self): + return self.p_rel_path + def save_all_attributes_redis(self, key=None): """ Saving all the attributes in a "Redis-like" Database (Redis, LevelDB) diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 7c2cff55..c30fa071 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -4,6 +4,8 @@ dicofilters = Dicos pastes = PASTES hash = HASHS base64 = BASE64 +crawled = crawled +crawled_screenshot = CRAWLED_SCREENSHOT wordtrending_csv = var/www/static/csv/wordstrendingdata wordsfile = files/wordfile @@ -184,6 +186,11 @@ host = localhost port = 6382 db = 8 +[ARDB_Onion] +host = localhost +port = 6382 +db = 9 + [Url] cc_critical = DE @@ -228,3 +235,9 @@ channel = FetchedOnion host = localhost port = 6381 db = 0 + +[Crawler] +activate_crawler = True +crawler_depth_limit = 1 +splash_url_onion = http://127.0.0.1 +splash_onion_port = 8050-8052 diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index a3c5e579..deb5a069 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags [Onion] subscribe = Redis_Onion -publish = 
Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags +publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler #publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler [DumpValidOnion] @@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags [submit_paste] subscribe = Redis publish = Redis_Mixer + +[Crawler] +subscribe = Redis_Crawler +publish = Redis_Mixer,Redis_Tags + diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py new file mode 100644 index 00000000..47486dd9 --- /dev/null +++ b/bin/torcrawler/TorSplashCrawler.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import gzip +import base64 +import uuid +import datetime +import base64 +import redis +import json + +from scrapy.spidermiddlewares.httperror import HttpError +from twisted.internet.error import DNSLookupError +from twisted.internet.error import TimeoutError + +from scrapy import Spider +from scrapy.linkextractors import LinkExtractor +from scrapy.crawler import CrawlerProcess, Crawler + +from scrapy_splash import SplashRequest + +sys.path.append(os.environ['AIL_BIN']) +from Helper import Process + +class TorSplashCrawler(): + + def __init__(self, splash_url, crawler_depth_limit): + self.process = CrawlerProcess({'LOG_ENABLED': False}) + self.crawler = Crawler(self.TorSplashSpider, { + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0', + 'SPLASH_URL': splash_url, + 'ROBOTSTXT_OBEY': False, + 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, + 'scrapy_splash.SplashMiddleware': 725, + 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, + }, + 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, + 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', + 'HTTPERROR_ALLOW_ALL': True, + 'DEPTH_LIMIT': crawler_depth_limit + }) + + def crawl(self, type, url, domain, original_paste, super_father): + self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father) + self.process.start() + + class TorSplashSpider(Spider): + name = 'TorSplashSpider' + + def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs): + self.type = type + self.original_paste = original_paste + self.super_father = super_father + self.start_urls = url + self.domains = [domain] + date = datetime.datetime.now().strftime("%Y/%m/%d") + self.full_date = datetime.datetime.now().strftime("%Y%m%d") + self.date_month = datetime.datetime.now().strftime("%Y%m") + + config_section = 'Crawler' + self.p = Process(config_section) + + self.r_cache = redis.StrictRedis( + host=self.p.config.get("Redis_Cache", "host"), + port=self.p.config.getint("Redis_Cache", "port"), + db=self.p.config.getint("Redis_Cache", "db"), + decode_responses=True) + + self.r_serv_log_submit = redis.StrictRedis( + host=self.p.config.get("Redis_Log_submit", "host"), + port=self.p.config.getint("Redis_Log_submit", "port"), + db=self.p.config.getint("Redis_Log_submit", "db"), + decode_responses=True) + + self.r_serv_metadata = redis.StrictRedis( + host=self.p.config.get("ARDB_Metadata", "host"), + port=self.p.config.getint("ARDB_Metadata", "port"), + db=self.p.config.getint("ARDB_Metadata", "db"), + decode_responses=True) + + self.r_serv_onion = redis.StrictRedis( + host=self.p.config.get("ARDB_Onion", "host"), + port=self.p.config.getint("ARDB_Onion", "port"), 
+ db=self.p.config.getint("ARDB_Onion", "db"), + decode_responses=True) + + self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date ) + + self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), + self.p.config.get("Directories", "crawled"), date ) + + self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date ) + + def start_requests(self): + yield SplashRequest( + self.start_urls, + self.parse, + #errback=self.errback_catcher, + endpoint='render.json', + meta={'father': self.original_paste}, + args={ 'html': 1, + 'wait': 10, + 'render_all': 1, + 'har': 1, + 'png': 1} + ) + + def parse(self,response): + #print(response.headers) + #print(response.status) + if response.status == 504: + # down ? + print('504 detected') + elif response.status != 200: + print('other response: {}'.format(response.status)) + #print(error_log) + #detect connection to proxy refused + error_log = (json.loads(response.body.decode())) + if(error_log['info']['text'] == 'Connection to proxy refused'): + print('Connection to proxy refused') + else: + + UUID = self.domains[0]+str(uuid.uuid4()) + filename_paste = os.path.join(self.crawled_paste_filemame, UUID) + relative_filename_paste = os.path.join(self.crawler_path, UUID) + filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') + + # save new paste on disk + if self.save_crawled_paste(filename_paste, response.data['html']): + + # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? + #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) + + self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) + self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) + self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) + + # create onion metadata + if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): + self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) + self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) + + #create paste metadata + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father']) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0]) + self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) + + self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) + + dirname = os.path.dirname(filename_screenshot) + if not os.path.exists(dirname): + os.makedirs(dirname) + + size_screenshot = (len(response.data['png'])*3) /4 + + if size_screenshot < 5000000: #bytes + with open(filename_screenshot, 'wb') as f: + f.write(base64.standard_b64decode(response.data['png'].encode())) + + with open(filename_screenshot+'har.txt', 'wb') as f: + f.write(json.dumps(response.data['har']).encode()) + + # save external links in set + #lext = LinkExtractor(deny_domains=self.domains, unique=True) + #for link in lext.extract_links(response): + # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) + # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, 
filename_paste), link.url) + + le = LinkExtractor(allow_domains=self.domains, unique=True) + for link in le.extract_links(response): + yield SplashRequest( + link.url, + self.parse, + #errback=self.errback_catcher, + endpoint='render.json', + meta={'father': relative_filename_paste}, + args={ 'html': 1, + 'png': 1, + 'render_all': 1, + 'har': 1, + 'wait': 10} + ) + + ''' + def errback_catcher(self, failure): + # catch all errback failures, + self.logger.error(repr(failure)) + print('failure') + #print(failure) + print(failure.type) + #print(failure.request.meta['item']) + + #if isinstance(failure.value, HttpError): + if failure.check(HttpError): + # you can get the response + response = failure.value.response + print('HttpError') + self.logger.error('HttpError on %s', response.url) + + #elif isinstance(failure.value, DNSLookupError): + elif failure.check(DNSLookupError): + # this is the original request + request = failure.request + print(DNSLookupError) + print('DNSLookupError') + self.logger.error('DNSLookupError on %s', request.url) + + #elif isinstance(failure.value, TimeoutError): + elif failure.check(TimeoutError): + request = failure.request + print('TimeoutError') + print(TimeoutError) + self.logger.error('TimeoutError on %s', request.url) + ''' + + def save_crawled_paste(self, filename, content): + + if os.path.isfile(filename): + print('File: {} already exist in submitted pastes'.format(filename)) + return False + + try: + gzipencoded = gzip.compress(content.encode()) + gzip64encoded = base64.standard_b64encode(gzipencoded).decode() + except: + print("file error: {}".format(filename)) + return False + + # send paste to Global + relay_message = "{0} {1}".format(filename, gzip64encoded) + self.p.populate_set_out(relay_message, 'Mixer') + + # increase nb of paste by feeder name + self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) + + # tag crawled paste + msg = 'infoleak:submission="crawler";{}'.format(filename) + self.p.populate_set_out(msg, 'Tags') + return True diff --git a/bin/torcrawler/blacklist_onion.txt b/bin/torcrawler/blacklist_onion.txt new file mode 100644 index 00000000..a96b0bb8 --- /dev/null +++ b/bin/torcrawler/blacklist_onion.txt @@ -0,0 +1 @@ +www.facebookcorewwwi.onion diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh new file mode 100755 index 00000000..412022c1 --- /dev/null +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; + echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"; + echo " -p: number of the first splash server port number. 
This number is incremented for the others splash server"; + echo " -n: number of splash servers to start"; + echo ""; + echo "example:"; + echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"; + exit 1; + } + +while getopts ":p:f:n:" o; do + case "${o}" in + p) + p=${OPTARG} + ;; + f) + f=${OPTARG} + ;; + n) + n=${OPTARG} + ;; + *) + usage + ;; + esac +done +shift $((OPTIND-1)) + +if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then + usage; +fi + +screen -dmS "Docker_Splash" +sleep 0.1 + +for ((i=0;i<=$((${n} - 1));i++)); do + port_number=$((${p} + $i)) + screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' + sleep 0.1 + echo " Splash server launched on port $port_number" +done diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py new file mode 100755 index 00000000..58e8331b --- /dev/null +++ b/bin/torcrawler/tor_crawler.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import configparser +from TorSplashCrawler import TorSplashCrawler + +if __name__ == '__main__': + + if len(sys.argv) != 7: + print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father') + exit(1) + + configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') + if not os.path.exists(configfile): + raise Exception('Unable to find the configuration file. \ + Did you set environment variables? \ + Or activate the virtualenv.') + + cfg = configparser.ConfigParser() + cfg.read(configfile) + + splash_url = sys.argv[1] + type = sys.argv[2] + crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit") + + url = sys.argv[3] + domain = sys.argv[4] + paste = sys.argv[5] + super_father = sys.argv[6] + + crawler = TorSplashCrawler(splash_url, crawler_depth_limit) + crawler.crawl(type, url, domain, paste, super_father) diff --git a/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini new file mode 100644 index 00000000..63217c2a --- /dev/null +++ b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini @@ -0,0 +1,4 @@ +[proxy] +host=172.17.0.1 +port=9050 +type=SOCKS5 diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh new file mode 100755 index 00000000..3fbccb74 --- /dev/null +++ b/crawler_hidden_services_install.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +install_docker() { + # install docker + sudo apt install docker.io; + + # pull splah docker + sudo docker pull scrapinghub/splash; +} + +install_python_requirement() { + . ./AILENV/bin/activate; + pip3 install -U -r crawler_requirements.txt; +} + +install_all() { + read -p "Do you want to install docker? 
(use local splash server) [y/n] " -n 1 -r + echo # (optional) move to a new line + if [[ $REPLY =~ ^[Yy]$ ]] + then + install_docker; + fi + install_python_requirement; +} + +usage() { + echo "Usage: crawler_hidden_services_install.sh [-y | -n]" 1>&2; + echo " -y: install docker" + echo " -n: don't install docker" + echo "" + echo "example:" + echo "crawler_hidden_services_install.sh -y" + exit 1; +} + +if [[ $1 == "" ]]; then + install_all; + exit; +else + key="$1" + case $key in + "") + install_all; + ;; + -y|--yes) + install_docker; + install_python_requirement; + ;; + -n|--no) + install_python_requirement; + ;; + *) # unknown option + usage; + ;; + esac +fi diff --git a/crawler_requirements.txt b/crawler_requirements.txt new file mode 100644 index 00000000..b0c096ac --- /dev/null +++ b/crawler_requirements.txt @@ -0,0 +1,2 @@ +scrapy +scrapy-splash diff --git a/etc/splash/proxy-profiles/default.ini b/etc/splash/proxy-profiles/default.ini new file mode 100644 index 00000000..91208135 --- /dev/null +++ b/etc/splash/proxy-profiles/default.ini @@ -0,0 +1,4 @@ +[proxy] +host=localhost +port=9050 +type=SOCKS5 diff --git a/files/Onion b/files/Onion index 5c9980e2..69fcf878 100644 --- a/files/Onion +++ b/files/Onion @@ -1 +1,2 @@ onion +i2p diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 51b492d7..95433757 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -48,6 +48,9 @@ except IOError: f = open('templates/ignored_modules.txt', 'w') f.close() +activate_crawler = cfg.get("Crawler", "activate_crawler") +if activate_crawler != 'True': + toIgnoreModule.add('hiddenServices') # Dynamically import routes and functions from modules # Also, prepare header.html diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 364a15df..ea6fd6ed 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -96,6 +96,12 @@ r_serv_statistics = redis.StrictRedis( db=cfg.getint("ARDB_Statistics", "db"), decode_responses=True) +r_serv_onion = redis.StrictRedis( + host=cfg.get("ARDB_Onion", "host"), + port=cfg.getint("ARDB_Onion", "port"), + db=cfg.getint("ARDB_Onion", "db"), + decode_responses=True) + sys.path.append('../../configs/keys') # MISP # @@ -150,6 +156,7 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted') PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) +SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot")) max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs")) diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py new file mode 100644 index 00000000..47ea56f1 --- /dev/null +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Flask functions and routes for the trending modules page +''' +import redis +import datetime +import sys +import os +from flask import Flask, render_template, jsonify, request, Blueprint + +from Date import Date +from HiddenServices import HiddenServices + +# ============ VARIABLES ============ +import Flask_config + +app = Flask_config.app +cfg = Flask_config.cfg +baseUrl = Flask_config.baseUrl +r_serv_onion = Flask_config.r_serv_onion +r_serv_metadata = Flask_config.r_serv_metadata +bootstrap_label = Flask_config.bootstrap_label +PASTES_FOLDER = 
Flask_config.PASTES_FOLDER + +hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') + +# ============ FUNCTIONS ============ +def one(): + return 1 + +def get_date_range(num_day): + curr_date = datetime.date.today() + date = Date( '{}{}{}'.format(str(curr_date.year), str(curr_date.month).zfill(2), str(curr_date.day).zfill(2)) ) + date_list = [] + + for i in range(0, num_day): + date_list.append(date.substract_day(i)) + + return list(reversed(date_list)) + +def unpack_paste_tags(p_tags): + l_tags = [] + for tag in p_tags: + complete_tag = tag + tag = tag.split('=') + if len(tag) > 1: + if tag[1] != '': + tag = tag[1][1:-1] + # no value + else: + tag = tag[0][1:-1] + # use for custom tags + else: + tag = tag[0] + l_tags.append( (tag, complete_tag) ) + return l_tags + +def get_onion_status(domain, date): + if r_serv_onion.sismember('onion_up:'+date , domain): + return True + else: + return False +# ============= ROUTES ============== + +@hiddenServices.route("/hiddenServices/", methods=['GET']) +def hiddenServices_page(): + last_onions = r_serv_onion.lrange('last_onion', 0 ,-1) + list_onion = [] + + now = datetime.datetime.now() + date = '{}{}{}'.format(now.strftime("%Y"), now.strftime("%m"), now.strftime("%d")) + statDomains = {} + statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date)) + statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date)) + statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down'] + statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue') + + for onion in last_onions: + metadata_onion = {} + metadata_onion['domain'] = onion + metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check') + metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen') + if get_onion_status(onion, metadata_onion['last_check']): + metadata_onion['status_text'] = 'UP' + metadata_onion['status_color'] = 'Green' + metadata_onion['status_icon'] = 'fa-check-circle' + else: + metadata_onion['status_text'] = 'DOWN' + metadata_onion['status_color'] = 'Red' + metadata_onion['status_icon'] = 'fa-times-circle' + list_onion.append(metadata_onion) + + return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains) + +@hiddenServices.route("/hiddenServices/onion_domain", methods=['GET']) +def onion_domain(): + onion_domain = request.args.get('onion_domain') + if onion_domain is None or not r_serv_onion.exists('onion_metadata:{}'.format(onion_domain)): + return '404' + # # TODO: FIXME return 404 + + last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check') + last_check = '{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8]) + first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen') + first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8]) + origin_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent') + + h = HiddenServices(onion_domain, 'onion') + l_pastes = h.get_last_crawled_pastes() + if l_pastes: + status = True + else: + status = False + screenshot = h.get_domain_random_screenshot(l_pastes) + if screenshot: + screenshot = screenshot[0] + else: + screenshot = 'None' + + domain_tags = h.get_domain_tags() + + origin_paste_name = h.get_origin_paste_name() + origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste))) + 
paste_tags = [] + path_name = [] + for path in l_pastes: + path_name.append(path.replace(PASTES_FOLDER+'/', '')) + p_tags = r_serv_metadata.smembers('tag:'+path) + paste_tags.append(unpack_paste_tags(p_tags)) + + return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen, + l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label, + path_name=path_name, origin_paste_tags=origin_paste_tags, status=status, + origin_paste=origin_paste, origin_paste_name=origin_paste_name, + domain_tags=domain_tags, screenshot=screenshot) + +@hiddenServices.route("/hiddenServices/onion_son", methods=['GET']) +def onion_son(): + onion_domain = request.args.get('onion_domain') + + h = HiddenServices(onion_domain, 'onion') + l_pastes = h.get_last_crawled_pastes() + l_son = h.get_domain_son(l_pastes) + print(l_son) + return 'l_son' + +# ============= JSON ============== +@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) +def domain_crawled_7days_json(): + type = 'onion' + ## TODO: # FIXME: 404 error + + date_range = get_date_range(7) + json_domain_stats = [] + #try: + for date in date_range: + nb_domain_up = r_serv_onion.scard('{}_up:{}'.format(type, date)) + nb_domain_down = r_serv_onion.scard('{}_up:{}'.format(type, date)) + date = date[0:4] + '-' + date[4:6] + '-' + date[6:8] + json_domain_stats.append({ 'date': date, 'value': int( nb_domain_up ), 'nb_domain_down': int( nb_domain_down )}) + #except: + #return jsonify() + + return jsonify(json_domain_stats) + +# ========= REGISTRATION ========= +app.register_blueprint(hiddenServices, url_prefix=baseUrl) diff --git a/var/www/modules/hiddenServices/templates/header_hiddenServices.html b/var/www/modules/hiddenServices/templates/header_hiddenServices.html new file mode 100644 index 00000000..5c77963c --- /dev/null +++ b/var/www/modules/hiddenServices/templates/header_hiddenServices.html @@ -0,0 +1 @@ +
[navigation menu entry "hidden Services"; the markup of this one-line template was stripped in this extract]
diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html
new file mode 100644
index 00000000..59aeb2ae
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/hiddenServices.html
@@ -0,0 +1,234 @@
+[new 234-line template "Hidden Service - AIL"; the HTML/JS markup was stripped in this extract. Recoverable structure: the page includes navbar.html, an ONION panel listing the last crawled domains in a table (Domain, First Seen, Last Check, Status with a green UP / red DOWN label), and a "Domains Crawled Today" panel showing the Domains UP, Domains DOWN, Crawled Domains and Domains in Queue counters from statDomains.]
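Since the dashboard markup itself is not recoverable above, here is a minimal sketch (not part of the patch) of the ARDB_Onion data that page renders. The key names are the ones used by Crawler.py and Flask_hiddenServices.py in this diff; the connection values are the ARDB_Onion defaults from config.cfg.sample and may differ in a real deployment.

```python
import datetime
import redis

# ARDB_Onion defaults from config.cfg.sample (assumption: unchanged locally)
r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

date = datetime.datetime.now().strftime("%Y%m%d")

# counters behind the "Domains Crawled Today" panel
stats = {
    'domains_up': r_onion.scard('onion_up:{}'.format(date)),
    'domains_down': r_onion.scard('onion_down:{}'.format(date)),
    'domains_queue': r_onion.scard('onion_domain_crawler_queue'),
}
print(stats)

# the "last crawled onions" list maintained by Crawler.py (trimmed to 16 entries)
for domain in r_onion.lrange('last_onion', 0, -1):
    last_check = r_onion.hget('onion_metadata:{}'.format(domain), 'last_check')
    is_up = last_check and r_onion.sismember('onion_up:{}'.format(last_check), domain)
    print(domain, last_check, 'UP' if is_up else 'DOWN')
```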
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
new file mode 100644
index 00000000..dd6b2056
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -0,0 +1,213 @@
+[new 213-line template "Show Domain - AIL"; the HTML/JS markup was stripped in this extract. Recoverable structure: the page includes navbar.html and shows the domain's UP/DOWN status, its first seen / last check dates, the origin paste and its tags, the aggregated domain tags with their counts, a "Crawled Pastes" table listing each crawled paste with its own tags, and the random screenshot selected by HiddenServices.get_domain_random_screenshot().]
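The Show Domain page above is driven by the parent/child links that TorSplashCrawler.py writes for every crawled page (the paste_metadata:<paste> hash with father, super_father, domain and real_link, and the paste_children:<father> sets). A minimal, hedged usage sketch of the HiddenServices helper that walks those links, with a placeholder onion address:

```python
import os
import sys
# assumption: AIL environment variables are set and the virtualenv is active,
# so that bin/packages/HiddenServices.py is importable
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
from HiddenServices import HiddenServices

h = HiddenServices('examplexxxxxxxxxxxx.onion', 'onion')   # hypothetical domain
l_pastes = h.get_last_crawled_pastes()      # recursive walk over paste_children:<father>
tags = h.get_domain_tags()                  # {tag: count}, aggregated during the walk
screenshot = h.get_domain_random_screenshot(l_pastes)   # at most one screenshot path
```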
    + + + + + + + + + diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index cd7319c3..4912e7b0 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -8,7 +8,7 @@ import redis import json import os import flask -from flask import Flask, render_template, jsonify, request, Blueprint, make_response, redirect, url_for, Response +from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory, redirect, url_for import difflib import ssdeep @@ -25,6 +25,7 @@ r_serv_pasteName = Flask_config.r_serv_pasteName r_serv_metadata = Flask_config.r_serv_metadata r_serv_tags = Flask_config.r_serv_tags r_serv_statistics = Flask_config.r_serv_statistics +r_serv_onion = Flask_config.r_serv_onion max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal DiffMaxLineLength = Flask_config.DiffMaxLineLength @@ -33,6 +34,7 @@ misp_event_url = Flask_config.misp_event_url hive_case_url = Flask_config.hive_case_url vt_enabled = Flask_config.vt_enabled PASTES_FOLDER = Flask_config.PASTES_FOLDER +SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates') @@ -41,6 +43,8 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa def showpaste(content_range, requested_path): if PASTES_FOLDER not in requested_path: requested_path = os.path.join(PASTES_FOLDER, requested_path) + # remove old full path + #requested_path = requested_path.replace(PASTES_FOLDER, '') # escape directory transversal if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER: return 'path transversal detected' @@ -175,6 +179,16 @@ def showpaste(content_range, requested_path): l_64.append( (file_icon, estimated_type, hash, saved_path, nb_in_file, b64_vt, b64_vt_link, b64_vt_report) ) + crawler_metadata = {} + if 'infoleak:submission="crawler"' in l_tags: + crawler_metadata['get_metadata'] = True + crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') + crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') + crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') + crawler_metadata['screenshot'] = paste.get_p_rel_path() + else: + crawler_metadata['get_metadata'] = False + if Flask_config.pymisp is False: misp = False else: @@ -202,6 +216,7 @@ def showpaste(content_range, requested_path): hive_url = hive_case_url.replace('id_here', hive_case) return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list, + crawler_metadata=crawler_metadata, l_64=l_64, vt_enabled=vt_enabled, misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url) # ============ ROUTES ============ @@ -250,6 +265,10 @@ def showDiff(): the_html = htmlD.make_file(lines1, lines2) return the_html +@showsavedpastes.route('/screenshot/') +def screenshot(filename): + return 
send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True) + @showsavedpastes.route('/send_file_to_vt/', methods=['POST']) def send_file_to_vt(): b64_path = request.form['b64_path'] diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html index cb99637c..b6afbb7f 100644 --- a/var/www/modules/showpaste/templates/show_saved_paste.html +++ b/var/www/modules/showpaste/templates/show_saved_paste.html @@ -351,7 +351,6 @@
[template hunks garbled in this extract. Recoverable changes to show_saved_paste.html: the first hunk (-351,7 +351,6) drops one line (apparently a blank one) just above the "No Duplicate" message; the second hunk (-420,6 +419,57) inserts a "Crawled Paste" panel, rendered only when crawler_metadata['get_metadata'] is set, with Domain ({{ crawler_metadata['domain'] }}), Father ({{ crawler_metadata['paste_father'] }}) and Source link ({{ crawler_metadata['real_link'] }}) rows plus the crawled screenshot, ahead of the existing "Content: [Raw content] {{ content }}" section; the final hunk (-535,5 +585,51) appends further lines at the end of the file whose content is not recoverable here.]
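Taken together, the handoff between Onion.py and Crawler.py in this patch goes through two ARDB_Onion sets: onion_domain_crawler_queue (one entry per domain, used to avoid queueing the same domain twice) and onion_crawler_queue (the actual url;paste_path messages). A minimal sketch of that exchange, not part of the patch, assuming the ARDB_Onion defaults from config.cfg.sample and hypothetical URL/paste values:

```python
import redis

r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

# --- producer side: what Onion.py does for a newly seen onion URL ---
url = 'http://examplexxxxxxxxxxxx.onion/page'                    # hypothetical
domain = 'examplexxxxxxxxxxxx.onion'                             # hypothetical
paste_path = 'archive/pastebin.com_pro/2018/08/20/example.gz'    # hypothetical

if not r_onion.sismember('onion_domain_crawler_queue', domain):
    r_onion.sadd('onion_domain_crawler_queue', domain)
    r_onion.sadd('onion_crawler_queue', '{};{}'.format(url, paste_path))

# --- consumer side: what Crawler.py does in its main loop ---
message = r_onion.spop('onion_crawler_queue')
if message is not None:
    url, paste = message.split(';')
    # Crawler.py then extracts the domain, checks the blacklist and the
    # month_onion_up / onion_down sets, and calls crawl_onion()
```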