From 8b1c10b38c4fd5f3a34fe8ee9f52061e78fb01ec Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 9 Aug 2018 17:42:21 +0200
Subject: [PATCH 01/28] chg: [Onion] add onion splash crawler
---
bin/Crawler.py | 92 ++++++++++++++
bin/Onion.py | 8 +-
bin/packages/config.cfg.sample | 12 ++
bin/packages/modules.cfg | 7 +-
bin/torcrawler/TorSplashCrawler.py | 165 ++++++++++++++++++++++++++
bin/torcrawler/tor_crawler.py | 33 ++++++
etc/splash/proxy-profiles/default.ini | 4 +
7 files changed, 319 insertions(+), 2 deletions(-)
create mode 100755 bin/Crawler.py
create mode 100644 bin/torcrawler/TorSplashCrawler.py
create mode 100755 bin/torcrawler/tor_crawler.py
create mode 100644 etc/splash/proxy-profiles/default.ini
diff --git a/bin/Crawler.py b/bin/Crawler.py
new file mode 100755
index 00000000..92d43a81
--- /dev/null
+++ b/bin/Crawler.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import redis
+import datetime
+import time
+import subprocess
+
+sys.path.append(os.environ['AIL_BIN'])
+from Helper import Process
+from pubsublogger import publisher
+
+
+def signal_handler(sig, frame):
+ sys.exit(0)
+
+if __name__ == '__main__':
+
+ publisher.port = 6380
+ publisher.channel = "Script"
+
+ publisher.info("Script Crawler started")
+
+ config_section = 'Crawler'
+
+ # Setup the I/O queues
+ p = Process(config_section)
+
+ splash_url = p.config.get("Crawler", "splash_url")
+ http_proxy = p.config.get("Crawler", "http_proxy")
+ crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+
+ #signal.signal(signal.SIGINT, signal_handler)
+
+ r_serv_metadata = redis.StrictRedis(
+ host=p.config.get("ARDB_Metadata", "host"),
+ port=p.config.getint("ARDB_Metadata", "port"),
+ db=p.config.getint("ARDB_Metadata", "db"),
+ decode_responses=True)
+
+ r_cache = redis.StrictRedis(
+ host=p.config.get("Redis_Cache", "host"),
+ port=p.config.getint("Redis_Cache", "port"),
+ db=p.config.getint("Redis_Cache", "db"),
+ decode_responses=True)
+
+ r_onion = redis.StrictRedis(
+ host=p.config.get("ARDB_Onion", "host"),
+ port=p.config.getint("ARDB_Onion", "port"),
+ db=p.config.getint("ARDB_Onion", "db"),
+ decode_responses=True)
+
+ while True:
+
+ message = p.get_from_set()
+ # Recovering the streamed message information.
+ if message is not None:
+ splitted = message.split(';')
+ if len(splitted) == 2:
+ url, paste = splitted
+
+ print(url)
+
+ if not r_cache.exists(url):
+ super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
+ if super_father is None:
+ super_father=paste
+
+ process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
+ stdout=subprocess.PIPE)
+ while process.poll() is None:
+ time.sleep(1)
+
+ date = datetime.datetime.now().strftime("%Y%m%d")
+ print(date)
+ url_domain = url.replace('http://', '')
+ if process.returncode == 0:
+ if r_serv_metadata.exists('paste_children:'+paste):
+ msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+ p.populate_set_out(msg, 'Tags')
+
+ r_onion.sadd('onion_up:'+date , url_domain)
+ else:
+ r_onion.sadd('onion_down:'+date , url_domain)
+ print(process.stdout.read())
+
+ else:
+ continue
+ else:
+ time.sleep(1)
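
A note on the loop above: Crawler.py pops "url;paste" messages from its queue, shells out to tor_crawler.py, and records the domain in the ARDB_Onion database under a per-day key. A minimal standalone sketch of that bookkeeping, using a plain Redis client with the sample-config values (port 6382, db 9), an illustrative message, and the paste itself as super_father when no parent is recorded:

    import datetime
    import subprocess

    import redis

    # ARDB_Onion connection; host/port/db taken from the sample config added below
    r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

    def handle_message(message):
        url, paste = message.split(';')                      # messages are "url;paste"
        process = subprocess.Popen(
            ["python", "./torcrawler/tor_crawler.py", url, paste, paste],
            stdout=subprocess.PIPE)
        process.wait()                                       # the crawl runs in a child process
        date = datetime.datetime.now().strftime("%Y%m%d")
        domain = url.replace('http://', '')
        # a zero exit code marks the onion as reachable for that day
        key = 'onion_up:' + date if process.returncode == 0 else 'onion_down:' + date
        r_onion.sadd(key, domain)
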
diff --git a/bin/Onion.py b/bin/Onion.py
index 277f1c71..dbedf1e1 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -21,7 +21,6 @@ Requirements
*Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.
"""
-import pprint
import time
from packages import Paste
from pubsublogger import publisher
@@ -123,6 +122,7 @@ if __name__ == "__main__":
PST = Paste.Paste(filename)
for x in PST.get_regex(url_regex):
+ print(x)
# Extracting url with regex
url, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = x
@@ -149,12 +149,18 @@ if __name__ == "__main__":
to_print = 'Onion;{};{};{};'.format(PST.p_source,
PST.p_date,
PST.p_name)
+ '''
for url in fetch(p, r_cache, urls, domains_list, path):
publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
p.populate_set_out(msg, 'Tags')
+ '''
+ for url in urls:
+ msg = '{};{}'.format(url,PST.p_path)
+ print('send to crawler')
+ p.populate_set_out(msg, 'Crawler')
else:
publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 2ed662c1..62ea0887 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -3,6 +3,8 @@ bloomfilters = Blooms
dicofilters = Dicos
pastes = PASTES
base64 = BASE64
+crawled = crawled
+crawled_screenshot = CRAWLED_SCREENSHOT
wordtrending_csv = var/www/static/csv/wordstrendingdata
wordsfile = files/wordfile
@@ -171,6 +173,11 @@ host = localhost
port = 6382
db = 8
+[ARDB_Onion]
+host = localhost
+port = 6382
+db = 9
+
[Url]
cc_critical = DE
@@ -215,3 +222,8 @@ channel = FetchedOnion
host = localhost
port = 6381
db = 0
+
+[Crawler]
+crawler_depth_limit = 1
+splash_url = http://127.0.0.1:8050
+http_proxy = http://127.0.0.1:9050
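
The splash_url added here points at a Splash docker instance and http_proxy at the local Tor proxy. A later patch in this series adds a reachability probe against Splash before each crawl; a standalone version of that probe, with the URL and timeout taken from the values used in the series:

    import requests

    splash_url = "http://127.0.0.1:8050"     # value from the [Crawler] section above
    try:
        r = requests.get(splash_url, timeout=30.0)
        print("Splash reachable:", r.status_code == 200)
    except requests.exceptions.RequestException:
        print("Splash docker is down")
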
diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg
index 452850f7..d8acf2dc 100644
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags
[Onion]
subscribe = Redis_Onion
-publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags
+publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler
#publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler
[DumpValidOnion]
@@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags
[submit_paste]
subscribe = Redis
publish = Redis_Mixer
+
+[Crawler]
+subscribe = Redis_Crawler
+publish = Redis_Mixer,Redis_Tags
+
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
new file mode 100644
index 00000000..ace36056
--- /dev/null
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import gzip
+import base64
+import uuid
+import datetime
+import base64
+import redis
+from urllib.parse import urlparse
+
+from scrapy import Spider
+from scrapy.linkextractors import LinkExtractor
+from scrapy.crawler import CrawlerProcess, Crawler
+
+from twisted.internet import reactor
+
+from scrapy_splash import SplashRequest
+
+sys.path.append(os.environ['AIL_BIN'])
+from Helper import Process
+
+class TorSplashCrawler():
+
+ def __init__(self, splash_url, http_proxy, crawler_depth_limit):
+ self.process = CrawlerProcess({'LOG_ENABLED': False})
+ self.crawler = Crawler(self.TorSplashSpider, {
+ 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
+ 'SPLASH_URL': splash_url,
+ 'HTTP_PROXY': http_proxy,
+ 'ROBOTSTXT_OBEY': False,
+ 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
+ 'scrapy_splash.SplashMiddleware': 725,
+ 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+ },
+ 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
+ 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
+ 'DEPTH_LIMIT': crawler_depth_limit
+ })
+
+ def crawl(self, url, original_paste, super_father):
+ self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
+ self.process.start()
+
+ class TorSplashSpider(Spider):
+ name = 'TorSplashSpider'
+
+ def __init__(self, url, original_paste, super_father, *args, **kwargs):
+ self.original_paste = original_paste
+ self.super_father = super_father
+ self.start_urls = url
+ self.domains = [urlparse(url).netloc]
+ date = datetime.datetime.now().strftime("%Y/%m/%d")
+
+ config_section = 'Crawler'
+ self.p = Process(config_section)
+
+ self.r_cache = redis.StrictRedis(
+ host=self.p.config.get("Redis_Cache", "host"),
+ port=self.p.config.getint("Redis_Cache", "port"),
+ db=self.p.config.getint("Redis_Cache", "db"),
+ decode_responses=True)
+
+ self.r_serv_log_submit = redis.StrictRedis(
+ host=self.p.config.get("Redis_Log_submit", "host"),
+ port=self.p.config.getint("Redis_Log_submit", "port"),
+ db=self.p.config.getint("Redis_Log_submit", "db"),
+ decode_responses=True)
+
+ self.r_serv_metadata = redis.StrictRedis(
+ host=self.p.config.get("ARDB_Metadata", "host"),
+ port=self.p.config.getint("ARDB_Metadata", "port"),
+ db=self.p.config.getint("ARDB_Metadata", "db"),
+ decode_responses=True)
+
+ self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
+ self.p.config.get("Directories", "crawled"), date )
+
+ self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )
+
+ def start_requests(self):
+ yield SplashRequest(
+ self.start_urls,
+ self.parse,
+ endpoint='render.json',
+ meta={'parent': self.original_paste},
+ args={ 'html': 1,
+ 'wait': 10,
+ 'render_all': 1,
+ 'png': 1}
+ )
+
+ def parse(self,response):
+ print(response.headers)
+ print(response.status)
+
+ self.r_cache.setbit(response.url, 0, 1)
+ self.r_cache.expire(response.url, 360000)
+
+ UUID = self.domains[0]+str(uuid.uuid4())
+ filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+ filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+
+ # save new paste on disk
+ if self.save_crawled_paste(filename_paste, response.data['html']):
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+
+ self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
+
+ dirname = os.path.dirname(filename_screenshot)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
+ with open(filename_screenshot, 'wb') as f:
+ f.write(base64.standard_b64decode(response.data['png'].encode()))
+
+ # save external links in set
+ lext = LinkExtractor(deny_domains=self.domains, unique=True)
+ for link in lext.extract_links(response):
+ self.r_serv_metadata.sadd('paste_crawler:filename_paste', link)
+
+ #le = LinkExtractor(unique=True)
+ le = LinkExtractor(allow_domains=self.domains, unique=True)
+ for link in le.extract_links(response):
+ self.r_cache.setbit(link, 0, 0)
+ self.r_cache.expire(link, 360000)
+ yield SplashRequest(
+ link.url,
+ self.parse,
+ endpoint='render.json',
+ meta={'parent': UUID},
+ args={ 'html': 1,
+ 'png': 1,
+ 'render_all': 1,
+ 'wait': 10}
+ )
+
+ def save_crawled_paste(self, filename, content):
+
+ print(filename)
+ if os.path.isfile(filename):
+ print('File: {} already exist in submitted pastes'.format(filename))
+ return False
+
+ try:
+ gzipencoded = gzip.compress(content.encode())
+ gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
+ except:
+ print("file error: {}".format(filename))
+ return False
+
+ # send paste to Global
+ relay_message = "{0} {1}".format(filename, gzip64encoded)
+ self.p.populate_set_out(relay_message, 'Mixer')
+
+ # increase nb of paste by feeder name
+ self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
+
+ # tag crawled paste
+ msg = 'infoleak:submission="crawler";{}'.format(filename)
+ self.p.populate_set_out(msg, 'Tags')
+ return True
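
save_crawled_paste() relays each crawled page to the Mixer queue as "<filename> <payload>", where the payload is the rendered HTML gzipped and then base64-encoded. A short sketch of that round trip with a stand-in page and an illustrative paste name, showing how a downstream consumer can reverse it:

    import base64
    import gzip

    html = "<html><body>example rendered page</body></html>"   # stand-in for response.data['html']
    filename = "crawled/2018/08/09/example.onion-uuid"          # illustrative paste name

    # encoding, as done by save_crawled_paste()
    gzipencoded = gzip.compress(html.encode())
    gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
    relay_message = "{0} {1}".format(filename, gzip64encoded)

    # decoding, as a downstream module would do it
    name, payload = relay_message.split(' ', 1)
    recovered = gzip.decompress(base64.standard_b64decode(payload)).decode()
    assert recovered == html
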
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
new file mode 100755
index 00000000..3085f213
--- /dev/null
+++ b/bin/torcrawler/tor_crawler.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import configparser
+from TorSplashCrawler import TorSplashCrawler
+
+if __name__ == '__main__':
+
+ if len(sys.argv) != 4:
+ print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
+ exit(1)
+
+ configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+ if not os.path.exists(configfile):
+ raise Exception('Unable to find the configuration file. \
+ Did you set environment variables? \
+ Or activate the virtualenv.')
+
+ cfg = configparser.ConfigParser()
+ cfg.read(configfile)
+
+ splash_url = cfg.get("Crawler", "splash_url")
+ http_proxy = cfg.get("Crawler", "http_proxy")
+ crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
+
+ url = sys.argv[1]
+ paste = sys.argv[2]
+ super_father = sys.argv[3]
+
+ crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
+ crawler.crawl(url, paste, super_father)
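
tor_crawler.py is the entry point spawned by Crawler.py; at this stage it takes three positional arguments (patch 02 below adds the domain as a second argument). An invocation sketch with placeholder values:

    import subprocess

    subprocess.run([
        "python", "./torcrawler/tor_crawler.py",
        "http://examplexxxxxxxxxx.onion",                    # url to crawl (placeholder)
        "archive/pastebin.com_pro/2018/08/09/example.gz",    # originating paste (placeholder)
        "archive/pastebin.com_pro/2018/08/09/example.gz",    # super_father, same paste if none recorded
    ])
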
diff --git a/etc/splash/proxy-profiles/default.ini b/etc/splash/proxy-profiles/default.ini
new file mode 100644
index 00000000..91208135
--- /dev/null
+++ b/etc/splash/proxy-profiles/default.ini
@@ -0,0 +1,4 @@
+[proxy]
+host=localhost
+port=9050
+type=SOCKS5
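
The proxy profile above makes Splash route its outgoing requests through the local Tor SOCKS proxy; because it is named default.ini, it should apply to render requests that pass no explicit proxy argument (assumption based on Splash's proxy-profile behaviour). A direct call to the render.json endpoint used by the spider, with a placeholder onion address:

    import requests

    resp = requests.get('http://127.0.0.1:8050/render.json', params={
        'url': 'http://examplexxxxxxxxxx.onion',   # placeholder address
        'html': 1,                                 # same arguments the spider passes
        'png': 1,
        'render_all': 1,
        'wait': 10,
    })
    data = resp.json()   # data['html'] is the rendered page, data['png'] a base64 screenshot
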
From 765208943344fb56bfad652d79aecdac83a38364 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 13 Aug 2018 09:23:14 +0200
Subject: [PATCH 02/28] chg: [Onion] change onion regex, fix crawler
---
bin/Crawler.py | 65 +++++++++++++++++++++---------
bin/Onion.py | 2 +-
bin/torcrawler/TorSplashCrawler.py | 38 +++++++++++++----
bin/torcrawler/tor_crawler.py | 11 ++---
4 files changed, 82 insertions(+), 34 deletions(-)
diff --git a/bin/Crawler.py b/bin/Crawler.py
index 92d43a81..a8292b74 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -3,6 +3,7 @@
import os
import sys
+import re
import redis
import datetime
import time
@@ -16,6 +17,33 @@ from pubsublogger import publisher
def signal_handler(sig, frame):
sys.exit(0)
+def crawl_onion(url, domain):
+ date = datetime.datetime.now().strftime("%Y%m%d")
+
+ if not r_onion.sismember('onion_up:'+date , domain):
+ super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
+ if super_father is None:
+ super_father=paste
+
+ process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+ stdout=subprocess.PIPE)
+ while process.poll() is None:
+ time.sleep(1)
+
+ if process.returncode == 0:
+ if r_serv_metadata.exists('paste_children:'+paste):
+ msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+ p.populate_set_out(msg, 'Tags')
+ print(process.stdout.read())
+
+ r_onion.sadd('onion_up:'+date , domain)
+ r_onion.sadd('onion_up_link:'+date , url)
+ else:
+ r_onion.sadd('onion_down:'+date , domain)
+ r_onion.sadd('onion_down_link:'+date , url)
+ print(process.stdout.read())
+
+
if __name__ == '__main__':
publisher.port = 6380
@@ -52,6 +80,9 @@ if __name__ == '__main__':
db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
+ url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+ re.compile(url_regex)
+
while True:
message = p.get_from_set()
@@ -61,30 +92,24 @@ if __name__ == '__main__':
if len(splitted) == 2:
url, paste = splitted
- print(url)
+ url_list = re.findall(url_regex, url)[0]
+ if url_list[1] == '':
+ url= 'http://{}'.format(url)
- if not r_cache.exists(url):
- super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
- if super_father is None:
- super_father=paste
+ link, s, credential, subdomain, domain, host, port, \
+ resource_path, query_string, f1, f2, f3, f4 = url_list
+ domain = url_list[4]
- process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
- stdout=subprocess.PIPE)
- while process.poll() is None:
- time.sleep(1)
+ domain_url = 'http://{}'.format(domain)
- date = datetime.datetime.now().strftime("%Y%m%d")
- print(date)
- url_domain = url.replace('http://', '')
- if process.returncode == 0:
- if r_serv_metadata.exists('paste_children:'+paste):
- msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
- p.populate_set_out(msg, 'Tags')
+ print('------------------START ONIOM CRAWLER------------------')
+ print('url: {}'.format(url))
+ print('domain: {}'.format(domain))
+ print('domain_url: {}'.format(domain_url))
- r_onion.sadd('onion_up:'+date , url_domain)
- else:
- r_onion.sadd('onion_down:'+date , url_domain)
- print(process.stdout.read())
+ crawl_onion(url, domain)
+ if url != domain_url:
+ crawl_onion(domain_url, domain)
else:
continue
diff --git a/bin/Onion.py b/bin/Onion.py
index dbedf1e1..1e2dff32 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -108,7 +108,7 @@ if __name__ == "__main__":
# Thanks to Faup project for this regex
# https://github.com/stricaud/faup
- url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+ url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
while True:
if message is not None:
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index ace36056..63839799 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -9,14 +9,11 @@ import uuid
import datetime
import base64
import redis
-from urllib.parse import urlparse
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
-from twisted.internet import reactor
-
from scrapy_splash import SplashRequest
sys.path.append(os.environ['AIL_BIN'])
@@ -40,19 +37,20 @@ class TorSplashCrawler():
'DEPTH_LIMIT': crawler_depth_limit
})
- def crawl(self, url, original_paste, super_father):
- self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
+ def crawl(self, url, domain, original_paste, super_father):
+ self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
- def __init__(self, url, original_paste, super_father, *args, **kwargs):
+ def __init__(self, url, domain,original_paste, super_father, *args, **kwargs):
self.original_paste = original_paste
self.super_father = super_father
self.start_urls = url
- self.domains = [urlparse(url).netloc]
+ self.domains = [domain]
date = datetime.datetime.now().strftime("%Y/%m/%d")
+ self.full_date = datetime.datetime.now().strftime("%Y%m%d")
config_section = 'Crawler'
self.p = Process(config_section)
@@ -75,6 +73,12 @@ class TorSplashCrawler():
db=self.p.config.getint("ARDB_Metadata", "db"),
decode_responses=True)
+ self.r_serv_onion = redis.StrictRedis(
+ host=self.p.config.get("ARDB_Onion", "host"),
+ port=self.p.config.getint("ARDB_Onion", "port"),
+ db=self.p.config.getint("ARDB_Onion", "db"),
+ decode_responses=True)
+
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
self.p.config.get("Directories", "crawled"), date )
@@ -96,6 +100,7 @@ class TorSplashCrawler():
print(response.headers)
print(response.status)
+ # # TODO: # FIXME:
self.r_cache.setbit(response.url, 0, 1)
self.r_cache.expire(response.url, 360000)
@@ -105,8 +110,19 @@ class TorSplashCrawler():
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
+
+ # create onion metadata
+ if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])):
+ self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date)
+ self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date)
+
+ # add onion screenshot history
+ self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date)
+
+ #create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
@@ -114,6 +130,13 @@ class TorSplashCrawler():
dirname = os.path.dirname(filename_screenshot)
if not os.path.exists(dirname):
os.makedirs(dirname)
+
+ print(sys.getsizeof(response.data['png']))
+ print(sys.getsizeof(response.data['html']))
+ print(self.domains[0])
+
+
+
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))
@@ -140,7 +163,6 @@ class TorSplashCrawler():
def save_crawled_paste(self, filename, content):
- print(filename)
if os.path.isfile(filename):
print('File: {} already exist in submitted pastes'.format(filename))
return False
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 3085f213..57a77e76 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler
if __name__ == '__main__':
- if len(sys.argv) != 4:
- print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
+ if len(sys.argv) != 5:
+ print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father')
exit(1)
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -26,8 +26,9 @@ if __name__ == '__main__':
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
url = sys.argv[1]
- paste = sys.argv[2]
- super_father = sys.argv[3]
+ domain = sys.argv[2]
+ paste = sys.argv[3]
+ super_father = sys.argv[4]
crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
- crawler.crawl(url, paste, super_father)
+ crawler.crawl(url, domain, paste, super_father)
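
Patch 02 replaces urlparse with the Faup-style regex: re.findall() returns one 13-element tuple per match and index 4 carries the host, which is why the code keys the per-domain sets on url_list[4]. A small check against a placeholder onion URL:

    import re

    # same regex as in Crawler.py / Onion.py above
    url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

    url = 'http://sub.examplexxxxxxxxxx.onion/some/page'     # placeholder address
    link, s, credential, subdomain, domain, host, port, \
        resource_path, query_string, f1, f2, f3, f4 = re.findall(url_regex, url)[0]
    print(domain)   # sub.examplexxxxxxxxxx.onion, the value used as the crawler's domain key
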
From ed559d9f4a9d2b2cd36918c6559fe49fe4fbf140 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 16 Aug 2018 17:24:39 +0200
Subject: [PATCH 03/28] chg: [Showpaste] add screenshot + improve onion db
---
.gitignore | 1 +
bin/Crawler.py | 88 ++++++++++++++-----
bin/packages/Paste.py | 4 +
bin/torcrawler/TorSplashCrawler.py | 67 ++++++++++----
var/www/modules/Flask_config.py | 8 ++
var/www/modules/showpaste/Flask_showpaste.py | 20 ++++-
.../showpaste/templates/show_saved_paste.html | 36 ++++++++
7 files changed, 186 insertions(+), 38 deletions(-)
diff --git a/.gitignore b/.gitignore
index e74906ae..b5755ee6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ var/www/submitted
# Local config
bin/packages/config.cfg
configs/keys
+files
# installed files
nltk_data/
diff --git a/bin/Crawler.py b/bin/Crawler.py
index a8292b74..df1e0117 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -8,6 +8,7 @@ import redis
import datetime
import time
import subprocess
+import requests
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
@@ -17,31 +18,40 @@ from pubsublogger import publisher
def signal_handler(sig, frame):
sys.exit(0)
-def crawl_onion(url, domain):
- date = datetime.datetime.now().strftime("%Y%m%d")
+def crawl_onion(url, domain, date):
- if not r_onion.sismember('onion_up:'+date , domain):
+ if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain):
+ #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
if super_father is None:
super_father=paste
- process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
- stdout=subprocess.PIPE)
- while process.poll() is None:
- time.sleep(1)
+ try:
+ r = requests.get(splash_url , timeout=0.010)
+ except Exception:
+ ## FIXME: # TODO: relaunch docker
+ exit(0)
- if process.returncode == 0:
- if r_serv_metadata.exists('paste_children:'+paste):
- msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
- p.populate_set_out(msg, 'Tags')
- print(process.stdout.read())
+ if r.status_code == 200:
+ process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+ stdout=subprocess.PIPE)
+ while process.poll() is None:
+ time.sleep(1)
- r_onion.sadd('onion_up:'+date , domain)
- r_onion.sadd('onion_up_link:'+date , url)
+ if process.returncode == 0:
+ if r_serv_metadata.exists('paste_children:'+paste):
+ msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+ p.populate_set_out(msg, 'Tags')
+
+ print(process.stdout.read())
+
+ else:
+ r_onion.sadd('onion_down:'+date , domain)
+ r_onion.sadd('onion_down_link:'+date , url)
+ print(process.stdout.read())
else:
- r_onion.sadd('onion_down:'+date , domain)
- r_onion.sadd('onion_down_link:'+date , url)
- print(process.stdout.read())
+ ## FIXME: # TODO: relaunch docker
+ exit(0)
if __name__ == '__main__':
@@ -102,15 +112,51 @@ if __name__ == '__main__':
domain_url = 'http://{}'.format(domain)
- print('------------------START ONIOM CRAWLER------------------')
+ print('------------------START ONION CRAWLER------------------')
print('url: {}'.format(url))
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
- crawl_onion(url, domain)
- if url != domain_url:
- crawl_onion(domain_url, domain)
+ if not r_onion.sismember('banned_onion', domain):
+ date = datetime.datetime.now().strftime("%Y%m%d")
+
+ crawl_onion(url, domain, date)
+ if url != domain_url:
+ crawl_onion(domain_url, domain, date)
+
+ # save dowm onion
+ if not r_onion.sismember('onion_up:'+date , domain):
+ r_onion.sadd('onion_down:'+date , domain)
+ r_onion.sadd('onion_down_link:'+date , url)
+ r_onion.hincrby('onion_link_down', url, 1)
+ if not r_onion.exists('onion_metadata:{}'.format(domain)):
+ r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
+ r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
+ else:
+ r_onion.hincrby('onion_link_up', url, 1)
+
+ # last check
+ r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+
+ # check external onions links (full_scrawl)
+ external_domains = set()
+ for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
+ print(link)
+ external_domain = re.findall(url_regex, link)
+ print(external_domain)
+ if len(external_domain) > 0:
+ external_domain = external_domain[0][4]
+ else:
+ continue
+ print(external_domain)
+ # # TODO: add i2p
+ if '.onion' in external_domain and external_domain != domain:
+ external_domains.add(external_domain)
+ if len(external_domains) >= 10:
+ r_onion.sadd('onion_potential_source', domain)
+ r_onion.delete('domain_onion_external_links:{}'.format(domain))
+ print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
else:
continue
else:
diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py
index d1e3f0d3..45ed1ed2 100755
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@@ -94,6 +94,7 @@ class Paste(object):
var = self.p_path.split('/')
self.p_date = Date(var[-4], var[-3], var[-2])
+ self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
self.p_source = var[-5]
self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])
@@ -291,6 +292,9 @@ class Paste(object):
else:
return '[]'
+ def get_p_rel_path(self):
+ return self.p_rel_path
+
def save_all_attributes_redis(self, key=None):
"""
Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 63839799..3d392b93 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -10,6 +10,10 @@ import datetime
import base64
import redis
+from scrapy.spidermiddlewares.httperror import HttpError
+from twisted.internet.error import DNSLookupError
+from twisted.internet.error import TimeoutError
+
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
@@ -79,6 +83,8 @@ class TorSplashCrawler():
db=self.p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
+ self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )
+
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
self.p.config.get("Directories", "crawled"), date )
@@ -89,7 +95,7 @@ class TorSplashCrawler():
self.start_urls,
self.parse,
endpoint='render.json',
- meta={'parent': self.original_paste},
+ meta={'father': self.original_paste},
args={ 'html': 1,
'wait': 10,
'render_all': 1,
@@ -106,44 +112,47 @@ class TorSplashCrawler():
UUID = self.domains[0]+str(uuid.uuid4())
filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+ relative_filename_paste = os.path.join(self.crawler_path, UUID)
filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
+ self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
+ self.r_serv_onion.sadd('full_onion_up', self.domains[0])
+
# create onion metadata
- if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])):
- self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date)
- self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date)
+ if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
+ self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
+ self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
# add onion screenshot history
- self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date)
+ self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
- self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
- self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
+ self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
dirname = os.path.dirname(filename_screenshot)
if not os.path.exists(dirname):
os.makedirs(dirname)
- print(sys.getsizeof(response.data['png']))
- print(sys.getsizeof(response.data['html']))
- print(self.domains[0])
+ size_screenshot = (len(response.data['png'])*3) /4
+ print(size_screenshot)
-
-
- with open(filename_screenshot, 'wb') as f:
- f.write(base64.standard_b64decode(response.data['png'].encode()))
+ if size_screenshot < 5000000: #bytes
+ with open(filename_screenshot, 'wb') as f:
+ f.write(base64.standard_b64decode(response.data['png'].encode()))
# save external links in set
lext = LinkExtractor(deny_domains=self.domains, unique=True)
for link in lext.extract_links(response):
- self.r_serv_metadata.sadd('paste_crawler:filename_paste', link)
+ self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
+ self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
#le = LinkExtractor(unique=True)
le = LinkExtractor(allow_domains=self.domains, unique=True)
@@ -154,12 +163,38 @@ class TorSplashCrawler():
link.url,
self.parse,
endpoint='render.json',
- meta={'parent': UUID},
+ meta={'father': relative_filename_paste},
args={ 'html': 1,
'png': 1,
'render_all': 1,
'wait': 10}
+ #errback=self.errback_catcher
)
+ '''
+ def errback_catcher(self, failure):
+ # catch all errback failures,
+ self.logger.error(repr(failure))
+
+ #if isinstance(failure.value, HttpError):
+ if failure.check(HttpError):
+ # you can get the response
+ response = failure.value.response
+ print('HttpError')
+ self.logger.error('HttpError on %s', response.url)
+
+ #elif isinstance(failure.value, DNSLookupError):
+ elif failure.check(DNSLookupError):
+ # this is the original request
+ request = failure.request
+ print(DNSLookupError)
+ self.logger.error('DNSLookupError on %s', request.url)
+
+ #elif isinstance(failure.value, TimeoutError):
+ elif failure.check(TimeoutError):
+ request = failure.request
+ print(TimeoutError)
+ self.logger.error('TimeoutError on %s', request.url)
+ '''
def save_crawled_paste(self, filename, content):
diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py
index 2c3e736a..5424ccc8 100644
--- a/var/www/modules/Flask_config.py
+++ b/var/www/modules/Flask_config.py
@@ -96,6 +96,12 @@ r_serv_statistics = redis.StrictRedis(
db=cfg.getint("ARDB_Statistics", "db"),
decode_responses=True)
+r_serv_onion = redis.StrictRedis(
+ host=cfg.get("ARDB_Onion", "host"),
+ port=cfg.getint("ARDB_Onion", "port"),
+ db=cfg.getint("ARDB_Onion", "db"),
+ decode_responses=True)
+
sys.path.append('../../configs/keys')
# MISP #
@@ -144,4 +150,6 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')
+SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+
max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs"))
diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py
index cc70527c..6fa5a983 100644
--- a/var/www/modules/showpaste/Flask_showpaste.py
+++ b/var/www/modules/showpaste/Flask_showpaste.py
@@ -5,9 +5,10 @@
Flask functions and routes for the trending modules page
'''
import redis
+import os
import json
import flask
-from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response
+from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory
import difflib
import ssdeep
@@ -22,12 +23,14 @@ r_serv_pasteName = Flask_config.r_serv_pasteName
r_serv_metadata = Flask_config.r_serv_metadata
r_serv_tags = Flask_config.r_serv_tags
r_serv_statistics = Flask_config.r_serv_statistics
+r_serv_onion = Flask_config.r_serv_onion
max_preview_char = Flask_config.max_preview_char
max_preview_modal = Flask_config.max_preview_modal
DiffMaxLineLength = Flask_config.DiffMaxLineLength
bootstrap_label = Flask_config.bootstrap_label
misp_event_url = Flask_config.misp_event_url
hive_case_url = Flask_config.hive_case_url
+SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER
showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
@@ -130,6 +133,16 @@ def showpaste(content_range):
list_tags.append( (tag, automatic, tag_status_tp, tag_status_fp) )
+ crawler_metadata = {}
+ if 'infoleak:submission="crawler"' in l_tags:
+ crawler_metadata['get_metadata'] = True
+ crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
+ crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
+ crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path)
+ crawler_metadata['screenshot'] = paste.get_p_rel_path()
+ else:
+ crawler_metadata['get_metadata'] = False
+
if Flask_config.pymisp is False:
misp = False
else:
@@ -157,6 +170,7 @@ def showpaste(content_range):
hive_url = hive_case_url.replace('id_here', hive_case)
return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list,
+ crawler_metadata=crawler_metadata,
misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url)
# ============ ROUTES ============
@@ -202,5 +216,9 @@ def showDiff():
the_html = htmlD.make_file(lines1, lines2)
return the_html
+@showsavedpastes.route('/screenshot/')
+def screenshot(filename):
+ return send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True)
+
# ========= REGISTRATION =========
app.register_blueprint(showsavedpastes)
diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html
index cb3f8b68..866f64c1 100644
--- a/var/www/modules/showpaste/templates/show_saved_paste.html
+++ b/var/www/modules/showpaste/templates/show_saved_paste.html
@@ -373,6 +373,42 @@
{% endif %}
+
+ {% if crawler_metadata['get_metadata'] %}
+
+
+
+
+
+
+
+
+
+ Graph
+
+
+
+
+
+ Father |
+ {{ crawler_metadata['paste_father'] }} |
+
+
+ Source link |
+ {{ crawler_metadata['real_link'] }} |
+
+
+ External links |
+ {{ crawler_metadata['external_links'] }} |
+
+
+
+
+
+
+
+ {% endif %}
+
Content:
[Raw content]
{{ content }}
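
Patch 03 also adds a size guard before writing the screenshot: Splash returns the PNG base64-encoded, so the decoded size is roughly len(b64) * 3 / 4 bytes, and anything estimated above 5 MB is skipped without decoding. A small sketch of that estimate with a synthetic payload:

    import base64

    png_bytes = b"\x89PNG\r\n\x1a\n" + b"\x00" * 10000        # stand-in for a real screenshot
    png_b64 = base64.standard_b64encode(png_bytes).decode()   # what response.data['png'] holds

    size_screenshot = (len(png_b64) * 3) / 4                  # same estimate as the crawler
    print(size_screenshot, len(base64.standard_b64decode(png_b64)))  # estimate vs. actual bytes

    if size_screenshot < 5000000:                             # 5 MB cut-off used above
        with open('screenshot.png', 'wb') as f:
            f.write(base64.standard_b64decode(png_b64.encode()))
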
From e9580d6775981a6a7eeea882bd96ce77ea59cb32 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 21 Aug 2018 15:54:53 +0200
Subject: [PATCH 04/28] chg: [Crawler] change DB, save i2p links
---
bin/Crawler.py | 140 +++++++------
bin/Onion.py | 45 ++++-
bin/packages/HiddenServices.py | 79 ++++++++
bin/torcrawler/TorSplashCrawler.py | 2 +
files/Onion | 1 +
.../hiddenServices/Flask_hiddenServices.py | 99 +++++++++
.../templates/header_hiddenServices.html | 1 +
.../templates/hiddenServices.html | 188 ++++++++++++++++++
.../hiddenServices/templates/showDomain.html | 76 +++++++
9 files changed, 567 insertions(+), 64 deletions(-)
create mode 100755 bin/packages/HiddenServices.py
create mode 100644 var/www/modules/hiddenServices/Flask_hiddenServices.py
create mode 100644 var/www/modules/hiddenServices/templates/header_hiddenServices.html
create mode 100644 var/www/modules/hiddenServices/templates/hiddenServices.html
create mode 100644 var/www/modules/hiddenServices/templates/showDomain.html
diff --git a/bin/Crawler.py b/bin/Crawler.py
index df1e0117..2e617959 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -18,40 +18,41 @@ from pubsublogger import publisher
def signal_handler(sig, frame):
sys.exit(0)
-def crawl_onion(url, domain, date):
+def crawl_onion(url, domain, date, date_month):
- if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain):
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
- super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
- if super_father is None:
- super_father=paste
+ super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
+ if super_father is None:
+ super_father=paste
- try:
- r = requests.get(splash_url , timeout=0.010)
- except Exception:
- ## FIXME: # TODO: relaunch docker
- exit(0)
+ try:
+ r = requests.get(splash_url , timeout=30.0)
+ except Exception:
+ ## FIXME: # TODO: relaunch docker or send error message
+ print('--------------------------------------')
+ print(' DOCKER SPLASH DOWN')
+ exit(0)
- if r.status_code == 200:
- process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
- stdout=subprocess.PIPE)
- while process.poll() is None:
- time.sleep(1)
+ if r.status_code == 200:
+ process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+ stdout=subprocess.PIPE)
+ while process.poll() is None:
+ time.sleep(1)
- if process.returncode == 0:
- if r_serv_metadata.exists('paste_children:'+paste):
- msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
- p.populate_set_out(msg, 'Tags')
+ if process.returncode == 0:
+ if r_serv_metadata.exists('paste_children:'+paste):
+ msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+ p.populate_set_out(msg, 'Tags')
- print(process.stdout.read())
+ print(process.stdout.read())
- else:
- r_onion.sadd('onion_down:'+date , domain)
- r_onion.sadd('onion_down_link:'+date , url)
- print(process.stdout.read())
else:
- ## FIXME: # TODO: relaunch docker
- exit(0)
+ r_onion.sadd('onion_down:'+date , domain)
+ r_onion.sadd('onion_down_link:'+date , url)
+ print(process.stdout.read())
+ else:
+ ## FIXME: # TODO: relaunch docker
+ exit(0)
if __name__ == '__main__':
@@ -97,11 +98,23 @@ if __name__ == '__main__':
message = p.get_from_set()
# Recovering the streamed message information.
+ #message = r_onion.spop('mess_onion')
+ print(message)
+
+ if message is None:
+ print('get ardb message')
+ message = r_onion.spop('mess_onion')
+
if message is not None:
+
splitted = message.split(';')
if len(splitted) == 2:
url, paste = splitted
+ if not '.onion' in url:
+ print('not onion')
+ continue
+
url_list = re.findall(url_regex, url)[0]
if url_list[1] == '':
url= 'http://{}'.format(url)
@@ -117,46 +130,55 @@ if __name__ == '__main__':
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
+ '''if not r_onion.sismember('full_onion_up', domain):
+ r_onion.sadd('mess_onion', message)
+ print('added ..............')'''
+
+
if not r_onion.sismember('banned_onion', domain):
date = datetime.datetime.now().strftime("%Y%m%d")
+ date_month = datetime.datetime.now().strftime("%Y%m")
- crawl_onion(url, domain, date)
- if url != domain_url:
- crawl_onion(domain_url, domain, date)
+ if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
- # save dowm onion
- if not r_onion.sismember('onion_up:'+date , domain):
- r_onion.sadd('onion_down:'+date , domain)
- r_onion.sadd('onion_down_link:'+date , url)
- r_onion.hincrby('onion_link_down', url, 1)
- if not r_onion.exists('onion_metadata:{}'.format(domain)):
- r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
- r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
- else:
- r_onion.hincrby('onion_link_up', url, 1)
+ crawl_onion(url, domain, date, date_month)
+ if url != domain_url:
+ crawl_onion(domain_url, domain, date, date_month)
- # last check
- r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
-
- # check external onions links (full_scrawl)
- external_domains = set()
- for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
- print(link)
- external_domain = re.findall(url_regex, link)
- print(external_domain)
- if len(external_domain) > 0:
- external_domain = external_domain[0][4]
+ # save down onion
+ if not r_onion.sismember('onion_up:'+date , domain):
+ r_onion.sadd('onion_down:'+date , domain)
+ r_onion.sadd('onion_down_link:'+date , url)
+ r_onion.hincrby('onion_link_down', url, 1)
+ if not r_onion.exists('onion_metadata:{}'.format(domain)):
+ r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
+ r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
else:
- continue
- print(external_domain)
- # # TODO: add i2p
- if '.onion' in external_domain and external_domain != domain:
- external_domains.add(external_domain)
- if len(external_domains) >= 10:
- r_onion.sadd('onion_potential_source', domain)
- r_onion.delete('domain_onion_external_links:{}'.format(domain))
- print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+ r_onion.hincrby('onion_link_up', url, 1)
+
+ # last check
+ r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+
+ # check external onions links (full_scrawl)
+ external_domains = set()
+ for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
+ external_domain = re.findall(url_regex, link)
+ if len(external_domain) > 0:
+ external_domain = external_domain[0][4]
+ else:
+ continue
+ # # TODO: add i2p
+ if '.onion' in external_domain and external_domain != domain:
+ external_domains.add(external_domain)
+ if len(external_domains) >= 10:
+ r_onion.sadd('onion_potential_source', domain)
+ r_onion.delete('domain_onion_external_links:{}'.format(domain))
+ print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+
+ r_onion.lpush('last_onions', domain)
+ r_onion.ltrim('last_onions', 0, 15)
+
else:
continue
else:
diff --git a/bin/Onion.py b/bin/Onion.py
index 1e2dff32..23a81755 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -29,6 +29,7 @@ import os
import base64
import subprocess
import redis
+import re
from Helper import Process
@@ -96,6 +97,12 @@ if __name__ == "__main__":
db=p.config.getint("Redis_Cache", "db"),
decode_responses=True)
+ r_onion = redis.StrictRedis(
+ host=p.config.get("ARDB_Onion", "host"),
+ port=p.config.getint("ARDB_Onion", "port"),
+ db=p.config.getint("ARDB_Onion", "db"),
+ decode_responses=True)
+
# FUNCTIONS #
publisher.info("Script subscribed to channel onion_categ")
@@ -109,6 +116,9 @@ if __name__ == "__main__":
# Thanks to Faup project for this regex
# https://github.com/stricaud/faup
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+ i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+ re.compile(url_regex)
+
while True:
if message is not None:
@@ -127,8 +137,22 @@ if __name__ == "__main__":
url, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = x
- domains_list.append(domain)
- urls.append(url)
+ if '.onion' in url:
+ print(url)
+ domains_list.append(domain)
+ urls.append(url)
+
+ for x in PST.get_regex(i2p_regex):
+ # Extracting url with regex
+ url, s, credential, subdomain, domain, host, port, \
+ resource_path, query_string, f1, f2, f3, f4 = x
+
+ if '.i2p' in url:
+ print('add i2p')
+ print(domain)
+ if not r_onion.sismember('i2p_domain', domain):
+ r_onion.sadd('i2p_domain', domain)
+ r_onion.sadd('i2p_link', url)
# Saving the list of extracted onion domains.
PST.__setattr__(channel, domains_list)
@@ -157,10 +181,21 @@ if __name__ == "__main__":
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
p.populate_set_out(msg, 'Tags')
'''
+
+ date_month = datetime.datetime.now().strftime("%Y%m")
+ date = datetime.datetime.now().strftime("%Y%m%d")
for url in urls:
- msg = '{};{}'.format(url,PST.p_path)
- print('send to crawler')
- p.populate_set_out(msg, 'Crawler')
+
+ domain = re.findall(url_regex, url)
+ if len(domain) > 0:
+ domain = domain[0][4]
+ else:
+ continue
+
+ if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+ msg = '{};{}'.format(url,PST.p_path)
+ print('send to crawler')
+ p.populate_set_out(msg, 'Crawler')
else:
publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
new file mode 100755
index 00000000..48f514fc
--- /dev/null
+++ b/bin/packages/HiddenServices.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python3
+
+"""
+The ``hiddenServices Class``
+===================
+
+Use it to create an object from an existing paste or other random file.
+
+Conditions to fulfill to be able to use this class correctly:
+-------------------------------------------------------------
+
+1/ The paste need to be saved on disk somewhere (have an accessible path)
+2/ The paste need to be gziped.
+3/ The filepath need to look like something like this:
+ /directory/source/year/month/day/paste.gz
+
+"""
+
+import os
+import gzip
+import redis
+
+import configparser
+import sys
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
+from Date import Date
+
+class HiddenServices(object):
+ """
+ This class representing a hiddenServices as an object.
+ When created, the object will have by default some "main attributes"
+
+ :Example:
+
+ PST = HiddenServices("xxxxxxxx.onion", "onion")
+
+ """
+
+ def __init__(self, domain, type):
+
+ configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+ if not os.path.exists(configfile):
+ raise Exception('Unable to find the configuration file. \
+ Did you set environment variables? \
+ Or activate the virtualenv.')
+
+ cfg = configparser.ConfigParser()
+ cfg.read(configfile)
+ self.r_serv_onion = redis.StrictRedis(
+ host=cfg.get("ARDB_Onion", "host"),
+ port=cfg.getint("ARDB_Onion", "port"),
+ db=cfg.getint("ARDB_Onion", "db"),
+ decode_responses=True)
+
+ self.domain = domain
+ self.type = type
+
+ if type == 'onion':
+ self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"), cfg.get("Directories", "crawled"))
+ self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+ elif type == 'i2p':
+ self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+ self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+ else:
+ ## TODO: # FIXME: add error
+ pass
+
+
+ def get_last_crawled_pastes(self):
+
+ last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check')
+ return self.get_crawled_pastes_by_date(last_check)
+
+ def get_crawled_pastes_by_date(self, date):
+ pastes_path = os.path.join(self.paste_directory, date[0:4], date[4:6], date[6:8])
+ l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f]
+ print(len(l_crawled_pastes))
+ print(l_crawled_pastes)
+ return l_crawled_pastes
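
A usage sketch for the new HiddenServices class, with a placeholder onion address; it assumes AIL_BIN and AIL_HOME are exported and that the domain has been crawled at least once so that its last_check date is set:

    import os
    import sys

    sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
    from HiddenServices import HiddenServices

    h = HiddenServices("examplexxxxxxxxxx.onion", "onion")   # placeholder domain
    pastes = h.get_last_crawled_pastes()                     # pastes crawled on the last_check date
    print(pastes)
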
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 3d392b93..c5280329 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -55,6 +55,7 @@ class TorSplashCrawler():
self.domains = [domain]
date = datetime.datetime.now().strftime("%Y/%m/%d")
self.full_date = datetime.datetime.now().strftime("%Y%m%d")
+ self.date_month = datetime.datetime.now().strftime("%Y%m")
config_section = 'Crawler'
self.p = Process(config_section)
@@ -120,6 +121,7 @@ class TorSplashCrawler():
self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
self.r_serv_onion.sadd('full_onion_up', self.domains[0])
+ self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0])
# create onion metadata
if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
diff --git a/files/Onion b/files/Onion
index 5c9980e2..69fcf878 100644
--- a/files/Onion
+++ b/files/Onion
@@ -1 +1,2 @@
onion
+i2p
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
new file mode 100644
index 00000000..04740a93
--- /dev/null
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+'''
+ Flask functions and routes for the trending modules page
+'''
+import redis
+import datetime
+from flask import Flask, render_template, jsonify, request, Blueprint
+
+import HiddenServices
+from Date import Date
+
+# ============ VARIABLES ============
+import Flask_config
+
+app = Flask_config.app
+cfg = Flask_config.cfg
+r_serv_onion = Flask_config.r_serv_onion
+
+hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates')
+
+# ============ FUNCTIONS ============
+def one():
+ return 1
+
+def get_date_range(num_day):
+ curr_date = datetime.date.today()
+ date = Date( '{}{}{}'.format(str(curr_date.year), str(curr_date.month).zfill(2), str(curr_date.day).zfill(2)) )
+ date_list = []
+
+ for i in range(0, num_day):
+ date_list.append(date.substract_day(i))
+
+ return list(reversed(date_list))
+
+def get_onion_status(domain, date):
+ if r_serv_onion.sismember('onion_up:'+date , domain):
+ return True
+ else:
+ return False
+# ============= ROUTES ==============
+
+@hiddenServices.route("/hiddenServices/", methods=['GET'])
+def hiddenServices_page():
+ last_onions = r_serv_onion.lrange('last_onions', 0 ,-1)
+ list_onion = []
+
+ for onion in last_onions:
+ metadata_onion = {}
+ metadata_onion['domain'] = onion
+ metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
+ metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
+ if get_onion_status(onion, metadata_onion['last_check']):
+ metadata_onion['status_text'] = 'UP'
+ metadata_onion['status_color'] = 'Green'
+ metadata_onion['status_icon'] = 'fa-check-circle'
+ else:
+ metadata_onion['status_text'] = 'DOWN'
+ metadata_onion['status_color'] = 'Red'
+ metadata_onion['status_icon'] = 'fa-times-circle'
+ list_onion.append(metadata_onion)
+
+ return render_template("hiddenServices.html", last_onions=list_onion)
+
+@hiddenServices.route("/hiddenServices/onion_domain", methods=['GET'])
+def onion_domain():
+ onion_domain = request.args.get('onion_domain')
+ if onion_domain is None or not r_serv_onion.exists('onion_metadata:{}'.format(onion_domain)):
+ pass
+ # # TODO: FIXME return 404
+
+ last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
+ first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
+ date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
+
+ return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen)
+
+# ============= JSON ==============
+@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
+def domain_crawled_7days_json():
+ type = 'onion'
+ ## TODO: # FIXME: 404 error
+
+ date_range = get_date_range(7)
+ json_domain_stats = []
+ #try:
+ for date in date_range:
+ nb_domain_up = r_serv_onion.scard('{}_up:{}'.format(type, date))
+ nb_domain_down = r_serv_onion.scard('{}_down:{}'.format(type, date))
+ date = date[0:4] + '-' + date[4:6] + '-' + date[6:8]
+ json_domain_stats.append({ 'date': date, 'value': int( nb_domain_up ), 'nb_domain_down': int( nb_domain_down )})
+ #except:
+ #return jsonify()
+
+ return jsonify(json_domain_stats)
+
+# ========= REGISTRATION =========
+app.register_blueprint(hiddenServices)
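
The domain_crawled_7days_json endpoint above feeds the dashboard graph with one entry per day, counting domains in the onion_up:<date> and onion_down:<date> sets. A standalone sketch of that aggregation, using the sample-config ARDB_Onion settings (port 6382, db 9):

    import datetime

    import redis

    r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

    def domain_stats_last_7_days(service_type='onion'):
        today = datetime.date.today()
        stats = []
        for i in range(6, -1, -1):                       # oldest day first, 7 days total
            day = (today - datetime.timedelta(days=i)).strftime("%Y%m%d")
            stats.append({
                'date': '{}-{}-{}'.format(day[0:4], day[4:6], day[6:8]),
                'value': r_serv_onion.scard('{}_up:{}'.format(service_type, day)),
                'nb_domain_down': r_serv_onion.scard('{}_down:{}'.format(service_type, day)),
            })
        return stats
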
diff --git a/var/www/modules/hiddenServices/templates/header_hiddenServices.html b/var/www/modules/hiddenServices/templates/header_hiddenServices.html
new file mode 100644
index 00000000..5c77963c
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/header_hiddenServices.html
@@ -0,0 +1 @@
+ hidden Services
diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html
new file mode 100644
index 00000000..bbc66ace
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/hiddenServices.html
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+ Hidden Service - AIL
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% include 'navbar.html' %}
+
+
+
+
+
+
+
+
ONION
+
+
+
+
+ Domain |
+ First Seen |
+ Last Check |
+ Status |
+
+
+
+ {% for metadata_onion in last_onions %}
+
+ {{ metadata_onion['domain'] }} |
+ {{'{}/{}/{}'.format(metadata_onion['first_seen'][0:4], metadata_onion['first_seen'][4:6], metadata_onion['first_seen'][6:8])}} |
+ {{'{}/{}/{}'.format(metadata_onion['last_check'][0:4], metadata_onion['last_check'][4:6], metadata_onion['last_check'][6:8])}} |
+
+
+ {{metadata_onion['status_text']}}
+
+ |
+
+ {% endfor %}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
new file mode 100644
index 00000000..18cd79be
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -0,0 +1,76 @@
+ Show Domain - AIL
+ {% include 'navbar.html' %}
+ Graph
+ Domain |
+ {{ domain }} |
+ First Seen |
+ {{ first_seen }} |
+ Last Check |
+ {{ last_check }} |
From 7e24943537ede802dbdbd887db2b32a30751f90a Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 24 Aug 2018 10:13:56 +0200
Subject: [PATCH 05/28] chg: [Crawler] crawler accept all kind of domains
---
bin/Crawler.py | 97 +++++++++++--------
bin/Onion.py | 14 ++-
bin/torcrawler/TorSplashCrawler.py | 39 +++++---
bin/torcrawler/tor_crawler.py | 19 ++--
.../hiddenServices/Flask_hiddenServices.py | 6 +-
.../hiddenServices/templates/showDomain.html | 6 ++
6 files changed, 112 insertions(+), 69 deletions(-)
diff --git a/bin/Crawler.py b/bin/Crawler.py
index 2e617959..240ae2a3 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -34,21 +34,21 @@ def crawl_onion(url, domain, date, date_month):
exit(0)
if r.status_code == 200:
- process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+ process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
stdout=subprocess.PIPE)
while process.poll() is None:
time.sleep(1)
if process.returncode == 0:
if r_serv_metadata.exists('paste_children:'+paste):
- msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+ msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
p.populate_set_out(msg, 'Tags')
print(process.stdout.read())
else:
- r_onion.sadd('onion_down:'+date , domain)
- r_onion.sadd('onion_down_link:'+date , url)
+ r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
+ r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
print(process.stdout.read())
else:
## FIXME: # TODO: relaunch docker
@@ -67,8 +67,28 @@ if __name__ == '__main__':
# Setup the I/O queues
p = Process(config_section)
- splash_url = p.config.get("Crawler", "splash_url")
- http_proxy = p.config.get("Crawler", "http_proxy")
+ url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+ re.compile(url_onion)
+ url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+ re.compile(url_i2p)
+
+ type_hidden_service = 'onion'
+ if type_hidden_service == 'onion':
+ regex_hidden_service = url_onion
+ splash_url = p.config.get("Crawler", "splash_url_onion")
+ http_proxy = p.config.get("Crawler", "http_proxy_onion")
+ elif type_hidden_service == 'i2p':
+ regex_hidden_service = url_i2p
+ splash_url = p.config.get("Crawler", "splash_url_i2p")
+ http_proxy = p.config.get("Crawler", "http_proxy_i2p")
+ elif type_hidden_service == 'regular':
+ regex_hidden_service = url_i2p
+ splash_url = p.config.get("Crawler", "splash_url_onion")
+ http_proxy = p.config.get("Crawler", "http_proxy_onion")
+ else:
+ print('incorrect crawler type: {}'.format(type_hidden_service))
+ exit(0)
+
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
#signal.signal(signal.SIGINT, signal_handler)
@@ -91,93 +111,94 @@ if __name__ == '__main__':
db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
- url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
- re.compile(url_regex)
-
while True:
- message = p.get_from_set()
# Recovering the streamed message informations.
- #message = r_onion.spop('mess_onion')
- print(message)
+ message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+ #message='https://www.myip.com/;/home/aurelien/git/python3/AIL-framework/PASTES/crawled/2018/08/10/onionsnjajzkhm5g.onion49eac19d-d71b-48b5-bc55-9a3c63e5b1e2'
+ # # FIXME: remove
if message is None:
print('get ardb message')
message = r_onion.spop('mess_onion')
+ print(message)
+
if message is not None:
splitted = message.split(';')
if len(splitted) == 2:
url, paste = splitted
+
if not '.onion' in url:
print('not onion')
continue
- url_list = re.findall(url_regex, url)[0]
+ url_list = re.findall(regex_hidden_service, url)[0]
if url_list[1] == '':
url= 'http://{}'.format(url)
link, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = url_list
domain = url_list[4]
+ r_onion.srem('onion_domain_crawler_queue', domain)
+ #domain = 'myip.com'
domain_url = 'http://{}'.format(domain)
- print('------------------START ONION CRAWLER------------------')
+ print('------------------START CRAWLER------------------')
+ print(type_hidden_service)
+ print('-------------------------------------------------')
print('url: {}'.format(url))
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
- '''if not r_onion.sismember('full_onion_up', domain):
- r_onion.sadd('mess_onion', message)
- print('added ..............')'''
-
-
- if not r_onion.sismember('banned_onion', domain):
+ if not r_onion.sismember('banned_{}'.format(type_hidden_service), domain):
date = datetime.datetime.now().strftime("%Y%m%d")
date_month = datetime.datetime.now().strftime("%Y%m")
- if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+ if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
crawl_onion(url, domain, date, date_month)
if url != domain_url:
crawl_onion(domain_url, domain, date, date_month)
# save down onion
- if not r_onion.sismember('onion_up:'+date , domain):
- r_onion.sadd('onion_down:'+date , domain)
- r_onion.sadd('onion_down_link:'+date , url)
- r_onion.hincrby('onion_link_down', url, 1)
- if not r_onion.exists('onion_metadata:{}'.format(domain)):
- r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
- r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
+ if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
+ r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
+ r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
+ r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
+ if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
+ r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
+ r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
else:
- r_onion.hincrby('onion_link_up', url, 1)
+ r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
# last check
- r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+ r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
# check external onions links (full_scrawl)
external_domains = set()
- for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
- external_domain = re.findall(url_regex, link)
+ for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
+ external_domain = re.findall(url_onion, link)
+ external_domain.extend(re.findall(url_i2p, link))
if len(external_domain) > 0:
external_domain = external_domain[0][4]
else:
continue
- # # TODO: add i2p
if '.onion' in external_domain and external_domain != domain:
external_domains.add(external_domain)
+ elif '.i2p' in external_domain and external_domain != domain:
+ external_domains.add(external_domain)
if len(external_domains) >= 10:
- r_onion.sadd('onion_potential_source', domain)
- r_onion.delete('domain_onion_external_links:{}'.format(domain))
- print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+ r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
+ r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
+ print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
- r_onion.lpush('last_onions', domain)
- r_onion.ltrim('last_onions', 0, 15)
+ r_onion.lpush('last_{}'.format(type_hidden_service), domain)
+ r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
else:
continue
diff --git a/bin/Onion.py b/bin/Onion.py
index 23a81755..d77c010f 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -150,9 +150,12 @@ if __name__ == "__main__":
if '.i2p' in url:
print('add i2p')
print(domain)
- if not r_onion.sismember('i2p_domain', domain):
+ if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain):
r_onion.sadd('i2p_domain', domain)
r_onion.sadd('i2p_link', url)
+ r_onion.sadd('i2p_domain_crawler_queue', domain)
+ msg = '{};{}'.format(url,PST.p_path)
+ r_onion.sadd('i2p_crawler_queue', msg)
# Saving the list of extracted onion domains.
PST.__setattr__(channel, domains_list)
@@ -193,9 +196,12 @@ if __name__ == "__main__":
continue
if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
- msg = '{};{}'.format(url,PST.p_path)
- print('send to crawler')
- p.populate_set_out(msg, 'Crawler')
+ if not r_onion.sismember('onion_domain_crawler_queue', domain):
+ print('send to onion crawler')
+ r_onion.sadd('onion_domain_crawler_queue', domain)
+ msg = '{};{}'.format(url,PST.p_path)
+ r_onion.sadd('onion_crawler_queue', msg)
+ #p.populate_set_out(msg, 'Crawler')
else:
publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
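With this change Onion.py no longer hands URLs to the crawler through the module queue; it pushes 'url;paste_path' entries into the <type>_crawler_queue set in ARDB, which Crawler.py pops. A hypothetical helper (not in the patch) for enqueuing a domain by hand would look like the sketch below; connection details are placeholders.

    # Hypothetical test helper: enqueue a URL the same way Onion.py does,
    # so Crawler.py picks it up from onion_crawler_queue.
    import redis

    r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

    def enqueue_onion(url, paste_path, domain):
        if not r_onion.sismember('onion_domain_crawler_queue', domain):
            r_onion.sadd('onion_domain_crawler_queue', domain)
            r_onion.sadd('onion_crawler_queue', '{};{}'.format(url, paste_path))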
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index c5280329..135ad0a7 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -9,6 +9,7 @@ import uuid
import datetime
import base64
import redis
+import json
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
@@ -30,7 +31,6 @@ class TorSplashCrawler():
self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
'SPLASH_URL': splash_url,
- 'HTTP_PROXY': http_proxy,
'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
@@ -41,14 +41,15 @@ class TorSplashCrawler():
'DEPTH_LIMIT': crawler_depth_limit
})
- def crawl(self, url, domain, original_paste, super_father):
- self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+ def crawl(self, type, url, domain, original_paste, super_father):
+ self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
- def __init__(self, url, domain,original_paste, super_father, *args, **kwargs):
+ def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
+ self.type = type
self.original_paste = original_paste
self.super_father = super_father
self.start_urls = url
@@ -100,12 +101,13 @@ class TorSplashCrawler():
args={ 'html': 1,
'wait': 10,
'render_all': 1,
+ 'har': 1,
'png': 1}
)
def parse(self,response):
- print(response.headers)
- print(response.status)
+ #print(response.headers)
+ #print(response.status)
# # TODO: # FIXME:
self.r_cache.setbit(response.url, 0, 1)
@@ -119,17 +121,18 @@ class TorSplashCrawler():
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
- self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
- self.r_serv_onion.sadd('full_onion_up', self.domains[0])
- self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0])
+ self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
+ self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
+ self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
# create onion metadata
- if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
- self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
- self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
+ if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
+ self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
+ self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
+ self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)
# add onion screenshot history
- self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
+ self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -144,17 +147,20 @@ class TorSplashCrawler():
os.makedirs(dirname)
size_screenshot = (len(response.data['png'])*3) /4
- print(size_screenshot)
if size_screenshot < 5000000: #bytes
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))
+ #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
+ with open(filename_screenshot+'har.txt', 'wb') as f:
+ f.write(json.dumps(response.data['har']).encode())
+
# save external links in set
lext = LinkExtractor(deny_domains=self.domains, unique=True)
for link in lext.extract_links(response):
- self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
- self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
+ self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+ self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
#le = LinkExtractor(unique=True)
le = LinkExtractor(allow_domains=self.domains, unique=True)
@@ -169,6 +175,7 @@ class TorSplashCrawler():
args={ 'html': 1,
'png': 1,
'render_all': 1,
+ 'har': 1,
'wait': 10}
#errback=self.errback_catcher
)
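The screenshot guard above estimates the decoded size from the base64 payload (every 4 base64 characters encode 3 bytes) before writing the PNG and the new HAR dump to disk. A small illustration of that estimate, using a placeholder payload:

    # Illustration of the size check used above; the payload is a placeholder.
    import base64

    png_b64 = 'iVBORw0KGgoAAAANSUhEUg=='           # placeholder base64 string
    approx_size = (len(png_b64) * 3) / 4            # estimate used by the crawler (bytes)
    exact_size = len(base64.standard_b64decode(png_b64))
    print(approx_size, exact_size)                  # estimate is exact up to padding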
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 57a77e76..99eb18c8 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler
if __name__ == '__main__':
- if len(sys.argv) != 5:
- print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father')
+ if len(sys.argv) != 8:
+ print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father')
exit(1)
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -21,14 +21,15 @@ if __name__ == '__main__':
cfg = configparser.ConfigParser()
cfg.read(configfile)
- splash_url = cfg.get("Crawler", "splash_url")
- http_proxy = cfg.get("Crawler", "http_proxy")
+ splash_url = sys.argv[1]
+ http_proxy = sys.argv[2]
+ type = sys.argv[3]
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
- url = sys.argv[1]
- domain = sys.argv[2]
- paste = sys.argv[3]
- super_father = sys.argv[4]
+ url = sys.argv[4]
+ domain = sys.argv[5]
+ paste = sys.argv[6]
+ super_father = sys.argv[7]
crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
- crawler.crawl(url, domain, paste, super_father)
+ crawler.crawl(type, url, domain, paste, super_father)
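tor_crawler.py is now launched by Crawler.py with the Splash endpoint, proxy and hidden-service type passed on the command line. For illustration only, a hand-run invocation mirroring the subprocess call in Crawler.py could look like this; the URL, domain and paths are placeholders, and the Splash/proxy addresses are the defaults from config.cfg.sample.

    # Illustration only: how Crawler.py invokes the updated script.
    import subprocess

    subprocess.call(["python", './torcrawler/tor_crawler.py',
                     'http://127.0.0.1:8050',            # splash_url
                     'http://127.0.0.1:9050',            # http_proxy
                     'onion',                            # type
                     'http://example2345example.onion',  # url (placeholder)
                     'example2345example.onion',         # domain (placeholder)
                     '/path/to/paste',                   # paste (placeholder)
                     '/path/to/paste'])                  # super_father (placeholder)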
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 04740a93..6d01bbbb 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -43,7 +43,7 @@ def get_onion_status(domain, date):
@hiddenServices.route("/hiddenServices/", methods=['GET'])
def hiddenServices_page():
- last_onions = r_serv_onion.lrange('last_onions', 0 ,-1)
+ last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
list_onion = []
for onion in last_onions:
@@ -72,9 +72,11 @@ def onion_domain():
last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
+ domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
- return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen)
+ return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
+ domain_paste=domain_paste)
# ============= JSON ==============
@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index 18cd79be..88942c73 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -49,6 +49,12 @@
Last Check |
{{ last_check }} |
+
+ Origin Paste |
+
+ {{ domain_paste }}
+ |
+
From ced0b1e350e85228a5a6ec9a645047d98f5d14e1 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 24 Aug 2018 10:24:03 +0200
Subject: [PATCH 06/28] chg: [I2P] add default config
---
bin/Crawler.py | 6 ------
bin/packages/config.cfg.sample | 6 ++++--
2 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/bin/Crawler.py b/bin/Crawler.py
index 240ae2a3..ab74c64b 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -115,22 +115,17 @@ if __name__ == '__main__':
# Recovering the streamed message informations.
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
- #message='https://www.myip.com/;/home/aurelien/git/python3/AIL-framework/PASTES/crawled/2018/08/10/onionsnjajzkhm5g.onion49eac19d-d71b-48b5-bc55-9a3c63e5b1e2'
# # FIXME: remove
if message is None:
print('get ardb message')
message = r_onion.spop('mess_onion')
- print(message)
-
if message is not None:
splitted = message.split(';')
if len(splitted) == 2:
url, paste = splitted
-
-
if not '.onion' in url:
print('not onion')
continue
@@ -143,7 +138,6 @@ if __name__ == '__main__':
resource_path, query_string, f1, f2, f3, f4 = url_list
domain = url_list[4]
r_onion.srem('onion_domain_crawler_queue', domain)
- #domain = 'myip.com'
domain_url = 'http://{}'.format(domain)
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 62ea0887..2ca156d4 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -225,5 +225,7 @@ db = 0
[Crawler]
crawler_depth_limit = 1
-splash_url = http://127.0.0.1:8050
-http_proxy = http://127.0.0.1:9050
+splash_url_onion = http://127.0.0.1:8050
+splash_url_i2p = http://127.0.0.1:8050
+http_proxy_onion = http://127.0.0.1:9050
+http_proxy_i2p = http://127.0.0.1:9050
From d42dd118a4572d107d8c8e09b5be11ac1f417b53 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 27 Aug 2018 11:02:39 +0200
Subject: [PATCH 07/28] chg: [Domain crawled] add random screenshot
---
bin/packages/HiddenServices.py | 67 ++++++++++++++++---
.../hiddenServices/Flask_hiddenServices.py | 10 ++-
.../hiddenServices/templates/showDomain.html | 2 +-
3 files changed, 68 insertions(+), 11 deletions(-)
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 48f514fc..9f4e9302 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -19,6 +19,7 @@ Conditions to fulfill to be able to use this class correctly:
import os
import gzip
import redis
+import random
import configparser
import sys
@@ -52,11 +53,19 @@ class HiddenServices(object):
db=cfg.getint("ARDB_Onion", "db"),
decode_responses=True)
+ self.r_serv_metadata = redis.StrictRedis(
+ host=cfg.get("ARDB_Metadata", "host"),
+ port=cfg.getint("ARDB_Metadata", "port"),
+ db=cfg.getint("ARDB_Metadata", "db"),
+ decode_responses=True)
+
self.domain = domain
self.type = type
if type == 'onion':
- self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"), cfg.get("Directories", "crawled"))
+ self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
+ self.paste_crawled_directory = os.path.join(self.paste_directory, cfg.get("Directories", "crawled"))
+ self.paste_crawled_directory_name = cfg.get("Directories", "crawled")
self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
elif type == 'i2p':
self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
@@ -65,15 +74,57 @@ class HiddenServices(object):
## TODO: # FIXME: add error
pass
-
+ #todo use the right paste
def get_last_crawled_pastes(self):
+ paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
+ #paste_parent = paste_parent.replace(self.paste_directory, '')[1:]
+ return self.get_all_pastes_domain(paste_parent)
- last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check')
- return self.get_crawled_pastes_by_date(last_check)
+ def get_all_pastes_domain(self, father):
+ l_crawled_pastes = []
+ paste_parent = father.replace(self.paste_directory, '')[1:]
+ paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
+ ## TODO: # FIXME: remove me
+ if not paste_childrens:
+ paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
+ for children in paste_childrens:
+ if self.domain in children:
+ l_crawled_pastes.append(children)
+ l_crawled_pastes.extend(self.get_all_pastes_domain(children))
+ return l_crawled_pastes
+
+ def get_domain_random_screenshot(self, l_crawled_pastes, num_screenshot = 1):
+ l_screenshot_paste = []
+ for paste in l_crawled_pastes:
+ ## FIXME: # TODO: remove me
+ paste= paste.replace(self.paste_directory, '')[1:]
+
+ paste = paste.replace(self.paste_crawled_directory_name, '')
+ if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
+ l_screenshot_paste.append(paste[1:])
+
+ if len(l_screenshot_paste) > num_screenshot:
+ l_random_screenshot = []
+ for index in random.sample( range(0, len(l_screenshot_paste)), num_screenshot ):
+ l_random_screenshot.append(l_screenshot_paste[index])
+ return l_random_screenshot
+ else:
+ return l_screenshot_paste
def get_crawled_pastes_by_date(self, date):
- pastes_path = os.path.join(self.paste_directory, date[0:4], date[4:6], date[6:8])
- l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f]
- print(len(l_crawled_pastes))
- print(l_crawled_pastes)
+
+ pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
+ paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check')
+
+ l_crawled_pastes = []
+ return l_crawled_pastes
+
+ def get_last_crawled_pastes_fileSearch(self):
+
+ last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check')
+ return self.get_crawled_pastes_by_date_fileSearch(last_check)
+
+ def get_crawled_pastes_by_date_fileSearch(self, date):
+ pastes_path = os.path.join(self.paste_crawled_directory, date[0:4], date[4:6], date[6:8])
+ l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f]
return l_crawled_pastes
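The Flask view below uses these helpers to pick a random screenshot for a crawled domain. A usage sketch, assuming the AIL environment variables and ARDB are already configured; the domain name is a placeholder.

    # Usage sketch of the HiddenServices helpers introduced above.
    from HiddenServices import HiddenServices

    h = HiddenServices('example2345example.onion', 'onion')
    l_pastes = h.get_last_crawled_pastes()
    screenshots = h.get_domain_random_screenshot(l_pastes, num_screenshot=1)
    if screenshots:
        print(screenshots[0])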
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 6d01bbbb..7969aae8 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -6,10 +6,12 @@
'''
import redis
import datetime
+import sys
+import os
from flask import Flask, render_template, jsonify, request, Blueprint
-import HiddenServices
from Date import Date
+from HiddenServices import HiddenServices
# ============ VARIABLES ============
import Flask_config
@@ -75,8 +77,12 @@ def onion_domain():
domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
+ h = HiddenServices(onion_domain, 'onion')
+ l_pastes = h.get_last_crawled_pastes()
+ screenshot = h.get_domain_random_screenshot(l_pastes)[0]
+
return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
- domain_paste=domain_paste)
+ domain_paste=domain_paste, screenshot=screenshot)
# ============= JSON ==============
@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index 88942c73..3f5b8736 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -62,7 +62,7 @@
-
+
From 40772a5732002081ea7b19cef2447b008a8e4675 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 27 Aug 2018 11:30:19 +0200
Subject: [PATCH 08/28] fix: merge
---
var/www/modules/showpaste/Flask_showpaste.py | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py
index 39e2283e..40240591 100644
--- a/var/www/modules/showpaste/Flask_showpaste.py
+++ b/var/www/modules/showpaste/Flask_showpaste.py
@@ -9,11 +9,7 @@ import os
import json
import os
import flask
-<<<<<<< HEAD
-from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory
-=======
-from flask import Flask, render_template, jsonify, request, Blueprint, make_response, redirect, url_for, Response, send_from_directory
->>>>>>> master
+from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory, redirect, url_for
import difflib
import ssdeep
From ca982e13e1b2adf21e4814efcdcbfca1c89ba2a0 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 27 Aug 2018 14:34:08 +0200
Subject: [PATCH 09/28] chg: [Crawled Domain] show crawled pastes by domain
---
bin/packages/HiddenServices.py | 4 +-
var/www/modules/Flask_config.py | 1 +
.../hiddenServices/Flask_hiddenServices.py | 32 +++++++++++-
.../hiddenServices/templates/showDomain.html | 51 +++++++++++++++++--
4 files changed, 80 insertions(+), 8 deletions(-)
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 9f4e9302..5143553b 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -85,8 +85,8 @@ class HiddenServices(object):
paste_parent = father.replace(self.paste_directory, '')[1:]
paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
## TODO: # FIXME: remove me
- if not paste_childrens:
- paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
+ paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
+ paste_childrens = paste_childrens | paste_children
for children in paste_childrens:
if self.domain in children:
l_crawled_pastes.append(children)
diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py
index 34e630f2..07a6a3f0 100644
--- a/var/www/modules/Flask_config.py
+++ b/var/www/modules/Flask_config.py
@@ -150,6 +150,7 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')
+PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs"))
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 7969aae8..2c0c7e4a 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -19,6 +19,9 @@ import Flask_config
app = Flask_config.app
cfg = Flask_config.cfg
r_serv_onion = Flask_config.r_serv_onion
+r_serv_metadata = Flask_config.r_serv_metadata
+bootstrap_label = Flask_config.bootstrap_label
+PASTES_FOLDER = Flask_config.PASTES_FOLDER
hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates')
@@ -79,9 +82,36 @@ def onion_domain():
h = HiddenServices(onion_domain, 'onion')
l_pastes = h.get_last_crawled_pastes()
- screenshot = h.get_domain_random_screenshot(l_pastes)[0]
+ screenshot = h.get_domain_random_screenshot(l_pastes)
+ if screenshot:
+ screenshot = screenshot[0]
+ else:
+ screenshot = 'None'
+
+ paste_tags = []
+ path_name = []
+ for path in l_pastes:
+ path_name.append(path.replace(PASTES_FOLDER, ''))
+ p_tags = r_serv_metadata.smembers('tag:'+path)
+ l_tags = []
+ for tag in p_tags:
+ complete_tag = tag
+ tag = tag.split('=')
+ if len(tag) > 1:
+ if tag[1] != '':
+ tag = tag[1][1:-1]
+ # no value
+ else:
+ tag = tag[0][1:-1]
+ # use for custom tags
+ else:
+ tag = tag[0]
+ l_tags.append( (tag, complete_tag) )
+ paste_tags.append(l_tags)
return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
+ l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label,
+ path_name=path_name,
domain_paste=domain_paste, screenshot=screenshot)
# ============= JSON ==============
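The tag loop added above turns a machine tag such as infoleak:automatic-detection="onion" into a short display label plus the complete tag. A standalone sketch of that unpacking, equivalent to the inline loop:

    # Sketch of the tag unpacking performed in onion_domain() above.
    def unpack_tag(tag):
        complete_tag = tag
        parts = tag.split('=')
        if len(parts) > 1:
            if parts[1] != '':
                short = parts[1][1:-1]     # strip the surrounding quotes
            else:                          # tag ends with '=' but has no value
                short = parts[0][1:-1]
        else:                              # custom tag without '='
            short = parts[0]
        return (short, complete_tag)

    print(unpack_tag('infoleak:automatic-detection="onion"'))
    # ('onion', 'infoleak:automatic-detection="onion"')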
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index 3f5b8736..29aa821c 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -15,9 +15,15 @@
-
-
-
+
+
+
+
@@ -28,8 +34,9 @@
-
+
+
Graph
@@ -57,11 +64,38 @@
+
+
+
+
-
+
@@ -74,6 +108,13 @@
$(document).ready(function(){
activePage = "page-hiddenServices"
$("#"+activePage).addClass("active");
+ table = $('#myTable_').DataTable(
+ {
+ "aLengthMenu": [[5, 10, 15, 20, -1], [5, 10, 15, 20, "All"]],
+ "iDisplayLength": 5,
+ "order": [[ 0, "desc" ]]
+ }
+ );
});
From 6f0817365acc0891537c80355f250d9b5d28a9c8 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 12 Sep 2018 09:55:49 +0200
Subject: [PATCH 10/28] chg: [Crawler UI] display domain information
---
.gitignore | 3 +-
bin/Bitcoin.py | 1 +
bin/Crawler.py | 42 +++++++++-
bin/packages/HiddenServices.py | 22 ++++-
bin/torcrawler/TorSplashCrawler.py | 18 +++--
pip3_packages_requirement.txt | 3 +
.../hiddenServices/Flask_hiddenServices.py | 55 ++++++++-----
.../hiddenServices/templates/showDomain.html | 80 ++++++++++++-------
8 files changed, 164 insertions(+), 60 deletions(-)
diff --git a/.gitignore b/.gitignore
index b5755ee6..6973080f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,9 +11,10 @@ ardb
faup
tlsh
Blooms
-LEVEL_DB_DATA
PASTES
+CRAWLED_SCREENSHOT
BASE64
+HASHS
DATA_ARDB
indexdir/
logs/
diff --git a/bin/Bitcoin.py b/bin/Bitcoin.py
index 5ec2199f..1b7694b7 100755
--- a/bin/Bitcoin.py
+++ b/bin/Bitcoin.py
@@ -32,6 +32,7 @@ def decode_base58(bc, length):
for char in bc:
n = n * 58 + digits58.index(char)
return n.to_bytes(length, 'big')
+
def check_bc(bc):
try:
bcbytes = decode_base58(bc, 25)
diff --git a/bin/Crawler.py b/bin/Crawler.py
index ab74c64b..3660aa41 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -57,6 +57,12 @@ def crawl_onion(url, domain, date, date_month):
if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
+ exit(1)
+
+ type_hidden_service = sys.argv[1]
+
publisher.port = 6380
publisher.channel = "Script"
@@ -72,7 +78,6 @@ if __name__ == '__main__':
url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(url_i2p)
- type_hidden_service = 'onion'
if type_hidden_service == 'onion':
regex_hidden_service = url_onion
splash_url = p.config.get("Crawler", "splash_url_onion")
@@ -89,8 +94,12 @@ if __name__ == '__main__':
print('incorrect crawler type: {}'.format(type_hidden_service))
exit(0)
+ print(type_hidden_service)
+
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+ PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
+
#signal.signal(signal.SIGINT, signal_handler)
r_serv_metadata = redis.StrictRedis(
@@ -113,8 +122,10 @@ if __name__ == '__main__':
while True:
- # Recovering the streamed message informations.
+ # Recovering the streamed message informations. http://eepsites.i2p
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+ #message = 'http://i2pwiki.i2p;test'
+ #message = 'http://i2host.i2p;test'
# # FIXME: remove
if message is None:
@@ -122,13 +133,19 @@ if __name__ == '__main__':
message = r_onion.spop('mess_onion')
if message is not None:
+ print(message)
splitted = message.split(';')
if len(splitted) == 2:
url, paste = splitted
+ paste = paste.replace(PASTES_FOLDER+'/', '')
+ print(paste)
+ '''
if not '.onion' in url:
print('not onion')
continue
+ '''
+
url_list = re.findall(regex_hidden_service, url)[0]
if url_list[1] == '':
@@ -137,7 +154,7 @@ if __name__ == '__main__':
link, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = url_list
domain = url_list[4]
- r_onion.srem('onion_domain_crawler_queue', domain)
+ r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
domain_url = 'http://{}'.format(domain)
@@ -157,6 +174,8 @@ if __name__ == '__main__':
crawl_onion(url, domain, date, date_month)
if url != domain_url:
+ print(url)
+ print(domain_url)
crawl_onion(domain_url, domain, date, date_month)
# save down onion
@@ -173,6 +192,17 @@ if __name__ == '__main__':
# last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
+ # last_father
+ r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
+
+ # add onion screenshot history
+ # add crawled days
+ if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
+ r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
+ # add crawled history by date
+ r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
+
+
# check external onions links (full_scrawl)
external_domains = set()
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
@@ -194,6 +224,12 @@ if __name__ == '__main__':
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
+ #send all crawled domain past
+ msg = domain
+ p.populate_set_out(msg, 'DomainSubject')
+
+ #time.sleep(30)
+
else:
continue
else:
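Crawl history is now kept as two ARDB lists per domain: <type>_history:<domain> holds the crawl dates, and <type>_history:<domain>:<date> the pastes captured on that date. A read-side sketch (connection details and domain are placeholders):

    # Sketch only: walk the crawl history written above.
    import redis

    r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)
    domain = 'example2345example.onion'    # placeholder

    for date in r_onion.lrange('onion_history:{}'.format(domain), 0, -1):
        pastes = r_onion.lrange('onion_history:{}:{}'.format(domain, date), 0, -1)
        print(date, len(pastes), 'paste(s)')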
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 5143553b..ca07bfd2 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -61,6 +61,7 @@ class HiddenServices(object):
self.domain = domain
self.type = type
+ self.tags = {}
if type == 'onion':
self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
@@ -74,6 +75,20 @@ class HiddenServices(object):
## TODO: # FIXME: add error
pass
+ def get_origin_paste_name(self):
+ origin_paste = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
+ if origin_paste is None:
+ return ''
+ return origin_paste.replace(self.paste_directory+'/', '')
+
+ def get_domain_tags(self):
+ return self.tags
+
+ def update_domain_tags(self, children):
+ p_tags = self.r_serv_metadata.smembers('tag:'+children)
+ for tag in p_tags:
+ self.tags[tag] = self.tags.get(tag, 0) + 1
+
#todo use the right paste
def get_last_crawled_pastes(self):
paste_parent = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
@@ -81,8 +96,10 @@ class HiddenServices(object):
return self.get_all_pastes_domain(paste_parent)
def get_all_pastes_domain(self, father):
+ if father is None:
+ return []
l_crawled_pastes = []
- paste_parent = father.replace(self.paste_directory, '')[1:]
+ paste_parent = father.replace(self.paste_directory+'/', '')
paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(paste_parent))
## TODO: # FIXME: remove me
paste_children = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
@@ -90,6 +107,7 @@ class HiddenServices(object):
for children in paste_childrens:
if self.domain in children:
l_crawled_pastes.append(children)
+ self.update_domain_tags(children)
l_crawled_pastes.extend(self.get_all_pastes_domain(children))
return l_crawled_pastes
@@ -97,7 +115,7 @@ class HiddenServices(object):
l_screenshot_paste = []
for paste in l_crawled_pastes:
## FIXME: # TODO: remove me
- paste= paste.replace(self.paste_directory, '')[1:]
+ paste= paste.replace(self.paste_directory+'/', '')
paste = paste.replace(self.paste_crawled_directory_name, '')
if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 135ad0a7..ffbc5da9 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -96,6 +96,7 @@ class TorSplashCrawler():
yield SplashRequest(
self.start_urls,
self.parse,
+ errback=self.errback_catcher,
endpoint='render.json',
meta={'father': self.original_paste},
args={ 'html': 1,
@@ -121,6 +122,9 @@ class TorSplashCrawler():
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
+ # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
+ self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+
self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
@@ -129,10 +133,6 @@ class TorSplashCrawler():
if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
- self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)
-
- # add onion screenshot history
- self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -170,6 +170,7 @@ class TorSplashCrawler():
yield SplashRequest(
link.url,
self.parse,
+ errback=self.errback_catcher,
endpoint='render.json',
meta={'father': relative_filename_paste},
args={ 'html': 1,
@@ -179,10 +180,13 @@ class TorSplashCrawler():
'wait': 10}
#errback=self.errback_catcher
)
- '''
+
def errback_catcher(self, failure):
# catch all errback failures,
self.logger.error(repr(failure))
+ print('failure')
+ print(failure)
+ print(failure.request.meta['item'])
#if isinstance(failure.value, HttpError):
if failure.check(HttpError):
@@ -196,14 +200,16 @@ class TorSplashCrawler():
# this is the original request
request = failure.request
print(DNSLookupError)
+ print('DNSLookupError')
self.logger.error('DNSLookupError on %s', request.url)
#elif isinstance(failure.value, TimeoutError):
elif failure.check(TimeoutError):
request = failure.request
+ print('TimeoutError')
print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url)
- '''
+
def save_crawled_paste(self, filename, content):
diff --git a/pip3_packages_requirement.txt b/pip3_packages_requirement.txt
index 53ec97e7..ddf60626 100644
--- a/pip3_packages_requirement.txt
+++ b/pip3_packages_requirement.txt
@@ -58,6 +58,9 @@ pycountry
# To fetch Onion urls
PySocks
+#extract subject
+newspaper3k
+
# decompress files
sflock
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 2c0c7e4a..5e63374b 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -39,6 +39,23 @@ def get_date_range(num_day):
return list(reversed(date_list))
+def unpack_paste_tags(p_tags):
+ l_tags = []
+ for tag in p_tags:
+ complete_tag = tag
+ tag = tag.split('=')
+ if len(tag) > 1:
+ if tag[1] != '':
+ tag = tag[1][1:-1]
+ # no value
+ else:
+ tag = tag[0][1:-1]
+ # use for custom tags
+ else:
+ tag = tag[0]
+ l_tags.append( (tag, complete_tag) )
+ return l_tags
+
def get_onion_status(domain, date):
if r_serv_onion.sismember('onion_up:'+date , domain):
return True
@@ -76,43 +93,39 @@ def onion_domain():
# # TODO: FIXME return 404
last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
+ last_check = '{}/{}/{}'.format(last_check[0:4], last_check[4:6], last_check[6:8])
first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
- domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
- date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
+ first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
+ origin_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
h = HiddenServices(onion_domain, 'onion')
l_pastes = h.get_last_crawled_pastes()
+ if l_pastes:
+ status = True
+ else:
+ status = False
screenshot = h.get_domain_random_screenshot(l_pastes)
if screenshot:
screenshot = screenshot[0]
else:
screenshot = 'None'
+ domain_tags = h.get_domain_tags()
+
+ origin_paste_name = h.get_origin_paste_name()
+ origin_paste_tags = unpack_paste_tags(r_serv_metadata.smembers('tag:{}'.format(origin_paste)))
paste_tags = []
path_name = []
for path in l_pastes:
- path_name.append(path.replace(PASTES_FOLDER, ''))
+ path_name.append(path.replace(PASTES_FOLDER+'/', ''))
p_tags = r_serv_metadata.smembers('tag:'+path)
- l_tags = []
- for tag in p_tags:
- complete_tag = tag
- tag = tag.split('=')
- if len(tag) > 1:
- if tag[1] != '':
- tag = tag[1][1:-1]
- # no value
- else:
- tag = tag[0][1:-1]
- # use for custom tags
- else:
- tag = tag[0]
- l_tags.append( (tag, complete_tag) )
- paste_tags.append(l_tags)
+ paste_tags.append(unpack_paste_tags(p_tags))
return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
- l_pastes=l_pastes, paste_tags=paste_tags, l_tags=l_tags, bootstrap_label=bootstrap_label,
- path_name=path_name,
- domain_paste=domain_paste, screenshot=screenshot)
+ l_pastes=l_pastes, paste_tags=paste_tags, bootstrap_label=bootstrap_label,
+ path_name=path_name, origin_paste_tags=origin_paste_tags, status=status,
+ origin_paste=origin_paste, origin_paste_name=origin_paste_name,
+ domain_tags=domain_tags, screenshot=screenshot)
# ============= JSON ==============
@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index 29aa821c..b89388aa 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -36,35 +36,61 @@
-
-
+
- Graph
-
+ {% if status %}
+
+
+ UP
+
+ {% else %}
+
+
+ DOWN
+
+ {% endif %}
+
{{ domain }} :
+
+ -
-
-
-
- Domain |
- {{ domain }} |
-
-
- First Seen |
- {{ first_seen }} |
-
-
- Last Check |
- {{ last_check }} |
-
-
- Origin Paste |
-
- {{ domain_paste }}
- |
-
-
-
-
+
+
+
+
+ Origin Paste: {{ origin_paste_name }}
+
+
+
+
+
+
From 0c63f2f24f399a2337dc3e3389128646744e2c91 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 17 Sep 2018 15:35:06 +0200
Subject: [PATCH 11/28] chg: [Crawler] catch server response
---
bin/torcrawler/TorSplashCrawler.py | 130 +++++++++---------
var/www/modules/showpaste/Flask_showpaste.py | 10 ++
.../showpaste/templates/show_saved_paste.html | 6 +-
3 files changed, 83 insertions(+), 63 deletions(-)
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index ffbc5da9..6673436b 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -38,6 +38,7 @@ class TorSplashCrawler():
},
'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
+ 'HTTPERROR_ALLOW_ALL': True,
'DEPTH_LIMIT': crawler_depth_limit
})
@@ -96,7 +97,7 @@ class TorSplashCrawler():
yield SplashRequest(
self.start_urls,
self.parse,
- errback=self.errback_catcher,
+ #errback=self.errback_catcher,
endpoint='render.json',
meta={'father': self.original_paste},
args={ 'html': 1,
@@ -109,84 +110,89 @@ class TorSplashCrawler():
def parse(self,response):
#print(response.headers)
#print(response.status)
+ print(' | ')
+ if response.status == 504:
+ # down ?
+ print('504 detected')
+ #elif response.status in in range(400, 600):
+ elif response.status != 200:
+ print('other: {}'.format(response.status))
+ else:
- # # TODO: # FIXME:
- self.r_cache.setbit(response.url, 0, 1)
- self.r_cache.expire(response.url, 360000)
+ UUID = self.domains[0]+str(uuid.uuid4())
+ filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+ relative_filename_paste = os.path.join(self.crawler_path, UUID)
+ filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
- UUID = self.domains[0]+str(uuid.uuid4())
- filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
- relative_filename_paste = os.path.join(self.crawler_path, UUID)
- filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+ # save new paste on disk
+ if self.save_crawled_paste(filename_paste, response.data['html']):
- # save new paste on disk
- if self.save_crawled_paste(filename_paste, response.data['html']):
+ # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
+ self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
- # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
- self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+ self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
+ self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
+ self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
- self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
- self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
- self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
+ # create onion metadata
+ if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
+ self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
+ self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
- # create onion metadata
- if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
- self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
- self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
+ #create paste metadata
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
+ self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
- #create paste metadata
- self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
- self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
- self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
- self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+ self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
- self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
+ dirname = os.path.dirname(filename_screenshot)
+ if not os.path.exists(dirname):
+ os.makedirs(dirname)
- dirname = os.path.dirname(filename_screenshot)
- if not os.path.exists(dirname):
- os.makedirs(dirname)
+ size_screenshot = (len(response.data['png'])*3) /4
- size_screenshot = (len(response.data['png'])*3) /4
+ if size_screenshot < 5000000: #bytes
+ with open(filename_screenshot, 'wb') as f:
+ f.write(base64.standard_b64decode(response.data['png'].encode()))
- if size_screenshot < 5000000: #bytes
- with open(filename_screenshot, 'wb') as f:
- f.write(base64.standard_b64decode(response.data['png'].encode()))
+ #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
+ with open(filename_screenshot+'har.txt', 'wb') as f:
+ f.write(json.dumps(response.data['har']).encode())
- #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
- with open(filename_screenshot+'har.txt', 'wb') as f:
- f.write(json.dumps(response.data['har']).encode())
+ # save external links in set
+ lext = LinkExtractor(deny_domains=self.domains, unique=True)
+ for link in lext.extract_links(response):
+ self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+ self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
- # save external links in set
- lext = LinkExtractor(deny_domains=self.domains, unique=True)
- for link in lext.extract_links(response):
- self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
- self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
-
- #le = LinkExtractor(unique=True)
- le = LinkExtractor(allow_domains=self.domains, unique=True)
- for link in le.extract_links(response):
- self.r_cache.setbit(link, 0, 0)
- self.r_cache.expire(link, 360000)
- yield SplashRequest(
- link.url,
- self.parse,
- errback=self.errback_catcher,
- endpoint='render.json',
- meta={'father': relative_filename_paste},
- args={ 'html': 1,
- 'png': 1,
- 'render_all': 1,
- 'har': 1,
- 'wait': 10}
- #errback=self.errback_catcher
- )
+ #le = LinkExtractor(unique=True)
+ le = LinkExtractor(allow_domains=self.domains, unique=True)
+ for link in le.extract_links(response):
+ self.r_cache.setbit(link, 0, 0)
+ self.r_cache.expire(link, 360000)
+ yield SplashRequest(
+ link.url,
+ self.parse,
+ #errback=self.errback_catcher,
+ endpoint='render.json',
+ meta={'father': relative_filename_paste},
+ args={ 'html': 1,
+ 'png': 1,
+ 'render_all': 1,
+ 'har': 1,
+ 'wait': 10}
+ )
+ '''
def errback_catcher(self, failure):
# catch all errback failures,
self.logger.error(repr(failure))
print('failure')
- print(failure)
- print(failure.request.meta['item'])
+ #print(failure)
+ print(failure.type)
+ #print(failure.request.meta['item'])
#if isinstance(failure.value, HttpError):
if failure.check(HttpError):
@@ -209,7 +215,7 @@ class TorSplashCrawler():
print('TimeoutError')
print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url)
-
+ '''
def save_crawled_paste(self, filename, content):
diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py
index 40240591..e2780e2a 100644
--- a/var/www/modules/showpaste/Flask_showpaste.py
+++ b/var/www/modules/showpaste/Flask_showpaste.py
@@ -33,6 +33,7 @@ bootstrap_label = Flask_config.bootstrap_label
misp_event_url = Flask_config.misp_event_url
hive_case_url = Flask_config.hive_case_url
vt_enabled = Flask_config.vt_enabled
+PASTES_FOLDER = Flask_config.PASTES_FOLDER
SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER
showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
@@ -40,6 +41,14 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa
# ============ FUNCTIONS ============
def showpaste(content_range, requested_path):
+ if PASTES_FOLDER not in requested_path:
+ requested_path = os.path.join(PASTES_FOLDER, requested_path)
+ # remove old full path
+ #requested_path = requested_path.replace(PASTES_FOLDER, '')
+ # prevent directory traversal
+ if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER:
+ return 'path traversal detected'
+
vt_enabled = Flask_config.vt_enabled
paste = Paste.Paste(requested_path)
@@ -173,6 +182,7 @@ def showpaste(content_range, requested_path):
crawler_metadata = {}
if 'infoleak:submission="crawler"' in l_tags:
crawler_metadata['get_metadata'] = True
+ crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path)
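The guard added to showpaste() canonicalises the requested path and rejects anything that resolves outside PASTES_FOLDER. A standalone sketch of the same check; the folder path is a placeholder.

    # Sketch of the traversal check added to showpaste(); PASTES_FOLDER is a placeholder.
    import os

    PASTES_FOLDER = '/opt/AIL/PASTES'

    def is_safe(requested_path):
        if PASTES_FOLDER not in requested_path:
            requested_path = os.path.join(PASTES_FOLDER, requested_path)
        real = os.path.realpath(requested_path)
        return os.path.commonprefix((real, PASTES_FOLDER)) == PASTES_FOLDER

    print(is_safe('crawled/2018/08/10/example2345example.onion_uuid'))  # True
    print(is_safe('../../etc/passwd'))                                  # False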
diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html
index 54ea99b5..6f0ccccc 100644
--- a/var/www/modules/showpaste/templates/show_saved_paste.html
+++ b/var/www/modules/showpaste/templates/show_saved_paste.html
@@ -435,9 +435,13 @@
+
+ Domain |
+ {{ crawler_metadata['domain'] }} |
+
Father |
- {{ crawler_metadata['paste_father'] }} |
+ {{ crawler_metadata['paste_father'] }} |
Source link |
From f5b648d72a6aad836a36ec4cdc89f156d15eeb93 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 18 Sep 2018 11:03:40 +0200
Subject: [PATCH 12/28] pixelate paste screenshot
---
bin/Crawler.py | 3 +-
.../showpaste/templates/show_saved_paste.html | 63 ++++++++++++++++++-
2 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/bin/Crawler.py b/bin/Crawler.py
index 3660aa41..3e6e89aa 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -47,9 +47,8 @@ def crawl_onion(url, domain, date, date_month):
print(process.stdout.read())
else:
- r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
- r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
print(process.stdout.read())
+ exit(0)
else:
## FIXME: # TODO: relaunch docker
exit(0)
diff --git a/var/www/modules/showpaste/templates/show_saved_paste.html b/var/www/modules/showpaste/templates/show_saved_paste.html
index 6f0ccccc..8db7cabd 100644
--- a/var/www/modules/showpaste/templates/show_saved_paste.html
+++ b/var/www/modules/showpaste/templates/show_saved_paste.html
@@ -423,11 +423,27 @@
{% if crawler_metadata['get_metadata'] %}
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Graph
@@ -574,5 +590,50 @@
{% endfor %}
+
From ce63d81878b762fb09345bcb94f9dc6c484025cc Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 18 Sep 2018 11:51:21 +0200
Subject: [PATCH 13/28] chg: [UI] pixelate crawled screenshot by default
---
.../hiddenServices/templates/showDomain.html | 63 ++++++++++++++++++-
.../showpaste/templates/show_saved_paste.html | 39 ++++++------
2 files changed, 81 insertions(+), 21 deletions(-)
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index b89388aa..30b078fb 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -122,7 +122,22 @@
-
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -144,6 +159,52 @@
});
+
+