Repository: https://github.com/ail-project/ail-framework.git
Commit 8b1c10b38c (parent 54cc4f3723)
chg: [Onion] add onion splash crawler

7 changed files with 319 additions and 2 deletions
bin/Crawler.py (new executable file, 92 lines)
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import redis
import datetime
import time
import subprocess

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
from pubsublogger import publisher


def signal_handler(sig, frame):
    sys.exit(0)

if __name__ == '__main__':

    publisher.port = 6380
    publisher.channel = "Script"

    publisher.info("Script Crawler started")

    config_section = 'Crawler'

    # Setup the I/O queues
    p = Process(config_section)

    splash_url = p.config.get("Crawler", "splash_url")
    http_proxy = p.config.get("Crawler", "http_proxy")
    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")

    #signal.signal(signal.SIGINT, signal_handler)

    r_serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

    while True:

        message = p.get_from_set()
        # Recovering the streamed message information.
        if message is not None:
            splitted = message.split(';')
            if len(splitted) == 2:
                url, paste = splitted

                print(url)

                # skip URLs whose crawl flag is still cached (set by TorSplashCrawler)
                if not r_cache.exists(url):
                    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
                    if super_father is None:
                        super_father = paste

                    # launch the Scrapy/Splash crawler in a child process and wait for it
                    process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
                                               stdout=subprocess.PIPE)
                    while process.poll() is None:
                        time.sleep(1)

                    date = datetime.datetime.now().strftime("%Y%m%d")
                    print(date)
                    url_domain = url.replace('http://', '')
                    if process.returncode == 0:
                        # children were created, so the crawl produced pastes: tag the parent
                        if r_serv_metadata.exists('paste_children:'+paste):
                            msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
                            p.populate_set_out(msg, 'Tags')

                        r_onion.sadd('onion_up:'+date, url_domain)
                    else:
                        r_onion.sadd('onion_down:'+date, url_domain)
                        print(process.stdout.read())

            else:
                continue

        else:
            time.sleep(1)
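For orientation, the per-day result sets written by this loop can be inspected directly. A minimal sketch, assuming ARDB_Onion is reachable on localhost:6382, db 9 as set in the config.cfg hunk further down; the date value is only an example:

    import redis

    # connection parameters taken from the [ARDB_Onion] section added in this commit
    r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

    date = '20180808'  # example date, same %Y%m%d format used by Crawler.py
    print(r_onion.smembers('onion_up:' + date))    # onion domains that answered
    print(r_onion.smembers('onion_down:' + date))  # onion domains that did not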
bin/Onion.py (modified)
@@ -21,7 +21,6 @@ Requirements
*Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.

"""
import pprint
import time
from packages import Paste
from pubsublogger import publisher

@@ -123,6 +122,7 @@ if __name__ == "__main__":
            PST = Paste.Paste(filename)

            for x in PST.get_regex(url_regex):
                print(x)
                # Extracting url with regex
                url, s, credential, subdomain, domain, host, port, \
                    resource_path, query_string, f1, f2, f3, f4 = x

@@ -149,12 +149,18 @@ if __name__ == "__main__":
                to_print = 'Onion;{};{};{};'.format(PST.p_source,
                                                    PST.p_date,
                                                    PST.p_name)
                '''
                for url in fetch(p, r_cache, urls, domains_list, path):
                    publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
                    p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')

                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
                    p.populate_set_out(msg, 'Tags')
                '''
                for url in urls:
                    msg = '{};{}'.format(url, PST.p_path)
                    print('send to crawler')
                    p.populate_set_out(msg, 'Crawler')
            else:
                publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
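The message handed to the new Crawler queue is a plain "url;paste_path" string. A small illustration of both ends (the URL and paste path are made-up values), matching what bin/Crawler.py splits on the other side:

    # what Onion.py now publishes (placeholder values):
    url = 'http://example2345example.onion/login'
    paste_path = 'archive/pastebin.com_pro/2018/08/08/example.gz'
    msg = '{};{}'.format(url, paste_path)

    # what bin/Crawler.py does with it:
    url, paste = msg.split(';')   # exactly two ';'-separated fields are expected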
bin/packages/config.cfg (modified)
@@ -3,6 +3,8 @@ bloomfilters = Blooms
dicofilters = Dicos
pastes = PASTES
base64 = BASE64
crawled = crawled
crawled_screenshot = CRAWLED_SCREENSHOT

wordtrending_csv = var/www/static/csv/wordstrendingdata
wordsfile = files/wordfile

@@ -171,6 +173,11 @@ host = localhost
port = 6382
db = 8

[ARDB_Onion]
host = localhost
port = 6382
db = 9

[Url]
cc_critical = DE

@@ -215,3 +222,8 @@ channel = FetchedOnion
host = localhost
port = 6381
db = 0

[Crawler]
crawler_depth_limit = 1
splash_url = http://127.0.0.1:8050
http_proxy = http://127.0.0.1:9050
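The new Directories options are combined by TorSplashCrawler into its on-disk layout. A rough sketch of the resulting paths, assuming the default values shown in the first hunk above:

    import os
    import datetime

    AIL_HOME = os.environ['AIL_HOME']
    date = datetime.datetime.now().strftime("%Y/%m/%d")

    # pastes=PASTES, crawled=crawled, crawled_screenshot=CRAWLED_SCREENSHOT
    crawled_paste_dir = os.path.join(AIL_HOME, 'PASTES', 'crawled', date)
    crawled_screenshot_dir = os.path.join(AIL_HOME, 'CRAWLED_SCREENSHOT', date)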
bin/packages/modules.cfg (modified)
@@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags

[Onion]
subscribe = Redis_Onion
-publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags
+publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler
#publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler

[DumpValidOnion]

@@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags
[submit_paste]
subscribe = Redis
publish = Redis_Mixer

[Crawler]
subscribe = Redis_Crawler
publish = Redis_Mixer,Redis_Tags
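This wiring is what lets the Onion module hand URLs to the new Crawler module through Helper.Process. A sketch only, using calls that appear in this commit and assuming a running AIL environment (AIL_BIN set, queues up); the message value is a placeholder:

    import os
    import sys
    sys.path.append(os.environ['AIL_BIN'])
    from Helper import Process

    # Onion side: publish into the Redis_Crawler queue
    p_onion = Process('Onion')
    p_onion.populate_set_out('http://example.onion;/path/to/paste', 'Crawler')

    # Crawler side: consume from the Redis_Crawler queue
    p_crawler = Process('Crawler')
    message = p_crawler.get_from_set()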
bin/torcrawler/TorSplashCrawler.py (new file, 165 lines)
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import gzip
import base64
import uuid
import datetime
import redis
from urllib.parse import urlparse

from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler

from twisted.internet import reactor

from scrapy_splash import SplashRequest

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process


class TorSplashCrawler():

    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'SPLASH_URL': splash_url,
            'HTTP_PROXY': http_proxy,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'DEPTH_LIMIT': crawler_depth_limit
            })

    def crawl(self, url, original_paste, super_father):
        self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, url, original_paste, super_father, *args, **kwargs):
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [urlparse(url).netloc]
            date = datetime.datetime.now().strftime("%Y/%m/%d")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'],
                                                       self.p.config.get("Directories", "pastes"),
                                                       self.p.config.get("Directories", "crawled"), date)

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'],
                                                   self.p.config.get("Directories", "crawled_screenshot"), date)

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                endpoint='render.json',
                meta={'parent': self.original_paste},
                args={'html': 1,
                      'wait': 10,
                      'render_all': 1,
                      'png': 1}
                )

        def parse(self, response):
            print(response.headers)
            print(response.status)

            # flag this URL in the cache so Crawler.py does not re-queue it
            self.r_cache.setbit(response.url, 0, 1)
            self.r_cache.expire(response.url, 360000)

            UUID = self.domains[0]+str(uuid.uuid4())
            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
            filename_screenshot = os.path.join(self.crawled_screenshot, UUID + '.png')

            # save new paste on disk
            if self.save_crawled_paste(filename_paste, response.data['html']):
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)

                self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)

                dirname = os.path.dirname(filename_screenshot)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(filename_screenshot, 'wb') as f:
                    f.write(base64.standard_b64decode(response.data['png'].encode()))

                # save external links in set
                lext = LinkExtractor(deny_domains=self.domains, unique=True)
                for link in lext.extract_links(response):
                    self.r_serv_metadata.sadd('paste_crawler:'+filename_paste, link.url)

                #le = LinkExtractor(unique=True)
                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    # mark internal links as seen (bit 0 = 0) before following them
                    self.r_cache.setbit(link.url, 0, 0)
                    self.r_cache.expire(link.url, 360000)
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        endpoint='render.json',
                        meta={'parent': UUID},
                        args={'html': 1,
                              'png': 1,
                              'render_all': 1,
                              'wait': 10}
                        )

        def save_crawled_paste(self, filename, content):

            print(filename)
            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
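save_crawled_paste() ships each crawled page to the Mixer as "filename gzip64encoded". A round-trip sketch of that encoding, for illustration only (the filename is a placeholder, not a path from the commit):

    import gzip
    import base64

    # forward direction, as in save_crawled_paste():
    content = '<html><body>example</body></html>'
    gzip64encoded = base64.standard_b64encode(gzip.compress(content.encode())).decode()
    relay_message = "{0} {1}".format('crawled/2018/08/08/example-paste', gzip64encoded)

    # reverse direction, what a consumer of the relayed message has to undo:
    filename, payload = relay_message.split(' ', 1)
    html = gzip.decompress(base64.standard_b64decode(payload)).decode()
    assert html == content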
bin/torcrawler/tor_crawler.py (new executable file, 33 lines)
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import configparser
from TorSplashCrawler import TorSplashCrawler

if __name__ == '__main__':

    if len(sys.argv) != 4:
        print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
        exit(1)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    splash_url = cfg.get("Crawler", "splash_url")
    http_proxy = cfg.get("Crawler", "http_proxy")
    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")

    url = sys.argv[1]
    paste = sys.argv[2]
    super_father = sys.argv[3]

    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
    crawler.crawl(url, paste, super_father)
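For reference, this script is normally spawned by bin/Crawler.py; invoked by hand it would look like the sketch below (the URL and paste paths are placeholders):

    import subprocess

    subprocess.run(['python', './torcrawler/tor_crawler.py',
                    'http://example.onion',          # url
                    '/path/to/originating/paste',    # paste
                    '/path/to/originating/paste'])   # super_father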
etc/splash/proxy-profiles/default.ini (new file, 4 lines)
@@ -0,0 +1,4 @@
[proxy]
host=localhost
port=9050
type=SOCKS5
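If Splash is started with proxy profiles enabled (in the Splash Docker image this is usually done by mounting a directory at /etc/splash/proxy-profiles, which the path of this file mirrors), a profile named "default" appears to be applied to render requests that do not specify one, so Splash routes its traffic through the local Tor SOCKS proxy on port 9050.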