Mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-13 01:58:22 +00:00)
chg: [Crawler] crawler accepts all kinds of domains

Commit 7e24943537, parent e9580d6775.
6 changed files with 112 additions and 69 deletions.
@@ -34,21 +34,21 @@ def crawl_onion(url, domain, date, date_month):
        exit(0)

    if r.status_code == 200:
-        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
                                   stdout=subprocess.PIPE)
        while process.poll() is None:
            time.sleep(1)

        if process.returncode == 0:
            if r_serv_metadata.exists('paste_children:'+paste):
-                msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+                msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
                p.populate_set_out(msg, 'Tags')

            print(process.stdout.read())

        else:
-            r_onion.sadd('onion_down:'+date , domain)
-            r_onion.sadd('onion_down_link:'+date , url)
+            r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
+            r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
            print(process.stdout.read())
    else:
        ## FIXME: # TODO: relaunch docker
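The Popen call above now forwards the Splash endpoint, the HTTP proxy and the service type to tor_crawler.py on its command line instead of letting that script read them from config.cfg. A minimal standalone sketch of this launch-and-poll pattern, with placeholder values (the endpoints, paths and domain below are illustrative, not the framework's real ones):

import subprocess
import time

# Placeholder values; in the framework these come from the [Crawler] config section.
splash_url = 'http://127.0.0.1:8050'
http_proxy = 'http://127.0.0.1:9050'
type_hidden_service = 'onion'

args = ["python", './torcrawler/tor_crawler.py', splash_url, http_proxy,
        type_hidden_service, 'http://example.onion', 'example.onion',
        'path/to/paste', 'path/to/super_father']

process = subprocess.Popen(args, stdout=subprocess.PIPE)
while process.poll() is None:   # poll instead of wait(), as in the diff
    time.sleep(1)

if process.returncode == 0:
    print('crawl finished:', process.stdout.read())
else:
    print('crawl failed:', process.stdout.read())

The parent polls once a second and only reads the child's stdout after it exits, mirroring the loop in the diff.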
@@ -67,8 +67,28 @@ if __name__ == '__main__':
    # Setup the I/O queues
    p = Process(config_section)

-    splash_url = p.config.get("Crawler", "splash_url")
-    http_proxy = p.config.get("Crawler", "http_proxy")
+    url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(url_onion)
+    url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(url_i2p)
+
+    type_hidden_service = 'onion'
+    if type_hidden_service == 'onion':
+        regex_hidden_service = url_onion
+        splash_url = p.config.get("Crawler", "splash_url_onion")
+        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+    elif type_hidden_service == 'i2p':
+        regex_hidden_service = url_i2p
+        splash_url = p.config.get("Crawler", "splash_url_i2p")
+        http_proxy = p.config.get("Crawler", "http_proxy_i2p")
+    elif type_hidden_service == 'regular':
+        regex_hidden_service = url_i2p
+        splash_url = p.config.get("Crawler", "splash_url_onion")
+        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+    else:
+        print('incorrect crawler type: {}'.format(type_hidden_service))
+        exit(0)
+
    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")

    #signal.signal(signal.SIGINT, signal_handler)
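The block above hard-codes type_hidden_service = 'onion' for now, then picks the URL regex, the Splash endpoint and the proxy per service type from the [Crawler] section; note that in the diff the 'regular' branch reuses the i2p regex and the onion Splash and proxy options. A minimal sketch of the same dispatch, using configparser directly and an inline config string instead of the framework's Process helper (option names mirror the diff, the values and the simplified regexes are illustrative only):

import configparser
import re

# Inline stand-in for packages/config.cfg; option names follow the diff, values are made up.
cfg = configparser.ConfigParser()
cfg.read_string("""
[Crawler]
splash_url_onion = http://127.0.0.1:8050
http_proxy_onion = http://127.0.0.1:9050
splash_url_i2p = http://127.0.0.1:8051
http_proxy_i2p = http://127.0.0.1:4444
""")

url_onion = r"([a-zA-Z0-9-]+\.)*[a-zA-Z0-9-]+\.onion"   # simplified stand-in regex
url_i2p = r"([a-zA-Z0-9-]+\.)*[a-zA-Z0-9-]+\.i2p"

type_hidden_service = 'onion'
if type_hidden_service == 'onion':
    regex_hidden_service = url_onion
    splash_url = cfg.get("Crawler", "splash_url_onion")
    http_proxy = cfg.get("Crawler", "http_proxy_onion")
elif type_hidden_service == 'i2p':
    regex_hidden_service = url_i2p
    splash_url = cfg.get("Crawler", "splash_url_i2p")
    http_proxy = cfg.get("Crawler", "http_proxy_i2p")
else:
    raise SystemExit('incorrect crawler type: {}'.format(type_hidden_service))

print(re.findall(regex_hidden_service, 'http://example.onion/page'))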
@@ -91,93 +111,94 @@ if __name__ == '__main__':
                                db=p.config.getint("ARDB_Onion", "db"),
                                decode_responses=True)

-    url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
-    re.compile(url_regex)

    while True:

-        message = p.get_from_set()
        # Recovering the streamed message informations.
-        #message = r_onion.spop('mess_onion')
-        print(message)
+        message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+        #message='https://www.myip.com/;/home/aurelien/git/python3/AIL-framework/PASTES/crawled/2018/08/10/onionsnjajzkhm5g.onion49eac19d-d71b-48b5-bc55-9a3c63e5b1e2'

+        # # FIXME: remove
        if message is None:
            print('get ardb message')
            message = r_onion.spop('mess_onion')

+        print(message)

        if message is not None:

            splitted = message.split(';')
            if len(splitted) == 2:
                url, paste = splitted


                if not '.onion' in url:
                    print('not onion')
                    continue

-                url_list = re.findall(url_regex, url)[0]
+                url_list = re.findall(regex_hidden_service, url)[0]
                if url_list[1] == '':
                    url= 'http://{}'.format(url)

                link, s, credential, subdomain, domain, host, port, \
                    resource_path, query_string, f1, f2, f3, f4 = url_list
                domain = url_list[4]
+                r_onion.srem('onion_domain_crawler_queue', domain)
+                #domain = 'myip.com'

                domain_url = 'http://{}'.format(domain)

-                print('------------------START ONION CRAWLER------------------')
+                print('------------------START CRAWLER------------------')
+                print(type_hidden_service)
+                print('-------------------------------------------------')
                print('url: {}'.format(url))
                print('domain: {}'.format(domain))
                print('domain_url: {}'.format(domain_url))

-                '''if not r_onion.sismember('full_onion_up', domain):
-                    r_onion.sadd('mess_onion', message)
-                    print('added ..............')'''
+                if not r_onion.sismember('banned_{}'.format(type_hidden_service), domain):


-                if not r_onion.sismember('banned_onion', domain):

                    date = datetime.datetime.now().strftime("%Y%m%d")
                    date_month = datetime.datetime.now().strftime("%Y%m")

-                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+                    if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):

                        crawl_onion(url, domain, date, date_month)
                        if url != domain_url:
                            crawl_onion(domain_url, domain, date, date_month)

                        # save down onion
-                        if not r_onion.sismember('onion_up:'+date , domain):
-                            r_onion.sadd('onion_down:'+date , domain)
-                            r_onion.sadd('onion_down_link:'+date , url)
-                            r_onion.hincrby('onion_link_down', url, 1)
-                            if not r_onion.exists('onion_metadata:{}'.format(domain)):
-                                r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
-                            r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
+                        if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
+                            r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
+                            r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
+                            r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
+                            if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
+                                r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
+                            r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
                        else:
-                            r_onion.hincrby('onion_link_up', url, 1)
+                            r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)

                        # last check
-                        r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

                        # check external onions links (full_scrawl)
                        external_domains = set()
-                        for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
-                            external_domain = re.findall(url_regex, link)
+                        for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
+                            external_domain = re.findall(url_onion, link)
+                            external_domain.extend(re.findall(url_i2p, link))
                            if len(external_domain) > 0:
                                external_domain = external_domain[0][4]
                            else:
                                continue
-                            # # TODO: add i2p
                            if '.onion' in external_domain and external_domain != domain:
                                external_domains.add(external_domain)
+                            elif '.i2p' in external_domain and external_domain != domain:
+                                external_domains.add(external_domain)
                        if len(external_domains) >= 10:
-                            r_onion.sadd('onion_potential_source', domain)
-                        r_onion.delete('domain_onion_external_links:{}'.format(domain))
-                        print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+                            r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
+                        r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
+                        print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

-                        r_onion.lpush('last_onions', domain)
-                        r_onion.ltrim('last_onions', 0, 15)
+                        r_onion.lpush('last_{}'.format(type_hidden_service), domain)
+                        r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)

        else:
            continue
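Throughout this loop, keys that used to be hard-coded as onion_* ('onion_up:<date>', 'onion_down:<date>', 'onion_metadata:<domain>', 'last_onions', ...) are now built from the service type. A small sketch of that naming scheme, with a plain dict of sets standing in for the ARDB/Redis database (the helper names and sample domains are illustrative):

import datetime
from collections import defaultdict

# Stand-in for the ARDB "onion" database: set-valued keys only.
fake_db = defaultdict(set)

def key_up(service_type, date):
    return '{}_up:{}'.format(service_type, date)

def key_down(service_type, date):
    return '{}_down:{}'.format(service_type, date)

def key_metadata(service_type, domain):
    return '{}_metadata:{}'.format(service_type, domain)

date = datetime.datetime.now().strftime("%Y%m%d")
for service_type, domain in [('onion', 'example.onion'), ('i2p', 'example.i2p')]:
    fake_db[key_down(service_type, date)].add(domain)

print(dict(fake_db))   # e.g. {'onion_down:20180810': {'example.onion'}, 'i2p_down:20180810': {'example.i2p'}}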
bin/Onion.py (14 changed lines)
@@ -150,9 +150,12 @@ if __name__ == "__main__":
            if '.i2p' in url:
                print('add i2p')
                print(domain)
-                if not r_onion.sismember('i2p_domain', domain):
+                if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain):
                    r_onion.sadd('i2p_domain', domain)
                    r_onion.sadd('i2p_link', url)
+                    r_onion.sadd('i2p_domain_crawler_queue', domain)
+                    msg = '{};{}'.format(url,PST.p_path)
+                    r_onion.sadd('i2p_crawler_queue', msg)

        # Saving the list of extracted onion domains.
        PST.__setattr__(channel, domains_list)
@@ -193,9 +196,12 @@ if __name__ == "__main__":
                    continue

                if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
-                    msg = '{};{}'.format(url,PST.p_path)
-                    print('send to crawler')
-                    p.populate_set_out(msg, 'Crawler')
+                    if not r_onion.sismember('onion_domain_crawler_queue', domain):
+                        print('send to onion crawler')
+                        r_onion.sadd('onion_domain_crawler_queue', domain)
+                        msg = '{};{}'.format(url,PST.p_path)
+                        r_onion.sadd('onion_crawler_queue', msg)
+                        #p.populate_set_out(msg, 'Crawler')
        else:
            publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))

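Instead of pushing messages straight to the Crawler module, Onion.py now parks '<url>;<paste path>' messages in a per-type queue and uses a companion *_domain_crawler_queue set so the same domain is not scheduled twice. A minimal sketch of just that enqueue-with-dedup step, using plain Python sets as stand-ins for the two ARDB sets (the function name and sample values are illustrative, and the month_onion_up / onion_down checks from the diff are omitted here):

# Plain-set stand-ins for the two ARDB sets used in the diff.
onion_domain_crawler_queue = set()   # domains already scheduled
onion_crawler_queue = set()          # '<url>;<paste path>' messages for the crawler

def queue_onion(url, domain, paste_path):
    # Mirrors the new Onion.py logic: only queue a given domain once.
    if domain not in onion_domain_crawler_queue:
        print('send to onion crawler')
        onion_domain_crawler_queue.add(domain)
        onion_crawler_queue.add('{};{}'.format(url, paste_path))

queue_onion('http://example.onion/page', 'example.onion', 'PASTES/2018/08/10/example')
queue_onion('http://example.onion/other', 'example.onion', 'PASTES/2018/08/10/other')
print(onion_crawler_queue)   # only the first message was queued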
@@ -9,6 +9,7 @@ import uuid
import datetime
import base64
import redis
+import json

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
@@ -30,7 +31,6 @@ class TorSplashCrawler():
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'SPLASH_URL': splash_url,
-            'HTTP_PROXY': http_proxy,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
@@ -41,14 +41,15 @@ class TorSplashCrawler():
            'DEPTH_LIMIT': crawler_depth_limit
            })

-    def crawl(self, url, domain, original_paste, super_father):
-        self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+    def crawl(self, type, url, domain, original_paste, super_father):
+        self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

-        def __init__(self, url, domain,original_paste, super_father, *args, **kwargs):
+        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
+            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
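The crawl type is threaded through Scrapy itself: keyword arguments given to crawl() are forwarded to the spider's __init__, which is how self.type becomes available later in parse(). A minimal sketch of that mechanism with a standalone spider (the spider name, settings and URL are illustrative, not the framework's TorSplashSpider, and scrapy must be installed for it to run):

import scrapy

class TypedSpider(scrapy.Spider):
    # Illustrative spider, not the framework's TorSplashSpider.
    name = 'typed_spider'

    def __init__(self, type=None, url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.type = type          # 'onion', 'i2p' or 'regular'
        self.start_urls = [url]

    def parse(self, response):
        # self.type is available here to build per-type key names.
        self.logger.info('crawled a %s page: %s', self.type, response.url)

if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    # crawl(SpiderClass, **kwargs) passes the kwargs to __init__,
    # like self.process.crawl(self.crawler, type=type, url=url, ...) in the diff.
    process = CrawlerProcess({'ROBOTSTXT_OBEY': False})
    process.crawl(TypedSpider, type='onion', url='http://example.onion')
    process.start()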
@@ -100,12 +101,13 @@ class TorSplashCrawler():
                args={  'html': 1,
                        'wait': 10,
                        'render_all': 1,
+                        'har': 1,
                        'png': 1}
                )

        def parse(self,response):
-            print(response.headers)
-            print(response.status)
+            #print(response.headers)
+            #print(response.status)

            # # TODO: # FIXME:
            self.r_cache.setbit(response.url, 0, 1)
@@ -119,17 +121,18 @@ class TorSplashCrawler():
            # save new paste on disk
            if self.save_crawled_paste(filename_paste, response.data['html']):

-                self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
-                self.r_serv_onion.sadd('full_onion_up', self.domains[0])
-                self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0])
+                self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
+                self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
+                self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])

                # create onion metadata
-                if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
-                    self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
-                self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
+                if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
+                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
+                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
+                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)

                # add onion screenshot history
-                self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
+                self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)

                #create paste metadata
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -144,17 +147,20 @@ class TorSplashCrawler():
                    os.makedirs(dirname)

                size_screenshot = (len(response.data['png'])*3) /4
-                print(size_screenshot)

                if size_screenshot < 5000000: #bytes
                    with open(filename_screenshot, 'wb') as f:
                        f.write(base64.standard_b64decode(response.data['png'].encode()))

+                #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
+                with open(filename_screenshot+'har.txt', 'wb') as f:
+                    f.write(json.dumps(response.data['har']).encode())
+
                # save external links in set
                lext = LinkExtractor(deny_domains=self.domains, unique=True)
                for link in lext.extract_links(response):
-                    self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
-                    self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
+                    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+                    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                #le = LinkExtractor(unique=True)
                le = LinkExtractor(allow_domains=self.domains, unique=True)
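Besides the PNG screenshot, the spider now asks Splash for a HAR archive ('har': 1) and writes it next to the screenshot as JSON. A small self-contained sketch of the decode-and-save step, with dummy data standing in for Splash's response.data (the file names and dummy payload are illustrative):

import base64
import json

# Dummy stand-in for the Splash render response: 'png' is base64 text, 'har' is a dict.
response_data = {
    'png': base64.standard_b64encode(b'\x89PNG fake image bytes').decode(),
    'har': {'log': {'entries': []}},
}

filename_screenshot = 'screenshot.png'

size_screenshot = (len(response_data['png']) * 3) / 4   # rough decoded size in bytes
if size_screenshot < 5000000:
    with open(filename_screenshot, 'wb') as f:
        f.write(base64.standard_b64decode(response_data['png'].encode()))

# The HAR is plain JSON, saved alongside the screenshot as in the diff.
with open(filename_screenshot + 'har.txt', 'wb') as f:
    f.write(json.dumps(response_data['har']).encode())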
@@ -169,6 +175,7 @@ class TorSplashCrawler():
                    args={  'html': 1,
                            'png': 1,
                            'render_all': 1,
+                            'har': 1,
                            'wait': 10}
                    #errback=self.errback_catcher
                )
@@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler

if __name__ == '__main__':

-    if len(sys.argv) != 5:
-        print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father')
+    if len(sys.argv) != 8:
+        print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father')
        exit(1)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -21,14 +21,15 @@ if __name__ == '__main__':
    cfg = configparser.ConfigParser()
    cfg.read(configfile)

-    splash_url = cfg.get("Crawler", "splash_url")
-    http_proxy = cfg.get("Crawler", "http_proxy")
+    splash_url = sys.argv[1]
+    http_proxy = sys.argv[2]
+    type = sys.argv[3]
    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")

-    url = sys.argv[1]
-    domain = sys.argv[2]
-    paste = sys.argv[3]
-    super_father = sys.argv[4]
+    url = sys.argv[4]
+    domain = sys.argv[5]
+    paste = sys.argv[6]
+    super_father = sys.argv[7]

    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
-    crawler.crawl(url, domain, paste, super_father)
+    crawler.crawl(type, url, domain, paste, super_father)
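tor_crawler.py now takes seven positional arguments instead of four; the Splash endpoint, the proxy and the crawl type come from the caller rather than from config.cfg. A sketch of the expected invocation and of the matching argument unpacking (the concrete values are examples only, and the variable is named crawl_type here to avoid shadowing the built-in type, whereas the diff keeps the name type):

# Example invocation (values are illustrative):
#   python tor_crawler.py http://127.0.0.1:8050 http://127.0.0.1:9050 onion \
#          http://example.onion example.onion path/to/paste path/to/super_father
import sys

if len(sys.argv) != 8:
    print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type',
          'url', 'domain', 'paste', 'super_father')
    sys.exit(1)

splash_url, http_proxy, crawl_type = sys.argv[1], sys.argv[2], sys.argv[3]
url, domain, paste, super_father = sys.argv[4], sys.argv[5], sys.argv[6], sys.argv[7]
print(crawl_type, url, domain)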
@@ -43,7 +43,7 @@ def get_onion_status(domain, date):

@hiddenServices.route("/hiddenServices/", methods=['GET'])
def hiddenServices_page():
-    last_onions = r_serv_onion.lrange('last_onions', 0 ,-1)
+    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
    list_onion = []

    for onion in last_onions:
@@ -72,9 +72,11 @@ def onion_domain():

    last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
    first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
+    domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
    date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))

-    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen)
+    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
+                            domain_paste=domain_paste)

# ============= JSON ==============
@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
@@ -49,6 +49,12 @@
                <td>Last Check</td>
                <td>{{ last_check }}</td>
            </tr>
+            <tr>
+                <td>Origin Paste</td>
+                <td>
+                    <a target="_blank" href="{{ url_for('showsavedpastes.showsavedpaste', paste=domain_paste) }}" />{{ domain_paste }}</a>
+                </td>
+            </tr>
        </tbody>
    </table>
</div>
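The new table row links the crawled domain back to the paste that triggered the crawl: domain_paste is read from the metadata hash in the Flask view above and handed to the template. A minimal Flask/Jinja sketch of that flow (the route, inline template and hard-coded value are illustrative, not the framework's showDomain.html):

from flask import Flask, render_template_string

app = Flask(__name__)

TEMPLATE = """
<tr>
    <td>Origin Paste</td>
    <td><a target="_blank" href="/showsavedpaste/?paste={{ domain_paste }}">{{ domain_paste }}</a></td>
</tr>
"""

@app.route("/hiddenServices/onion_domain")
def onion_domain():
    # In the framework this value comes from the 'paste_parent' field of the
    # domain metadata hash; here it is hard-coded for illustration.
    domain_paste = 'crawled/2018/08/10/example.onion_uuid'
    return render_template_string(TEMPLATE, domain_paste=domain_paste)

if __name__ == '__main__':
    app.run()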