chg: [Crawler] refactor

Terrtia 2019-02-21 09:54:43 +01:00
parent 0832784f7a
commit e5dca268a8
5 changed files with 61 additions and 65 deletions

========================= changed file =========================

@@ -79,7 +79,7 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
     r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
     r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)

-def crawl_onion(url, domain, date, date_month, message):
+def crawl_onion(url, domain, date, date_month, message, mode):

     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
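crawl_onion tracks its progress in one hash per Splash port in the cache Redis instance. As a point of reference, the snippet below is a sketch only: it assumes a local redis-py connection rather than AIL's config-driven r_cache handle, and 8050 is just an example port.

    import redis

    # Assumption: local Redis on the default port; AIL builds this connection
    # (r_cache) from its configuration file instead of hard-coded values.
    r_cache = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

    def get_crawler_metadata(splash_port):
        # The crawler stores 'crawling_domain', 'started_time' and 'status'
        # in one hash per Splash port (see the hunk above).
        return r_cache.hgetall('metadata_crawler:{}'.format(splash_port))

    print(get_crawler_metadata(8050))  # 8050 is only an example port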
@@ -166,7 +166,8 @@ if __name__ == '__main__':
     publisher.info("Script Crawler started")

     # load domains blacklist
-    load_type_blacklist(type_hidden_service)
+    load_type_blacklist('onions')
+    load_type_blacklist('regular')

     splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
     print('splash url: {}'.format(splash_url))
@@ -180,16 +181,15 @@ if __name__ == '__main__':
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))

     while True:

         if mode == 'automatic':
             # Priority Queue - Recovering the streamed message informations.
             message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))

-            # Recovering the streamed message informations.
             if message is None:
+                # Recovering the streamed message informations.
                 message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))

         else:
             pass
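In automatic mode the crawler pops from the per-type priority set first and only falls back to the regular queue when that set is empty. A minimal sketch of that selection, assuming a plain redis-py connection and the same key names as the hunk above (the real module gets its r_onion handle from its configuration):

    import redis

    # Assumption: local Redis; AIL's r_onion connection normally comes from its config.
    r_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

    def pop_next_message(type_hidden_service, mode):
        # Mirrors the queue selection shown above (sketch, not the module itself).
        if mode == 'automatic':
            # Priority queue first ...
            message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))
            if message is None:
                # ... then the regular crawler queue.
                message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
            return message
        # manual mode is fed from another queue elsewhere in the module
        return None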
@@ -244,16 +244,16 @@ if __name__ == '__main__':
             r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

             # Launch Scrapy-Splash Crawler
-            crawl_onion(url, domain, date, date_month, message)
+            crawl_onion(url, domain, date, date_month, message, mode)

             # Crawl Domain
             if url != domain_url:
                 #Crawl Domain with port number
                 if port is not None:
                     print('{}:{}'.format(domain_url, port))
-                    crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
+                    crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message, mode)
                 #Crawl without port number
                 print(domain_url)
-                crawl_onion(domain_url, domain, date, date_month, message)
+                crawl_onion(domain_url, domain, date, date_month, message, mode)

             # update last check
             r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@@ -293,14 +293,9 @@ if __name__ == '__main__':
             r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
             print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

             # update list, last crawled sites
             r_onion.lpush('last_{}'.format(type_hidden_service), domain)
             r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
-            # manual
-            else:
-                # update list, last crawled sites
-                r_onion.lpush('last_crawled_manual', domain)
-                r_onion.ltrim('last_crawled_manual', 0, 15)

             #update crawler status
             r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
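The LPUSH/LTRIM pair above keeps a rolling window of the 16 most recently crawled domains per service type; the separate manual list is dropped. A small sketch of the same idea, again assuming a local redis-py connection instead of AIL's own r_onion handle:

    import redis

    # Assumption: local Redis instead of AIL's configured r_onion connection.
    r_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

    def update_last_crawled(type_hidden_service, domain):
        # Rolling list of the 16 most recently crawled domains per type,
        # as in the hunk above (LPUSH, then LTRIM to indices 0..15).
        r_onion.lpush('last_{}'.format(type_hidden_service), domain)
        r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)

    update_last_crawled('onion', 'example.onion')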

========================= changed file =========================

@@ -28,10 +28,10 @@ from Helper import Process

 class TorSplashCrawler():

-    def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
+    def __init__(self, splash_url, crawler_options):
         self.process = CrawlerProcess({'LOG_ENABLED': False})
         self.crawler = Crawler(self.TorSplashSpider, {
-            'USER_AGENT': user_agent,
+            'USER_AGENT': crawler_options['user_agent'],
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@@ -42,18 +42,18 @@ class TorSplashCrawler():
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
-            'DEPTH_LIMIT': crawler_depth_limit
+            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
+            'DEPTH_LIMIT': crawler_options['depth_limit']
             })

-    def crawl(self, type, url, domain, original_paste, super_father):
-        self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+    def crawl(self, type, crawler_options, url, domain, original_paste, super_father):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
         self.process.start()

     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'

-        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
+        def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs):
             self.type = type
             self.original_paste = original_paste
             self.super_father = super_father
@@ -63,6 +63,12 @@ class TorSplashCrawler():
             self.full_date = datetime.datetime.now().strftime("%Y%m%d")
             self.date_month = datetime.datetime.now().strftime("%Y%m")

+            self.arg_crawler = {  'html': crawler_options['html'],
+                                  'wait': 10,
+                                  'render_all': 1,
+                                  'har': crawler_options['har'],
+                                  'png': crawler_options['png']}
+
             config_section = 'Crawler'
             self.p = Process(config_section)
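The new self.arg_crawler dict centralises the Splash rendering options that were previously repeated inline at every request. Below is a sketch of how such an options dict can be assembled; the keys match the diff, but the concrete values and the helper name build_splash_args are illustrative only.

    # Field names follow the diff; the values and the helper name are examples.
    default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}

    def build_splash_args(crawler_options):
        # Same shape as self.arg_crawler above: fixed wait/render_all,
        # the rest taken from the per-crawl options.
        return {'html': crawler_options['html'],
                'wait': 10,
                'render_all': 1,
                'har': crawler_options['har'],
                'png': crawler_options['png']}

    print(build_splash_args(default_crawler_options))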
@@ -104,11 +110,7 @@ class TorSplashCrawler():
                 errback=self.errback_catcher,
                 endpoint='render.json',
                 meta={'father': self.original_paste},
-                args={  'html': 1,
-                        'wait': 10,
-                        'render_all': 1,
-                        'har': 1,
-                        'png': 1}
+                args=self.arg_crawler
             )

         def parse(self,response):
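All three SplashRequest call sites now share that single args dict. The standalone spider below is a sketch of the pattern using the real scrapy_splash API; it is not AIL code, and the spider name, URL and option values are placeholders.

    import scrapy
    from scrapy_splash import SplashRequest

    class ExampleSplashSpider(scrapy.Spider):
        # Sketch, not AIL code: reuse one Splash args dict for every request.
        name = 'example_splash'

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Shared rendering options, analogous to self.arg_crawler above.
            self.arg_crawler = {'html': 1, 'wait': 10, 'render_all': 1, 'har': 1, 'png': 1}

        def start_requests(self):
            # Placeholder URL; endpoint and args mirror the calls in the diff.
            yield SplashRequest('http://example.onion', self.parse,
                                endpoint='render.json',
                                args=self.arg_crawler)

        def parse(self, response):
            # render.json replies are exposed by scrapy_splash as response.data (a dict).
            self.logger.info('rendered keys: %s', list(response.data.keys()))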
@@ -131,6 +133,7 @@ class TorSplashCrawler():
                 relative_filename_paste = os.path.join(self.crawler_path, UUID)
                 filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

+                # # TODO: modify me
                 # save new paste on disk
                 if self.save_crawled_paste(filename_paste, response.data['html']):
@@ -158,14 +161,16 @@ class TorSplashCrawler():
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)

-                size_screenshot = (len(response.data['png'])*3) /4
-                if size_screenshot < 5000000: #bytes
-                    with open(filename_screenshot, 'wb') as f:
-                        f.write(base64.standard_b64decode(response.data['png'].encode()))
+                if 'png' in response.data:
+                    size_screenshot = (len(response.data['png'])*3) /4
+                    if size_screenshot < 5000000: #bytes
+                        with open(filename_screenshot, 'wb') as f:
+                            f.write(base64.standard_b64decode(response.data['png'].encode()))

-                with open(filename_screenshot+'har.txt', 'wb') as f:
-                    f.write(json.dumps(response.data['har']).encode())
+                if 'har' in response.data:
+                    with open(filename_screenshot+'har.txt', 'wb') as f:
+                        f.write(json.dumps(response.data['har']).encode())

                 # save external links in set
                 #lext = LinkExtractor(deny_domains=self.domains, unique=True)
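The new guards matter because render.json only includes the 'png' and 'har' fields when they were requested in the Splash args, and the *3/4 factor estimates the decoded size of the base64 payload. A self-contained sketch of the same saving logic (the function name and parameters are illustrative):

    import base64
    import json

    def save_render_artifacts(response_data, filename_screenshot):
        # Guarded saving as in the hunk above: 'png' and 'har' are only present
        # in render.json output when they were requested in the Splash args.
        if 'png' in response_data:
            # A base64 string decodes to roughly 3/4 of its length in bytes.
            size_screenshot = (len(response_data['png']) * 3) / 4
            if size_screenshot < 5000000:  # ~5 MB cap, as in the diff
                with open(filename_screenshot, 'wb') as f:
                    f.write(base64.standard_b64decode(response_data['png'].encode()))
        if 'har' in response_data:
            with open(filename_screenshot + 'har.txt', 'wb') as f:
                f.write(json.dumps(response_data['har']).encode())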
@@ -181,11 +186,7 @@ class TorSplashCrawler():
                         errback=self.errback_catcher,
                         endpoint='render.json',
                         meta={'father': relative_filename_paste},
-                        args={  'html': 1,
-                                'png': 1,
-                                'render_all': 1,
-                                'har': 1,
-                                'wait': 10}
+                        args=self.arg_crawler
                     )

         def errback_catcher(self, failure):
@@ -205,11 +206,7 @@ class TorSplashCrawler():
                     errback=self.errback_catcher,
                     endpoint='render.json',
                     meta={'father': father},
-                    args={  'html': 1,
-                            'png': 1,
-                            'render_all': 1,
-                            'har': 1,
-                            'wait': 10}
+                    args=self.arg_crawler
                 )

             else:

========================= changed file =========================

@@ -6,6 +6,9 @@ import sys
 import configparser
 from TorSplashCrawler import TorSplashCrawler

+tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
+default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
+
 if __name__ == '__main__':

     if len(sys.argv) != 7:
@@ -23,17 +26,17 @@ if __name__ == '__main__':
     splash_url = sys.argv[1]
     type = sys.argv[2]

-    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
     url = sys.argv[3]
     domain = sys.argv[4]
     paste = sys.argv[5]
     super_father = sys.argv[6]

-    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
-    user_agent = tor_browser_agent
-    closespider_pagecount = 50
+    if crawler_options is None:
+        crawler_options = default_crawler_options
+
+    crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
+    crawler_options['user_agent'] = tor_browser_agent

-    crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
-    crawler.crawl(type, url, domain, paste, super_father)
+    crawler = TorSplashCrawler(splash_url, crawler_options)
+    crawler.crawl(type, crawler_options, url, domain, paste, super_father)
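The launcher now merges a set of default options with the depth limit taken from the configuration before handing everything to TorSplashCrawler. A sketch of that assembly follows; build_crawler_options and the config_path parameter are assumptions, the real script reads AIL's configuration through its own environment handling.

    import configparser

    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
    default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}

    def build_crawler_options(config_path, crawler_options=None):
        # Assumed helper: the real launcher reads AIL's configuration through its
        # own environment handling rather than an explicit config_path argument.
        cfg = configparser.ConfigParser()
        cfg.read(config_path)
        if crawler_options is None:
            crawler_options = dict(default_crawler_options)
        crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
        crawler_options['user_agent'] = tor_browser_agent
        return crawler_options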

========================= changed file =========================

@@ -2,30 +2,27 @@
 <html>

 <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>Tags - AIL</title>
     <link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png') }}">

     <!-- Core CSS -->
-    <link href="{{ url_for('static', filename='css/bootstrap.min.css') }}" rel="stylesheet">
-    <link href="{{ url_for('static', filename='font-awesome/css/font-awesome.css') }}" rel="stylesheet">
-    <link href="{{ url_for('static', filename='css/sb-admin-2.css') }}" rel="stylesheet">
-    <link href="{{ url_for('static', filename='css/dygraph_gallery.css') }}" rel="stylesheet" type="text/css" />
+    <link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
+    <link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
+    <link href="{{ url_for('static', filename='css/daterangepicker.min.css') }}" rel="stylesheet">
     <link href="{{ url_for('static', filename='css/tags.css') }}" rel="stylesheet" type="text/css" />

     <!-- JS -->
-    <script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script>
-    <script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script>
-    <script src="{{ url_for('static', filename='js/jquery.flot.js') }}"></script>
-    <script src="{{ url_for('static', filename='js/jquery.flot.pie.js') }}"></script>
-    <script src="{{ url_for('static', filename='js/jquery.flot.time.js') }}"></script>
+    <script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
+    <script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
+    <script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
+    <script language="javascript" src="{{ url_for('static', filename='js/moment.min.js') }}"></script>
+    <script language="javascript" src="{{ url_for('static', filename='js/jquery.daterangepicker.min.js') }}"></script>
     <script src="{{ url_for('static', filename='js/tags.js') }}"></script>

 </head>
 <body>

-    {% include 'navbar.html' %}
+    {% include 'nav_bar.html' %}

     <div id="page-wrapper">
         <div class="row">

========================= changed file =========================

@@ -142,6 +142,10 @@ def get_crawler_splash_status(mode, type):
 def hiddenServices_page_test():
     return render_template("Crawler_index.html")

+@hiddenServices.route("/crawlers/manual", methods=['GET'])
+def manual():
+    return render_template("Crawler_Splash_manual.html")
+
 @hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
 def crawler_splash_onion():
     last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)