chg: [Crawler] refactor

This commit is contained in:
Terrtia 2019-02-21 09:54:43 +01:00
parent 0832784f7a
commit e5dca268a8
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
5 changed files with 61 additions and 65 deletions

View file

@ -79,7 +79,7 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)
def crawl_onion(url, domain, date, date_month, message):
def crawl_onion(url, domain, date, date_month, message, mode):
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
@ -166,7 +166,8 @@ if __name__ == '__main__':
publisher.info("Script Crawler started")
# load domains blacklist
load_type_blacklist(type_hidden_service)
load_type_blacklist('onions')
load_type_blacklist('regular')
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
print('splash url: {}'.format(splash_url))
@ -180,16 +181,15 @@ if __name__ == '__main__':
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
while True:
if mode == 'automatic':
# Priority Queue - Recovering the streamed message information.
message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))
# Recovering the streamed message information.
if message is None:
# Recovering the streamed message information.
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
else:
pass
@ -244,16 +244,16 @@ if __name__ == '__main__':
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
# Launch Scrapy-Splash Crawler
crawl_onion(url, domain, date, date_month, message)
crawl_onion(url, domain, date, date_month, message, mode)
# Crawl Domain
if url != domain_url:
#Crawl Domain with port number
if port is not None:
print('{}:{}'.format(domain_url, port))
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message, mode)
#Crawl without port number
print(domain_url)
crawl_onion(domain_url, domain, date, date_month, message)
crawl_onion(domain_url, domain, date, date_month, message, mode)
# update last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@ -293,14 +293,9 @@ if __name__ == '__main__':
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
# update list, last crawled sites
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
# manual
else:
# update list, last crawled sites
r_onion.lpush('last_crawled_manual', domain)
r_onion.ltrim('last_crawled_manual', 0, 15)
# update list, last crawled sites
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
#update crawler status
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')

View file

@ -28,10 +28,10 @@ from Helper import Process
class TorSplashCrawler():
def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
def __init__(self, splash_url, crawler_options):
self.process = CrawlerProcess({'LOG_ENABLED': False})
self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': user_agent,
'USER_AGENT': crawler_options['user_agent'],
'SPLASH_URL': splash_url,
'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@ -42,18 +42,18 @@ class TorSplashCrawler():
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
'HTTPERROR_ALLOW_ALL': True,
'RETRY_TIMES': 2,
'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
'DEPTH_LIMIT': crawler_depth_limit
'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
'DEPTH_LIMIT': crawler_options['depth_limit']
})
def crawl(self, type, url, domain, original_paste, super_father):
self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
def crawl(self, type, crawler_options, url, domain, original_paste, super_father):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs):
self.type = type
self.original_paste = original_paste
self.super_father = super_father
@ -63,6 +63,12 @@ class TorSplashCrawler():
self.full_date = datetime.datetime.now().strftime("%Y%m%d")
self.date_month = datetime.datetime.now().strftime("%Y%m")
self.arg_crawler = { 'html': crawler_options['html'],
'wait': 10,
'render_all': 1,
'har': crawler_options['har'],
'png': crawler_options['png']}
config_section = 'Crawler'
self.p = Process(config_section)
@ -104,11 +110,7 @@ class TorSplashCrawler():
errback=self.errback_catcher,
endpoint='render.json',
meta={'father': self.original_paste},
args={ 'html': 1,
'wait': 10,
'render_all': 1,
'har': 1,
'png': 1}
args=self.arg_crawler
)
def parse(self,response):
@ -131,6 +133,7 @@ class TorSplashCrawler():
relative_filename_paste = os.path.join(self.crawler_path, UUID)
filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
# # TODO: modify me
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
@ -158,14 +161,16 @@ class TorSplashCrawler():
if not os.path.exists(dirname):
os.makedirs(dirname)
size_screenshot = (len(response.data['png'])*3) /4
if 'png' in response.data:
size_screenshot = (len(response.data['png'])*3) /4
if size_screenshot < 5000000: #bytes
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))
if size_screenshot < 5000000: #bytes
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))
with open(filename_screenshot+'har.txt', 'wb') as f:
f.write(json.dumps(response.data['har']).encode())
if 'har' in response.data:
with open(filename_screenshot+'har.txt', 'wb') as f:
f.write(json.dumps(response.data['har']).encode())
# save external links in set
#lext = LinkExtractor(deny_domains=self.domains, unique=True)
@ -181,11 +186,7 @@ class TorSplashCrawler():
errback=self.errback_catcher,
endpoint='render.json',
meta={'father': relative_filename_paste},
args={ 'html': 1,
'png': 1,
'render_all': 1,
'har': 1,
'wait': 10}
args=self.arg_crawler
)
def errback_catcher(self, failure):
@ -205,11 +206,7 @@ class TorSplashCrawler():
errback=self.errback_catcher,
endpoint='render.json',
meta={'father': father},
args={ 'html': 1,
'png': 1,
'render_all': 1,
'har': 1,
'wait': 10}
args=self.arg_crawler
)
else:

View file

@ -6,6 +6,9 @@ import sys
import configparser
from TorSplashCrawler import TorSplashCrawler
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
if __name__ == '__main__':
if len(sys.argv) != 7:
@ -23,17 +26,17 @@ if __name__ == '__main__':
splash_url = sys.argv[1]
type = sys.argv[2]
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
url = sys.argv[3]
domain = sys.argv[4]
paste = sys.argv[5]
super_father = sys.argv[6]
if crawler_options is None:
crawler_options = default_crawler_options
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
user_agent = tor_browser_agent
crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
crawler_options['user_agent'] = tor_browser_agent
closespider_pagecount = 50
crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
crawler.crawl(type, url, domain, paste, super_father)
crawler = TorSplashCrawler(splash_url, crawler_options)
crawler.crawl(type, crawler_options, url, domain, paste, super_father)

View file

@ -2,30 +2,27 @@
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Tags - AIL</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png') }}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='font-awesome/css/font-awesome.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/sb-admin-2.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/dygraph_gallery.css') }}" rel="stylesheet" type="text/css" />
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/daterangepicker.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/tags.css') }}" rel="stylesheet" type="text/css" />
<!-- JS -->
<script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/jquery.flot.js') }}"></script>
<script src="{{ url_for('static', filename='js/jquery.flot.pie.js') }}"></script>
<script src="{{ url_for('static', filename='js/jquery.flot.time.js') }}"></script>
<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/moment.min.js') }}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/jquery.daterangepicker.min.js') }}"></script>
<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
</head>
<body>
{% include 'navbar.html' %}
{% include 'nav_bar.html' %}
<div id="page-wrapper">
<div class="row">

View file

@ -142,6 +142,10 @@ def get_crawler_splash_status(mode, type):
def hiddenServices_page_test():
return render_template("Crawler_index.html")
@hiddenServices.route("/crawlers/manual", methods=['GET'])
def manual():
    """Handle GET /crawlers/manual by rendering the manual crawler template."""
    page = render_template("Crawler_Splash_manual.html")
    return page
@hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
def crawler_splash_onion():
last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)