mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 08:38:28 +00:00
chg: [Crawler] refactor
This commit is contained in:
parent
0832784f7a
commit
e5dca268a8
5 changed files with 61 additions and 65 deletions
|
@ -79,7 +79,7 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
|||
r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||
r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)
|
||||
|
||||
def crawl_onion(url, domain, date, date_month, message):
|
||||
def crawl_onion(url, domain, date, date_month, message, mode):
|
||||
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
@ -166,7 +166,8 @@ if __name__ == '__main__':
|
|||
publisher.info("Script Crawler started")
|
||||
|
||||
# load domains blacklist
|
||||
load_type_blacklist(type_hidden_service)
|
||||
load_type_blacklist('onions')
|
||||
load_type_blacklist('regular')
|
||||
|
||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||
print('splash url: {}'.format(splash_url))
|
||||
|
@ -180,16 +181,15 @@ if __name__ == '__main__':
|
|||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
|
||||
while True:
|
||||
|
||||
if mode == 'automatic':
|
||||
# Priority Queue - Recovering the streamed message information.
|
||||
message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))
|
||||
|
||||
# Recovering the streamed message information.
|
||||
if message is None:
|
||||
# Recovering the streamed message information.
|
||||
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
|
||||
|
||||
else:
|
||||
pass
|
||||
|
||||
|
@ -244,16 +244,16 @@ if __name__ == '__main__':
|
|||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||
|
||||
# Launch Scrapy-Splash Crawler
|
||||
crawl_onion(url, domain, date, date_month, message)
|
||||
crawl_onion(url, domain, date, date_month, message, mode)
|
||||
# Crawl Domain
|
||||
if url != domain_url:
|
||||
#Crawl Domain with port number
|
||||
if port is not None:
|
||||
print('{}:{}'.format(domain_url, port))
|
||||
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
|
||||
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message, mode)
|
||||
#Crawl without port number
|
||||
print(domain_url)
|
||||
crawl_onion(domain_url, domain, date, date_month, message)
|
||||
crawl_onion(domain_url, domain, date, date_month, message, mode)
|
||||
|
||||
# update last check
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||
|
@ -293,14 +293,9 @@ if __name__ == '__main__':
|
|||
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
||||
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
||||
|
||||
# update list, last crawled sites
|
||||
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
||||
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
||||
# manual
|
||||
else:
|
||||
# update list, last crawled sites
|
||||
r_onion.lpush('last_crawled_manual', domain)
|
||||
r_onion.ltrim('last_crawled_manual', 0, 15)
|
||||
# update list, last crawled sites
|
||||
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
||||
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
||||
|
||||
#update crawler status
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
|
|
|
@ -28,10 +28,10 @@ from Helper import Process
|
|||
|
||||
class TorSplashCrawler():
|
||||
|
||||
def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
|
||||
def __init__(self, splash_url, crawler_options):
|
||||
self.process = CrawlerProcess({'LOG_ENABLED': False})
|
||||
self.crawler = Crawler(self.TorSplashSpider, {
|
||||
'USER_AGENT': user_agent,
|
||||
'USER_AGENT': crawler_options['user_agent'],
|
||||
'SPLASH_URL': splash_url,
|
||||
'ROBOTSTXT_OBEY': False,
|
||||
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
|
||||
|
@ -42,18 +42,18 @@ class TorSplashCrawler():
|
|||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||
'HTTPERROR_ALLOW_ALL': True,
|
||||
'RETRY_TIMES': 2,
|
||||
'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
|
||||
'DEPTH_LIMIT': crawler_depth_limit
|
||||
'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
|
||||
'DEPTH_LIMIT': crawler_options['depth_limit']
|
||||
})
|
||||
|
||||
def crawl(self, type, url, domain, original_paste, super_father):
|
||||
self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
|
||||
def crawl(self, type, crawler_options, url, domain, original_paste, super_father):
|
||||
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
|
||||
self.process.start()
|
||||
|
||||
class TorSplashSpider(Spider):
|
||||
name = 'TorSplashSpider'
|
||||
|
||||
def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
|
||||
def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs):
|
||||
self.type = type
|
||||
self.original_paste = original_paste
|
||||
self.super_father = super_father
|
||||
|
@ -63,6 +63,12 @@ class TorSplashCrawler():
|
|||
self.full_date = datetime.datetime.now().strftime("%Y%m%d")
|
||||
self.date_month = datetime.datetime.now().strftime("%Y%m")
|
||||
|
||||
self.arg_crawler = { 'html': crawler_options['html'],
|
||||
'wait': 10,
|
||||
'render_all': 1,
|
||||
'har': crawler_options['har'],
|
||||
'png': crawler_options['png']}
|
||||
|
||||
config_section = 'Crawler'
|
||||
self.p = Process(config_section)
|
||||
|
||||
|
@ -104,11 +110,7 @@ class TorSplashCrawler():
|
|||
errback=self.errback_catcher,
|
||||
endpoint='render.json',
|
||||
meta={'father': self.original_paste},
|
||||
args={ 'html': 1,
|
||||
'wait': 10,
|
||||
'render_all': 1,
|
||||
'har': 1,
|
||||
'png': 1}
|
||||
args=self.arg_crawler
|
||||
)
|
||||
|
||||
def parse(self,response):
|
||||
|
@ -131,6 +133,7 @@ class TorSplashCrawler():
|
|||
relative_filename_paste = os.path.join(self.crawler_path, UUID)
|
||||
filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
|
||||
|
||||
# # TODO: modify me
|
||||
# save new paste on disk
|
||||
if self.save_crawled_paste(filename_paste, response.data['html']):
|
||||
|
||||
|
@ -158,14 +161,16 @@ class TorSplashCrawler():
|
|||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
|
||||
size_screenshot = (len(response.data['png'])*3) /4
|
||||
if 'png' in response.data:
|
||||
size_screenshot = (len(response.data['png'])*3) /4
|
||||
|
||||
if size_screenshot < 5000000: #bytes
|
||||
with open(filename_screenshot, 'wb') as f:
|
||||
f.write(base64.standard_b64decode(response.data['png'].encode()))
|
||||
if size_screenshot < 5000000: #bytes
|
||||
with open(filename_screenshot, 'wb') as f:
|
||||
f.write(base64.standard_b64decode(response.data['png'].encode()))
|
||||
|
||||
with open(filename_screenshot+'har.txt', 'wb') as f:
|
||||
f.write(json.dumps(response.data['har']).encode())
|
||||
if 'har' in response.data:
|
||||
with open(filename_screenshot+'har.txt', 'wb') as f:
|
||||
f.write(json.dumps(response.data['har']).encode())
|
||||
|
||||
# save external links in set
|
||||
#lext = LinkExtractor(deny_domains=self.domains, unique=True)
|
||||
|
@ -181,11 +186,7 @@ class TorSplashCrawler():
|
|||
errback=self.errback_catcher,
|
||||
endpoint='render.json',
|
||||
meta={'father': relative_filename_paste},
|
||||
args={ 'html': 1,
|
||||
'png': 1,
|
||||
'render_all': 1,
|
||||
'har': 1,
|
||||
'wait': 10}
|
||||
args=self.arg_crawler
|
||||
)
|
||||
|
||||
def errback_catcher(self, failure):
|
||||
|
@ -205,11 +206,7 @@ class TorSplashCrawler():
|
|||
errback=self.errback_catcher,
|
||||
endpoint='render.json',
|
||||
meta={'father': father},
|
||||
args={ 'html': 1,
|
||||
'png': 1,
|
||||
'render_all': 1,
|
||||
'har': 1,
|
||||
'wait': 10}
|
||||
args=self.arg_crawler
|
||||
)
|
||||
|
||||
else:
|
||||
|
|
|
@ -6,6 +6,9 @@ import sys
|
|||
import configparser
|
||||
from TorSplashCrawler import TorSplashCrawler
|
||||
|
||||
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
|
||||
default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if len(sys.argv) != 7:
|
||||
|
@ -23,17 +26,17 @@ if __name__ == '__main__':
|
|||
|
||||
splash_url = sys.argv[1]
|
||||
type = sys.argv[2]
|
||||
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
|
||||
|
||||
url = sys.argv[3]
|
||||
domain = sys.argv[4]
|
||||
paste = sys.argv[5]
|
||||
super_father = sys.argv[6]
|
||||
|
||||
if crawler_options is None:
|
||||
crawler_options = default_crawler_options
|
||||
|
||||
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
|
||||
user_agent = tor_browser_agent
|
||||
crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
|
||||
crawler_options['user_agent'] = tor_browser_agent
|
||||
|
||||
closespider_pagecount = 50
|
||||
|
||||
crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
|
||||
crawler.crawl(type, url, domain, paste, super_father)
|
||||
crawler = TorSplashCrawler(splash_url, crawler_options)
|
||||
crawler.crawl(type, crawler_options, url, domain, paste, super_father)
|
||||
|
|
|
@ -2,30 +2,27 @@
|
|||
<html>
|
||||
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
|
||||
<title>Tags - AIL</title>
|
||||
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png') }}">
|
||||
|
||||
<!-- Core CSS -->
|
||||
<link href="{{ url_for('static', filename='css/bootstrap.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='font-awesome/css/font-awesome.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/sb-admin-2.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/dygraph_gallery.css') }}" rel="stylesheet" type="text/css" />
|
||||
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/daterangepicker.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/tags.css') }}" rel="stylesheet" type="text/css" />
|
||||
|
||||
<!-- JS -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script>
|
||||
<script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/jquery.flot.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/jquery.flot.pie.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/jquery.flot.time.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
|
||||
<script language="javascript" src="{{ url_for('static', filename='js/moment.min.js') }}"></script>
|
||||
<script language="javascript" src="{{ url_for('static', filename='js/jquery.daterangepicker.min.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
|
||||
|
||||
</head>
|
||||
<body>
|
||||
|
||||
{% include 'navbar.html' %}
|
||||
{% include 'nav_bar.html' %}
|
||||
|
||||
<div id="page-wrapper">
|
||||
<div class="row">
|
||||
|
|
|
@ -142,6 +142,10 @@ def get_crawler_splash_status(mode, type):
|
|||
def hiddenServices_page_test():
|
||||
return render_template("Crawler_index.html")
|
||||
|
||||
@hiddenServices.route("/crawlers/manual", methods=['GET'])
|
||||
def manual():
|
||||
return render_template("Crawler_Splash_manual.html")
|
||||
|
||||
@hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
|
||||
def crawler_splash_onion():
|
||||
last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
|
||||
|
|
Loading…
Reference in a new issue