chg: [Crawler] refactor

2024-11-27 00:07:16 +00:00 · 2019-02-21 09:54:43 +01:00 · 2019-02-21 09:54:43 +01:00 · e5dca268a8
commit e5dca268a8
parent 0832784f7a
5 changed files with 61 additions and 65 deletions
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@ -79,7 +79,7 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
        r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
        r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)

-def crawl_onion(url, domain, date, date_month, message):
+def crawl_onion(url, domain, date, date_month, message, mode):

    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))
@ -166,7 +166,8 @@ if __name__ == '__main__':
        publisher.info("Script Crawler started")

    # load domains blacklist
-    load_type_blacklist(type_hidden_service)
+    load_type_blacklist('onions')
+    load_type_blacklist('regular')

    splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"),  splash_port)
    print('splash url: {}'.format(splash_url))
@ -180,16 +181,15 @@ if __name__ == '__main__':
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S"))

-
    while True:

        if mode == 'automatic':
            # Priority Queue - Recovering the streamed message informations.
            message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))
-
+            # Recovering the streamed message informations.
            if message is None:
-                # Recovering the streamed message informations.
                message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+            
        else:
            pass

@ -244,16 +244,16 @@ if __name__ == '__main__':
                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)

                        # Launch Scrapy-Splash Crawler
-                        crawl_onion(url, domain, date, date_month, message)
+                        crawl_onion(url, domain, date, date_month, message, mode)
                        # Crawl Domain
                        if url != domain_url:
                            #Crawl Domain with port number
                            if port is not None:
                                print('{}:{}'.format(domain_url, port))
-                                crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
+                                crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message, mode)
                            #Crawl without port number
                            print(domain_url)
-                            crawl_onion(domain_url, domain, date, date_month, message)
+                            crawl_onion(domain_url, domain, date, date_month, message, mode)

                        # update last check
                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@ -293,14 +293,9 @@ if __name__ == '__main__':
                            r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
                            print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

-                            # update list, last crawled sites
-                            r_onion.lpush('last_{}'.format(type_hidden_service), domain)
-                            r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
-                        # manual
-                        else:
-                            # update list, last crawled sites
-                            r_onion.lpush('last_crawled_manual', domain)
-                            r_onion.ltrim('last_crawled_manual', 0, 15)
+                        # update list, last crawled sites
+                        r_onion.lpush('last_{}'.format(type_hidden_service), domain)
+                        r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)

                        #update crawler status
                        r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@ -28,10 +28,10 @@ from Helper import Process

 class TorSplashCrawler():

-    def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
+    def __init__(self, splash_url, crawler_options):
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
-            'USER_AGENT': user_agent,
+            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@ -42,18 +42,18 @@ class TorSplashCrawler():
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
-            'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
-            'DEPTH_LIMIT': crawler_depth_limit
+            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
+            'DEPTH_LIMIT': crawler_options['depth_limit']
            })

-    def crawl(self, type, url, domain, original_paste, super_father):
-        self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+    def crawl(self, type, crawler_options, url, domain, original_paste, super_father):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

-        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
+        def __init__(self, type, crawler_options, url, domain,original_paste, super_father, *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
@ -63,6 +63,12 @@ class TorSplashCrawler():
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

+            self.arg_crawler = {  'html': crawler_options['html'],
+                                  'wait': 10,
+                                  'render_all': 1,
+                                  'har': crawler_options['har'],
+                                  'png': crawler_options['png']}
+
            config_section = 'Crawler'
            self.p = Process(config_section)

@ -104,11 +110,7 @@ class TorSplashCrawler():
                errback=self.errback_catcher,
                endpoint='render.json',
                meta={'father': self.original_paste},
-                args={  'html': 1,
-                        'wait': 10,
-                        'render_all': 1,
-                        'har': 1,
-                        'png': 1}
+                args=self.arg_crawler
            )

        def parse(self,response):
@ -131,6 +133,7 @@ class TorSplashCrawler():
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

+                # # TODO: modify me
                # save new paste on disk
                if self.save_crawled_paste(filename_paste, response.data['html']):

@ -158,14 +161,16 @@ class TorSplashCrawler():
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)

-                    size_screenshot = (len(response.data['png'])*3) /4
+                    if 'png' in response.data:
+                        size_screenshot = (len(response.data['png'])*3) /4

-                    if size_screenshot < 5000000: #bytes
-                        with open(filename_screenshot, 'wb') as f:
-                            f.write(base64.standard_b64decode(response.data['png'].encode()))
+                        if size_screenshot < 5000000: #bytes
+                            with open(filename_screenshot, 'wb') as f:
+                                f.write(base64.standard_b64decode(response.data['png'].encode()))

-                    with open(filename_screenshot+'har.txt', 'wb') as f:
-                        f.write(json.dumps(response.data['har']).encode())
+                    if 'har' in response.data:
+                        with open(filename_screenshot+'har.txt', 'wb') as f:
+                            f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
@ -181,11 +186,7 @@ class TorSplashCrawler():
                            errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste},
-                            args={  'html': 1,
-                                    'png': 1,
-                                    'render_all': 1,
-                                    'har': 1,
-                                    'wait': 10}
+                            args=self.arg_crawler
                        )

        def errback_catcher(self, failure):
@ -205,11 +206,7 @@ class TorSplashCrawler():
                    errback=self.errback_catcher,
                    endpoint='render.json',
                    meta={'father': father},
-                    args={  'html': 1,
-                            'png': 1,
-                            'render_all': 1,
-                            'har': 1,
-                            'wait': 10}
+                    args=self.arg_crawler
                )

            else:
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@ -6,6 +6,9 @@ import sys
 import configparser
 from TorSplashCrawler import TorSplashCrawler

+tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
+default_crawler_options = {'html': 1, 'har': 1, 'png': 1, 'closespider_pagecount': 50}
+
 if __name__ == '__main__':

    if len(sys.argv) != 7:
@ -23,17 +26,17 @@ if __name__ == '__main__':

    splash_url = sys.argv[1]
    type = sys.argv[2]
-    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")

    url = sys.argv[3]
    domain = sys.argv[4]
    paste = sys.argv[5]
    super_father = sys.argv[6]
+    
+    if crawler_options is None:
+        crawler_options = default_crawler_options

-    tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
-    user_agent = tor_browser_agent
+    crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
+    crawler_options['user_agent'] = tor_browser_agent

-    closespider_pagecount = 50
-
-    crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
-    crawler.crawl(type, url, domain, paste, super_father)
+    crawler = TorSplashCrawler(splash_url, crawler_options)
+    crawler.crawl(type, crawler_options, url, domain, paste, super_father)
--- a/var/www/modules/Tags/templates/Tags.html
+++ b/var/www/modules/Tags/templates/Tags.html
@ -2,30 +2,27 @@
 <html>

 	<head>
-	  <meta charset="utf-8">
-	  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-
 	  <title>Tags - AIL</title>
 		<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png') }}">

 	  <!-- Core CSS -->
-	  <link href="{{ url_for('static', filename='css/bootstrap.min.css') }}" rel="stylesheet">
-	  <link href="{{ url_for('static', filename='font-awesome/css/font-awesome.css') }}" rel="stylesheet">
-	  <link href="{{ url_for('static', filename='css/sb-admin-2.css') }}" rel="stylesheet">
-	  <link href="{{ url_for('static', filename='css/dygraph_gallery.css') }}" rel="stylesheet" type="text/css" />
+		<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
+		<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
+		<link href="{{ url_for('static', filename='css/daterangepicker.min.css') }}" rel="stylesheet">
 		<link href="{{ url_for('static', filename='css/tags.css') }}" rel="stylesheet" type="text/css" />
+
 	  <!-- JS -->
-	  <script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script>
-	  <script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script>
-          <script src="{{ url_for('static', filename='js/jquery.flot.js') }}"></script>
-          <script src="{{ url_for('static', filename='js/jquery.flot.pie.js') }}"></script>
-          <script src="{{ url_for('static', filename='js/jquery.flot.time.js') }}"></script>
-					<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
+		<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
+		<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
+		<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
+		<script language="javascript" src="{{ url_for('static', filename='js/moment.min.js') }}"></script>
+		<script language="javascript" src="{{ url_for('static', filename='js/jquery.daterangepicker.min.js') }}"></script>
+		<script src="{{ url_for('static', filename='js/tags.js') }}"></script>

 	</head>
 	<body>

-            {% include 'navbar.html' %}
+            {% include 'nav_bar.html' %}

    <div id="page-wrapper">
 			<div class="row">
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@ -142,6 +142,10 @@ def get_crawler_splash_status(mode, type):
 def hiddenServices_page_test():
    return render_template("Crawler_index.html")

+@hiddenServices.route("/crawlers/manual", methods=['GET'])
+def manual():
+    return render_template("Crawler_Splash_manual.html")
+
@hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
 def crawler_splash_onion():
    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)