From e4e314e28aae99d7eba44a2e0fb0314a17e66470 Mon Sep 17 00:00:00 2001
From: David Cruciani
Date: Fri, 1 Jul 2022 08:40:49 +0200
Subject: [PATCH] chg: [i2p crawler] refactor

---
 bin/Update.py                      |   2 +-
 bin/torcrawler/I2pSplashCrawler.py | 177 +++++++++++------------------
 2 files changed, 66 insertions(+), 113 deletions(-)

diff --git a/bin/Update.py b/bin/Update.py
index 5d6f92dd..5e885301 100755
--- a/bin/Update.py
+++ b/bin/Update.py
@@ -431,7 +431,7 @@ if __name__ == "__main__":
         for upper_tag in list_upper_tags_remote:
             print(' {}{}{}: {}'.format(TERMINAL_BLUE, upper_tag[0], TERMINAL_DEFAULT, upper_tag[1]))
         print()
-        #update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
+        update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
     else:
         print('Please, commit your changes or stash them before you can update AIL')
 
diff --git a/bin/torcrawler/I2pSplashCrawler.py b/bin/torcrawler/I2pSplashCrawler.py
index d088a7ca..5a34291d 100644
--- a/bin/torcrawler/I2pSplashCrawler.py
+++ b/bin/torcrawler/I2pSplashCrawler.py
@@ -324,7 +324,6 @@ class I2pSplashCrawler():
 
 
     def notbob(self, website, process, crawler, reload=False):
-        print(f"Splash_url: {self.splash_url}")
         website = self.process_url(website)
         print("\t" + website)
         if reload:
@@ -357,69 +356,45 @@ class I2pSplashCrawler():
                     print(e)
 
             soup2 = BeautifulSoup(r.content, "html.parser")
-            title = soup2.find_all('title', limit=1)
-            if title:
-                t = str(title[0])
-                t = t[7:]
-                t = t[:-8]
-
-                if t == "Information: New Host Name":
-                    self.notbob(website, process, crawler, reload=True)
-                elif t == "Website Unreachable":
-                    print("Not find with Notbob")
-                    self.i2pjump(website, process, crawler)
-                elif t == "Warning: Destination Key Conflict":
-                    link = soup2.find_all("a", href=True)
-                    for l in link:
-                        if l.get_text() == f'Destination for {website} in address book':
-                            self.regular_request(l["href"], process, crawler)
-                else:
-                    print(t)
-                    print("notbob")
-                    try:
-                        process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                        process.start()
-                    except Exception as e:
-                        print("notbob error process")
-                        print(e)
-
-            else:
-                print("Not find with Notbob")
-                self.i2pjump(website, process, crawler)
+            self.notBobBody(website, process, crawler, soup2)
 
         # Not find, try an other jump server
         else:
             if not dead:
-                title = soup.find_all('title', limit=1)
-                if title:
-                    t = str(title[0])
-                    t = t[7:]
-                    t = t[:-8]
-
-                    if t == "Information: New Host Name":
-                        self.notbob(website, process, crawler, reload=True)
-                    elif t == "Website Unreachable":
-                        print("Not find with Notbob")
-                        self.i2pjump(website, process, crawler)
-                    elif t == "Warning: Destination Key Conflict":
-                        link = soup.find_all("a", href=True)
-                        for l in link:
-                            if l.get_text() == f'Destination for {website} in address book':
-                                self.regular_request(l["href"], process, crawler)
-                    else:
-                        print(t)
-                        print("notbob2")
-                        try:
-                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                            process.start()
-                        except Exception as e:
-                            print("notbob error process")
-                            print(e)
-                else:
-                    print("Not find with Notbob")
-                    self.i2pjump(website, process, crawler)
+                self.notBobBody(website, process, crawler, soup)
             else:
                 print("Not find with Notbob")
                 self.i2pjump(website, process, crawler)
+
+
+    def notBobBody(self, website, process, crawler, soup):
+        """notbob's body"""
+        title = soup.find_all('title', limit=1)
+        if title:
+            t = str(title[0])
+            t = t[7:]
+            t = t[:-8]
+
+            if t == "Information: New Host Name":
+                self.notbob(website, process, crawler, reload=True)
+            elif t == "Website Unreachable":
+                print("Not find with Notbob")
+                self.i2pjump(website, process, crawler)
+            elif t == "Warning: Destination Key Conflict":
+                link = soup.find_all("a", href=True)
+                for l in link:
+                    if l.get_text() == f'Destination for {website} in address book':
+                        self.regular_request(l["href"], process, crawler)
+            else:
+                print(t)
+                try:
+                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                    process.start()
+                except Exception as e:
+                    print("notbob error process")
+                    print(e)
+        else:
+            print("Not find with Notbob")
+            self.i2pjump(website, process, crawler)
 
 
     def i2pjump(self, website, process, crawler, reload=False):
@@ -498,68 +473,45 @@ class I2pSplashCrawler():
                 except Exception as e:
                     print("stati2p error")
                     print(e)
+
                 soup2 = BeautifulSoup(r.content, "html.parser")
-                title = soup2.find_all('title', limit=1)
-
-                if title:
-                    t = str(title[0])
-                    t = t[7:]
-                    t = t[:-8]
-
-                    if t == "Information: New Host Name":
-                        self.statsi2p(website, process, crawler, reload=True)
-                    elif t == "Website Unreachable":
-                        print("Not find with stati2p")
-                        self.regular_request(website, process, crawler)
-                    elif t == "Warning: Destination Key Conflict":
-                        link = soup2.find_all("a", href=True)
-                        for l in link:
-                            if l.get_text() == f'Destination for {website} in address book':
-                                self.regular_request(l["href"], process, crawler)
-                    else:
-                        print(t)
-                        print("stati2p")
-                        try:
-                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                            process.start()
-                        except Exception as e:
-                            print("stati2p error process")
-                            print(e)
-                else:
-                    print("Not find with stati2p")
-                    self.regular_request(website, process, crawler)
+                self.statsi2pBody(website, process, crawler, soup2)
             else:
                 print("Not find with stati2p")
                 self.regular_request(website, process, crawler)
         else:
-            title = soup.find_all('title', limit=1)
-            if title:
-                t = str(title[0])
-                t = t[7:]
-                t = t[:-8]
+            self.statsi2pBody(website, process, crawler, soup)
+
 
-            if t == "Information: New Host Name":
-                self.statsi2p(website, process, crawler, reload=True)
-            elif t == "Website Unreachable":
-                print("Not find with stati2p")
-                self.regular_request(website, process, crawler)
-            elif t == "Warning: Destination Key Conflict":
-                link = soup.find_all("a", href=True)
-                for l in link:
-                    if l.get_text() == f'Destination for {website} in address book':
-                        self.regular_request(l["href"], process, crawler)
-            else:
-                print(t)
-                print("stati2p")
-                try:
-                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                    process.start()
-                except Exception as e:
-                    print("stati2p error process")
-                    print(e)
-            else:
+    def statsi2pBody(self, website, process, crawler, soup):
+        """stati2p's body"""
+        title = soup.find_all('title', limit=1)
+        if title:
+            t = str(title[0])
+            t = t[7:]
+            t = t[:-8]
+
+            if t == "Information: New Host Name":
+                self.statsi2p(website, process, crawler, reload=True)
+            elif t == "Website Unreachable":
                 print("Not find with stati2p")
                 self.regular_request(website, process, crawler)
+            elif t == "Warning: Destination Key Conflict":
+                link = soup.find_all("a", href=True)
+                for l in link:
+                    if l.get_text() == f'Destination for {website} in address book':
+                        self.regular_request(l["href"], process, crawler)
+            else:
+                print(t)
+                try:
+                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                    process.start()
+                except Exception as e:
+                    print("stati2p error process")
+                    print(e)
+        else:
+            print("Not find with stati2p")
+            self.regular_request(website, process, crawler)
 
 
     def regular_request(self, website, process, crawler, reload=False):
@@ -602,6 +554,7 @@ class I2pSplashCrawler():
                 print("Exit...\n\n")
                 crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
 
+
    def process_url(self, url):
        if "http://" == url[0:7]:
            url = url[7:]
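
Note on the shared helper: notbob() and statsi2p() previously carried two near-identical copies of the title-dispatch logic, which this patch extracts into notBobBody() and statsi2pBody(). The title parsing relies on fixed-width slicing of the serialized <title> tag. A minimal standalone sketch of that step, assuming only BeautifulSoup; the sample HTML below is made up, not taken from a real jump service:

    from bs4 import BeautifulSoup

    # Hypothetical response body, standing in for r.content in the crawler.
    html = "<head><title>Website Unreachable</title></head>"
    soup = BeautifulSoup(html, "html.parser")

    title = soup.find_all('title', limit=1)
    if title:
        # str(title[0]) yields '<title>...</title>'; dropping the 7-character
        # opening tag and the 8-character closing tag leaves the bare title,
        # exactly as the patch's t[7:] followed by t[:-8] does.
        t = str(title[0])[7:-8]
        print(t)  # -> Website Unreachable

title[0].get_text() would be more robust if the tag ever carried attributes or extra whitespace, but the sketch keeps the slicing so behaviour matches the patch.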