From e4e314e28aae99d7eba44a2e0fb0314a17e66470 Mon Sep 17 00:00:00 2001
From: David Cruciani
Date: Fri, 1 Jul 2022 08:40:49 +0200
Subject: [PATCH] chg: [i2p crawler] refactor

---
 bin/Update.py                      |   2 +-
 bin/torcrawler/I2pSplashCrawler.py | 177 +++++++++++------------------
 2 files changed, 66 insertions(+), 113 deletions(-)

diff --git a/bin/Update.py b/bin/Update.py
index 5d6f92dd..5e885301 100755
--- a/bin/Update.py
+++ b/bin/Update.py
@@ -431,7 +431,7 @@ if __name__ == "__main__":
         for upper_tag in list_upper_tags_remote:
             print(' {}{}{}: {}'.format(TERMINAL_BLUE, upper_tag[0], TERMINAL_DEFAULT, upper_tag[1]))
         print()
-        #update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
+        update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
     else:
         print('Please, commit your changes or stash them before you can update AIL')
 
diff --git a/bin/torcrawler/I2pSplashCrawler.py b/bin/torcrawler/I2pSplashCrawler.py
index d088a7ca..5a34291d 100644
--- a/bin/torcrawler/I2pSplashCrawler.py
+++ b/bin/torcrawler/I2pSplashCrawler.py
@@ -324,7 +324,6 @@ class I2pSplashCrawler():
 
 
     def notbob(self, website, process, crawler, reload=False):
-        print(f"Splash_url: {self.splash_url}")
         website = self.process_url(website)
         print("\t" + website)
         if reload:
@@ -357,69 +356,45 @@ class I2pSplashCrawler():
                     print(e)
 
             soup2 = BeautifulSoup(r.content, "html.parser")
-            title = soup2.find_all('title', limit=1)
-            if title:
-                t = str(title[0])
-                t = t[7:]
-                t = t[:-8]
-
-                if t == "Information: New Host Name":
-                    self.notbob(website, process, crawler, reload=True)
-                elif t == "Website Unreachable":
-                    print("Not find with Notbob")
-                    self.i2pjump(website, process, crawler)
-                elif t == "Warning: Destination Key Conflict":
-                    link = soup2.find_all("a", href=True)
-                    for l in link:
-                        if l.get_text() == f'Destination for {website} in address book':
-                            self.regular_request(l["href"], process, crawler)
-                else:
-                    print(t)
-                    print("notbob")
-                    try:
-                        process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                        process.start()
-                    except Exception as e:
-                        print("notbob error process")
-                        print(e)
-
-            else:
-                print("Not find with Notbob")
-                self.i2pjump(website, process, crawler)
+            self.notBobBody(website, process, crawler, soup2)
 
         # Not find, try an other jump server
         else:
             if not dead:
-                title = soup.find_all('title', limit=1)
-                if title:
-                    t = str(title[0])
-                    t = t[7:]
-                    t = t[:-8]
-
-                    if t == "Information: New Host Name":
-                        self.notbob(website, process, crawler, reload=True)
-                    elif t == "Website Unreachable":
-                        print("Not find with Notbob")
-                        self.i2pjump(website, process, crawler)
-                    elif t == "Warning: Destination Key Conflict":
-                        link = soup.find_all("a", href=True)
-                        for l in link:
-                            if l.get_text() == f'Destination for {website} in address book':
-                                self.regular_request(l["href"], process, crawler)
-                    else:
-                        print(t)
-                        print("notbob2")
-                        try:
-                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                            process.start()
-                        except Exception as e:
-                            print("notbob error process")
-                            print(e)
-                else:
-                    print("Not find with Notbob")
-                    self.i2pjump(website, process, crawler)
+                self.notBobBody(website, process, crawler, soup)
             else:
                 print("Not find with Notbob")
                 self.i2pjump(website, process, crawler)
+
+
+    def notBobBody(self, website, process, crawler, soup):
+        """notbob's body"""
+        title = soup.find_all('title', limit=1)
+        if title:
+            t = str(title[0])
+            t = t[7:]
+            t = t[:-8]
+
+            if t == "Information: New Host Name":
+                self.notbob(website, process, crawler, reload=True)
+            elif t == "Website Unreachable":
+                print("Not find with Notbob")
+                self.i2pjump(website, process, crawler)
+            elif t == "Warning: Destination Key Conflict":
+                link = soup.find_all("a", href=True)
+                for l in link:
+                    if l.get_text() == f'Destination for {website} in address book':
+                        self.regular_request(l["href"], process, crawler)
+            else:
+                print(t)
+                try:
+                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                    process.start()
+                except Exception as e:
+                    print("notbob error process")
+                    print(e)
+        else:
+            print("Not find with Notbob")
+            self.i2pjump(website, process, crawler)
 
 
     def i2pjump(self, website, process, crawler, reload=False):
@@ -498,68 +473,45 @@ class I2pSplashCrawler():
                 except Exception as e:
                     print("stati2p error")
                     print(e)
+
                 soup2 = BeautifulSoup(r.content, "html.parser")
-                title = soup2.find_all('title', limit=1)
-
-                if title:
-                    t = str(title[0])
-                    t = t[7:]
-                    t = t[:-8]
-
-                    if t == "Information: New Host Name":
-                        self.statsi2p(website, process, crawler, reload=True)
-                    elif t == "Website Unreachable":
-                        print("Not find with stati2p")
-                        self.regular_request(website, process, crawler)
-                    elif t == "Warning: Destination Key Conflict":
-                        link = soup2.find_all("a", href=True)
-                        for l in link:
-                            if l.get_text() == f'Destination for {website} in address book':
-                                self.regular_request(l["href"], process, crawler)
-                    else:
-                        print(t)
-                        print("stati2p")
-                        try:
-                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                            process.start()
-                        except Exception as e:
-                            print("stati2p error process")
-                            print(e)
-                else:
-                    print("Not find with stati2p")
-                    self.regular_request(website, process, crawler)
+                self.statsi2pBody(website, process, crawler, soup2)
             else:
                 print("Not find with stati2p")
                 self.regular_request(website, process, crawler)
         else:
-            title = soup.find_all('title', limit=1)
-            if title:
-                t = str(title[0])
-                t = t[7:]
-                t = t[:-8]
+            self.statsi2pBody(website, process, crawler, soup)
+
 
-            if t == "Information: New Host Name":
-                self.statsi2p(website, process, crawler, reload=True)
-            elif t == "Website Unreachable":
-                print("Not find with stati2p")
-                self.regular_request(website, process, crawler)
-            elif t == "Warning: Destination Key Conflict":
-                link = soup.find_all("a", href=True)
-                for l in link:
-                    if l.get_text() == f'Destination for {website} in address book':
-                        self.regular_request(l["href"], process, crawler)
-            else:
-                print(t)
-                print("stati2p")
-                try:
-                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                    process.start()
-                except Exception as e:
-                    print("stati2p error process")
-                    print(e)
-            else:
+    def statsi2pBody(self, website, process, crawler, soup):
+        """stati2p's body"""
+        title = soup.find_all('title', limit=1)
+        if title:
+            t = str(title[0])
+            t = t[7:]
+            t = t[:-8]
+
+            if t == "Information: New Host Name":
+                self.statsi2p(website, process, crawler, reload=True)
+            elif t == "Website Unreachable":
                 print("Not find with stati2p")
                 self.regular_request(website, process, crawler)
+            elif t == "Warning: Destination Key Conflict":
+                link = soup.find_all("a", href=True)
+                for l in link:
+                    if l.get_text() == f'Destination for {website} in address book':
+                        self.regular_request(l["href"], process, crawler)
+            else:
+                print(t)
+                try:
+                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                    process.start()
+                except Exception as e:
+                    print("stati2p error process")
+                    print(e)
+        else:
+            print("Not find with stati2p")
+            self.regular_request(website, process, crawler)
 
 
     def regular_request(self, website, process, crawler, reload=False):
@@ -602,6 +554,7 @@ class I2pSplashCrawler():
                 print("Exit...\n\n")
                 crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
 
+
    def process_url(self, url):
        if "http://" == url[0:7]:
            url = url[7:]
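
Note on the shared helper: notbob() and statsi2p() previously carried two near-identical copies of the title-dispatch logic, which this patch extracts into notBobBody() and statsi2pBody(). The title parsing relies on fixed-width slicing of the serialized <title> tag. A minimal standalone sketch of that step, assuming only BeautifulSoup; the sample HTML below is made up, not taken from a real jump service:

    from bs4 import BeautifulSoup

    # Hypothetical response body, standing in for r.content in the crawler.
    html = "<head><title>Website Unreachable</title></head>"
    soup = BeautifulSoup(html, "html.parser")

    title = soup.find_all('title', limit=1)
    if title:
        # str(title[0]) yields '<title>...</title>'; dropping the 7-character
        # opening tag and the 8-character closing tag leaves the bare title,
        # exactly as the patch's t[7:] followed by t[:-8] does.
        t = str(title[0])[7:-8]
        print(t)  # -> Website Unreachable

title[0].get_text() would be more robust if the tag ever carried attributes or extra whitespace, but the sketch keeps the slicing so behaviour matches the patch.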