Mirror of https://github.com/ail-project/ail-framework.git, synced 2024-11-22 22:27:17 +00:00
chg: [i2p crawler] refactor
commit e4e314e28a (parent f5ac98bcf0)
2 changed files with 66 additions and 113 deletions
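In short: the duplicated <title>-handling blocks in notbob() and statsi2p() are extracted into two new helpers, notBobBody() and statsi2pBody(), which take an already-parsed BeautifulSoup document; the callers now only build the soup and delegate. Below is a minimal sketch of that extracted-helper pattern, under stated assumptions: TitleRouter, handle_title and route are illustrative names, not part of the AIL code, and the branches return labels instead of launching the Splash crawl.

    from bs4 import BeautifulSoup

    class TitleRouter:
        """Illustration of the shared-body refactor in this commit: both callers
        parse HTML, then delegate the <title>-based routing to one method instead
        of duplicating the if/elif chain (hypothetical names, not AIL's API)."""

        def handle_title(self, soup):
            # Shared body, analogous to notBobBody()/statsi2pBody()
            title = soup.find_all('title', limit=1)
            if not title:
                return "fallback"                    # no title -> try the next jump service
            t = str(title[0])[7:-8]                  # strip "<title>" and "</title>", as the diff does
            if t == "Information: New Host Name":
                return "reload"                      # real code calls itself again with reload=True
            elif t == "Website Unreachable":
                return "fallback"                    # real code falls through to i2pjump()/regular_request()
            elif t == "Warning: Destination Key Conflict":
                return "addressbook"                 # real code follows the address-book link
            return "crawl"                           # real code starts the Splash crawl

        def route(self, website, html):
            # Callers only build the soup and delegate, as notbob()/statsi2p() now do
            print("\t" + website)
            soup = BeautifulSoup(html, "html.parser")
            return self.handle_title(soup)

    if __name__ == "__main__":
        html = "<html><title>Website Unreachable</title></html>"
        print(TitleRouter().route("example.i2p", html))   # -> fallback

Passing the parsed soup rather than a URL keeps the helper free of HTTP logic, which is why both the reload path (soup2) and the original-response path (soup) can share it.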
@@ -431,7 +431,7 @@ if __name__ == "__main__":
         for upper_tag in list_upper_tags_remote:
             print(' {}{}{}: {}'.format(TERMINAL_BLUE, upper_tag[0], TERMINAL_DEFAULT, upper_tag[1]))
         print()
-        #update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
+        update_ail(current_tag, list_upper_tags_remote, current_version_path, is_fork)
 
     else:
         print('Please, commit your changes or stash them before you can update AIL')
@@ -324,7 +324,6 @@ class I2pSplashCrawler():
 
 
     def notbob(self, website, process, crawler, reload=False):
-        print(f"Splash_url: {self.splash_url}")
        website = self.process_url(website)
        print("\t" + website)
        if reload:
@@ -357,69 +356,45 @@ class I2pSplashCrawler():
                 print(e)
 
             soup2 = BeautifulSoup(r.content, "html.parser")
-            title = soup2.find_all('title', limit=1)
-            if title:
-                t = str(title[0])
-                t = t[7:]
-                t = t[:-8]
-
-                if t == "Information: New Host Name":
-                    self.notbob(website, process, crawler, reload=True)
-                elif t == "Website Unreachable":
-                    print("Not find with Notbob")
-                    self.i2pjump(website, process, crawler)
-                elif t == "Warning: Destination Key Conflict":
-                    link = soup2.find_all("a", href=True)
-                    for l in link:
-                        if l.get_text() == f'Destination for {website} in address book':
-                            self.regular_request(l["href"], process, crawler)
-                else:
-                    print(t)
-                    print("notbob")
-                    try:
-                        process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                        process.start()
-                    except Exception as e:
-                        print("notbob error process")
-                        print(e)
-
-            else:
-                print("Not find with Notbob")
-                self.i2pjump(website, process, crawler)
+            self.notBobBody(website, process, crawler, soup2)
         # Not find, try an other jump server
         else:
             if not dead:
-                title = soup.find_all('title', limit=1)
-                if title:
-                    t = str(title[0])
-                    t = t[7:]
-                    t = t[:-8]
-
-                    if t == "Information: New Host Name":
-                        self.notbob(website, process, crawler, reload=True)
-                    elif t == "Website Unreachable":
-                        print("Not find with Notbob")
-                        self.i2pjump(website, process, crawler)
-                    elif t == "Warning: Destination Key Conflict":
-                        link = soup.find_all("a", href=True)
-                        for l in link:
-                            if l.get_text() == f'Destination for {website} in address book':
-                                self.regular_request(l["href"], process, crawler)
-                    else:
-                        print(t)
-                        print("notbob2")
-                        try:
-                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                            process.start()
-                        except Exception as e:
-                            print("notbob error process")
-                            print(e)
-                else:
-                    print("Not find with Notbob")
-                    self.i2pjump(website, process, crawler)
+                self.notBobBody(website, process, crawler, soup)
             else:
                 print("Not find with Notbob")
                 self.i2pjump(website, process, crawler)
 
 
+    def notBobBody(self, website, process, crawler, soup):
+        """notbob's body"""
+        title = soup.find_all('title', limit=1)
+        if title:
+            t = str(title[0])
+            t = t[7:]
+            t = t[:-8]
+
+            if t == "Information: New Host Name":
+                self.notbob(website, process, crawler, reload=True)
+            elif t == "Website Unreachable":
+                print("Not find with Notbob")
+                self.i2pjump(website, process, crawler)
+            elif t == "Warning: Destination Key Conflict":
+                link = soup.find_all("a", href=True)
+                for l in link:
+                    if l.get_text() == f'Destination for {website} in address book':
+                        self.regular_request(l["href"], process, crawler)
+            else:
+                print(t)
+                try:
+                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                    process.start()
+                except Exception as e:
+                    print("notbob error process")
+                    print(e)
+        else:
+            print("Not find with Notbob")
+            self.i2pjump(website, process, crawler)
+
+
     def i2pjump(self, website, process, crawler, reload=False):
@@ -498,68 +473,45 @@ class I2pSplashCrawler():
                 except Exception as e:
                     print("stati2p error")
                     print(e)
 
                 soup2 = BeautifulSoup(r.content, "html.parser")
-                title = soup2.find_all('title', limit=1)
-                if title:
-                    t = str(title[0])
-                    t = t[7:]
-                    t = t[:-8]
-
-                    if t == "Information: New Host Name":
-                        self.statsi2p(website, process, crawler, reload=True)
-                    elif t == "Website Unreachable":
-                        print("Not find with stati2p")
-                        self.regular_request(website, process, crawler)
-                    elif t == "Warning: Destination Key Conflict":
-                        link = soup2.find_all("a", href=True)
-                        for l in link:
-                            if l.get_text() == f'Destination for {website} in address book':
-                                self.regular_request(l["href"], process, crawler)
-                    else:
-                        print(t)
-                        print("stati2p")
-                        try:
-                            process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                            process.start()
-                        except Exception as e:
-                            print("stati2p error process")
-                            print(e)
-                else:
-                    print("Not find with stati2p")
-                    self.regular_request(website, process, crawler)
+                self.statsi2pBody(website, process, crawler, soup2)
             else:
                 print("Not find with stati2p")
                 self.regular_request(website, process, crawler)
         else:
-            title = soup.find_all('title', limit=1)
-            if title:
-                t = str(title[0])
-                t = t[7:]
-                t = t[:-8]
-
-                if t == "Information: New Host Name":
-                    self.statsi2p(website, process, crawler, reload=True)
-                elif t == "Website Unreachable":
-                    print("Not find with stati2p")
-                    self.regular_request(website, process, crawler)
-                elif t == "Warning: Destination Key Conflict":
-                    link = soup.find_all("a", href=True)
-                    for l in link:
-                        if l.get_text() == f'Destination for {website} in address book':
-                            self.regular_request(l["href"], process, crawler)
-                else:
-                    print(t)
-                    print("stati2p")
-                    try:
-                        process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
-                        process.start()
-                    except Exception as e:
-                        print("stati2p error process")
-                        print(e)
-            else:
-                print("Not find with stati2p")
-                self.regular_request(website, process, crawler)
+            self.statsi2pBody(website, process, crawler, soup)
+
+    def statsi2pBody(self, website, process, crawler, soup):
+        """stati2p's body"""
+        title = soup.find_all('title', limit=1)
+        if title:
+            t = str(title[0])
+            t = t[7:]
+            t = t[:-8]
+
+            if t == "Information: New Host Name":
+                self.statsi2p(website, process, crawler, reload=True)
+            elif t == "Website Unreachable":
+                print("Not find with stati2p")
+                self.regular_request(website, process, crawler)
+            elif t == "Warning: Destination Key Conflict":
+                link = soup.find_all("a", href=True)
+                for l in link:
+                    if l.get_text() == f'Destination for {website} in address book':
+                        self.regular_request(l["href"], process, crawler)
+            else:
+                print(t)
+                try:
+                    process.crawl(crawler, splash_url=self.splash_url, type=self.domain_type, crawler_options=self.crawler_options, date=self.date, requested_mode=self.requested_mode, url=self.start_urls, domain=self.domains[0], port=self.port, cookies=self.cookies, original_item=self.original_item)
+                    process.start()
+                except Exception as e:
+                    print("stati2p error process")
+                    print(e)
+        else:
+            print("Not find with stati2p")
+            self.regular_request(website, process, crawler)
 
 
     def regular_request(self, website, process, crawler, reload=False):
@@ -602,6 +554,7 @@ class I2pSplashCrawler():
             print("Exit...\n\n")
             crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
 
+
     def process_url(self, url):
         if "http://" == url[0:7]:
             url = url[7:]