From a4c03b4ba4f24cd53bb700836bf8000d92b0c90f Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 6 May 2019 11:46:20 +0200
Subject: [PATCH] fix: [Crawler] force domains/subdomains lower case (rfc4343)

---
 bin/Crawler.py | 48 ++++++++++++++++++++++++++++++++++++++----------
 bin/Onion.py   |  8 ++++++--
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/bin/Crawler.py b/bin/Crawler.py
index a7e9365b..d8e6a430 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -43,25 +43,49 @@ def unpack_url(url):
     to_crawl = {}
     faup.decode(url)
     url_unpack = faup.get()
-    to_crawl['domain'] = url_unpack['domain'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['domain'] = url_unpack['domain'].decode()
+    except:
+        to_crawl['domain'] = url_unpack['domain']
+    to_crawl['domain'] = to_crawl['domain'].lower()
+
+
+    # force lower case domain/subdomain (rfc4343)
+    # # FIXME: # TODO: remove me
+    try:
+        url_host = url_unpack['host'].decode()
+    except:
+        url_host = url_unpack['host']
+
+    new_url_host = url_host.lower()
+    url_lower_case = url.replace(url_host, new_url_host, 1)
 
     if url_unpack['scheme'] is None:
         to_crawl['scheme'] = 'http'
-        url= 'http://{}'.format(url_unpack['url'].decode())
+        url= 'http://{}'.format(url_lower_case)
     else:
-        scheme = url_unpack['scheme'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            scheme = url_unpack['scheme'].decode()
+        except Exception as e:
+            scheme = url_unpack['scheme']
         if scheme in default_proto_map:
             to_crawl['scheme'] = scheme
-            url = url_unpack['url'].decode()
+            url = url_lower_case
         else:
-            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_unpack['url'].decode()))
+            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
             to_crawl['scheme'] = 'http'
-            url= 'http://{}'.format(url_unpack['url'].decode().replace(scheme, '', 1))
+            url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
 
     if url_unpack['port'] is None:
         to_crawl['port'] = default_proto_map[to_crawl['scheme']]
     else:
-        port = url_unpack['port'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            port = url_unpack['port'].decode()
+        except:
+            port = url_unpack['port']
         # Verify port number #################### make function to verify/correct port number
         try:
             int(port)
@@ -80,12 +104,16 @@ def unpack_url(url):
 
     to_crawl['url'] = url
     if to_crawl['port'] == 80:
-        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
+        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
     else:
-        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
+        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
 
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['tld'] = url_unpack['tld'].decode()
+    except:
+        to_crawl['tld'] = url_unpack['tld']
 
-    to_crawl['tld'] = url_unpack['tld'].decode()
     return to_crawl
 
 # get url, paste and service_type to crawl
diff --git a/bin/Onion.py b/bin/Onion.py
index cbe8bb9f..2aa56d8e 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -224,7 +224,11 @@ if __name__ == "__main__":
 
                     faup.decode(url)
                     url_unpack = faup.get()
-                    domain = url_unpack['domain'].decode()
+                    ## TODO: # FIXME: remove me
+                    try:
+                        domain = url_unpack['domain'].decode().lower()
+                    except Exception as e:
+                        domain = url_unpack['domain'].lower()
 
                     ## TODO: blackilst by port ?
                     # check blacklist
@@ -233,7 +237,7 @@ if __name__ == "__main__":
 
                     subdomain = re.findall(url_regex, url)
                     if len(subdomain) > 0:
-                        subdomain = subdomain[0][4]
+                        subdomain = subdomain[0][4].lower()
                     else:
                         continue
 
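
Side note, not part of the patch: every hunk above applies the same decode-then-lowercase pattern, because the fields returned by faup.get() may come back as bytes or as an already-decoded str depending on the pyfaup build, and RFC 4343 makes DNS names case-insensitive. A minimal standalone sketch of that pattern follows; the force_lower() helper is illustrative only and does not exist in AIL:

    # Sketch only: force_lower() is a hypothetical helper, not part of this patch.
    def force_lower(value):
        """Return a lower-cased str from a faup field that may be bytes or str."""
        if isinstance(value, bytes):
            value = value.decode()
        return value.lower()

    # Mirrors the pattern used in bin/Crawler.py and bin/Onion.py:
    #   faup.decode(url)
    #   url_unpack = faup.get()
    #   domain = force_lower(url_unpack['domain'])  # rfc4343: hostnames are case-insensitive
    assert force_lower(b'ExAmple.ONION') == 'example.onion'
    assert force_lower('Example.Onion') == 'example.onion'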