fix: [Crawler] force domains/subdomains lower case (rfc4343)

Terrtia 2019-05-06 11:46:20 +02:00
parent cc61c99290
commit a4c03b4ba4
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
2 changed files with 44 additions and 12 deletions
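
RFC 4343 makes DNS names case-insensitive, so the crawler now lower-cases the domain and the host part of every queued URL to avoid treating 'Example.onion' and 'example.onion' as two different sites. A minimal sketch of the normalization idea, using the pyfaup decoder the crawler already relies on (the normalize_url helper is hypothetical, not part of this commit):

from pyfaup.faup import Faup

faup = Faup()

def normalize_url(url):
    # Hypothetical helper: lower-case only the host portion of the URL
    # (RFC 4343), leaving the path and query string untouched.
    faup.decode(url)
    url_unpack = faup.get()
    host = url_unpack['host']
    if isinstance(host, bytes):  # some pyfaup versions return bytes
        host = host.decode()
    return url.replace(host, host.lower(), 1)

# normalize_url('http://ExAmPlE.onion/Some/Path') -> 'http://example.onion/Some/Path'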


@@ -43,25 +43,49 @@ def unpack_url(url):
     to_crawl = {}
     faup.decode(url)
     url_unpack = faup.get()
-    to_crawl['domain'] = url_unpack['domain'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['domain'] = url_unpack['domain'].decode()
+    except:
+        to_crawl['domain'] = url_unpack['domain']
+    to_crawl['domain'] = to_crawl['domain'].lower()
+
+    # force lower case domain/subdomain (rfc4343)
+    # # FIXME: # TODO: remove me
+    try:
+        url_host = url_unpack['host'].decode()
+    except:
+        url_host = url_unpack['host']
+    new_url_host = url_host.lower()
+    url_lower_case = url.replace(url_host, new_url_host, 1)
 
     if url_unpack['scheme'] is None:
         to_crawl['scheme'] = 'http'
-        url= 'http://{}'.format(url_unpack['url'].decode())
+        url= 'http://{}'.format(url_lower_case)
     else:
-        scheme = url_unpack['scheme'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            scheme = url_unpack['scheme'].decode()
+        except Exception as e:
+            scheme = url_unpack['scheme']
         if scheme in default_proto_map:
             to_crawl['scheme'] = scheme
-            url = url_unpack['url'].decode()
+            url = url_lower_case
         else:
-            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_unpack['url'].decode()))
+            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
             to_crawl['scheme'] = 'http'
-            url= 'http://{}'.format(url_unpack['url'].decode().replace(scheme, '', 1))
+            url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
 
     if url_unpack['port'] is None:
         to_crawl['port'] = default_proto_map[to_crawl['scheme']]
     else:
-        port = url_unpack['port'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            port = url_unpack['port'].decode()
+        except:
+            port = url_unpack['port']
         # Verify port number #################### make function to verify/correct port number
         try:
             int(port)
@@ -80,12 +104,16 @@ def unpack_url(url):
     to_crawl['url'] = url
     if to_crawl['port'] == 80:
-        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
+        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
     else:
-        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
+        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
-    to_crawl['tld'] = url_unpack['tld'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['tld'] = url_unpack['tld'].decode()
+    except:
+        to_crawl['tld'] = url_unpack['tld']
     return to_crawl
 
 # get url, paste and service_type to crawl


@@ -224,7 +224,11 @@ if __name__ == "__main__":
             faup.decode(url)
             url_unpack = faup.get()
-            domain = url_unpack['domain'].decode()
+            ## TODO: # FIXME: remove me
+            try:
+                domain = url_unpack['domain'].decode().lower()
+            except Exception as e:
+                domain = url_unpack['domain'].lower()
 
             ## TODO: blackilst by port ?
             # check blacklist
@@ -233,7 +237,7 @@ if __name__ == "__main__":
             subdomain = re.findall(url_regex, url)
             if len(subdomain) > 0:
-                subdomain = subdomain[0][4]
+                subdomain = subdomain[0][4].lower()
             else:
                 continue
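
Lower-casing the domain (and the subdomain captured by the regex) before the blacklist check matters because Redis set membership is byte-exact. A small illustration of the mismatch the fix prevents (the key name below is hypothetical):

import redis

r = redis.Redis(decode_responses=True)
r.sadd('blacklist_onion', 'example.onion')               # blacklist entries are stored lower case
r.sismember('blacklist_onion', 'Example.ONION')          # False: a case mismatch would bypass the blacklist
r.sismember('blacklist_onion', 'Example.ONION'.lower())  # True once the domain is normalized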