Mirror of https://github.com/ail-project/ail-framework.git
Synced 2024-11-10 08:38:28 +00:00
fix: [Crawler] force domains/subdomains lower case (rfc4343)
Commit a4c03b4ba4 (parent cc61c99290)

2 changed files with 44 additions and 12 deletions
@@ -43,25 +43,49 @@ def unpack_url(url):
     to_crawl = {}
     faup.decode(url)
     url_unpack = faup.get()
-    to_crawl['domain'] = url_unpack['domain'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['domain'] = url_unpack['domain'].decode()
+    except:
+        to_crawl['domain'] = url_unpack['domain']
+    to_crawl['domain'] = to_crawl['domain'].lower()
+
+
+    # force lower case domain/subdomain (rfc4343)
+    # # FIXME: # TODO: remove me
+    try:
+        url_host = url_unpack['host'].decode()
+    except:
+        url_host = url_unpack['host']
+
+    new_url_host = url_host.lower()
+    url_lower_case = url.replace(url_host, new_url_host, 1)
+
     if url_unpack['scheme'] is None:
         to_crawl['scheme'] = 'http'
-        url= 'http://{}'.format(url_unpack['url'].decode())
+        url= 'http://{}'.format(url_lower_case)
     else:
-        scheme = url_unpack['scheme'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            scheme = url_unpack['scheme'].decode()
+        except Exception as e:
+            scheme = url_unpack['scheme']
         if scheme in default_proto_map:
             to_crawl['scheme'] = scheme
-            url = url_unpack['url'].decode()
+            url = url_lower_case
         else:
-            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_unpack['url'].decode()))
+            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
             to_crawl['scheme'] = 'http'
-            url= 'http://{}'.format(url_unpack['url'].decode().replace(scheme, '', 1))
+            url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))

     if url_unpack['port'] is None:
         to_crawl['port'] = default_proto_map[to_crawl['scheme']]
     else:
-        port = url_unpack['port'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            port = url_unpack['port'].decode()
+        except:
+            port = url_unpack['port']
         # Verify port number #################### make function to verify/correct port number
         try:
             int(port)
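In isolation, the pattern this hunk applies looks like the sketch below: faup fields may come back as bytes or as an already-decoded str depending on the pyfaup version, so the code decodes defensively, then lower-cases only the host, since RFC 4343 makes DNS names case-insensitive while paths and queries may not be. The helper names here are illustrative, not part of the AIL codebase.

    def decode_field(value):
        # faup.get() may hand back bytes (needing .decode()) or a plain str
        return value.decode() if isinstance(value, bytes) else value

    def force_lower_host(url, host):
        # Lower-case only the first occurrence of the host; path/query
        # casing can be significant and is left untouched.
        return url.replace(host, host.lower(), 1)

    print(force_lower_host('http://ExAmPlE.onion/PaGe', 'ExAmPlE.onion'))
    # -> http://example.onion/PaGe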
@@ -80,12 +104,16 @@ def unpack_url(url):

     to_crawl['url'] = url
     if to_crawl['port'] == 80:
-        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
+        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
     else:
-        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
+        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])

-    to_crawl['tld'] = url_unpack['tld'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['tld'] = url_unpack['tld'].decode()
+    except:
+        to_crawl['tld'] = url_unpack['tld']

     return to_crawl

 # get url, paste and service_type to crawl
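How domain_url is assembled after this change, as a minimal sketch (build_domain_url is an illustrative name; the port-80 special case mirrors the hunk, which omits the default HTTP port from the canonical domain URL):

    def build_domain_url(scheme, host, port):
        # Reuse the lowered host; omit port 80 as the hunk does.
        host = host.lower()
        if port == 80:
            return '{}://{}'.format(scheme, host)
        return '{}://{}:{}'.format(scheme, host, port)

    print(build_domain_url('http', 'ExAmPle.onion', 80))    # http://example.onion
    print(build_domain_url('http', 'example.onion', 8080))  # http://example.onion:8080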
@@ -224,7 +224,11 @@ if __name__ == "__main__":

             faup.decode(url)
             url_unpack = faup.get()
-            domain = url_unpack['domain'].decode()
+            ## TODO: # FIXME: remove me
+            try:
+                domain = url_unpack['domain'].decode().lower()
+            except Exception as e:
+                domain = url_unpack['domain'].lower()

             ## TODO: blackilst by port ?
             # check blacklist
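The lower-casing here lands just before the blacklist check, where it matters: Python set membership is case-sensitive, so a mixed-case onion domain would slip past a lower-case blacklist. A sketch (the blacklist content is made up):

    blacklist = {'example.onion'}

    for candidate in ('example.onion', 'EXAMPLE.onion', 'ExAmPlE.oNiOn'):
        normalized = candidate.lower()
        # Without .lower(), only the first candidate would be blocked.
        print(candidate, '->', 'blocked' if normalized in blacklist else 'allowed')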
@@ -233,7 +237,7 @@ if __name__ == "__main__":

             subdomain = re.findall(url_regex, url)
             if len(subdomain) > 0:
-                subdomain = subdomain[0][4]
+                subdomain = subdomain[0][4].lower()
             else:
                 continue

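The same normalization is applied to the subdomain captured by url_regex (group index 4 in AIL's pattern). The findall-then-index idiom it relies on, shown with a simplified stand-in regex since the real url_regex is defined elsewhere in the module:

    import re

    # Toy pattern with a single capture group for the subdomain; AIL's real
    # url_regex has more groups, with the subdomain at index 4.
    toy_regex = r'([A-Za-z0-9-]+)\.[a-z0-9-]+\.onion'

    found = re.findall(toy_regex, 'http://MySub.example.onion/page')
    if len(found) > 0:
        print(found[0].lower())   # -> mysub, via the .lower() this hunk adds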