mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-30 09:47:17 +00:00
chg: [Crawler] add bootstrap4 src + refactor crawler
This commit is contained in:
parent
423c7b1455
commit
516238025f
6 changed files with 106 additions and 57 deletions
133
bin/Crawler.py
133
bin/Crawler.py
|
@ -16,6 +16,24 @@ sys.path.append(os.environ['AIL_BIN'])
|
||||||
from Helper import Process
|
from Helper import Process
|
||||||
from pubsublogger import publisher
|
from pubsublogger import publisher
|
||||||
|
|
||||||
|
def decode_val(value):
    """Decode ``value`` with its ``decode()`` method, passing ``None`` through.

    :param value: object exposing ``decode()`` (e.g. ``bytes``), or ``None``.
    :return: ``value.decode()``, or ``None`` when ``value`` is ``None``.
    """
    # Guard clause: a missing field stays None instead of being decoded.
    if value is None:
        return None
    return value.decode()
|
||||||
|
|
||||||
|
def load_type_blacklist(type_service):
    """Reload the domain blacklist of one crawler type into the ``r_onion`` store.

    Reads ``$AIL_BIN/torcrawler/blacklist_<type_service>.txt`` (one entry per
    line), deletes the existing ``blacklist_<type_service>`` set and adds every
    line of the file to it.

    Loading is best-effort: any failure (missing file, store error, ...) is
    reported on stdout and the function returns normally.

    :param type_service: crawler type used to build both the file name and the
        set key (the caller in this file passes ``type_hidden_service``).

    NOTE(review): relies on a module-level ``r_onion`` client exposing
    ``delete`` and ``sadd``; it is not defined inside this function.
    """
    # Every component must be relative: os.path.join() throws away all previous
    # components as soon as one of them is absolute, so a leading '/' here
    # would silently drop the AIL_BIN prefix.
    blacklist_path = os.path.join(os.environ['AIL_BIN'], 'torcrawler',
                                  'blacklist_{}.txt'.format(type_service))
    blacklist_key = 'blacklist_{}'.format(type_service)
    try:
        with open(blacklist_path, 'r') as f:
            # TODO / FIXME (from the original author): remove this reset.
            # It only runs once the file has been opened, so a missing file
            # leaves the previously stored blacklist untouched.
            r_onion.delete(blacklist_key)
            for line in f.read().splitlines():
                r_onion.sadd(blacklist_key, line)
    except Exception as e:
        # Stay best-effort, but make the failure visible instead of hiding it.
        print('blacklist_{} not loaded: {}'.format(type_service, e))
|
||||||
|
|
||||||
|
|
||||||
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
||||||
# send this msg back in the queue
|
# send this msg back in the queue
|
||||||
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
||||||
|
@ -91,12 +109,16 @@ def crawl_onion(url, domain, date, date_month, message):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
|
#print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
|
||||||
|
print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
type_hidden_service = sys.argv[1]
|
mode = sys.argv[1]
|
||||||
splash_port = sys.argv[2]
|
splash_port = sys.argv[2]
|
||||||
|
|
||||||
|
if mode == 'automatic':
|
||||||
|
type_hidden_service = 'onion'
|
||||||
|
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
publisher.channel = "Script"
|
publisher.channel = "Script"
|
||||||
|
|
||||||
|
@ -107,6 +129,16 @@ if __name__ == '__main__':
|
||||||
# Setup the I/O queues
|
# Setup the I/O queues
|
||||||
p = Process(config_section)
|
p = Process(config_section)
|
||||||
|
|
||||||
|
accepted_services = ['onion', 'regular']
|
||||||
|
|
||||||
|
dic_regex = {}
|
||||||
|
dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
|
re.compile(dic_regex['onion'])
|
||||||
|
dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
|
re.compile(dic_regex['i2p'])
|
||||||
|
dic_regex['regular'] = dic_regex['i2p']
|
||||||
|
|
||||||
|
|
||||||
url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
re.compile(url_onion)
|
re.compile(url_onion)
|
||||||
url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
|
@ -114,17 +146,15 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
if type_hidden_service == 'onion':
|
if type_hidden_service == 'onion':
|
||||||
regex_hidden_service = url_onion
|
regex_hidden_service = url_onion
|
||||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
|
||||||
elif type_hidden_service == 'i2p':
|
elif type_hidden_service == 'i2p':
|
||||||
regex_hidden_service = url_i2p
|
regex_hidden_service = url_i2p
|
||||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
|
|
||||||
elif type_hidden_service == 'regular':
|
elif type_hidden_service == 'regular':
|
||||||
regex_hidden_service = url_i2p
|
regex_hidden_service = url_i2p
|
||||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
|
||||||
else:
|
else:
|
||||||
print('incorrect crawler type: {}'.format(type_hidden_service))
|
print('incorrect crawler type: {}'.format(type_hidden_service))
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||||
print('splash url: {}'.format(splash_url))
|
print('splash url: {}'.format(splash_url))
|
||||||
|
|
||||||
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
||||||
|
@ -150,19 +180,13 @@ if __name__ == '__main__':
|
||||||
db=p.config.getint("ARDB_Onion", "db"),
|
db=p.config.getint("ARDB_Onion", "db"),
|
||||||
decode_responses=True)
|
decode_responses=True)
|
||||||
|
|
||||||
|
# Crawler status
|
||||||
r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
|
r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||||
|
|
||||||
# load domains blacklist
|
# load domains blacklist
|
||||||
try:
|
load_type_blacklist(type_hidden_service)
|
||||||
with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
|
|
||||||
r_onion.delete('blacklist_{}'.format(type_hidden_service))
|
|
||||||
lines = f.read().splitlines()
|
|
||||||
for line in lines:
|
|
||||||
r_onion.sadd('blacklist_{}'.format(type_hidden_service), line)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
|
@ -180,17 +204,24 @@ if __name__ == '__main__':
|
||||||
url, paste = splitted
|
url, paste = splitted
|
||||||
paste = paste.replace(PASTES_FOLDER+'/', '')
|
paste = paste.replace(PASTES_FOLDER+'/', '')
|
||||||
|
|
||||||
url_list = re.findall(regex_hidden_service, url)[0]
|
# extract data from url
|
||||||
if url_list[1] == '':
|
faup.decode(url)
|
||||||
|
url_unpack = faup.get()
|
||||||
|
url = decode_val(url_unpack['url'])
|
||||||
|
port = decode_val(url_unpack['port'])
|
||||||
|
scheme = decode_val(url_unpack['scheme'])
|
||||||
|
domain = decode_val(url_unpack['domain'])
|
||||||
|
host = decode_val(url_unpack['domain'])
|
||||||
|
|
||||||
|
# Add Scheme to url
|
||||||
|
if scheme is None:
|
||||||
url= 'http://{}'.format(url)
|
url= 'http://{}'.format(url)
|
||||||
|
|
||||||
link, s, credential, subdomain, domain, host, port, \
|
|
||||||
resource_path, query_string, f1, f2, f3, f4 = url_list
|
|
||||||
domain = url_list[4]
|
|
||||||
r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
|
||||||
|
|
||||||
domain_url = 'http://{}'.format(domain)
|
domain_url = 'http://{}'.format(domain)
|
||||||
|
|
||||||
|
|
||||||
|
# remove url to crawl from queue
|
||||||
|
r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print()
|
print()
|
||||||
print('\033[92m------------------START CRAWLER------------------\033[0m')
|
print('\033[92m------------------START CRAWLER------------------\033[0m')
|
||||||
|
@ -200,10 +231,7 @@ if __name__ == '__main__':
|
||||||
print('domain: {}'.format(domain))
|
print('domain: {}'.format(domain))
|
||||||
print('domain_url: {}'.format(domain_url))
|
print('domain_url: {}'.format(domain_url))
|
||||||
|
|
||||||
faup.decode(domain)
|
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
|
||||||
onion_domain=faup.get()['domain'].decode()
|
|
||||||
|
|
||||||
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
|
|
||||||
|
|
||||||
date = datetime.datetime.now().strftime("%Y%m%d")
|
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||||
date_month = datetime.datetime.now().strftime("%Y%m")
|
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||||
|
@ -219,17 +247,24 @@ if __name__ == '__main__':
|
||||||
# last check
|
# last check
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||||
|
|
||||||
|
# Launch Scrapy-Splash Crawler
|
||||||
crawl_onion(url, domain, date, date_month, message)
|
crawl_onion(url, domain, date, date_month, message)
|
||||||
|
# Crawl Domain
|
||||||
if url != domain_url:
|
if url != domain_url:
|
||||||
print(url)
|
#Crawl Domain with port number
|
||||||
|
if port is not None:
|
||||||
|
print('{}:{}'.format(domain_url, port))
|
||||||
|
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
|
||||||
|
#Crawl without port number
|
||||||
print(domain_url)
|
print(domain_url)
|
||||||
crawl_onion(domain_url, domain, date, date_month, message)
|
crawl_onion(domain_url, domain, date, date_month, message)
|
||||||
|
|
||||||
|
# update last check
|
||||||
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||||
|
|
||||||
# save down onion
|
# save down onion
|
||||||
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
|
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
|
||||||
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
|
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
|
||||||
#r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
|
|
||||||
#r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
|
|
||||||
else:
|
else:
|
||||||
#r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
|
#r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
|
||||||
if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
|
if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
|
||||||
|
@ -241,28 +276,28 @@ if __name__ == '__main__':
|
||||||
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
|
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
|
||||||
r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
|
r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
|
||||||
# add crawled history by date
|
# add crawled history by date
|
||||||
r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
|
r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)
|
||||||
|
|
||||||
|
if mode == 'automatic':
|
||||||
|
# check external onions links (full_crawl)
|
||||||
|
external_domains = set()
|
||||||
|
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
|
||||||
|
external_domain = re.findall(dic_regex[type_hidden_service], link)
|
||||||
|
external_domain.extend(re.findall(url_i2p, link))
|
||||||
|
if len(external_domain) > 0:
|
||||||
|
external_domain = external_domain[0][4]
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
if '.onion' in external_domain and external_domain != domain:
|
||||||
|
external_domains.add(external_domain)
|
||||||
|
elif '.i2p' in external_domain and external_domain != domain:
|
||||||
|
external_domains.add(external_domain)
|
||||||
|
if len(external_domains) >= 10:
|
||||||
|
r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
|
||||||
|
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
||||||
|
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
||||||
|
|
||||||
# check external onions links (full_scrawl)
|
# update list, last crawled sites
|
||||||
external_domains = set()
|
|
||||||
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
|
|
||||||
external_domain = re.findall(url_onion, link)
|
|
||||||
external_domain.extend(re.findall(url_i2p, link))
|
|
||||||
if len(external_domain) > 0:
|
|
||||||
external_domain = external_domain[0][4]
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
if '.onion' in external_domain and external_domain != domain:
|
|
||||||
external_domains.add(external_domain)
|
|
||||||
elif '.i2p' in external_domain and external_domain != domain:
|
|
||||||
external_domains.add(external_domain)
|
|
||||||
if len(external_domains) >= 10:
|
|
||||||
r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
|
|
||||||
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
|
||||||
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
|
||||||
|
|
||||||
# update list, last crawled onions
|
|
||||||
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
||||||
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
||||||
|
|
||||||
|
@ -270,7 +305,7 @@ if __name__ == '__main__':
|
||||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||||
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
||||||
else:
|
else:
|
||||||
print(' Blacklisted Onion')
|
print(' Blacklisted Site')
|
||||||
print()
|
print()
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
|
@ -221,7 +221,7 @@ function launching_scripts {
|
||||||
function launching_crawler {
|
function launching_crawler {
|
||||||
if [[ ! $iscrawler ]]; then
|
if [[ ! $iscrawler ]]; then
|
||||||
CONFIG=$AIL_BIN/packages/config.cfg
|
CONFIG=$AIL_BIN/packages/config.cfg
|
||||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
|
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
|
||||||
|
|
||||||
IFS='-' read -ra PORTS <<< "$lport"
|
IFS='-' read -ra PORTS <<< "$lport"
|
||||||
if [ ${#PORTS[@]} -eq 1 ]
|
if [ ${#PORTS[@]} -eq 1 ]
|
||||||
|
|
|
@ -249,5 +249,5 @@ db = 0
|
||||||
[Crawler]
|
[Crawler]
|
||||||
activate_crawler = False
|
activate_crawler = False
|
||||||
crawler_depth_limit = 1
|
crawler_depth_limit = 1
|
||||||
splash_url_onion = http://127.0.0.1
|
splash_url = http://127.0.0.1
|
||||||
splash_onion_port = 8050-8052
|
splash_port = 8050-8052
|
||||||
|
|
|
@ -28,10 +28,10 @@ from Helper import Process
|
||||||
|
|
||||||
class TorSplashCrawler():
|
class TorSplashCrawler():
|
||||||
|
|
||||||
def __init__(self, splash_url, crawler_depth_limit):
|
def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
|
||||||
self.process = CrawlerProcess({'LOG_ENABLED': False})
|
self.process = CrawlerProcess({'LOG_ENABLED': False})
|
||||||
self.crawler = Crawler(self.TorSplashSpider, {
|
self.crawler = Crawler(self.TorSplashSpider, {
|
||||||
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
|
'USER_AGENT': user_agent,
|
||||||
'SPLASH_URL': splash_url,
|
'SPLASH_URL': splash_url,
|
||||||
'ROBOTSTXT_OBEY': False,
|
'ROBOTSTXT_OBEY': False,
|
||||||
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
|
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
|
||||||
|
@ -42,7 +42,7 @@ class TorSplashCrawler():
|
||||||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||||
'HTTPERROR_ALLOW_ALL': True,
|
'HTTPERROR_ALLOW_ALL': True,
|
||||||
'RETRY_TIMES': 2,
|
'RETRY_TIMES': 2,
|
||||||
'CLOSESPIDER_PAGECOUNT': 50,
|
'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
|
||||||
'DEPTH_LIMIT': crawler_depth_limit
|
'DEPTH_LIMIT': crawler_depth_limit
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -30,5 +30,10 @@ if __name__ == '__main__':
|
||||||
paste = sys.argv[5]
|
paste = sys.argv[5]
|
||||||
super_father = sys.argv[6]
|
super_father = sys.argv[6]
|
||||||
|
|
||||||
crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
|
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
|
||||||
|
user_agent = tor_browser_agent
|
||||||
|
|
||||||
|
closespider_pagecount = 50
|
||||||
|
|
||||||
|
crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
|
||||||
crawler.crawl(type, url, domain, paste, super_father)
|
crawler.crawl(type, url, domain, paste, super_father)
|
||||||
|
|
|
@ -5,12 +5,14 @@ set -e
|
||||||
wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js
|
wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js
|
||||||
|
|
||||||
SBADMIN_VERSION='3.3.7'
|
SBADMIN_VERSION='3.3.7'
|
||||||
|
BOOTSTRAP_VERSION='4.2.1'
|
||||||
FONT_AWESOME_VERSION='4.7.0'
|
FONT_AWESOME_VERSION='4.7.0'
|
||||||
D3_JS_VERSION='5.5.0'
|
D3_JS_VERSION='5.5.0'
|
||||||
|
|
||||||
rm -rf temp
|
rm -rf temp
|
||||||
mkdir temp
|
mkdir temp
|
||||||
|
|
||||||
|
wget https://github.com/twbs/bootstrap/releases/download/v${BOOTSTRAP_VERSION}/bootstrap-${BOOTSTRAP_VERSION}-dist.zip -O temp/bootstrap${BOOTSTRAP_VERSION}.zip
|
||||||
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
|
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
|
||||||
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
|
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
|
||||||
wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
|
wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
|
||||||
|
@ -20,7 +22,7 @@ wget https://github.com/d3/d3/releases/download/v${D3_JS_VERSION}/d3.zip -O tem
|
||||||
wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
|
wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
|
||||||
wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip
|
wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip
|
||||||
|
|
||||||
|
unzip temp/bootstrap${BOOTSTRAP_VERSION}.zip -d temp/
|
||||||
unzip temp/${SBADMIN_VERSION}.zip -d temp/
|
unzip temp/${SBADMIN_VERSION}.zip -d temp/
|
||||||
unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
|
unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
|
||||||
unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
|
unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
|
||||||
|
@ -29,6 +31,10 @@ unzip temp/d3_${D3_JS_VERSION}.zip -d temp/
|
||||||
unzip temp/moment_2.22.2.zip -d temp/
|
unzip temp/moment_2.22.2.zip -d temp/
|
||||||
unzip temp/daterangepicker_v0.18.0.zip -d temp/
|
unzip temp/daterangepicker_v0.18.0.zip -d temp/
|
||||||
|
|
||||||
|
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/js/bootstrap.min.js ./static/js/
|
||||||
|
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css ./static/css/
|
||||||
|
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css.map ./static/css/
|
||||||
|
|
||||||
mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
|
mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
|
||||||
mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
|
mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
|
||||||
mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
|
mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
|
||||||
|
@ -59,6 +65,9 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/
|
||||||
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
|
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
|
||||||
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js
|
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js
|
||||||
|
|
||||||
|
wget https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css -O ./static/css/dataTables.bootstrap4.min.css
|
||||||
|
wget https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js -O ./static/js/dataTables.bootstrap4.min.js
|
||||||
|
|
||||||
#Ressource for graph
|
#Ressource for graph
|
||||||
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
|
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
|
||||||
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js
|
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js
|
||||||
|
|
Loading…
Reference in a new issue