mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-10 00:28:22 +00:00
chg: [Crawler] add launcher and install
This commit is contained in:
parent
6edc1ddbeb
commit
50c81773e9
11 changed files with 160 additions and 69 deletions
|
@ -40,16 +40,13 @@ def crawl_onion(url, domain, date, date_month, message):
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
|
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
|
||||||
stdout=subprocess.PIPE)
|
stdout=subprocess.PIPE)
|
||||||
while process.poll() is None:
|
while process.poll() is None:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
if process.returncode == 0:
|
if process.returncode == 0:
|
||||||
if r_serv_metadata.exists('paste_children:'+paste):
|
# onion up
|
||||||
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
|
|
||||||
p.populate_set_out(msg, 'Tags')
|
|
||||||
|
|
||||||
print(process.stdout.read())
|
print(process.stdout.read())
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
@ -59,14 +56,19 @@ def crawl_onion(url, domain, date, date_month, message):
|
||||||
## FIXME: # TODO: relaunch docker
|
## FIXME: # TODO: relaunch docker
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
|
time.sleep(60)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 3:
|
||||||
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
|
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
|
||||||
|
print(sys.argv[1])
|
||||||
|
print(sys.argv[2])
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
type_hidden_service = sys.argv[1]
|
type_hidden_service = sys.argv[1]
|
||||||
|
splash_port = sys.argv[2]
|
||||||
|
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
publisher.channel = "Script"
|
publisher.channel = "Script"
|
||||||
|
@ -85,21 +87,19 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
if type_hidden_service == 'onion':
|
if type_hidden_service == 'onion':
|
||||||
regex_hidden_service = url_onion
|
regex_hidden_service = url_onion
|
||||||
splash_url = p.config.get("Crawler", "splash_url_onion")
|
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||||
http_proxy = p.config.get("Crawler", "http_proxy_onion")
|
|
||||||
elif type_hidden_service == 'i2p':
|
elif type_hidden_service == 'i2p':
|
||||||
regex_hidden_service = url_i2p
|
regex_hidden_service = url_i2p
|
||||||
splash_url = p.config.get("Crawler", "splash_url_i2p")
|
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
|
||||||
http_proxy = p.config.get("Crawler", "http_proxy_i2p")
|
|
||||||
elif type_hidden_service == 'regular':
|
elif type_hidden_service == 'regular':
|
||||||
regex_hidden_service = url_i2p
|
regex_hidden_service = url_i2p
|
||||||
splash_url = p.config.get("Crawler", "splash_url_onion")
|
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||||
http_proxy = p.config.get("Crawler", "http_proxy_onion")
|
|
||||||
else:
|
else:
|
||||||
print('incorrect crawler type: {}'.format(type_hidden_service))
|
print('incorrect crawler type: {}'.format(type_hidden_service))
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
print(type_hidden_service)
|
print(type_hidden_service)
|
||||||
|
print(splash_url)
|
||||||
|
|
||||||
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
||||||
|
|
||||||
|
@ -129,8 +129,6 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
# Recovering the streamed message informations. http://eepsites.i2p
|
# Recovering the streamed message informations. http://eepsites.i2p
|
||||||
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
|
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
|
||||||
#message = 'http://i2pwiki.i2p;test'
|
|
||||||
#message = 'http://i2host.i2p;test'
|
|
||||||
|
|
||||||
# # FIXME: remove
|
# # FIXME: remove
|
||||||
if message is None:
|
if message is None:
|
||||||
|
@ -186,13 +184,16 @@ if __name__ == '__main__':
|
||||||
# save down onion
|
# save down onion
|
||||||
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
|
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
|
||||||
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
|
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
|
||||||
r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
|
#r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
|
||||||
r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
|
#r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
|
||||||
if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
|
if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
|
||||||
else:
|
else:
|
||||||
r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
|
#r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
|
||||||
|
if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
|
||||||
|
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
|
||||||
|
p.populate_set_out(msg, 'Tags')
|
||||||
|
|
||||||
# last check
|
# last check
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||||
|
@ -226,12 +227,13 @@ if __name__ == '__main__':
|
||||||
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
||||||
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
||||||
|
|
||||||
|
# update list, last crawled onions
|
||||||
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
||||||
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
||||||
|
|
||||||
#send all crawled domain past
|
#send all crawled domain past
|
||||||
msg = domain
|
#msg = domain
|
||||||
p.populate_set_out(msg, 'DomainSubject')
|
#p.populate_set_out(msg, 'DomainSubject')
|
||||||
|
|
||||||
#time.sleep(30)
|
#time.sleep(30)
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ islogged=`screen -ls | egrep '[0-9]+.Logging_AIL' | cut -d. -f1`
|
||||||
isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1`
|
isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1`
|
||||||
isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1`
|
isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1`
|
||||||
isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1`
|
isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1`
|
||||||
|
iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. -f1`
|
||||||
|
|
||||||
function helptext {
|
function helptext {
|
||||||
echo -e $YELLOW"
|
echo -e $YELLOW"
|
||||||
|
@ -198,6 +199,26 @@ function launching_scripts {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Launch one onion crawler per Splash port, each in its own window of a
# detached GNU screen session named "Crawler_AIL".
#
# The port range is read from the `splash_onion_port` key of the [Crawler]
# section of $AIL_BIN/packages/config.cfg, written as "first-last"
# (e.g. "8050-8050"). A single port without "-last" is accepted as a
# one-port range.
#
# Returns 1, without creating the screen session, when no numeric port range
# can be read from the configuration.
function launching_crawler {
    local lport first_port last_port i crawler_cmd
    local -a PORTS

    CONFIG=$AIL_BIN/packages/config.cfg
    # Print the third field ("key = value" -> value) of the first
    # splash_onion_port line found after the [Crawler] section header.
    lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
    echo "$lport"

    IFS='-' read -ra PORTS <<< "$lport"
    first_port=${PORTS[0]}
    # A lone port ("8050") means a range of exactly one port.
    last_port=${PORTS[1]:-$first_port}

    # Empty or non-numeric bounds would make the arithmetic for-loop below fail.
    if ! [[ "$first_port" =~ ^[0-9]+$ && "$last_port" =~ ^[0-9]+$ ]]; then
        echo "launching_crawler: invalid splash_onion_port '${lport}' in ${CONFIG}" 1>&2
        return 1
    fi

    screen -dmS "Crawler_AIL"
    sleep 0.1

    for ((i=first_port; i<=last_port; i++)); do
        # %q shell-quotes the values so an AIL_BIN path containing spaces
        # survives the inner bash -c; "read x" keeps the window open after
        # the crawler exits.
        printf -v crawler_cmd 'cd %q; ./Crawler.py onion %q; read x' "${AIL_BIN}" "$i"
        screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "${crawler_cmd}"
        sleep 0.1
    done

    echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
}
|
||||||
|
|
||||||
function shutting_down_redis {
|
function shutting_down_redis {
|
||||||
redis_dir=${AIL_HOME}/redis/src/
|
redis_dir=${AIL_HOME}/redis/src/
|
||||||
bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN'
|
bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN'
|
||||||
|
@ -406,6 +427,9 @@ function launch_all {
|
||||||
Flask)
|
Flask)
|
||||||
launch_flask;
|
launch_flask;
|
||||||
;;
|
;;
|
||||||
|
Crawler)
|
||||||
|
launching_crawler;
|
||||||
|
;;
|
||||||
Killall)
|
Killall)
|
||||||
killall;
|
killall;
|
||||||
;;
|
;;
|
||||||
|
@ -427,13 +451,13 @@ function launch_all {
|
||||||
|
|
||||||
while [ "$1" != "" ]; do
|
while [ "$1" != "" ]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-l | --launchAuto ) launch_all "automatic";
|
-l | --launchAuto ) launch_all "automatic"; launching_crawler
|
||||||
;;
|
;;
|
||||||
-k | --killAll ) killall;
|
-k | --killAll ) killall;
|
||||||
;;
|
;;
|
||||||
-c | --configUpdate ) checking_configuration "manual";
|
-t | --thirdpartyUpdate ) update_thirdparty;
|
||||||
;;
|
;;
|
||||||
-t | --thirdpartyUpdate ) update_thirdparty;
|
-c | --crawler ) launching_crawler;
|
||||||
;;
|
;;
|
||||||
-h | --help ) helptext;
|
-h | --help ) helptext;
|
||||||
exit
|
exit
|
||||||
|
|
56
bin/Onion.py
56
bin/Onion.py
|
@ -113,6 +113,15 @@ if __name__ == "__main__":
|
||||||
message = p.get_from_set()
|
message = p.get_from_set()
|
||||||
prec_filename = None
|
prec_filename = None
|
||||||
|
|
||||||
|
# send to crawler:
|
||||||
|
activate_crawler = p.config.get("Crawler", "activate_crawler")
|
||||||
|
if activate_crawler == 'True':
|
||||||
|
activate_crawler = True
|
||||||
|
print('Crawler enabled')
|
||||||
|
else:
|
||||||
|
activate_crawler = False
|
||||||
|
print('Crawler disabled')
|
||||||
|
|
||||||
# Thanks to Faup project for this regex
|
# Thanks to Faup project for this regex
|
||||||
# https://github.com/stricaud/faup
|
# https://github.com/stricaud/faup
|
||||||
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
|
@ -142,6 +151,7 @@ if __name__ == "__main__":
|
||||||
domains_list.append(domain)
|
domains_list.append(domain)
|
||||||
urls.append(url)
|
urls.append(url)
|
||||||
|
|
||||||
|
'''
|
||||||
for x in PST.get_regex(i2p_regex):
|
for x in PST.get_regex(i2p_regex):
|
||||||
# Extracting url with regex
|
# Extracting url with regex
|
||||||
url, s, credential, subdomain, domain, host, port, \
|
url, s, credential, subdomain, domain, host, port, \
|
||||||
|
@ -156,6 +166,7 @@ if __name__ == "__main__":
|
||||||
r_onion.sadd('i2p_domain_crawler_queue', domain)
|
r_onion.sadd('i2p_domain_crawler_queue', domain)
|
||||||
msg = '{};{}'.format(url,PST.p_path)
|
msg = '{};{}'.format(url,PST.p_path)
|
||||||
r_onion.sadd('i2p_crawler_queue', msg)
|
r_onion.sadd('i2p_crawler_queue', msg)
|
||||||
|
'''
|
||||||
|
|
||||||
# Saving the list of extracted onion domains.
|
# Saving the list of extracted onion domains.
|
||||||
PST.__setattr__(channel, domains_list)
|
PST.__setattr__(channel, domains_list)
|
||||||
|
@ -176,32 +187,33 @@ if __name__ == "__main__":
|
||||||
to_print = 'Onion;{};{};{};'.format(PST.p_source,
|
to_print = 'Onion;{};{};{};'.format(PST.p_source,
|
||||||
PST.p_date,
|
PST.p_date,
|
||||||
PST.p_name)
|
PST.p_name)
|
||||||
'''
|
|
||||||
for url in fetch(p, r_cache, urls, domains_list, path):
|
|
||||||
publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
|
|
||||||
p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
|
|
||||||
|
|
||||||
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
|
if activate_crawler:
|
||||||
p.populate_set_out(msg, 'Tags')
|
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||||
'''
|
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||||
|
for url in urls:
|
||||||
|
|
||||||
date_month = datetime.datetime.now().strftime("%Y%m")
|
domain = re.findall(url_regex, url)
|
||||||
date = datetime.datetime.now().strftime("%Y%m%d")
|
if len(domain) > 0:
|
||||||
for url in urls:
|
domain = domain[0][4]
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
domain = re.findall(url_regex, url)
|
if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||||
if len(domain) > 0:
|
if not r_onion.sismember('onion_domain_crawler_queue', domain):
|
||||||
domain = domain[0][4]
|
print('send to onion crawler')
|
||||||
else:
|
r_onion.sadd('onion_domain_crawler_queue', domain)
|
||||||
continue
|
msg = '{};{}'.format(url,PST.p_path)
|
||||||
|
r_onion.sadd('onion_crawler_queue', msg)
|
||||||
|
#p.populate_set_out(msg, 'Crawler')
|
||||||
|
|
||||||
if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
|
else:
|
||||||
if not r_onion.sismember('onion_domain_crawler_queue', domain):
|
for url in fetch(p, r_cache, urls, domains_list, path):
|
||||||
print('send to onion crawler')
|
publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
|
||||||
r_onion.sadd('onion_domain_crawler_queue', domain)
|
p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
|
||||||
msg = '{};{}'.format(url,PST.p_path)
|
|
||||||
r_onion.sadd('onion_crawler_queue', msg)
|
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
|
||||||
#p.populate_set_out(msg, 'Crawler')
|
p.populate_set_out(msg, 'Tags')
|
||||||
else:
|
else:
|
||||||
publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
|
publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
|
||||||
|
|
||||||
|
|
|
@ -235,8 +235,7 @@ port = 6381
|
||||||
db = 0
|
db = 0
|
||||||
|
|
||||||
[Crawler]
|
[Crawler]
|
||||||
|
activate_crawler = True
|
||||||
crawler_depth_limit = 1
|
crawler_depth_limit = 1
|
||||||
splash_url_onion = http://127.0.0.1:8050
|
splash_url_onion = http://127.0.0.1
|
||||||
splash_url_i2p = http://127.0.0.1:8050
|
splash_onion_port = 8050-8050
|
||||||
http_proxy_onion = http://127.0.0.1:9050
|
|
||||||
http_proxy_i2p = http://127.0.0.1:9050
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ from Helper import Process
|
||||||
|
|
||||||
class TorSplashCrawler():
|
class TorSplashCrawler():
|
||||||
|
|
||||||
def __init__(self, splash_url, http_proxy, crawler_depth_limit):
|
def __init__(self, splash_url, crawler_depth_limit):
|
||||||
self.process = CrawlerProcess({'LOG_ENABLED': False})
|
self.process = CrawlerProcess({'LOG_ENABLED': False})
|
||||||
self.crawler = Crawler(self.TorSplashSpider, {
|
self.crawler = Crawler(self.TorSplashSpider, {
|
||||||
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
|
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
|
||||||
|
@ -114,7 +114,6 @@ class TorSplashCrawler():
|
||||||
if response.status == 504:
|
if response.status == 504:
|
||||||
# down ?
|
# down ?
|
||||||
print('504 detected')
|
print('504 detected')
|
||||||
#elif response.status in in range(400, 600):
|
|
||||||
elif response.status != 200:
|
elif response.status != 200:
|
||||||
print('other: {}'.format(response.status))
|
print('other: {}'.format(response.status))
|
||||||
else:
|
else:
|
||||||
|
@ -128,7 +127,7 @@ class TorSplashCrawler():
|
||||||
if self.save_crawled_paste(filename_paste, response.data['html']):
|
if self.save_crawled_paste(filename_paste, response.data['html']):
|
||||||
|
|
||||||
# add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
|
# add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
|
||||||
self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
|
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
|
||||||
|
|
||||||
self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
|
self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
|
||||||
self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
|
self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
|
||||||
|
@ -157,21 +156,17 @@ class TorSplashCrawler():
|
||||||
with open(filename_screenshot, 'wb') as f:
|
with open(filename_screenshot, 'wb') as f:
|
||||||
f.write(base64.standard_b64decode(response.data['png'].encode()))
|
f.write(base64.standard_b64decode(response.data['png'].encode()))
|
||||||
|
|
||||||
#interest = response.data['har']['log']['entries'][0]['response']['header'][0]
|
|
||||||
with open(filename_screenshot+'har.txt', 'wb') as f:
|
with open(filename_screenshot+'har.txt', 'wb') as f:
|
||||||
f.write(json.dumps(response.data['har']).encode())
|
f.write(json.dumps(response.data['har']).encode())
|
||||||
|
|
||||||
# save external links in set
|
# save external links in set
|
||||||
lext = LinkExtractor(deny_domains=self.domains, unique=True)
|
#lext = LinkExtractor(deny_domains=self.domains, unique=True)
|
||||||
for link in lext.extract_links(response):
|
#for link in lext.extract_links(response):
|
||||||
self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
|
# self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
|
||||||
self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
|
# self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
|
||||||
|
|
||||||
#le = LinkExtractor(unique=True)
|
|
||||||
le = LinkExtractor(allow_domains=self.domains, unique=True)
|
le = LinkExtractor(allow_domains=self.domains, unique=True)
|
||||||
for link in le.extract_links(response):
|
for link in le.extract_links(response):
|
||||||
self.r_cache.setbit(link, 0, 0)
|
|
||||||
self.r_cache.expire(link, 360000)
|
|
||||||
yield SplashRequest(
|
yield SplashRequest(
|
||||||
link.url,
|
link.url,
|
||||||
self.parse,
|
self.parse,
|
||||||
|
|
38
bin/torcrawler/launch_splash_crawler.sh
Executable file
38
bin/torcrawler/launch_splash_crawler.sh
Executable file
|
@ -0,0 +1,38 @@
|
||||||
|
#!/bin/bash
#
# Start <n> Splash docker containers, each in its own window of a detached
# GNU screen session named "Docker_Splash".
#
# Options (all three are required):
#   -f <path>   host path mounted at /etc/splash/proxy-profiles/ in every container
#   -p <port>   first host port; container number i is published on <port> + i
#   -n <count>  number of containers to start

# Print the command-line synopsis on stderr and abort.
usage() {
    echo "Usage: sudo $0 [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]" 1>&2
    exit 1
}

while getopts ":p:f:n:" o; do
    case "${o}" in
        p)
            p=${OPTARG}
            ;;
        f)
            f=${OPTARG}
            ;;
        n)
            n=${OPTARG}
            ;;
        *)
            usage
            ;;
    esac
done
shift $((OPTIND-1))

# Abort instead of starting docker with an empty port, mount path or count.
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
    usage
fi

# Both values are used in shell arithmetic below, so they must be plain integers.
if ! [[ "${p}" =~ ^[0-9]+$ && "${n}" =~ ^[0-9]+$ ]]; then
    usage
fi

screen -dmS "Docker_Splash"
sleep 0.1

for ((i=0; i<n; i++)); do
    port_number=$((p + i))
    # %q shell-quotes the values so a mount path containing spaces survives
    # the inner bash -c; "read x" keeps the window open after docker exits.
    printf -v docker_cmd 'sudo docker run -p %q:8050 --cpus=1 -v %q:/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' "${port_number}" "${f}"
    screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c "${docker_cmd}"
    sleep 0.1
done
|
|
@ -8,8 +8,9 @@ from TorSplashCrawler import TorSplashCrawler
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
if len(sys.argv) != 8:
|
if len(sys.argv) != 7:
|
||||||
print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father')
|
print(sys.argv)
|
||||||
|
print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
|
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
|
||||||
|
@ -22,14 +23,13 @@ if __name__ == '__main__':
|
||||||
cfg.read(configfile)
|
cfg.read(configfile)
|
||||||
|
|
||||||
splash_url = sys.argv[1]
|
splash_url = sys.argv[1]
|
||||||
http_proxy = sys.argv[2]
|
type = sys.argv[2]
|
||||||
type = sys.argv[3]
|
|
||||||
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
|
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
|
||||||
|
|
||||||
url = sys.argv[4]
|
url = sys.argv[3]
|
||||||
domain = sys.argv[5]
|
domain = sys.argv[4]
|
||||||
paste = sys.argv[6]
|
paste = sys.argv[5]
|
||||||
super_father = sys.argv[7]
|
super_father = sys.argv[6]
|
||||||
|
|
||||||
crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
|
crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
|
||||||
crawler.crawl(type, url, domain, paste, super_father)
|
crawler.crawl(type, url, domain, paste, super_father)
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
[proxy]
|
||||||
|
host=172.17.0.1
|
||||||
|
port=9050
|
||||||
|
type=SOCKS5
|
10
crawler_hidden_services_install.sh
Normal file
10
crawler_hidden_services_install.sh
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# install docker
|
||||||
|
sudo apt install docker.io
|
||||||
|
|
||||||
|
# pull splash docker
|
||||||
|
sudo docker pull scrapinghub/splash
|
||||||
|
|
||||||
|
. ./AILENV/bin/activate
|
||||||
|
pip3 install -U -r pip3_packages_requirement.txt
|
2
crawler_requirements.txt
Normal file
2
crawler_requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
scrapy
|
||||||
|
scrapy-splash
|
|
@ -44,6 +44,11 @@ except IOError:
|
||||||
f = open('templates/ignored_modules.txt', 'w')
|
f = open('templates/ignored_modules.txt', 'w')
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
activate_crawler = cfg.get("Crawler", "activate_crawler")
|
||||||
|
if activate_crawler != 'True':
|
||||||
|
toIgnoreModule.add('hiddenServices')
|
||||||
|
|
||||||
|
print(toIgnoreModule)
|
||||||
|
|
||||||
# Dynamically import routes and functions from modules
|
# Dynamically import routes and functions from modules
|
||||||
# Also, prepare header.html
|
# Also, prepare header.html
|
||||||
|
|
Loading…
Reference in a new issue