chg: [Crawler] add bootstrap4 src + refactor crawler

This commit is contained in:
Terrtia 2019-02-05 17:16:44 +01:00
parent 423c7b1455
commit 516238025f
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
6 changed files with 106 additions and 57 deletions

View file

@ -16,6 +16,24 @@ sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
from pubsublogger import publisher
def decode_val(value):
    """Return ``value`` as text, decoding it only when necessary.

    Used on the individual fields of the dict returned by ``faup.get()``
    (url, port, scheme, domain).

    :param value: a bytes-like object, a ``str``, or ``None``.
    :return: ``None`` when ``value`` is ``None`` (so callers can keep testing
        missing fields with ``is None``), the unchanged string when ``value``
        is already a ``str``, otherwise ``value.decode()``.
    """
    # A str has no .decode() in Python 3; passing it through avoids an
    # AttributeError if a field arrives already decoded.
    # NOTE(review): whether faup can return str depends on the binding
    # version in use -- confirm.
    if value is None or isinstance(value, str):
        return value
    return value.decode()
def load_type_blacklist(type_service):
    """Reload the domain blacklist of ``type_service`` from disk.

    Reads ``$AIL_BIN/torcrawler/blacklist_<type_service>.txt`` (one entry per
    line) and replaces the content of the ``blacklist_<type_service>`` set
    held by the module-level ``r_onion`` connection with those lines.

    The load is best effort: if the file does not exist the stored set is
    left untouched, and any other failure is reported on stdout instead of
    being raised.

    :param type_service: service type used to build both the file name and
        the set name (e.g. ``'onion'``).
    """
    # Path components must be relative: a component starting with '/' makes
    # os.path.join() discard everything before it, which previously dropped
    # the AIL_BIN prefix and made the file impossible to find.
    blacklist_key = 'blacklist_{}'.format(type_service)
    try:
        blacklist_path = os.path.join(os.environ['AIL_BIN'], 'torcrawler',
                                      'blacklist_{}.txt'.format(type_service))
        with open(blacklist_path, 'r') as f:
            lines = f.read().splitlines()
        # TODO / FIXME (original author): remove this reset of the stored set
        r_onion.delete(blacklist_key)
        for line in lines:
            r_onion.sadd(blacklist_key, line)
    except FileNotFoundError:
        # No blacklist file for this service type: nothing to load.
        pass
    except Exception as e:
        # Stay best effort, but do not hide why the blacklist is missing.
        print('blacklist not loaded for {}: {}'.format(type_service, e))
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
# send this msg back in the queue
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
@ -91,12 +109,16 @@ def crawl_onion(url, domain, date, date_month, message):
if __name__ == '__main__':
if len(sys.argv) != 3:
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
#print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
exit(1)
type_hidden_service = sys.argv[1]
mode = sys.argv[1]
splash_port = sys.argv[2]
if mode == 'automatic':
type_hidden_service = 'onion'
publisher.port = 6380
publisher.channel = "Script"
@ -107,6 +129,16 @@ if __name__ == '__main__':
# Setup the I/O queues
p = Process(config_section)
accepted_services = ['onion', 'regular']
dic_regex = {}
dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(dic_regex['onion'])
dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(dic_regex['i2p'])
dic_regex['regular'] = dic_regex['i2p']
url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(url_onion)
url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@ -114,17 +146,15 @@ if __name__ == '__main__':
if type_hidden_service == 'onion':
regex_hidden_service = url_onion
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
elif type_hidden_service == 'i2p':
regex_hidden_service = url_i2p
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
elif type_hidden_service == 'regular':
regex_hidden_service = url_i2p
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
else:
print('incorrect crawler type: {}'.format(type_hidden_service))
exit(0)
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
print('splash url: {}'.format(splash_url))
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
@ -150,19 +180,13 @@ if __name__ == '__main__':
db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
# Crawler status
r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
# load domains blacklist
try:
with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
r_onion.delete('blacklist_{}'.format(type_hidden_service))
lines = f.read().splitlines()
for line in lines:
r_onion.sadd('blacklist_{}'.format(type_hidden_service), line)
except Exception:
pass
load_type_blacklist(type_hidden_service)
while True:
@ -180,17 +204,24 @@ if __name__ == '__main__':
url, paste = splitted
paste = paste.replace(PASTES_FOLDER+'/', '')
url_list = re.findall(regex_hidden_service, url)[0]
if url_list[1] == '':
# extract data from url
faup.decode(url)
url_unpack = faup.get()
url = decode_val(url_unpack['url'])
port = decode_val(url_unpack['port'])
scheme = decode_val(url_unpack['scheme'])
domain = decode_val(url_unpack['domain'])
host = decode_val(url_unpack['domain'])
# Add Scheme to url
if scheme is None:
url= 'http://{}'.format(url)
link, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = url_list
domain = url_list[4]
r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
domain_url = 'http://{}'.format(domain)
# remove url to crawl from queue
r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
print()
print()
print('\033[92m------------------START CRAWLER------------------\033[0m')
@ -200,10 +231,7 @@ if __name__ == '__main__':
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
faup.decode(domain)
onion_domain=faup.get()['domain'].decode()
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
date = datetime.datetime.now().strftime("%Y%m%d")
date_month = datetime.datetime.now().strftime("%Y%m")
@ -219,17 +247,24 @@ if __name__ == '__main__':
# last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
# Launch Scrapy-Splash Crawler
crawl_onion(url, domain, date, date_month, message)
# Crawl Domain
if url != domain_url:
print(url)
#Crawl Domain with port number
if port is not None:
print('{}:{}'.format(domain_url, port))
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
#Crawl without port number
print(domain_url)
crawl_onion(domain_url, domain, date, date_month, message)
# update last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
# save down onion
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
#r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
#r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
else:
#r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
@ -241,28 +276,28 @@ if __name__ == '__main__':
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
# add crawled history by date
r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)
if mode == 'automatic':
# check external onions links (full_crawl)
external_domains = set()
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
external_domain = re.findall(dic_regex[type_hidden_service], link)
external_domain.extend(re.findall(url_i2p, link))
if len(external_domain) > 0:
external_domain = external_domain[0][4]
else:
continue
if '.onion' in external_domain and external_domain != domain:
external_domains.add(external_domain)
elif '.i2p' in external_domain and external_domain != domain:
external_domains.add(external_domain)
if len(external_domains) >= 10:
r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
# check external onions links (full_scrawl)
external_domains = set()
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
external_domain = re.findall(url_onion, link)
external_domain.extend(re.findall(url_i2p, link))
if len(external_domain) > 0:
external_domain = external_domain[0][4]
else:
continue
if '.onion' in external_domain and external_domain != domain:
external_domains.add(external_domain)
elif '.i2p' in external_domain and external_domain != domain:
external_domains.add(external_domain)
if len(external_domains) >= 10:
r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
# update list, last crawled onions
# update list, last crawled sites
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
@ -270,7 +305,7 @@ if __name__ == '__main__':
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
else:
print(' Blacklisted Onion')
print(' Blacklisted Site')
print()
print()

View file

@ -221,7 +221,7 @@ function launching_scripts {
function launching_crawler {
if [[ ! $iscrawler ]]; then
CONFIG=$AIL_BIN/packages/config.cfg
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
IFS='-' read -ra PORTS <<< "$lport"
if [ ${#PORTS[@]} -eq 1 ]

View file

@ -249,5 +249,5 @@ db = 0
[Crawler]
activate_crawler = False
crawler_depth_limit = 1
splash_url_onion = http://127.0.0.1
splash_onion_port = 8050-8052
splash_url = http://127.0.0.1
splash_port = 8050-8052

View file

@ -28,10 +28,10 @@ from Helper import Process
class TorSplashCrawler():
def __init__(self, splash_url, crawler_depth_limit):
def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
self.process = CrawlerProcess({'LOG_ENABLED': False})
self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
'USER_AGENT': user_agent,
'SPLASH_URL': splash_url,
'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
@ -42,7 +42,7 @@ class TorSplashCrawler():
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
'HTTPERROR_ALLOW_ALL': True,
'RETRY_TIMES': 2,
'CLOSESPIDER_PAGECOUNT': 50,
'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
'DEPTH_LIMIT': crawler_depth_limit
})

View file

@ -30,5 +30,10 @@ if __name__ == '__main__':
paste = sys.argv[5]
super_father = sys.argv[6]
crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
user_agent = tor_browser_agent
closespider_pagecount = 50
crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
crawler.crawl(type, url, domain, paste, super_father)

View file

@ -5,12 +5,14 @@ set -e
wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js
SBADMIN_VERSION='3.3.7'
BOOTSTRAP_VERSION='4.2.1'
FONT_AWESOME_VERSION='4.7.0'
D3_JS_VERSION='5.5.0'
rm -rf temp
mkdir temp
wget https://github.com/twbs/bootstrap/releases/download/v${BOOTSTRAP_VERSION}/bootstrap-${BOOTSTRAP_VERSION}-dist.zip -O temp/bootstrap${BOOTSTRAP_VERSION}.zip
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
@ -20,7 +22,7 @@ wget https://github.com/d3/d3/releases/download/v${D3_JS_VERSION}/d3.zip -O tem
wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip
unzip temp/bootstrap${BOOTSTRAP_VERSION}.zip -d temp/
unzip temp/${SBADMIN_VERSION}.zip -d temp/
unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
@ -29,6 +31,10 @@ unzip temp/d3_${D3_JS_VERSION}.zip -d temp/
unzip temp/moment_2.22.2.zip -d temp/
unzip temp/daterangepicker_v0.18.0.zip -d temp/
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/js/bootstrap.min.js ./static/js/
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css ./static/css/
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css.map ./static/css/
mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
@ -59,6 +65,9 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js
wget https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css -O ./static/css/dataTables.bootstrap4.min.css
wget https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js -O ./static/js/dataTables.bootstrap4.min.js
#Resources for graph
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js