mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-27 00:07:16 +00:00
chg: [Crawler] add bootstrap4 src + refactor crawler
This commit is contained in:
parent
423c7b1455
commit
516238025f
6 changed files with 106 additions and 57 deletions
133
bin/Crawler.py
133
bin/Crawler.py
|
@ -16,6 +16,24 @@ sys.path.append(os.environ['AIL_BIN'])
|
|||
from Helper import Process
|
||||
from pubsublogger import publisher
|
||||
|
||||
def decode_val(value):
    """Return *value* as text, decoding it when it is a bytes object.

    ``None`` is passed through unchanged so that an absent field stays
    ``None``.  A value that is not bytes-like (for instance an already
    decoded ``str``) is returned as-is instead of raising ``AttributeError``
    because it has no ``decode`` method.

    :param value: bytes, str or None.
    :return: the decoded string, or *value* itself when nothing is to decode.
    """
    if isinstance(value, (bytes, bytearray)):
        return value.decode()
    return value
|
||||
|
||||
def load_type_blacklist(type_service):
    """Reload the domain blacklist of one crawler type into ``r_onion``.

    Reads ``$AIL_BIN/torcrawler/blacklist_<type_service>.txt`` (one entry per
    line), deletes the ``blacklist_<type_service>`` key and adds every line
    of the file to it.  Loading is best-effort: when the file does not exist
    the stored key is left untouched, and any other failure is reported on
    stdout instead of being raised.

    :param type_service: crawler type, used to build both the file name and
        the key name.
    """
    blacklist_key = 'blacklist_{}'.format(type_service)
    try:
        # Join relative components only: a component starting with '/' makes
        # os.path.join() discard everything before it, AIL_BIN included.
        blacklist_file = os.path.join(os.environ['AIL_BIN'], 'torcrawler',
                                      'blacklist_{}.txt'.format(type_service))
        with open(blacklist_file, 'r') as f:
            # TODO: FIXME: remove this
            r_onion.delete(blacklist_key)
            for line in f.read().splitlines():
                r_onion.sadd(blacklist_key, line)
    except FileNotFoundError:
        # No blacklist file for this crawler type: nothing to load.
        pass
    except Exception as e:
        # Keep the crawler running, but do not hide why the load failed.
        print('blacklist_{} not loaded: {}'.format(type_service, e))
|
||||
|
||||
|
||||
def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
||||
# send this msg back in the queue
|
||||
if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
|
||||
|
@ -91,12 +109,16 @@ def crawl_onion(url, domain, date, date_month, message):
|
|||
if __name__ == '__main__':
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
|
||||
#print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
|
||||
print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
|
||||
exit(1)
|
||||
|
||||
type_hidden_service = sys.argv[1]
|
||||
mode = sys.argv[1]
|
||||
splash_port = sys.argv[2]
|
||||
|
||||
if mode == 'automatic':
|
||||
type_hidden_service = 'onion'
|
||||
|
||||
publisher.port = 6380
|
||||
publisher.channel = "Script"
|
||||
|
||||
|
@ -107,6 +129,16 @@ if __name__ == '__main__':
|
|||
# Setup the I/O queues
|
||||
p = Process(config_section)
|
||||
|
||||
accepted_services = ['onion', 'regular']
|
||||
|
||||
dic_regex = {}
|
||||
dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
re.compile(dic_regex['onion'])
|
||||
dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
re.compile(dic_regex['i2p'])
|
||||
dic_regex['regular'] = dic_regex['i2p']
|
||||
|
||||
|
||||
url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
re.compile(url_onion)
|
||||
url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
|
@ -114,17 +146,15 @@ if __name__ == '__main__':
|
|||
|
||||
if type_hidden_service == 'onion':
|
||||
regex_hidden_service = url_onion
|
||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||
elif type_hidden_service == 'i2p':
|
||||
regex_hidden_service = url_i2p
|
||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
|
||||
elif type_hidden_service == 'regular':
|
||||
regex_hidden_service = url_i2p
|
||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||
else:
|
||||
print('incorrect crawler type: {}'.format(type_hidden_service))
|
||||
exit(0)
|
||||
|
||||
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
|
||||
print('splash url: {}'.format(splash_url))
|
||||
|
||||
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
||||
|
@ -150,19 +180,13 @@ if __name__ == '__main__':
|
|||
db=p.config.getint("ARDB_Onion", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
# Crawler status
|
||||
r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
# load domains blacklist
|
||||
try:
|
||||
with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
|
||||
r_onion.delete('blacklist_{}'.format(type_hidden_service))
|
||||
lines = f.read().splitlines()
|
||||
for line in lines:
|
||||
r_onion.sadd('blacklist_{}'.format(type_hidden_service), line)
|
||||
except Exception:
|
||||
pass
|
||||
load_type_blacklist(type_hidden_service)
|
||||
|
||||
while True:
|
||||
|
||||
|
@ -180,17 +204,24 @@ if __name__ == '__main__':
|
|||
url, paste = splitted
|
||||
paste = paste.replace(PASTES_FOLDER+'/', '')
|
||||
|
||||
url_list = re.findall(regex_hidden_service, url)[0]
|
||||
if url_list[1] == '':
|
||||
# extract data from url
|
||||
faup.decode(url)
|
||||
url_unpack = faup.get()
|
||||
url = decode_val(url_unpack['url'])
|
||||
port = decode_val(url_unpack['port'])
|
||||
scheme = decode_val(url_unpack['scheme'])
|
||||
domain = decode_val(url_unpack['domain'])
|
||||
host = decode_val(url_unpack['domain'])
|
||||
|
||||
# Add Scheme to url
|
||||
if scheme is None:
|
||||
url= 'http://{}'.format(url)
|
||||
|
||||
link, s, credential, subdomain, domain, host, port, \
|
||||
resource_path, query_string, f1, f2, f3, f4 = url_list
|
||||
domain = url_list[4]
|
||||
r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||
|
||||
domain_url = 'http://{}'.format(domain)
|
||||
|
||||
|
||||
# remove url to crawl from queue
|
||||
r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
|
||||
|
||||
print()
|
||||
print()
|
||||
print('\033[92m------------------START CRAWLER------------------\033[0m')
|
||||
|
@ -200,10 +231,7 @@ if __name__ == '__main__':
|
|||
print('domain: {}'.format(domain))
|
||||
print('domain_url: {}'.format(domain_url))
|
||||
|
||||
faup.decode(domain)
|
||||
onion_domain=faup.get()['domain'].decode()
|
||||
|
||||
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
|
||||
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
|
||||
|
||||
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||
|
@ -219,17 +247,24 @@ if __name__ == '__main__':
|
|||
# last check
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||
|
||||
# Launch Scrapy-Splash Crawler
|
||||
crawl_onion(url, domain, date, date_month, message)
|
||||
# Crawl Domain
|
||||
if url != domain_url:
|
||||
print(url)
|
||||
#Crawl Domain with port number
|
||||
if port is not None:
|
||||
print('{}:{}'.format(domain_url, port))
|
||||
crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message)
|
||||
#Crawl without port number
|
||||
print(domain_url)
|
||||
crawl_onion(domain_url, domain, date, date_month, message)
|
||||
|
||||
# update last check
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||
|
||||
# save down onion
|
||||
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
|
||||
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
|
||||
#r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
|
||||
#r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
|
||||
else:
|
||||
#r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
|
||||
if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
|
||||
|
@ -241,28 +276,28 @@ if __name__ == '__main__':
|
|||
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
|
||||
r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
|
||||
# add crawled history by date
|
||||
r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste) #add datetime here
|
||||
r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)
|
||||
|
||||
if mode == 'automatic':
|
||||
# check external onions links (full_crawl)
|
||||
external_domains = set()
|
||||
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
|
||||
external_domain = re.findall(dic_regex[type_hidden_service], link)
|
||||
external_domain.extend(re.findall(url_i2p, link))
|
||||
if len(external_domain) > 0:
|
||||
external_domain = external_domain[0][4]
|
||||
else:
|
||||
continue
|
||||
if '.onion' in external_domain and external_domain != domain:
|
||||
external_domains.add(external_domain)
|
||||
elif '.i2p' in external_domain and external_domain != domain:
|
||||
external_domains.add(external_domain)
|
||||
if len(external_domains) >= 10:
|
||||
r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
|
||||
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
||||
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
||||
|
||||
# check external onions links (full_scrawl)
|
||||
external_domains = set()
|
||||
for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
|
||||
external_domain = re.findall(url_onion, link)
|
||||
external_domain.extend(re.findall(url_i2p, link))
|
||||
if len(external_domain) > 0:
|
||||
external_domain = external_domain[0][4]
|
||||
else:
|
||||
continue
|
||||
if '.onion' in external_domain and external_domain != domain:
|
||||
external_domains.add(external_domain)
|
||||
elif '.i2p' in external_domain and external_domain != domain:
|
||||
external_domains.add(external_domain)
|
||||
if len(external_domains) >= 10:
|
||||
r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
|
||||
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
|
||||
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
|
||||
|
||||
# update list, last crawled onions
|
||||
# update list, last crawled sites
|
||||
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
||||
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
||||
|
||||
|
@ -270,7 +305,7 @@ if __name__ == '__main__':
|
|||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
||||
else:
|
||||
print(' Blacklisted Onion')
|
||||
print(' Blacklisted Site')
|
||||
print()
|
||||
print()
|
||||
|
||||
|
|
|
@ -221,7 +221,7 @@ function launching_scripts {
|
|||
function launching_crawler {
|
||||
if [[ ! $iscrawler ]]; then
|
||||
CONFIG=$AIL_BIN/packages/config.cfg
|
||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
|
||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
|
||||
|
||||
IFS='-' read -ra PORTS <<< "$lport"
|
||||
if [ ${#PORTS[@]} -eq 1 ]
|
||||
|
|
|
@ -249,5 +249,5 @@ db = 0
|
|||
[Crawler]
|
||||
activate_crawler = False
|
||||
crawler_depth_limit = 1
|
||||
splash_url_onion = http://127.0.0.1
|
||||
splash_onion_port = 8050-8052
|
||||
splash_url = http://127.0.0.1
|
||||
splash_port = 8050-8052
|
||||
|
|
|
@ -28,10 +28,10 @@ from Helper import Process
|
|||
|
||||
class TorSplashCrawler():
|
||||
|
||||
def __init__(self, splash_url, crawler_depth_limit):
|
||||
def __init__(self, splash_url, crawler_depth_limit, user_agent, closespider_pagecount):
|
||||
self.process = CrawlerProcess({'LOG_ENABLED': False})
|
||||
self.crawler = Crawler(self.TorSplashSpider, {
|
||||
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
|
||||
'USER_AGENT': user_agent,
|
||||
'SPLASH_URL': splash_url,
|
||||
'ROBOTSTXT_OBEY': False,
|
||||
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
|
||||
|
@ -42,7 +42,7 @@ class TorSplashCrawler():
|
|||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||
'HTTPERROR_ALLOW_ALL': True,
|
||||
'RETRY_TIMES': 2,
|
||||
'CLOSESPIDER_PAGECOUNT': 50,
|
||||
'CLOSESPIDER_PAGECOUNT': closespider_pagecount,
|
||||
'DEPTH_LIMIT': crawler_depth_limit
|
||||
})
|
||||
|
||||
|
|
|
@ -30,5 +30,10 @@ if __name__ == '__main__':
|
|||
paste = sys.argv[5]
|
||||
super_father = sys.argv[6]
|
||||
|
||||
crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
|
||||
tor_browser_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'
|
||||
user_agent = tor_browser_agent
|
||||
|
||||
closespider_pagecount = 50
|
||||
|
||||
crawler = TorSplashCrawler(splash_url, crawler_depth_limit, user_agent, closespider_pagecount)
|
||||
crawler.crawl(type, url, domain, paste, super_father)
|
||||
|
|
|
@ -5,12 +5,14 @@ set -e
|
|||
wget http://dygraphs.com/dygraph-combined.js -O ./static/js/dygraph-combined.js
|
||||
|
||||
SBADMIN_VERSION='3.3.7'
|
||||
BOOTSTRAP_VERSION='4.2.1'
|
||||
FONT_AWESOME_VERSION='4.7.0'
|
||||
D3_JS_VERSION='5.5.0'
|
||||
|
||||
rm -rf temp
|
||||
mkdir temp
|
||||
|
||||
wget https://github.com/twbs/bootstrap/releases/download/v${BOOTSTRAP_VERSION}/bootstrap-${BOOTSTRAP_VERSION}-dist.zip -O temp/bootstrap${BOOTSTRAP_VERSION}.zip
|
||||
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}.zip
|
||||
wget https://github.com/BlackrockDigital/startbootstrap-sb-admin-2/archive/v${SBADMIN_VERSION}.zip -O temp/${SBADMIN_VERSION}-2.zip
|
||||
wget https://github.com/FortAwesome/Font-Awesome/archive/v${FONT_AWESOME_VERSION}.zip -O temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip
|
||||
|
@ -20,7 +22,7 @@ wget https://github.com/d3/d3/releases/download/v${D3_JS_VERSION}/d3.zip -O tem
|
|||
wget https://github.com/moment/moment/archive/2.22.2.zip -O temp/moment_2.22.2.zip
|
||||
wget https://github.com/longbill/jquery-date-range-picker/archive/v0.18.0.zip -O temp/daterangepicker_v0.18.0.zip
|
||||
|
||||
|
||||
unzip temp/bootstrap${BOOTSTRAP_VERSION}.zip -d temp/
|
||||
unzip temp/${SBADMIN_VERSION}.zip -d temp/
|
||||
unzip temp/${SBADMIN_VERSION}-2.zip -d temp/
|
||||
unzip temp/FONT_AWESOME_${FONT_AWESOME_VERSION}.zip -d temp/
|
||||
|
@ -29,6 +31,10 @@ unzip temp/d3_${D3_JS_VERSION}.zip -d temp/
|
|||
unzip temp/moment_2.22.2.zip -d temp/
|
||||
unzip temp/daterangepicker_v0.18.0.zip -d temp/
|
||||
|
||||
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/js/bootstrap.min.js ./static/js/
|
||||
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css ./static/css/
|
||||
mv temp/bootstrap-${BOOTSTRAP_VERSION}-dist/css/bootstrap.min.css.map ./static/css/
|
||||
|
||||
mv temp/startbootstrap-sb-admin-${SBADMIN_VERSION} temp/sb-admin
|
||||
mv temp/startbootstrap-sb-admin-2-${SBADMIN_VERSION} temp/sb-admin-2
|
||||
mv temp/Font-Awesome-${FONT_AWESOME_VERSION} temp/font-awesome
|
||||
|
@ -59,6 +65,9 @@ wget https://cdn.datatables.net/1.10.12/js/jquery.dataTables.min.js -O ./static/
|
|||
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.css -O ./static/css/dataTables.bootstrap.css
|
||||
wget https://cdn.datatables.net/plug-ins/1.10.7/integration/bootstrap/3/dataTables.bootstrap.js -O ./static/js/dataTables.bootstrap.js
|
||||
|
||||
wget https://cdn.datatables.net/1.10.18/css/dataTables.bootstrap4.min.css -O ./static/css/dataTables.bootstrap4.min.css
|
||||
wget https://cdn.datatables.net/1.10.18/js/dataTables.bootstrap4.min.js -O ./static/js/dataTables.bootstrap4.min.js
|
||||
|
||||
#Resource for graph
|
||||
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.js -O ./static/js/jquery.flot.js
|
||||
wget https://raw.githubusercontent.com/flot/flot/958e5fd43c6dff4bab3e1fd5cb6109df5c1e8003/jquery.flot.pie.js -O ./static/js/jquery.flot.pie.js
|
||||
|
|
Loading…
Reference in a new issue