ail-framework/bin/lib/crawler_splash.py

141 lines
4.9 KiB
Python
Raw Normal View History

#!/usr/bin/python3
"""
API Helper
===================
"""
import base64
import gzip
import json
import os
import re
import redis
import sys
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
config_loader = None
# # # #
# Cookies Fields:
# - name
# - value
# - path (optional)
# - domain (optional)
# - secure (optional)
# - httpOnly (optional)
# # # #
def create_cookie_dict(browser_cookie=[], cookie_name=None, cookie_value=None, domain=None, crawler_type='regular'):
# UI created
if cookie_name and cookie_value and domain:
dict_cookie = create_cookie_dict_from_input(cookie_name, cookie_value, domain)
# Cookies imported from the browser
else:
dict_cookie = create_cookie_dict_from_browser(browser_cookie)
# tor browser: disable secure cookie
if crawler_type=='onion':
dict_cookie['secure'] = False
dict_cookie['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
return dict_cookie
def create_cookie_dict_from_input(cookie_name, cookie_value, cookie_domain):
# WebKit use domain for cookie validation
return {'name': cookie_name, 'value': cookie_value, 'domain': '.{}'.format(cookie_domain)}
# # TODO: handle prefix cookies
# # TODO: fill empty fields
def create_cookie_dict_from_browser(browser_cookie):
url = urlparse(browser_cookie['Host raw'])
domain = url.netloc.split(':', 1)[0]
dict_cookie = {'path': browser_cookie['Path raw'],
'name': browser_cookie['Name raw'],
'httpOnly': browser_cookie['HTTP only raw'] == 'true',
'secure': browser_cookie['Send for'] == 'Encrypted connections only',
'domain': domain,
'value': browser_cookie['Content raw']
}
return dict_cookie
def load_cookies(l_cookies, domain=None, crawler_type='regular'):
all_cookies = []
for cookie_dict in l_cookies:
all_cookies.append(create_cookie_dict(browser_cookie=cookie_dict, crawler_type=crawler_type))
return all_cookies
def get_cookies():
l_cookies = []
return l_cookies
# domain up
def create_domain_metadata(domain_type, domain, current_port, date, date_month):
# Add to global set
r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain)
r_serv_onion.sadd('full_{}_up'.format(domain_type), domain)
r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain)
# create onion metadata
if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)):
r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date)
r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date)
# Update domain port number
all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports')
if all_domain_ports:
all_domain_ports = all_domain_ports.split(';')
else:
all_domain_ports = []
if current_port not in all_domain_ports:
all_domain_ports.append(current_port)
r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports))
# add root_item to history
def add_domain_root_item(root_item, domain_type, domain, epoch_date, port):
# Create/Update crawler history
r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(domain_type, domain, port), epoch_date, root_item)
def create_item_metadata(item_id, domain, url, port, item_father):
r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father)
r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port))
r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url)
# add this item_id to his father
r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id)
def create_item_id(item_dir, domain):
if len(domain) > 215:
UUID = domain[-215:]+str(uuid.uuid4())
else:
UUID = domain+str(uuid.uuid4())
return os.path.join(item_dir, UUID)
def save_crawled_item(item_id, item_content):
try:
gzipencoded = gzip.compress(item_content.encode())
gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
return gzip64encoded
except:
print("file error: {}".format(item_id))
return False
def save_har(har_dir, item_id, har_content):
if not os.path.exists(har_dir):
os.makedirs(har_dir)
item_id = item_id.split('/')[-1]
filename = os.path.join(har_dir, item_id + '.json')
with open(filename, 'w') as f:
f.write(json.dumps(har_content))