diff --git a/bin/Crawler.py b/bin/Crawler.py
index c7051b75..793b0c93 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -351,19 +351,19 @@ if __name__ == '__main__':
     # get HAR files
     default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har")
     if default_crawler_har:
-        default_crawler_har = 1
+        default_crawler_har = True
     else:
-        default_crawler_har = 0
+        default_crawler_har = False
 
     # get PNG files
     default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png")
     if default_crawler_png:
-        default_crawler_png = 1
+        default_crawler_png = True
    else:
-        default_crawler_png = 0
+        default_crawler_png = False
 
     # Default crawler options
-    default_crawler_config = {'html': 1,
+    default_crawler_config = {'html': True,
                               'har': default_crawler_har,
                               'png': default_crawler_png,
                               'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
diff --git a/bin/lib/crawler_splash.py b/bin/lib/crawler_splash.py
index 1554b9cf..1ff28524 100755
--- a/bin/lib/crawler_splash.py
+++ b/bin/lib/crawler_splash.py
@@ -27,6 +27,12 @@ r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
 r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
 config_loader = None
 
+# # # # # # # #
+#             #
+#   COOKIES   #
+#             #
+# # # # # # # #
+
 # # # #
 # Cookies Fields:
 #   - name
@@ -69,17 +75,76 @@ def create_cookie_dict_from_browser(browser_cookie):
     }
     return dict_cookie
 
-def load_cookies(l_cookies, domain=None, crawler_type='regular'):
+def load_cookies(cookies_uuid, domain=None, crawler_type='regular'):
+    cookies_json, l_cookies = get_cookies(cookies_uuid)
     all_cookies = []
-
-    for cookie_dict in l_cookies:
+    for cookie_dict in cookies_json:
         all_cookies.append(create_cookie_dict(browser_cookie=cookie_dict, crawler_type=crawler_type))
-
+    for cookie_name, cookie_value in l_cookies:
+        all_cookies.append(create_cookie_dict(cookie_name=cookie_name, cookie_value=cookie_value, domain=domain, crawler_type=crawler_type))
     return all_cookies
 
-def get_cookies():
-    l_cookies = []
-    return l_cookies
+def get_all_cookies():
+    return r_serv_onion.smembers('cookies:all')
+
+def get_all_global_cookies():
+    return r_serv_onion.smembers('cookies:global')
+
+def get_user_cookies(user_id):
+    return r_serv_onion.smembers('cookies:user:{}'.format(user_id))
+
+def exist_cookies_uuid(cookies_uuid):
+    return r_serv_onion.exists('cookie_metadata:{}'.format(cookies_uuid))
+
+def get_manual_cookies_keys(cookies_uuid):
+    return r_serv_onion.hgetall('cookies:manual_cookies:{}'.format(cookies_uuid))
+
+def get_manual_cookie_val(cookies_uuid, cookie_name):
+    return r_serv_onion.hget('cookies:manual_cookies:{}'.format(cookies_uuid), cookie_name)
+
+def get_cookies(cookies_uuid):
+    cookies_json = r_serv_onion.get('cookies:json_cookies:{}'.format(cookies_uuid))
+    if cookies_json:
+        cookies_json = json.loads(cookies_json)
+    else:
+        cookies_json = []
+    l_cookies = [ (cookie_name, get_manual_cookie_val(cookies_uuid, cookie_name)) for cookie_name in get_manual_cookies_keys(cookies_uuid) ]
+    return (cookies_json, l_cookies)
+
+# # TODO: handle errors + add api handler
+def save_cookies(user_id, json_cookies=None, l_cookies=[], cookies_uuid=None, level=1, description=None):
+    if cookies_uuid is None or not exist_cookies_uuid(cookies_uuid):
+        cookies_uuid = str(uuid.uuid4())
+
+    if json_cookies:
+        json_cookies = json.loads(json_cookies) # # TODO: catch Exception
+        r_serv_onion.set('cookies:json_cookies:{}'.format(cookies_uuid), json.dumps(json_cookies))
+
+    for cookie_dict in l_cookies:
+        r_serv_onion.hset('cookies:manual_cookies:{}'.format(cookies_uuid), cookie_dict['name'], cookie_dict['value'])
+
+    # cookies level # # TODO: edit level set on edit
+    r_serv_onion.sadd('cookies:all', cookies_uuid)
+    if level==0:
+        r_serv_onion.sadd('cookies:user:{}'.format(user_id), cookies_uuid)
+    else:
+        r_serv_onion.sadd('cookies:global', cookies_uuid)
+
+    # metadata
+    r_serv_onion.hset('cookie_metadata:{}'.format(cookies_uuid), 'user_id', user_id)
+    r_serv_onion.hset('cookie_metadata:{}'.format(cookies_uuid), 'level', level)
+    r_serv_onion.hset('cookie_metadata:{}'.format(cookies_uuid), 'description', description)
+    r_serv_onion.hset('cookie_metadata:{}'.format(cookies_uuid), 'date', datetime.date.today().strftime("%Y%m%d"))
+    return cookies_uuid
+
+#### ####
+
+def is_redirection(domain, last_url):
+    url = urlparse(last_url)
+    last_domain = url.netloc
+    last_domain = last_domain.split('.')
+    last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1])
+    return domain != last_domain
 
 # domain up
 def create_domain_metadata(domain_type, domain, current_port, date, date_month):
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index f54d1c7b..7df5e5de 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -121,8 +121,8 @@
         self.date_month = date['date_month']
         self.date_epoch = int(date['epoch'])
 
-        self.png = True
-        self.har = True
+        self.png = crawler_options['png']
+        self.har = crawler_options['har']
         self.cookies = cookies
 
         config_section = 'Crawler'
@@ -176,6 +176,8 @@
             # detect connection to proxy refused
             error_log = (json.loads(response.body.decode()))
             print(error_log)
+        elif crawler_splash.is_redirection(self.domains[0], response.data['last_url']):
+            pass # ignore response
         else:
             item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0])
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index bda7ecc8..77fb9385 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -37,8 +37,7 @@
     crawler_options = crawler_json['crawler_options']
     date = crawler_json['date']
     requested_mode = crawler_json['requested']
-    cookies = crawler_splash.load_cookies(crawler_splash.get_cookies(), domain, crawler_type='onion')
-    print(cookies)
+    cookies = crawler_splash.load_cookies('ccad0090-bdcb-4ba5-875b-3dae8f936216', domain, crawler_type=service_type)
 
     redis_cache.delete('crawler_request:{}'.format(uuid))
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 7acecfbe..6569ba47 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -25,6 +25,7 @@ import Tag
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
 import Domain
+import crawler_splash as crawler_splash_lib # avoid shadowing the 'crawler_splash' Blueprint defined below
 
 r_cache = Flask_config.r_cache
 r_serv_db = Flask_config.r_serv_db
@@ -156,3 +157,55 @@ def domains_explorer_web():
 
     dict_data = Domain.get_domains_up_by_filers('regular', page=page, date_from=date_from, date_to=date_to)
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular')
+
+@crawler_splash.route('/crawler/cookies/add', methods=['GET'])
+#@login_required
+#@login_analyst
+def crawler_cookies_add():
+    return render_template("add_cookies.html")
+
+@crawler_splash.route('/crawler/cookies/add_post', methods=['POST'])
+#@login_required
+#@login_analyst
+def crawler_cookies_add_post():
+    user_id = current_user.get_id()
+
+    description = request.form.get('description')
+    level = request.form.get('level')
+    if level:
+        level = 1
+    else:
+        level = 0
+
+    if 'file' in request.files:
+        file = request.files['file']
+        json_file = file.read().decode()
+    else:
+        json_file = '[]'
+
+    # Get cookies to add
+    l_manual_cookie = []
+    l_invalid_cookie = []
+    for obj_tuple in list(request.form):
+        l_input = request.form.getlist(obj_tuple)
+        if len(l_input) == 2:
+            if l_input[0]: # cookie_name
+                cookie_dict = {'name': l_input[0], 'value': l_input[1]}
+                l_manual_cookie.append(cookie_dict)
+            elif l_input[1]: # cookie_value
+                l_invalid_cookie.append({'name': '', 'value': l_input[1]})
+        else:
+            #print(l_input)
+            pass
+
+    cookie_uuid = crawler_splash_lib.save_cookies(user_id, json_cookies=json_file, l_cookies=l_manual_cookie, level=level, description=description)
+    return render_template("add_cookies.html")
+
+@crawler_splash.route('/crawler/cookies/all', methods=['GET'])
+#@login_required
+#@login_read_only
+def crawler_cookies_all():
+    user_id = current_user.get_id()
+    user_cookies = crawler_splash_lib.get_user_cookies(user_id)
+    global_cookies = crawler_splash_lib.get_all_global_cookies()
+    return render_template("add_cookies.html", user_cookies=user_cookies, global_cookies=global_cookies)
diff --git a/var/www/templates/crawler/crawler_splash/add_cookies.html b/var/www/templates/crawler/crawler_splash/add_cookies.html
new file mode 100644
index 00000000..d2820a69
--- /dev/null
+++ b/var/www/templates/crawler/crawler_splash/add_cookies.html
@@ -0,0 +1,156 @@
+
+
+
+
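
Usage sketch for the new cookie helpers in bin/lib/crawler_splash.py, assuming a configured AIL environment (AIL_BIN set, ARDB/Redis backends running). The user id, domain, and cookie values below are illustrative, not taken from the patch:

import os
import sys

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import crawler_splash

# Store a cookie set: cookies from a browser-export JSON string plus one
# manually entered name/value pair. level=1 shares the set globally,
# level=0 keeps it private to the user (example values only).
json_cookies = '[{"name": "session-id", "value": "aaaa-bbbb", "domain": ".example.onion"}]'
manual_cookies = [{'name': 'lang', 'value': 'en'}]
cookies_uuid = crawler_splash.save_cookies('admin@admin.test',
                                           json_cookies=json_cookies,
                                           l_cookies=manual_cookies,
                                           level=1,
                                           description='example cookie set')

# Later, when a crawl is launched, rebuild the cookie list passed to Splash:
# load_cookies() merges the stored JSON cookies and the manual name/value pairs.
cookies = crawler_splash.load_cookies(cookies_uuid, domain='example.onion', crawler_type='onion')
print(cookies_uuid, cookies)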