mirror of https://github.com/ail-project/ail-framework.git (synced 2024-11-10 08:38:28 +00:00)

Commit a382b572c6 (parent 38ce17bc8a)
chg: [crawler] push onion discovery capture_uuid to another AIL

4 changed files with 97 additions and 17 deletions
@@ -6,6 +6,7 @@ import logging.config
 import sys
 import time
 
+from pyail import PyAIL
 from requests.exceptions import ConnectionError
 
 sys.path.append(os.environ['AIL_BIN'])
@@ -44,6 +45,15 @@ class Crawler(AbstractModule):
         self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
         self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')
 
+        ail_url_to_push_discovery = config_loader.get_config_str('Crawler', 'ail_url_to_push_onion_discovery')
+        ail_key_to_push_discovery = config_loader.get_config_str('Crawler', 'ail_key_to_push_onion_discovery')
+        if ail_url_to_push_discovery and ail_key_to_push_discovery:
+            ail = PyAIL(ail_url_to_push_discovery, ail_key_to_push_discovery, ssl=False)
+            if ail.ping_ail():
+                self.ail_to_push_discovery = ail
+        else:
+            self.ail_to_push_discovery = None
+
         # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES
 
         # update hardcoded blacklist
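The two new options wire an optional PyAIL client into the crawler: when both `ail_url_to_push_onion_discovery` and `ail_key_to_push_onion_discovery` are set and the remote instance answers a ping, onion discoveries can be forwarded. A minimal standalone sketch of that connectivity check, useful for validating the configuration before restarting the crawler; the URL and API key are placeholders, not values shipped with AIL:

# Connectivity check mirroring the __init__ logic above (sketch).
# Assumes the pyail package is installed; URL and key are placeholders.
from pyail import PyAIL

ail_url = 'https://other-ail.example.org:7000'   # hypothetical second AIL instance
ail_key = 'REPLACE_WITH_REMOTE_API_KEY'          # API key of an account on that instance

ail = PyAIL(ail_url, ail_key, ssl=False)
if ail.ping_ail():
    print('remote AIL reachable: onion discoveries can be forwarded')
else:
    print('remote AIL unreachable: discoveries will stay local')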
@@ -183,6 +193,14 @@ class Crawler(AbstractModule):
 
         crawlers.create_capture(capture_uuid, task_uuid)
         print(task.uuid, capture_uuid, 'launched')
+
+        if self.ail_to_push_discovery:
+            if task.get_depth() == 1 and priority < 10 and task.get_domain().endswith('.onion'):
+                har = task.get_har()
+                screenshot = task.get_screenshot()
+                self.ail_to_push_discovery.add_crawler_capture(task_uuid, capture_uuid, url, har=har,
+                                                               screenshot=screenshot, depth_limit=1, proxy='force_tor')
+                print(task.uuid, capture_uuid, 'Added to ail_to_push_discovery')
         return capture_uuid
 
     # CRAWL DOMAIN
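For onion discoveries (depth 1, priority below 10, .onion domain), the launched capture is now mirrored to the remote instance together with its task and capture UUIDs. A hedged sketch of that forwarding call in isolation; the client setup, UUIDs and URL are placeholders, and only the keyword arguments that appear in the diff are used:

# Forward an onion discovery to a second AIL instance (sketch).
from pyail import PyAIL

ail = PyAIL('https://other-ail.example.org:7000', 'REPLACE_WITH_REMOTE_API_KEY', ssl=False)  # placeholders

task_uuid = '00000000-0000-4000-8000-000000000001'      # placeholder UUIDs
capture_uuid = '00000000-0000-4000-8000-000000000002'
url = 'http://example.onion'                            # placeholder onion URL

# Same call the crawler now makes for depth-1, low-priority onion discoveries.
ail.add_crawler_capture(task_uuid, capture_uuid, url,
                        har=True, screenshot=True,
                        depth_limit=1, proxy='force_tor')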
@@ -309,6 +309,16 @@ def get_all_har_ids():
             har_ids.append(har_id)
     return har_ids
 
+def get_month_har_ids(year, month):
+    har_ids = []
+    month_path = os.path.join(HAR_DIR, year, month)
+    for root, dirs, files in os.walk(month_path):
+        for file in files:
+            har_id = os.path.relpath(os.path.join(root, file), HAR_DIR)
+            har_ids.append(har_id)
+    return har_ids
+
+
 def get_har_content(har_id):
     har_path = os.path.join(HAR_DIR, har_id)
     try:
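get_month_har_ids narrows get_all_har_ids to a single year/month subtree of HAR_DIR and returns the same relative identifiers, so the result feeds directly into get_har_content. A small usage sketch, assuming the <HAR_DIR>/<year>/<month>/ layout implied by the os.path.join above; '2024' and '11' are placeholder values:

# List the HARs captured in a given month and load each one (sketch).
for har_id in get_month_har_ids('2024', '11'):   # placeholder year/month
    har = get_har_content(har_id)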
@@ -1519,7 +1529,7 @@ class CrawlerTask:
     # TODO SANITIZE PRIORITY
     # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
     def create(self, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
-               user_agent=None, tags=[], parent='manual', priority=0):
+               user_agent=None, tags=[], parent='manual', priority=0, external=False):
         if self.exists():
             raise Exception('Error: Task already exists')
 
@@ -1576,8 +1586,8 @@ class CrawlerTask:
 
         r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
         self._set_field('hash', hash_query)
-        r_crawler.zadd('crawler:queue', {self.uuid: priority})
-        self.add_to_db_crawler_queue(priority)
+        if not external:
+            self.add_to_db_crawler_queue(priority)
         # UI
         domain_type = dom.get_domain_type()
         r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
@@ -1637,7 +1647,7 @@ def add_task_to_lacus_queue():
 
 # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
 def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
-                user_agent=None, tags=[], parent='manual', priority=0, task_uuid=None):
+                user_agent=None, tags=[], parent='manual', priority=0, task_uuid=None, external=False):
     if task_uuid:
         if CrawlerTask(task_uuid).exists():
             task_uuid = gen_uuid()
@@ -1645,7 +1655,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
             task_uuid = gen_uuid()
     task = CrawlerTask(task_uuid)
     task_uuid = task.create(url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar,
-                            proxy=proxy, user_agent=user_agent, tags=tags, parent=parent, priority=priority)
+                            proxy=proxy, user_agent=user_agent, tags=tags, parent=parent, priority=priority,
+                            external=external)
     return task_uuid
 
 
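Passing external=True through create_task into CrawlerTask.create records the task but skips add_to_db_crawler_queue, so the receiving instance does not queue a capture of its own for it. A hedged example call, matching how api_add_crawler_capture uses it further down; the URL and UUID are placeholders:

# Register a task pushed from another AIL without queuing it locally (sketch).
task_uuid = create_task('http://example.onion',                            # placeholder URL
                        depth=1, har=True, screenshot=True,
                        proxy='force_tor', parent='AIL_capture',
                        task_uuid='00000000-0000-4000-8000-000000000001',  # placeholder UUID
                        external=True)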
@@ -1655,7 +1666,8 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
 
 # # TODO: ADD user agent
 # # TODO: sanitize URL
-def api_add_crawler_task(data, user_id=None):
+def api_parse_task_dict_basic(data, user_id):
     url = data.get('url', None)
     if not url or url == '\n':
         return {'status': 'error', 'reason': 'No url supplied'}, 400
@@ -1681,6 +1693,31 @@ def api_add_crawler_task(data, user_id=None):
     else:
         depth_limit = 0
 
+    # PROXY
+    proxy = data.get('proxy', None)
+    if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
+        proxy = 'force_tor'
+    elif proxy:
+        verify = api_verify_proxy(proxy)
+        if verify[1] != 200:
+            return verify
+
+    tags = data.get('tags', [])
+
+    return {'url': url, 'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}, 200
+
+
+def api_add_crawler_task(data, user_id=None):
+    task, resp = api_parse_task_dict_basic(data, user_id)
+    if resp != 200:
+        return task, resp
+
+    url = task['url']
+    screenshot = task['screenshot']
+    har = task['har']
+    depth_limit = task['depth_limit']
+    proxy = task['proxy']
+    tags = task['tags']
+
     cookiejar_uuid = data.get('cookiejar', None)
     if cookiejar_uuid:
         cookiejar = Cookiejar(cookiejar_uuid)
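api_parse_task_dict_basic now owns the URL, depth, HAR/screenshot, proxy and tag handling shared by the task and capture endpoints, and api_add_crawler_task simply unpacks its result. A hedged sketch of the parser's contract; the har/screenshot/depth defaults are resolved in the part of the function not shown in this hunk:

# Input/output contract of the shared parser (sketch).
data = {'url': 'http://example.onion',   # placeholder URL
        'proxy': 'tor',                  # normalised to 'force_tor' above
        'tags': []}
task, resp = api_parse_task_dict_basic(data, user_id=None)
if resp == 200:
    # task holds: url, depth_limit, har, screenshot, proxy ('force_tor'), tags
    create_task(task['url'], depth=task['depth_limit'], har=task['har'],
                screenshot=task['screenshot'], proxy=task['proxy'], tags=task['tags'])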
@@ -1725,17 +1762,6 @@ def api_add_crawler_task(data, user_id=None):
             return {'error': 'Invalid frequency'}, 400
         frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
 
-    # PROXY
-    proxy = data.get('proxy', None)
-    if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
-        proxy = 'force_tor'
-    elif proxy:
-        verify = api_verify_proxy(proxy)
-        if verify[1] != 200:
-            return verify
-
-    tags = data.get('tags', [])
-
     if frequency:
         # TODO verify user
         task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
@ -1752,6 +1778,26 @@ def api_add_crawler_task(data, user_id=None):
|
||||||
|
|
||||||
#### ####
|
#### ####
|
||||||
|
|
||||||
|
# TODO cookiejar - cookies - frequency
|
||||||
|
def api_add_crawler_capture(data, user_id):
|
||||||
|
task, resp = api_parse_task_dict_basic(data, user_id)
|
||||||
|
if resp != 200:
|
||||||
|
return task, resp
|
||||||
|
|
||||||
|
task_uuid = data.get('task_uuid')
|
||||||
|
if not task_uuid:
|
||||||
|
return {'error': 'Invalid task_uuid', 'task_uuid': task_uuid}, 400
|
||||||
|
capture_uuid = data.get('capture_uuid')
|
||||||
|
if not capture_uuid:
|
||||||
|
return {'error': 'Invalid capture_uuid', 'task_uuid': capture_uuid}, 400
|
||||||
|
|
||||||
|
# TODO parent
|
||||||
|
create_task(task['url'], depth=task['depth_limit'], har=task['har'], screenshot=task['screenshot'],
|
||||||
|
proxy=task['proxy'], tags=task['tags'],
|
||||||
|
parent='AIL_capture', task_uuid=task_uuid, external=True)
|
||||||
|
|
||||||
|
create_capture(capture_uuid, task_uuid)
|
||||||
|
return capture_uuid, 200
|
||||||
|
|
||||||
###################################################################################
|
###################################################################################
|
||||||
###################################################################################
|
###################################################################################
|
||||||
|
|
|
@@ -261,6 +261,8 @@ default_depth_limit = 1
 default_har = True
 default_screenshot = True
 onion_proxy = onion.foundation
+ail_url_to_push_onion_discovery =
+ail_key_to_push_onion_discovery =
 
 [Translation]
 libretranslate =
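Both options ship empty, which leaves the push disabled (the crawler only builds the PyAIL client when both are set). A hypothetical filled-in [Crawler] excerpt; the URL and key are placeholders for a second AIL instance, not project defaults:

ail_url_to_push_onion_discovery = https://other-ail.example.org:7000
ail_key_to_push_onion_discovery = REPLACE_WITH_REMOTE_AIL_API_KEY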
@@ -523,6 +523,20 @@ def add_crawler_task():
     return create_json_response(dict_res, 200)
 
 
+@restApi.route("api/v1/add/crawler/capture", methods=['POST'])
+@token_required('analyst')
+def add_crawler_capture():
+    data = request.get_json()
+    user_token = get_auth_from_header()
+    user_id = Users.get_token_user(user_token)
+    res = crawlers.api_add_crawler_capture(data, user_id)
+    if res:
+        return create_json_response(res[0], res[1])
+
+    dict_res = {'url': data['url']}
+    return create_json_response(dict_res, 200)
+
+
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # # # # # # # # # # # # # # DOMAIN # # # # # # # # # # # # # # # #
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
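The new route exposes crawlers.api_add_crawler_capture to the pushing instance. A hedged example of calling it directly; the host, port, API key and UUIDs are placeholders, and the Authorization header is assumed to carry the AIL API key of an account with the analyst role (as required by token_required above). Only payload fields visible in this diff are used:

# Push an already-running onion capture to a receiving AIL (sketch).
import requests

payload = {
    'url': 'http://example.onion',                            # placeholder onion URL
    'task_uuid': '00000000-0000-4000-8000-000000000001',      # placeholder UUIDs
    'capture_uuid': '00000000-0000-4000-8000-000000000002',
    'proxy': 'force_tor',
    'tags': [],
}
r = requests.post('https://ail.example.org:7000/api/v1/add/crawler/capture',  # placeholder host/port
                  json=payload,
                  headers={'Authorization': 'REPLACE_WITH_API_KEY'},
                  verify=False)  # assumes a self-signed certificate; adjust for your deployment
print(r.status_code, r.json())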