chg: [api crawler] fix response + add cookiejar, proxy and frequency parameters

This commit is contained in:
Terrtia 2023-07-25 15:57:11 +02:00
parent fe2769308b
commit 68dffcd26b
No known key found for this signature in database
GPG key ID: 1E1B1F50D84613D0
4 changed files with 134 additions and 9 deletions

View file

@ -1723,14 +1723,16 @@ def api_add_crawler_task(data, user_id=None):
if frequency: if frequency:
# TODO verify user # TODO verify user
return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None, task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags), 200 cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
else: else:
# TODO HEADERS # TODO HEADERS
# TODO USER AGENT # TODO USER AGENT
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags, cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
parent='manual', priority=90), 200 parent='manual', priority=90)
return {'uuid': task_uuid}, 200
#### #### #### ####

View file

@ -42,7 +42,8 @@ class Onion(AbstractModule):
self.faup = crawlers.get_faup() self.faup = crawlers.get_faup()
# activate_crawler = p.config.get("Crawler", "activate_crawler") # activate_crawler = p.config.get("Crawler", "activate_crawler")
self.har = config_loader.get_config_boolean('Crawler', 'default_har')
self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
# self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@ -90,8 +91,9 @@ class Onion(AbstractModule):
if onion_urls: if onion_urls:
if crawlers.is_crawler_activated(): if crawlers.is_crawler_activated():
for domain in domains: # TODO LOAD DEFAULT SCREENSHOT + HAR for domain in domains:
task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0) task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0,
har=self.har, screenshot=self.screenshot)
if task_uuid: if task_uuid:
print(f'{domain} added to crawler queue: {task_uuid}') print(f'{domain} added to crawler queue: {task_uuid}')
else: else:

120
tools/crawler_add_task.py Executable file
View file

@ -0,0 +1,120 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DIR/File Importer Helper
================
Import Content
"""
import argparse
import os
from pyail import PyAIL
import sys
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
def check_frequency(value):
    """Validate a custom crawl-frequency value.

    Compatible with argparse ``type=``: accepts an int or int-like string,
    raises ``argparse.ArgumentTypeError`` when the value is not strictly
    positive, and returns the value as an ``int`` otherwise.
    """
    value = int(value)
    if value <= 0:
        raise argparse.ArgumentTypeError(f'Error: Invalid frequency {value}')
    # Return the converted value so the function can also be used as an
    # argparse type converter (previously it implicitly returned None).
    return value
if __name__ == "__main__":
# TODO add c argument for config file
parser = argparse.ArgumentParser(description='Directory or file importer')
parser.add_argument('-u', '--url', type=str, help='URL to crawl', required=True)
parser.add_argument('-k', '--key', type=str, help='AIL API Key', required=True)
parser.add_argument('-a', '--ail', type=str, help='AIL URL')
parser.add_argument('-d', '--depth', type=int, default=1, help='Depth limit') # TODO improve me
parser.add_argument('--cookiejar', type=str, help='Cookiejar uuid')
parser.add_argument('-p', '--proxy', type=str, help='Proxy address to use, "web" and "tor" can be used as shortcut (web is used by default is )')
group = parser.add_mutually_exclusive_group()
group.add_argument('--har', dest='har', action='store_true', help='Save HAR')
group.add_argument('--no-har', dest='har', action='store_false', help='Don\'t save HAR')
parser.set_defaults(har=None)
group = parser.add_mutually_exclusive_group()
group.add_argument('--screenshot', dest='screenshot', action='store_true', help='Save screenshot')
group.add_argument('--no-screenshot', dest='screenshot', action='store_false', help='Don\'t save screenshot')
parser.set_defaults(screenshot=None)
group = parser.add_argument_group('Frequency, create a regular crawler/scheduler')
group.add_argument('-f', '--frequency', type=str, choices=['monthly', 'weekly', 'daily', 'hourly'],
help='monthly, weekly, daily or hourly frequency or specify a custom one with the others arguments')
group.add_argument('--minutes', type=int, help='frequency in minutes')
group.add_argument('--hours', type=int, help='frequency in hours')
group.add_argument('--days', type=int, help='frequency in days')
group.add_argument('--weeks', type=int, help='frequency in weeks')
group.add_argument('--months', type=int, help='frequency in months')
args = parser.parse_args()
if not args.url and not args.key:
parser.print_help()
sys.exit(0)
# Load crawler default config
config_loader = ConfigLoader()
har = args.har
if har is None:
har = config_loader.get_config_boolean('Crawler', 'default_har')
screenshot = args.screenshot
if screenshot is None:
screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
if args.depth:
depth = args.depth
if depth < 0:
raise argparse.ArgumentTypeError(f'Error: Invalid depth {depth}')
else:
depth = 1
# frequency
frequency = {}
if args.frequency:
if args.frequency in ['monthly', 'weekly', 'daily', 'hourly']:
frequency = args.frequency
else:
raise argparse.ArgumentTypeError('Invalid frequency')
elif args.minutes or args.hours or args.days or args.weeks or args.months:
if args.minutes:
check_frequency(args.minutes)
frequency['minutes'] = args.minutes
if args.hours:
check_frequency(args.hours)
frequency['hours'] = args.hours
if args.days:
check_frequency(args.days)
frequency['days'] = args.days
if args.weeks:
check_frequency(args.weeks)
frequency['weeks'] = args.weeks
if args.months:
check_frequency(args.months)
frequency['months'] = args.months
if not frequency:
frequency = None
proxy = args.proxy
if args.cookiejar:
cookiejar = args.cookiejar
else:
cookiejar = None
ail = args.ail
if not ail:
ail = 'https://localhost:7000/'
client = PyAIL(ail, args.key, ssl=False)
r = client.crawl_url(args.url, har=har, screenshot=screenshot, depth_limit=depth, frequency=frequency,
cookiejar=cookiejar, proxy=proxy)
print(r)

View file

@ -1,9 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*-coding:UTF-8 -* # -*-coding:UTF-8 -*
''' """
Flask functions and routes for the rest api Flask functions and routes for the rest api
''' """
import os import os
import re import re
@ -508,6 +508,7 @@ def get_item_cryptocurrency_bitcoin():
# # # # # # # # # # # # # # CRAWLER # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # CRAWLER # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # TODO: ADD RESULT JSON Response # # TODO: ADD RESULT JSON Response
# @restApi.route("api/v1/crawler/task/add", methods=['POST'])
@restApi.route("api/v1/add/crawler/task", methods=['POST']) @restApi.route("api/v1/add/crawler/task", methods=['POST'])
@token_required('analyst') @token_required('analyst')
def add_crawler_task(): def add_crawler_task():