mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-22 14:17:16 +00:00
chg: [api crawler] fix response + add cookiejar, proxy and frequency parameters
This commit is contained in:
parent
fe2769308b
commit
68dffcd26b
4 changed files with 134 additions and 9 deletions
|
@ -1723,14 +1723,16 @@ def api_add_crawler_task(data, user_id=None):
|
|||
|
||||
if frequency:
|
||||
# TODO verify user
|
||||
return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags), 200
|
||||
task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
|
||||
else:
|
||||
# TODO HEADERS
|
||||
# TODO USER AGENT
|
||||
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||
task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
|
||||
parent='manual', priority=90), 200
|
||||
parent='manual', priority=90)
|
||||
|
||||
return {'uuid': task_uuid}, 200
|
||||
|
||||
|
||||
#### ####
|
||||
|
|
|
@ -42,7 +42,8 @@ class Onion(AbstractModule):
|
|||
self.faup = crawlers.get_faup()
|
||||
|
||||
# activate_crawler = p.config.get("Crawler", "activate_crawler")
|
||||
|
||||
self.har = config_loader.get_config_boolean('Crawler', 'default_har')
|
||||
self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
|
||||
|
||||
self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
# self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
|
@ -90,8 +91,9 @@ class Onion(AbstractModule):
|
|||
|
||||
if onion_urls:
|
||||
if crawlers.is_crawler_activated():
|
||||
for domain in domains: # TODO LOAD DEFAULT SCREENSHOT + HAR
|
||||
task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0)
|
||||
for domain in domains:
|
||||
task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0,
|
||||
har=self.har, screenshot=self.screenshot)
|
||||
if task_uuid:
|
||||
print(f'{domain} added to crawler queue: {task_uuid}')
|
||||
else:
|
||||
|
|
120
tools/crawler_add_task.py
Executable file
120
tools/crawler_add_task.py
Executable file
|
@ -0,0 +1,120 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
DIR/File Importer Helper
|
||||
================
|
||||
|
||||
Import Content
|
||||
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pyail import PyAIL
|
||||
import sys
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages
|
||||
##################################
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
|
||||
def check_frequency(value):
    """Validate that *value* is a strictly positive integer frequency.

    Accepts anything ``int()`` can convert, so it can also be passed
    directly as an argparse ``type=`` callable.

    Returns:
        int: the validated frequency value (new: original returned None;
        callers that ignore the return are unaffected).

    Raises:
        argparse.ArgumentTypeError: if the value is <= 0.
        ValueError: if the value cannot be converted to int.
    """
    value = int(value)
    if value <= 0:
        raise argparse.ArgumentTypeError(f'Error: Invalid frequency {value}')
    return value
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# TODO add c argument for config file
|
||||
parser = argparse.ArgumentParser(description='Directory or file importer')
|
||||
parser.add_argument('-u', '--url', type=str, help='URL to crawl', required=True)
|
||||
parser.add_argument('-k', '--key', type=str, help='AIL API Key', required=True)
|
||||
parser.add_argument('-a', '--ail', type=str, help='AIL URL')
|
||||
parser.add_argument('-d', '--depth', type=int, default=1, help='Depth limit') # TODO improve me
|
||||
parser.add_argument('--cookiejar', type=str, help='Cookiejar uuid')
|
||||
parser.add_argument('-p', '--proxy', type=str, help='Proxy address to use, "web" and "tor" can be used as shortcut (web is used by default is )')
|
||||
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument('--har', dest='har', action='store_true', help='Save HAR')
|
||||
group.add_argument('--no-har', dest='har', action='store_false', help='Don\'t save HAR')
|
||||
parser.set_defaults(har=None)
|
||||
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument('--screenshot', dest='screenshot', action='store_true', help='Save screenshot')
|
||||
group.add_argument('--no-screenshot', dest='screenshot', action='store_false', help='Don\'t save screenshot')
|
||||
parser.set_defaults(screenshot=None)
|
||||
|
||||
group = parser.add_argument_group('Frequency, create a regular crawler/scheduler')
|
||||
group.add_argument('-f', '--frequency', type=str, choices=['monthly', 'weekly', 'daily', 'hourly'],
|
||||
help='monthly, weekly, daily or hourly frequency or specify a custom one with the others arguments')
|
||||
group.add_argument('--minutes', type=int, help='frequency in minutes')
|
||||
group.add_argument('--hours', type=int, help='frequency in hours')
|
||||
group.add_argument('--days', type=int, help='frequency in days')
|
||||
group.add_argument('--weeks', type=int, help='frequency in weeks')
|
||||
group.add_argument('--months', type=int, help='frequency in months')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.url and not args.key:
|
||||
parser.print_help()
|
||||
sys.exit(0)
|
||||
|
||||
# Load crawler default config
|
||||
config_loader = ConfigLoader()
|
||||
har = args.har
|
||||
if har is None:
|
||||
har = config_loader.get_config_boolean('Crawler', 'default_har')
|
||||
screenshot = args.screenshot
|
||||
if screenshot is None:
|
||||
screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
|
||||
|
||||
if args.depth:
|
||||
depth = args.depth
|
||||
if depth < 0:
|
||||
raise argparse.ArgumentTypeError(f'Error: Invalid depth {depth}')
|
||||
else:
|
||||
depth = 1
|
||||
|
||||
# frequency
|
||||
frequency = {}
|
||||
if args.frequency:
|
||||
if args.frequency in ['monthly', 'weekly', 'daily', 'hourly']:
|
||||
frequency = args.frequency
|
||||
else:
|
||||
raise argparse.ArgumentTypeError('Invalid frequency')
|
||||
elif args.minutes or args.hours or args.days or args.weeks or args.months:
|
||||
if args.minutes:
|
||||
check_frequency(args.minutes)
|
||||
frequency['minutes'] = args.minutes
|
||||
if args.hours:
|
||||
check_frequency(args.hours)
|
||||
frequency['hours'] = args.hours
|
||||
if args.days:
|
||||
check_frequency(args.days)
|
||||
frequency['days'] = args.days
|
||||
if args.weeks:
|
||||
check_frequency(args.weeks)
|
||||
frequency['weeks'] = args.weeks
|
||||
if args.months:
|
||||
check_frequency(args.months)
|
||||
frequency['months'] = args.months
|
||||
if not frequency:
|
||||
frequency = None
|
||||
|
||||
proxy = args.proxy
|
||||
|
||||
if args.cookiejar:
|
||||
cookiejar = args.cookiejar
|
||||
else:
|
||||
cookiejar = None
|
||||
|
||||
ail = args.ail
|
||||
if not ail:
|
||||
ail = 'https://localhost:7000/'
|
||||
|
||||
client = PyAIL(ail, args.key, ssl=False)
|
||||
r = client.crawl_url(args.url, har=har, screenshot=screenshot, depth_limit=depth, frequency=frequency,
|
||||
cookiejar=cookiejar, proxy=proxy)
|
||||
print(r)
|
|
@ -1,9 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
'''
|
||||
"""
|
||||
Flask functions and routes for the rest api
|
||||
'''
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
|
@ -508,6 +508,7 @@ def get_item_cryptocurrency_bitcoin():
|
|||
# # # # # # # # # # # # # # CRAWLER # # # # # # # # # # # # # # # #
|
||||
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||
# # TODO: ADD RESULT JSON Response
|
||||
# @restApi.route("api/v1/crawler/task/add", methods=['POST'])
|
||||
@restApi.route("api/v1/add/crawler/task", methods=['POST'])
|
||||
@token_required('analyst')
|
||||
def add_crawler_task():
|
||||
|
|
Loading…
Reference in a new issue