mirror of
https://github.com/ail-project/ail-framework.git
synced 2024-11-26 15:57:16 +00:00
chg: [api crawler] fix response + add cookiejar, proxy and frequency parameters
This commit is contained in:
parent
fe2769308b
commit
68dffcd26b
4 changed files with 134 additions and 9 deletions
|
@ -1723,14 +1723,16 @@ def api_add_crawler_task(data, user_id=None):
|
||||||
|
|
||||||
if frequency:
|
if frequency:
|
||||||
# TODO verify user
|
# TODO verify user
|
||||||
return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags), 200
|
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
|
||||||
else:
|
else:
|
||||||
# TODO HEADERS
|
# TODO HEADERS
|
||||||
# TODO USER AGENT
|
# TODO USER AGENT
|
||||||
return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
|
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
|
||||||
parent='manual', priority=90), 200
|
parent='manual', priority=90)
|
||||||
|
|
||||||
|
return {'uuid': task_uuid}, 200
|
||||||
|
|
||||||
|
|
||||||
#### ####
|
#### ####
|
||||||
|
|
|
@ -42,7 +42,8 @@ class Onion(AbstractModule):
|
||||||
self.faup = crawlers.get_faup()
|
self.faup = crawlers.get_faup()
|
||||||
|
|
||||||
# activate_crawler = p.config.get("Crawler", "activate_crawler")
|
# activate_crawler = p.config.get("Crawler", "activate_crawler")
|
||||||
|
self.har = config_loader.get_config_boolean('Crawler', 'default_har')
|
||||||
|
self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
|
||||||
|
|
||||||
self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
# self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
# self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||||
|
@ -90,8 +91,9 @@ class Onion(AbstractModule):
|
||||||
|
|
||||||
if onion_urls:
|
if onion_urls:
|
||||||
if crawlers.is_crawler_activated():
|
if crawlers.is_crawler_activated():
|
||||||
for domain in domains: # TODO LOAD DEFAULT SCREENSHOT + HAR
|
for domain in domains:
|
||||||
task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0)
|
task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0,
|
||||||
|
har=self.har, screenshot=self.screenshot)
|
||||||
if task_uuid:
|
if task_uuid:
|
||||||
print(f'{domain} added to crawler queue: {task_uuid}')
|
print(f'{domain} added to crawler queue: {task_uuid}')
|
||||||
else:
|
else:
|
||||||
|
|
120
tools/crawler_add_task.py
Executable file
120
tools/crawler_add_task.py
Executable file
|
@ -0,0 +1,120 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Crawler Task Helper
|
||||||
|
================
|
||||||
|
|
||||||
|
Send a crawl request (one-shot or scheduled with a frequency) for a URL to an AIL instance through its API
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from pyail import PyAIL
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.append(os.environ['AIL_BIN'])
|
||||||
|
##################################
|
||||||
|
# Import Project packages
|
||||||
|
##################################
|
||||||
|
from lib.ConfigLoader import ConfigLoader
|
||||||
|
|
||||||
|
def check_frequency(value):
    """Validate a custom frequency amount and return it as an int.

    Returning the converted value makes the function usable both as a plain
    validator and as an argparse ``type=`` callable.

    :param value: frequency amount (an int, or anything ``int()`` accepts)
    :return: the value converted to ``int``
    :raises argparse.ArgumentTypeError: if the converted value is not strictly positive
    :raises ValueError: propagated from ``int()`` when the value is not an integer
    """
    value = int(value)
    if value <= 0:
        raise argparse.ArgumentTypeError(f'Error: Invalid frequency {value}')
    return value
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command line entry point: parse the crawl options, fill the unset HAR /
    # screenshot flags from the AIL configuration, then submit the crawl
    # request to the AIL API and print the response.

    # TODO add c argument for config file
    parser = argparse.ArgumentParser(description='Add a crawler task to an AIL instance')
    parser.add_argument('-u', '--url', type=str, help='URL to crawl', required=True)
    parser.add_argument('-k', '--key', type=str, help='AIL API Key', required=True)
    parser.add_argument('-a', '--ail', type=str, help='AIL URL')
    parser.add_argument('-d', '--depth', type=int, default=1, help='Depth limit')  # TODO improve me
    parser.add_argument('--cookiejar', type=str, help='Cookiejar uuid')
    parser.add_argument('-p', '--proxy', type=str,
                        help='Proxy address to use, "web" and "tor" can be used as shortcut '
                             '(web is used by default)')

    # --har / --no-har share one destination; None means "not specified",
    # so the configured default can be applied afterwards.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--har', dest='har', action='store_true', help='Save HAR')
    group.add_argument('--no-har', dest='har', action='store_false', help="Don't save HAR")
    parser.set_defaults(har=None)

    # Same tri-state handling for the screenshot flag.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--screenshot', dest='screenshot', action='store_true', help='Save screenshot')
    group.add_argument('--no-screenshot', dest='screenshot', action='store_false', help="Don't save screenshot")
    parser.set_defaults(screenshot=None)

    group = parser.add_argument_group('Frequency, create a regular crawler/scheduler')
    group.add_argument('-f', '--frequency', type=str, choices=['monthly', 'weekly', 'daily', 'hourly'],
                       help='monthly, weekly, daily or hourly frequency or specify a custom one with the others arguments')
    group.add_argument('--minutes', type=int, help='frequency in minutes')
    group.add_argument('--hours', type=int, help='frequency in hours')
    group.add_argument('--days', type=int, help='frequency in days')
    group.add_argument('--weeks', type=int, help='frequency in weeks')
    group.add_argument('--months', type=int, help='frequency in months')

    # --url and --key are required, so argparse already exits with a usage
    # message when either one is missing.
    args = parser.parse_args()

    # Load crawler default config
    config_loader = ConfigLoader()
    har = args.har
    if har is None:
        har = config_loader.get_config_boolean('Crawler', 'default_har')
    screenshot = args.screenshot
    if screenshot is None:
        screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')

    # The argparse default guarantees an int here. An explicit 0 is kept
    # as-is (only negative depths are rejected) instead of being rewritten to 1.
    depth = args.depth
    if depth < 0:
        parser.error(f'Error: Invalid depth {depth}')

    # frequency: either one of the named presets (already validated by
    # `choices`), or a dict of custom units, or None for a one-shot task.
    # As before, a preset takes precedence over the custom unit options.
    frequency = None
    if args.frequency:
        frequency = args.frequency
    else:
        custom_frequency = {}
        for unit in ('minutes', 'hours', 'days', 'weeks', 'months'):
            amount = getattr(args, unit)
            # Compare against None so that an explicit 0 is rejected by
            # check_frequency rather than silently ignored.
            if amount is None:
                continue
            try:
                check_frequency(amount)
            except argparse.ArgumentTypeError as e:
                # Report through argparse (usage + exit status 2), not a traceback.
                parser.error(str(e))
            custom_frequency[unit] = amount
        if custom_frequency:
            frequency = custom_frequency

    proxy = args.proxy
    cookiejar = args.cookiejar if args.cookiejar else None

    ail = args.ail
    if not ail:
        ail = 'https://localhost:7000/'

    # NOTE(review): ssl=False presumably disables TLS certificate verification
    # in PyAIL - confirm, and consider exposing it as an option for remote instances.
    client = PyAIL(ail, args.key, ssl=False)
    r = client.crawl_url(args.url, har=har, screenshot=screenshot, depth_limit=depth, frequency=frequency,
                         cookiejar=cookiejar, proxy=proxy)
    print(r)
|
|
@ -1,9 +1,9 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*-coding:UTF-8 -*
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
'''
|
"""
|
||||||
Flask functions and routes for the rest api
|
Flask functions and routes for the rest api
|
||||||
'''
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
@ -508,6 +508,7 @@ def get_item_cryptocurrency_bitcoin():
|
||||||
# # # # # # # # # # # # # # CRAWLER # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # CRAWLER # # # # # # # # # # # # # # # #
|
||||||
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
# # TODO: ADD RESULT JSON Response
|
# # TODO: ADD RESULT JSON Response
|
||||||
|
# @restApi.route("api/v1/crawler/task/add", methods=['POST'])
|
||||||
@restApi.route("api/v1/add/crawler/task", methods=['POST'])
|
@restApi.route("api/v1/add/crawler/task", methods=['POST'])
|
||||||
@token_required('analyst')
|
@token_required('analyst')
|
||||||
def add_crawler_task():
|
def add_crawler_task():
|
||||||
|
|
Loading…
Reference in a new issue