Mirror of https://github.com/ail-project/ail-framework.git

chg: [crawler] submit free text of urls to crawl

Commit 1505bf0157 (parent 9d26a47c17)
4 changed files with 94 additions and 21 deletions

@@ -39,6 +39,7 @@ from packages import git_status
 from packages import Date
 from lib import ail_orgs
 from lib.ConfigLoader import ConfigLoader
+from lib.regex_helper import regex_findall
 from lib.objects.Domains import Domain
 from lib.objects.Titles import Title
 from lib.objects import HHHashs

@@ -183,6 +184,19 @@ def unpack_url(url):
     url_decoded['url'] = url.replace(url_decoded['host'], url_decoded['host'].lower(), 1)
     return url_decoded
 
+
+# TODO options to only extract domains
+# TODO extract onions
+def extract_url_from_text(content):
+    urls = []
+    r_url = r"(?:(?:https?|ftp):\/\/)?(?:\S+(?::\S*)?@)?(?:\[(?:(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,4}:){1,7}:|(?:[A-Fa-f0-9]{1,4}:){1,6}:[A-Fa-f0-9]{1,4}|::(?:[A-Fa-f0-9]{1,4}:){0,5}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,4}:){1,5}::(?:[A-Fa-f0-9]{1,4})?|(?:[A-Fa-f0-9]{1,4}:){1,4}::(?:[A-Fa-f0-9]{1,4}:){0,1}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,3}:){1}::(?:[A-Fa-f0-9]{1,4}:){0,2}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,2}:){1}::(?:[A-Fa-f0-9]{1,4}:){0,3}[A-Fa-f0-9]{1,4}|[A-Fa-f0-9]{1,4}::(?:[A-Fa-f0-9]{1,4}:){0,4}[A-Fa-f0-9]{1,4}|::(?:[A-Fa-f0-9]{1,4}:){0,5}[A-Fa-f0-9]{1,4}|fe80:(?:[A-Fa-f0-9]{0,4}:){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9])?[0-9])\.){3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9])?[0-9]))\]|(?:(?:25[0-5]|2[0-4]\d|1\d\d|\d{1,2})\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|\d{1,2})|(?:(?:[a-zA-Z0-9\-]+\.)+[a-zA-Z]{2,}))(?::\d{2,5})?(?:\/[^\s]*)?"
+    for url in regex_findall('extract_url_from_text', gen_uuid(), r_url, 'user_id', content, max_time=10):
+        urls.append(url)
+    # check if onions
+    return urls
+    # extract onions
+    # extract IP
+
 # # # # # # # #
 #             #
 # FAVICON # TODO REWRITE ME

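Since the scheme is optional in r_url, extract_url_from_text also picks up bare domains and IP:port hosts from free text, not only full http(s) links. The snippet below is a standalone sketch, not part of the commit: it substitutes a deliberately simplified pattern and plain re for the framework's regex_findall timeout wrapper, only to illustrate the kind of matches the helper is after.

import re

# simplified stand-in for r_url: optional scheme, dotted host, optional port and path
SIMPLE_URL = r"(?:https?://)?(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(?::\d{2,5})?(?:/\S*)?"

text = "crawl http://example.onion/login and www.example.com/a?b=1 or example.org"
print(re.findall(SIMPLE_URL, text))
# -> ['http://example.onion/login', 'www.example.com/a?b=1', 'example.org']
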
@@ -1828,8 +1842,9 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
 
 def api_parse_task_dict_basic(data, user_id):
     url = data.get('url', None)
-    if not url or url == '\n':
-        return {'status': 'error', 'reason': 'No url supplied'}, 400
+    urls = data.get('urls', None)
+    if (not url or url == '\n') and not urls:
+        return {'status': 'error', 'reason': 'No url(s) supplied'}, 400
 
     screenshot = data.get('screenshot', False)
     if screenshot:

@@ -1863,14 +1878,20 @@ def api_parse_task_dict_basic(data, user_id):
 
     tags = data.get('tags', [])
 
-    return {'url': url, 'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}, 200
+    data = {'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}
+    if url:
+        data['url'] = url
+    elif urls:
+        data['urls'] = urls
+    return data, 200
 
 def api_add_crawler_task(data, user_org, user_id=None):
     task, resp = api_parse_task_dict_basic(data, user_id)
     if resp != 200:
         return task, resp
 
-    url = task['url']
+    url = task.get('url')
+    urls = task.get('urls')
     screenshot = task['screenshot']
     har = task['har']
     depth_limit = task['depth_limit']

@@ -1920,17 +1941,22 @@ def api_add_crawler_task(data, user_org, user_id=None):
        if max(months, weeks, days, hours, minutes) <= 0:
            return {'error': 'Invalid frequency'}, 400
        frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
 
-    if frequency:
-        # TODO verify user
-        task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
-                                    cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
-    else:
-        # TODO HEADERS
-        # TODO USER AGENT
-        task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
-                                cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
-                                parent='manual', priority=90)
+    if url:
+        if frequency:
+            # TODO verify user
+            task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
+                                        cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
+        else:
+            # TODO HEADERS
+            # TODO USER AGENT
+            task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
+                                    cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
+                                    parent='manual', priority=90)
+    elif urls:
+        for url in urls:
+            task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
+                                    cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
+                                    parent='manual', priority=90)
 
     return {'uuid': task_uuid}, 200

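Taken together, api_parse_task_dict_basic now accepts either a single url or a urls list, and api_add_crawler_task fans a urls list out into one create_task call per entry (scheduling via frequency still applies only to the single-url path). Below is a minimal sketch of driving this from Python, not part of the diff: the org and user values are placeholders, and har, depth, proxy, cookiejar and frequency are left to whatever defaults api_parse_task_dict_basic applies.

from lib import crawlers  # assumption: run inside AIL with its bin/ directory on sys.path

free_text = "please crawl http://example.onion/login and https://www.example.com/item?id=1"
urls = crawlers.extract_url_from_text(free_text)

# 'my-org' and the user_id are placeholders for the calling org/user
data = {'urls': urls, 'screenshot': True, 'tags': []}
resp, status = crawlers.api_add_crawler_task(data, 'my-org', user_id='admin@admin.test')
if status == 200:
    print(resp['uuid'])  # uuid of the last task created from the list
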
@@ -7,7 +7,6 @@ Regex Helper
 
 import os
 import logging.config
-import phonenumbers
 import re
 import sys
 import uuid

@@ -20,7 +19,6 @@ sys.path.append(os.environ['AIL_BIN'])
 ##################################
 from lib import ail_logger
 from lib import ConfigLoader
-# from lib import Statistics
 
 logging.config.dictConfig(ail_logger.get_config())
 logger = logging.getLogger()

@@ -171,6 +169,7 @@ def regex_search(r_key, regex, item_id, content, max_time=30):
 
 ## Phone Regexs ##
 def _regex_phone_iter(r_key, country_code, content):
+    import phonenumbers
     iterator = phonenumbers.PhoneNumberMatcher(content, country_code)
     for match in iterator:
         value = match.raw_string

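Both regex_helper hunks implement one change: the phonenumbers import moves from module level into _regex_phone_iter. Since crawlers.py now imports regex_findall from this module, the deferred import keeps phonenumbers out of the crawler's import chain; only the phone helpers need it, and only when first called. A generic illustration of the pattern (not AIL code):

def find_phone_numbers(content, region='FR'):
    # deferred import: the dependency is resolved on first call, not at module load
    import phonenumbers
    return [m.raw_string for m in phonenumbers.PhoneNumberMatcher(content, region)]
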
@@ -122,6 +122,20 @@ def send_to_spider():
 
     # POST val
     url = request.form.get('url_to_crawl')
+    urls = request.form.get('urls_to_crawl')
+    if urls:
+        urls = crawlers.extract_url_from_text(urls)
+        l_cookiejar = crawlers.api_get_cookiejars_selector(user_org, user_id)
+        crawlers_types = crawlers.get_crawler_all_types()
+        proxies = []  # TODO HANDLE PROXIES
+        return render_template("crawler_manual.html", urls=urls,
+                               is_manager_connected=crawlers.get_lacus_connection_metadata(),
+                               crawlers_types=crawlers_types,
+                               proxies=proxies,
+                               l_cookiejar=l_cookiejar,
+                               tags_selector_data=Tag.get_tags_selector_data())
+
+    urls = request.form.getlist('urls')
     crawler_type = request.form.get('crawler_queue_type')
     screenshot = request.form.get('screenshot')
     har = request.form.get('har')

@@ -185,7 +199,11 @@ def send_to_spider():
         cookiejar_uuid = cookiejar_uuid.rsplit(':')
         cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
 
-    data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
+    data = {'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
+    if url:
+        data['url'] = url
+    if urls:
+        data['urls'] = urls
     if proxy:
         data['proxy'] = proxy
     if cookiejar_uuid:

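On the second submit the reviewed URLs arrive as repeated urls form fields, which Flask exposes through getlist(), while the single-input form still posts url_to_crawl; the endpoint copies whichever is present into data. A self-contained Flask sketch of that distinction, not AIL code (the /demo route and example values are made up; only the field names come from the diff):

from flask import Flask, request

app = Flask(__name__)

@app.route('/demo', methods=['POST'])
def demo():
    url = request.form.get('url_to_crawl')  # None when the multi-URL form was submitted
    urls = request.form.getlist('urls')     # one entry per <input name="urls">, [] if absent
    return {'url': url, 'urls': urls}

# e.g., with app.run():
# curl -d 'urls=http://a.example' -d 'urls=http://b.example' http://127.0.0.1:5000/demo
# -> {"url": null, "urls": ["http://a.example", "http://b.example"]}
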
@@ -43,9 +43,28 @@
     <form action="{{ url_for('crawler_splash.send_to_spider') }}" method='post'>
       <div class="row">
         <div class="col-12 col-lg-6">
-          <div class="input-group" id="date-range-from">
-            <input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
-          </div>
+          {% if urls %}
+            {% for url in urls %}
+              <div class="input-group mb-1">
+                <input type="text" class="form-control col-10" name="urls" value="{{ url }}">
+                <span class="btn btn-danger col-1" id="" onclick="$(this).parent().remove();"><i class="fas fa-trash-alt"></i></span>
+              </div>
+            {% endfor %}
+
+          {% else %}
+            <div class="input-group" id="single_urls">
+              <input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
+              <div class="input-group-append">
+                <button class="btn btn-secondary" type="button" onclick="btn_multiple_urls()"><i class="fa fa-plus"></i> Multiple Urls</button>
+              </div>
+            </div>
+            <div class="input-group" id="multiple_urls">
+              <textarea type="text" class="form-control" id="urls_to_crawl" name="urls_to_crawl" rows="3" placeholder="List Of Urls or Free Text"></textarea>
+              <div class="input-group-append">
+                <button class="btn btn-secondary" type="button" onclick="btn_single_url()"><i class="fa fa-minus"></i> One Url</button>
+              </div>
+            </div>
+          {% endif %}
           <div class="d-flex mt-2">
             <i class="fas fa-spider mt-1"></i> Crawler Type
             <div class="custom-control custom-switch">

@@ -221,6 +240,7 @@ $(document).ready(function(){
     queue_type_selector_input_controler()
     manual_crawler_input_controler();
     $("#custom_frequency").hide();
+    $("#multiple_urls").hide();
 
     $('#crawler_scheduler').on("change", function () {
       manual_crawler_input_controler();

@@ -245,6 +265,16 @@ function toggle_sidebar(){
   }
 }
 
+function btn_single_url() {
+  $("#multiple_urls").hide();
+  $("#single_urls").show()
+}
+
+function btn_multiple_urls() {
+  $("#single_urls").hide()
+  $("#multiple_urls").show();
+}
+
 function manual_crawler_input_controler() {
   if($('#crawler_scheduler').is(':checked')){
     $("#frequency_inputs").show();