From 501d10bbbd4ab603c9572e1fea36dbdfad6178c0 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 20 Jun 2023 08:11:44 +0200
Subject: [PATCH] chg: [crawler] auto tag crawled domains

---
 bin/crawlers/Crawler.py                            |  5 +-
 bin/lib/crawlers.py                                | 58 +++++++++++++++----
 var/www/blueprints/crawler_splash.py               | 35 ++++++++++-
 .../crawler_splash/crawler_manual.html             | 23 +++++++-
 .../crawler_splash/crawler_schedule_uuid.html      |  8 +++
 .../crawler_scheduler_dashboard.html               |  9 ++-
 6 files changed, 121 insertions(+), 17 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 5f075038..be615993 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -200,10 +200,13 @@ class Crawler(AbstractModule):
             self.save_capture_response(parent_id, entries)

         self.domain.update_daterange(self.date.replace('/', ''))
-        # Origin + History
+        # Origin + History + tags
         if self.root_item:
             self.domain.set_last_origin(parent_id)
             self.domain.add_history(epoch, root_item=self.root_item)
+            # Tags
+            for tag in task.get_tags():
+                self.domain.add_tag(tag)
         elif self.domain.was_up():
             self.domain.add_history(epoch, root_item=epoch)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 1883490e..300edb66 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -967,6 +967,7 @@ class CrawlerScheduler:
                 task_uuid = create_task(meta['url'], depth=meta['depth'], har=meta['har'],
                                         screenshot=meta['screenshot'], header=meta['header'],
                                         cookiejar=meta['cookiejar'], proxy=meta['proxy'],
+                                        tags=meta['tags'],
                                         user_agent=meta['user_agent'], parent='scheduler', priority=40)
                 if task_uuid:
                     schedule.set_task(task_uuid)
@@ -1069,6 +1070,14 @@ class CrawlerSchedule:
     def _set_field(self, field, value):
         return r_crawler.hset(f'schedule:{self.uuid}', field, value)

+    def get_tags(self):
+        return r_crawler.smembers(f'schedule:tags:{self.uuid}')
+
+    def set_tags(self, tags=[]):
+        for tag in tags:
+            r_crawler.sadd(f'schedule:tags:{self.uuid}', tag)
+            # Tag.create_custom_tag(tag)
+
     def get_meta(self, ui=False):
         meta = {
             'uuid': self.uuid,
@@ -1083,6 +1092,7 @@
             'cookiejar': self.get_cookiejar(),
             'header': self.get_header(),
             'proxy': self.get_proxy(),
+            'tags': self.get_tags(),
         }
         status = self.get_status()
         if ui:
@@ -1098,6 +1108,7 @@
         meta = {'uuid': self.uuid,
                 'url': self.get_url(),
                 'user': self.get_user(),
+                'tags': self.get_tags(),
                 'next_run': self.get_next_run(r_str=True)}
         status = self.get_status()
         if isinstance(status, ScheduleStatus):
@@ -1106,7 +1117,7 @@
         return meta

     def create(self, frequency, user, url,
-               depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
+               depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None, tags=[]):

         if self.exists():
             raise Exception('Error: Monitor already exists')
@@ -1135,6 +1146,9 @@
         if user_agent:
             self._set_field('user_agent', user_agent)

+        if tags:
+            self.set_tags(tags)
+
         r_crawler.sadd('scheduler:schedules', self.uuid)

     def delete(self):
@@ -1148,12 +1162,13 @@

         # delete meta
         r_crawler.delete(f'schedule:{self.uuid}')
+        r_crawler.delete(f'schedule:tags:{self.uuid}')
         r_crawler.srem('scheduler:schedules', self.uuid)

-def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
+def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None, tags=[]):
     schedule_uuid = gen_uuid()
     schedule = CrawlerSchedule(schedule_uuid)
-    schedule.create(frequency, user, url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar, proxy=proxy, user_agent=user_agent)
+    schedule.create(frequency, user, url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar, proxy=proxy, user_agent=user_agent, tags=tags)
     return schedule_uuid

 # TODO sanityze UUID
@@ -1289,6 +1304,11 @@ def get_captures_status():
             status.append(meta)
     return status

+def delete_captures():
+    for capture_uuid in get_crawler_captures():
+        capture = CrawlerCapture(capture_uuid)
+        capture.delete()
+
 ##-- CRAWLER STATE --##


@@ -1371,6 +1391,14 @@ class CrawlerTask:
     def _set_field(self, field, value):
         return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)

+    def get_tags(self):
+        return r_crawler.smembers(f'crawler:task:tags:{self.uuid}')
+
+    def set_tags(self, tags):
+        for tag in tags:
+            r_crawler.sadd(f'crawler:task:tags:{self.uuid}', tag)
+            # Tag.create_custom_tag(tag)
+
     def get_meta(self):
         meta = {
             'uuid': self.uuid,
@@ -1385,6 +1413,7 @@
             'header': self.get_header(),
             'proxy': self.get_proxy(),
             'parent': self.get_parent(),
+            'tags': self.get_tags(),
         }
         return meta

@@ -1392,7 +1421,7 @@
     # TODO SANITIZE PRIORITY
     # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
     def create(self, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
-               user_agent=None, parent='manual', priority=0):
+               user_agent=None, tags=[], parent='manual', priority=0):

         if self.exists():
             raise Exception('Error: Task already exists')
@@ -1423,7 +1452,7 @@
         # TODO SANITIZE COOKIEJAR -> UUID

         # Check if already in queue
-        hash_query = get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header)
+        hash_query = get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header, tags)
         if r_crawler.hexists(f'crawler:queue:hash', hash_query):
             self.uuid = r_crawler.hget(f'crawler:queue:hash', hash_query)
             return self.uuid
@@ -1444,6 +1473,9 @@
         if user_agent:
             self._set_field('user_agent', user_agent)

+        if tags:
+            self.set_tags(tags)
+
         r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
         self._set_field('hash', hash_query)
         r_crawler.zadd('crawler:queue', {self.uuid: priority})
@@ -1483,10 +1515,10 @@


 # TODO move to class ???
-def get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header):
+def get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header, tags):
     to_enqueue = {'domain': domain, 'depth': depth, 'har': har, 'screenshot': screenshot,
                   'priority': priority, 'proxy': proxy, 'cookiejar': cookiejar, 'user_agent': user_agent,
-                  'header': header}
+                  'header': header, 'tags': tags}
     if priority != 0:
         to_enqueue['url'] = url
     return hashlib.sha512(pickle.dumps(to_enqueue)).hexdigest()
@@ -1502,7 +1534,7 @@ def add_task_to_lacus_queue():

 # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
 def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
-                user_agent=None, parent='manual', priority=0, task_uuid=None):
+                user_agent=None, tags=[], parent='manual', priority=0, task_uuid=None):
     if task_uuid:
         if CrawlerTask(task_uuid).exists():
             task_uuid = gen_uuid()
@@ -1510,7 +1542,7 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
         task_uuid = gen_uuid()
     task = CrawlerTask(task_uuid)
     task_uuid = task.create(url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar,
-                            proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
+                            proxy=proxy, user_agent=user_agent, tags=tags, parent=parent, priority=priority)
     return task_uuid

@@ -1586,15 +1618,17 @@ def api_add_crawler_task(data, user_id=None):
         if verify[1] != 200:
             return verify

+    tags = data.get('tags', [])
+
     if frequency:
         # TODO verify user
         return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
-                               cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None), 200
+                               cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags), 200
     else:
         # TODO HEADERS
         # TODO USER AGENT
         return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
-                           cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None,
+                           cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
                            parent='manual', priority=90), 200

@@ -1870,6 +1904,8 @@ def test_ail_crawlers():
     load_blacklist()

 # if __name__ == '__main__':
+#     delete_captures()
+
 #     item_id = 'crawled/2023/02/20/data.gz'
 #     item = Item(item_id)
 #     content = item.get_content()
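For reference, a minimal sketch of how the new tags parameter of create_task() can be exercised from Python. This is a sketch only: it assumes an initialized AIL environment with AIL_BIN set and the crawler database reachable, and the URL and tag values below are placeholders, not defaults.

    import os
    import sys
    sys.path.append(os.environ['AIL_BIN'])  # assumes a configured AIL install
    from lib import crawlers

    # Queue a one-shot crawl; the tags are stored in crawler:task:tags:<uuid>
    # and applied to the Domain by bin/crawlers/Crawler.py once the capture
    # has been processed.
    task_uuid = crawlers.create_task('http://example.onion',               # placeholder URL
                                     depth=1, har=True, screenshot=True,
                                     tags=['tlp:green', 'my-custom-tag'],  # example tag values
                                     parent='manual', priority=90)
    print(crawlers.CrawlerTask(task_uuid).get_tags())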
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 52ac5be9..39d84971 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -96,7 +96,8 @@ def manual():
                            is_manager_connected=crawlers.get_lacus_connection_metadata(),
                            crawlers_types=crawlers_types,
                            proxies=proxies,
-                           l_cookiejar=l_cookiejar)
+                           l_cookiejar=l_cookiejar,
+                           tags_selector_data=Tag.get_tags_selector_data())


 @crawler_splash.route("/crawlers/send_to_spider", methods=['POST'])
@@ -113,6 +114,34 @@ def send_to_spider():
     depth_limit = request.form.get('depth_limit')
     cookiejar_uuid = request.form.get('cookiejar')

+    # TAGS
+    tags = request.form.get("tags", [])
+    taxonomies_tags = request.form.get('taxonomies_tags')
+    if taxonomies_tags:
+        try:
+            taxonomies_tags = json.loads(taxonomies_tags)
+        except:
+            taxonomies_tags = []
+    else:
+        taxonomies_tags = []
+    galaxies_tags = request.form.get('galaxies_tags')
+    if galaxies_tags:
+        try:
+            galaxies_tags = json.loads(galaxies_tags)
+        except:
+            galaxies_tags = []
+    else:
+        galaxies_tags = []
+    # custom tags
+    if tags:
+        tags = tags.split()
+    else:
+        tags = []
+    escaped = []
+    for tag in tags:
+        escaped.append(tag)
+    tags = escaped + taxonomies_tags + galaxies_tags
+
     # Frequency
     if request.form.get('crawler_scheduler'):
         frequency = request.form.get('frequency')
@@ -147,6 +176,8 @@
         data['proxy'] = proxy
     if cookiejar_uuid:
         data['cookiejar'] = cookiejar_uuid
+    if tags:
+        data['tags'] = tags
     # print(data)

     res = crawlers.api_add_crawler_task(data, user_id=user_id)
@@ -163,6 +194,7 @@ def scheduler_dashboard():
     # print(schedulers)
     # TODO list currently queued ?
     return render_template("crawler_scheduler_dashboard.html",
+                           bootstrap_label=bootstrap_label,
                            schedulers=schedulers,
                            is_manager_connected=crawlers.get_lacus_connection_metadata())

@@ -176,6 +208,7 @@ def schedule_show():
         abort(404)
     meta = schedule.get_meta(ui=True)
     return render_template("crawler_schedule_uuid.html",
+                           bootstrap_label=bootstrap_label,
                            meta=meta)

 @crawler_splash.route("/crawlers/schedule/delete", methods=['GET'])
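The tag-related form fields parsed by send_to_spider() above could be filled like this from an HTTP client. Sketch only: the host, the authenticated session and the remaining crawler form fields (such as 'url') are assumptions, and the tag values are placeholders.

    import json
    import requests

    form = {
        'url': 'http://example.onion',                 # assumed field name for the target URL
        'tags': 'my-custom-tag another-tag',           # space-separated custom tags
        'taxonomies_tags': json.dumps(['tlp:green']),  # JSON-encoded list of taxonomy tags
        'galaxies_tags': json.dumps([]),               # JSON-encoded list of galaxy tags
    }
    # requests.post('https://<ail-host>/crawlers/send_to_spider', data=form,
    #               cookies=login_cookies, verify=False)  # login_cookies: an authenticated session (assumed)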
diff --git a/var/www/templates/crawler/crawler_splash/crawler_manual.html b/var/www/templates/crawler/crawler_splash/crawler_manual.html
index 94e376cf..72a5e47b 100644
--- a/var/www/templates/crawler/crawler_splash/crawler_manual.html
+++ b/var/www/templates/crawler/crawler_splash/crawler_manual.html
@@ -8,14 +8,16 @@
+
-
-
-
+
+
+
+
@@ -119,6 +121,21 @@
+
+
+					Tags
+
+
+
+
+
+
+					{% include 'tags/block_tags_selector.html' %}
+
+
diff --git a/var/www/templates/crawler/crawler_splash/crawler_schedule_uuid.html b/var/www/templates/crawler/crawler_splash/crawler_schedule_uuid.html
index e925ff3e..86325c5b 100644
--- a/var/www/templates/crawler/crawler_splash/crawler_schedule_uuid.html
+++ b/var/www/templates/crawler/crawler_splash/crawler_schedule_uuid.html
@@ -72,6 +72,14 @@
             {% endif %}
+
+            Tags
+
+                {%for tag in meta['tags']%}
+                    {{ tag }}
+                {%endfor%}
+
+
diff --git a/var/www/templates/crawler/crawler_splash/crawler_scheduler_dashboard.html b/var/www/templates/crawler/crawler_splash/crawler_scheduler_dashboard.html
index 160365d0..eace714b 100644
--- a/var/www/templates/crawler/crawler_splash/crawler_scheduler_dashboard.html
+++ b/var/www/templates/crawler/crawler_splash/crawler_scheduler_dashboard.html
@@ -45,7 +45,14 @@
             {% for meta in schedulers %}
-                {{ meta['url'] }}
+
+                    {{ meta['url'] }}
+
+                    {% for tag in meta['tags'] %}
+                        {{ tag }}
+                    {%endfor%}
+
+
                 {{ meta['status'] }}
                 {% if not meta['next_run'] %}
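Once a schedule or task with tags exists, the stored values can be checked directly, since the patch keeps them in plain sets (schedule:tags:<uuid> and crawler:task:tags:<uuid>). A sketch, again assuming an initialized AIL environment with AIL_BIN set:

    import os
    import sys
    sys.path.append(os.environ['AIL_BIN'])  # assumes a configured AIL install
    from lib import crawlers

    # 'scheduler:schedules' is the set of schedule UUIDs maintained by CrawlerSchedule.create()
    for schedule_uuid in crawlers.r_crawler.smembers('scheduler:schedules'):
        schedule = crawlers.CrawlerSchedule(schedule_uuid)
        print(schedule_uuid, schedule.get_tags())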