misp-galaxy/tools/mkdocs/generator.py

499 lines
21 KiB
Python
Raw Normal View History

#!/usr/bin/python
import json
2024-02-01 10:05:45 +00:00
import operator
import os
2024-02-01 10:05:45 +00:00
import re
from typing import List
import validators
2024-01-31 13:09:30 +00:00
# Directory that main() scans for galaxy cluster JSON files.
CLUSTER_PATH = '../../clusters'
# Directory into which the generated Markdown pages are written.
SITE_PATH = './site/docs'
# PROJECTS_PATH = './site/projects'
FILES_TO_IGNORE = [] # if you want to skip a specific cluster in the generation
# Variables for statistics
# Relations whose destination UUID is present in the loaded clusters.
public_relations_count = 0
# Relations whose (non-empty) destination UUID is not among the loaded clusters.
private_relations_count = 0
# Distinct destination UUIDs that could not be resolved to a loaded cluster.
private_clusters = []
# Cluster UUID -> name of the galaxy the cluster belongs to.
public_clusters_dict = {}
# Cluster value -> number of relations collected for that cluster.
relation_count_dict = {}
# Cluster value -> number of synonyms listed in its meta section.
synonyms_count_dict = {}
# Cluster value -> number of relations with an empty destination UUID.
empty_uuids_dict = {}
2024-02-01 10:05:45 +00:00
# Markdown placed at the top of the generated index page; the list of galaxy
# links is appended after the STATISTICS block by create_index().
INTRO = """
# MISP Galaxy
The MISP galaxy offers a streamlined approach for representing large entities, known as clusters, which can be linked to MISP events or attributes. Each cluster consists of one or more elements, represented as key-value pairs. MISP galaxy comes with a default knowledge base, encompassing areas like Threat Actors, Tools, Ransomware, and ATT&CK matrices. However, users have the flexibility to modify, update, replace, or share these elements according to their needs.
Clusters and vocabularies within MISP galaxy can be utilized in their original form or as a foundational knowledge base. The distribution settings for each cluster can be adjusted, allowing for either restricted or wide dissemination.
Additionally, MISP galaxies enable the representation of existing standards like the MITRE ATT&CK framework, as well as custom matrices.
The aim is to provide a core set of clusters for organizations embarking on analysis, which can be further tailored to include localized, private information or additional, shareable data.
Clusters serve as an open and freely accessible knowledge base, which can be utilized and expanded within [MISP](https://www.misp-project.org/) or other threat intelligence platforms.
![Overview of the integration of MISP galaxy in the MISP Threat Intelligence Sharing Platform](https://raw.githubusercontent.com/MISP/misp-galaxy/aa41337fd78946a60aef3783f58f337d2342430a/doc/images/galaxy.png)
## Publicly available clusters
"""
# Markdown pointing from the index page to the generated statistics page.
STATISTICS= """
## Statistics
You can find some statistics about MISP galaxies [here](./statistics.md).
"""
# Markdown appended at the very end of the index page.
CONTRIBUTING = """
# Contributing
In the dynamic realm of threat intelligence, a variety of models and approaches exist to systematically organize, categorize, and delineate threat actors, hazards, or activity groups. We embrace innovative methodologies for articulating threat intelligence. The galaxy model is particularly versatile, enabling you to leverage and integrate methodologies that you trust and are already utilizing within your organization or community.
We encourage collaboration and contributions to the [MISP Galaxy JSON files](https://github.com/MISP/misp-galaxy/). Feel free to fork the project, enhance existing elements or clusters, or introduce new ones. Your insights are valuable - share them with us through a pull-request.
"""
class Galaxy():
    """One galaxy (a cluster JSON file) and the Markdown page generated for it.

    The page is accumulated in ``self.entry`` by the ``_create_*`` helpers and
    consists of YAML front matter, a title, the description, an optional
    authors table and one section per cluster.
    """

    def __init__(self, cluster_list: List[dict], authors, description, name, json_file_name):
        """Store the galaxy metadata and wrap every raw cluster dict.

        Args:
            cluster_list: Raw cluster dictionaries (the ``values`` of the JSON).
            authors: Iterable of author names; may be empty or ``None``.
            description: Galaxy description text.
            name: Human readable galaxy name.
            json_file_name: Name used for the output directory of this galaxy.
        """
        self.cluster_list = cluster_list
        self.authors = authors
        self.description = description
        self.name = name
        self.json_file_name = json_file_name
        self.clusters = self._create_clusters()
        self.entry = ""

    def _create_metadata_entry(self):
        """Append the YAML front matter (title and description)."""
        # Double quotes are replaced so they cannot unbalance the front matter.
        meta_description = self.description.replace("\"", "-")
        self.entry += "---\n"
        self.entry += f'title: {self.name}\n'
        self.entry += f'description: {meta_description}\n'
        self.entry += "---\n"

    def _create_title_entry(self):
        """Append the level-1 heading."""
        self.entry += f'# {self.name}\n'

    def _create_description_entry(self):
        """Append the galaxy description."""
        self.entry += f'{self.description}\n'

    def _create_authors_entry(self):
        """Append a collapsible authors table when authors are known."""
        if not self.authors:
            return
        # NOTE(review): collapsible "???" blocks normally need their body
        # indented by four spaces; the single leading space in the literals
        # below is kept exactly as found - confirm against the rendered site.
        self.entry += f'\n'
        self.entry += f'??? info "Authors"\n'
        self.entry += f'\n'
        self.entry += f' | Authors and/or Contributors|\n'
        self.entry += f' |----------------------------|\n'
        for author in self.authors:
            self.entry += f' |{author}|\n'

    def _create_clusters(self):
        """Build a ``Cluster`` object for every raw cluster dictionary."""
        return [
            Cluster(
                value=cluster.get('value', None),
                description=cluster.get('description', None),
                uuid=cluster.get('uuid', None),
                date=cluster.get('date', None),
                related_list=cluster.get('related', None),
                meta=cluster.get('meta', None),
                galaxie=self.name,
                galaxie_file_name=self.json_file_name,
            )
            for cluster in self.cluster_list
        ]

    def _create_clusters_entry(self, cluster_dict):
        """Append the rendered section of every cluster."""
        for cluster in self.clusters:
            self.entry += cluster.create_entry(cluster_dict)

    def create_entry(self, cluster_dict):
        """Render the complete galaxy page and return it.

        Args:
            cluster_dict: Mapping of cluster UUID to ``Cluster`` used to
                resolve relations between clusters.

        Returns:
            The Markdown text of the page (also stored in ``self.entry``).
        """
        # Start from scratch so that rendering twice does not duplicate the
        # whole page inside self.entry.
        self.entry = ""
        self._create_metadata_entry()
        self._create_title_entry()
        self._create_description_entry()
        self._create_authors_entry()
        self._create_clusters_entry(cluster_dict)
        return self.entry

    def write_entry(self, path, cluster_dict):
        """Render the page and write it to ``<path>/<json_file_name>/index.md``.

        Missing directories (including parents of ``path``) are created.
        """
        self.create_entry(cluster_dict)
        galaxy_path = os.path.join(path, self.json_file_name)
        os.makedirs(galaxy_path, exist_ok=True)
        # Cluster data regularly contains non-ASCII text; do not depend on the
        # platform default encoding.
        with open(os.path.join(galaxy_path, 'index.md'), "w", encoding="utf-8") as index:
            index.write(self.entry)
class Cluster():
    """A single galaxy cluster and the Markdown section generated for it.

    The section is accumulated in ``self.entry``. Relations to other clusters
    are resolved through a UUID -> ``Cluster`` mapping and written to a
    separate ``relations/<uuid>.md`` page.

    NOTE(review): the literals that form the body of the collapsible "???"
    blocks start with a single space, exactly as found; such blocks normally
    need a four-space indent - confirm against the rendered site.
    """

    def __init__(self, description, uuid, date, value, related_list, meta, galaxie, galaxie_file_name):
        """Store the cluster fields and register public clusters.

        A cluster with a truthy ``galaxie`` is recorded in the module-level
        ``public_clusters_dict`` (UUID -> galaxy name).
        """
        self.description = description
        self.uuid = uuid
        self.date = date
        self.value = value
        self.related_list = related_list
        self.meta = meta
        self.entry = ""
        self.galaxie = galaxie
        self.galaxie_file_name = galaxie_file_name
        if self.galaxie:
            public_clusters_dict[self.uuid] = self.galaxie

    def _create_title_entry(self):
        """Append the level-2 heading for this cluster."""
        self.entry += f'## {self.value}\n'
        self.entry += f'\n'

    def _create_description_entry(self):
        """Append the description when there is one."""
        if self.description:
            self.entry += f'{self.description}\n'

    def _create_synonyms_entry(self):
        """Append the synonyms table and record the synonym count."""
        if not (isinstance(self.meta, dict) and self.meta.get('synonyms')):
            return
        self.entry += f'\n'
        self.entry += f'??? info "Synonyms"\n'
        self.entry += f'\n'
        self.entry += f' "synonyms" in the meta part typically refer to alternate names or labels that are associated with a particular {self.value}.\n\n'
        self.entry += f' | Known Synonyms |\n'
        self.entry += f' |---------------------|\n'
        synonyms = sorted(self.meta['synonyms'])
        for synonym in synonyms:
            self.entry += f' | `{synonym}` |\n'
        synonyms_count_dict[self.value] = len(synonyms)

    def _create_uuid_entry(self):
        """Append the UUID reference block when the cluster has a UUID."""
        if self.uuid:
            self.entry += f'\n'
            self.entry += f'??? tip "Internal MISP references"\n'
            self.entry += f'\n'
            self.entry += f' UUID `{self.uuid}` which can be used as unique global reference for `{self.value}` in MISP communities and other software using the MISP galaxy\n'
            self.entry += f'\n'

    def _create_refs_entry(self):
        """Append the external references; valid URLs also get an archive link."""
        if not (isinstance(self.meta, dict) and self.meta.get('refs')):
            return
        self.entry += f'\n'
        self.entry += f'??? info "External references"\n'
        self.entry += f'\n'
        for ref in self.meta['refs']:
            if validators.url(ref):
                self.entry += f' - [{ref}]({ref}) - :material-archive: :material-arrow-right: [webarchive](https://web.archive.org/web/*/{ref})\n'
            else:
                self.entry += f' - {ref}\n'
        self.entry += f'\n'

    def _create_associated_metadata_entry(self):
        """Append a table of all meta keys except synonyms and refs."""
        if not isinstance(self.meta, dict):
            return
        excluded_meta = ['synonyms', 'refs']
        self.entry += f'\n'
        self.entry += f'??? info "Associated metadata"\n'
        self.entry += f'\n'
        self.entry += f' |Metadata key {{ .no-filter }} |Value|\n'
        self.entry += f' |-----------------------------------|-----|\n'
        for meta in sorted(self.meta.keys()):
            if meta not in excluded_meta:
                self.entry += f' | {meta} | {self.meta[meta]} |\n'

    @staticmethod
    def _dedupe_relations(relations):
        """Keep one relation per unordered cluster pair, preferring the lowest level.

        Args:
            relations: List of ``(source, destination, level)`` tuples.

        Returns:
            A new list in the original order. On equal levels the first
            occurrence wins.
        """
        best_by_pair = {}  # (cluster, cluster) -> index of the kept relation
        for index, relation in enumerate(relations):
            forward = (relation[0], relation[1])
            backward = (relation[1], relation[0])
            if forward in best_by_pair:
                key = forward
            elif backward in best_by_pair:
                key = backward
            else:
                best_by_pair[forward] = index
                continue
            if relations[best_by_pair[key]][2] > relation[2]:
                best_by_pair[key] = index
        # Select by position rather than by tuple equality: two identical
        # relations compare equal, and removing "by value" would drop both.
        return [relations[index] for index in sorted(best_by_pair.values())]

    def get_related_clusters(self, cluster_dict, depth=-1, visited=None, level=1):
        """Collect the relations reachable from this cluster.

        Args:
            cluster_dict: Mapping of UUID -> ``Cluster`` for all known clusters.
            depth: Maximum number of hops to follow; ``-1`` means unlimited.
            visited: UUID -> lowest level at which a cluster was expanded;
                shared across the recursion.
            level: Distance of this cluster from the starting cluster.

        Returns:
            A list of ``(source, destination, level)`` tuples. Destinations
            whose UUID is unknown are represented by a placeholder cluster
            named "Private Cluster". Module-level statistics counters are
            updated as a side effect.
        """
        global public_relations_count
        global private_relations_count

        if visited is None:
            visited = {}
        related_clusters = []
        if depth == 0 or not self.related_list:
            return related_clusters
        # Do not expand a cluster again unless it is reached on a shorter path.
        if self.uuid in visited and visited[self.uuid] <= level:
            return related_clusters
        visited[self.uuid] = level

        empty_uuids = 0
        follow = depth > 1 or depth == -1
        for cluster in self.related_list:
            dest_uuid = cluster["dest-uuid"]

            # Destination is not a loaded cluster: empty UUID or private cluster.
            if dest_uuid not in cluster_dict:
                if not dest_uuid:
                    empty_uuids += 1
                    continue
                private_relations_count += 1
                if dest_uuid not in private_clusters:
                    private_clusters.append(dest_uuid)
                placeholder = Cluster(value="Private Cluster", uuid=dest_uuid, date=None, description=None, related_list=None, meta=None, galaxie=None, galaxie_file_name=None)
                related_clusters.append((self, placeholder, level))
                continue

            related_cluster = cluster_dict[dest_uuid]
            public_relations_count += 1
            related_clusters.append((self, related_cluster, level))

            if follow and (dest_uuid not in visited or visited[dest_uuid] > level + 1):
                new_depth = depth - 1 if depth > 1 else -1
                related_clusters += related_cluster.get_related_clusters(cluster_dict, new_depth, visited, level + 1)

        if empty_uuids > 0:
            empty_uuids_dict[self.value] = empty_uuids

        return self._dedupe_relations(related_clusters)

    def _create_related_entry(self):
        """Append the block that links to the separate relations page."""
        self.entry += f'\n'
        self.entry += f'??? info "Related clusters"\n'
        self.entry += f'\n'
        self.entry += f' To see the related clusters, click [here](./relations/{self.uuid}.md).\n'

    @staticmethod
    def _anchor(value):
        """Return the in-page anchor used for a cluster heading."""
        return value.lower().replace(" ", "-").replace("/", "").replace(":", "")

    def _get_related_entry(self, relations):
        """Render the relations page for this cluster.

        Args:
            relations: ``(source, destination, level)`` tuples as returned by
                ``get_related_clusters``.

        Returns:
            Markdown containing one table row per relation. A destination
            without a galaxy file name (the private placeholder) is printed
            as plain text because there is no page to link to.
        """
        output = ""
        output += f'## Related clusters for {self.value}\n'
        output += f'\n'
        output += f'| Cluster A | Cluster B | Level {{ .graph }} |\n'
        output += f'|-----------|-----------|-------|\n'
        for source, destination, level in relations:
            source_cell = f'[{source.value} ({source.uuid})](../../{source.galaxie_file_name}/index.md#{self._anchor(source.value)})'
            if destination.galaxie_file_name is None:
                destination_cell = f'{destination.value} ({destination.uuid})'
            else:
                destination_cell = f'[{destination.value} ({destination.uuid})](../../{destination.galaxie_file_name}/index.md#{self._anchor(destination.value)})'
            output += f'| {source_cell} | {destination_cell} | {level} |\n'
        return output

    def create_entry(self, cluster_dict):
        """Render this cluster's section and, if it has relations, its relations page.

        Returns:
            The Markdown text of the section (also stored in ``self.entry``).
        """
        # Reset so that rendering twice does not duplicate the section.
        self.entry = ""
        self._create_title_entry()
        self._create_description_entry()
        self._create_synonyms_entry()
        self._create_uuid_entry()
        self._create_refs_entry()
        self._create_associated_metadata_entry()
        if self.related_list:
            self._create_related_entry()
            self._write_relations(cluster_dict, SITE_PATH)
        return self.entry

    def _write_relations(self, cluster_dict, path):
        """Write ``<path>/<galaxy>/relations/<uuid>.md`` and record the relation count."""
        related_clusters = self.get_related_clusters(cluster_dict)
        relation_count_dict[self.value] = len(related_clusters)
        relation_path = os.path.join(path, self.galaxie_file_name, 'relations')
        os.makedirs(relation_path, exist_ok=True)
        # The .pages file carries a "hide: true" setting for this directory.
        with open(os.path.join(relation_path, ".pages"), "w", encoding="utf-8") as index:
            index.write(f'hide: true\n')
        with open(os.path.join(relation_path, f'{self.uuid}.md'), "w", encoding="utf-8") as index:
            index.write(self._get_related_entry(related_clusters))
def create_index(galaxies):
    """Build the Markdown of the site index page.

    The page is the intro text, the statistics pointer, one bullet link per
    galaxy (in the given order) and finally the contributing text.
    """
    galaxy_links = "".join(
        f'- [{galaxie.name}](./{galaxie.json_file_name}/index.md)\n'
        for galaxie in galaxies
    )
    return INTRO + STATISTICS + galaxy_links + CONTRIBUTING
2024-02-01 10:05:45 +00:00
def create_galaxies(galaxies, cluster_dict):
    """Render every galaxy and return a mapping of file name to page text.

    Galaxies are rendered in the given order; each one is asked for its page
    via ``create_entry(cluster_dict)``.
    """
    return {
        galaxie.json_file_name: galaxie.create_entry(cluster_dict)
        for galaxie in galaxies
    }
2024-02-01 10:05:45 +00:00
def create_xy_chart(title, width, height, x_axis, y_axis, bar):
    """Return a fenced mermaid ``xychart-beta`` block as Markdown text.

    All arguments are interpolated verbatim: ``width``/``height`` go into the
    chart config, ``x_axis`` between square brackets, ``title`` and ``y_axis``
    between double quotes, and ``bar`` after the ``bar`` keyword.
    """
    lines = [
        '```mermaid',
        '---',
        'config:',
        ' xyChart:',
        f' width: {width}',
        f' height: {height}',
        '---',
        'xychart-beta',
        f' title "{title}"',
        f' x-axis [{x_axis}]',
        f' y-axis "{y_axis}"',
        f' bar {bar}',
        '```',
        '',
    ]
    # Every line, including the final empty one, is newline-terminated.
    return "".join(line + '\n' for line in lines)
def create_pie_chart(title, cakepieces):
    """Return a fenced mermaid pie chart as Markdown text.

    ``cakepieces`` is an iterable of indexable pairs: element 0 is the label
    and element 1 the value of a slice.
    """
    slices = "".join(f' "{piece[0]}" : {piece[1]}\n' for piece in cakepieces)
    header = f'```mermaid\n' + f'pie showData\n' + f' title {title}\n'
    footer = f'```\n' + f'\n'
    return header + slices + footer
def get_top_x(dict, x, big_to_small=True):
    """Return the ``x`` highest (or lowest) ranked entries of a counter mapping.

    Args:
        dict: Mapping of name -> comparable count.
        x: Maximum number of entries to return.
        big_to_small: Rank descending when true, ascending otherwise.

    Returns:
        A ``(names, counts)`` pair: ``names`` is one string of the ranked keys
        joined by ``", "`` with every character other than ASCII letters,
        digits and spaces removed; ``counts`` is the list of matching values.
    """
    ranked = sorted(dict.items(), key=operator.itemgetter(1), reverse=big_to_small)[:x]
    names = ", ".join(re.sub(r"[^A-Za-z0-9 ]", "", name) for name, _ in ranked)
    # The ranked pairs are ordered by value, so their values are already the
    # sorted counts.
    counts = [count for _, count in ranked]
    return names, counts
2024-02-01 10:05:45 +00:00
def _render_top_table(column, counts, big_to_small=True, limit=20):
    """Render one ranked "No. / <column> / Count" bar-chart table.

    Args:
        column: Heading of the name column.
        counts: Mapping of name -> count to rank.
        big_to_small: Rank descending when true, ascending otherwise.
        limit: Maximum number of rows.

    Returns:
        The table as Markdown followed by an empty line. An empty mapping
        produces only the table header.
    """
    names, values = get_top_x(counts, limit, big_to_small)
    table = f' | No. | {column} | Count {{ .bar-chart }}|\n'
    table += f' |----|--------|-------|\n'
    # zip() stops at the shorter input: splitting the empty name string gives
    # [''] while the value list is empty, so no bogus row is produced.
    # NOTE(review): the link target is the sanitised display name, not a
    # galaxy file name - confirm these links resolve on the built site.
    for position, (name, value) in enumerate(zip(names.split(", "), values), 1):
        table += f' | {position} | [{name}](./{name}/index.md) | {value} |\n'
    table += f'\n'
    return table


def create_statistics():
    """Build the Markdown of the statistics page from the module-level counters.

    Reads ``public_clusters_dict``, ``private_clusters``,
    ``public_relations_count``, ``private_relations_count``,
    ``relation_count_dict``, ``synonyms_count_dict`` and ``empty_uuids_dict``,
    so it is meaningful only after the galaxy pages have been generated.

    Returns:
        The Markdown text of the statistics page.
    """
    statistic_output = ""
    statistic_output += f'# MISP Galaxy statistics\n'
    statistic_output += 'The MISP galaxy statistics are automatically generated based on the MISP galaxy JSON files. Therefore the statistics only include detailed information about public clusters and relations.\n'

    statistic_output += f'# Cluster statistics\n'
    statistic_output += f'## Number of clusters\n'
    statistic_output += f'Here you can find the total number of clusters including public and private clusters. The number of public clusters has been calculated based on the number of unique Clusters in the MISP galaxy JSON files. The number of private clusters could only be approximated based on the number of relations to non-existing clusters. Therefore the number of private clusters is not accurate and only an approximation.\n'
    statistic_output += f'\n'
    statistic_output += f'| No. | Type | Count {{ .pie-chart }}|\n'
    statistic_output += f'|----|------|-------|\n'
    statistic_output += f'| 1 | Public clusters | {len(public_clusters_dict)} |\n'
    statistic_output += f'| 2 | Private clusters | {len(private_clusters)} |\n'
    statistic_output += f'\n'

    # Number of public clusters per galaxy name.
    galaxy_counts = {}
    for galaxy in public_clusters_dict.values():
        galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1

    statistic_output += f'## Galaxies with the most clusters\n'
    statistic_output += _render_top_table('Galaxy', galaxy_counts)

    statistic_output += f'## Galaxies with the least clusters\n'
    statistic_output += _render_top_table('Galaxy', galaxy_counts, big_to_small=False)

    statistic_output += f'# Relation statistics\n'
    statistic_output += f'## Number of relations\n'
    statistic_output += f'| No. | Type | Count {{ .pie-chart }}|\n'
    statistic_output += f'|----|------|-------|\n'
    statistic_output += f'| 1 | Public relations | {public_relations_count} |\n'
    statistic_output += f'| 2 | Private relations | {private_relations_count} |\n'
    statistic_output += f'\n'

    # Guard against a run in which no cluster had any relation.
    if relation_count_dict:
        average_relations = int(sum(relation_count_dict.values()) / len(relation_count_dict))
    else:
        average_relations = 0
    statistic_output += f'**Average number of relations per cluster**: {average_relations}\n'

    statistic_output += f'## Cluster with the most relations\n'
    statistic_output += _render_top_table('Cluster', relation_count_dict)

    statistic_output += f'# Synonyms statistics\n'
    statistic_output += f'## Cluster with the most synonyms\n'
    statistic_output += _render_top_table('Cluster', synonyms_count_dict)

    statistic_output += f'# Empty UUIDs statistics\n'
    statistic_output += f'**Number of empty UUIDs**: {sum(empty_uuids_dict.values())}\n'
    statistic_output += f'\n'
    statistic_output += f'**Empty UUIDs per cluster**: {empty_uuids_dict}\n'

    return statistic_output
def main():
    """Generate the documentation site from the cluster JSON files.

    Loads every ``*.json`` file in ``CLUSTER_PATH`` (except those listed in
    ``FILES_TO_IGNORE``) in sorted order, builds a UUID -> cluster lookup,
    writes one page per galaxy below ``SITE_PATH`` and finally writes the
    index and statistics pages.

    Raises:
        KeyError: If a JSON file lacks ``values``, ``authors``,
            ``description`` or ``name``.
    """
    # endswith() rather than a substring test, so that names such as
    # "x.json.bak" are not picked up.
    galaxies_fnames = sorted(
        f for f in os.listdir(CLUSTER_PATH)
        if f.endswith('.json') and f not in FILES_TO_IGNORE
    )

    galaxies = []
    for galaxy in galaxies_fnames:
        # Read explicitly as UTF-8 rather than the platform default encoding.
        with open(os.path.join(CLUSTER_PATH, galaxy), encoding="utf-8") as fr:
            galaxie_json = json.load(fr)
        galaxies.append(Galaxy(
            galaxie_json['values'],
            galaxie_json['authors'],
            galaxie_json['description'],
            galaxie_json['name'],
            # Strip only the extension so that dotted names stay distinct.
            os.path.splitext(galaxy)[0],
        ))

    # Lookup used to resolve the destination UUIDs of relations.
    cluster_dict = {
        cluster.uuid: cluster
        for galaxy in galaxies
        for cluster in galaxy.clusters
    }

    # Write files
    # SITE_PATH is nested, so its parent may have to be created as well.
    os.makedirs(SITE_PATH, exist_ok=True)

    for galaxy in galaxies:
        galaxy.write_entry(SITE_PATH, cluster_dict)

    # The statistics read the counters filled while the pages were written,
    # so they have to be rendered last.
    index_output = create_index(galaxies)
    statistic_output = create_statistics()

    with open(os.path.join(SITE_PATH, 'index.md'), "w", encoding="utf-8") as index:
        index.write(index_output)
    with open(os.path.join(SITE_PATH, 'statistics.md'), "w", encoding="utf-8") as index:
        index.write(statistic_output)


if __name__ == "__main__":
    main()
2024-01-31 12:52:04 +00:00
# test = cluster_dict['f0ec2df5-2e38-4df3-970d-525352006f2e']
# test = cluster_dict['d7247cf9-13b6-4781-b789-a5f33521633b']
# clusters = test.get_related_clusters()
# print(clusters)
# print(len(clusters))
# print("```mermaid")
# print(f"graph TD")
# for cluster in clusters:
# print(f"{cluster[0].uuid}[{cluster[0].value}] --- {cluster[1].uuid}[{cluster[1].value}]")
# print("```")