Improve [statistics] graphs for statistics

This commit is contained in:
niclas 2024-02-05 15:59:09 +01:00
parent 710837770f
commit 71d90c2c77
7 changed files with 243 additions and 53 deletions

View file

@ -1,4 +0,0 @@
validators
mkdocs-git-committers-plugin
mkdocs-rss-plugin

View file

@ -266,7 +266,10 @@ class Cluster():
for relation in relations:
cluster_a_section = relation[0].value.lower().replace(" ", "+").replace("/", "").replace(":", "")
cluster_b_section = relation[1].value.lower().replace(" ", "+").replace("/", "").replace(":", "")
output += f'| [{relation[0].value} ({relation[0].uuid})](../../{relation[0].galaxie_file_name}/index.md#{cluster_a_section}) | [{relation[1].value} ({relation[1].uuid})](../../{relation[1].galaxie_file_name}/index.md#{cluster_b_section}) | {relation[2]} |\n'
if cluster_b_section != "private+cluster":
output += f'| [{relation[0].value} ({relation[0].uuid})](../../{relation[0].galaxie_file_name}/index.md#{cluster_a_section}) | [{relation[1].value} ({relation[1].uuid})](../../{relation[1].galaxie_file_name}/index.md#{cluster_b_section}) | {relation[2]} |\n'
else:
output += f'| [{relation[0].value} ({relation[0].uuid})](../../{relation[0].galaxie_file_name}/index.md#{cluster_a_section}) | {relation[1].value} ({relation[1].uuid}) | {relation[2]} |\n'
return output
def create_entry(self, cluster_dict):
@ -348,81 +351,78 @@ def get_top_x(dict, x, big_to_small=True):
def create_statistics():
statistic_output = ""
statistic_output += f'# MISP Galaxy statistics\n'
statistic_output +='The MISP galaxy statistics are automatically generated based on the MISP galaxy JSON files. Therefore the statistics only include detailed infomration about public clusters and relations.\n'
statistic_output += f'# Cluster statistics\n'
statistic_output += f'## Number of clusters\n'
statistic_output += create_pie_chart("Number of clusters", [("Public clusters", len(public_clusters_dict)), ("Private clusters", len(private_clusters))])
statistic_output += f'Here you can find the total number of clusters including public and private clusters. The number of public clusters has been calculated based on the number of unique Clusters in the MISP galaxy JSON files. The number of private clusters could only be approximated based on the number of relations to non-existing clusters. Therefore the number of private clusters is not accurate and only an approximation.\n'
statistic_output += f'\n'
statistic_output += f'| No. | Type | Count {{ .pie-chart }}|\n'
statistic_output += f'|----|------|-------|\n'
statistic_output += f'| 1 | Public clusters | {len(public_clusters_dict)} |\n'
statistic_output += f'| 2 | Private clusters | {len(private_clusters)} |\n'
statistic_output += f'\n'
statistic_output += f'## Galaxies with the most clusters\n'
galaxy_counts = {}
for galaxy in public_clusters_dict.values():
galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1
top_galaxies, top_galaxies_values = get_top_x(galaxy_counts, 25)
statistic_output += create_xy_chart("Galaxies with the most clusters", 3000, 1000, top_galaxies, "Number of clusters", top_galaxies_values)
top_galaxies, top_galaxies_values = get_top_x(galaxy_counts, 20)
# statistic_output += create_xy_chart("Galaxies with the most clusters", 3000, 1000, top_galaxies, "Number of clusters", top_galaxies_values)
statistic_output += f' | No. | Galaxy | Count {{ .bar-chart }}|\n'
statistic_output += f' |----|--------|-------|\n'
for i, galaxy in enumerate(top_galaxies.split(", "), 1):
statistic_output += f'{i}. [{galaxy}](./{galaxy}/index.md)\n'
# statistic_output += f'{i}. [{galaxy}](./{galaxy}/index.md)\n'
statistic_output += f' | {i} | [{galaxy}](./{galaxy}/index.md) | {top_galaxies_values[i-1]} |\n'
statistic_output += f'\n'
statistic_output += f'## Galaxies with the least clusters\n'
flop_galaxies, flop_galaxies_values = get_top_x(galaxy_counts, 25, False)
statistic_output += create_xy_chart("Galaxies with the least clusters", 3000, 1000, flop_galaxies, "Number of clusters", flop_galaxies_values)
flop_galaxies, flop_galaxies_values = get_top_x(galaxy_counts, 20, False)
# statistic_output += create_xy_chart("Galaxies with the least clusters", 3000, 1000, flop_galaxies, "Number of clusters", flop_galaxies_values)
statistic_output += f' | No. | Galaxy | Count {{ .bar-chart }}|\n'
statistic_output += f' |----|--------|-------|\n'
for i, galaxy in enumerate(flop_galaxies.split(", "), 1):
statistic_output += f'{i}. [{galaxy}](./{galaxy}/index.md)\n'
# statistic_output += f'{i}. [{galaxy}](./{galaxy}/index.md)\n'
statistic_output += f' | {i} | [{galaxy}](./{galaxy}/index.md) | {flop_galaxies_values[i-1]} |\n'
statistic_output += f'\n'
# galaxy_number = 0
# global galaxies
# for galaxy in galaxies:
# galaxy_number += 1
# statistic_output += f'**Average number of clusters per galaxy**: {len(public_clusters_dict) / galaxy_number}\n'
statistic_output += f'# Relation statistics\n'
statistic_output += f'## Number of relations\n'
statistic_output += create_pie_chart("Number of relations", [("Public relations", public_relations_count), ("Private relations", private_relations_count)])
statistic_output += f'| No. | Type | Count {{ .pie-chart }}|\n'
statistic_output += f'|----|------|-------|\n'
statistic_output += f'| 1 | Public relations | {public_relations_count} |\n'
statistic_output += f'| 2 | Private relations | {private_relations_count} |\n'
statistic_output += f'\n'
statistic_output += f'**Average number of relations per cluster**: {sum(relation_count_dict.values()) / len(relation_count_dict)}\n'
statistic_output += f'**Average number of relations per cluster**: {int(sum(relation_count_dict.values()) / len(relation_count_dict))}\n'
statistic_output += f'## Cluster with the most relations\n'
top_25_relation, top_25_relation_values = get_top_x(relation_count_dict, 25)
statistic_output += create_xy_chart("Cluster with the most relations", 3000, 1000, top_25_relation, "Number of relations", top_25_relation_values)
statistic_output += f'## Cluster with the least relations\n'
top_25_relation, top_25_relation_values = get_top_x(relation_count_dict, 25, False)
statistic_output += create_xy_chart("Cluster with the least relations", 3000, 1000, top_25_relation, "Number of relations", top_25_relation_values)
top_25_relation, top_25_relation_values = get_top_x(relation_count_dict, 20)
# statistic_output += create_xy_chart("Cluster with the most relations", 3000, 1000, top_25_relation, "Number of relations", top_25_relation_values)
statistic_output += f' | No. | Cluster | Count {{ .bar-chart }}|\n'
statistic_output += f' |----|--------|-------|\n'
for i, cluster in enumerate(top_25_relation.split(", "), 1):
# statistic_output += f'{i}. [{cluster}](./{cluster}/index.md)\n'
statistic_output += f' | {i} | [{cluster}](./{cluster}/index.md) | {top_25_relation_values[i-1]} |\n'
statistic_output += f'\n'
statistic_output += f'# Synonyms statistics\n'
statistic_output += f'## Cluster with the most synonyms\n'
top_25_synonyms, top_25_synonyms_values = get_top_x(synonyms_count_dict, 25)
statistic_output += create_xy_chart("Cluster with the most synonyms", 3000, 1000, top_25_synonyms, "Number of synonyms", top_25_synonyms_values)
statistic_output += f'## Cluster with the least synonyms\n'
top_25_synonyms, top_25_synonyms_values = get_top_x(synonyms_count_dict, 25, False)
statistic_output += create_xy_chart("Cluster with the least synonyms", 3000, 1000, top_25_synonyms, "Number of synonyms", top_25_synonyms_values)
top_synonyms, top_synonyms_values = get_top_x(synonyms_count_dict, 20)
# statistic_output += create_xy_chart("Cluster with the most synonyms", 3000, 1000, top_synonyms, "Number of synonyms", top_synonyms_values)
statistic_output += f' | No. | Cluster | Count {{ .bar-chart }}|\n'
statistic_output += f' |----|--------|-------|\n'
for i, cluster in enumerate(top_synonyms.split(", "), 1):
# statistic_output += f'{i}. [{cluster}](./{cluster}/index.md)\n'
statistic_output += f' | {i} | [{cluster}](./{cluster}/index.md) | {top_synonyms_values[i-1]} |\n'
statistic_output += f'\n'
statistic_output += f'# Empty UUIDs statistics\n'
statistic_output += f'**Number of empty UUIDs**: {sum(empty_uuids_dict.values())}\n'
statistic_output += f'\n'
statistic_output += f'**Empty UUIDs per cluster**: {empty_uuids_dict}\n'
print(f"Public relations: {public_relations_count}")
print(f"Private relations: {private_relations_count}")
print(f"Total relations: {public_relations_count + private_relations_count}")
print(f"Percetage of private relations: {private_relations_count / (public_relations_count + private_relations_count) * 100}%")
print(f"Private clusters: {len(private_clusters)}")
print(f"Public clusters: {len(public_clusters_dict)}")
print(f"Total clusters: {len(private_clusters) + len(public_clusters_dict)}")
print(f"Percentage of private clusters: {len(private_clusters) / (len(private_clusters) + len(public_clusters_dict)) * 100}%")
print(f"Average number of relations per cluster: {sum(relation_count_dict.values()) / len(relation_count_dict)}")
print(f"Max number of relations per cluster: {max(relation_count_dict.values())} from {max(relation_count_dict, key=relation_count_dict.get)}")
print(f"Min number of relations per cluster: {min(relation_count_dict.values())} from {min(relation_count_dict, key=relation_count_dict.get)}")
print(f"Average number of synonyms per cluster: {sum(synonyms_count_dict.values()) / len(synonyms_count_dict)}")
print(f"Max number of synonyms per cluster: {max(synonyms_count_dict.values())} from {max(synonyms_count_dict, key=synonyms_count_dict.get)}")
print(f"Min number of synonyms per cluster: {min(synonyms_count_dict.values())} from {min(synonyms_count_dict, key=synonyms_count_dict.get)}")
print(f"Number of empty UUIDs: {sum(empty_uuids_dict.values())}")
print(f"Empty UUIDs per cluster: {empty_uuids_dict}")
print(sorted(relation_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:30])
return statistic_output
def main():

View file

@ -0,0 +1,48 @@
Babel==2.14.0
bracex==2.4
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography==42.0.1
Deprecated==1.2.14
ghp-import==2.1.0
gitdb==4.0.11
GitPython==3.1.41
graphviz==0.20.1
idna==3.6
Jinja2==3.1.3
Markdown==3.5.2
MarkupSafe==2.1.4
mergedeep==1.3.4
mkdocs==1.5.3
mkdocs-awesome-pages-plugin==2.9.2
mkdocs-git-committers-plugin==0.2.3
mkdocs-material==9.5.6
mkdocs-material-extensions==1.3.1
mkdocs-rss-plugin==1.12.0
natsort==8.4.0
packaging==23.2
paginate==0.5.6
pathspec==0.12.1
platformdirs==4.1.0
pycparser==2.21
PyGithub==2.2.0
Pygments==2.17.2
PyJWT==2.8.0
pymdown-extensions==10.7
PyNaCl==1.5.0
python-dateutil==2.8.2
PyYAML==6.0.1
pyyaml_env_tag==0.1
regex==2023.12.25
requests==2.31.0
six==1.16.0
smmap==5.0.1
typing_extensions==4.9.0
urllib3==2.1.0
validators==0.22.0
watchdog==3.0.0
wcmatch==8.5
wrapt==1.16.0

View file

@ -0,0 +1,145 @@
document$.subscribe(function () {
/**
 * Extract chart data from a rendered statistics table.
 *
 * The first row is treated as the header and skipped. For every other row the
 * second `td` supplies the label and the third `td` supplies the numeric value.
 * Rows that cannot be charted are ignored instead of aborting the whole page
 * script: rows with fewer than three `td` cells, and rows whose third cell
 * does not convert to a finite number.
 *
 * @param {Element} table - Table element to read.
 * @returns {{name: string, value: number}[]} One entry per usable data row,
 *     in document order.
 */
function parseTable(table) {
    const data = [];
    table.querySelectorAll("tr").forEach((row, i) => {
        // Row 0 is the header row.
        if (i === 0) {
            return;
        }
        const cells = row.querySelectorAll("td");
        // Without this guard cells[1] / cells[2] would be undefined and the
        // property access below would throw.
        if (cells.length < 3) {
            return;
        }
        const value = Number(cells[2].textContent);
        // NaN or infinite values cannot be positioned on a pie or a linear scale.
        if (!Number.isFinite(value)) {
            return;
        }
        // Trim so surrounding whitespace from the markup does not end up in labels.
        data.push({ name: cells[1].textContent.trim(), value: value });
    });
    return data;
}
/**
 * Draw a labelled pie chart into the element matched by `elementId`.
 *
 * A 500x500 SVG is appended to the target element. Each entry of `data`
 * becomes one slice sized by its `value`, filled from the d3 spectral colour
 * ramp and labelled with its `name` at the slice centroid.
 *
 * @param {{name: string, value: number}[]} data - Slices to draw.
 * @param {string} elementId - Selector passed to `d3.select` (e.g. "#graph0").
 */
function createPieChart(data, elementId) {
    // Dimensions of the SVG; the pie keeps a 20px gap to the border.
    const width = 500;
    const height = 500;
    const radius = Math.min(width, height) / 2 - 20;

    // Append the SVG for the graph.
    const svg = d3.select(elementId).append("svg")
        .attr("width", width)
        .attr("height", height);

    // Centre a group in the SVG so the arcs can be drawn around (0, 0).
    const g = svg.append("g")
        .attr("transform", "translate(" + width / 2 + "," + height / 2 + ")");

    // Sample the middle 80% of the spectral ramp.
    const sampleSpectral = t => d3.interpolateSpectral(t * 0.8 + 0.1);
    // d3.quantize samples at i / (n - 1); for a single slice that is 0 / 0,
    // so the sampler would receive NaN. Pick the ramp midpoint instead.
    const palette = data.length === 1
        ? [sampleSpectral(0.5)]
        : d3.quantize(sampleSpectral, data.length).reverse();
    const color = d3.scaleOrdinal()
        .domain(data.map(d => d.name))
        .range(palette);

    // Compute the angular extent of each slice.
    const pie = d3.pie()
        .value(d => d.value);
    const slices = pie(data);

    // One arc generator shared by the slice outlines and the label placement.
    const arc = d3.arc()
        .innerRadius(0)
        .outerRadius(radius);

    // Build the slices. The group is empty here, so the selection only
    // yields an enter set.
    g.selectAll("path")
        .data(slices)
        .enter()
        .append("path")
        .attr("d", arc)
        .attr("fill", d => color(d.data.name))
        .attr("stroke", "black")
        .style("stroke-width", "2px")
        .style("opacity", 0.7);

    // Add one label per slice. No text elements exist in the group yet, so
    // this is again a pure enter selection.
    g.selectAll("text")
        .data(slices)
        .enter()
        .append("text")
        .text(d => d.data.name)
        .attr("transform", d => "translate(" + arc.centroid(d) + ")")
        .style("text-anchor", "middle")
        // A CSS length needs a unit; a bare number is not a valid font-size.
        .style("font-size", "17px");
}
/**
 * Render a vertical bar chart inside the element matched by `elementId`.
 *
 * A 1000x1500 SVG is appended to the target. Each entry of `data` becomes one
 * bar on a band scale keyed by `name`, with its height taken from `value`.
 * The category labels under the x-axis are rotated by -65 degrees.
 *
 * @param {{name: string, value: number}[]} data - Bars to draw.
 * @param {string} elementId - Selector passed to `d3.select` (e.g. "#graph3").
 */
function createBarChart(data, elementId) {
    // Outer SVG size and the margins around the plotting area; the large
    // bottom margin leaves room for the rotated x-axis labels.
    const outerWidth = 1000;
    const outerHeight = 1500;
    const margin = { top: 20, right: 200, bottom: 400, left: 60 };
    const plotWidth = outerWidth - margin.left - margin.right;
    const plotHeight = outerHeight - margin.top - margin.bottom;

    // SVG root plus a group shifted by the top-left margin.
    const plot = d3.select(elementId)
        .append("svg")
        .attr("width", outerWidth)
        .attr("height", outerHeight)
        .append("g")
        .attr("transform", `translate(${margin.left},${margin.top})`);

    // Horizontal band scale: one band per entry name.
    const bandScale = d3.scaleBand()
        .range([0, plotWidth])
        .padding(0.1)
        .domain(data.map((entry) => entry.name));

    // Vertical linear scale with 10% headroom above the tallest bar.
    const peak = d3.max(data, (entry) => entry.value);
    const valueScale = d3.scaleLinear()
        .range([plotHeight, 0])
        .domain([0, peak + peak * 0.1]);

    // Bar colours are taken from the category10 palette.
    const palette = d3.scaleOrdinal()
        .range(d3.schemeCategory10);

    // Axis generators.
    const bottomAxis = d3.axisBottom(bandScale)
        .tickSize(0)
        .tickPadding(6);
    const leftAxis = d3.axisLeft(valueScale);

    // Bars.
    plot.selectAll(".bar")
        .data(data)
        .enter()
        .append("rect")
        .attr("class", "bar")
        .attr("x", (entry) => bandScale(entry.name))
        .attr("y", (entry) => valueScale(entry.value))
        .attr("width", bandScale.bandwidth())
        .attr("height", (entry) => plotHeight - valueScale(entry.value))
        .attr("fill", (entry) => palette(entry.name));

    // X-axis at the bottom of the plot, with rotated labels.
    plot.append("g")
        .attr("transform", `translate(0,${plotHeight})`)
        .call(bottomAxis)
        .selectAll("text")
        .style("text-anchor", "end")
        .attr("dx", "-.8em")
        .attr("dy", ".15em")
        .attr("transform", "rotate(-65)");

    // Y-axis on the left edge of the plot.
    plot.append("g")
        .call(leftAxis);
}
// Render a chart above every table whose header cell carries a chart marker
// class (`th.pie-chart` or `th.bar-chart`).
document.querySelectorAll("table").forEach((table, index) => {
    const wantsPieChart = table.querySelector("th.pie-chart") !== null;
    const wantsBarChart = table.querySelector("th.bar-chart") !== null;
    // Tables without a marker get no chart, so no container is inserted for them.
    if (!wantsPieChart && !wantsBarChart) {
        return;
    }

    // Declared locally: a bare assignment would create an implicit global and
    // throws a ReferenceError when the script runs in strict mode.
    const graphId = "graph" + index;
    const container = document.createElement("div");
    container.id = graphId;
    // Place the chart container directly in front of its source table.
    table.parentNode.insertBefore(container, table);

    // Parse once; both chart types consume the same rows.
    const data = parseTable(table);
    if (wantsPieChart) {
        createPieChart(data, "#" + graphId);
    }
    if (wantsBarChart) {
        createBarChart(data, "#" + graphId);
    }
});
});

View file

@ -1,5 +1,5 @@
{
"name": "mkdocs",
"name": "docs",
"lockfileVersion": 2,
"requires": true,
"packages": {

View file

@ -69,7 +69,8 @@ extra_javascript:
# - javascripts/tablefilter.js
# - "https://unpkg.com/tablefilter@0.7.3/dist/tablefilter/tablefilter.js"
# - "https://d3js.org/d3.v6.min.js"
- 01_attachements/javascripts/d3.js
- 01_attachements/javascripts/graph.js
- 01_attachements/javascripts/statistics.js
- node_modules/tablefilter/dist/tablefilter/tablefilter.js
- node_modules/d3/dist/d3.min.js