From 57871ee05dea693b95d0aea1507861ae6ae98f23 Mon Sep 17 00:00:00 2001
From: Christian Studer <christian.studer@circl.lu>
Date: Tue, 24 Jan 2023 22:49:14 +0100
Subject: [PATCH] add: [region] Added script to update the `region` cluster
 based on the UNSD M49 csv file

---
 tools/UN M49/generate_region_cluster.py | 110 ++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 tools/UN M49/generate_region_cluster.py

diff --git a/tools/UN M49/generate_region_cluster.py b/tools/UN M49/generate_region_cluster.py
new file mode 100644
index 0000000..31d1e2a
--- /dev/null
+++ b/tools/UN M49/generate_region_cluster.py	
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import csv
+import json
+from collections import defaultdict
+from pathlib import Path
+from uuid import uuid4
+
+current_path = Path(__file__).resolve().parent
+
+
+def _generate_cluster_values(cluster_values: dict):
+    values = []
+    for _, cluster_value in sorted(cluster_values.items()):
+        values.append(cluster_value)
+    return values
+
+
+def _parse_csv_input(input_file: Path) -> dict:
+    # We create a mapping to associate the regions and their sub-regions
+    regions_mapping = defaultdict(set)
+    with open(input_file, 'rt', encoding='utf-8') as csvfile:
+        csv_reader = csv.reader(csvfile, delimiter=',')
+        next(csv_reader)
+        for row in csv_reader:
+            '''
+            gc: Global Code
+            gn: Global Name
+            rc: Region Code
+            rn: Region Name
+            src: Sub-region Code
+            srn: Sub-region Name
+            irc: Intermediate Region Code
+            irn: Intermediate Region Name
+            mc: M49 Code
+            coa: Country of Area
+            '''
+            gc, gn, rc, rn, src, srn, irc, irn, mc, coa, *_ = row
+
+            global_region = f'{gc} - {gn}'
+            if rc and rn:
+                # Almost all the areas have a region information
+                region = f'{rc} - {rn}'
+                regions_mapping[global_region].add(region)
+                # Deal with the region information
+                sub_region = f'{src} - {srn}'
+                regions_mapping[region].add(sub_region)
+                country = f'{mc} - {coa}'
+                if irc and irn:
+                    # If the country is located in an intermediate region
+                    inter_region = f'{irc} - {irn}'
+                    # Deal with the sub-region information
+                    regions_mapping[sub_region].add(inter_region)
+                    # Deal with the intermediate region information
+                    regions_mapping[inter_region].add(country)
+                else:
+                    # The country is located in a sub-region
+                    regions_mapping[sub_region].add(country)
+            else:
+                # Should be only Antarctica which has only global region and
+                # country information
+                country = f'{mc} - {coa}'
+                regions_mapping[global_region].add(country)
+                regions_mapping[country] = None
+    return regions_mapping
+
+
+def update_cluster(input_file, filename):
+    with open(filename, 'rt', encoding='utf-8') as f:
+        region_cluster = json.load(f)
+    cluster = {value['value']: value for value in region_cluster['values']}
+    regions_mapping = _parse_csv_input(input_file)
+    is_changed = False
+    for region, subregions in regions_mapping.items():
+        if region not in cluster:
+            cluster_value = {
+                'value': region,
+                'uuid': uuid4().__str__(),
+            }
+            if subregions is not None:
+                cluster_value['meta'] = {
+                    'subregions': sorted(subregions)
+                }
+            cluster[region] = cluster_value
+            is_changed = True
+        else:
+            if subregions != cluster[region].get('meta', {}).get('subregions'):
+                if subregions is not None:
+                    cluster[region]['meta'] = {
+                        'subregions': sorted(subregions)
+                    }
+                is_changed = True
+    if is_changed:
+        region_cluster['values'] = _generate_cluster_values(cluster)
+        with open(filename, 'wt', encoding='utf-8') as f:
+            f.write(json.dumps(region_cluster, indent=2))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generates/updates the region galaxy cluster')
+    parser.add_argument(
+        '-i', '--input', type=Path, default=current_path / 'UNSD.csv',
+        help="CSV input file"
+    )
+    args = parser.parse_args()
+
+    filename = current_path.parents[1] / 'clusters' / 'region.json'
+    update_cluster(args.input, filename)