new: [NER] Extract Named Entity Recognition (NER) source data from all MISP galaxies

This can be used for NLP training, and especially to build NER models.
This commit is contained in:
Alexandre Dulaunoy 2024-04-16 14:35:01 +02:00
parent 59e9f48e19
commit ea04301290
Signed by: adulau
GPG key ID: 09E2CD4944E6CBCD

41
tools/NER/extract.py Normal file
View file

@ -0,0 +1,41 @@
import os
import json
import argparse
# Locate the data directories relative to this script rather than the current
# working directory, so the script can be launched from anywhere.
thisDir = os.path.dirname(__file__)
clusters = []
pathClusters = os.path.join(thisDir, '../../clusters')
pathGalaxies = os.path.join(thisDir, '../../galaxies')
# Galaxy files that are never added to the list of clusters to export.
skip_list = ["cancer.json", "handicap.json", "ammunitions.json", "firearms.json"]

# Collect the file name of every galaxy definition that is neither skipped
# nor declared in the 'deprecated' namespace.
for f in os.listdir(pathGalaxies):
    # Match on the real extension (a substring test would also accept names
    # such as 'x.json.bak') and drop skipped files before opening them.
    if not f.endswith('.json') or f in skip_list:
        continue
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and can fail on non-ASCII content.
    with open(os.path.join(pathGalaxies, f), 'r', encoding='utf-8') as f_in:
        galaxy_data = json.load(f_in)
    if galaxy_data.get('namespace') != 'deprecated':
        clusters.append(f)

# Sort so the export order is stable regardless of directory listing order.
clusters.sort()
# Print one comma-separated line per cluster file: the upper-cased file stem
# first, then each value followed by its synonyms.
# NOTE(review): fields are joined verbatim, so a value or synonym that itself
# contains a comma is not escaped -- confirm consumers tolerate this.
for cluster in clusters:
    fullPathClusters = os.path.join(pathClusters, cluster)
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and can fail on non-ASCII content.
    with open(fullPathClusters, encoding='utf-8') as fp:
        c = json.load(fp)
    # The label is the part of the file name before the first dot.
    cluster_name = cluster.split(".")[0].upper()
    fields = [cluster_name]
    for v in c['values']:
        # Entries without a 'uuid' key are left out of the export.
        if 'uuid' not in v:
            continue
        fields.append(str(v['value']))
        # A missing 'meta' or 'synonyms' key simply contributes no synonyms.
        synonyms = v.get('meta', {}).get('synonyms', [])
        fields.extend(str(synonym) for synonym in synonyms)
    # Join once at the end instead of growing a string with repeated '+='.
    print(",".join(fields))