mirror of
https://github.com/MISP/misp-galaxy.git
synced 2024-11-25 16:27:19 +00:00
new: [NER] Extract Named Entity Recognition (NER) source data from all MISP galaxies
This can be used for NLP training, and especially to build NER models
This commit is contained in:
parent
59e9f48e19
commit
ea04301290
1 changed files with 41 additions and 0 deletions
41
tools/NER/extract.py
Normal file
41
tools/NER/extract.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
import os
|
||||
import json
|
||||
import argparse
|
||||
|
||||
# Directory containing this script; the repository data directories are
# resolved relative to it.
thisDir = os.path.dirname(__file__)

pathClusters = os.path.join(thisDir, '../../clusters')
pathGalaxies = os.path.join(thisDir, '../../galaxies')

# Galaxy file names that are excluded from the export.
skip_list = ["cancer.json", "handicap.json", "ammunitions.json", "firearms.json"]


def list_active_clusters(galaxies_dir=None, skipped=None):
    """Return the sorted galaxy file names whose clusters should be exported.

    A file is selected when its name ends with ``.json``, it is not in the
    skip list, and the galaxy's ``namespace`` is not ``'deprecated'``.

    Args:
        galaxies_dir: Directory holding the galaxy JSON files. Defaults to
            ``pathGalaxies``.
        skipped: Collection of file names to ignore. Defaults to
            ``skip_list``.

    Returns:
        A sorted list of file names (not full paths).

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a selected ``.json`` file is not valid JSON.
    """
    if galaxies_dir is None:
        galaxies_dir = pathGalaxies
    if skipped is None:
        skipped = skip_list

    selected = []
    for file_name in os.listdir(galaxies_dir):
        # Match on the suffix, not a substring, so stray files such as
        # "x.json.bak" are not fed to the JSON parser. Skip-listed files are
        # rejected before parsing since their content is never used.
        if not file_name.endswith('.json') or file_name in skipped:
            continue
        # JSON is UTF-8; do not rely on the platform's default encoding.
        with open(os.path.join(galaxies_dir, file_name), 'r', encoding='utf-8') as f_in:
            galaxy_data = json.load(f_in)
        if galaxy_data.get('namespace') != 'deprecated':
            selected.append(file_name)
    selected.sort()
    return selected


def build_ner_line(cluster_file, cluster_data):
    """Build one comma-separated output line for a cluster.

    The first field is the upper-cased part of ``cluster_file`` before its
    first dot. It is followed by the ``value`` of every entry that has a
    ``uuid``, each immediately followed by that entry's ``meta.synonyms``
    (when present).

    Args:
        cluster_file: File name of the cluster, e.g. ``"threat-actor.json"``.
        cluster_data: Parsed cluster JSON; must contain a ``values`` list.

    Returns:
        The assembled line, without a trailing newline.

    Raises:
        KeyError: If ``cluster_data`` has no ``values`` key, or an entry with
            a ``uuid`` has no ``value``.
    """
    fields = [cluster_file.split(".")[0].upper()]
    for entry in cluster_data['values']:
        if 'uuid' not in entry:
            continue
        fields.append(str(entry['value']))
        # A missing or null "meta"/"synonyms" simply means no synonyms.
        synonyms = (entry.get('meta') or {}).get('synonyms') or []
        fields.extend(str(synonym) for synonym in synonyms)
    return ",".join(fields)


def main():
    """Print one NER source line per active cluster to standard output."""
    for cluster_file in list_active_clusters():
        with open(os.path.join(pathClusters, cluster_file), encoding='utf-8') as fp:
            cluster_data = json.load(fp)
        print(build_ner_line(cluster_file, cluster_data))


if __name__ == '__main__':
    main()
|
||||
|
Loading…
Reference in a new issue