misp-galaxy/tools/chk_dup.py
2019-08-30 10:08:16 +02:00

66 lines
1.9 KiB
Python
Executable file

#!/usr/bin/env python3
# coding=utf-8
"""
Tools to find duplicates in galaxies
"""
import json
import os
import collections
def loadjsons(path, return_paths=False):
    """
    Find all JSON files directly inside a directory and parse them.

    Only regular files whose name ends with '.json' are loaded; the scan
    is not recursive. Results follow the order of os.listdir().

    Parameters:
        path: string, directory to scan
        return_paths: boolean, if the path of each file should be returned
                      alongside its content, default: False
    Returns:
        List of parsed file contents.
        If return_paths is True, then every list item is a tuple of the
        file path (path joined with the file name) and the file content.
    """
    data = []
    for name in os.listdir(path):
        filepath = os.path.join(path, name)
        if not (name.endswith('.json') and os.path.isfile(filepath)):
            continue
        # Context manager closes the handle; JSON is read as UTF-8 instead
        # of the platform default encoding.
        with open(filepath, encoding='utf-8') as handle:
            content = json.load(handle)
        data.append((filepath, content) if return_paths else content)
    return data
def find_duplicates(galaxies):
    """
    Collect every cluster name or synonym that occurs more than once.

    Names are compared after stripping surrounding whitespace and
    lower-casing.

    Parameters:
        galaxies: iterable of parsed galaxy dicts. Each one is read through
                  its 'name' and 'values' keys; every entry of 'values'
                  through 'value' and the optional 'meta' -> 'synonyms'.
    Returns:
        Dict mapping each duplicated normalised name to the list of
        [normalised name, galaxy name] pairs where it was seen. Keys and
        pairs are kept in first-seen order.
    Raises:
        AttributeError: if an entry has no string 'value'.
    """
    # Grouping occurrences per name avoids re-scanning every occurrence
    # for each duplicated name.
    occurrences = collections.defaultdict(list)
    for galaxy in galaxies:
        galaxy_name = galaxy.get('name')
        for entry in galaxy.get('values') or []:
            name = entry.get('value').strip().lower()
            occurrences[name].append([name, galaxy_name])
            # 'meta' and 'synonyms' are optional; check their shape
            # explicitly instead of swallowing exceptions.
            meta = entry.get('meta')
            synonyms = meta.get('synonyms') if isinstance(meta, dict) else None
            if not isinstance(synonyms, list):
                continue
            for synonym in synonyms:
                if not isinstance(synonym, str):
                    continue
                name = synonym.strip().lower()
                occurrences[name].append([name, galaxy_name])
    return {name: hits for name, hits in occurrences.items() if len(hits) > 1}


def main():
    """
    Iterate all names + synonyms of the galaxies in ../clusters and
    tell what is duplicated.
    """
    # Relative path: resolved against the current working directory.
    jsons = loadjsons("../clusters")
    for key, items in find_duplicates(jsons).items():
        print("Warning duplicate %s" % key)
        for item in items:
            print(item)


if __name__ == '__main__':
    main()