# Generator for the MISP "intelligence-agencies" galaxy and cluster,
# scraped from Wikipedia's "List of intelligence agencies" page.
# Mirror of https://github.com/MISP/misp-galaxy.git (synced 2024-11-23).
from modules.api import WikipediaAPI
from modules.intel import IntelAgency, Meta, Galaxy, Cluster

import os
import uuid
import json

from bs4 import BeautifulSoup
# Repository layout, relative to this script's working directory.
CLUSTER_PATH = '../../clusters'
GALAXY_PATH = '../../galaxies'

# Identity of the galaxy/cluster pair this script (re)generates.
GALAXY_NAME = 'intelligence-agencies'
UUID = "3ef969e7-96cd-4048-aa83-191ac457d0db"

# Base used to absolutize the relative hrefs scraped from the page.
WIKIPEDIA_URL = "https://en.wikipedia.org"
|
def get_UUIDs():
    """Load the value -> uuid mapping from the already-published cluster file.

    Reusing previously assigned UUIDs keeps cluster entries stable across
    regenerations of the galaxy.

    Returns:
        dict[str, str] | None: mapping of cluster value (agency name) to its
        UUID, or None when no cluster file exists yet.
    """
    path = os.path.join(CLUSTER_PATH, f"{GALAXY_NAME}.json")
    try:
        # EAFP: opening directly avoids the listdir()+open race of a
        # look-before-you-leap check and also covers a missing CLUSTER_PATH
        # directory instead of raising from os.listdir().
        with open(path, encoding="utf-8") as fr:
            galaxy_json = json.load(fr)
    except FileNotFoundError:
        return None
    return {cluster["value"]: cluster["uuid"] for cluster in galaxy_json["values"]}
|
def get_notes_on_lower_level(content):
    """Recursively collect leaf entries from a (possibly nested) <ul> list.

    Descends into nested <ul> elements until leaf <li> items are reached and
    returns a list of (title, link, description, synonyms) tuples; synonyms is
    always None here.
    """
    collected = []
    for item in content.find_all('li', recursive=False):
        nested = item.find('ul')
        if nested:
            # Not a leaf entry: recurse into the sub-list instead.
            collected.extend(get_notes_on_lower_level(nested))
            continue

        text = item.text
        entry_title = text
        entry_link = None

        anchor = item.find('a')
        if anchor:
            # Prefer the anchor's title attribute; fall back to the raw text.
            entry_title = anchor.get('title', text)
            if anchor.has_attr('href'):
                entry_link = f'{WIKIPEDIA_URL}{anchor["href"]}'

        collected.append((entry_title, entry_link, text, None))
    return collected
|
def get_agencies_from_country(heading, current_country, uuids):
    """Build IntelAgency entries for every agency under a country heading.

    Args:
        heading: the h2 tag that opens the country's section; the first <ul>
            after it is taken as the agency list.
        current_country: country name stored in each entry's meta.
        uuids: optional mapping of agency name -> previously assigned UUID,
            used to keep UUIDs stable across runs; may be None.

    Returns:
        list[IntelAgency]: one entry per leaf list item in the section.
    """
    content = heading.find_next('ul')
    agencies = []
    for name, link, description, _synonyms in get_notes_on_lower_level(content):
        # Reuse the published UUID when one exists; otherwise mint a new one.
        agency_uuid = uuids[name] if uuids and name in uuids else str(uuid.uuid4())
        # Only record a ref when a link was actually found — the original
        # emitted a literal [None] into the meta refs for linkless entries.
        refs = [link] if link else []
        agencies.append(
            IntelAgency(
                value=name,
                uuid=agency_uuid,
                meta=Meta(country=current_country, refs=refs),
                description=description,
            )
        )
    return agencies
|
def extract_info(content, uuids):
    """Parse the page HTML and return all agencies found under country headings.

    Each h2 with an mw-headline span is treated as a country section unless it
    is page boilerplate ("See also", "References", ...).
    """
    skipped_headings = ["See also", "References", "External links", "Further reading"]
    soup = BeautifulSoup(content, 'html.parser')
    agencies = []
    for heading in soup.find_all('h2'):
        headline = heading.find('span', {'class': 'mw-headline'})
        # Guard clause: skip headings without a headline span or boilerplate.
        if not headline or headline.text in skipped_headings:
            continue
        country = headline.text.strip()
        agencies.extend(get_agencies_from_country(heading, country, uuids))
    return agencies
|
|
if __name__ == '__main__':
    wiki = WikipediaAPI()
    page_title = 'List of intelligence agencies'
    content = wiki.get_page_html(page_title)

    # Without page HTML there is nothing to parse: exit now instead of
    # crashing later with a NameError on the undefined `agencies` (and
    # avoid feeding None to BeautifulSoup when uuids is also missing).
    if not content:
        print(f'Error: {content}')
        raise SystemExit(1)

    uuids = get_UUIDs()
    if not uuids:
        # First run (no existing cluster file): every agency gets a new UUID.
        print(f'No UUIDs found for {GALAXY_NAME}')
    agencies = extract_info(content, uuids)

    # Write to files
    galaxy = Galaxy(
        description="List of intelligence agencies",
        icon="ninja",
        name="intelligence-agencies",
        namespace="intelligence-agency",
        type="intelligence-agency",
        uuid=UUID,
        version=1,
    )
    galaxy.save_to_file(os.path.join(GALAXY_PATH, f'{GALAXY_NAME}.json'))

    cluster = Cluster(
        authors="Wikipedia",
        category="Intelligence Agencies",
        description="List of intelligence agencies",
        name="intelligence-agencies",
        source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
        type="intelligence-agency",
        uuid=UUID,
        version=1,
    )
    for agency in agencies:
        cluster.add_value(agency)

    cluster.save_to_file(os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json'))