misp-galaxy/tools/IntelAgencies/main.py

175 lines
6.2 KiB
Python
Raw Normal View History

2024-03-11 15:29:36 +00:00
from modules.api import WikipediaAPI
from modules.intel import IntelAgency, Meta, Galaxy, Cluster
import os
import uuid
import json
from bs4 import BeautifulSoup
2024-03-12 12:55:00 +00:00
import pycountry
2024-03-11 15:29:36 +00:00
# Output directories and file stem for the generated galaxy/cluster files.
# The paths are relative; presumably the script is meant to be run from
# tools/IntelAgencies so that they resolve to the repository's top-level
# clusters/ and galaxies/ directories -- confirm against save_to_file().
CLUSTER_PATH = '../../clusters'
GALAXY_PATH = '../../galaxies'
GALAXY_NAME = 'intelligence-agencies'

# Fixed UUID shared by the galaxy and the cluster; it is also the default
# namespace from which compute_uuid() derives the per-agency UUIDs.
UUID = "3ef969e7-96cd-4048-aa83-191ac457d0db"

# Base URL used to turn hrefs scraped from the page into absolute links.
WIKIPEDIA_URL = "https://en.wikipedia.org"
2024-03-11 15:29:36 +00:00
2024-03-12 12:55:00 +00:00
# Fallback mapping from a section heading on the Wikipedia page to an ISO
# 3166-1 alpha-2 code. It is consulted only when pycountry has no entry whose
# name matches the heading exactly. A value of None marks a heading that is
# not a country at all.
COUNTRY_CODES = {
    "Brunei": "BN",
    "People's Republic of China": "CN",
    # The Democratic Republic of the Congo, not the Republic of the Congo (CG).
    "Democratic Republic of the Congo": "CD",
    "Czech Republic": "CZ",
    "Iran": "IR",
    # Officially the Republic of Moldova.
    "Moldova": "MD",
    # Officially the Democratic People's Republic of Korea (DPRK).
    "North Korea": "KP",
    "Palestine": "PS",
    # Officially the Russian Federation.
    "Russia": "RU",
    # Officially the Republic of Korea (ROK).
    "South Korea": "KR",
    # Officially the Syrian Arab Republic.
    "Syria": "SY",
    # The ISO entry is named "Taiwan, Province of China".
    "Taiwan": "TW",
    # Officially the United Republic of Tanzania.
    "Tanzania": "TZ",
    "Trinidad & Tobago": "TT",
    "Turkey": "TR",
    # Officially the Bolivarian Republic of Venezuela.
    "Venezuela": "VE",
    # Officially the Socialist Republic of Vietnam.
    "Vietnam": "VN",
    # Supranational organisations: not countries, so no ISO code.
    "European Union": None,
    "Shanghai Cooperation Organisation": None,
}
2024-03-13 08:35:00 +00:00
def compute_uuid(value, namespace=UUID):
    """Return the name-based (version 5) UUID of *value* as a string.

    Args:
        value: Name to derive the UUID from (the agency name at the call
            site in this file).
        namespace: String form of the namespace UUID; defaults to the
            module-level ``UUID``.

    Returns:
        str: ``str()`` of the ``uuid.uuid5`` result. The same
        ``(namespace, value)`` pair always yields the same string.
    """
    namespace_uuid = uuid.UUID(namespace)
    derived = uuid.uuid5(namespace_uuid, value)
    return str(derived)
2024-03-11 15:29:36 +00:00
def get_notes_on_lower_level(content):
    """Flatten a (possibly nested) ``<ul>`` into one tuple per leaf ``<li>``.

    An ``<li>`` that contains a nested ``<ul>`` is not reported itself; only
    the items of its first nested list are, recursively.

    Args:
        content: A list element exposing the BeautifulSoup tag API
            (``find_all``); its direct ``<li>`` children are inspected.

    Returns:
        list[tuple]: ``(title, link_href, description, synonyms)`` per leaf:

        * ``title`` -- the ``title`` attribute of the item's first ``<a>``,
          falling back to the item's text.
        * ``link_href`` -- absolute URL of that ``<a>``, or ``None`` when the
          item has no link or the link has no ``href``.
        * ``description`` -- the full text of the item.
        * ``synonyms`` -- texts of the item's ``<i>`` elements, or ``None``
          when there are none or the first one merely repeats the title.
    """
    # Imported locally so the file's top-level import block stays untouched.
    from urllib.parse import urljoin

    # These are not intelligence agencies but Iran's entry is broken
    skipped_items = (
        "Islamic Republic of Iran Army:",
        "Islamic Revolutionary Guard Corps:",
        "FARAJA",
        "Judicial system of the Islamic Republic of Iran",
        "Intelligence [12]",
        "Intelligence org",
    )

    notes = []
    for li in content.find_all('li', recursive=False):
        nested_list = li.find('ul')
        if nested_list:
            # Grouping item: descend instead of recording the parent.
            notes.extend(get_notes_on_lower_level(nested_list))
            continue

        description = li.text
        if description in skipped_items:
            continue

        title = description
        link_href = None
        a_tag = li.find('a')
        if a_tag:
            title = a_tag.get('title', description)
            if a_tag.has_attr('href'):
                # urljoin keeps absolute and protocol-relative hrefs intact;
                # plain concatenation would glue the Wikipedia host in front
                # of an external link. Site-relative hrefs ("/wiki/...")
                # resolve exactly as before.
                link_href = urljoin(WIKIPEDIA_URL, a_tag['href'])

        synonyms = [i_tag.text for i_tag in li.find_all('i')]
        if not synonyms or synonyms[0] == title:
            synonyms = None

        notes.append((title, link_href, description, synonyms))

    return notes
2024-03-13 08:35:00 +00:00
def get_agencies_from_country(heading, current_country):
    """Build the IntelAgency entries listed under one country heading.

    Starting at the first ``<ul>`` after *heading*, every following sibling
    ``<ul>`` is collected until the next ``<h2>`` (or the end of the sibling
    chain). Each collected list is flattened with
    ``get_notes_on_lower_level`` and every item becomes one ``IntelAgency``.

    Args:
        heading: The ``<h2>`` element of the section (BeautifulSoup tag API).
        current_country: Text of the heading, used for the country lookup and
            to disambiguate duplicate agency names.

    Returns:
        list: ``IntelAgency`` objects in document order; empty when no list
        (or, for Gambia, no linked paragraph) follows the heading.
    """
    if current_country != "Gambia":
        first_list = heading.find_next('ul')
    else:
        # Gambia has a mistake on the wikipedia page: the code takes the first
        # link of the next paragraph and wraps it in a synthetic <ul><li> so
        # it can be processed like every other section.
        paragraph = heading.find_next('p')
        a_tag = paragraph.find('a') if paragraph is not None else None
        if a_tag is None:
            first_list = None
        else:
            soup = BeautifulSoup(str(heading), 'html.parser')
            first_list = soup.new_tag('ul')
            li_tag = soup.new_tag('li')
            li_tag.append(a_tag)
            first_list.append(li_tag)

    if first_list is None:
        # Nothing to scrape; previously this crashed with AttributeError.
        return []

    # Gather the first list plus every sibling list up to the next section.
    # NOTE: an earlier "Bahamas" special case placed after the h2 check was
    # unreachable (the h2 check always broke out first) and has been dropped;
    # behaviour is unchanged.
    contents = [first_list]
    current_content = first_list
    while True:
        next_sibling = current_content.find_next_sibling()
        if next_sibling is None or next_sibling.name == 'h2':
            break
        if next_sibling.name == 'ul':
            contents.append(next_sibling)
        current_content = next_sibling

    # The country lookup depends only on the heading, so resolve it once.
    country = pycountry.countries.get(name=current_country)
    if country:
        country_code = country.alpha_2
    else:
        country_code = COUNTRY_CODES.get(current_country)

    if current_country in ("European Union", "Shanghai Cooperation Organisation"):
        country_name = None  # Not a country
    else:
        country_name = current_country

    agencies = []
    for content in contents:
        for name, link, description, synonyms in get_notes_on_lower_level(content):
            # Set names for duplicates
            if name in ('Special Branch', 'Financial Intelligence Unit'):
                name = f'{name} ({current_country})'

            # NOTE(review): `link` is None for items without a link, which
            # yields refs=[None]; confirm how Meta serialises that.
            agencies.append(
                IntelAgency(
                    value=name,
                    uuid=compute_uuid(name),
                    meta=Meta(
                        country=country_code,
                        country_name=country_name,
                        refs=[link],
                        synonyms=synonyms,
                    ),
                    description=description,
                )
            )

    return agencies
2024-03-13 08:35:00 +00:00
def extract_info(content):
    """Extract every agency from the HTML of the Wikipedia list page.

    Each ``<h2>`` that carries a ``span.mw-headline`` is treated as a country
    section unless its text is one of the page's trailing housekeeping
    sections.

    Args:
        content: HTML markup of the page, parsed with BeautifulSoup's
            ``html.parser``.

    Returns:
        list: The concatenated results of ``get_agencies_from_country`` for
        all country sections, in document order.
    """
    ignored_sections = ("See also", "References", "External links", "Further reading")
    soup = BeautifulSoup(content, 'html.parser')

    agencies = []
    for h2 in soup.find_all('h2'):
        span = h2.find('span', {'class': 'mw-headline'})
        if not span:
            continue

        # Strip before comparing so that a padded heading such as
        # "References " is still recognised as an ignored section.
        current_country = span.text.strip()
        if not current_country or current_country in ignored_sections:
            continue

        agencies.extend(get_agencies_from_country(h2, current_country))

    return agencies
if __name__ == '__main__':
    # Fetch the page, scrape it, then emit the galaxy and cluster files.
    wiki = WikipediaAPI()
    page_title = 'List of intelligence agencies'
    content = wiki.get_page_html(page_title)

    if not content:
        # A single formatted message; passing `content` as a second argument
        # made the exception render as a tuple.
        raise ValueError(f"Error: No content found: {content!r}")
    agencies = extract_info(content)

    # get_authors() yields mappings; only their 'name' entry is kept.
    authors = [x['name'] for x in wiki.get_authors(page_title)]

    # Write to files
    galaxy = Galaxy(
        description="List of intelligence agencies",
        icon="ninja",
        name="Intelligence Agencies",
        namespace="intelligence-agency",
        type="intelligence-agency",
        uuid=UUID,
        version=1,
    )
    galaxy.save_to_file(os.path.join(GALAXY_PATH, f'{GALAXY_NAME}.json'))

    # The cluster reuses the galaxy's UUID and type.
    cluster = Cluster(
        authors=authors,
        category="Intelligence Agencies",
        description="List of intelligence agencies",
        name="Intelligence Agencies",
        source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
        type="intelligence-agency",
        uuid=UUID,
        version=1,
    )
    for agency in agencies:
        cluster.add_value(agency)

    cluster.save_to_file(os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json'))