misp-galaxy/tools/IntelAgencies/main.py

from modules.api import WikipediaAPI
from modules.intel import IntelAgency, Meta, Galaxy, Cluster
import os
import uuid
import json

from bs4 import BeautifulSoup
import pycountry

CLUSTER_PATH = '../../clusters'
GALAXY_PATH = '../../galaxies'
GALAXY_NAME = 'intelligence-agencies'
UUID = "3ef969e7-96cd-4048-aa83-191ac457d0db"
WIKIPEDIA_URL = "https://en.wikipedia.org"

COUNTRY_CODES = {
    "Brunei": "BN",
    "People's Republic of China": "CN",
    "Democratic Republic of the Congo": "CD",  # Note: This is for the Democratic Republic of the Congo, not to be confused with the Republic of the Congo (CG)
    "Czech Republic": "CZ",
    "Iran": "IR",
    "Moldova": "MD",  # Officially known as the Republic of Moldova
    "North Korea": "KP",  # Officially the Democratic People's Republic of Korea (DPRK)
    "Palestine": "PS",
    "Russia": "RU",  # Officially the Russian Federation
    "South Korea": "KR",  # Officially the Republic of Korea (ROK)
    "Syria": "SY",  # Officially the Syrian Arab Republic
    "Taiwan": "TW",  # ISO code is assigned as "Taiwan, Province of China"
    "Tanzania": "TZ",  # Officially the United Republic of Tanzania
    "Trinidad & Tobago": "TT",
    "Turkey": "TR",
    "Venezuela": "VE",  # Officially the Bolivarian Republic of Venezuela
    "Vietnam": "VN",  # Officially the Socialist Republic of Vietnam
    "European Union": None,  # Not a country, no ISO code
    "Shanghai Cooperation Organisation": None  # Not a country, no ISO code
}

def compute_uuid(value, namespace=UUID):
    return str(uuid.uuid5(uuid.UUID(namespace), value))

def get_notes_on_lower_level(content):
    notes = []
    for li in content.find_all('li', recursive=False):
        if li.find('ul'):
            notes.extend(get_notes_on_lower_level(li.find('ul')))
        else:

            if li.text in ["Islamic Republic of Iran Army:", "Islamic Revolutionary Guard Corps:", "FARAJA", "Judicial system of the Islamic Republic of Iran", "Intelligence [12]", "Intelligence org"]: # These are not intelligence agencies but Iran's entry is broken
                continue

            a_tag = li.find('a')

            title = li.text
            link_href = None
            description = li.text

            i_tag = li.find_all('i')
            synonyms = [i.text for i in i_tag]
            
            if a_tag:
                title = a_tag.get('title', description)
                if a_tag.has_attr('href'):
                    link_href = f'{WIKIPEDIA_URL}{a_tag["href"]}'

            if len(synonyms) == 0 or synonyms[0] == title:
                synonyms = None

            notes.append((title, link_href, description, synonyms))
    return notes

def get_agencies_from_country(heading, current_country):
    agencies = []
    contents = []
    if current_country != "Gambia": # Gambia has a mistake on the wikipedia page
        contents.append(heading.find_next('ul'))
    else:
        soup = BeautifulSoup(str(heading), 'html.parser')
        ul_tag = soup.new_tag('ul')
        li_tag = soup.new_tag('li')
        a_tag = heading.find_next('p').find('a')
        li_tag.append(a_tag)
        ul_tag.append(li_tag)
        contents.append(ul_tag)
     
    current_content = contents[0]
    while True:
        next_sibling = current_content.find_next_sibling()

        if next_sibling is None or next_sibling.name == 'h2':
            break

        if current_country == "Bahamas" and next_sibling.name == 'h2': # Bahamas has a mistake on the wikipedia page
            current_country = None 
            continue

        if next_sibling.name == 'ul':
            contents.append(next_sibling)

        current_content = next_sibling
    
    for content in contents:
        agency_names = get_notes_on_lower_level(content)
        for name, links, description, synonyms in agency_names:
            country_code = pycountry.countries.get(name=current_country)

            # Set country
            country_name = current_country

            if country_code:
                country_code = country_code.alpha_2
            else:
                country_code = COUNTRY_CODES.get(current_country)

            if current_country in ["European Union", "Shanghai Cooperation Organisation"]: # Not a country
                country_name = None

            # Set names for duplicates
            if name in ['Special Branch', 'Financial Intelligence Unit']:
                name = f'{name} ({current_country})'
            
            agencies.append(IntelAgency(value=name, uuid=compute_uuid(name), meta=Meta(country=country_code, country_name=country_name, refs=[links], synonyms=synonyms), description=description))
                
    return agencies
    
def extract_info(content):
    IGNORE = ["See also", "References", "External links", "Further reading"]
    soup = BeautifulSoup(content, 'html.parser')
    agencies = []
    current_country = None
    for h2 in soup.find_all('h2'):
        span = h2.find('span', {'class': 'mw-headline'})
        if span and span.text not in IGNORE:
            current_country = span.text.strip()
            agencies.extend(get_agencies_from_country(h2, current_country))
        else:
            continue
    return agencies
    
if __name__ == '__main__':
    wiki = WikipediaAPI()
    page_title = 'List of intelligence agencies'
    content = wiki.get_page_html(page_title)
    if content:
        agencies = extract_info(content)
    else:
        raise ValueError("Error: No content found: ", content)

    authors = [x['name'] for x in wiki.get_authors(page_title)]
    # Write to files
    galaxy = Galaxy(
        description="List of intelligence agencies",
        icon="ninja",
        name="Intelligence Agencies",
        namespace="intelligence-agency",
        type="intelligence-agency",
        uuid=UUID,
        version=1,
    )
    galaxy.save_to_file(os.path.join(GALAXY_PATH, f'{GALAXY_NAME}.json'))

    cluster = Cluster(
        authors=authors,
        category="Intelligence Agencies",
        description="List of intelligence agencies",
        name="Intelligence Agencies",
        source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
        type="intelligence-agency",
        uuid=UUID,
        version=1,
    )
    for agency in agencies:
        cluster.add_value(agency)

    cluster.save_to_file(os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json'))
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`from modules.api import WikipediaAPI`
			`from modules.intel import IntelAgency, Meta, Galaxy, Cluster`
			`import os`
			`import uuid`
			`import json`

			`from bs4 import BeautifulSoup`
Add [cluster] country code 2024-03-12 12:55:00 +00:00			`import pycountry`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00
			`CLUSTER_PATH = '../../clusters'`
			`GALAXY_PATH = '../../galaxies'`
			`GALAXY_NAME = 'intelligence-agencies'`
Add [agencies] refs 2024-03-12 10:22:30 +00:00			`UUID = "3ef969e7-96cd-4048-aa83-191ac457d0db"`
			`WIKIPEDIA_URL = "https://en.wikipedia.org"`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00
Add [cluster] country code 2024-03-12 12:55:00 +00:00			`COUNTRY_CODES = {`
			`"Brunei": "BN",`
			`"People's Republic of China": "CN",`
			`"Democratic Republic of the Congo": "CD", # Note: This is for the Democratic Republic of the Congo, not to be confused with the Republic of the Congo (CG)`
			`"Czech Republic": "CZ",`
			`"Iran": "IR",`
			`"Moldova": "MD", # Officially known as the Republic of Moldova`
			`"North Korea": "KP", # Officially the Democratic People's Republic of Korea (DPRK)`
			`"Palestine": "PS",`
			`"Russia": "RU", # Officially the Russian Federation`
			`"South Korea": "KR", # Officially the Republic of Korea (ROK)`
			`"Syria": "SY", # Officially the Syrian Arab Republic`
			`"Taiwan": "TW", # ISO code is assigned as "Taiwan, Province of China"`
			`"Tanzania": "TZ", # Officially the United Republic of Tanzania`
			`"Trinidad & Tobago": "TT",`
			`"Turkey": "TR",`
			`"Venezuela": "VE", # Officially the Bolivarian Republic of Venezuela`
			`"Vietnam": "VN", # Officially the Socialist Republic of Vietnam`
			`"European Union": None, # Not a country, no ISO code`
			`"Shanghai Cooperation Organisation": None # Not a country, no ISO code`
			`}`

chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`def compute_uuid(value, namespace=UUID):`
			`return str(uuid.uuid5(uuid.UUID(namespace), value))`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00
			`def get_notes_on_lower_level(content):`
			`notes = []`
			`for li in content.find_all('li', recursive=False):`
			`if li.find('ul'):`
			`notes.extend(get_notes_on_lower_level(li.find('ul')))`
			`else:`
Add [synonyms] and fixed indivdual mistakes 2024-03-12 12:00:57 +00:00
			`if li.text in ["Islamic Republic of Iran Army:", "Islamic Revolutionary Guard Corps:", "FARAJA", "Judicial system of the Islamic Republic of Iran", "Intelligence [12]", "Intelligence org"]: # These are not intelligence agencies but Iran's entry is broken`
			`continue`

Add [agencies] refs 2024-03-12 10:22:30 +00:00			`a_tag = li.find('a')`

			`title = li.text`
			`link_href = None`
			`description = li.text`

Add [synonyms] and fixed indivdual mistakes 2024-03-12 12:00:57 +00:00			`i_tag = li.find_all('i')`
			`synonyms = [i.text for i in i_tag]`

Add [agencies] refs 2024-03-12 10:22:30 +00:00			`if a_tag:`
			`title = a_tag.get('title', description)`
			`if a_tag.has_attr('href'):`
			`link_href = f'{WIKIPEDIA_URL}{a_tag["href"]}'`

Add [synonyms] and fixed indivdual mistakes 2024-03-12 12:00:57 +00:00			`if len(synonyms) == 0 or synonyms[0] == title:`
			`synonyms = None`

			`notes.append((title, link_href, description, synonyms))`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`return notes`

chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`def get_agencies_from_country(heading, current_country):`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`agencies = []`
Add [synonyms] and fixed indivdual mistakes 2024-03-12 12:00:57 +00:00			`contents = []`
			`if current_country != "Gambia": # Gambia has a mistake on the wikipedia page`
			`contents.append(heading.find_next('ul'))`
			`else:`
			`soup = BeautifulSoup(str(heading), 'html.parser')`
			`ul_tag = soup.new_tag('ul')`
			`li_tag = soup.new_tag('li')`
			`a_tag = heading.find_next('p').find('a')`
			`li_tag.append(a_tag)`
			`ul_tag.append(li_tag)`
			`contents.append(ul_tag)`

			`current_content = contents[0]`
			`while True:`
			`next_sibling = current_content.find_next_sibling()`

			`if next_sibling is None or next_sibling.name == 'h2':`
			`break`

			`if current_country == "Bahamas" and next_sibling.name == 'h2': # Bahamas has a mistake on the wikipedia page`
			`current_country = None`
			`continue`

			`if next_sibling.name == 'ul':`
			`contents.append(next_sibling)`

			`current_content = next_sibling`

			`for content in contents:`
			`agency_names = get_notes_on_lower_level(content)`
			`for name, links, description, synonyms in agency_names:`
Add [cluster] country code 2024-03-12 12:55:00 +00:00			`country_code = pycountry.countries.get(name=current_country)`

			`# Set country`
			`country_name = current_country`

			`if country_code:`
			`country_code = country_code.alpha_2`
			`else:`
			`country_code = COUNTRY_CODES.get(current_country)`

			`if current_country in ["European Union", "Shanghai Cooperation Organisation"]: # Not a country`
			`country_name = None`
Fix [cluster] duplicates 2024-03-12 13:23:11 +00:00
			`# Set names for duplicates`
			`if name in ['Special Branch', 'Financial Intelligence Unit']:`
			`name = f'{name} ({current_country})'`
Add [cluster] country code 2024-03-12 12:55:00 +00:00
chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`agencies.append(IntelAgency(value=name, uuid=compute_uuid(name), meta=Meta(country=country_code, country_name=country_name, refs=[links], synonyms=synonyms), description=description))`

Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`return agencies`

chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`def extract_info(content):`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`IGNORE = ["See also", "References", "External links", "Further reading"]`
			`soup = BeautifulSoup(content, 'html.parser')`
			`agencies = []`
			`current_country = None`
			`for h2 in soup.find_all('h2'):`
			`span = h2.find('span', {'class': 'mw-headline'})`
			`if span and span.text not in IGNORE:`
			`current_country = span.text.strip()`
chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`agencies.extend(get_agencies_from_country(h2, current_country))`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`else:`
			`continue`
			`return agencies`

			`if __name__ == '__main__':`
			`wiki = WikipediaAPI()`
			`page_title = 'List of intelligence agencies'`
			`content = wiki.get_page_html(page_title)`
chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`if content:`
			`agencies = extract_info(content)`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`else:`
chg: [intel] use UUIDv5 for clusters 2024-03-13 08:35:00 +00:00			`raise ValueError("Error: No content found: ", content)`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00
Add [cluster] authors 2024-03-12 13:06:14 +00:00			`authors = [x['name'] for x in wiki.get_authors(page_title)]`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`# Write to files`
			`galaxy = Galaxy(`
			`description="List of intelligence agencies",`
			`icon="ninja",`
Add [synonyms] and fixed indivdual mistakes 2024-03-12 12:00:57 +00:00			`name="Intelligence Agencies",`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`namespace="intelligence-agency",`
			`type="intelligence-agency",`
			`uuid=UUID,`
			`version=1,`
			`)`
			`galaxy.save_to_file(os.path.join(GALAXY_PATH, f'{GALAXY_NAME}.json'))`

			`cluster = Cluster(`
Add [cluster] authors 2024-03-12 13:06:14 +00:00			`authors=authors,`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`category="Intelligence Agencies",`
			`description="List of intelligence agencies",`
Add [synonyms] and fixed indivdual mistakes 2024-03-12 12:00:57 +00:00			`name="Intelligence Agencies",`
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",`
			`type="intelligence-agency",`
			`uuid=UUID,`
			`version=1,`
			`)`
			`for agency in agencies:`
			`cluster.add_value(agency)`
Add [agencies] refs 2024-03-12 10:22:30 +00:00
Add [intel-agencies] build script 2024-03-11 15:29:36 +00:00			`cluster.save_to_file(os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json'))`