Merge pull request #946 from NMD03/intel

Intelligence Agencies
2024-11-22 23:07:19 +00:00 · 2024-03-13 16:31:16 +01:00 · 2024-03-13 16:31:16 +01:00 · 5f1b2305cf
commit 5f1b2305cf
parent 3f3b7984a8 7885a8fd00
6 changed files with 6050 additions and 0 deletions
--- a/clusters/intelligence-agencies.json
+++ b/clusters/intelligence-agencies.json
--- a/galaxies/intelligence-agencies.json
+++ b/galaxies/intelligence-agencies.json
@ -0,0 +1,9 @@
 {
  "description": "List of intelligence agencies",
  "icon": "ninja",
  "name": "Intelligence Agencies",
  "namespace": "intelligence-agency",
  "type": "intelligence-agency",
  "uuid": "3ef969e7-96cd-4048-aa83-191ac457d0db",
  "version": 1
 }
--- a/tools/IntelAgencies/main.py
+++ b/tools/IntelAgencies/main.py
@ -0,0 +1,157 @@
 from modules.api import WikipediaAPI
 from modules.intel import IntelAgency, Meta, Galaxy, Cluster
 import os
 import uuid
 import json
 from bs4 import BeautifulSoup
 import pycountry
 # Output directories for the generated files (relative paths, so they are
 # resolved against the directory the script is run from).
 CLUSTER_PATH = '../../clusters'
 GALAXY_PATH = '../../galaxies'
 # File stem shared by the generated galaxy and cluster JSON files.
 GALAXY_NAME = 'intelligence-agencies'
 # UUID of the galaxy and the cluster; also the default namespace of compute_uuid().
 UUID = "3ef969e7-96cd-4048-aa83-191ac457d0db"
 # Base URL used to turn the href values scraped from the page into absolute links.
 WIKIPEDIA_URL = "https://en.wikipedia.org"
 # Fallback mapping from a section heading to a two-letter country code. It is
 # consulted when pycountry.countries.get(name=...) has no match for the
 # heading text. A value of None marks headings that are not countries.
 COUNTRY_CODES = {
    "Brunei": "BN",
    "People's Republic of China": "CN",
    "Democratic Republic of the Congo": "CD",  # Note: This is for the Democratic Republic of the Congo, not to be confused with the Republic of the Congo (CG)
    "Czech Republic": "CZ",
    "Iran": "IR",
    "Moldova": "MD",  # Officially known as the Republic of Moldova
    "North Korea": "KP",  # Officially the Democratic People's Republic of Korea (DPRK)
    "Palestine": "PS",
    "Russia": "RU",  # Officially the Russian Federation
    "South Korea": "KR",  # Officially the Republic of Korea (ROK)
    "Syria": "SY",  # Officially the Syrian Arab Republic
    "Taiwan": "TW",  # ISO code is assigned as "Taiwan, Province of China"
    "Tanzania": "TZ",  # Officially the United Republic of Tanzania
    "Trinidad & Tobago": "TT",
    "Turkey": "TR",
    "Venezuela": "VE",  # Officially the Bolivarian Republic of Venezuela
    "Vietnam": "VN",  # Officially the Socialist Republic of Vietnam
    "European Union": None,  # Not a country, no ISO code
    "Shanghai Cooperation Organisation": None  # Not a country, no ISO code
 }
 def compute_uuid(value, namespace=UUID):
     """Return the version-5 UUID of ``value`` inside ``namespace`` as a string.

     ``namespace`` is the textual form of the namespace UUID; it defaults to
     the module-level ``UUID`` constant.
     """
     namespace_uuid = uuid.UUID(namespace)
     derived = uuid.uuid5(namespace_uuid, value)
     return str(derived)
 def get_notes_on_lower_level(content):
     """Collect one entry per leaf ``<li>`` found below a list element.

     An item that itself contains a ``<ul>`` is not recorded; the items of
     its first nested list are collected instead, recursively.

     Args:
         content: BeautifulSoup tag whose direct ``<li>`` children are read.

     Returns:
         A list of ``(title, link, description, synonyms)`` tuples where

         * ``title`` is the ``title`` attribute of the item's first link, or
           the item text when there is no link or no such attribute;
         * ``link`` is the absolute URL of that link, or ``None``;
         * ``description`` is the full text of the item;
         * ``synonyms`` is the text of every ``<i>`` element in the item, or
           ``None`` when there are none or the first one equals ``title``.
     """
     # Imported here so the block does not depend on a new top-of-file import.
     from urllib.parse import urljoin

     notes = []
     for li in content.find_all('li', recursive=False):
         nested_list = li.find('ul')
         if nested_list is not None:
             notes.extend(get_notes_on_lower_level(nested_list))
             continue

         description = li.text
         title = description
         link_href = None
         a_tag = li.find('a')
         if a_tag:
             title = a_tag.get('title', description)
             if a_tag.has_attr('href'):
                 # urljoin keeps site-relative paths working exactly like the
                 # former string concatenation, but leaves absolute and
                 # protocol-relative hrefs intact instead of gluing the
                 # Wikipedia host in front of them.
                 link_href = urljoin(WIKIPEDIA_URL, a_tag['href'])

         synonyms = [i.text for i in li.find_all('i')]
         if not synonyms or synonyms[0] == title:
             synonyms = None
         notes.append((title, link_href, description, synonyms))
     return notes
 def get_agencies_from_country(heading, current_country):
     """Build the ``IntelAgency`` entries listed under one section heading.

     Starting at the first ``<ul>`` after ``heading``, every following
     sibling ``<ul>`` is collected until an ``<h2>`` sibling or the end of
     the sibling chain is reached. Each leaf item of those lists becomes one
     agency.

     Args:
         heading: BeautifulSoup tag of the section heading.
         current_country: Heading text, used as the country name.

     Returns:
         A list of ``IntelAgency`` objects; empty when no list follows the
         heading.
     """
     first_list = heading.find_next('ul')
     if first_list is None:
         # Nothing to parse; previously this crashed on None.find_next_sibling().
         return []

     contents = [first_list]
     current_content = first_list
     while True:
         next_sibling = current_content.find_next_sibling()
         if next_sibling is None or next_sibling.name == 'h2':
             break
         if next_sibling.name == 'ul':
             contents.append(next_sibling)
         current_content = next_sibling

     # The country data is the same for every agency of the section, so it is
     # resolved once instead of once per list item.
     country = pycountry.countries.get(name=current_country)
     if country:
         country_code = country.alpha_2
     else:
         country_code = COUNTRY_CODES.get(current_country)

     country_name = current_country
     if current_country in ["European Union", "Shanghai Cooperation Organisation"]:  # Not a country
         country_name = None

     agencies = []
     for content in contents:
         for name, link, description, synonyms in get_notes_on_lower_level(content):
             # These names occur under several headings; the heading is
             # appended so the value (and the UUID derived from it) stays unique.
             if name in ['Special Branch', 'Financial Intelligence Unit']:
                 name = f'{name} ({current_country})'
             agencies.append(
                 IntelAgency(
                     value=name,
                     uuid=compute_uuid(name),
                     meta=Meta(
                         country=country_code,
                         country_name=country_name,
                         refs=[link],
                         synonyms=synonyms,
                     ),
                     description=description,
                 )
             )
     return agencies
 def extract_info(content):
     """Parse the page HTML and return the agencies of every country section.

     Only ``<h2>`` elements containing a ``<span class="mw-headline">`` are
     treated as section headings; headings named in ``IGNORE`` are skipped.

     Args:
         content: HTML of the page as a string.

     Returns:
         A list with the ``IntelAgency`` objects of all sections, in page order.
     """
     IGNORE = ["See also", "References", "External links", "Further reading"]
     soup = BeautifulSoup(content, 'html.parser')
     agencies = []
     for h2 in soup.find_all('h2'):
         span = h2.find('span', {'class': 'mw-headline'})
         if not span:
             continue
         # Strip before comparing so surrounding whitespace in the heading
         # cannot let an ignored section slip through.
         current_country = span.text.strip()
         if current_country in IGNORE:
             continue
         agencies.extend(get_agencies_from_country(h2, current_country))
     return agencies
 if __name__ == '__main__':
     # Fetch the Wikipedia page, extract the agencies and write the galaxy
     # and cluster JSON files.
     wiki = WikipediaAPI()
     page_title = 'List of intelligence agencies'

     content = wiki.get_page_html(page_title)
     if not content:
         # One formatted message instead of two positional exception args.
         raise ValueError(f"Error: No content found: {content!r}")
     agencies = extract_info(content)

     # get_authors() returns None on failure; fail with a clear message
     # rather than a TypeError from iterating None.
     contributors = wiki.get_authors(page_title)
     if contributors is None:
         raise ValueError(f"Error: No authors found for page: {page_title!r}")
     authors = [x['name'] for x in contributors]

     # Write to files
     galaxy = Galaxy(
         description="List of intelligence agencies",
         icon="ninja",
         name="Intelligence Agencies",
         namespace="intelligence-agency",
         type="intelligence-agency",
         uuid=UUID,
         version=1,
     )
     galaxy.save_to_file(os.path.join(GALAXY_PATH, f'{GALAXY_NAME}.json'))

     cluster = Cluster(
         authors=authors,
         category="Intelligence Agencies",
         description="List of intelligence agencies",
         name="Intelligence Agencies",
         source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
         type="intelligence-agency",
         uuid=UUID,
         version=1,
     )
     for agency in agencies:
         cluster.add_value(agency)
     cluster.save_to_file(os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json'))
--- a/tools/IntelAgencies/modules/init.py
+++ b/tools/IntelAgencies/modules/init.py
--- a/tools/IntelAgencies/modules/api.py
+++ b/tools/IntelAgencies/modules/api.py
@ -0,0 +1,72 @@
 import requests
 class WikipediaAPI():
     """Small client for the English Wikipedia MediaWiki action API.

     Each public method issues a single GET request and returns ``None``
     (after printing ``Error: ...``) when the request fails or the response
     does not have the expected structure.
     """

     def __init__(self, timeout=30):
         """Create a client.

         Args:
             timeout: Seconds to wait for the server, passed to
                 ``requests.get`` so a stalled connection cannot block the
                 script indefinitely.
         """
         self.base_url = 'https://en.wikipedia.org/w/api.php'
         self.timeout = timeout

     def _fetch(self, params, extract):
         """GET ``params`` from the API and return ``extract(decoded_json)``.

         Returns ``None`` and prints the error on network/HTTP failures,
         undecodable bodies, or when ``extract`` cannot find what it needs.
         """
         try:
             response = requests.get(self.base_url, params=params, timeout=self.timeout)
             # Report an HTTP error as such rather than as a lookup failure
             # on the error body.
             response.raise_for_status()
             return extract(response.json())
         except (requests.RequestException, ValueError, LookupError,
                 StopIteration, TypeError) as e:
             print(f'Error: {e}')
             return None

     @staticmethod
     def _first_page(data):
         """Return the first entry of ``data['query']['pages']``."""
         pages = data['query']['pages']
         return pages[next(iter(pages))]

     def get_page_summary(self, page_title):
         """Return the ``extract`` field of the page, or ``None`` on failure."""
         params = {
             'action': 'query',
             'format': 'json',
             'titles': page_title,
             'prop': 'extracts',
             'explaintext': True,
         }
         return self._fetch(params, lambda data: self._first_page(data)['extract'])

     def get_page_content(self, page_title):
         """Return the content of the page's first listed revision, or ``None``."""
         params = {
             'action': 'query',
             'format': 'json',
             'titles': page_title,
             'prop': 'revisions',
             'rvprop': 'content',
         }
         return self._fetch(
             params, lambda data: self._first_page(data)['revisions'][0]['*']
         )

     def get_page_html(self, page_title):
         """Return the parsed text of the page (``parse.text``), or ``None``."""
         params = {
             'action': 'parse',
             'format': 'json',
             'page': page_title,
             'prop': 'text',
             'disableeditsection': True,
         }
         return self._fetch(params, lambda data: data['parse']['text']['*'])

     def get_authors(self, page_title):
         """Return the ``contributors`` list of the page, or ``None`` on failure.

         NOTE(review): the request sets no limit and a continuation token in
         the response is not followed, so only what the API returns for this
         single request is reported.
         """
         params = {
             'action': 'query',
             'format': 'json',
             'titles': page_title,
             'prop': 'contributors',
         }
         return self._fetch(params, lambda data: self._first_page(data)['contributors'])
--- a/tools/IntelAgencies/modules/intel.py
+++ b/tools/IntelAgencies/modules/intel.py
@ -0,0 +1,76 @@
 from dataclasses import dataclass, field, asdict, is_dataclass
 import json
@dataclass
class Meta:
    """Metadata attached to one agency entry; every field is optional."""

    # Country code and display name; both default to None.
    country: str = field(default=None)
    country_name: str = field(default=None)
    # Reference links and alternative names; each instance gets its own list.
    refs: list = field(default_factory=list)
    synonyms: list = field(default_factory=list)
 def custom_asdict(obj):
     """Convert a dataclass into a dict, leaving out selected empty values.

     Anything that is not a dataclass is returned unchanged. For a dataclass,
     its fields are visited in order:

     * a field named ``meta`` is converted recursively and then stripped of
       entries that are ``None`` and of ``refs``/``synonyms`` entries that are
       empty or contain only ``None``;
     * a list or tuple whose elements are all dataclasses (including an empty
       one) is converted element by element into a list;
     * a list containing only ``None`` is left out;
     * ``country`` and ``country_name`` are left out when they are ``None``.
     """
     if not is_dataclass(obj):
         return obj

     def keep_meta_entry(key, item):
         # Mirrors the filter applied to the converted meta mapping.
         if item is None:
             return False
         if key in ['refs', 'synonyms'] and (not item or all(e is None for e in item)):
             return False
         return True

     converted = {}
     for field_name in obj.__dataclass_fields__:
         value = getattr(obj, field_name)
         if field_name == 'meta':
             meta_items = custom_asdict(value).items()
             value = {k: v for k, v in meta_items if keep_meta_entry(k, v)}
         elif isinstance(value, (list, tuple)) and all(is_dataclass(i) for i in value):
             value = [custom_asdict(i) for i in value]
         elif isinstance(value, list) and all(e is None for e in value):
             continue
         if field_name in ['country', 'country_name'] and value is None:
             continue
         converted[field_name] = value
     return converted
@dataclass
class IntelAgency:
    """One agency entry of the cluster.

    ``value`` and ``uuid`` are mandatory: creating an instance with either of
    them empty raises ``ValueError``.
    """

    description: str = field(default="")
    meta: Meta = field(default_factory=Meta)
    related: list = field(default_factory=list)
    uuid: str = field(default=None)
    value: str = field(default=None)

    def __post_init__(self):
        """Reject instances whose ``value`` or ``uuid`` is empty."""
        # Checked in this order so an empty value is reported first.
        required = (
            (self.value, "IntelAgency 'value' cannot be empty."),
            (self.uuid, "IntelAgency 'uuid' cannot be empty."),
        )
        for current, message in required:
            if not current:
                raise ValueError(message)
@dataclass
class Galaxy:
    """Description of a galaxy, serialisable to a JSON file."""

    description: str
    icon: str
    name: str
    namespace: str
    type: str
    uuid: str
    version: int

    def save_to_file(self, path: str):
        """Write all fields as JSON (4-space indent) to ``path``, replacing it."""
        with open(path, "w") as handle:
            serialised = json.dumps(asdict(self), indent=4)
            handle.write(serialised)
@dataclass
class Cluster:
    """A cluster: descriptive header fields plus the list of its values."""

    # Sequence of author names (the script passes a list of strings).
    authors: list
    category: str
    description: str
    name: str
    source: str
    type: str
    uuid: str
    version: int
    values: list = field(default_factory=list)

    def add_value(self, value: IntelAgency):
        """Append ``value`` to the cluster's ``values`` list."""
        self.values.append(value)

    def save_to_file(self, path: str):
        """Write the cluster as JSON (4-space indent) to ``path``, replacing it.

        The dict is built with ``custom_asdict``. Non-ASCII characters are
        written verbatim (``ensure_ascii=False``), so the file is opened as
        UTF-8 explicitly instead of relying on the platform's default
        encoding.
        """
        with open(path, "w", encoding="utf-8") as file:
            file.write(json.dumps(custom_asdict(self), indent=4, ensure_ascii=False))