Add [synonyms] and fix individual mistakes

This commit is contained in:
niclas 2024-03-12 13:00:57 +01:00
parent bb28408b14
commit c88253baea
7 changed files with 760 additions and 40 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
{ {
"description": "List of intelligence agencies", "description": "List of intelligence agencies",
"icon": "ninja", "icon": "ninja",
"name": "intelligence-agencies", "name": "Intelligence Agencies",
"namespace": "intelligence-agency", "namespace": "intelligence-agency",
"type": "intelligence-agency", "type": "intelligence-agency",
"uuid": "3ef969e7-96cd-4048-aa83-191ac457d0db", "uuid": "3ef969e7-96cd-4048-aa83-191ac457d0db",

View file

@ -28,29 +28,68 @@ def get_notes_on_lower_level(content):
if li.find('ul'): if li.find('ul'):
notes.extend(get_notes_on_lower_level(li.find('ul'))) notes.extend(get_notes_on_lower_level(li.find('ul')))
else: else:
if li.text in ["Islamic Republic of Iran Army:", "Islamic Revolutionary Guard Corps:", "FARAJA", "Judicial system of the Islamic Republic of Iran", "Intelligence [12]", "Intelligence org"]: # These are not intelligence agencies but Iran's entry is broken
continue
a_tag = li.find('a') a_tag = li.find('a')
title = li.text title = li.text
link_href = None link_href = None
description = li.text description = li.text
i_tag = li.find_all('i')
synonyms = [i.text for i in i_tag]
if a_tag: if a_tag:
title = a_tag.get('title', description) title = a_tag.get('title', description)
if a_tag.has_attr('href'): if a_tag.has_attr('href'):
link_href = f'{WIKIPEDIA_URL}{a_tag["href"]}' link_href = f'{WIKIPEDIA_URL}{a_tag["href"]}'
notes.append((title, link_href, description, None)) if len(synonyms) == 0 or synonyms[0] == title:
synonyms = None
notes.append((title, link_href, description, synonyms))
return notes return notes
def _collect_country_lists(heading, current_country):
    """Return the ``<ul>`` elements holding the agencies listed under *heading*.

    The first list is located relative to *heading*; every further ``<ul>``
    that is a following sibling of that list is collected until the next
    ``<h2>`` (or the end of the siblings) is reached.

    An empty list is returned when no starting list can be found.
    """
    if current_country != "Gambia":
        first_list = heading.find_next('ul')
        if first_list is None:
            return []
    else:
        # Gambia has a mistake on the Wikipedia page: instead of using the
        # next <ul>, take the first link of the next paragraph and wrap it in
        # a synthetic <ul><li> so it can be parsed like any other entry.
        paragraph = heading.find_next('p')
        a_tag = paragraph.find('a') if paragraph is not None else None
        if a_tag is None:
            return []
        soup = BeautifulSoup(str(heading), 'html.parser')
        first_list = soup.new_tag('ul')
        li_tag = soup.new_tag('li')
        li_tag.append(a_tag)
        first_list.append(li_tag)

    lists = [first_list]
    node = first_list
    while True:
        node = node.find_next_sibling()
        # Stop at the end of the siblings or at the next <h2> heading.
        # NOTE(review): a "Bahamas" special case used to follow this check,
        # but it tested for the same <h2> condition and was therefore
        # unreachable (and would have looped forever if reached). It has been
        # dropped; re-add a working version if Bahamas still needs one.
        if node is None or node.name == 'h2':
            break
        if node.name == 'ul':
            lists.append(node)
    return lists


def get_agencies_from_country(heading, current_country, uuids):
    """Build the ``IntelAgency`` entries listed for one country.

    Args:
        heading: The parsed heading element of the country section; the
            agency lists are searched for starting from this element.
        current_country: Country name stored in each agency's ``Meta``.
        uuids: Mapping of agency name to an already assigned UUID. Names not
            present (or a falsy mapping) get a freshly generated UUID.

    Returns:
        A list of ``IntelAgency`` objects, one per entry returned by
        ``get_notes_on_lower_level`` for every collected list. The list is
        empty when no agency list could be located.
    """
    agencies = []
    for content in _collect_country_lists(heading, current_country):
        for name, links, description, synonyms in get_notes_on_lower_level(content):
            # Reuse the known UUID so identifiers stay stable between runs.
            if uuids and name in uuids:
                agency_uuid = uuids[name]
            else:
                agency_uuid = str(uuid.uuid4())
            # `synonyms` is already a list of strings or None, so it is passed
            # through unchanged instead of being wrapped in another list.
            # NOTE(review): confirm that Meta accepts None for "no synonyms".
            meta = Meta(country=current_country, refs=[links], synonyms=synonyms)
            agencies.append(IntelAgency(value=name, uuid=agency_uuid, meta=meta, description=description))
    return agencies
def extract_info(content, uuids): def extract_info(content, uuids):
@ -71,6 +110,7 @@ if __name__ == '__main__':
wiki = WikipediaAPI() wiki = WikipediaAPI()
page_title = 'List of intelligence agencies' page_title = 'List of intelligence agencies'
content = wiki.get_page_html(page_title) content = wiki.get_page_html(page_title)
# print(content)
uuids = get_UUIDs() uuids = get_UUIDs()
if content and uuids: if content and uuids:
agencies = extract_info(content, uuids) agencies = extract_info(content, uuids)
@ -84,7 +124,7 @@ if __name__ == '__main__':
galaxy = Galaxy( galaxy = Galaxy(
description="List of intelligence agencies", description="List of intelligence agencies",
icon="ninja", icon="ninja",
name="intelligence-agencies", name="Intelligence Agencies",
namespace="intelligence-agency", namespace="intelligence-agency",
type="intelligence-agency", type="intelligence-agency",
uuid=UUID, uuid=UUID,
@ -96,7 +136,7 @@ if __name__ == '__main__':
authors="Wikipedia", authors="Wikipedia",
category="Intelligence Agencies", category="Intelligence Agencies",
description="List of intelligence agencies", description="List of intelligence agencies",
name="intelligence-agencies", name="Intelligence Agencies",
source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies", source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
type="intelligence-agency", type="intelligence-agency",
uuid=UUID, uuid=UUID,