mirror of
https://github.com/MISP/misp-galaxy.git
synced 2024-11-22 23:07:19 +00:00
Add [synonyms] and fix individual mistakes
This commit is contained in:
parent
bb28408b14
commit
c88253baea
7 changed files with 760 additions and 40 deletions
File diff suppressed because it is too large
Load diff
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"description": "List of intelligence agencies",
|
"description": "List of intelligence agencies",
|
||||||
"icon": "ninja",
|
"icon": "ninja",
|
||||||
"name": "intelligence-agencies",
|
"name": "Intelligence Agencies",
|
||||||
"namespace": "intelligence-agency",
|
"namespace": "intelligence-agency",
|
||||||
"type": "intelligence-agency",
|
"type": "intelligence-agency",
|
||||||
"uuid": "3ef969e7-96cd-4048-aa83-191ac457d0db",
|
"uuid": "3ef969e7-96cd-4048-aa83-191ac457d0db",
|
||||||
|
|
|
@ -28,29 +28,68 @@ def get_notes_on_lower_level(content):
|
||||||
if li.find('ul'):
|
if li.find('ul'):
|
||||||
notes.extend(get_notes_on_lower_level(li.find('ul')))
|
notes.extend(get_notes_on_lower_level(li.find('ul')))
|
||||||
else:
|
else:
|
||||||
|
|
||||||
|
if li.text in ["Islamic Republic of Iran Army:", "Islamic Revolutionary Guard Corps:", "FARAJA", "Judicial system of the Islamic Republic of Iran", "Intelligence [12]", "Intelligence org"]: # These are not intelligence agencies but Iran's entry is broken
|
||||||
|
continue
|
||||||
|
|
||||||
a_tag = li.find('a')
|
a_tag = li.find('a')
|
||||||
|
|
||||||
title = li.text
|
title = li.text
|
||||||
link_href = None
|
link_href = None
|
||||||
description = li.text
|
description = li.text
|
||||||
|
|
||||||
|
i_tag = li.find_all('i')
|
||||||
|
synonyms = [i.text for i in i_tag]
|
||||||
|
|
||||||
if a_tag:
|
if a_tag:
|
||||||
title = a_tag.get('title', description)
|
title = a_tag.get('title', description)
|
||||||
if a_tag.has_attr('href'):
|
if a_tag.has_attr('href'):
|
||||||
link_href = f'{WIKIPEDIA_URL}{a_tag["href"]}'
|
link_href = f'{WIKIPEDIA_URL}{a_tag["href"]}'
|
||||||
|
|
||||||
notes.append((title, link_href, description, None))
|
if len(synonyms) == 0 or synonyms[0] == title:
|
||||||
|
synonyms = None
|
||||||
|
|
||||||
|
notes.append((title, link_href, description, synonyms))
|
||||||
return notes
|
return notes
|
||||||
|
|
||||||
def get_agencies_from_country(heading, current_country, uuids):
|
def get_agencies_from_country(heading, current_country, uuids):
|
||||||
agencies = []
|
agencies = []
|
||||||
content = heading.find_next('ul')
|
contents = []
|
||||||
|
if current_country != "Gambia": # Gambia has a mistake on the wikipedia page
|
||||||
|
contents.append(heading.find_next('ul'))
|
||||||
|
else:
|
||||||
|
soup = BeautifulSoup(str(heading), 'html.parser')
|
||||||
|
ul_tag = soup.new_tag('ul')
|
||||||
|
li_tag = soup.new_tag('li')
|
||||||
|
a_tag = heading.find_next('p').find('a')
|
||||||
|
li_tag.append(a_tag)
|
||||||
|
ul_tag.append(li_tag)
|
||||||
|
contents.append(ul_tag)
|
||||||
|
|
||||||
|
current_content = contents[0]
|
||||||
|
while True:
|
||||||
|
next_sibling = current_content.find_next_sibling()
|
||||||
|
|
||||||
|
if next_sibling is None or next_sibling.name == 'h2':
|
||||||
|
break
|
||||||
|
|
||||||
|
if current_country == "Bahamas" and next_sibling.name == 'h2': # Bahamas has a mistake on the wikipedia page
|
||||||
|
current_country = None
|
||||||
|
continue
|
||||||
|
|
||||||
|
if next_sibling.name == 'ul':
|
||||||
|
contents.append(next_sibling)
|
||||||
|
|
||||||
|
current_content = next_sibling
|
||||||
|
|
||||||
|
for content in contents:
|
||||||
agency_names = get_notes_on_lower_level(content)
|
agency_names = get_notes_on_lower_level(content)
|
||||||
for name, links, description, synonyms in agency_names:
|
for name, links, description, synonyms in agency_names:
|
||||||
if uuids and name in uuids:
|
if uuids and name in uuids:
|
||||||
agencies.append(IntelAgency(value=name, uuid=uuids[name], meta=Meta(country=current_country, refs=[links]), description=description))
|
agencies.append(IntelAgency(value=name, uuid=uuids[name], meta=Meta(country=current_country, refs=[links], synonyms=[synonyms]), description=description))
|
||||||
else:
|
else:
|
||||||
agencies.append(IntelAgency(value=name, meta=Meta(country=current_country, refs=[links]), uuid=str(uuid.uuid4()), description=description))
|
agencies.append(IntelAgency(value=name, meta=Meta(country=current_country, refs=[links], synonyms=[synonyms]), uuid=str(uuid.uuid4()), description=description))
|
||||||
|
|
||||||
return agencies
|
return agencies
|
||||||
|
|
||||||
def extract_info(content, uuids):
|
def extract_info(content, uuids):
|
||||||
|
@ -71,6 +110,7 @@ if __name__ == '__main__':
|
||||||
wiki = WikipediaAPI()
|
wiki = WikipediaAPI()
|
||||||
page_title = 'List of intelligence agencies'
|
page_title = 'List of intelligence agencies'
|
||||||
content = wiki.get_page_html(page_title)
|
content = wiki.get_page_html(page_title)
|
||||||
|
# print(content)
|
||||||
uuids = get_UUIDs()
|
uuids = get_UUIDs()
|
||||||
if content and uuids:
|
if content and uuids:
|
||||||
agencies = extract_info(content, uuids)
|
agencies = extract_info(content, uuids)
|
||||||
|
@ -84,7 +124,7 @@ if __name__ == '__main__':
|
||||||
galaxy = Galaxy(
|
galaxy = Galaxy(
|
||||||
description="List of intelligence agencies",
|
description="List of intelligence agencies",
|
||||||
icon="ninja",
|
icon="ninja",
|
||||||
name="intelligence-agencies",
|
name="Intelligence Agencies",
|
||||||
namespace="intelligence-agency",
|
namespace="intelligence-agency",
|
||||||
type="intelligence-agency",
|
type="intelligence-agency",
|
||||||
uuid=UUID,
|
uuid=UUID,
|
||||||
|
@ -96,7 +136,7 @@ if __name__ == '__main__':
|
||||||
authors="Wikipedia",
|
authors="Wikipedia",
|
||||||
category="Intelligence Agencies",
|
category="Intelligence Agencies",
|
||||||
description="List of intelligence agencies",
|
description="List of intelligence agencies",
|
||||||
name="intelligence-agencies",
|
name="Intelligence Agencies",
|
||||||
source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
|
source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
|
||||||
type="intelligence-agency",
|
type="intelligence-agency",
|
||||||
uuid=UUID,
|
uuid=UUID,
|
Loading…
Reference in a new issue