misp-galaxy/tools/gen_defence_university.py
VVX7 b4c3ffc8eb new: [dev] add ASPI's China Defence University Tracker.
Thanks to Cormac Doherty for writing the web scraper! To update the galaxy run the included gen_defence_university.py script.

"The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre.

It includes entries on nearly 100 civilian universities, 50 People’s Liberation Army institutions, China’s nuclear weapons program, three Ministry of State Security institutions, four Ministry of Public Security universities, and 12 state-owned defence industry conglomerates.

The Tracker is a tool to inform universities, governments and scholars as they engage with the entities from the People’s Republic of China. It aims to build understanding of the expansion of military-civil fusion—the Chinese government’s policy of integrating military and civilian efforts—into the education sector.

The Tracker should be used to inform due diligence of Chinese institutions. However, the fact that an institution is not included here does not indicate that it should not raise risks or is not involved in defence research. Similarly, entries in the database may not reflect the full range and nature of an institution’s defence and security links." - ASPI (https://unitracker.aspi.org.au/about/)
2020-08-21 11:24:22 -04:00

284 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
import requests
import json
from bs4 import BeautifulSoup
import bs4
import uuid
# This tool is part of the MISP core project and released under the GNU Affero
# General Public License v3.0
#
# Copyright (C) 2020 Cormac Doherty
# Copyright (C) 2020 Roger Johnston
#
#
# version 0.1 - initial
# version 0.2 - fixed typo ( _curRef NOT curRef)
def _buildArticleSection(nxtSibling):
    """Collect the run of consecutive <p> siblings starting at *nxtSibling*
    into a list of "statement" dicts: {"clause": text} or
    {"clause": text, "ref": url}.

    Walking stops at the first sibling that is not a <p> (e.g. the next
    <h2> heading or the end of the article body).
    """
    _sectionParagraphs = []
    _nxtsib = nxtSibling
    # Headings and their content are at the same hierarchical
    # level in the html - just a sequence. This loop is bounded on
    # the next element being a <p>
    while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
        # Almost every sentence, if not clause, in paragraph
        # text is referenced/cited/footnoted.
        #
        # The following iterates through the sequence of 'tokens'
        # in the current <p>, building 'statements' composed of a
        # clause and a reference.
        #
        # so-called "clauses" and "references" are accumulated over
        # loop iterations i.e. a clause is appended to previous clauses
        # if a reference has yet to be accumulated. (implicitly -
        # references come after statements.)
        #
        # Once a 'clause' AND a 'reference' are accumulated, an
        # encapsulating 'statement' is appended to the section's list of
        # paragraphs and both are reset.
        _curClause = None
        _curRef = None
        for token in _nxtsib.contents:
            # References (links) are interleaved within text blocks as <span>s.
            # The following control structure parses 'the next token' as
            # - <span>s containing a link (becomes the pending reference)
            # - disposable 'junk' if it is <em>phasised and contains "Last updated"
            # - otherwise as relevant paragraph text to be accumulated.
            if (token.name == 'span'):
                _anchors = token.find_all('a', recursive=True)
                _anch = None
                if (len(_anchors) != 0):
                    _anch = _anchors[0]
                if (_anch is not None):
                    _curRef = _anch['href']
                else:
                    _curRef = None
            elif ((token.name != 'em') or (not ("Last updated" in token.text))):  # ignore the "Last updated" footer
                if (_curClause is not None):
                    # A clause is already pending: append this token's text.
                    if (isinstance(token, bs4.element.NavigableString)):
                        _curClause = _curClause + token
                    else:
                        _curClause = _curClause + token.text
                else:
                    # anomalous html handling
                    # - <strong> and
                    # - (useless) <a> tags
                    # appear in a few places
                    if ((token.name != 'strong') and
                            (token.name != 'em') and
                            (token.name != 'br') and
                            (token.name != 'sup') and
                            (token.name != 'a')):
                        _curClause = token  # this quashes them
            # Once a 'clause' AND a 'reference' are accumulated, an
            # encapsulating 'statement' is appended to the section's list of
            # paragraphs and both are reset.
            if ((_curRef is not None) and (_curClause is not None)):
                statement = {}
                statement["clause"] = _curClause
                statement["ref"] = _curRef
                _sectionParagraphs.append(statement)
                _curClause = None
                _curRef = None
        # If a sequence of 'clauses' has been accumulated without finding a
        # reference, create a reference-LESS statement.
        if ((_curClause is not None) and (not "Last updated" in _curClause)):
            statement = {}
            statement["clause"] = _curClause
            _sectionParagraphs.append(statement)
        _nxtsib = _nxtsib.find_next_sibling()
    return _sectionParagraphs
def _buildListSection(listContent):
    """Parse a <ul> of defence laboratories into a list of dicts.

    :param listContent: a bs4 Tag for the <ul> element.
    :returns: list of {"name": str, "ref": url-or-None} dicts, one per <li>.
    """
    laboratories = []
    # recursive=False: only direct <li> children of this list.
    # BUG FIX: the original passed the *string* "False", which is truthy,
    # so the search silently recursed into nested lists as well.
    for lab in listContent.find_all('li', recursive=False):
        _lab = {}
        # Strip non-breaking spaces from the visible lab name.
        _lab['name'] = lab.contents[0].replace(u'\xa0', '')
        ref = lab.find('a')
        if (ref is not None):
            _lab['ref'] = ref['href']
        else:
            _lab['ref'] = None
        laboratories.append(_lab)
    return laboratories
def _fetchArticle(url):
    """Fetch one tracker article page and parse it into a plain dict.

    :param url: absolute URL of the article page.
    :returns: dict with 'url', 'name', '_name' (native-script name),
        'risk statement', 'intro', 'sections', plus one entry per
        side-panel group keyed by the lower-cased panel heading
        (e.g. 'aliases', 'supervising agencies', 'location').
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    _article = soup.body.find_all('article')[0]
    article = {}
    article['url'] = url
    article['name'] = _article.h1.text.replace('\n', '').strip()
    # NOTE(review): the h2 appears to carry the institution's native-script
    # name — confirm against the live page layout.
    article['_name'] = _article.h2.contents[0]
    _artbody = _article.find('div', {"class": "article__copy"})
    # The first <p> is the risk statement; the siblings that follow it
    # form the introduction.
    article['risk statement'] = _artbody.find('p').text
    article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())
    # Article body: each <h2> heading is followed either by a <ul>
    # (parsed as a laboratory list) or by a run of <p> paragraphs.
    sections = []
    for _heading in _artbody.findChildren('h2'):
        _nxtSibling = _heading.find_next_sibling()
        section = {}
        section['title'] = _heading.text
        if (_nxtSibling.name == 'ul'):
            section['body'] = _buildListSection(_nxtSibling)
        else:
            section['body'] = _buildArticleSection(_nxtSibling)
        sections.append(section)
    article['sections'] = sections
    # Side panel: one <div> per group, each headed by an <h3>.
    # (Removed an unused local that duplicated this find_all call.)
    _panel = _article.find("div", {"class": "aside__groups cf"})
    for _paneldiv in _panel.find_all('div'):
        _title = _paneldiv.find('h3').text
        _items = []
        for _item in _paneldiv.find_all('li'):
            _anch = _item.find('a')
            if (_anch is not None):
                if ("Location" in _title):  # locations
                    # The map link's query string carries "lat,long"
                    # after the '='.
                    _loc = {}
                    _loc['name'] = _anch.contents[0].replace('\n', '').strip()
                    _loc['ref'] = _anch['href']
                    _latlong = _anch['href'].split("=")[1]
                    _loc['lat'] = _latlong.split(",")[0]
                    _loc['long'] = _latlong.split(",")[1]
                    _items.append(_loc)
                else:
                    _items.append(_anch.text)
            else:
                _items.append(_item.text.replace('\n', '').strip())
        article[_title.lower()] = _items
    return article
def _gen_galaxy(scrape):
base = {
"authors": [
"Australian Strategic Policy Institute"
],
"category": "academic-institution",
"description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPIs International Cyber Policy Centre.",
"name": "China Defence Universities Tracker",
"source": "ASPI International Cyber Policy Centre",
"type": "china-defence-universities",
"uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
"values": [
],
"version": 1
}
for uni in scrape:
new_template = template = {
# university description
"description": "",
"meta": {
"supervising agency": [],
"subsidiary": [],
"category": [],
"topics": [],
# Defence labs
"Major Defence Laboratories": [],
# Defence labs
"alias": [],
"risk": "",
"address": "",
"lat": "",
"long": "",
# External link to ASPI
"refs": []
},
"uuid": "",
"value": ""
}
new_template["uuid"] = str(uuid.uuid4())
new_template["meta"]["refs"].append(uni["url"])
for intro in uni["intro"]:
new_template["description"] += intro["clause"]
new_template["value"] = uni["name"] + f" ({uni['_name']})"
new_template["meta"]["risk"] = uni["risk statement"]
for alias in uni["aliases"]:
new_template["meta"]["alias"].append(alias)
for agency in uni["supervising agencies"]:
new_template["meta"]["supervising agency"].append(agency)
if uni.get("subsidiaries"):
for subsidiary in uni["subsidiaries"]:
new_template["meta"]["subsidiary"].append(subsidiary)
if uni.get("topics"):
for topic in uni["topics"]:
new_template["meta"]["topics"].append(topic)
for category in uni["categories"]:
new_template["meta"]["category"].append(category)
if uni.get("location"):
new_template["meta"]["address"] = uni["location"][0]["name"]
new_template["meta"]["lat"] = uni["location"][0]["lat"]
new_template["meta"]["long"] = uni["location"][0]["long"]
base["values"].append(new_template)
return base
def main():
    """Scrape the ASPI tracker index, fetch every article, and write the
    galaxy to china-defence-universities.json in the current directory."""
    url = "https://unitracker.aspi.org.au"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    table = soup.find_all('table')[0]  # Grab the first table
    articles = []
    # Skip the header row with a slice instead of the original's
    # "head = 'bloop'" sentinel hack; every remaining row's first cell
    # links to an article page.
    for row in table.find_all('tr')[1:]:
        colOne = row.find_all('td')[0].find_all('a')[0]['href']
        article = _fetchArticle(url + colOne)
        print("Processing: {}".format(url + colOne))
        articles.append(article)
    galaxy = _gen_galaxy(articles)
    with open("china-defence-universities.json", "w") as g:
        g.write(json.dumps(galaxy, indent=4, sort_keys=True))


if __name__ == "__main__":
    main()