#!/usr/bin/python3
import requests
import json
from bs4 import BeautifulSoup
import bs4
import uuid

# This tool is part of the MISP core project and released under the GNU Affero
# General Public License v3.0
#
# Copyright (C) 2020 Cormac Doherty
# Copyright (C) 2020 Roger Johnston
#
#
# version 0.1 - initial
# version 0.2 - fixed typo ( _curRef NOT curRef)


def _buildArticleSection(nxtSibling):
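    """Walk the run of sibling <p> tags starting at nxtSibling and return a
    list of 'statement' dicts, each holding a text 'clause' and, where one
    was found, the citing 'ref' URL."""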
_sectionParagraphs = []
_nxtsib = nxtSibling
    # Headings and their content are at the same hierarchical
    # level in the html - just a sequence. This loop is bounded
    # on the next sibling being a <p> tag.
while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
        # Almost every sentence, if not clause, in paragraph
        # text is referenced/cited/footnoted.
        #
        # The following iterates through the sequence of 'tokens'
        # in the current <p>, building 'statements' composed of a
        # clause and a reference.
        #
        # So-called 'clauses' and 'references' are accumulated over
        # loop iterations, i.e. a clause is appended to previous clauses
        # if a reference has yet to be accumulated. (Implicitly,
        # references come after clauses.)
        #
        # Once a 'clause' AND a 'reference' are accumulated, an encapsulating
        # 'statement' is appended to the section's list of paragraphs and
        # both are reset.
        #
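        # For example (illustrative markup, not copied from the site), a
        # paragraph such as
        #   <p>Founded in 1953.<span><a href="https://example.org/src">1</a></span></p>
        # would yield the statement
        #   {"clause": "Founded in 1953.", "ref": "https://example.org/src"}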
_curClause = None
_curRef = None
for token in _nxtsib.contents:
            # References (links) are interleaved within text blocks as
            # <span>s. The following control structure parses 'the next
            # token' as
            # - a <span> containing a link
            # - disposable 'junk' if it is emphasised and contains "Last updated"
            # - otherwise, relevant paragraph text to be accumulated.
if (token.name == 'span'):
_anchors = token.find_all('a', recursive=True)
_anch = None
if (len(_anchors) != 0):
_anch = _anchors[0]
if (_anch is not None):
_curRef = _anch['href']
else:
_curRef = None
            elif ((token.name != 'em') or (not ("Last updated" in token.text))):  # ignore the "Last updated" footer
if (_curClause is not None):
if (isinstance(token, bs4.element.NavigableString)):
_curClause = _curClause + token
else:
_curClause = _curClause + token.text
else:
                    # anomalous html handling:
                    # <strong>, <em>, <br>, <sup> and stray <a> tags
                    # (useless for our purposes) appear in a few places
if ((token.name != 'strong') and
(token.name != 'em') and
(token.name != 'br') and
(token.name != 'sup') and
(token.name != 'a')):
                        _curClause = token  # the junk tags above are quashed; anything else starts a new clause
            # Once a 'clause' AND a 'reference' are accumulated, an encapsulating
            # 'statement' is appended to the section's list of paragraphs and
            # both are reset.
if ((_curRef is not None) and (_curClause is not None)):
statement = {}
statement["clause"] = _curClause
statement["ref"] = _curRef
_sectionParagraphs.append(statement)
_curClause = None
_curRef = None
        # If a sequence of 'clauses' has been accumulated without finding a
        # reference, create a reference-LESS statement.
        if ((_curClause is not None) and ("Last updated" not in str(_curClause))):
statement = {}
statement["clause"] = _curClause
_sectionParagraphs.append(statement)
_nxtsib = _nxtsib.find_next_sibling()
return _sectionParagraphs


def _buildListSection(listContent):
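    """Parse a <ul> of laboratories into a list of dicts, each carrying the
    lab's 'name' and, if the entry is linked, its 'ref' URL (else None)."""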
laboratories = []
    for lab in listContent.find_all('li', recursive=False):
_lab = {}
        _lab['name'] = lab.contents[0].replace(u'\xa0', '')  # strip non-breaking spaces
ref = lab.find('a')
if (ref is not None):
_lab['ref'] = ref['href']
else:
_lab['ref'] = None
laboratories.append(_lab)
return laboratories


def _fetchArticle(url):
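    """Fetch one tracker entry and scrape it into a dict: names, risk
    statement, intro, body sections, and the side-panel metadata such as
    aliases, supervising agencies and locations."""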
response = requests.get(url)
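    # html5lib parses like a browser and copes with imperfect markup better
    # than Python's built-in parser (assumed to be why it was chosen here).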
soup = BeautifulSoup(response.content, 'html5lib')
_article = soup.body.find_all('article')[0]
article = {}
article['url'] = url
article['name'] = _article.h1.text.replace('\n', '').strip()
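    # The h1 appears to hold the English name and the h2 the institution's
    # native-script name, later appended in parentheses by _gen_galaxy.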
article['_name'] = _article.h2.contents[0]
_artbody = _article.find('div', {"class": "article__copy"})
# Risk Statement
article['risk statement'] = _artbody.find('p').text
article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())
# Article body
sections = []
for _heading in _artbody.findChildren('h2'):
_nxtSibling = _heading.find_next_sibling()
section = {}
section['title'] = _heading.text
if (_nxtSibling.name == 'ul'):
section['body'] = _buildListSection(_nxtSibling)
else:
section['body'] = _buildArticleSection(_nxtSibling)
sections.append(section)
article['sections'] = sections
# # Logo
# logo = _article.div[0].aside[0].find("div", {"class": "aside__logo"})
_panel = _article.find("div", {"class": "aside__groups cf"})
    _paneldivs = _panel.find_all('div')
    for _paneldiv in _paneldivs:
_title = _paneldiv.find('h3').text
_items = []
for _item in _paneldiv.find_all('li'):
_anch = _item.find('a')
if (_anch is not None):
if ("Location" in _title): # locations
_loc = {}
_loc['name'] = _anch.contents[0].replace('\n', '').strip()
_loc['ref'] = _anch['href']
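                    # The location link is assumed to embed coordinates as a
                    # single "lat,long" query value, e.g. a maps URL ending
                    # in "=39.98,116.31" (hypothetical values).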
_latlong = _anch['href'].split("=")[1]
_loc['lat'] = _latlong.split(",")[0]
_loc['long'] = _latlong.split(",")[1]
_items.append(_loc)
else:
_items.append(_anch.text)
else:
_items.append(_item.text.replace('\n', '').strip())
article[_title.lower()] = _items
return article


def _gen_galaxy(scrape):
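    """Shape the scraped articles into a MISP galaxy cluster dict, one value
    entry (with a freshly generated UUID) per institution."""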
base = {
"authors": [
"Australian Strategic Policy Institute"
],
"category": "academic-institution",
"description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre.",
"name": "China Defence Universities Tracker",
"source": "ASPI International Cyber Policy Centre",
"type": "china-defence-universities",
"uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
"values": [
],
"version": 1
}
for uni in scrape:
        new_template = {
# university description
"description": "",
"meta": {
"supervising agency": [],
"subsidiary": [],
"category": [],
"topics": [],
# Defence labs
"Major Defence Laboratories": [],
                "alias": [],
"risk": "",
"address": "",
"lat": "",
"long": "",
# External link to ASPI
"refs": []
},
"uuid": "",
"value": ""
}
new_template["uuid"] = str(uuid.uuid4())
new_template["meta"]["refs"].append(uni["url"])
for intro in uni["intro"]:
new_template["description"] += intro["clause"]
new_template["value"] = uni["name"] + f" ({uni['_name']})"
new_template["meta"]["risk"] = uni["risk statement"]
for alias in uni["aliases"]:
new_template["meta"]["alias"].append(alias)
for agency in uni["supervising agencies"]:
new_template["meta"]["supervising agency"].append(agency)
if uni.get("subsidiaries"):
for subsidiary in uni["subsidiaries"]:
new_template["meta"]["subsidiary"].append(subsidiary)
if uni.get("topics"):
for topic in uni["topics"]:
new_template["meta"]["topics"].append(topic)
for category in uni["categories"]:
new_template["meta"]["category"].append(category)
if uni.get("location"):
new_template["meta"]["address"] = uni["location"][0]["name"]
new_template["meta"]["lat"] = uni["location"][0]["lat"]
new_template["meta"]["long"] = uni["location"][0]["long"]
base["values"].append(new_template)
return base


def main():
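    """Scrape the tracker index, fetch every listed entry and write the
    resulting galaxy to china-defence-universities.json."""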
url = "https://unitracker.aspi.org.au"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html5lib')
table = soup.find_all('table')[0] # Grab the first table
    articles = []
    for row in table.find_all('tr')[1:]:  # skip the header row
        colOne = row.find_all('td')[0].find_all('a')[0]['href']
        print("Processing: {}".format(url + colOne))
        article = _fetchArticle(url + colOne)
        articles.append(article)
galaxy = _gen_galaxy(articles)
with open("china-defence-universities.json", "w") as g:
g.write(json.dumps(galaxy, indent=4, sort_keys=True))
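    # The output follows the misp-galaxy cluster layout and could be proposed
    # upstream (e.g. under clusters/ in the misp-galaxy repository).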


if __name__ == "__main__":
main()