misp-galaxy/tools/gen_defence_university.py

295 lines
10 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
import requests
import json
from bs4 import BeautifulSoup
import bs4
import uuid
# This tool is part of the MISP core project and released under the GNU Affero
# General Public License v3.0
#
# Copyright (C) 2020 Cormac Doherty
# Copyright (C) 2020 Roger Johnston
#
#
# version 0.1 - initial
# version 0.2 - fixed typo ( _curRef NOT curRef)
def _buildArticleSection(nxtSibling):
_sectionParagraphs = []
_nxtsib = nxtSibling
# Headings and their content are at the same hierarchical
# level in the html - just a sequence. This loop is bounded on
# the next element being a <p>
while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
# Almost every sentence, if not clause, in parapgraph
# text is referenced/cited/footnoted.
#
# The following iterates through the sequence of 'tokens'
# in the current <p>, building 'statements' composed of a
# statement and a reference.
#
# so-called "clauses" and "references" are accumulated over
# loop iterations i.e. a clause is appended to previous clauses
# if a reference has yet to be accumulated. (implicitly -
# references come after statements.)
#
# Once a 'clause' AND a 'statement' are accumulated, an encapsulating
# 'statement' is appended to the section's list of paragraphs and
# are reset.
#
_curClause = None
_curRef = None
for token in _nxtsib.contents:
# References (links) are interleved within text blocks as <spans>.
# The following control structure parses 'the next token' as
# - <spans> containing a link
# - disposable 'junk' if its <em>phasised and contains "Last update"
# - as relevant paragraph text to be accumulated.
if (token.name == 'span'):
_anchors = token.find_all('a', recursive=True)
_anch = None
if (len(_anchors) != 0):
_anch = _anchors[0]
if (_anch is not None):
_curRef = _anch['href']
else:
_curRef = None
elif ((token.name != 'em') or (not ("Last updated" in token.text))): # ignore the "last updated footer
if (_curClause is not None):
if (isinstance(token, bs4.element.NavigableString)):
_curClause = _curClause + token
else:
_curClause = _curClause + token.text
else:
# anomalous html handling
# - <strong> and
# - (useless) <a> tags
# appear in a few places
if ((token.name != 'strong') and
(token.name != 'em') and
(token.name != 'br') and
(token.name != 'sup') and
(token.name != 'a')):
_curClause = token # this quashes them
# Once a 'clause' AND a 'statement' are accumulated, an encapsulating
# 'statement' is appended to the section's list of paragraphs and
# are reset.
if ((_curRef is not None) and (_curClause is not None)):
statement = {}
statement["clause"] = _curClause
statement["ref"] = _curRef
_sectionParagraphs.append(statement)
_curClause = None
_curRef = None
# If a sequence of 'clauses' have been accumulated without finding a reference
# create a reference-LESS statement.
if ((_curClause is not None) and (not "Last updated" in _curClause)):
statement = {}
statement["clause"] = _curClause
_sectionParagraphs.append(statement)
_nxtsib = _nxtsib.find_next_sibling()
return _sectionParagraphs
def _buildListSection(listContent):
laboratories = []
for lab in listContent.find_all('li', recursive="False"):
_lab = {}
_lab['name'] = lab.contents[0].replace(u'\xa0', '')
ref = lab.find('a')
if (ref is not None):
_lab['ref'] = ref['href']
else:
_lab['ref'] = None
laboratories.append(_lab)
return laboratories
def _fetchArticle(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html5lib')
_article = soup.body.find_all('article')[0]
article = {}
article['url'] = url
article['name'] = _article.h1.text.replace('\n', '').strip()
article['_name'] = _article.h2.contents[0]
_artbody = _article.find('div', {"class": "article__copy"})
# Risk Statement
article['risk statement'] = _artbody.find('p').text
article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())
# Article body
sections = []
for _heading in _artbody.findChildren('h2'):
_nxtSibling = _heading.find_next_sibling()
section = {}
section['title'] = _heading.text
if (_nxtSibling.name == 'ul'):
section['body'] = _buildListSection(_nxtSibling)
else:
section['body'] = _buildArticleSection(_nxtSibling)
sections.append(section)
article['sections'] = sections
# # Logo
# logo = _article.div[0].aside[0].find("div", {"class": "aside__logo"})
_panel = _article.find("div", {"class": "aside__groups cf"})
_paneldivs = _panel.find_all('div')
for _paneldiv in _panel.find_all('div'):
_title = _paneldiv.find('h3').text
_items = []
for _item in _paneldiv.find_all('li'):
_anch = _item.find('a')
if (_anch is not None):
if ("Location" in _title): # locations
_loc = {}
_loc['name'] = _anch.contents[0].replace('\n', '').strip()
_loc['ref'] = _anch['href']
_latlong = _anch['href'].split("=")[1]
_loc['lat'] = _latlong.split(",")[0]
_loc['long'] = _latlong.split(",")[1]
_items.append(_loc)
else:
_items.append(_anch.text)
else:
_items.append(_item.text.replace('\n', '').strip())
article[_title.lower()] = _items
return article
def _gen_galaxy(scrape):
base = {
"authors": [
"Australian Strategic Policy Institute"
],
"category": "academic-institution",
"description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPIs International Cyber Policy Centre.",
"name": "China Defence Universities Tracker",
"source": "ASPI International Cyber Policy Centre",
"type": "china-defence-universities",
"uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
"values": [
],
"version": 1
}
for uni in scrape:
new_template = template = {
"description": "",
"meta": {
"refs": []
},
"uuid": "",
"value": ""
}
new_template["uuid"] = str(uuid.uuid4())
new_template["meta"]["refs"].append(uni["url"])
new_template["value"] = uni["name"] + f" ({uni['_name']})"
def _append_meta(key, meta):
if uni.get(meta):
values = []
for value in uni[meta]:
if value != "":
values.append(value)
if values:
new_template["meta"][key] = values
if uni.get("intro"):
for intro in uni["intro"]:
new_template["description"] += intro["clause"]
if new_template["description"] == "":
new_template["description"] += uni["name"] + f" ({uni['_name']})"
else:
new_template["description"] += uni["name"] + f" ({uni['_name']})"
if uni.get("risk"):
if uni.get("risk") != "":
new_template["meta"]["risk"] = uni["risk statement"]
_append_meta("aliases", "aliases")
_append_meta("supervising agencies", "supervising agencies")
_append_meta("subsidiaries", "subsidiaries")
_append_meta("topics", "topics")
_append_meta("categories", "categories")
if uni.get("sections"):
labs = []
for section in uni["sections"]:
if section["title"] == "Major defence laboratories":
for lab in section["body"]:
if lab.get("name"):
if lab["name"] != "":
labs.append(lab["name"])
if labs:
new_template["meta"]["major defence laboratories"] = labs
if uni.get("location"):
if uni.get(uni["location"][0]["name"]) != "":
new_template["meta"]["address"] = uni["location"][0]["name"]
if uni.get(uni["location"][0]["lat"]) != "":
new_template["meta"]["lat"] = uni["location"][0]["lat"]
if uni.get(uni["location"][0]["long"]) != "":
new_template["meta"]["long"] = uni["location"][0]["long"]
base["values"].append(new_template)
return base
def main():
url = "https://unitracker.aspi.org.au"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html5lib')
table = soup.find_all('table')[0] # Grab the first table
head = None
articles = []
for row in table.find_all('tr'):
if head is not None:
colOne = row.find_all('td')[0].find_all('a')[0]['href']
article = _fetchArticle(url + colOne)
print("Processing: {}".format(url + colOne))
articles.append(article)
else:
head = "bloop"
galaxy = _gen_galaxy(articles)
print(galaxy)
with open("china-defence-universities.json", "w") as g:
g.write(json.dumps(galaxy, indent=4, sort_keys=True))
if __name__ == "__main__":
main()