mirror of https://github.com/MISP/misp-galaxy.git
synced 2024-11-26 08:47:18 +00:00

Merge pull request #574 from VVX7/main
new: [dev] add ASPI's China Defence University Tracker.

commit a0fd1a2bc5
3 changed files with 5814 additions and 0 deletions
5510  clusters/china-defence-universities.json  Normal file
File diff suppressed because it is too large
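The suppressed cluster file holds one entry per tracked institution, generated by tools/gen_defence_university.py (added below). For orientation only, each entry follows the value template built in _gen_galaxy; the sketch below uses placeholder data, not a real tracker record.

# Illustrative sketch only: the approximate shape of one entry in
# clusters/china-defence-universities.json, mirroring the new_template
# dict built in tools/gen_defence_university.py. All values are placeholders.
example_entry = {
    "description": "Intro text scraped from the institution's tracker article.",
    "meta": {
        "refs": ["https://unitracker.aspi.org.au/example-university/"],
        "aliases": ["Example Defence University"],
        "risk": "Risk statement text from the article.",
        "address": "Placeholder address",
        "lat": "0.0",
        "long": "0.0"
    },
    "uuid": "00000000-0000-0000-0000-000000000000",
    "value": "Example Defence University (placeholder)"
}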
9  galaxies/china-defence-universities.json  Normal file
@@ -0,0 +1,9 @@
{
  "description": "China Defence Universities",
  "icon": "globe",
  "name": "China Defence Universities",
  "namespace": "misp",
  "type": "china-defence-universities",
  "uuid": "c51c59e9-f213-4ad4-9913-09a43d78dff5",
  "version": 1
}
295  tools/gen_defence_university.py  Normal file
@@ -0,0 +1,295 @@
#!/usr/bin/python3

import requests
import json
from bs4 import BeautifulSoup
import bs4
import uuid

# This tool is part of the MISP core project and released under the GNU Affero
# General Public License v3.0
#
# Copyright (C) 2020 Cormac Doherty
# Copyright (C) 2020 Roger Johnston
#
#
# version 0.1 - initial
# version 0.2 - fixed typo ( _curRef NOT curRef)

def _buildArticleSection(nxtSibling):
    _sectionParagraphs = []
    _nxtsib = nxtSibling

    # Headings and their content are at the same hierarchical
    # level in the html - just a sequence. This loop is bounded on
    # the next element being a <p>
    while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
        # Almost every sentence, if not clause, in paragraph
        # text is referenced/cited/footnoted.
        #
        # The following iterates through the sequence of 'tokens'
        # in the current <p>, building 'statements' composed of a
        # clause and a reference.
        #
        # So-called "clauses" and "references" are accumulated over
        # loop iterations, i.e. a clause is appended to previous clauses
        # if a reference has yet to be accumulated. (Implicitly,
        # references come after statements.)
        #
        # Once a 'clause' AND a 'reference' have been accumulated, an
        # encapsulating 'statement' is appended to the section's list of
        # paragraphs and both are reset.
        #
        _curClause = None
        _curRef = None

        for token in _nxtsib.contents:
            # References (links) are interleaved within text blocks as <span>s.
            # The following control structure parses 'the next token' as
            # - <span>s containing a link
            # - disposable 'junk' if it is <em>phasised and contains "Last updated"
            # - as relevant paragraph text to be accumulated.
            if (token.name == 'span'):
                _anchors = token.find_all('a', recursive=True)
                _anch = None
                if (len(_anchors) != 0):
                    _anch = _anchors[0]

                if (_anch is not None):
                    _curRef = _anch['href']
                else:
                    _curRef = None
            elif ((token.name != 'em') or (not ("Last updated" in token.text))):  # ignore the "Last updated" footer
                if (_curClause is not None):
                    if (isinstance(token, bs4.element.NavigableString)):
                        _curClause = _curClause + token
                    else:
                        _curClause = _curClause + token.text
                else:
                    # anomalous html handling
                    # - <strong> and
                    # - (useless) <a> tags
                    # appear in a few places
                    if ((token.name != 'strong') and
                        (token.name != 'em') and
                        (token.name != 'br') and
                        (token.name != 'sup') and
                        (token.name != 'a')):
                        _curClause = token  # this quashes them

            # Once a 'clause' AND a 'reference' have been accumulated, an
            # encapsulating 'statement' is appended to the section's list of
            # paragraphs and both are reset.
            if ((_curRef is not None) and (_curClause is not None)):
                statement = {}
                statement["clause"] = _curClause
                statement["ref"] = _curRef
                _sectionParagraphs.append(statement)
                _curClause = None
                _curRef = None

        # If a sequence of 'clauses' has been accumulated without finding a
        # reference, create a reference-less statement.
        if ((_curClause is not None) and (not "Last updated" in _curClause)):
            statement = {}
            statement["clause"] = _curClause
            _sectionParagraphs.append(statement)

        _nxtsib = _nxtsib.find_next_sibling()

    return _sectionParagraphs

def _buildListSection(listContent):
    laboratories = []
    for lab in listContent.find_all('li', recursive=False):  # direct <li> children only
        _lab = {}
        _lab['name'] = lab.contents[0].replace(u'\xa0', '')

        ref = lab.find('a')
        if (ref is not None):
            _lab['ref'] = ref['href']
        else:
            _lab['ref'] = None

        laboratories.append(_lab)

    return laboratories

def _fetchArticle(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    _article = soup.body.find_all('article')[0]

    article = {}
    article['url'] = url
    article['name'] = _article.h1.text.replace('\n', '').strip()
    article['_name'] = _article.h2.contents[0]

    _artbody = _article.find('div', {"class": "article__copy"})

    # Risk statement
    article['risk statement'] = _artbody.find('p').text

    article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())

    # Article body
    sections = []

    for _heading in _artbody.findChildren('h2'):
        _nxtSibling = _heading.find_next_sibling()

        section = {}
        section['title'] = _heading.text
        if (_nxtSibling.name == 'ul'):
            section['body'] = _buildListSection(_nxtSibling)
        else:
            section['body'] = _buildArticleSection(_nxtSibling)
        sections.append(section)

    article['sections'] = sections

    # # Logo
    # logo = _article.div[0].aside[0].find("div", {"class": "aside__logo"})

    _panel = _article.find("div", {"class": "aside__groups cf"})

    for _paneldiv in _panel.find_all('div'):
        _title = _paneldiv.find('h3').text
        _items = []
        for _item in _paneldiv.find_all('li'):
            _anch = _item.find('a')
            if (_anch is not None):
                if ("Location" in _title):  # locations
                    _loc = {}
                    _loc['name'] = _anch.contents[0].replace('\n', '').strip()
                    _loc['ref'] = _anch['href']
                    _latlong = _anch['href'].split("=")[1]
                    _loc['lat'] = _latlong.split(",")[0]
                    _loc['long'] = _latlong.split(",")[1]
                    _items.append(_loc)
                else:
                    _items.append(_anch.text)
            else:
                _items.append(_item.text.replace('\n', '').strip())
        article[_title.lower()] = _items

    return article

def _gen_galaxy(scrape):
    base = {
        "authors": [
            "Australian Strategic Policy Institute"
        ],
        "category": "academic-institution",
        "description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre.",
        "name": "China Defence Universities Tracker",
        "source": "ASPI International Cyber Policy Centre",
        "type": "china-defence-universities",
        "uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
        "values": [
        ],
        "version": 1
    }

    for uni in scrape:
        new_template = {
            "description": "",
            "meta": {
                "refs": []
            },
            "uuid": "",
            "value": ""
        }

        new_template["uuid"] = str(uuid.uuid4())

        new_template["meta"]["refs"].append(uni["url"])

        new_template["value"] = uni["name"] + f" ({uni['_name']})"

        def _append_meta(key, meta):
            if uni.get(meta):
                values = []
                for value in uni[meta]:
                    if value != "":
                        values.append(value)
                if values:
                    new_template["meta"][key] = values

        if uni.get("intro"):
            for intro in uni["intro"]:
                new_template["description"] += intro["clause"]
            if new_template["description"] == "":
                new_template["description"] += uni["name"] + f" ({uni['_name']})"
        else:
            new_template["description"] += uni["name"] + f" ({uni['_name']})"

        if uni.get("risk statement"):
            if uni["risk statement"] != "":
                new_template["meta"]["risk"] = uni["risk statement"]

        _append_meta("aliases", "aliases")

        _append_meta("supervising agencies", "supervising agencies")

        _append_meta("subsidiaries", "subsidiaries")

        _append_meta("topics", "topics")

        _append_meta("categories", "categories")

        if uni.get("sections"):
            labs = []
            for section in uni["sections"]:
                if section["title"] == "Major defence laboratories":
                    for lab in section["body"]:
                        if lab.get("name"):
                            if lab["name"] != "":
                                labs.append(lab["name"])
            if labs:
                new_template["meta"]["major defence laboratories"] = labs

        if uni.get("location"):
            if uni["location"][0]["name"] != "":
                new_template["meta"]["address"] = uni["location"][0]["name"]
            if uni["location"][0]["lat"] != "":
                new_template["meta"]["lat"] = uni["location"][0]["lat"]
            if uni["location"][0]["long"] != "":
                new_template["meta"]["long"] = uni["location"][0]["long"]

        base["values"].append(new_template)

    return base

def main():
    url = "https://unitracker.aspi.org.au"
    response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html5lib')

    table = soup.find_all('table')[0]  # Grab the first table
    head = None
    articles = []
    for row in table.find_all('tr'):
        if head is not None:
            colOne = row.find_all('td')[0].find_all('a')[0]['href']
            article = _fetchArticle(url + colOne)
            print("Processing: {}".format(url + colOne))
            articles.append(article)
        else:
            head = "bloop"  # skip the table header row

    galaxy = _gen_galaxy(articles)

    print(galaxy)

    with open("china-defence-universities.json", "w") as g:
        g.write(json.dumps(galaxy, indent=4, sort_keys=True))


if __name__ == "__main__":
    main()
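Running the script prints the assembled galaxy and writes china-defence-universities.json to the working directory. A minimal sanity check of that output, assuming the generator has already been run in the same directory, might look like:

# Minimal sanity check of the generated cluster file; assumes
# china-defence-universities.json has already been written by the tool above.
import json

with open("china-defence-universities.json") as f:
    galaxy = json.load(f)

print(galaxy["name"])         # "China Defence Universities Tracker"
print(len(galaxy["values"]))  # number of tracked institutions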