new: [dev] add ASPI's China Defence Universities Tracker.

Thanks to Cormac Doherty for writing the web scraper! To update the galaxy run the included gen_defence_university.py script. "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre. It includes entries on nearly 100 civilian universities, 50 People’s Liberation Army institutions, China’s nuclear weapons program, three Ministry of State Security institutions, four Ministry of Public Security universities, and 12 state-owned defence industry conglomerates. The Tracker is a tool to inform universities, governments and scholars as they engage with the entities from the People’s Republic of China. It aims to build understanding of the expansion of military-civil fusion—the Chinese government’s policy of integrating military and civilian efforts—into the education sector. The Tracker should be used to inform due diligence of Chinese institutions. However, the fact that an institution is not included here does not indicate that it should not raise risks or is not involved in defence research. Similarly, entries in the database may not reflect the full range and nature of an institution’s defence and security links." - ASPI (https://unitracker.aspi.org.au/about/)
2024-11-26 16:57:18 +00:00 · 2020-08-21 11:24:22 -04:00 · 2020-08-21 11:24:22 -04:00 · b4c3ffc8eb
commit b4c3ffc8eb
parent 6d546eb025
3 changed files with 5924 additions and 0 deletions
--- a/clusters/china-defence-universities.json
+++ b/clusters/china-defence-universities.json
--- a/galaxies/china-defence-universities.json
+++ b/galaxies/china-defence-universities.json
@ -0,0 +1,9 @@
+{
+  "description": "China Defence Universities",
+  "icon": "globe",
+  "name": "China Defence Universities",
+  "namespace": "misp",
+  "type": "china-defence-universities",
+  "uuid": "c51c59e9-f213-4ad4-9913-09a43d78dff5",
+  "version": 1
+}
--- a/tools/gen_defence_university.py
+++ b/tools/gen_defence_university.py
@ -0,0 +1,284 @@
+#!/usr/bin/python3
+import requests
+import json
+from bs4 import BeautifulSoup
+import bs4
+import uuid
+
+# This tool is part of the MISP core project and released under the GNU Affero
+# General Public License v3.0
+#
+# Copyright (C) 2020 Cormac Doherty
+# Copyright (C) 2020 Roger Johnston
+#
+#
+# version 0.1 - initial
+# version 0.2 - fixed typo ( _curRef NOT curRef)
+
+def _buildArticleSection(nxtSibling):
+    _sectionParagraphs = []
+    _nxtsib = nxtSibling
+
+    # Headings and their content are at the same hierarchical
+    # level in the html - just a sequence. This loop is bounded on
+    # the next element being a <p>
+    while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
+        # Almost every sentence, if not clause, in parapgraph
+        # text is referenced/cited/footnoted.
+        #
+        # The following iterates through the sequence of 'tokens'
+        # in the current <p>, building 'statements' composed of a
+        # statement and a reference.
+        #
+        # so-called "clauses" and "references" are accumulated over
+        # loop iterations i.e. a clause is appended to previous clauses
+        # if a reference has yet to be accumulated. (implicitly -
+        # references come after statements.)
+        #
+        # Once a 'clause' AND a 'statement' are accumulated, an encapsulating
+        # 'statement' is appended to the section's list of paragraphs and
+        # are reset.
+        #
+        _curClause = None
+        _curRef = None
+
+        for token in _nxtsib.contents:
+            # References (links) are interleved within text blocks as <spans>.
+            # The following control structure parses 'the next token' as
+            #    - <spans> containing a link
+            #    - disposable 'junk' if its <em>phasised and contains "Last update"
+            #    - as relevant paragraph text to be accumulated.
+            if (token.name == 'span'):
+                _anchors = token.find_all('a', recursive=True)
+                _anch = None
+                if (len(_anchors) != 0):
+                    _anch = _anchors[0]
+
+                if (_anch is not None):
+                    _curRef = _anch['href']
+                else:
+                    _curRef = None
+            elif ((token.name != 'em') or (not ("Last updated" in token.text))):  # ignore the "last updated footer
+                if (_curClause is not None):
+                    if (isinstance(token, bs4.element.NavigableString)):
+                        _curClause = _curClause + token
+                    else:
+                        _curClause = _curClause + token.text
+                else:
+                    # anomalous html handling
+                    #  - <strong> and
+                    #  - (useless) <a> tags
+                    # appear in a few places
+                    if ((token.name != 'strong') and
+                            (token.name != 'em') and
+                            (token.name != 'br') and
+                            (token.name != 'sup') and
+                            (token.name != 'a')):
+                        _curClause = token  # this quashes them
+
+            # Once a 'clause' AND a 'statement' are accumulated, an encapsulating
+            # 'statement' is appended to the section's list of paragraphs and
+            # are reset.
+            if ((_curRef is not None) and (_curClause is not None)):
+                statement = {}
+                statement["clause"] = _curClause
+                statement["ref"] = _curRef
+                _sectionParagraphs.append(statement)
+                _curClause = None
+                _curRef = None
+
+        # If a sequence of 'clauses' have been accumulated without finding a reference
+        # create a reference-LESS statement.
+        if ((_curClause is not None) and (not "Last updated" in _curClause)):
+            statement = {}
+            statement["clause"] = _curClause
+            _sectionParagraphs.append(statement)
+
+        _nxtsib = _nxtsib.find_next_sibling()
+
+    return _sectionParagraphs
+
+
+def _buildListSection(listContent):
+    laboratories = []
+    for lab in listContent.find_all('li', recursive="False"):
+        _lab = {}
+        _lab['name'] = lab.contents[0].replace(u'\xa0', '')
+
+        ref = lab.find('a')
+        if (ref is not None):
+            _lab['ref'] = ref['href']
+        else:
+            _lab['ref'] = None
+
+        laboratories.append(_lab)
+
+    return laboratories
+
+
+def _fetchArticle(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html5lib')
+    _article = soup.body.find_all('article')[0]
+
+    article = {}
+    article['url'] = url
+    article['name'] = _article.h1.text.replace('\n', '').strip()
+    article['_name'] = _article.h2.contents[0]
+
+    _artbody = _article.find('div', {"class": "article__copy"})
+
+    # Risk Statement
+    article['risk statement'] = _artbody.find('p').text
+
+    article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())
+
+    # Article body
+    sections = []
+
+    for _heading in _artbody.findChildren('h2'):
+        _nxtSibling = _heading.find_next_sibling()
+
+        section = {}
+        section['title'] = _heading.text
+        if (_nxtSibling.name == 'ul'):
+            section['body'] = _buildListSection(_nxtSibling)
+        else:
+            section['body'] = _buildArticleSection(_nxtSibling)
+        sections.append(section)
+
+    article['sections'] = sections
+
+    #    # Logo
+    #    logo = _article.div[0].aside[0].find("div", {"class": "aside__logo"})
+
+    _panel = _article.find("div", {"class": "aside__groups cf"})
+    _paneldivs = _panel.find_all('div')
+
+    for _paneldiv in _panel.find_all('div'):
+        _title = _paneldiv.find('h3').text
+        _items = []
+        for _item in _paneldiv.find_all('li'):
+            _anch = _item.find('a')
+            if (_anch is not None):
+                if ("Location" in _title):  # locations
+                    _loc = {}
+                    _loc['name'] = _anch.contents[0].replace('\n', '').strip()
+                    _loc['ref'] = _anch['href']
+                    _latlong = _anch['href'].split("=")[1]
+                    _loc['lat'] = _latlong.split(",")[0]
+                    _loc['long'] = _latlong.split(",")[1]
+                    _items.append(_loc)
+                else:
+                    _items.append(_anch.text)
+            else:
+                _items.append(_item.text.replace('\n', '').strip())
+        article[_title.lower()] = _items
+
+    return article
+
+
+def _gen_galaxy(scrape):
+    base = {
+        "authors": [
+            "Australian Strategic Policy Institute"
+        ],
+        "category": "academic-institution",
+        "description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre.",
+        "name": "China Defence Universities Tracker",
+        "source": "ASPI International Cyber Policy Centre",
+        "type": "china-defence-universities",
+        "uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
+        "values": [
+        ],
+        "version": 1
+    }
+
+    for uni in scrape:
+        new_template = template = {
+            # university description
+            "description": "",
+            "meta": {
+                "supervising agency": [],
+                "subsidiary": [],
+                "category": [],
+                "topics": [],
+                # Defence labs
+                "Major Defence Laboratories": [],
+                # Defence labs
+                "alias": [],
+                "risk": "",
+                "address": "",
+                "lat": "",
+                "long": "",
+                # External link to ASPI
+                "refs": []
+            },
+            "uuid": "",
+            "value": ""
+        }
+
+        new_template["uuid"] = str(uuid.uuid4())
+
+        new_template["meta"]["refs"].append(uni["url"])
+
+        for intro in uni["intro"]:
+            new_template["description"] += intro["clause"]
+
+        new_template["value"] = uni["name"] + f" ({uni['_name']})"
+
+        new_template["meta"]["risk"] = uni["risk statement"]
+
+        for alias in uni["aliases"]:
+            new_template["meta"]["alias"].append(alias)
+
+        for agency in uni["supervising agencies"]:
+            new_template["meta"]["supervising agency"].append(agency)
+
+        if uni.get("subsidiaries"):
+            for subsidiary in uni["subsidiaries"]:
+                new_template["meta"]["subsidiary"].append(subsidiary)
+
+        if uni.get("topics"):
+            for topic in uni["topics"]:
+                new_template["meta"]["topics"].append(topic)
+
+        for category in uni["categories"]:
+            new_template["meta"]["category"].append(category)
+
+        if uni.get("location"):
+            new_template["meta"]["address"] = uni["location"][0]["name"]
+            new_template["meta"]["lat"] = uni["location"][0]["lat"]
+            new_template["meta"]["long"] = uni["location"][0]["long"]
+
+        base["values"].append(new_template)
+
+    return base
+
+
+def main():
+    url = "https://unitracker.aspi.org.au"
+    response = requests.get(url)
+
+    soup = BeautifulSoup(response.content, 'html5lib')
+
+    table = soup.find_all('table')[0]  # Grab the first table
+    head = None
+    articles = []
+    for row in table.find_all('tr'):
+        if head is not None:
+            colOne = row.find_all('td')[0].find_all('a')[0]['href']
+            article = _fetchArticle(url + colOne)
+            print("Processing: {}".format(url + colOne))
+            articles.append(article)
+        else:
+            head = "bloop"
+
+    galaxy = _gen_galaxy(articles)
+
+    with open("china-defence-universities.json", "w") as g:
+        g.write(json.dumps(galaxy, indent=4, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()