new: [dev] add ASPI's China Defence University Tracker.
Thanks to Cormac Doherty for writing the web scraper! To update the galaxy, run the included gen_defence_university.py script. "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre. It includes entries on nearly 100 civilian universities, 50 People’s Liberation Army institutions, China’s nuclear weapons program, three Ministry of State Security institutions, four Ministry of Public Security universities, and 12 state-owned defence industry conglomerates. The Tracker is a tool to inform universities, governments and scholars as they engage with the entities from the People’s Republic of China. It aims to build understanding of the expansion of military-civil fusion—the Chinese government’s policy of integrating military and civilian efforts—into the education sector. The Tracker should be used to inform due diligence of Chinese institutions. However, the fact that an institution is not included here does not indicate that it should not raise risks or is not involved in defence research. Similarly, entries in the database may not reflect the full range and nature of an institution’s defence and security links." - ASPI (https://unitracker.aspi.org.au/about/)
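
A minimal regeneration sketch (assumes requests, beautifulsoup4 and html5lib are installed, and that it is run from the repository root):

import shutil
import subprocess

# Scrape unitracker.aspi.org.au and write china-defence-universities.json
# into the working directory, then move it into clusters/.
subprocess.run(["python3", "tools/gen_defence_university.py"], check=True)
shutil.move("china-defence-universities.json", "clusters/china-defence-universities.json")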
parent 6d546eb025
commit b4c3ffc8eb
3 changed files with 5924 additions and 0 deletions
clusters/china-defence-universities.json (new file, 5631 lines)
File diff suppressed because it is too large.
galaxies/china-defence-universities.json (new file, 9 lines)
@@ -0,0 +1,9 @@
{
  "description": "China Defence Universities",
  "icon": "globe",
  "name": "China Defence Universities",
  "namespace": "misp",
  "type": "china-defence-universities",
  "uuid": "c51c59e9-f213-4ad4-9913-09a43d78dff5",
  "version": 1
}
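
The galaxy definition above carries only metadata; the entries themselves live in the cluster file. A minimal consistency check between the two (a sketch, not part of the commit):

import json

with open("galaxies/china-defence-universities.json") as f:
    galaxy = json.load(f)
with open("clusters/china-defence-universities.json") as f:
    cluster = json.load(f)

# The shared "type" field is what ties a galaxy definition to its cluster.
assert galaxy["type"] == cluster["type"] == "china-defence-universities"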
tools/gen_defence_university.py (new file, 284 lines)
@@ -0,0 +1,284 @@
#!/usr/bin/python3

import json
import uuid

import bs4
import requests
from bs4 import BeautifulSoup

# This tool is part of the MISP core project and released under the GNU Affero
# General Public License v3.0
#
# Copyright (C) 2020 Cormac Doherty
# Copyright (C) 2020 Roger Johnston
#
# version 0.1 - initial
# version 0.2 - fixed typo ( _curRef NOT curRef)

def _buildArticleSection(nxtSibling):
    _sectionParagraphs = []
    _nxtsib = nxtSibling

    # Headings and their content are at the same hierarchical
    # level in the html - just a sequence. This loop is bounded on
    # the next element being a <p>.
    while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
        # Almost every sentence, if not clause, in paragraph
        # text is referenced/cited/footnoted.
        #
        # The following iterates through the sequence of 'tokens'
        # in the current <p>, building 'statements' composed of a
        # clause and a reference.
        #
        # So-called "clauses" and "references" are accumulated over
        # loop iterations, i.e. a clause is appended to previous clauses
        # if a reference has yet to be accumulated (implicitly,
        # references come after the statements they support).
        #
        # Once a 'clause' AND a 'reference' have been accumulated, an
        # encapsulating 'statement' is appended to the section's list of
        # paragraphs and both accumulators are reset.
        _curClause = None
        _curRef = None

        for token in _nxtsib.contents:
            # References (links) are interleaved within text blocks as <span>s.
            # The following control structure parses 'the next token' as
            # - a <span> containing a link,
            # - disposable 'junk' if it is <em>phasised and contains "Last updated",
            # - or relevant paragraph text to be accumulated.
            if (token.name == 'span'):
                _anchors = token.find_all('a', recursive=True)
                _anch = None
                if (len(_anchors) != 0):
                    _anch = _anchors[0]

                if (_anch is not None):
                    _curRef = _anch['href']
                else:
                    _curRef = None
            elif ((token.name != 'em') or (not ("Last updated" in token.text))):  # ignore the "Last updated" footer
                if (_curClause is not None):
                    if (isinstance(token, bs4.element.NavigableString)):
                        _curClause = _curClause + token
                    else:
                        _curClause = _curClause + token.text
                else:
                    # anomalous html handling:
                    # - <strong> and
                    # - (useless) <a> tags
                    # appear in a few places
                    if ((token.name != 'strong') and
                            (token.name != 'em') and
                            (token.name != 'br') and
                            (token.name != 'sup') and
                            (token.name != 'a')):
                        _curClause = token  # this quashes them

            # Once a 'clause' AND a 'reference' have been accumulated, an
            # encapsulating 'statement' is appended to the section's list of
            # paragraphs and both accumulators are reset.
            if ((_curRef is not None) and (_curClause is not None)):
                statement = {}
                statement["clause"] = _curClause
                statement["ref"] = _curRef
                _sectionParagraphs.append(statement)
                _curClause = None
                _curRef = None

        # If a sequence of 'clauses' has been accumulated without finding a
        # reference, create a reference-less statement.
        if ((_curClause is not None) and (not "Last updated" in _curClause)):
            statement = {}
            statement["clause"] = _curClause
            _sectionParagraphs.append(statement)

        _nxtsib = _nxtsib.find_next_sibling()

    return _sectionParagraphs
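
# Illustrative shape of _buildArticleSection's return value (the example
# values are invented, not scraped):
#   [{"clause": "Founded in 1953, the university ...", "ref": "https://example.org/cite"},
#    {"clause": "A trailing clause that carried no citation."}]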


def _buildListSection(listContent):
    laboratories = []
    # recursive=False (a boolean, not the string "False") restricts the
    # search to the list's direct <li> children.
    for lab in listContent.find_all('li', recursive=False):
        _lab = {}
        _lab['name'] = lab.contents[0].replace(u'\xa0', '')

        ref = lab.find('a')
        if (ref is not None):
            _lab['ref'] = ref['href']
        else:
            _lab['ref'] = None

        laboratories.append(_lab)

    return laboratories
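
# Illustrative return shape (invented values):
#   [{"name": "State Key Laboratory of ...", "ref": "https://example.org/lab"},
#    {"name": "An unlinked laboratory", "ref": None}]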


def _fetchArticle(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    _article = soup.body.find_all('article')[0]

    article = {}
    article['url'] = url
    article['name'] = _article.h1.text.replace('\n', '').strip()
    article['_name'] = _article.h2.contents[0]

    _artbody = _article.find('div', {"class": "article__copy"})

    # Risk statement
    article['risk statement'] = _artbody.find('p').text

    article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())

    # Article body
    sections = []

    for _heading in _artbody.findChildren('h2'):
        _nxtSibling = _heading.find_next_sibling()

        section = {}
        section['title'] = _heading.text
        if (_nxtSibling.name == 'ul'):
            section['body'] = _buildListSection(_nxtSibling)
        else:
            section['body'] = _buildArticleSection(_nxtSibling)
        sections.append(section)

    article['sections'] = sections

    # # Logo
    # logo = _article.div[0].aside[0].find("div", {"class": "aside__logo"})

    # Side panel: each panel <div> becomes an article key named after its
    # heading (lowercased).
    _panel = _article.find("div", {"class": "aside__groups cf"})

    for _paneldiv in _panel.find_all('div'):
        _title = _paneldiv.find('h3').text
        _items = []
        for _item in _paneldiv.find_all('li'):
            _anch = _item.find('a')
            if (_anch is not None):
                if ("Location" in _title):  # locations carry a map link with lat/long in the URL
                    _loc = {}
                    _loc['name'] = _anch.contents[0].replace('\n', '').strip()
                    _loc['ref'] = _anch['href']
                    _latlong = _anch['href'].split("=")[1]
                    _loc['lat'] = _latlong.split(",")[0]
                    _loc['long'] = _latlong.split(",")[1]
                    _items.append(_loc)
                else:
                    _items.append(_anch.text)
            else:
                _items.append(_item.text.replace('\n', '').strip())
        article[_title.lower()] = _items

    return article
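
# The returned dict has the fixed keys "url", "name", "_name",
# "risk statement", "intro" and "sections"; the side-panel loop adds keys
# named after the panel headings (lowercased), e.g. "aliases",
# "supervising agencies", "subsidiaries", "topics", "categories" and
# "location" - these are the keys _gen_galaxy() consumes below.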


def _gen_galaxy(scrape):
    base = {
        "authors": [
            "Australian Strategic Policy Institute"
        ],
        "category": "academic-institution",
        "description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre.",
        "name": "China Defence Universities Tracker",
        "source": "ASPI International Cyber Policy Centre",
        "type": "china-defence-universities",
        "uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
        "values": [
        ],
        "version": 1
    }

    for uni in scrape:
        new_template = {
            # university description
            "description": "",
            "meta": {
                "supervising agency": [],
                "subsidiary": [],
                "category": [],
                "topics": [],
                # Defence labs
                "Major Defence Laboratories": [],
                # Aliases
                "alias": [],
                "risk": "",
                "address": "",
                "lat": "",
                "long": "",
                # External link to ASPI
                "refs": []
            },
            "uuid": "",
            "value": ""
        }

        new_template["uuid"] = str(uuid.uuid4())

        new_template["meta"]["refs"].append(uni["url"])

        for intro in uni["intro"]:
            new_template["description"] += intro["clause"]

        new_template["value"] = uni["name"] + f" ({uni['_name']})"

        new_template["meta"]["risk"] = uni["risk statement"]

        for alias in uni["aliases"]:
            new_template["meta"]["alias"].append(alias)

        for agency in uni["supervising agencies"]:
            new_template["meta"]["supervising agency"].append(agency)

        if uni.get("subsidiaries"):
            for subsidiary in uni["subsidiaries"]:
                new_template["meta"]["subsidiary"].append(subsidiary)

        if uni.get("topics"):
            for topic in uni["topics"]:
                new_template["meta"]["topics"].append(topic)

        for category in uni["categories"]:
            new_template["meta"]["category"].append(category)

        if uni.get("location"):
            new_template["meta"]["address"] = uni["location"][0]["name"]
            new_template["meta"]["lat"] = uni["location"][0]["lat"]
            new_template["meta"]["long"] = uni["location"][0]["long"]

        base["values"].append(new_template)

    return base
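
# Note: str(uuid.uuid4()) above gives every cluster value a fresh random UUID,
# so entry UUIDs are not stable across regenerations of the galaxy.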


def main():
    url = "https://unitracker.aspi.org.au"
    response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html5lib')

    table = soup.find_all('table')[0]  # Grab the first table
    head = None
    articles = []
    for row in table.find_all('tr'):
        if head is not None:
            colOne = row.find_all('td')[0].find_all('a')[0]['href']
            print("Processing: {}".format(url + colOne))
            article = _fetchArticle(url + colOne)
            articles.append(article)
        else:
            head = "bloop"  # any non-None sentinel works: the first row is the table header

    galaxy = _gen_galaxy(articles)

    with open("china-defence-universities.json", "w") as g:
        g.write(json.dumps(galaxy, indent=4, sort_keys=True))


if __name__ == "__main__":
    main()
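
A quick sanity check of the regenerated output (a sketch, not part of the commit; the field names are the ones the generator above emits):

import json

with open("china-defence-universities.json") as f:
    galaxy = json.load(f)

print("{} entries, version {}".format(len(galaxy["values"]), galaxy["version"]))
for entry in galaxy["values"][:3]:
    print(entry["value"], "->", entry["meta"]["refs"][0])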