From a933c6f172b3678bcb2b3c5d4e5e7c33887e1774 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Tue, 7 Sep 2021 07:29:38 +0200 Subject: [PATCH] new: [import] first version of the import - Parse the CPE 2.3 XML file - Extract vendor and product - Canonize words from the vendor and product - Insert in redis the reverse index and create a sorted set with the score per cpe vendor:product frequency --- README.md | 39 +++++++++++++++++++++++++++ bin/import.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++ data/dump.sh | 4 +++ 3 files changed, 118 insertions(+) create mode 100644 README.md create mode 100644 bin/import.py create mode 100644 data/dump.sh diff --git a/README.md b/README.md new file mode 100644 index 0000000..27153dd --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# CPE guesser + +CPE guesser is a web service to guess the CPE name based on one or more keyword(s). The result can then +be used against [cve-search](https://github.com/cve-search/cve-search) to do actual searches by CPE names. + +## Requirements + +- Redis +- Python + +## How does this work? + +A CPE entry is composed of a human-readable name with some references and the structured CPE name. + +~~~ +<cpe-item name="cpe:/a:10web:form_maker:1.7.17::~~~wordpress~~"> + <title xml:lang="en-US">10web Form Maker 1.7.17 for WordPress</title> + <references> + <reference href="https://wordpress.org/plugins/form-maker/#developers">Change Log</reference> + </references> + <cpe-23:cpe23-item name="cpe:2.3:a:10web:form_maker:1.7.17:*:*:*:*:wordpress:*:*"/> +</cpe-item> +~~~ + +The CPE name is structured with a vendor name, a product name and some additional information. +A CPE name can easily change when a vendor or product is renamed; some vendors/products +share common names, and some names are composed of multiple words. + + +### Data + +The vendor name and product name are split on separators (such as `_`) into single words, and each word is then canonized. An inverse index is built using +the canonized word as key and the cpe vendor:product format as value. CPE guesser then creates a ranked set with the most common +cpe (vendor:product) per version to give a probability of the CPE appearance. 
# CPE dictionary importer (bin/import.py).
#
# Parses the CPE 2.3 XML dictionary, extracts the vendor and product of every
# CPE name, canonizes them into words and stores a reverse index in Redis.
#
# Redis structure:
# - `w:<word>` set of the CPE lines (first five fields of the CPE 2.3 name,
#   e.g. `cpe:2.3:a:10web:form_maker`) whose vendor or product contains the word
# - `s:<word>` sorted set of the same CPE lines with a score depending on the
#   number of appearances

import xml.sax

# Shared Redis client. It is created on first use (see _get_redis) so that the
# pure helpers below can be imported without the redis package being installed.
rdb = None


def _get_redis():
    """Return the shared Redis client, creating it on the first call."""
    global rdb
    if rdb is None:
        import redis
        rdb = redis.Redis(host='127.0.0.1', port=6379, db=8)
    return rdb


class CPEHandler(xml.sax.ContentHandler):
    """SAX content handler that indexes every ``cpe-item`` element.

    While an item is being parsed, its CPE 2.3 name, title and reference
    links are collected in ``self.record``. When the closing ``cpe-item``
    tag is reached, the vendor and product words of the CPE 2.3 name are
    inserted into Redis and the record is reset.
    """

    def __init__(self):
        super().__init__()
        self.cpe = ""
        self.CurrentData = ""     # last opened tag; not read in this file
        self.title = ""           # text accumulated for the current <title>
        self.title_seen = False   # True while inside a <title> element
        self.record = {}          # data collected for the current cpe-item
        self.refs = []            # href values of the current <references>

    def startElement(self, tag, attributes):
        """Record the attributes of interest of an opening tag."""
        self.CurrentData = tag
        if tag == 'cpe-23:cpe23-item':
            self.record['cpe-23'] = attributes['name']
        elif tag == 'title':
            self.title_seen = True
        elif tag == 'reference':
            self.refs.append(attributes['href'])

    def characters(self, data):
        """Accumulate text, which may arrive in several chunks, for a title."""
        if self.title_seen:
            self.title = self.title + data

    def endElement(self, tag):
        """Finalize the element being closed; index the item on ``cpe-item``."""
        if tag == 'title':
            self.record['title'] = self.title
            self.title = ""
            self.title_seen = False
        elif tag == 'references':
            self.record['refs'] = self.refs
            self.refs = []
        elif tag == 'cpe-item':
            cpe = self.record.get('cpe-23')
            # Always start the next item from a clean record.
            self.record = {}
            to_insert = CPEExtractor(cpe=cpe)
            if not to_insert:
                # Item without a usable CPE 2.3 name: nothing to index.
                return
            for word in canonize(to_insert['vendor']):
                insert(word=word, cpe=to_insert['cpeline'])
            for word in canonize(to_insert['product']):
                insert(word=word, cpe=to_insert['cpeline'])


def CPEExtractor(cpe=None):
    """Extract the vendor, product and CPE line from a CPE 2.3 name.

    Args:
        cpe: CPE 2.3 formatted name such as
            ``cpe:2.3:a:10web:form_maker:1.7.17:*:*:*:*:wordpress:*:*``.

    Returns:
        A dict with the keys ``vendor`` (4th field), ``product`` (5th field)
        and ``cpeline`` (the first five fields joined by ``:``), or ``False``
        when ``cpe`` is ``None`` or has fewer than five fields.
    """
    if cpe is None:
        return False
    # NOTE(review): a plain split does not honour escaped colons (``\:``)
    # inside a field - confirm whether the dictionary contains such names.
    cpefield = cpe.split(":")
    if len(cpefield) < 5:
        return False
    return {
        'vendor': cpefield[3],
        'product': cpefield[4],
        'cpeline': ":".join(cpefield[:5]),
    }


def canonize(value=None):
    """Split a vendor or product name into lower-cased words.

    Words are separated by ``_``. Empty words (produced by a leading,
    trailing or doubled ``_``) are dropped since they cannot be searched.

    Returns:
        The list of words; an empty list when ``value`` is ``None`` or empty.
    """
    if not value:
        return []
    return [word for word in value.lower().split('_') if word]


def insert(word=None, cpe=None):
    """Index ``cpe`` under ``word`` in Redis.

    The CPE line is added to the set ``w:<word>`` and its score in the sorted
    set ``s:<word>`` is incremented by one.

    Returns:
        ``False`` when ``word`` or ``cpe`` is ``None`` (nothing is written),
        ``True`` once both Redis commands have been issued.
    """
    if cpe is None or word is None:
        return False
    client = _get_redis()
    client.sadd('w:{}'.format(word), cpe)
    client.zadd('s:{}'.format(word), {cpe: 1}, incr=True)
    return True


# Location of the uncompressed dictionary, relative to the working directory.
cpe_path = '../data/official-cpe-dictionary_v2.3.xml'
# The dictionary read from ``cpe_path`` is fetched by data/dump.sh:
#   wget https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz
#   gzip -d official-cpe-dictionary_v2.3.xml.gz


def main():
    """Parse the CPE dictionary at ``cpe_path`` with a ``CPEHandler``."""
    parser = xml.sax.make_parser()
    handler = CPEHandler()
    parser.setContentHandler(handler)
    # Open the file explicitly: a missing dictionary then raises a clear
    # FileNotFoundError instead of the path being handed to the parser's own
    # source resolution, and the handle is closed when parsing ends.
    with open(cpe_path, 'rb') as cpe_file:
        parser.parse(cpe_file)


# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()