From a933c6f172b3678bcb2b3c5d4e5e7c33887e1774 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy <a@foo.be>
Date: Tue, 7 Sep 2021 07:29:38 +0200
Subject: [PATCH] new: [import] first version of the import

- Parse the CPE 2.3 XML file
- Extract vendor and product
- Canonize words from the vendor and product
- Insert in redis the reverse index and create a sorted set with the
score per cpe vendor:product frequency
---
 README.md     | 39 +++++++++++++++++++++++++++
 bin/import.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++
 data/dump.sh  |  4 +++
 3 files changed, 118 insertions(+)
 create mode 100644 README.md
 create mode 100644 bin/import.py
 create mode 100644 data/dump.sh
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..27153dd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+# CPE guesser
+
+CPE guesser is a web service to guess the CPE name based on one or more keyword(s).  Then the result can
+be used against [cve-search](https://github.com/cve-search/cve-search) to do actual searches by CPE names.
+
+## Requirements
+
+- Redis
+- Python
+
+## How does this work?
+
+A CPE entry is composed of a human readable name with some references and the structured CPE name.
+
+~~~
+  <cpe-item name="cpe:/a:10web:form_maker:1.7.17::~~~wordpress~~">
+    <title xml:lang="en-US">10web Form Maker 1.7.17 for WordPress</title>
+    <references>
+      <reference href="https://wordpress.org/plugins/form-maker/#developers">Change Log</reference>
+    </references>
+    <cpe-23:cpe23-item name="cpe:2.3:a:10web:form_maker:1.7.17:*:*:*:*:wordpress:*:*"/>
+  </cpe-item>
+~~~
+
+The CPE name is structured with a vendor name, a product name and some additional information.
+A CPE name can easily change when a vendor name or product name changes; some vendors/products
+share common names, and some names are composed of multiple words.
+
+
+### Data
+
+Split the vendor name and product name on separators (such as `_`) into single word(s) and then canonize each word. Then build an inverse index using
+the cpe vendor:product format as value and the canonized word as key.  CPE guesser then creates a ranked set with the most common
+cpe (vendor:product) per version to give a probability of the CPE appearance.
+
+### Redis structure
+
+- `w:<word>` set
+- `s:<word>` sorted set with a score depending on the number of appearances
diff --git a/bin/import.py b/bin/import.py
new file mode 100644
index 0000000..4f15da0
--- /dev/null
+++ b/bin/import.py
@@ -0,0 +1,75 @@
+
+import xml.sax
+import redis
+rdb = redis.Redis(host='127.0.0.1', port=6379, db=8)  # module-level client used by insert(); Redis on localhost, database 8
+
+class CPEHandler( xml.sax.ContentHandler ):  # SAX handler: indexes each <cpe-item> into Redis through insert()
+    def __init__(self):
+        self.cpe = ""
+        self.title = ""  # text accumulated for the <title> currently being read
+        self.title_seen = False  # True between <title> and </title>
+        self.cpe = ""  # NOTE(review): duplicate of the assignment above; self.cpe is not read in this class
+        self.record  = {}  # fields collected for the current <cpe-item>
+        self.refs = []  # href values of <reference> elements seen since the last </references>
+
+    def startElement(self, tag, attributes):
+        self.CurrentData = tag  # updated on every element start; not read in this class
+        if tag == 'cpe-23:cpe23-item':
+            self.record['cpe-23'] = attributes['name']  # the CPE 2.3 name of the item
+        if tag == 'title':
+            self.title_seen = True
+        if tag == 'reference':
+            self.refs.append(attributes['href'])
+
+    def characters(self, data):
+        if self.title_seen:
+            self.title = self.title + data  # append: SAX may deliver one text node in several chunks
+      
+    def endElement(self, tag):
+        if tag == 'title':
+            self.record['title'] = self.title  # stored, but not used when the record is indexed below
+            self.title = ""
+            self.title_seen = False
+        if tag == 'references':
+            self.record['refs'] = self.refs  # stored, but not used when the record is indexed below
+            self.refs = []
+        if tag == 'cpe-item':  # end of one entry: index its vendor/product words, then reset the record
+            to_insert = CPEExtractor(cpe=self.record['cpe-23'])  # raises KeyError if the item had no cpe23-item
+            for word in canonize(to_insert['vendor']):
+                insert( word=word, cpe=to_insert['cpeline'] )
+            for word in canonize(to_insert['product']):
+                insert( word=word, cpe=to_insert['cpeline'] )
+            self.record = {}
+
+
+def CPEExtractor( cpe=None ):  # split a CPE 2.3 name into 'vendor', 'product' and a five-field 'cpeline' prefix
+    if cpe is None:
+        return False  # NOTE(review): the caller in endElement subscripts the result, so False would raise TypeError there
+    record = {}
+    cpefield = cpe.split(":")  # NOTE(review): a plain split also cuts at an escaped "\:" - confirm the input never contains one
+    record['vendor'] = cpefield[3]  # IndexError if the name has fewer than five ":"-separated fields
+    record['product'] = cpefield[4]
+    cpeline = ""
+    for cpeentry in cpefield[:5]:  # first five fields, e.g. cpe:2.3:a:10web:form_maker
+        cpeline = cpeline + ":" + cpeentry
+    record['cpeline'] = cpeline[1:]   # drop the leading ":" added by the first loop iteration
+    return record 
+
+def canonize( value=None ):  # lowercase value and split it on "_"; returns the list of words
+    value = value.lower()  # NOTE(review): raises AttributeError when called with the default None
+    words = value.split('_')  # a value without "_" yields a one-element list
+    return words
+
+def insert( word=None, cpe=None):  # register cpe under word in Redis; returns False if either is None, otherwise None
+    if cpe is None or word is None:
+        return False
+    rdb.sadd('w:{}'.format(word), cpe)  # set "w:<word>": add cpe as a member
+    rdb.zadd('s:{}'.format(word), {cpe: 1}, incr=True)  # sorted set "s:<word>": incr=True adds 1 to the score of cpe
+
+cpe_path = '../data/official-cpe-dictionary_v2.3.xml'  # relative path - presumably the script is run from bin/; confirm
+
+parser = xml.sax.make_parser()
+
+Handler = CPEHandler()
+parser.setContentHandler( Handler )
+parser.parse(cpe_path)  # runs at module level (no __main__ guard); the handler callbacks do the Redis inserts
diff --git a/data/dump.sh b/data/dump.sh
new file mode 100644
index 0000000..fd46480
--- /dev/null
+++ b/data/dump.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# Download the gzipped official CPE 2.3 dictionary from NVD, then decompress it.
+wget https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz
+gzip -d official-cpe-dictionary_v2.3.xml.gz