mirror of
https://github.com/cve-search/cpe-guesser.git
synced 2024-11-24 15:57:22 +00:00
new: [import] first version of the import
- Parse the CPE 2.3 XML file - Extract vendor and product - Canonize words from the vendor and product - Insert in redis the reverse index and create a sorted set with the score per cpe vendor:product frequency
This commit is contained in:
commit
a933c6f172
3 changed files with 118 additions and 0 deletions
39
README.md
Normal file
39
README.md
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
# CPE guesser
|
||||||
|
|
||||||
|
CPE guesser is a web service to guess the CPE name based on one or more keyword(s). Then the result can
|
||||||
|
be used against [cve-search](https://github.com/cve-search/cve-search) to do actual searches by CPE names.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Redis
|
||||||
|
- Python
|
||||||
|
|
||||||
|
## How does this work?
|
||||||
|
|
||||||
|
A CPE entry is composed of a human readable name with some references and the structured CPE name.
|
||||||
|
|
||||||
|
~~~
|
||||||
|
<cpe-item name="cpe:/a:10web:form_maker:1.7.17::~~~wordpress~~">
|
||||||
|
<title xml:lang="en-US">10web Form Maker 1.7.17 for WordPress</title>
|
||||||
|
<references>
|
||||||
|
<reference href="https://wordpress.org/plugins/form-maker/#developers">Change Log</reference>
|
||||||
|
</references>
|
||||||
|
<cpe-23:cpe23-item name="cpe:2.3:a:10web:form_maker:1.7.17:*:*:*:*:wordpress:*:*"/>
|
||||||
|
</cpe-item>
|
||||||
|
~~~
|
||||||
|
|
||||||
|
The CPE name is structured with a vendor name, a product name and some additional information.
|
||||||
|
A CPE name can easily change due to vendor name or product name changes; some vendors/products are
|
||||||
|
sharing common names, or their name is composed of multiple words.
|
||||||
|
|
||||||
|
|
||||||
|
### Data
|
||||||
|
|
||||||
|
Split the vendor name and product name on separators (such as `_`) into single words and then canonize each word. An inverse index is built using
|
||||||
|
the cpe vendor:product format as value and the canonized word as key. Then cpe guesser creates a ranked set with the most common
|
||||||
|
cpe (vendor:product) per version to give a probability of the CPE appearance.
|
||||||
|
|
||||||
|
### Redis structure
|
||||||
|
|
||||||
|
- `w:<word>` set
|
||||||
|
- `s:<word>` sorted set with a score depending on the number of appearances
|
75
bin/import.py
Normal file
75
bin/import.py
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
|
||||||
|
import xml.sax
|
||||||
|
import redis
|
||||||
|
# Module-level Redis connection used by insert(); host, port and database
# index are hard-coded here.
rdb = redis.Redis(host='127.0.0.1', port=6379, db=8)
|
||||||
|
|
||||||
|
class CPEHandler(xml.sax.ContentHandler):
    """SAX handler that indexes each ``cpe-item`` element as it is parsed.

    While an item is open the handler collects its title, its reference
    URLs and its CPE 2.3 name into ``self.record``.  When the ``cpe-item``
    element closes, the vendor and product parts of the CPE 2.3 name are
    split into words and every word is passed to ``insert()`` together with
    the vendor:product CPE prefix.
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) keeps the base
        # handler's own state initialised.
        xml.sax.ContentHandler.__init__(self)
        self.cpe = ""
        self.title = ""
        # True while the parser is inside a <title> element.
        self.title_seen = False
        # Data collected for the cpe-item currently being parsed.
        self.record = {}
        # hrefs of the <reference> elements seen since the last </references>.
        self.refs = []
        # Name of the most recently opened element.
        self.CurrentData = ""

    def startElement(self, tag, attributes):
        """Record the attributes of interest when an element opens.

        tag -- qualified element name as reported by the parser.
        attributes -- mapping of attribute names to values.
        """
        self.CurrentData = tag
        if tag == 'cpe-23:cpe23-item':
            self.record['cpe-23'] = attributes['name']
        elif tag == 'title':
            self.title_seen = True
        elif tag == 'reference':
            self.refs.append(attributes['href'])

    def characters(self, data):
        """Accumulate text content, but only while inside a <title>."""
        if self.title_seen:
            # The parser may deliver one text node in several chunks.
            self.title = self.title + data

    def endElement(self, tag):
        """Finalise the element that just closed.

        Closing ``title`` or ``references`` stores the collected value in
        the current record.  Closing ``cpe-item`` indexes the record and
        resets it for the next item.
        """
        if tag == 'title':
            self.record['title'] = self.title
            self.title = ""
            self.title_seen = False
        elif tag == 'references':
            self.record['refs'] = self.refs
            self.refs = []
        elif tag == 'cpe-item':
            cpe = self.record.get('cpe-23')
            # Start the next item from a clean slate, even if this one is
            # skipped below.
            self.record = {}
            if cpe is None:
                # No CPE 2.3 name was seen for this item: nothing to index.
                # Skipping avoids a KeyError that would abort the import.
                return
            to_insert = CPEExtractor(cpe=cpe)
            for field in ('vendor', 'product'):
                for word in canonize(to_insert[field]):
                    insert(word=word, cpe=to_insert['cpeline'])
|
||||||
|
|
||||||
|
|
||||||
|
def _split_cpe_fields(cpe):
    """Split a CPE 2.3 formatted string on its unescaped ':' separators.

    A backslash escapes the following character, so an escaped colon
    stays inside its field.  Backslashes are kept in the returned fields.
    """
    fields = []
    current = []
    escaped = False
    for char in cpe:
        if escaped:
            # Character following a backslash is always literal.
            current.append(char)
            escaped = False
        elif char == "\\":
            current.append(char)
            escaped = True
        elif char == ":":
            fields.append("".join(current))
            current = []
        else:
            current.append(char)
    fields.append("".join(current))
    return fields


def CPEExtractor( cpe=None ):
    """Extract the vendor, the product and the vendor:product prefix of a CPE.

    cpe -- CPE 2.3 formatted string, e.g.
           ``cpe:2.3:a:10web:form_maker:1.7.17:*:*:*:*:wordpress:*:*``.

    Returns False when ``cpe`` is None.  Otherwise returns a dict with:
      'vendor'  -- fourth field of the name,
      'product' -- fifth field of the name,
      'cpeline' -- the first five fields joined with ':'.

    Raises IndexError when the name has fewer than five fields.
    """
    if cpe is None:
        return False
    # A plain split(":") would cut vendor/product values containing an
    # escaped colon ("\:") into several fields.
    cpefield = _split_cpe_fields(cpe)
    record = {}
    record['vendor'] = cpefield[3]
    record['product'] = cpefield[4]
    record['cpeline'] = ":".join(cpefield[:5])
    return record
|
||||||
|
|
||||||
|
def canonize( value=None ):
    """Return the lower-cased words of a vendor or product name.

    value -- name whose words are separated by '_', e.g. ``Form_Maker``.

    Returns a list of non-empty lower-case words.  None (the default)
    yields an empty list instead of raising.  Empty fragments produced by
    leading, trailing or repeated underscores are dropped so that no
    empty word is returned to the caller.
    """
    if value is None:
        return []
    return [word for word in value.lower().split('_') if word]
|
||||||
|
|
||||||
|
def insert( word=None, cpe=None):
    """Register ``cpe`` under ``word`` in the Redis index.

    word -- canonized word used to build the Redis keys.
    cpe -- vendor:product CPE string stored as the member.

    Returns False without touching Redis when either argument is None.
    Otherwise adds ``cpe`` to the set ``w:<word>`` and increments its
    score by 1 in the sorted set ``s:<word>``.
    """
    if word is None or cpe is None:
        return False
    index_key = 'w:{}'.format(word)
    rank_key = 's:{}'.format(word)
    rdb.sadd(index_key, cpe)
    # incr=True makes ZADD add 1 to the member's current score.
    rdb.zadd(rank_key, {cpe: 1}, incr=True)
|
||||||
|
|
||||||
|
# Local import: only this entry-point section needs os.
import os

# Resolve the dictionary relative to this script rather than the current
# working directory, so the import works from any directory.
cpe_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    '..', 'data', 'official-cpe-dictionary_v2.3.xml'
)

parser = xml.sax.make_parser()

Handler = CPEHandler()
parser.setContentHandler( Handler )
# Stream the dictionary; the handler indexes each cpe-item as it closes.
parser.parse(cpe_path)
|
4
data/dump.sh
Normal file
4
data/dump.sh
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/bin/sh
# Download and unpack the official CPE 2.3 dictionary next to this script.

# Abort on the first failing command so gzip never runs on a missing or
# partial download.
set -e

# Work in the script's own directory so the XML file always lands beside
# it, regardless of where the script is invoked from.
cd "$(dirname "$0")"

# -O pins the output name; without it a re-run would save a ".1" copy.
wget -O official-cpe-dictionary_v2.3.xml.gz https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz
# -f replaces the XML left by a previous run instead of refusing to overwrite.
gzip -df official-cpe-dictionary_v2.3.xml.gz
|
Loading…
Reference in a new issue