From 2ea30b6761a7d3595b06e8dc214eb6ef71d542f2 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Tue, 21 Sep 2021 20:17:42 +0300 Subject: [PATCH 1/4] new: [dev] .gitignore added --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2fe3a52 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +bin/__pycache__/ +lib/__pycache__/ +data/*.xml +data/*.gz From fa6abfbb5482d8aaaa8ff2a4cbaa84c018262568 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Tue, 21 Sep 2021 20:23:48 +0300 Subject: [PATCH 2/4] chg: [import] implement dl & fail-safe mechanism --- bin/import.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/bin/import.py b/bin/import.py index 766518f..39c55e6 100644 --- a/bin/import.py +++ b/bin/import.py @@ -1,6 +1,18 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import argparse +import sys +import os +import urllib.request +import gzip +import shutil import xml.sax import redis + +# Configuration +cpe_path = '../data/official-cpe-dictionary_v2.3.xml' +cpe_source = 'https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz' rdb = redis.Redis(host='127.0.0.1', port=6379, db=8) class CPEHandler( xml.sax.ContentHandler ): @@ -11,6 +23,8 @@ class CPEHandler( xml.sax.ContentHandler ): self.cpe = "" self.record = {} self.refs = [] + self.itemcount = 0 + self.wordcount = 0 def startElement(self, tag, attributes): self.CurrentData = tag @@ -37,9 +51,14 @@ class CPEHandler( xml.sax.ContentHandler ): to_insert = CPEExtractor(cpe=self.record['cpe-23']) for word in canonize(to_insert['vendor']): insert( word=word, cpe=to_insert['cpeline'] ) + self.wordcount += 1 for word in canonize(to_insert['product']): insert( word=word, cpe=to_insert['cpeline'] ) + self.wordcount += 1 self.record = {} + self.itemcount += 1 + if self.itemcount % 5000 == 0: + print ("... " + str(self.itemcount) + " items processed (" + str(self.wordcount) + " words)") def CPEExtractor( cpe=None ): @@ -67,10 +86,46 @@ def insert( word=None, cpe=None): rdb.zadd('s:{}'.format(word), {cpe: 1}, incr=True) rdb.zadd('rank:cpe', {cpe: 1}, incr=True) -cpe_path = '../data/official-cpe-dictionary_v2.3.xml' -parser = xml.sax.make_parser() +if __name__ == '__main__': + argparser = argparse.ArgumentParser(description='Initializes the Redis database with CPE dictionary.') + argparser.add_argument('--download', '-d', action='count', default=0, help='Download the CPE dictionary even if it already exists.') + argparser.add_argument('--replace', '-r', action='count', default=0, help='Flush and repopulated the CPE database.') + args = argparser.parse_args() -Handler = CPEHandler() -parser.setContentHandler( Handler ) -parser.parse(cpe_path) + if args.replace == 0 and rdb.dbsize() > 0: + print("Warning! The Redis database already has " + str(rdb.dbsize()) + " keys.") + print("Use --replace if you want to flush the database and repopulate it.") + sys.exit(1) + + if args.download > 0 or not os.path.isfile(cpe_path): + print("Downloading CPE data from " + cpe_source + " ...") + try: + urllib.request.urlretrieve(cpe_source, cpe_path + ".gz") + except (urllib.error.HTTPError, urllib.error.URLError, FileNotFoundError, PermissionError) as e: + print(e) + sys.exit(1) + + print("Uncompressing " + cpe_path + ".gz ...") + try: + with gzip.open(cpe_path + ".gz", 'rb') as cpe_gz: + with open(cpe_path, 'wb') as cpe_xml: + shutil.copyfileobj(cpe_gz, cpe_xml) + os.remove(cpe_path + ".gz") + except (FileNotFoundError, PermissionError) as e: + print(e) + sys.exit(1) + + elif os.path.isfile(cpe_path): + print("Using existing file " + cpe_path + " ...") + + if rdb.dbsize() > 0: + print("Flushing " + str(rdb.dbsize()) + " keys from the database...") + rdb.flushdb() + + print("Populating the database (please be patient)...") + parser = xml.sax.make_parser() + Handler = CPEHandler() + parser.setContentHandler( Handler ) + parser.parse(cpe_path) + print("Done! " + str(rdb.dbsize()) + " keys inserted.") From 6af4a9d6e27eeaf4b14442cfdffd79f40a8e1f09 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Tue, 21 Sep 2021 20:24:44 +0300 Subject: [PATCH 3/4] chg: [doc] update import instructions --- README.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0e4d8a7..00c77ca 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,9 @@ the software with `lookup.py` to find the most probable CPE matching the keyword ### Installation - `git clone https://github.com/cve-search/cpe-guesser.git` -- `cd cpe-guesser/data` -- `chmod +x dump.sh` -- `cd ../bin` -- `python3 import.py` -- Take a cup of black or green tea -- `python3 server.py` to run the local HTTP server +- Download the CPE dictionary & populate the database with `python3 cpe-guesser/bin/import.py`. +- Take a cup of black or green tea. +- `python3 cpe-guesser/bin/server.py` to run the local HTTP server. If you don't want to install it locally, there is a public online version. Check below. From 9a1ee5a85efac8ef0f0b9a6ff4081eaabe299e14 Mon Sep 17 00:00:00 2001 From: Esa Jokinen Date: Tue, 21 Sep 2021 20:25:24 +0300 Subject: [PATCH 4/4] chg: [import] remove obsolete import mechanism --- data/.gitkeep | 0 data/dump.sh | 4 ---- 2 files changed, 4 deletions(-) create mode 100644 data/.gitkeep delete mode 100644 data/dump.sh diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/dump.sh b/data/dump.sh deleted file mode 100644 index fd46480..0000000 --- a/data/dump.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh - -wget https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz -gzip -d official-cpe-dictionary_v2.3.xml.gz