2021-09-21 17:23:48 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
2021-09-07 05:29:38 +00:00
|
|
|
|
2021-09-21 17:23:48 +00:00
|
|
|
import argparse
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import urllib.request
|
|
|
|
import gzip
|
|
|
|
import shutil
|
2021-09-07 05:29:38 +00:00
|
|
|
import xml.sax
|
|
|
|
import redis
|
2021-09-22 05:37:42 +00:00
|
|
|
import time
|
2023-07-08 18:50:49 +00:00
|
|
|
from dynaconf import Dynaconf
|
2021-09-21 17:23:48 +00:00
|
|
|
|
|
|
|
# Configuration
|
2023-08-09 13:21:05 +00:00
|
|
|
# Project settings, loaded through Dynaconf from a relative YAML path.
settings = Dynaconf(settings_files=['../config/settings.yaml'])

# Local path of the uncompressed CPE dictionary and the URL it is fetched from.
cpe_path = settings.cpe.path
cpe_source = settings.cpe.source

# Shared Redis connection used by insert() and by the script entry point;
# the importer always works in logical database 8.
rdb = redis.Redis(
    host=settings.redis.host,
    port=settings.redis.port,
    db=8,
)
|
2021-10-19 16:30:29 +00:00
|
|
|
|
2023-08-09 13:21:05 +00:00
|
|
|
|
2021-10-19 16:30:29 +00:00
|
|
|
class CPEHandler(xml.sax.ContentHandler):
    """SAX content handler that indexes CPE dictionary entries.

    While an entry is being parsed the handler collects its CPE 2.3 name
    (``name`` attribute of ``cpe-23:cpe23-item``), its title text and its
    reference URLs into ``self.record``.  When the enclosing ``cpe-item``
    element closes, the vendor and product parts of the CPE name are split
    into words and every word is handed to ``insert()``.  Title and
    references are collected but not passed on to ``insert()``.

    Attributes:
        record: data gathered for the entry currently being parsed.
        refs: reference URLs seen since the last ``references`` end tag.
        itemcount: number of entries indexed so far.
        wordcount: number of ``insert()`` calls made so far.
        skipped: number of entries dropped because no usable CPE 2.3 name
            was available for them.
        start_time: ``time.time()`` at construction, used for progress output.
    """

    def __init__(self):
        # Let the SAX base class set up its own state first.
        super().__init__()
        self.cpe = ""
        self.title = ""
        self.title_seen = False
        # Name of the most recently opened element (written by startElement).
        self.CurrentData = ""
        self.record = {}
        self.refs = []
        self.itemcount = 0
        self.wordcount = 0
        self.skipped = 0
        self.start_time = time.time()

    def startElement(self, tag, attributes):
        """Record the attributes of interest of an opening element."""
        self.CurrentData = tag
        if tag == 'cpe-23:cpe23-item':
            self.record['cpe-23'] = attributes['name']
        elif tag == 'title':
            # From now on characters() accumulates the title text.
            self.title_seen = True
        elif tag == 'reference':
            self.refs.append(attributes['href'])

    def characters(self, data):
        """Accumulate text content, but only while inside a ``title``."""
        if self.title_seen:
            # The parser may deliver one text node in several chunks.
            self.title = self.title + data

    def endElement(self, tag):
        """Finish a title, a reference list or a complete entry."""
        if tag == 'title':
            self.record['title'] = self.title
            self.title = ""
            self.title_seen = False
        elif tag == 'references':
            self.record['refs'] = self.refs
            self.refs = []
        elif tag == 'cpe-item':
            self._index_record()

    def _index_record(self):
        """Index the collected entry and reset the per-entry state."""
        # Always start the next entry from a clean record, even when this
        # one turns out to be unusable.
        record, self.record = self.record, {}

        cpe_name = record.get('cpe-23')
        if cpe_name is None:
            # Entry without a cpe-23:cpe23-item child: nothing to index.
            self.skipped += 1
            return

        to_insert = CPEExtractor(cpe=cpe_name)
        if not to_insert:
            # CPEExtractor signals an unusable name with a falsy value.
            self.skipped += 1
            return

        for field in ('vendor', 'product'):
            for word in canonize(to_insert[field]):
                insert(word=word, cpe=to_insert['cpeline'])
                self.wordcount += 1

        self.itemcount += 1
        if self.itemcount % 5000 == 0:
            # Periodic progress report for long imports.
            time_elapsed = round(time.time() - self.start_time)
            print(
                f"... {self.itemcount} items processed ({self.wordcount} words) in {time_elapsed} seconds"
            )
|
2021-09-07 05:29:38 +00:00
|
|
|
|
|
|
|
|
2021-10-19 16:30:29 +00:00
|
|
|
def CPEExtractor(cpe=None):
    """Extract vendor, product and the shortened CPE line from a CPE name.

    The name is split on ``:``; the fourth and fifth fields are taken as
    vendor and product, and the first five fields joined with ``:`` form the
    ``cpeline`` (the name truncated after the product).

    Args:
        cpe: CPE name string, e.g. ``cpe:2.3:a:vendor:product:1.0:...``.

    Returns:
        A dict with the keys ``vendor``, ``product`` and ``cpeline``, or
        ``False`` when ``cpe`` is ``None`` or has fewer than five fields.
    """
    if cpe is None:
        return False

    # NOTE(review): a plain split does not account for backslash-escaped
    # colons inside a component -- confirm whether the dictionary can
    # contain such names.
    cpefield = cpe.split(":")
    if len(cpefield) < 5:
        # Too short to contain vendor and product; report it the same way
        # as a missing name instead of raising IndexError.
        return False

    return {
        'vendor': cpefield[3],
        'product': cpefield[4],
        'cpeline': ":".join(cpefield[:5]),
    }
|
2021-09-07 05:29:38 +00:00
|
|
|
|
2021-10-19 16:30:29 +00:00
|
|
|
|
|
|
|
def canonize(value=None):
    """Split a vendor or product name into lower-case words.

    The value is lower-cased and split on underscores; empty fragments
    produced by leading, trailing or doubled underscores are kept.

    Args:
        value: the string to split, or ``None``.

    Returns:
        The list of words; an empty list when ``value`` is ``None``.
    """
    if value is None:
        # The declared default must not blow up on .lower().
        return []
    return value.lower().split('_')
|
|
|
|
|
2021-10-19 16:30:29 +00:00
|
|
|
|
|
|
|
def insert(word=None, cpe=None):
    """Register ``cpe`` under ``word`` in the Redis index.

    Three structures are updated through the module-level ``rdb`` client:
    the set ``w:<word>`` gains ``cpe`` as a member, and the score of ``cpe``
    is incremented by one both in the sorted set ``s:<word>`` and in the
    global sorted set ``rank:cpe``.

    Args:
        word: index word.
        cpe: CPE line to associate with the word.

    Returns:
        ``False`` when either argument is ``None`` (nothing is written);
        otherwise ``None``.
    """
    if word is None or cpe is None:
        return False

    one_more = {cpe: 1}
    rdb.sadd(f"w:{word}", cpe)
    # incr=True makes ZADD add to the existing score instead of replacing it.
    rdb.zadd(f"s:{word}", one_more, incr=True)
    rdb.zadd("rank:cpe", one_more, incr=True)
|
2021-09-07 05:29:38 +00:00
|
|
|
|
|
|
|
|
2021-09-21 17:23:48 +00:00
|
|
|
if __name__ == '__main__':
    # Imported explicitly so the error handling below does not depend on
    # other modules pulling these in as a side effect.
    import urllib.error
    import zlib

    argparser = argparse.ArgumentParser(
        description='Initializes the Redis database with CPE dictionary.'
    )
    argparser.add_argument(
        '--download',
        '-d',
        action='count',
        default=0,
        help='Download the CPE dictionary even if it already exists.',
    )
    argparser.add_argument(
        '--replace',
        '-r',
        action='count',
        default=0,
        help='Flush and repopulate the CPE database.',
    )
    argparser.add_argument(
        '--update',
        '-u',
        action='store_true',
        default=False,
        help='Update the CPE database without flushing',
    )
    args = argparser.parse_args()

    # A populated database is only touched when the user chose --replace
    # (flush and reload) or --update (add on top of the existing keys).
    if args.replace == 0 and rdb.dbsize() > 0 and not args.update:
        print(f"Warning! The Redis database already has {rdb.dbsize()} keys.")
        print("Use --replace if you want to flush the database and repopulate it.")
        sys.exit(0)

    if args.download > 0 or not os.path.isfile(cpe_path):
        gz_path = f"{cpe_path}.gz"
        tmp_path = f"{cpe_path}.tmp"

        print(f"Downloading CPE data from {cpe_source} ...")
        try:
            urllib.request.urlretrieve(cpe_source, gz_path)
        except (
            urllib.error.HTTPError,
            urllib.error.URLError,
            FileNotFoundError,
            PermissionError,
        ) as e:
            print(e)
            sys.exit(1)

        print(f"Uncompressing {cpe_path}.gz ...")
        try:
            # Decompress into a temporary file and move it into place only
            # when complete, so a corrupt or truncated archive never leaves
            # a partial dictionary at cpe_path for a later run to pick up.
            with gzip.open(gz_path, 'rb') as cpe_gz, open(tmp_path, 'wb') as cpe_xml:
                shutil.copyfileobj(cpe_gz, cpe_xml)
            os.replace(tmp_path, cpe_path)
            os.remove(gz_path)
        except (OSError, EOFError, zlib.error) as e:
            # OSError covers missing files, permission problems and
            # gzip.BadGzipFile; EOFError and zlib.error are raised for
            # truncated or corrupt archives.
            print(e)
            try:
                os.remove(tmp_path)
            except OSError:
                # Nothing to clean up, or cleanup itself is not possible.
                pass
            sys.exit(1)
    else:
        print(f"Using existing file {cpe_path} ...")

    # Without --update the import starts from an empty database.
    if rdb.dbsize() > 0 and not args.update:
        print(f"Flushing {rdb.dbsize()} keys from the database...")
        rdb.flushdb()

    print("Populating the database (please be patient)...")
    parser = xml.sax.make_parser()
    handler = CPEHandler()
    parser.setContentHandler(handler)
    parser.parse(cpe_path)
    print(f"Done! {rdb.dbsize()} keys inserted.")
|