#!/usr/bin/env ruby
# frozen_string_literal: true

#
# mirror.rb: Mirror current data from all upstream sources for
# testing.
#
# Usage:
#   # create mirror in ./20220318
#   bin/mirror.rb ./20220318
#
# Environment Variables:
#
# * MIRROR_DRY_RUN: Set to do a dry run (default: false)
# * MIRROR_LOG_LEVEL: log level (default: "info")
# * MIRROR_THREADED: Set to download in threads (default: false)
#

require 'fileutils'
require 'json'
require 'logger'
require 'net/http'
require 'open-uri'
require 'uri'

# check command-line args
raise "Usage #$0 dir" unless ARGV.size == 1

# CVE yearly feed IDs: 2002 through the current year, inclusive.
# (The original exclusive range `...` silently skipped the current
# year's feed.)
CVE_YEARS = (2002..Time.now.year).to_a.freeze

# URL format strings.
#
# NOTE: these use *named* references (%<id>s, %<ext>s) because they are
# filled in with a Hash below; positional %s specifiers raise
# ArgumentError when formatted against a Hash.
F = {
  cve: 'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-%<id>s.%<ext>s',
  cpematch: 'https://nvd.nist.gov/feeds/json/cpematch/1.0/nvdcpematch-1.0.%<ext>s',
}.freeze

# data sources: each entry maps a destination subdirectory to the list
# of upstream URLs mirrored into it
SOURCES = [{
  dst: 'nvd/cve',
  # every CVE feed ID ("modified", "recent", and each year) in both
  # .meta and .json.gz flavors
  urls: (%w{modified recent} + CVE_YEARS).flat_map do |id|
    %w{meta json.gz}.map { |ext| F[:cve] % { id: id, ext: ext } }
  end,
}, {
  dst: 'nvd/cpedict',
  urls: %w{https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz},
}, {
  dst: 'nvd/cpematch',
  urls: %w{meta json.gz}.map { |ext| F[:cpematch] % { ext: ext } },
}, {
  dst: 'cisa',
  urls: %w{https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json},
}, {
  dst: 'cwe',
  urls: %w{https://cwe.mitre.org/data/xml/cwec_latest.xml.zip},
}].freeze

#
# Fetch +uri+ and stream the response body to the path +dst+, logging
# size/progress/completion events as JSON to +log+.
#
def get(log, dst, uri)
  sum, len = 0, nil

  uri.open(
    # total size announced by the server (may be absent)
    content_length_proc: proc do |val|
      len = val
      log.debug(uri) { JSON({ type: 'size', len: len }) }
    end,

    # cumulative bytes read so far
    progress_proc: proc do |val|
      sum += val
      log.debug(uri) { JSON({ type: 'read', val: val, sum: sum, len: len }) }
    end
  ) { |src| IO.copy_stream(src, dst) }

  log.info(uri) { JSON({ type: 'done', sum: sum }) }
end

log = Logger.new(STDOUT)
log.level = ENV.fetch('MIRROR_LOG_LEVEL', 'info')

# get base directory
DST = ARGV.shift
log.debug('dst') { DST }

# build list of destination directories
DIRS = SOURCES.map { |s| File.join(DST, s[:dst]) }.tap do |dirs|
  log.debug('dirs') { JSON(dirs) }
end

# build flat list of { uri:, dst: } download jobs
URIS = SOURCES.flat_map do |src|
  src[:urls].map do |url|
    uri = URI.parse(url)
    { uri: uri, dst: File.join(DST, src[:dst], File.basename(uri.path)) }
  end
end.tap { |uris| log.debug('uris') { JSON(uris) } }

unless ENV.key?('MIRROR_DRY_RUN')
  # create dirs
  FileUtils.mkdir_p(DIRS)

  if ENV.key?('MIRROR_THREADED')
    # fetch URLs in threads, join threads
    URIS.map do |row|
      Thread.new(log, row) { |lg, r| get(lg, r[:dst], r[:uri]) }
    end.each(&:join)
  else
    # fetch URLs in sequence
    URIS.each { |row| get(log, row[:dst], row[:uri]) }
  end
end