#!/usr/bin/env ruby
# frozen_string_literal: true

#
# mirror.rb: Mirror current data from all upstream sources for
# testing.
#
# Usage:
#   # create mirror in ./20220318
#   bin/mirror.rb ./20220318
#
# Environment Variables:
#
# * MIRROR_DRY_RUN: Set to do a dry run (default: false)
# * MIRROR_LOG_LEVEL: log level (default: "info")
# * MIRROR_THREADED: Set to download in threads (default: false)
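#
# Examples (the flags above are enabled when set to any value):
#
#   # dry run: skip directory creation and downloads
#   MIRROR_DRY_RUN=1 bin/mirror.rb ./20220318
#
#   # download each source URL in a separate thread
#   MIRROR_THREADED=1 bin/mirror.rb ./20220318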
#

require 'fileutils'
require 'json'
require 'logger'
require 'net/http'
require 'open-uri'
require 'uri'

# check command-line args
raise "Usage #$0 dir" unless ARGV.size == 1

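# years covered by the NVD yearly CVE feeds (2002 through the current
# year, inclusive)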
CVE_YEARS = (2002..Time.now.year).to_a

# URL format strings for the NVD CVE and CPE match feeds
F = {
  cve: 'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-%<id>s.%<ext>s',
  cpematch: 'https://nvd.nist.gov/feeds/json/cpematch/1.0/nvdcpematch-1.0.%<ext>s',
}
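# (e.g., F[:cve] % { id: 2002, ext: 'meta' } expands to
# https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2002.meta)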

# data sources: each entry maps a destination subdirectory (dst) to the
# list of upstream URLs to mirror into it
SOURCES = [{
  dst: 'nvd/cve',
  urls: %w{modified recent}.concat(CVE_YEARS).each_with_object([]) do |s, r|
    %w{meta json.gz}.each do |ext|
      r << F[:cve] % { id: s, ext: ext }
    end
  end,
}, {
  dst: 'nvd/cpedict',
  urls: %w{https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.gz},
}, {
  dst: 'nvd/cpematch',
  urls: %w{meta json.gz}.map do |ext|
    F[:cpematch] % { ext: ext }
  end,
}, {
  dst: 'cisa',
  urls: %w{https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json},
}, {
  dst: 'cwe',
  urls: %w{https://cwe.mitre.org/data/xml/cwec_latest.xml.zip},
}]

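# Download the contents of +uri+ to the file at path +dst+, logging
# size, progress, and completion events as JSON.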
def get(log, dst, uri)
  sum, len = 0, nil

  uri.open({
    content_length_proc: proc do |val|
      len = val
      log.debug(uri) { JSON({ type: 'size', len: len }) }
    end,

    progress_proc: proc do |val|
      sum += val
      log.debug(uri) { JSON({ type: 'read', val: val, sum: sum, len: len }) }
    end,
  }) { |src| IO.copy_stream(src, dst) }

  log.info(uri) { JSON({ type: 'done', sum: sum }) }
end

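# log to standard output; the log level comes from the MIRROR_LOG_LEVEL
# environment variable (default: "info")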
log = Logger.new(STDOUT)
log.level = ENV.fetch('MIRROR_LOG_LEVEL', 'info')

# get base directory
DST = ARGV.shift
log.debug('dst') { DST }

# build list of directories
DIRS = SOURCES.map { |s| File.join(DST, s[:dst]) }.tap do |dirs|
  log.debug('dirs') { JSON(dirs) }
end

# build list of urls
URIS = SOURCES.each_with_object([]) do |src, r|
  r.concat(src[:urls].map { |url| URI.parse(url) }.map do |uri|
    { uri: uri, dst: File.join(DST, src[:dst], File.basename(uri.path)) }
  end)
end.tap { |uris| log.debug('uris') { JSON(uris) } }

unless ENV.key?('MIRROR_DRY_RUN')
  # create dirs
  FileUtils.mkdir_p(DIRS)

  if ENV.key?('MIRROR_THREADED')
    # fetch URLs in threads, join threads
    URIS.map do |row|
      Thread.new(log, row) { |l, r| get(l, r[:dst], r[:uri]) }
    end.each(&:join)
  else
    # fetch URLs in sequence
    URIS.each { |row| get(log, row[:dst], row[:uri]) }
  end
end