diff --git a/exe/pubid-nist b/exe/pubid-nist index 09bc181..2e61e82 100755 --- a/exe/pubid-nist +++ b/exe/pubid-nist @@ -7,15 +7,19 @@ require "csv" require "lightly" def render_report_doc(doc) - [doc[:finalPubId] != doc[:id], + [doc[:mr] != doc[:doi], doc[:finalPubId], - doc[:id], - doc[:mr] != doc[:doi], doc[:mr], doc[:doi], doc[:title]] end +def get_documents(options) + documents = Pubid::Nist::NistTechPubs.status + + options[:updated] ? documents.reject { |doc| doc[:doi] == doc[:mr] } : documents +end + class Pubid::Nist::CLI < Thor desc "report", "Create report for NIST Tech Pubs database (fetches from GitHub)" option :csv, aliases: "-c", type: :boolean, desc: "Export to CSV format" @@ -25,12 +29,11 @@ class Pubid::Nist::CLI < Thor option :pull, aliases: "-p", type: :boolean, desc: "Update cache from NIST Tech Pubs database", default: false + def report heading = %w( - ID\ changed? - New\ PubID - Document\ ID DOI\ changed? + New\ PubID New\ PubID-MR DOI Title @@ -39,18 +42,14 @@ class Pubid::Nist::CLI < Thor Lightly.clear "documents" if options[:pull] # Pubid::Nist::NistTechPubs.fetch if Lightly.cached? "documents" - warn "Using nist-tech-pubs.xml file from local cache" + warn "Using allrecords-MODS.xml file from local cache" else - warn "Cached nist-tech-pubs.xml not present, downloading from GitHub..." + warn "Cached allrecords-MODS.xml not present, downloading from GitHub..." end puts options[:csv] && heading.to_csv || heading.join(" | ") - documents = Pubid::Nist::NistTechPubs.status - - documents = documents.reject { |doc| doc[:finalPubId] == doc[:id] } if options[:updated] - - documents.each do |doc| + get_documents(options).each do |doc| if options[:csv] puts render_report_doc(doc).to_csv else diff --git a/lib/pubid/nist.rb b/lib/pubid/nist.rb index 595fb69..384cfc7 100644 --- a/lib/pubid/nist.rb +++ b/lib/pubid/nist.rb @@ -3,6 +3,7 @@ require "yaml" require "parslet" require "pubid-core" +require "loc_mods" module Pubid module Nist diff --git a/lib/pubid/nist/nist_tech_pubs.rb b/lib/pubid/nist/nist_tech_pubs.rb index 6801c67..5f8144c 100644 --- a/lib/pubid/nist/nist_tech_pubs.rb +++ b/lib/pubid/nist/nist_tech_pubs.rb @@ -6,7 +6,7 @@ module Pubid::Nist class NistTechPubs - URL = "https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml".freeze + URL = "https://github.com/usnistgov/NIST-Tech-Pubs/releases/download/Oct2024/allrecords-MODS.xml" @converted_id = @converted_doi = {} @@ -14,12 +14,27 @@ class << self attr_accessor :documents, :converted_id, :converted_doi - def fetch + def create_title(title, non_sort = nil) + content = title.gsub("\n", " ").squeeze(" ").strip + content = "#{non_sort.content}#{content}".squeeze(" ") if non_sort + content + end + + def fetch Lightly.prune @documents ||= Lightly.get "documents" do - Nokogiri::XML(URI.open(URL)) - .xpath("/body/query/doi_record/report-paper/report-paper_metadata") - .map { |doc| parse_docid doc } + LocMods::Collection.from_xml(OpenURI.open_uri(URL)).mods.map do |doc| + url = doc.location.reduce(nil) { |m, l| m || l.url.detect { |u| u.usage == "primary display" } } + + title = doc.title_info.reduce([]) do |a, ti| + next a if ti.type == "alternative" + + a += ti.title.map { |t| create_title(t, ti.non_sort[0]) } + a + ti.sub_title.map { |t| create_title(t) } + end.join(" - ") + + { doi: url.content.gsub("https://doi.org/10.6028/", ""), title: title } + end end rescue StandardError => e warn e.message @@ -27,18 +42,6 @@ def fetch end def convert(doc) - id = @converted_id[doc[:id]] ||= Pubid::Nist::Identifier.parse(doc[:id]) - return id unless doc.key?(:doi) - - begin - doi = @converted_doi[doc[:doi]] ||= - Pubid::Nist::Identifier.parse(doc[:doi]) - rescue Pubid::Core::Errors::ParseError - return id - end - # return more complete pubid - id.merge(doi) - rescue Pubid::Core::Errors::ParseError @converted_doi[doc[:doi]] ||= Pubid::Nist::Identifier.parse(doc[:doi]) end @@ -94,7 +97,6 @@ def status fetch.lazy.map do |doc| final_doc = convert(doc) { - id: doc[:id], doi: doc[:doi], title: doc[:title], finalPubId: final_doc.to_s, @@ -102,7 +104,6 @@ def status } rescue Pubid::Core::Errors::ParseError { - id: doc[:id], doi: doc[:doi], title: doc[:title], finalPubId: "parse error", diff --git a/pubid-nist.gemspec b/pubid-nist.gemspec index f7bf44d..359db4f 100644 --- a/pubid-nist.gemspec +++ b/pubid-nist.gemspec @@ -34,5 +34,6 @@ Gem::Specification.new do |spec| spec.add_dependency "lightly" spec.add_dependency "parslet" spec.add_dependency "pubid-core", "~> 1.12.2" + spec.add_dependency "loc_mods", "~> 0.2.0" spec.add_dependency "rubyzip" end diff --git a/spec/nist_pubid/nist_tech_pubs_spec.rb b/spec/nist_pubid/nist_tech_pubs_spec.rb index cffcb5f..ab746fd 100644 --- a/spec/nist_pubid/nist_tech_pubs_spec.rb +++ b/spec/nist_pubid/nist_tech_pubs_spec.rb @@ -1,14 +1,5 @@ RSpec.describe Pubid::Nist::NistTechPubs, vcr: true do describe "#fetch" do - it "fetch doc identifiers from nist_tech_pubs" do - expect(described_class.fetch.map { |d| d[:id] }) - .to include("NBS BH 1", - "NIST SP 1800-15", - "NIST SP 1265", - "NBS FIPS 83", - "NIST IR 8379") - end - it "fetches doi identifiers" do expect(described_class.fetch.map { |d| d[:doi] }) .to include("NBS.BH.1", @@ -21,41 +12,15 @@ describe "#convert" do it "converts old pubid to new NIST PubID" do - expect(described_class.convert({ id: "NISTIR 8379" }).to_s) + expect(described_class.convert({ doi: "NISTIR.8379" }).to_s) .to eq("NIST IR 8379") end it "keeps correct NIST PubID the same" do - expect(described_class.convert({ id: "NIST SP 800-133r2" }).to_s) - .to eq("NIST SP 800-133r2") - expect(described_class.convert({ id: "NIST SP 800-160v1" }).to_s) - .to eq("NIST SP 800-160v1") - end - - it "uses doi when cannot parse document id" do - expect(described_class.convert( - { id: "NBS CIRC re3", doi: "NBS.CIRC.5e3" }, - ).to_s).to eq("NBS CIRC 5e3") - end - - it "uses doi when doi more complete then id" do - expect(described_class.convert( - { id: "NIST SP 260-162", doi: "NIST SP 260-162 2006ed." }, - ).to_s).to eq("NIST SP 260-162e2006") - end - - it "combines data from id and doi" do - expect(described_class.convert( - { id: "NIST SP 260-162r1", doi: "NIST SP 260-162 2006ed." }, - ).to_s).to eq("NIST SP 260-162e2006r1") - end - - context "when doi code is wrong" do - it "skips merging with doi" do - expect(described_class.convert( - { id: "NIST TN 1648", doi: "NISTPUB.0413171251" }, - ).to_s).to eq("NIST TN 1648") - end + expect(described_class.convert({ doi: "NIST.SP.800-133r2" }).to_s(format = :mr)) + .to eq("NIST.SP.800-133r2") + expect(described_class.convert({ doi: "NIST.SP.800-160v1" }).to_s(format = :mr)) + .to eq("NIST.SP.800-160v1") end end @@ -116,8 +81,7 @@ before do described_class.documents = [ - { id: id, - doi: doi, + { doi: doi, title: title }, ] end @@ -127,8 +91,7 @@ it do expect(subject.to_a) .to eq([ - { id: id, doi: doi, title: title, mr: mr, - finalPubId: finalPubId }, + { doi: doi, title: title, mr: mr, finalPubId: finalPubId }, ]) end @@ -139,7 +102,7 @@ it do expect(subject.to_a) .to eq([ - { id: id, doi: doi, title: title, finalPubId: "parse error", + { doi: doi, title: title, finalPubId: "parse error", mr: "parse_error" }, ]) end