Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fetch and parse NIST documents using relaton-nist fetcher and parser #237

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions exe/pubid-nist
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,19 @@ require "csv"
require "lightly"

# Builds the report row for one document as an array of column values
# read from the +doc+ hash.
#
# NOTE(review): this listing comes from a diff view in which removed and
# added lines are interleaved (two array openers, and the :mr/:doi
# comparison appears twice) — confirm the final column set against the
# merged file before relying on it.
def render_report_doc(doc)
[doc[:finalPubId] != doc[:id],
[doc[:mr] != doc[:doi],
doc[:finalPubId],
doc[:id],
doc[:mr] != doc[:doi],
doc[:mr],
doc[:doi],
doc[:title]]
end

# Loads the document status records used by the report.
#
# @param options [Hash] CLI options; when +:updated+ is truthy, records whose
#   +:doi+ equals their +:mr+ value are dropped
# @return the collection from Pubid::Nist::NistTechPubs.status, filtered
#   only when +:updated+ is set
def get_documents(options)
  all_docs = Pubid::Nist::NistTechPubs.status
  return all_docs unless options[:updated]

  all_docs.reject { |entry| entry[:doi] == entry[:mr] }
end

class Pubid::Nist::CLI < Thor
desc "report", "Create report for NIST Tech Pubs database (fetches from GitHub)"
option :csv, aliases: "-c", type: :boolean, desc: "Export to CSV format"
Expand All @@ -25,12 +29,11 @@ class Pubid::Nist::CLI < Thor
option :pull, aliases: "-p", type: :boolean,
desc: "Update cache from NIST Tech Pubs database",
default: false

def report
heading = %w(
ID\ changed?
New\ PubID
Document\ ID
DOI\ changed?
New\ PubID
New\ PubID-MR
DOI
Title
Expand All @@ -39,18 +42,14 @@ class Pubid::Nist::CLI < Thor
Lightly.clear "documents" if options[:pull]
# Pubid::Nist::NistTechPubs.fetch
if Lightly.cached? "documents"
warn "Using nist-tech-pubs.xml file from local cache"
warn "Using allrecords-MODS.xml file from local cache"
else
warn "Cached nist-tech-pubs.xml not present, downloading from GitHub..."
warn "Cached allrecords-MODS.xml not present, downloading from GitHub..."
end

puts options[:csv] && heading.to_csv || heading.join(" | ")

documents = Pubid::Nist::NistTechPubs.status

documents = documents.reject { |doc| doc[:finalPubId] == doc[:id] } if options[:updated]

documents.each do |doc|
get_documents(options).each do |doc|
if options[:csv]
puts render_report_doc(doc).to_csv
else
Expand Down
1 change: 1 addition & 0 deletions lib/pubid/nist.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require "yaml"
require "parslet"
require "pubid-core"
require "loc_mods"

module Pubid
module Nist
Expand Down
39 changes: 20 additions & 19 deletions lib/pubid/nist/nist_tech_pubs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,42 @@

module Pubid::Nist
class NistTechPubs
URL = "https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml".freeze
URL = "https://github.com/usnistgov/NIST-Tech-Pubs/releases/download/Oct2024/allrecords-MODS.xml"

@converted_id = @converted_doi = {}

class << self

attr_accessor :documents, :converted_id, :converted_doi

def fetch
# Flattens a title fragment to a single line: newlines become spaces, runs
# of spaces are collapsed, and surrounding whitespace is stripped.
#
# @param title [String] raw title text
# @param non_sort [#content, nil] optional object whose +content+ is
#   prepended to the cleaned title (the joined text is space-collapsed again)
# @return [String] the cleaned title
def create_title(title, non_sort = nil)
  normalized = title.tr("\n", " ").squeeze(" ").strip
  return normalized unless non_sort

  "#{non_sort.content}#{normalized}".squeeze(" ")
end

def fetch
Lightly.prune
@documents ||= Lightly.get "documents" do
Nokogiri::XML(URI.open(URL))
.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
.map { |doc| parse_docid doc }
LocMods::Collection.from_xml(OpenURI.open_uri(URL)).mods.map do |doc|
url = doc.location.reduce(nil) { |m, l| m || l.url.detect { |u| u.usage == "primary display" } }

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have class methods that parse the title and DOI outside the loop, so that these methods can be used in relaton-nist?

Pubid::Nist::NistTechPubs.parse_doi(doc) => "NIST.IR.841"
Pubid::Nist::NistTechPubs.parse_title(doc) => "Status report ..."


title = doc.title_info.reduce([]) do |a, ti|
next a if ti.type == "alternative"

a += ti.title.map { |t| create_title(t, ti.non_sort[0]) }
a + ti.sub_title.map { |t| create_title(t) }
end.join(" - ")

{ doi: url.content.gsub("https://doi.org/10.6028/", ""), title: title }
end
end
rescue StandardError => e
warn e.message
[]
end

# Converts a document record into a parsed Pubid::Nist::Identifier,
# memoizing parse results in @converted_id / @converted_doi.
#
# As shown here: the :id value is parsed first; if the record has a :doi
# key, the DOI is parsed too and merged into the id-based identifier. A
# ParseError while parsing the DOI returns the id-based identifier alone;
# a ParseError raised elsewhere in the body (e.g. while parsing :id) falls
# back to parsing the DOI by itself.
#
# NOTE(review): this listing comes from a diff view, so removed and added
# lines may be interleaved — confirm against the merged file.
#
# @param doc [Hash] record with :id and/or :doi string values
# @return the parsed identifier
def convert(doc)
id = @converted_id[doc[:id]] ||= Pubid::Nist::Identifier.parse(doc[:id])
# Nothing to merge when the record carries no DOI.
return id unless doc.key?(:doi)

begin
doi = @converted_doi[doc[:doi]] ||=
Pubid::Nist::Identifier.parse(doc[:doi])
rescue Pubid::Core::Errors::ParseError
# Unparseable DOI: keep the identifier built from :id.
return id
end
# return more complete pubid
id.merge(doi)
rescue Pubid::Core::Errors::ParseError
# Method-level fallback: use the DOI-based identifier on its own.
@converted_doi[doc[:doi]] ||= Pubid::Nist::Identifier.parse(doc[:doi])
end

Expand Down Expand Up @@ -94,15 +97,13 @@ def status
fetch.lazy.map do |doc|
final_doc = convert(doc)
{
id: doc[:id],
doi: doc[:doi],
title: doc[:title],
finalPubId: final_doc.to_s,
mr: final_doc.to_s(:mr),
}
rescue Pubid::Core::Errors::ParseError
{
id: doc[:id],
doi: doc[:doi],
title: doc[:title],
finalPubId: "parse error",
Expand Down
1 change: 1 addition & 0 deletions pubid-nist.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,6 @@ Gem::Specification.new do |spec|
spec.add_dependency "lightly"
spec.add_dependency "parslet"
spec.add_dependency "pubid-core", "~> 1.12.2"
spec.add_dependency "loc_mods", "~> 0.2.0"
spec.add_dependency "rubyzip"
end
53 changes: 8 additions & 45 deletions spec/nist_pubid/nist_tech_pubs_spec.rb
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
RSpec.describe Pubid::Nist::NistTechPubs, vcr: true do
describe "#fetch" do
it "fetch doc identifiers from nist_tech_pubs" do
expect(described_class.fetch.map { |d| d[:id] })
.to include("NBS BH 1",
"NIST SP 1800-15",
"NIST SP 1265",
"NBS FIPS 83",
"NIST IR 8379")
end

it "fetches doi identifiers" do
expect(described_class.fetch.map { |d| d[:doi] })
.to include("NBS.BH.1",
Expand All @@ -21,41 +12,15 @@

describe "#convert" do
it "converts old pubid to new NIST PubID" do
expect(described_class.convert({ id: "NISTIR 8379" }).to_s)
expect(described_class.convert({ doi: "NISTIR.8379" }).to_s)
.to eq("NIST IR 8379")
end

it "keeps correct NIST PubID the same" do
expect(described_class.convert({ id: "NIST SP 800-133r2" }).to_s)
.to eq("NIST SP 800-133r2")
expect(described_class.convert({ id: "NIST SP 800-160v1" }).to_s)
.to eq("NIST SP 800-160v1")
end

it "uses doi when cannot parse document id" do
expect(described_class.convert(
{ id: "NBS CIRC re3", doi: "NBS.CIRC.5e3" },
).to_s).to eq("NBS CIRC 5e3")
end

it "uses doi when doi more complete then id" do
expect(described_class.convert(
{ id: "NIST SP 260-162", doi: "NIST SP 260-162 2006ed." },
).to_s).to eq("NIST SP 260-162e2006")
end

it "combines data from id and doi" do
expect(described_class.convert(
{ id: "NIST SP 260-162r1", doi: "NIST SP 260-162 2006ed." },
).to_s).to eq("NIST SP 260-162e2006r1")
end

context "when doi code is wrong" do
it "skips merging with doi" do
expect(described_class.convert(
{ id: "NIST TN 1648", doi: "NISTPUB.0413171251" },
).to_s).to eq("NIST TN 1648")
end
expect(described_class.convert({ doi: "NIST.SP.800-133r2" }).to_s(format = :mr))
.to eq("NIST.SP.800-133r2")
expect(described_class.convert({ doi: "NIST.SP.800-160v1" }).to_s(format = :mr))
.to eq("NIST.SP.800-160v1")
end
end

Expand Down Expand Up @@ -116,8 +81,7 @@

before do
described_class.documents = [
{ id: id,
doi: doi,
{ doi: doi,
title: title },
]
end
Expand All @@ -127,8 +91,7 @@
it do
expect(subject.to_a)
.to eq([
{ id: id, doi: doi, title: title, mr: mr,
finalPubId: finalPubId },
{ doi: doi, title: title, mr: mr, finalPubId: finalPubId },
])
end

Expand All @@ -139,7 +102,7 @@
it do
expect(subject.to_a)
.to eq([
{ id: id, doi: doi, title: title, finalPubId: "parse error",
{ doi: doi, title: title, finalPubId: "parse error",
mr: "parse_error" },
])
end
Expand Down
Loading