From 0261787f731710de9244f9877956d77a87ce92d9 Mon Sep 17 00:00:00 2001
From: Jane Sandberg
Date: Mon, 7 Oct 2024 13:47:13 -0700
Subject: [PATCH] [#2508] Index primary sources as a facet

---
Reviewer notes (commentary placed between the "---" separator and the
diffstat; it is not part of the commit message or of the diff):

* Genre#to_a now also concatenates genre_from_primary_source_mapping and
  genres_from_autobiography, and calls compact before uniq.
* 'Primary source' is added when any term extracted from 600/610/611/630/
  650/651 $v/$x or 655 $a/$v/$x (spec "|*0|", i.e. second indicator 0 in
  Traject's notation) is found in primary_source_genres after downcase,
  strip and removal of a single trailing ".".
* 'Primary source' is also added when the extracted terms (650/655 $a
  included) contain exactly 'Biography' and a trimmed, downcased 100 abcdjq
  string equals a trimmed, downcased 600 abcdjq string.
* NOTE(review): biography? compares against the exact string 'Biography'
  with no normalization, unlike genre_term_indicates_primary_source?;
  confirm that a term such as "Biography." is meant to be ignored.
* NOTE(review): leading indentation inside the hunks was reconstructed from
  Ruby nesting; if context lines do not match the target tree, apply with
  `git apply --ignore-whitespace`.

 marc_to_solr/lib/genre.rb           | 64 ++++++++++++++++++++-
 spec/marc_to_solr/lib/genre_spec.rb | 89 +++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/marc_to_solr/lib/genre.rb b/marc_to_solr/lib/genre.rb
index e811e380..cd62fe4b 100644
--- a/marc_to_solr/lib/genre.rb
+++ b/marc_to_solr/lib/genre.rb
@@ -12,8 +12,8 @@ def initialize(record)
   # 655 $a, $v, $x filtered
   def to_a
     @as_array ||= (
-      genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v
-    ).uniq
+      genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v + genre_from_primary_source_mapping + genres_from_autobiography
+    ).compact.uniq
   end
 
   private
@@ -60,6 +60,43 @@ def genres_from_subfield_v
       end
     end
 
+    def genre_from_primary_source_mapping
+      potential_genres = Traject::MarcExtractor.cached('600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|vx:651|*0|vx:655|*0|a:655|*0|vx').collect_matching_lines(record) do |field, spec, extractor|
+        extractor.collect_subfields(field, spec)
+      end
+      if potential_genres.any? { |genre| genre_term_indicates_primary_source? genre }
+        ['Primary source']
+      else
+        []
+      end
+    end
+
+    def genres_from_autobiography
+      if biography? && author_matches_subject?
+        ['Primary source']
+      else
+        []
+      end
+    end
+
+    def genre_term_indicates_primary_source?(genre)
+      normalized_genre = genre.downcase.strip.delete_suffix('.')
+      primary_source_genres.include? normalized_genre
+    end
+
+    def biography?
+      potential_genres = Traject::MarcExtractor.cached('600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|avx:651|*0|vx:655|*0|avx').collect_matching_lines(record) do |field, spec, extractor|
+        extractor.collect_subfields(field, spec)
+      end
+      potential_genres.include?('Biography')
+    end
+
+    def author_matches_subject?
+      authors = Traject::MarcExtractor.cached('100abcdjq').extract(record).uniq.map { |name| Traject::Macros::Marc21.trim_punctuation name.downcase.strip }
+      name_subjects = Traject::MarcExtractor.cached('600abcdjq').extract(record).uniq.map { |name| Traject::Macros::Marc21.trim_punctuation name.downcase.strip }
+      authors.any? { |author| name_subjects.include? author }
+    end
+
     def likely_genre_term term
       genre_terms.include?(term) || genre_starting_terms.any? { |potential| term.start_with? potential }
     end
@@ -109,4 +146,27 @@ def genre_starting_terms
         'Translations into '
       ]
     end
+
+    def primary_source_genres
+      [
+        'archival resources',
+        'archives',
+        'charters',
+        'correspondence',
+        'diaries',
+        'documents',
+        'early works',
+        'interview',
+        'interviews',
+        'letters',
+        'manuscripts',
+        'notebooks, sketchbooks, etc',
+        'oral history',
+        'oratory',
+        'pamphlets',
+        'personal narratives',
+        'sources',
+        'speeches'
+      ]
+    end
 end
diff --git a/spec/marc_to_solr/lib/genre_spec.rb b/spec/marc_to_solr/lib/genre_spec.rb
index 16ca6f74..e4d372f9 100644
--- a/spec/marc_to_solr/lib/genre_spec.rb
+++ b/spec/marc_to_solr/lib/genre_spec.rb
@@ -43,5 +43,94 @@
       expect(genres).not_to include("Join")
       expect(genres).not_to include("Dramatic renditon")
     end
+
+    it 'defaults to not including Primary source' do
+      expect(genres).not_to include("Primary source")
+    end
+
+    context 'when the 650 subfield x has Correspondence' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Authors" }, { "x" => "Correspondence" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 600 subfield x has Notebooks, sketchbooks, etc.' do
+      let(:genres) do
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Magallanes, Alejandro" }, { "x" => "Notebooks, sketchbooks, etc." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g600])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when there is an extra space after the period "etc. "' do
+      let(:genres) do
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Magallanes, Alejandro" }, { "x" => "Notebooks, sketchbooks, etc. " }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g600])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield v has Archival resources' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Tales" }, { "z" => "Argentina" }, { "v" => "Archival resources." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield a is Biography' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary source' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield a is Biography and the 600 and 100 match' do
+      let(:genres) do
+        g100 = { "100" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Wheaton, Wil," }, { "e" => "author" }] } }
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Wheaton, Wil." }] } }
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g100, g600, g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary source' do
+        expect(genres).to include 'Primary source'
+      end
+    end
+    context 'when the 651 subfield v is Biography' do
+      let(:genres) do
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "New York (N.Y.)" }, { "v" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary source' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 651 subfield v is Biography, and the 600 and 100 match' do
+      let(:genres) do
+        g100 = { "100" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Gornick, Vivian." }, { "0" => "http://id.loc.gov/authorities/names/n83057391" }] } }
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Gornick, Vivian." }] } }
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "New York (N.Y.)" }, { "v" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g100, g600, g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary source' do
+        expect(genres).to contain_exactly 'Primary source', 'Biography'
+      end
+    end
   end
 end