Skip to content

Commit

Permalink
[#2508] Index primary sources as a facet
Browse files Browse the repository at this point in the history
  • Loading branch information
sandbergja committed Oct 9, 2024
1 parent 2adad52 commit 0261787
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 2 deletions.
64 changes: 62 additions & 2 deletions marc_to_solr/lib/genre.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def initialize(record)
# 655 $a, $v, $x filtered
# Returns the genre terms for the record as an array, memoized in @as_array.
# NOTE(review): this span comes from a rendered diff hunk (-12,8 +12,8), so it
# appears to show both the removed expression (closing with `).uniq`) and its
# replacement (closing with `).compact.uniq`). The replacement additionally
# appends genre_from_primary_source_mapping and genres_from_autobiography and
# drops nil entries before de-duplicating — confirm against the real file.
def to_a
@as_array ||= (
genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v
).uniq
genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v + genre_from_primary_source_mapping + genres_from_autobiography
).compact.uniq
end

private
Expand Down Expand Up @@ -60,6 +60,43 @@ def genres_from_subfield_v
end
end

# Flags the record as a primary source when any $v/$x subdivision of a 600, 610,
# 611, 630, 650 or 651 field, or any $a/$v/$x of a 655 field, is a term accepted
# by genre_term_indicates_primary_source?.
#
# @return [Array<String>] ['Primary source'] when a term matches, otherwise []
def genre_from_primary_source_mapping
  # separator: nil keeps every subfield as its own candidate term. With the
  # default separator, a field carrying both $x and $v (e.g. "History" and
  # "Sources") is joined into one string ("History Sources") that can never
  # equal a single term from the mapping.
  extractor = Traject::MarcExtractor.cached(
    '600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|vx:651|*0|vx:655|*0|a:655|*0|vx',
    separator: nil
  )
  potential_genres = extractor.collect_matching_lines(record) do |field, spec, marc_extractor|
    marc_extractor.collect_subfields(field, spec)
  end
  potential_genres.any? { |genre| genre_term_indicates_primary_source?(genre) } ? ['Primary source'] : []
end

# A biography whose author is also one of its name subjects is treated as an
# autobiography, and therefore as a primary source.
#
# @return [Array<String>] ['Primary source'] for an autobiography, otherwise []
def genres_from_autobiography
  return [] unless biography? && author_matches_subject?

  ['Primary source']
end

# Checks a raw term against primary_source_genres after trimming surrounding
# whitespace, lower-casing it and dropping one trailing period.
#
# @param genre [String] term as extracted from the record
# @return [Boolean]
def genre_term_indicates_primary_source?(genre)
  primary_source_genres.include?(genre.strip.downcase.chomp('.'))
end

# Whether any $v/$x subdivision of a 600, 610, 611, 630 or 651 field, or any
# $a/$v/$x of a 650 or 655 field, is the term "Biography".
#
# @return [Boolean]
def biography?
  # separator: nil keeps each subfield as its own candidate; otherwise a field
  # such as $a "Authors, American" $v "Biography" is joined into one string
  # that never equals the bare term.
  extractor = Traject::MarcExtractor.cached(
    '600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|avx:651|*0|vx:655|*0|avx',
    separator: nil
  )
  potential_genres = extractor.collect_matching_lines(record) do |field, spec, marc_extractor|
    marc_extractor.collect_subfields(field, spec)
  end
  # Same normalization as the primary-source term check, so "Biography." and
  # padded values are recognized too.
  potential_genres.any? { |genre| genre.downcase.strip.delete_suffix('.') == 'biography' }
end

# Whether any 100 name (subfields a, b, c, d, j, q) equals any 600 name built
# from the same subfields, comparing lower-cased, stripped values with trailing
# punctuation trimmed.
#
# @return [Boolean]
def author_matches_subject?
  normalized_names = lambda do |spec|
    Traject::MarcExtractor.cached(spec).extract(record).uniq.map do |name|
      Traject::Macros::Marc21.trim_punctuation(name.downcase.strip)
    end
  end

  authors = normalized_names.call('100abcdjq')
  name_subjects = normalized_names.call('600abcdjq')
  !(authors & name_subjects).empty?
end

# A term is a likely genre term when it appears in genre_terms or begins with
# one of the prefixes in genre_starting_terms.
#
# @param term [String]
# @return [Boolean]
def likely_genre_term(term)
  return true if genre_terms.include?(term)

  genre_starting_terms.any? { |prefix| term.start_with?(prefix) }
end
Expand Down Expand Up @@ -109,4 +146,27 @@ def genre_starting_terms
'Translations into '
]
end

# Normalized (lower-case, no trailing period) terms that mark a record as a
# primary source. Built once and frozen so the per-term lookup does not
# allocate a new array on every call.
PRIMARY_SOURCE_GENRES = [
  'archival resources',
  'archives',
  'charters',
  'correspondence',
  'diaries',
  'documents',
  'early works',
  'interview',
  'interviews',
  'letters',
  'manuscripts',
  'notebooks, sketchbooks, etc',
  'oral history',
  'oratory',
  'pamphlets',
  'personal narratives',
  'sources',
  'speeches'
].freeze

# @return [Array<String>] the frozen list of primary-source terms
def primary_source_genres
  PRIMARY_SOURCE_GENRES
end
end
89 changes: 89 additions & 0 deletions spec/marc_to_solr/lib/genre_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,94 @@
expect(genres).not_to include("Join")
expect(genres).not_to include("Dramatic renditon")
end

# `genres` here is whatever the enclosing example group defines (outside this excerpt).
it 'defaults to not including Primary source' do
  expect(genres).not_to include('Primary source')
end

context 'when the 650 subfield x has Correspondence' do
  let(:genres) do
    # ind1 is a single blank (not an empty string), as in the other 650/651 fixtures of this spec.
    g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Authors" }, { "x" => "Correspondence" }] } }
    sample_marc = MARC::Record.new_from_hash('fields' => [g650])
    described_class.new(sample_marc).to_a
  end

  it 'includes Primary Source in the list of genres' do
    expect(genres).to include('Primary source')
  end
end
context 'when the 600 subfield x has Notebooks, sketchbooks, etc.' do
  # A 600 whose $x carries the term with its trailing period.
  let(:genres) do
    name_subject = {
      '600' => {
        'ind1' => '1',
        'ind2' => '0',
        'subfields' => [{ 'a' => 'Magallanes, Alejandro' }, { 'x' => 'Notebooks, sketchbooks, etc.' }]
      }
    }
    described_class.new(MARC::Record.new_from_hash('fields' => [name_subject])).to_a
  end

  it 'includes Primary Source in the list of genres' do
    expect(genres).to include('Primary source')
  end
end
context 'when there is an extra space after the period "etc. "' do
  # Same 600 as the plain "etc." case, but the $x value ends with a space after the period.
  let(:genres) do
    name_subject = {
      '600' => {
        'ind1' => '1',
        'ind2' => '0',
        'subfields' => [{ 'a' => 'Magallanes, Alejandro' }, { 'x' => 'Notebooks, sketchbooks, etc. ' }]
      }
    }
    described_class.new(MARC::Record.new_from_hash('fields' => [name_subject])).to_a
  end

  it 'includes Primary Source in the list of genres' do
    expect(genres).to include('Primary source')
  end
end
context 'when the 650 subfield v has Archival resources' do
  # The matching term sits in $v and ends with a period.
  let(:genres) do
    topical_subject = {
      '650' => {
        'ind1' => ' ',
        'ind2' => '0',
        'subfields' => [{ 'a' => 'Tales' }, { 'z' => 'Argentina' }, { 'v' => 'Archival resources.' }]
      }
    }
    described_class.new(MARC::Record.new_from_hash('fields' => [topical_subject])).to_a
  end

  it 'includes Primary Source in the list of genres' do
    expect(genres).to include('Primary source')
  end
end
context 'when the 650 subfield a is Biography' do
  # A Biography heading with no 100/600 fields in the record.
  let(:genres) do
    biography_subject = {
      '650' => { 'ind1' => ' ', 'ind2' => '0', 'subfields' => [{ 'a' => 'Biography' }] }
    }
    described_class.new(MARC::Record.new_from_hash('fields' => [biography_subject])).to_a
  end

  it 'does not include Primary source' do
    expect(genres).not_to include('Primary source')
  end
end
context 'when the 650 subfield a is Biography and the 600 and 100 match' do
  # The 100 and 600 names differ only in trailing punctuation ("," vs ".").
  let(:genres) do
    author = {
      '100' => { 'ind1' => '1', 'ind2' => '0', 'subfields' => [{ 'a' => 'Wheaton, Wil,' }, { 'e' => 'author' }] }
    }
    name_subject = {
      '600' => { 'ind1' => '1', 'ind2' => '0', 'subfields' => [{ 'a' => 'Wheaton, Wil.' }] }
    }
    biography_subject = {
      '650' => { 'ind1' => ' ', 'ind2' => '0', 'subfields' => [{ 'a' => 'Biography' }] }
    }
    fields = [author, name_subject, biography_subject]
    described_class.new(MARC::Record.new_from_hash('fields' => fields)).to_a
  end

  it 'includes Primary source' do
    expect(genres).to include('Primary source')
  end
end
context 'when the 651 subfield v is Biography' do
  # Biography appears as a $v subdivision, with no 100/600 fields in the record.
  let(:genres) do
    place_subject = {
      '651' => {
        'ind1' => ' ',
        'ind2' => '0',
        'subfields' => [{ 'a' => 'New York (N.Y.)' }, { 'v' => 'Biography' }]
      }
    }
    described_class.new(MARC::Record.new_from_hash('fields' => [place_subject])).to_a
  end

  it 'does not include Primary source' do
    expect(genres).not_to include('Primary source')
  end
end
context 'when the 651 subfield v is Biography, and the 600 and 100 match' do
  # The 100 also carries a $0 URI; the 100 and 600 share the same $a value.
  let(:genres) do
    author = {
      '100' => {
        'ind1' => '1',
        'ind2' => '0',
        'subfields' => [{ 'a' => 'Gornick, Vivian.' }, { '0' => 'http://id.loc.gov/authorities/names/n83057391' }]
      }
    }
    name_subject = {
      '600' => { 'ind1' => '1', 'ind2' => '0', 'subfields' => [{ 'a' => 'Gornick, Vivian.' }] }
    }
    place_subject = {
      '651' => {
        'ind1' => ' ',
        'ind2' => '0',
        'subfields' => [{ 'a' => 'New York (N.Y.)' }, { 'v' => 'Biography' }]
      }
    }
    fields = [author, name_subject, place_subject]
    described_class.new(MARC::Record.new_from_hash('fields' => fields)).to_a
  end

  it 'includes Primary source' do
    expect(genres).to contain_exactly('Primary source', 'Biography')
  end
end
end
end

0 comments on commit 0261787

Please sign in to comment.