[#2508] Index primary sources as a facet

pulibrary · Oct 9, 2024 · 0261787 · 0261787
1 parent 2adad52
commit 0261787
Show file tree

Hide file tree

Showing 2 changed files with 151 additions and 2 deletions.
diff --git a/marc_to_solr/lib/genre.rb b/marc_to_solr/lib/genre.rb
@@ -12,8 +12,8 @@ def initialize(record)
   # 655 $a, $v, $x filtered
   def to_a
     @as_array ||= (
-      genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v
-    ).uniq
+      genres_from_subfield_x + genres_from_subject_vocabularies + genres_from_subfield_v + genre_from_primary_source_mapping + genres_from_autobiography # the last two helpers each contribute ['Primary source'] or []
+    ).compact.uniq # nils are dropped, then duplicates (e.g. 'Primary source' from both helpers) are collapsed
   end
 
   private
@@ -60,6 +60,43 @@ def genres_from_subfield_v
       end
     end
 
+    def genre_from_primary_source_mapping
+      potential_genres = Traject::MarcExtractor.cached('600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|vx:651|*0|vx:655|*0|a:655|*0|vx').collect_matching_lines(record) do |field, spec, extractor|
+        extractor.collect_subfields(field, spec)
+      end
+      if potential_genres.any? { |genre| genre_term_indicates_primary_source? genre }
+        ['Primary source']
+      else
+        []
+      end
+    end
+
+    def genres_from_autobiography
+      if biography? && author_matches_subject?
+        ['Primary source']
+      else
+        []
+      end
+    end
+
+    def genre_term_indicates_primary_source?(genre)
+      normalized_genre = genre.downcase.strip.delete_suffix('.')
+      primary_source_genres.include? normalized_genre
+    end
+
+    def biography?
+      potential_genres = Traject::MarcExtractor.cached('600|*0|vx:610|*0|vx:611|*0|vx:630|*0|vx:650|*0|avx:651|*0|vx:655|*0|avx').collect_matching_lines(record) do |field, spec, extractor|
+        extractor.collect_subfields(field, spec)
+      end
+      potential_genres.include?('Biography')
+    end
+
+    def author_matches_subject?
+      authors = Traject::MarcExtractor.cached('100abcdjq').extract(record).uniq.map { |name| Traject::Macros::Marc21.trim_punctuation name.downcase.strip }
+      name_subjects = Traject::MarcExtractor.cached('600abcdjq').extract(record).uniq.map { |name| Traject::Macros::Marc21.trim_punctuation name.downcase.strip }
+      authors.any? { |author| name_subjects.include? author }
+    end
+
     def likely_genre_term(term) # true for a listed genre term, or a term beginning with a listed genre prefix
       genre_terms.include?(term) || genre_starting_terms.any? { |prefix| term.start_with?(prefix) }
     end
@@ -109,4 +146,27 @@ def genre_starting_terms
         'Translations into '
       ]
     end
+
+    def primary_source_genres # terms that flag a primary source; compared by genre_term_indicates_primary_source?
+      [ # entries are lowercase with no trailing period, matching that method's downcase/strip/delete_suffix('.')
+        'archival resources',
+        'archives',
+        'charters',
+        'correspondence',
+        'diaries',
+        'documents',
+        'early works',
+        'interview',
+        'interviews',
+        'letters',
+        'manuscripts',
+        'notebooks, sketchbooks, etc', # final period omitted because the normalized term has it removed
+        'oral history',
+        'oratory',
+        'pamphlets',
+        'personal narratives',
+        'sources',
+        'speeches'
+      ]
+    end
 end
diff --git a/spec/marc_to_solr/lib/genre_spec.rb b/spec/marc_to_solr/lib/genre_spec.rb
@@ -43,5 +43,94 @@
       expect(genres).not_to include("Join")
       expect(genres).not_to include("Dramatic renditon")
     end
+
+    it 'defaults to not including Primary source' do # uses a genres fixture defined outside this hunk — presumably the enclosing let(:genres); confirm
+      expect(genres).not_to include("Primary source")
+    end
+
+    context 'when the 650 subfield x has Correspondence' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Authors" }, { "x" => "Correspondence" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 600 subfield x has Notebooks, sketchbooks, etc.' do
+      let(:genres) do
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Magallanes, Alejandro" }, { "x" => "Notebooks, sketchbooks, etc." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g600])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when there is an extra space after the period "etc. "' do
+      let(:genres) do
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Magallanes, Alejandro" }, { "x" => "Notebooks, sketchbooks, etc. " }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g600])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield v has Archival resources' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Tales" }, { "z" => "Argentina" }, { "v" => "Archival resources." }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary Source in the list of genres' do
+        expect(genres).to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield a is Biography' do
+      let(:genres) do
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary source' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 650 subfield a is Biography and the 600 and 100 match' do
+      let(:genres) do
+        g100 = { "100" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Wheaton, Wil," }, { "e" => "author" }] } }
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Wheaton, Wil." }] } }
+        g650 = { "650" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g100, g600, g650])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary source' do
+        expect(genres).to include 'Primary source'
+      end
+    end
+    context 'when the 651 subfield v is Biography' do
+      let(:genres) do
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "New York (N.Y.)" }, { "v" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'does not include Primary source' do
+        expect(genres).not_to include('Primary source')
+      end
+    end
+    context 'when the 651 subfield v is Biography, and the 600 and 100 match' do
+      let(:genres) do
+        g100 = { "100" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Gornick, Vivian." }, { "0" => "http://id.loc.gov/authorities/names/n83057391" }] } }
+        g600 = { "600" => { "ind1" => "1", "ind2" => "0", "subfields" => [{ "a" => "Gornick, Vivian." }] } }
+        g651 = { "651" => { "ind1" => " ", "ind2" => "0", "subfields" => [{ "a" => "New York (N.Y.)" }, { "v" => "Biography" }] } }
+        sample_marc = MARC::Record.new_from_hash('fields' => [g100, g600, g651])
+        described_class.new(sample_marc).to_a
+      end
+      it 'includes Primary source' do
+        expect(genres).to contain_exactly 'Primary source', 'Biography'
+      end
+    end
   end
 end