Refactor genre extraction into its own class (#2509)

Helps with #2508
pulibrary · Oct 7, 2024 · 9f0a933 · 9f0a933
1 parent 2ed8452
commit 9f0a933
Show file tree

Hide file tree

Showing 5 changed files with 146 additions and 132 deletions.
diff --git a/marc_to_solr/lib/genre.rb b/marc_to_solr/lib/genre.rb
@@ -0,0 +1,126 @@
+require 'logger'
+
+# This class is responsible for listing the
+# genres present in a given MARC record
+class Genre
+  # $x terms accepted only on an exact match (after punctuation trimming).
+  GENRES = [
+    'Bibliography',
+    'Biography',
+    'Catalogs',
+    'Catalogues raisonnes',
+    'Commentaries',
+    'Congresses',
+    'Diaries',
+    'Dictionaries',
+    'Drama',
+    'Encyclopedias',
+    'Exhibitions',
+    'Fiction',
+    'Guidebooks',
+    'In art',
+    'Indexes',
+    'Librettos',
+    'Manuscripts',
+    'Newspapers',
+    'Periodicals',
+    'Pictorial works',
+    'Poetry',
+    'Portraits',
+    'Scores',
+    'Songs and music',
+    'Sources',
+    'Statistics',
+    'Texts',
+    'Translations'
+  ].freeze
+
+  # $x terms accepted when they occur anywhere inside the value. Despite the
+  # constant's name, the check below is a substring match, not a prefix match.
+  GENRE_STARTS_WITH = [
+    'Census',
+    'Maps',
+    'Methods',
+    'Parts',
+    'Personal narratives',
+    'Scores and parts',
+    'Study and teaching',
+    'Translations into '
+  ].freeze
+
+  # $2 values that qualify a second-indicator-7 heading for inclusion.
+  SUBJECT_GENRE_VOCABULARIES = ['sk', 'aat', 'lcgft', 'rbbin', 'rbgenr', 'rbmscv',
+                                'rbpap', 'rbpri', 'rbprov', 'rbpub', 'rbtyp', 'homoit'].freeze
+
+  # Traject extraction specs, one per pass of #to_a.
+  FILTERED_SPEC = '600|*0|x:610|*0|x:611|*0|x:630|*0|x:650|*0|x:651|*0|x:655|*0|x'.freeze
+  VOCABULARY_SPEC = '650|*7|v:655|*7|a:655|*7|v'.freeze
+  UNFILTERED_SPEC = '600|*0|v:610|*0|v:611|*0|v:630|*0|v:650|*0|v:651|*0|v:655|*0|a:655|*0|v'.freeze
+
+  # @param record the MARC record to inspect; it is passed to Traject::MarcExtractor unchanged
+  def initialize(record)
+    @record = record
+  end
+
+  # Collects genre terms in three passes and returns them de-duplicated, in
+  # first-seen order:
+  #   1. 600/610/611/630/650/651/655 (2nd indicator 0) $x, kept only when the
+  #      term is in GENRES or contains a GENRE_STARTS_WITH entry
+  #   2. 650 $v and 655 $a/$v (2nd indicator 7), kept only when the field's $2
+  #      is in SUBJECT_GENRE_VOCABULARIES
+  #   3. 600/610/611/630/650/651 $v and 655 $a/$v (2nd indicator 0), unfiltered
+  # @return [Array<String>]
+  def to_a
+    genres = []
+    each_genre(FILTERED_SPEC) { |genre, _field| genres << genre if filtered_term?(genre) }
+    each_genre(VOCABULARY_SPEC) do |genre, field|
+      # The blank check runs first, so a blank value is logged whatever its $2 says.
+      genres << genre if !blank_genre?(genre) && approved_vocabulary?(field)
+    end
+    each_genre(UNFILTERED_SPEC) { |genre, _field| genres << genre unless blank_genre?(genre) }
+    genres.uniq
+  end
+
+  private
+
+    attr_reader :record
+
+    # Yields the punctuation-trimmed first extracted value, plus its field, for
+    # every field/spec match; matches that yield no value are skipped.
+    def each_genre(spec_string)
+      Traject::MarcExtractor.cached(spec_string).collect_matching_lines(record) do |field, spec, extractor|
+        genre = extractor.collect_subfields(field, spec).first
+        yield Traject::Macros::Marc21.trim_punctuation(genre), field unless genre.nil?
+      end
+    end
+
+    # True when a $x term is on the exact-match list or contains a listed fragment.
+    def filtered_term?(genre)
+      GENRES.include?(genre) || GENRE_STARTS_WITH.any? { |fragment| genre[fragment] }
+    end
+
+    # True when the field's $2 names an approved vocabulary. With several $2
+    # subfields the last one decides; with none the heading is rejected.
+    def approved_vocabulary?(field)
+      source = field.subfields.select { |subfield| subfield.code == '2' }.last
+      !source.nil? && SUBJECT_GENRE_VOCABULARIES.include?(source.value)
+    end
+
+    # Logs and reports a whitespace-only genre value.
+    # NOTE(review): the pattern needs at least one whitespace character, so an
+    # empty string is not treated as blank - confirm that is intended.
+    def blank_genre?(genre)
+      return false unless genre.match?(/^\s+$/)
+
+      logger.error "#{record['001']} - Blank genre field"
+      true
+    end
+
+    # Nothing in this class supplied `logger`, so the blank-genre branch would
+    # have raised NameError. Presumably the pre-refactor code borrowed it from
+    # its calling context - confirm, and inject that logger here if it should
+    # be shared. Until then, fall back to a standard-library logger on stderr.
+    def logger
+      @logger ||= Logger.new($stderr)
+    end
+end
diff --git a/marc_to_solr/lib/princeton_marc.rb b/marc_to_solr/lib/princeton_marc.rb
@@ -432,92 +432,6 @@ def remove_parens_035 standard_no
   standard_no.gsub(/^\(.*?\)/, '')
 end
 
-GENRES = [
-  'Bibliography',
-  'Biography',
-  'Catalogs',
-  'Catalogues raisonnes',
-  'Commentaries',
-  'Congresses',
-  'Diaries',
-  'Dictionaries',
-  'Drama',
-  'Encyclopedias',
-  'Exhibitions',
-  'Fiction',
-  'Guidebooks',
-  'In art',
-  'Indexes',
-  'Librettos',
-  'Manuscripts',
-  'Newspapers',
-  'Periodicals',
-  'Pictorial works',
-  'Poetry',
-  'Portraits',
-  'Scores',
-  'Songs and music',
-  'Sources',
-  'Statistics',
-  'Texts',
-  'Translations'
-]
-
-GENRE_STARTS_WITH = [
-  'Census',
-  'Maps',
-  'Methods',
-  'Parts',
-  'Personal narratives',
-  'Scores and parts',
-  'Study and teaching',
-  'Translations into '
-]
-
-SUBJECT_GENRE_VOCABULARIES = ['sk', 'aat', 'lcgft', 'rbbin', 'rbgenr', 'rbmscv',
-                              'rbpap', 'rbpri', 'rbprov', 'rbpub', 'rbtyp', 'homoit']
-
-# 600/610/650/651 $v, $x filtered
-# 655 $a, $v, $x filtered
-def process_genre_facet record
-  genres = []
-  Traject::MarcExtractor.cached('600|*0|x:610|*0|x:611|*0|x:630|*0|x:650|*0|x:651|*0|x:655|*0|x').collect_matching_lines(record) do |field, spec, extractor|
-    genre = extractor.collect_subfields(field, spec).first
-    unless genre.nil?
-      genre = Traject::Macros::Marc21.trim_punctuation(genre)
-      genres << genre if GENRES.include?(genre) || GENRE_STARTS_WITH.any? { |g| genre[g] }
-    end
-  end
-  Traject::MarcExtractor.cached('650|*7|v:655|*7|a:655|*7|v').collect_matching_lines(record) do |field, spec, extractor|
-    should_include = false
-    field.subfields.each do |s_field|
-      # only include heading if it is part of the vocabulary
-      should_include = SUBJECT_GENRE_VOCABULARIES.include?(s_field.value) if s_field.code == '2'
-    end
-    genre = extractor.collect_subfields(field, spec).first
-    unless genre.nil?
-      genre = Traject::Macros::Marc21.trim_punctuation(genre)
-      if genre.match?(/^\s+$/)
-        logger.error "#{record['001']} - Blank genre field"
-      elsif should_include
-        genres << genre
-      end
-    end
-  end
-  Traject::MarcExtractor.cached('600|*0|v:610|*0|v:611|*0|v:630|*0|v:650|*0|v:651|*0|v:655|*0|a:655|*0|v').collect_matching_lines(record) do |field, spec, extractor|
-    genre = extractor.collect_subfields(field, spec).first
-    unless genre.nil?
-      genre = Traject::Macros::Marc21.trim_punctuation(genre)
-      if genre.match?(/^\s+$/)
-        logger.error "#{record['001']} - Blank genre field"
-      else
-        genres << genre
-      end
-    end
-  end
-  genres.uniq
-end
-
 def everything_after_t record, fields
   values = []
   Traject::MarcExtractor.cached(fields).collect_matching_lines(record) do |field, _spec, _extractor|

diff --git a/marc_to_solr/lib/traject_config.rb b/marc_to_solr/lib/traject_config.rb
@@ -7,6 +7,7 @@
 require 'bundler/setup'
 require 'change_the_subject'
 require_relative './format'
+require_relative './genre'
 require_relative './princeton_marc'
 require_relative './geo'
 require_relative './electronic_portfolio_builder'
@@ -1023,8 +1024,7 @@
 # 600/610/650/651 $v, $x filtered
 # 655 $a, $v, $x filtered
 to_field 'genre_facet' do |record, accumulator|
-  genres = process_genre_facet(record)
-  accumulator.replace(genres)
+  accumulator.replace(Genre.new(record).to_a)
 end
 
 # Related name(s):

diff --git a/spec/marc_to_solr/lib/genre_spec.rb b/spec/marc_to_solr/lib/genre_spec.rb
@@ -0,0 +1,47 @@
+require 'rails_helper'
+
+RSpec.describe Genre do
+  describe '#to_a' do
+    let(:genres) do
+      g600 = { "600" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Exclude" }, { "v" => "John" }, { "x" => "Join" }] } }
+      g630 = { "630" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "x" => "Fiction." }] } }
+      g655 = { "655" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Culture." }, { "x" => "Dramatic rendition" }, { "v" => "Awesome" }] } }
+      g655_2 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Poetry" }, { "x" => "Translations into French" }, { "v" => "Maps" }] } }
+      g655_3 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Manuscript" }, { "x" => "Translations into French" }, { "v" => "Genre" }, { "2" => "rbgenr" }] } }
+      sample_marc = MARC::Record.new_from_hash('fields' => [g600, g630, g655, g655_2, g655_3])
+      described_class.new(sample_marc).to_a
+    end
+
+    it 'trims punctuation' do
+      expect(genres).to include("Culture")
+    end
+
+    it 'excludes $a when not 655' do
+      expect(genres).not_to include("Exclude")
+    end
+
+    it 'excludes 2nd indicator of 7 if vocab type is not in approved list' do
+      expect(genres).not_to include("Maps")
+      expect(genres).not_to include("Poetry")
+    end
+
+    it 'includes 2nd indicator of 7 if vocab type is in approved list' do
+      expect(genres).to include("Manuscript")
+      expect(genres).to include("Genre")
+    end
+
+    it 'includes 6xx $v and 655 $v' do
+      expect(genres).to include("John")
+      expect(genres).to include("Awesome")
+    end
+
+    it 'includes 6xx $x from filtered in terms' do
+      expect(genres).to include("Fiction")
+    end
+
+    it 'excludes $x terms that do not match filter list' do
+      expect(genres).not_to include("Join")
+      expect(genres).not_to include("Dramatic rendition") # must equal the g655 $x fixture value, or the check is vacuous
+    end
+  end
+end
diff --git a/spec/marc_to_solr/lib/princeton_marc_spec.rb b/spec/marc_to_solr/lib/princeton_marc_spec.rb
@@ -504,50 +504,6 @@ def fixture_record(fixture_name)
     end
   end
 
-  describe 'process_genre_facet function' do
-    before(:all) do
-      @g600 = { "600" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Exclude" }, { "v" => "John" }, { "x" => "Join" }] } }
-      @g630 = { "630" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "x" => "Fiction." }] } }
-      @g655 = { "655" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Culture." }, { "x" => "Dramatic rendition" }, { "v" => "Awesome" }] } }
-      @g655_2 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Poetry" }, { "x" => "Translations into French" }, { "v" => "Maps" }] } }
-      @g655_3 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Manuscript" }, { "x" => "Translations into French" }, { "v" => "Genre" }, { "2" => "rbgenr" }] } }
-      @sample_marc = MARC::Record.new_from_hash('fields' => [@g600, @g630, @g655, @g655_2, @g655_3])
-      @genres = process_genre_facet(@sample_marc)
-    end
-
-    it 'trims punctuation' do
-      expect(@genres).to include("Culture")
-    end
-
-    it 'excludes $a when not 655' do
-      expect(@genres).not_to include("Exclude")
-    end
-
-    it 'excludes 2nd indicator of 7 if vocab type is not in approved list' do
-      expect(@genres).not_to include("Maps")
-      expect(@genres).not_to include("Poetry")
-    end
-
-    it 'includes 2nd indicator of 7 if vocab type is in approved list' do
-      expect(@genres).to include("Manuscript")
-      expect(@genres).to include("Genre")
-    end
-
-    it 'includes 6xx $v and 655 $a' do
-      expect(@genres).to include("John")
-      expect(@genres).to include("Awesome")
-    end
-
-    it 'includes 6xx $x from filtered in terms' do
-      expect(@genres).to include("Fiction")
-    end
-
-    it 'excludes $x terms that do not match filter list' do
-      expect(@genres).not_to include("Join")
-      expect(@genres).not_to include("Dramatic renditon")
-    end
-  end
-
   describe 'process_hierarchy function' do
     before(:all) do
       @s610_ind2_5 = { "600" => { "ind1" => "", "ind2" => "5", "subfields" => [{ "a" => "Exclude" }] } }