Skip to content

Commit

Permalink
Refactor genre extraction into its own class (#2509)
Browse files Browse the repository at this point in the history
Helps with #2508
  • Loading branch information
sandbergja authored Oct 7, 2024
1 parent 2ed8452 commit 9f0a933
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 132 deletions.
97 changes: 97 additions & 0 deletions marc_to_solr/lib/genre.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
require 'logger'

# This class is responsible for listing the
# genres present in a given MARC record.
#
# Genres are gathered from three groups of subject fields, in this order:
#
# 1. $x subdivisions of 600/610/611/630/650/651/655 fields with a second
#    indicator of 0, kept only when they appear in GENRES or contain one of
#    the GENRE_STARTS_WITH terms.
# 2. 650 $v and 655 $a/$v with a second indicator of 7, kept only when the
#    field's $2 names one of the SUBJECT_GENRE_VOCABULARIES.
# 3. $v of 600/610/611/630/650/651 and $a/$v of 655 with a second indicator
#    of 0, kept unconditionally.
class Genre
  # $x subdivisions that count as a genre when they match exactly
  # (after punctuation trimming).
  GENRES = [
    'Bibliography',
    'Biography',
    'Catalogs',
    'Catalogues raisonnes',
    'Commentaries',
    'Congresses',
    'Diaries',
    'Dictionaries',
    'Drama',
    'Encyclopedias',
    'Exhibitions',
    'Fiction',
    'Guidebooks',
    'In art',
    'Indexes',
    'Librettos',
    'Manuscripts',
    'Newspapers',
    'Periodicals',
    'Pictorial works',
    'Poetry',
    'Portraits',
    'Scores',
    'Songs and music',
    'Sources',
    'Statistics',
    'Texts',
    'Translations'
  ].freeze

  # $x subdivisions that count as a genre when they contain one of these terms.
  # NOTE(review): despite the name, the check is a substring match anywhere in
  # the value (String#[]), not a prefix match. Kept as-is to avoid changing
  # which values are indexed.
  GENRE_STARTS_WITH = [
    'Census',
    'Maps',
    'Methods',
    'Parts',
    'Personal narratives',
    'Scores and parts',
    'Study and teaching',
    'Translations into '
  ].freeze

  # $2 source codes whose headings are accepted for second-indicator-7 fields.
  SUBJECT_GENRE_VOCABULARIES = ['sk', 'aat', 'lcgft', 'rbbin', 'rbgenr', 'rbmscv',
                                'rbpap', 'rbpri', 'rbprov', 'rbpub', 'rbtyp', 'homoit'].freeze

  # MarcExtractor specifications for the three groups described on the class.
  FILTERED_SUBDIVISION_SPEC = '600|*0|x:610|*0|x:611|*0|x:630|*0|x:650|*0|x:651|*0|x:655|*0|x'.freeze
  VOCABULARY_SPEC = '650|*7|v:655|*7|a:655|*7|v'.freeze
  UNFILTERED_SPEC = '600|*0|v:610|*0|v:611|*0|v:630|*0|v:650|*0|v:651|*0|v:655|*0|a:655|*0|v'.freeze
  private_constant :FILTERED_SUBDIVISION_SPEC, :VOCABULARY_SPEC, :UNFILTERED_SPEC

  # @param record the MARC record to read genres from
  # @param logger [#error, nil] receives a message for every blank genre
  #   value; when omitted, a Logger writing to $stderr is created on first use
  def initialize(record, logger: nil)
    @record = record
    @logger = logger
  end

  # 600/610/650/651 $v, $x filtered
  # 655 $a, $v, $x filtered
  #
  # @return [Array<String>] the distinct genres, in the order they were found
  def to_a
    (filtered_subdivision_genres + vocabulary_genres + unfiltered_genres).uniq
  end

  private

  attr_reader :record

  # The class used to call `logger` without defining it, which raised
  # NameError as soon as a blank genre was encountered.
  def logger
    @logger ||= Logger.new($stderr)
  end

  # Yields each matching field with the punctuation-trimmed first value
  # extracted for it; lines that yield no value are skipped.
  def each_trimmed_value(spec)
    Traject::MarcExtractor.cached(spec).collect_matching_lines(record) do |field, field_spec, extractor|
      value = extractor.collect_subfields(field, field_spec).first
      next if value.nil?

      yield field, Traject::Macros::Marc21.trim_punctuation(value)
    end
  end

  # $x subdivisions that are on the allow-lists.
  def filtered_subdivision_genres
    genres = []
    each_trimmed_value(FILTERED_SUBDIVISION_SPEC) do |_field, genre|
      genres << genre if recognized_subdivision?(genre)
    end
    genres
  end

  # Second-indicator-7 headings whose $2 is an approved vocabulary.
  def vocabulary_genres
    genres = []
    each_trimmed_value(VOCABULARY_SPEC) do |field, genre|
      if blank?(genre)
        report_blank
      elsif approved_vocabulary?(field)
        genres << genre
      end
    end
    genres
  end

  # Second-indicator-0 $v (and 655 $a) headings, taken as they are.
  def unfiltered_genres
    genres = []
    each_trimmed_value(UNFILTERED_SPEC) do |_field, genre|
      if blank?(genre)
        report_blank
      else
        genres << genre
      end
    end
    genres
  end

  def recognized_subdivision?(genre)
    GENRES.include?(genre) || GENRE_STARTS_WITH.any? { |term| genre[term] }
  end

  # Only include a heading if it is part of an approved vocabulary. When a
  # field carries several $2 subfields the last one decides.
  def approved_vocabulary?(field)
    source = field.subfields.select { |subfield| subfield.code == '2' }.last
    !source.nil? && SUBJECT_GENRE_VOCABULARIES.include?(source.value)
  end

  # True for empty and whitespace-only values.
  # NOTE(review): trim_punctuation appears to strip surrounding whitespace, so
  # an all-blank subfield presumably arrives here as "" - which the previous
  # /^\s+$/ check let through as a genre. Confirm against Traject.
  def blank?(genre)
    genre.strip.empty?
  end

  def report_blank
    logger.error "#{record['001']} - Blank genre field"
  end
end
86 changes: 0 additions & 86 deletions marc_to_solr/lib/princeton_marc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -432,92 +432,6 @@ def remove_parens_035 standard_no
standard_no.gsub(/^\(.*?\)/, '')
end

# $x subdivisions that count as a genre when they match exactly.
# Frozen so the shared list cannot be mutated at runtime.
GENRES = [
  'Bibliography',
  'Biography',
  'Catalogs',
  'Catalogues raisonnes',
  'Commentaries',
  'Congresses',
  'Diaries',
  'Dictionaries',
  'Drama',
  'Encyclopedias',
  'Exhibitions',
  'Fiction',
  'Guidebooks',
  'In art',
  'Indexes',
  'Librettos',
  'Manuscripts',
  'Newspapers',
  'Periodicals',
  'Pictorial works',
  'Poetry',
  'Portraits',
  'Scores',
  'Songs and music',
  'Sources',
  'Statistics',
  'Texts',
  'Translations'
].freeze

# $x subdivisions that count as a genre when they contain one of these terms.
GENRE_STARTS_WITH = [
  'Census',
  'Maps',
  'Methods',
  'Parts',
  'Personal narratives',
  'Scores and parts',
  'Study and teaching',
  'Translations into '
].freeze

# $2 source codes whose headings are accepted for second-indicator-7 fields.
SUBJECT_GENRE_VOCABULARIES = ['sk', 'aat', 'lcgft', 'rbbin', 'rbgenr', 'rbmscv',
                              'rbpap', 'rbpri', 'rbprov', 'rbpub', 'rbtyp', 'homoit'].freeze

# Builds the list of distinct genre values for a MARC record.
# 600/610/650/651 $v, $x filtered
# 655 $a, $v, $x filtered
def process_genre_facet(record)
  found = []
  whitespace_only = /^\s+$/

  # $x subdivisions: kept only when they are on one of the allow-lists.
  subdivision_spec = '600|*0|x:610|*0|x:611|*0|x:630|*0|x:650|*0|x:651|*0|x:655|*0|x'
  Traject::MarcExtractor.cached(subdivision_spec).collect_matching_lines(record) do |field, spec, extractor|
    raw = extractor.collect_subfields(field, spec).first
    next if raw.nil?

    term = Traject::Macros::Marc21.trim_punctuation(raw)
    allowed = GENRES.include?(term) || GENRE_STARTS_WITH.any? { |fragment| term[fragment] }
    found << term if allowed
  end

  # Second indicator 7: only include the heading if its $2 is part of the
  # vocabulary; when several $2 subfields exist the last one decides.
  vocabulary_spec = '650|*7|v:655|*7|a:655|*7|v'
  Traject::MarcExtractor.cached(vocabulary_spec).collect_matching_lines(record) do |field, spec, extractor|
    sources = field.subfields.select { |subfield| subfield.code == '2' }
    approved = !sources.empty? && SUBJECT_GENRE_VOCABULARIES.include?(sources.last.value)
    raw = extractor.collect_subfields(field, spec).first
    next if raw.nil?

    term = Traject::Macros::Marc21.trim_punctuation(raw)
    if term.match?(whitespace_only)
      logger.error "#{record['001']} - Blank genre field"
    elsif approved
      found << term
    end
  end

  # $v (and 655 $a) with second indicator 0: kept unless blank.
  form_spec = '600|*0|v:610|*0|v:611|*0|v:630|*0|v:650|*0|v:651|*0|v:655|*0|a:655|*0|v'
  Traject::MarcExtractor.cached(form_spec).collect_matching_lines(record) do |field, spec, extractor|
    raw = extractor.collect_subfields(field, spec).first
    next if raw.nil?

    term = Traject::Macros::Marc21.trim_punctuation(raw)
    if term.match?(whitespace_only)
      logger.error "#{record['001']} - Blank genre field"
    else
      found << term
    end
  end

  found.uniq
end

def everything_after_t record, fields
values = []
Traject::MarcExtractor.cached(fields).collect_matching_lines(record) do |field, _spec, _extractor|
Expand Down
4 changes: 2 additions & 2 deletions marc_to_solr/lib/traject_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
require 'bundler/setup'
require 'change_the_subject'
require_relative './format'
require_relative './genre'
require_relative './princeton_marc'
require_relative './geo'
require_relative './electronic_portfolio_builder'
Expand Down Expand Up @@ -1023,8 +1024,7 @@
# 600/610/650/651 $v, $x filtered
# 655 $a, $v, $x filtered
# Fills the 'genre_facet' field by replacing the accumulator's contents with
# the genre list computed for the record.
# NOTE(review): this view appears to show both the pre-refactor lines
# (process_genre_facet) and the post-refactor line (Genre#to_a) of the diff;
# presumably only one form exists in the actual file - confirm.
to_field 'genre_facet' do |record, accumulator|
genres = process_genre_facet(record)
accumulator.replace(genres)
accumulator.replace(Genre.new(record).to_a)
end

# Related name(s):
Expand Down
47 changes: 47 additions & 0 deletions spec/marc_to_solr/lib/genre_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
require 'rails_helper'

RSpec.describe Genre do
  describe '#to_a' do
    # Genres extracted from a synthetic record that has one field for each
    # rule: filtered $x, unfiltered $v / 655 $a, and second-indicator-7
    # fields with and without an approved $2 vocabulary.
    let(:genres) do
      g600 = { "600" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Exclude" }, { "v" => "John" }, { "x" => "Join" }] } }
      g630 = { "630" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "x" => "Fiction." }] } }
      g655 = { "655" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Culture." }, { "x" => "Dramatic rendition" }, { "v" => "Awesome" }] } }
      g655_2 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Poetry" }, { "x" => "Translations into French" }, { "v" => "Maps" }] } }
      g655_3 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Manuscript" }, { "x" => "Translations into French" }, { "v" => "Genre" }, { "2" => "rbgenr" }] } }
      sample_marc = MARC::Record.new_from_hash('fields' => [g600, g630, g655, g655_2, g655_3])
      described_class.new(sample_marc).to_a
    end

    it 'trims punctuation' do
      expect(genres).to include("Culture")
    end

    it 'excludes $a when not 655' do
      expect(genres).not_to include("Exclude")
    end

    it 'excludes 2nd indicator of 7 if vocab type is not in approved list' do
      expect(genres).not_to include("Maps")
      expect(genres).not_to include("Poetry")
    end

    it 'includes 2nd indicator of 7 if vocab type is in approved list' do
      expect(genres).to include("Manuscript")
      expect(genres).to include("Genre")
    end

    it 'includes 6xx $v and 655 $a' do
      expect(genres).to include("John")
      expect(genres).to include("Awesome")
    end

    it 'includes 6xx $x from filtered in terms' do
      expect(genres).to include("Fiction")
    end

    it 'excludes $x terms that do not match filter list' do
      expect(genres).not_to include("Join")
      # Must match the fixture's 655 $x value exactly; the earlier misspelling
      # ("renditon") made this expectation pass no matter what was extracted.
      expect(genres).not_to include("Dramatic rendition")
    end
  end
end
44 changes: 0 additions & 44 deletions spec/marc_to_solr/lib/princeton_marc_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -504,50 +504,6 @@ def fixture_record(fixture_name)
end
end

describe 'process_genre_facet function' do
  # Builds one synthetic record covering each extraction rule and computes
  # its genres once for all examples in this group.
  before(:all) do
    @g600 = { "600" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Exclude" }, { "v" => "John" }, { "x" => "Join" }] } }
    @g630 = { "630" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "x" => "Fiction." }] } }
    @g655 = { "655" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Culture." }, { "x" => "Dramatic rendition" }, { "v" => "Awesome" }] } }
    @g655_2 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Poetry" }, { "x" => "Translations into French" }, { "v" => "Maps" }] } }
    @g655_3 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Manuscript" }, { "x" => "Translations into French" }, { "v" => "Genre" }, { "2" => "rbgenr" }] } }
    @sample_marc = MARC::Record.new_from_hash('fields' => [@g600, @g630, @g655, @g655_2, @g655_3])
    @genres = process_genre_facet(@sample_marc)
  end

  it 'trims punctuation' do
    expect(@genres).to include("Culture")
  end

  it 'excludes $a when not 655' do
    expect(@genres).not_to include("Exclude")
  end

  it 'excludes 2nd indicator of 7 if vocab type is not in approved list' do
    expect(@genres).not_to include("Maps")
    expect(@genres).not_to include("Poetry")
  end

  it 'includes 2nd indicator of 7 if vocab type is in approved list' do
    expect(@genres).to include("Manuscript")
    expect(@genres).to include("Genre")
  end

  it 'includes 6xx $v and 655 $a' do
    expect(@genres).to include("John")
    expect(@genres).to include("Awesome")
  end

  it 'includes 6xx $x from filtered in terms' do
    expect(@genres).to include("Fiction")
  end

  it 'excludes $x terms that do not match filter list' do
    expect(@genres).not_to include("Join")
    # Must match the fixture's 655 $x value exactly; the earlier misspelling
    # ("renditon") made this expectation pass no matter what was extracted.
    expect(@genres).not_to include("Dramatic rendition")
  end
end

describe 'process_hierarchy function' do
before(:all) do
@s610_ind2_5 = { "600" => { "ind1" => "", "ind2" => "5", "subfields" => [{ "a" => "Exclude" }] } }
Expand Down

0 comments on commit 9f0a933

Please sign in to comment.