-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor genre extraction into its own class (#2509)
Helps with #2508
- Loading branch information
1 parent
2ed8452
commit 9f0a933
Showing
5 changed files
with
146 additions
and
132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# This class is responsible for listing the
# genres present in a given MARC record
class Genre
  # Terms accepted as a genre when they are the entire
  # (punctuation-trimmed) value of a $x subfield.
  GENRES = [
    'Bibliography',
    'Biography',
    'Catalogs',
    'Catalogues raisonnes',
    'Commentaries',
    'Congresses',
    'Diaries',
    'Dictionaries',
    'Drama',
    'Encyclopedias',
    'Exhibitions',
    'Fiction',
    'Guidebooks',
    'In art',
    'Indexes',
    'Librettos',
    'Manuscripts',
    'Newspapers',
    'Periodicals',
    'Pictorial works',
    'Poetry',
    'Portraits',
    'Scores',
    'Songs and music',
    'Sources',
    'Statistics',
    'Texts',
    'Translations'
  ].freeze

  # Terms accepted as a genre when they occur within a
  # (punctuation-trimmed) $x value.
  # NOTE(review): despite the constant's name the check is a substring
  # match, not a prefix match - confirm which one is intended.
  GENRE_STARTS_WITH = [
    'Census',
    'Maps',
    'Methods',
    'Parts',
    'Personal narratives',
    'Scores and parts',
    'Study and teaching',
    'Translations into '
  ].freeze

  # $2 codes whose headings are accepted from fields with 2nd indicator 7.
  SUBJECT_GENRE_VOCABULARIES = ['sk', 'aat', 'lcgft', 'rbbin', 'rbgenr', 'rbmscv',
                                'rbpap', 'rbpri', 'rbprov', 'rbpub', 'rbtyp', 'homoit'].freeze

  # @param record the MARC record to read genres from; it is handed to
  #   Traject::MarcExtractor and indexed with record['001'] for log messages
  # @param logger optional object responding to #error; when omitted a
  #   default is resolved lazily (Rails.logger if available, otherwise stderr)
  def initialize(record, logger: nil)
    @record = record
    @logger = logger
  end

  # Lists the distinct genres of the record, in order of first appearance.
  #
  # Three passes are made over the record:
  # 1. 600/610/611/630/650/651/655 (2nd indicator 0) $x, kept only when the
  #    value is in GENRES or contains a GENRE_STARTS_WITH term
  # 2. 650 $v and 655 $a/$v (2nd indicator 7), kept only when the field's $2
  #    is one of SUBJECT_GENRE_VOCABULARIES
  # 3. 600/610/611/630/650/651 $v and 655 $a/$v (2nd indicator 0), unfiltered
  #
  # Trailing punctuation is trimmed from every value. Values that are empty
  # or whitespace-only after trimming are logged as errors and skipped.
  #
  # @return [Array<String>]
  def to_a
    (filtered_subdivision_genres + vocabulary_genres + form_genres).uniq
  end

  private

  attr_reader :record

  # Pass 1: $x subdivisions that appear in the genre term lists.
  def filtered_subdivision_genres
    found = []
    each_trimmed_value('600|*0|x:610|*0|x:611|*0|x:630|*0|x:650|*0|x:651|*0|x:655|*0|x') do |_field, value|
      found << value if listed_genre?(value)
    end
    found
  end

  # Pass 2: headings whose source vocabulary ($2) is approved.
  def vocabulary_genres
    found = []
    each_trimmed_value('650|*7|v:655|*7|a:655|*7|v') do |field, value|
      if blank_genre?(value)
        log_blank_genre
      elsif approved_vocabulary?(field)
        found << value
      end
    end
    found
  end

  # Pass 3: form subdivisions and 655 headings, taken as they are.
  def form_genres
    found = []
    each_trimmed_value('600|*0|v:610|*0|v:611|*0|v:630|*0|v:650|*0|v:651|*0|v:655|*0|a:655|*0|v') do |_field, value|
      if blank_genre?(value)
        log_blank_genre
      else
        found << value
      end
    end
    found
  end

  # Yields (field, trimmed value) for each field/spec match that has a value.
  # Only the first value returned for a field/spec pair is considered.
  def each_trimmed_value(spec)
    Traject::MarcExtractor.cached(spec).collect_matching_lines(record) do |field, field_spec, extractor|
      value = extractor.collect_subfields(field, field_spec).first
      next if value.nil?

      yield field, Traject::Macros::Marc21.trim_punctuation(value)
    end
  end

  # True when the value is a known genre term or contains one of the
  # GENRE_STARTS_WITH terms.
  def listed_genre?(value)
    GENRES.include?(value) || GENRE_STARTS_WITH.any? { |term| value.include?(term) }
  end

  # True when the field's $2 names an approved vocabulary. If a field
  # carries more than one $2, the last one decides.
  def approved_vocabulary?(field)
    source = field.subfields.select { |subfield| subfield.code == '2' }.last
    !source.nil? && SUBJECT_GENRE_VOCABULARIES.include?(source.value)
  end

  # True for empty or whitespace-only values. \A and \z anchor the whole
  # string, so a blank line inside a longer value does not count.
  def blank_genre?(value)
    value.match?(/\A\s*\z/)
  end

  def log_blank_genre
    logger.error "#{record['001']} - Blank genre field"
  end

  # The injected logger, or a lazily resolved default.
  def logger
    @logger ||= default_logger
  end

  def default_logger
    return Rails.logger if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger

    require 'logger' # loaded lazily: only needed when running outside Rails
    Logger.new($stderr)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
require 'rails_helper' | ||
|
||
# Exercises Genre#to_a against one hand-built MARC record that mixes
# second-indicator 0 and 7 fields, with and without a $2 vocabulary code.
RSpec.describe Genre do
  describe '#to_a' do
    let(:genres) do
      # 2nd indicator 0: $v is taken as-is, $x only when it matches the term lists
      g600 = { "600" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Exclude" }, { "v" => "John" }, { "x" => "Join" }] } }
      g630 = { "630" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "x" => "Fiction." }] } }
      g655 = { "655" => { "ind1" => "", "ind2" => "0", "subfields" => [{ "a" => "Culture." }, { "x" => "Dramatic rendition" }, { "v" => "Awesome" }] } }
      # 2nd indicator 7 without $2: nothing from this field should be kept
      g655_2 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Poetry" }, { "x" => "Translations into French" }, { "v" => "Maps" }] } }
      # 2nd indicator 7 with an approved $2
      g655_3 = { "655" => { "ind1" => "", "ind2" => "7", "subfields" => [{ "a" => "Manuscript" }, { "x" => "Translations into French" }, { "v" => "Genre" }, { "2" => "rbgenr" }] } }
      sample_marc = MARC::Record.new_from_hash('fields' => [g600, g630, g655, g655_2, g655_3])
      described_class.new(sample_marc).to_a
    end

    it 'trims punctuation' do
      expect(genres).to include("Culture")
    end

    it 'excludes $a when not 655' do
      expect(genres).not_to include("Exclude")
    end

    it 'excludes 2nd indicator of 7 if vocab type is not in approved list' do
      expect(genres).not_to include("Maps")
      expect(genres).not_to include("Poetry")
    end

    it 'includes 2nd indicator of 7 if vocab type is in approved list' do
      expect(genres).to include("Manuscript")
      expect(genres).to include("Genre")
    end

    it 'includes 6xx $v and 655 $a' do
      expect(genres).to include("John")
      expect(genres).to include("Awesome")
      # 655 $a (punctuation-trimmed), so the example covers what its name promises
      expect(genres).to include("Culture")
    end

    it 'includes 6xx $x from filtered in terms' do
      expect(genres).to include("Fiction")
    end

    it 'excludes $x terms that do not match filter list' do
      expect(genres).not_to include("Join")
      # Must match the fixture spelling exactly, otherwise this check is vacuous
      expect(genres).not_to include("Dramatic rendition")
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters