Skip to content

Commit

Permalink
automatically only use first initial when not ambiguous
Browse files Browse the repository at this point in the history
  • Loading branch information
peetucket committed May 4, 2021
1 parent 0a9feed commit 4497509
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 36 deletions.
11 changes: 11 additions & 0 deletions app/models/author.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ def institution
Settings.HARVESTER.INSTITUTION.name
end

# Indicates whether the "LastName, FirstInitial" form of this author's name is unambiguous within our
# author database, so that it is safe to use in searches.
#
# The form is considered ambiguous (false is returned) when any of these holds:
#   * the author's first or last name is nil or empty;
#   * more than one author row has the same preferred last name and a preferred first name starting with
#     the same initial (this author's own row is expected to be the single match);
#   * an alternate identity has a non-blank institution that does not include 'Stanford';
#   * any *other* author row matches an alternate identity's last name and first initial.
#
# @return [Boolean] true only when none of the ambiguity conditions above apply
def unique_first_initial?
  # An empty first name would yield a nil initial and turn the LIKE pattern into a bare '%', matching
  # every first name, so blank names are rejected up front rather than only nil ones.
  return false if first_name.blank? || last_name.blank?

  same_form = self.class.where('preferred_first_name like ? and preferred_last_name = ?', "#{first_name[0]}%", last_name)
  return false if same_form.size > 1 # this query does not exclude our own row, hence "> 1"

  # any? stops at the first ambiguous identity instead of querying for every identity
  ambiguous_identity = author_identities.any? do |author_identity|
    institution = author_identity.institution
    next true if !institution.blank? && !institution.include?('Stanford')

    # This query already excludes the owning author's row, so a single match is enough to make the
    # form ambiguous (the previous "size > 1" check required two other authors).
    self.class.where('preferred_first_name like ? and preferred_last_name = ? and id != ?',
                     "#{author_identity.first_name[0]}%", author_identity.last_name, author_identity.author_id).exists?
  end
  !ambiguous_identity
end

# @return [Array<Integer>] ScienceWireIds for approved publications
def approved_sciencewire_ids
publications.where("contributions.status = 'approved'")
Expand Down
1 change: 0 additions & 1 deletion config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ DOI:
HARVESTER:
LOG: log/all_sources_harvester.log
USE_MIDDLE_NAME: true
USE_FIRST_INITIAL: false
USE_AUTHOR_IDENTITIES: false
INSTITUTION:
name: Stanford University
Expand Down
15 changes: 8 additions & 7 deletions lib/agent/author_name.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@ def text_search_query
text_search_terms.map { |x| "\"#{x}\"" }.join(' or ')
end

# Builds the de-duplicated list of name variants used for text searches by combining the results of
# first_name_query and middle_name_query and dropping empty entries.
#
# @param options [Hash]
# @option options [Boolean] :use_first_initial (true) whether variants based on the first initial are included
# @return [Array<String>] unique, non-empty name variants
def text_search_terms(options = {})
  # `options[:use_first_initial] || true` is always true, so an explicit false was silently ignored;
  # only fall back to the default when no value was supplied.
  requested = options[:use_first_initial]
  use_first_initial = (requested.nil? || requested) ? true : false

  # Memoize per flag value: a single cached array would be handed back on later calls even when the
  # caller asked for the other setting.
  @text_search_terms_by_flag ||= {}
  @text_search_terms_by_flag[use_first_initial] ||=
    [first_name_query(use_first_initial), middle_name_query(use_first_initial)].flatten.reject(&:empty?).uniq
end

def ==(other)
Expand All @@ -68,10 +69,10 @@ def ==(other)
# 'Lastname,Firstname' or
# 'Lastname,FirstInitial'
# @return [Array<String>|String] names
# @param use_first_initial [Boolean] when true (the default) the 'Lastname,FirstInitial' variant is added
def first_name_query(use_first_initial = true)
  # nothing to search on when both names are empty; callers filter the empty String out
  return '' if last.empty? && first.empty?

  # The flag is passed in by the caller; the former Settings.HARVESTER.USE_FIRST_INITIAL lookup is gone.
  query = ["#{last_name},#{first_name}"]
  query << "#{last_name},#{first_initial}" if use_first_initial
  query
end

Expand All @@ -80,10 +81,10 @@ def first_name_query
# 'Lastname,Firstname,MiddleInitial' or
# 'Lastname,FirstInitial,MiddleInitial'
# @return [Array<String>|String] names
# @param use_first_initial [Boolean] when true (the default) the first-initial based variants are added
def middle_name_query(use_first_initial = true)
  # only build middle-name variants when the middle name starts with a letter
  return '' unless middle =~ /^[[:alpha:]]/

  query = ["#{last_name},#{first_name},#{middle_name}", "#{last_name},#{first_name},#{middle_initial}"]
  # The flag is passed in by the caller; the former Settings.HARVESTER.USE_FIRST_INITIAL lookup is gone.
  if use_first_initial
    query += ["#{last_name},#{first_initial}#{middle_initial}", "#{last_name},#{first_initial},#{middle_initial}"]
  end
  query
end

Expand Down
2 changes: 1 addition & 1 deletion lib/web_of_science/query_author.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def names
ident.first_name,
Settings.HARVESTER.USE_MIDDLE_NAME ? ident.middle_name : ''
)
end&.text_search_terms
end&.text_search_terms(use_first_initial: author.unique_first_initial?)
end.flatten.compact.uniq
end

Expand Down
33 changes: 33 additions & 0 deletions spec/factories/author.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,26 @@
end
end

# Author named 'Albert Edler' (official and preferred names, no middle name) with freshly generated
# identifiers. The Author spec creates it as a second author so that the "Edler, A" first-initial form
# is no longer unique.
# NOTE(review): presumably the parent :author factory is also an 'Edler' whose first name starts with
# 'A' -- confirm against the parent factory definition.
factory :author_duped_last_name, parent: :author do
sunetid { FactoryBot.generate(:random_id) }
cap_profile_id { FactoryBot.generate(:random_id) }
university_id { FactoryBot.generate(:random_id) }
california_physician_license { FactoryBot.generate(:random_string) }
active_in_cap { true }
email { '[email protected]' }
official_first_name { 'Albert' }
official_last_name { 'Edler' }
official_middle_name { '' }
preferred_first_name { 'Albert' }
preferred_last_name { 'Edler' }
preferred_middle_name { '' }
emails_for_harvest { '[email protected]' }
end

# Same as the parent :author factory, except that the author is flagged as not active in CAP.
factory :inactive_author, parent: :author do
active_in_cap { false }
end

factory :author_with_alternate_identities, parent: :author do
transient do
alt_count { 1 } # default number of alternate identities to create
Expand All @@ -65,6 +85,19 @@
end
end

# Author named 'Somebody WithReallyUnusualName' (official and preferred names, no middle name), active in
# CAP with CAP import enabled. The Author spec uses it as an author whose first-initial form is not
# expected to collide with any other author.
factory :odd_name, parent: :author do
active_in_cap { true }
cap_import_enabled { true }
official_first_name { 'Somebody' }
official_last_name { 'WithReallyUnusualName' }
official_middle_name { '' }
preferred_first_name { 'Somebody' }
preferred_last_name { 'WithReallyUnusualName' }
preferred_middle_name { '' }
email { '[email protected]' }
emails_for_harvest { '[email protected]' }
end

# Public data from
# - https://stanfordwho.stanford.edu
# - https://med.stanford.edu/profiles/russ-altman
Expand Down
55 changes: 30 additions & 25 deletions spec/lib/agent/author_name_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -117,21 +117,18 @@

describe '#text_search_terms' do
  it 'includes first_name_query and middle_name_query elements' do
    # text_search_terms is called without options here, so compare against the helpers' output with
    # the first-initial variants requested (the stale zero-argument `send` calls have been dropped)
    fnames = all_names.send(:first_name_query, true)
    mnames = all_names.send(:middle_name_query, true)
    expect(all_names.text_search_terms).to include(*fnames, *mnames)
  end
end

describe '#first_name_query' do
# with neither a first nor a last name there is nothing to build a query from
it 'when no names are present returns an empty String' do
  expect(no_names.send(:first_name_query, true)).to eq ''
end
context 'when all names are present' do
let(:fn_query) { all_names.send(:first_name_query) }
before do
allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(false)
end
context 'when all names are present with middle initial' do
let(:fn_query) { all_names.send(:first_name_query, true) }
it 'is Array<String> with non-empty unique values' do
expect(fn_query).to be_an Array
expect(fn_query).to all(be_a(String))
Expand All @@ -141,8 +138,8 @@
it 'includes name with first_name' do
expect(fn_query).to include "#{all_names.last_name},#{all_names.first_name}"
end
it 'excludes name with first_initial when settings do not allow for it' do
expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_initial}"
it 'includes name with first_initial when settings allow for it' do
expect(fn_query).to include "#{all_names.last_name},#{all_names.first_initial}"
end
it 'does not include name with middle_name' do
expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_name},#{all_names.middle_name}"
Expand All @@ -153,26 +150,37 @@
expect(fn_query).to all(exclude(",#{all_names.middle_initial}"))
end
end
# first_name_query is called with use_first_initial = false, so only the full first name form is expected
context 'when all names are present and first initial is not requested' do
  let(:fn_query) { all_names.send(:first_name_query, false) }

  it 'is Array<String> with non-empty unique values' do
    expect(fn_query).to be_an Array
    expect(fn_query).to all(be_a(String))
    expect(fn_query).not_to include(be_empty)
    expect(fn_query.size).to eq(fn_query.uniq.size)
  end
  it 'includes name with first_name' do
    expect(fn_query).to include "#{all_names.last_name},#{all_names.first_name}"
  end
  it 'does not include name with first_initial' do
    expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_initial}"
  end
  it 'does not include name with middle_name' do
    expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_name},#{all_names.middle_name}"
    expect(fn_query).to all(exclude(",#{all_names.middle_name}"))
  end
  it 'does not include name with middle_initial' do
    expect(fn_query).not_to include "#{all_names.last_name},#{all_names.first_name},#{all_names.middle_initial}"
    expect(fn_query).to all(exclude(",#{all_names.middle_initial}"))
  end
end
end

describe '#middle_name_query' do
# without a middle name no middle-name variants can be built, whatever the first-initial flag is
it 'when no names are present returns an empty String' do
  expect(no_names.send(:middle_name_query, false)).to eq ''
end
context 'when all names are present' do
let(:mn_query) { all_names.send(:middle_name_query) }
before do
allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(false)
end
let(:mn_query) { all_names.send(:middle_name_query, false) }
it 'is Array<String> with non-empty unique values' do
expect(mn_query).to be_an Array
expect(mn_query).to all(be_a(String))
Expand All @@ -196,10 +204,7 @@
end
end
context 'when all names are present and settings allow for first initial' do
let(:mn_query) { all_names.send(:middle_name_query) }
before do
allow(Settings.HARVESTER).to receive(:USE_FIRST_INITIAL).and_return(true)
end
let(:mn_query) { all_names.send(:middle_name_query, true) }
it 'includes name with middle_initial appended to first initial when settings allow for it' do
expect(mn_query).to include "#{all_names.last_name},#{all_names.first_initial}#{all_names.middle_initial}"
end
Expand Down
4 changes: 2 additions & 2 deletions spec/lib/web_of_science/query_author_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@

it 'ignores the bad alternate identity data' do
  expect(author_one_identity.author_identities.first.first_name).to eq '.' # bad first name
  # only variants of the primary name come back, now including the first-initial forms; nothing is
  # built from the period used as a first name (we would have more if we allowed the bad name variant)
  expect(described_class.new(author_one_identity).send(:names)).to eq %w[Edler,Alice Edler,A Edler,Alice,Jim Edler,Alice,J Edler,AJ Edler,A,J]
end
end
end
Expand Down
25 changes: 25 additions & 0 deletions spec/models/author_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,31 @@
end
end

describe '#unique_first_initial?' do
  it 'confirms unique first initial within stanford with no alternate identities' do
    odd_name = create :odd_name
    expect(odd_name.author_identities.size).to eq(0) # has no alternate identities
    expect(odd_name.unique_first_initial?).to eq(true) # no other names like this at stanford, so ok to search with first initial
  end
  it 'confirms unique first initial within stanford with stanford only alternate identities' do
    subject.update_from_cap_authorship_profile_hash(auth_hash)
    expect(subject.author_identities.size).to eq(2) # has alternate identities
    expect(subject.unique_first_initial?).to eq(true) # ok, because all of the alternate identities are stanford or no institution, and no other first name ambiguity
  end
  it 'confirms ambiguous first initial within stanford with no alternate identities' do
    create :author_duped_last_name
    expect(subject.author_identities.size).to eq(0) # no alternate identities
    expect(subject.unique_first_initial?).to eq(false) # not unique, because we now have another stanford author with the same last name and same first initial
  end
  it 'confirms ambiguous first initial even when non ambiguous within Stanford due to a non-Stanford alternate identity existing' do
    author_with_alternate_identities = create :author_with_alternate_identities
    expect(author_with_alternate_identities.author_identities.size).to eq(1) # alternate identities for primary author
    alternate_identity = author_with_alternate_identities.author_identities.first
    # `not_to be blank?` never called blank? on the institution (the bare `blank?` is evaluated on the
    # example itself), so the check was vacuous; the predicate matcher asserts on the institution
    expect(alternate_identity.institution).not_to be_blank # alternate institution is not empty
    expect(alternate_identity.institution).not_to include('Stanford') # alternate institution is not Stanford
    expect(author_with_alternate_identities.unique_first_initial?).to eq(false) # not unique, because even though there are no other stanford authors with similar names, they have a non-Stanford alternate identity
  end
end

describe '#first_name' do
it 'is the preferred_first_name' do
subject.update_from_cap_authorship_profile_hash(auth_hash)
Expand Down

0 comments on commit 4497509

Please sign in to comment.