diff --git a/es/contributor/mapping.json b/es/contributor/mapping.json
index 605c8f923..0405a4494 100644
--- a/es/contributor/mapping.json
+++ b/es/contributor/mapping.json
@@ -11,6 +11,16 @@
       "index": "not_analyzed",
       "type": "string"
     },
+    "name": {
+      "ignore_above": 2048,
+      "index": "not_analyzed",
+      "type": "string"
+    },
+    "email": {
+      "ignore_above": 2048,
+      "index": "not_analyzed",
+      "type": "string"
+    },
     "release_author": {
       "ignore_above": 2048,
       "index": "not_analyzed",
diff --git a/lib/MetaCPAN/Document/Contributor.pm b/lib/MetaCPAN/Document/Contributor.pm
index 826bebcf9..dec572834 100644
--- a/lib/MetaCPAN/Document/Contributor.pm
+++ b/lib/MetaCPAN/Document/Contributor.pm
@@ -3,7 +3,7 @@ package MetaCPAN::Document::Contributor;
 use MetaCPAN::Moose;
 
 use ElasticSearchX::Model::Document;
-use MetaCPAN::Types::TypeTiny qw( Str );
+use MetaCPAN::Types::TypeTiny qw( ArrayRef Str );
 
 has distribution => (
     is       => 'ro',
@@ -24,9 +24,18 @@ has release_name => (
 );
 
 has pauseid => (
-    is       => 'ro',
-    isa      => Str,
-    required => 1,
+    is  => 'ro',
+    isa => Str,
+);
+
+has name => (
+    is  => 'ro',
+    isa => Str,
+);
+
+has email => (
+    is  => 'ro',
+    isa => ArrayRef [Str],
 );
 
 __PACKAGE__->meta->make_immutable;
diff --git a/lib/MetaCPAN/Query/Contributor.pm b/lib/MetaCPAN/Query/Contributor.pm
index 4aea27617..d0b1859b9 100644
--- a/lib/MetaCPAN/Query/Contributor.pm
+++ b/lib/MetaCPAN/Query/Contributor.pm
@@ -13,8 +13,9 @@ sub find_release_contributors {
     my $query = +{
         bool => {
             must => [
-                { term => { release_author => $author } },
-                { term => { release_name   => $name } },
+                { term   => { release_author => $author } },
+                { term   => { release_name   => $name } },
+                { exists => { field          => 'pauseid' } },
             ]
         }
     };
@@ -22,8 +23,14 @@ sub find_release_contributors {
     my $res = $self->es->search(
         es_doc_path('contributor'),
         body => {
-            query => $query,
-            size  => 999,
+            query   => $query,
+            size    => 999,
+            _source => [ qw(
+                distribution
+                pauseid
+                release_author
+                release_name
+            ) ],
         }
     );
     hit_total($res) or return {};
@@ -40,11 +47,17 @@ sub find_author_contributions {
     my $res = $self->es->search(
         es_doc_path('contributor'),
         body => {
-            query => $query,
-            size  => 999,
+            query   => $query,
+            size    => 999,
+            _source => [ qw(
+                distribution
+                pauseid
+                release_author
+                release_name
+            ) ],
         }
     );
-    $res->{hits}{total} or return {};
+    hit_total($res) or return {};
 
     return +{
         contributors => [ map { $_->{_source} } @{ $res->{hits}{hits} } ] };
diff --git a/lib/MetaCPAN/Script/Contributor.pm b/lib/MetaCPAN/Script/Contributor.pm
index 3f7b8f047..fbeb6cf46 100644
--- a/lib/MetaCPAN/Script/Contributor.pm
+++ b/lib/MetaCPAN/Script/Contributor.pm
@@ -80,32 +80,7 @@ sub run {
         ? { range => { date => { gte => sprintf( 'now-%dd', $self->age ) } } }
         : return;
 
-    my $timeout = $self->all ? '720m' : '5m';
-
-    my $scroll = $self->es->scroll_helper(
-        scroll => $timeout,
-        es_doc_path('release'),
-        body => {
-            query   => $query,
-            size    => 500,
-            _source => [qw( author distribution name )],
-        },
-    );
-
-    my @data;
-
-    while ( my $r = $scroll->next ) {
-        my $contrib_data = $self->get_cpan_author_contributors(
-            $r->{_source}{author},
-            $r->{_source}{name},
-            $r->{_source}{distribution},
-        );
-        next unless is_arrayref($contrib_data);
-        log_debug { 'adding release ' . $r->{_source}{name} };
-        push @data => @{$contrib_data};
-    }
-
-    $self->update_release_contirbutors( \@data, $timeout );
+    $self->update_contributors($query);
 }
 
 __PACKAGE__->meta->make_immutable;
diff --git a/lib/MetaCPAN/Script/Release.pm b/lib/MetaCPAN/Script/Release.pm
index ced176545..ff7ca617f 100644
--- a/lib/MetaCPAN/Script/Release.pm
+++ b/lib/MetaCPAN/Script/Release.pm
@@ -383,9 +383,14 @@ sub import_archive {
         MetaCPAN::Script::Runner->run;
     }
 
-    my $contrib_data = $self->get_cpan_author_contributors( $document->author,
-        $document->name, $document->distribution );
-    $self->update_release_contirbutors($contrib_data);
+    $self->update_contributors( {
+        bool => {
+            must => [
+                { term => { author => $document->author } },
+                { term => { name   => $document->name } },
+            ],
+        },
+    } );
 }
 
 sub detect_status {
diff --git a/lib/MetaCPAN/Script/Role/Contributor.pm b/lib/MetaCPAN/Script/Role/Contributor.pm
index 5f96a6919..819412ba2 100644
--- a/lib/MetaCPAN/Script/Role/Contributor.pm
+++ b/lib/MetaCPAN/Script/Role/Contributor.pm
@@ -2,61 +2,229 @@ package MetaCPAN::Script::Role::Contributor;
 
 use Moose::Role;
 
+use Log::Contextual    qw( :log );
 use MetaCPAN::ESConfig qw( es_doc_path );
-use MetaCPAN::Util     qw( digest true false );
+use MetaCPAN::Util     qw( true false );
 use Ref::Util          qw( is_arrayref );
 
-sub get_cpan_author_contributors {
-    my ( $self, $author, $release, $distribution ) = @_;
-    my @ret;
-    my $es = $self->es;
-
-    my $type = $self->model->doc('release');
-    my $data;
-    eval {
-        $data = $type->get_contributors( $author, $release );
-        1;
-    } or return [];
-
-    for my $d ( @{ $data->{contributors} } ) {
-        next unless exists $d->{pauseid};
-
-        # skip existing records
-        my $id     = digest( $d->{pauseid}, $release );
-        my $exists = $es->exists( es_doc_path('contributor'), id => $id, );
-        next if $exists;
-
-        $d->{release_author} = $author;
-        $d->{release_name}   = $release;
-        $d->{distribution}   = $distribution;
-        push @ret, $d;
+# Rebuild the contributor documents of every release matching $query.
+#
+# Scrolls over the matching release documents, asks
+# release_contributor_update_actions() for the bulk actions of each one and
+# queues them on a bulk helper for es_doc_path('contributor'), which is
+# flushed once the scroll is exhausted.
+sub update_contributors {
+    my ( $self, $query ) = @_;
+
+    my $scroll = $self->es->scroll_helper(
+        es_doc_path('release'),
+        body => {
+            query   => $query,
+            sort    => ['_doc'],
+            _source => [ qw<
+                name
+                author
+                distribution
+                metadata.author
+                metadata.x_contributors
+            > ],
+        },
+    );
+
+    my $bulk = $self->es->bulk_helper( es_doc_path('contributor') );
+
+    while ( my $release = $scroll->next ) {
+        log_debug { 'updating contributors for ' . $release->{_source}{name} };
+        my $actions = $self->release_contributor_update_actions(
+            $release->{_source} );
+        for my $action (@$actions) {
+            $bulk->add_action(%$action);
+        }
     }
 
-    return \@ret;
+    $bulk->flush;
 }
 
-sub update_release_contirbutors {
-    my ( $self, $data, $timeout ) = @_;
-    return unless $data and is_arrayref($data);
-
-    my $bulk = $self->es->bulk_helper( es_doc_path('contributor'),
-        timeout => $timeout || '5m', );
-
-    for my $d ( @{$data} ) {
-        my $id = digest( $d->{pauseid}, $d->{release_name} );
-        $bulk->update( {
-            id  => $id,
-            doc => {
-                pauseid        => $d->{pauseid},
-                release_name   => $d->{release_name},
-                release_author => $d->{release_author},
-                distribution   => $d->{distribution},
+# Build the bulk actions that replace the contributor documents of one
+# release.
+#
+# $release is the _source of a release document; its name, author,
+# distribution and metadata are read. Returns an array reference of
+# single-pair hashes: a delete action for each existing contributor document
+# found for the release, followed by a create action per current contributor.
+sub release_contributor_update_actions {
+    my ( $self, $release ) = @_;
+    my @actions;
+
+    my $res = $self->es->search(
+        es_doc_path('contributor'),
+        body => {
+            query => {
+                bool => {
+                    must => [
+                        { term => { release_name   => $release->{name} } },
+                        { term => { release_author => $release->{author} } },
+                    ],
+                }
             },
-            doc_as_upsert => true,
-        } );
+            sort    => ['_doc'],
+
+            # NOTE(review): a single page; a release with more than 500
+            # stored contributor documents would keep the surplus.
+            size    => 500,
+            _source => false,
+        },
+    );
+    my @ids = map $_->{_id}, @{ $res->{hits}{hits} };
+    push @actions, map +{ delete => { id => $_ } }, @ids;
+
+    my $contribs = $self->get_contributors($release);
+    my @docs     = map {
+        ;
+        my $contrib = $_;
+        {
+            release_name   => $release->{name},
+            release_author => $release->{author},
+            distribution   => $release->{distribution},
+            map +( defined $contrib->{$_} ? ( $_ => $contrib->{$_} ) : () ),
+            qw(pauseid name email)
+        };
+    } @$contribs;
+    push @actions, map +{ create => { _source => $_ } }, @docs;
+    return \@actions;
+}
+
+# Work out the contributors of a release from its metadata.
+#
+# Reads the "author" and "x_contributors" metadata entries, parses each
+# "Name <email>" string, merges entries that share a name or an address and
+# leaves out entries matching the releasing author. A pauseid is filled in
+# from an @cpan.org address or from an author document with a matching
+# email. Returns an array reference of hashes with name, email (an array
+# reference) and, when found, pauseid. Returns an empty array reference when
+# the releasing author's document cannot be fetched.
+sub get_contributors {
+    my ( $self, $release ) = @_;
+
+    my $author_name = $release->{author};
+    my $contribs    = $release->{metadata}{x_contributors} || [];
+    my $authors     = $release->{metadata}{author}         || [];
+
+    for ( \( $contribs, $authors ) ) {
+
+        # If a sole contributor is a string upgrade it to an array...
+        $$_ = [$$_]
+            if !ref $$_;
+
+        # but if it's any other kind of value don't die trying to parse it.
+        $$_ = []
+            unless Ref::Util::is_arrayref($$_);
+
+        # Only plain strings can be parsed as "Name <email>" entries.
+        my @strings = grep { defined $_ && !ref $_ } @{$$_};
+        $$_ = \@strings;
     }
+    $authors = [ grep { $_ ne 'unknown' } @$authors ];
 
-    $bulk->flush;
+    my $author = eval {
+        $self->es->get_source( es_doc_path('author'), id => $author_name );
+    }
+        or return [];
+
+    my $author_email = $author->{email};
+
+    my $author_info = {
+        email => [
+            lc "$author_name\@cpan.org",
+
+            # The author document may have no email; skip undef so it is
+            # neither stored as an address nor used as a %seen key.
+            grep { defined $_ } (
+                Ref::Util::is_arrayref($author_email)
+                ? @{$author_email}
+                : $author_email
+            ),
+        ],
+        name => $author_name,
+    };
+    my %seen = map { $_ => $author_info }
+        ( @{ $author_info->{email} }, $author_info->{name}, );
+
+    my @contribs = map {
+        my $name = $_;
+        my $email;
+        if ( $name =~ s/\s*<([^<>]+@[^<>]+)>// ) {
+            $email = $1;
+        }
+        my $info;
+        my $dupe;
+        if ( $email and $info = $seen{$email} ) {
+            $dupe = 1;
+        }
+        elsif ( $info = $seen{$name} ) {
+            $dupe = 1;
+        }
+        else {
+            $info = {
+                name  => $name,
+                email => [],
+            };
+        }
+        $seen{$name} ||= $info;
+        if ($email) {
+            push @{ $info->{email} }, $email
+                unless grep { $_ eq $email } @{ $info->{email} };
+            $seen{$email} ||= $info;
+        }
+        $dupe ? () : $info;
+    } ( @$authors, @$contribs );
+
+    my %want_email;
+    for my $contrib (@contribs) {
+
+        # heuristic to autofill pause accounts
+        if ( !$contrib->{pauseid} ) {
+            my ($pauseid)
+                = map { /^(.*)\@cpan\.org$/ ? $1 : () }
+                @{ $contrib->{email} };
+            $contrib->{pauseid} = uc $pauseid
+                if $pauseid;
+
+        }
+
+        push @{ $want_email{$_} }, $contrib for @{ $contrib->{email} };
+    }
+
+    if (%want_email) {
+        my @wanted = sort keys %want_email;
+
+        # Request at least one hit per address so that releases with more
+        # than 100 contributor addresses are not silently cut short.
+        my $hit_limit = @wanted > 100 ? scalar @wanted : 100;
+
+        my $check_author = $self->es->search(
+            es_doc_path('author'),
+            body => {
+                query   => { terms => { email => \@wanted } },
+                _source => [ 'email', 'pauseid' ],
+                size    => $hit_limit,
+            },
+        );
+
+        for my $author ( @{ $check_author->{hits}{hits} } ) {
+            my $emails = $author->{_source}{email};
+            $emails = [$emails]
+                if !ref $emails;
+            my $pauseid = uc $author->{_source}{pauseid};
+            for my $email (@$emails) {
+                for my $contrib ( @{ $want_email{$email} } ) {
+                    $contrib->{pauseid} = $pauseid;
+                }
+            }
+        }
+    }
+
+    return \@contribs;
 }
 
 no Moose::Role;