Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

include name and email in contributor index #1322

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions es/contributor/mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@
"index": "not_analyzed",
"type": "string"
},
"name": {
"ignore_above": 2048,
"index": "not_analyzed",
"type": "string"
},
"email": {
"ignore_above": 2048,
"index": "not_analyzed",
"type": "string"
},
"release_author": {
"ignore_above": 2048,
"index": "not_analyzed",
Expand Down
17 changes: 13 additions & 4 deletions lib/MetaCPAN/Document/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package MetaCPAN::Document::Contributor;
use MetaCPAN::Moose;

use ElasticSearchX::Model::Document;
use MetaCPAN::Types::TypeTiny qw( Str );
use MetaCPAN::Types::TypeTiny qw( ArrayRef Str );

has distribution => (
is => 'ro',
Expand All @@ -24,9 +24,18 @@ has release_name => (
);

# PAUSE ID of the contributor. Not required: contributors taken from release
# metadata may have no known PAUSE account, in which case the document only
# carries a name and/or email addresses.
has pauseid => (
    is  => 'ro',
    isa => Str,
);

# Name of the contributor. Optional.
# NOTE(review): presumably the display name parsed from the release's
# metadata author / x_contributors entries - confirm against the indexer.
has name => (
is => 'ro',
isa => Str,
);

# Email addresses of the contributor. Optional; a list of strings because a
# single contributor may be recorded with more than one address.
has email => (
is => 'ro',
isa => ArrayRef [Str],
);

__PACKAGE__->meta->make_immutable;
Expand Down
27 changes: 20 additions & 7 deletions lib/MetaCPAN/Query/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,24 @@ sub find_release_contributors {
my $query = +{
bool => {
must => [
{ term => { release_author => $author } },
{ term => { release_name => $name } },
{ term => { release_author => $author } },
{ term => { release_name => $name } },
{ exists => { field => 'pauseid' } },
]
}
};

my $res = $self->es->search(
es_doc_path('contributor'),
body => {
query => $query,
size => 999,
query => $query,
size => 999,
_source => [ qw(
distribution
pauseid
release_author
release_name
) ],
}
);
hit_total($res) or return {};
Expand All @@ -40,11 +47,17 @@ sub find_author_contributions {
my $res = $self->es->search(
es_doc_path('contributor'),
body => {
query => $query,
size => 999,
query => $query,
size => 999,
_source => [ qw(
distribution
pauseid
release_author
release_name
) ],
}
);
$res->{hits}{total} or return {};
hit_total($res) or return {};

return +{
contributors => [ map { $_->{_source} } @{ $res->{hits}{hits} } ] };
Expand Down
27 changes: 1 addition & 26 deletions lib/MetaCPAN/Script/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -80,32 +80,7 @@ sub run {
? { range => { date => { gte => sprintf( 'now-%dd', $self->age ) } } }
: return;

my $timeout = $self->all ? '720m' : '5m';

my $scroll = $self->es->scroll_helper(
scroll => $timeout,
es_doc_path('release'),
body => {
query => $query,
size => 500,
_source => [qw( author distribution name )],
},
);

my @data;

while ( my $r = $scroll->next ) {
my $contrib_data = $self->get_cpan_author_contributors(
$r->{_source}{author},
$r->{_source}{name},
$r->{_source}{distribution},
);
next unless is_arrayref($contrib_data);
log_debug { 'adding release ' . $r->{_source}{name} };
push @data => @{$contrib_data};
}

$self->update_release_contirbutors( \@data, $timeout );
$self->update_contributors($query);
}

__PACKAGE__->meta->make_immutable;
Expand Down
11 changes: 8 additions & 3 deletions lib/MetaCPAN/Script/Release.pm
Original file line number Diff line number Diff line change
Expand Up @@ -383,9 +383,14 @@ sub import_archive {
MetaCPAN::Script::Runner->run;
}

my $contrib_data = $self->get_cpan_author_contributors( $document->author,
$document->name, $document->distribution );
$self->update_release_contirbutors($contrib_data);
$self->update_contributors( {
bool => {
must => [
{ term => { author => $document->author } },
{ term => { name => $document->name } },
],
},
} );
}

sub detect_status {
Expand Down
220 changes: 175 additions & 45 deletions lib/MetaCPAN/Script/Role/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,191 @@ package MetaCPAN::Script::Role::Contributor;

use Moose::Role;

use Log::Contextual qw( :log );
use MetaCPAN::ESConfig qw( es_doc_path );
use MetaCPAN::Util qw( digest true false );
use MetaCPAN::Util qw( true false );
use Ref::Util qw( is_arrayref );

sub get_cpan_author_contributors {
my ( $self, $author, $release, $distribution ) = @_;
my @ret;
my $es = $self->es;

my $type = $self->model->doc('release');
my $data;
eval {
$data = $type->get_contributors( $author, $release );
1;
} or return [];

for my $d ( @{ $data->{contributors} } ) {
next unless exists $d->{pauseid};

# skip existing records
my $id = digest( $d->{pauseid}, $release );
my $exists = $es->exists( es_doc_path('contributor'), id => $id, );
next if $exists;

$d->{release_author} = $author;
$d->{release_name} = $release;
$d->{distribution} = $distribution;
push @ret, $d;
# Rebuild the contributor documents of every release matched by $query.
#
# $query - Elasticsearch query (hashref) used to scroll over the release
#          index; only the fields needed to derive contributors are fetched.
#
# For each release the actions produced by
# release_contributor_update_actions() are queued on a single bulk helper
# for the contributor index, which is flushed once all releases are queued.
sub update_contributors {
    my ( $self, $query ) = @_;

    my $scroll = $self->es->scroll_helper(
        es_doc_path('release'),
        body => {
            query   => $query,
            sort    => ['_doc'],
            _source => [ qw<
                name
                author
                distribution
                metadata.author
                metadata.x_contributors
            > ],
        },
    );

    my $bulk = $self->es->bulk_helper( es_doc_path('contributor') );

    while ( my $release = $scroll->next ) {
        log_debug { 'updating contributors for ' . $release->{_source}{name} };
        my $actions = $self->release_contributor_update_actions(
            $release->{_source} );
        for my $action (@$actions) {
            $bulk->add_action(%$action);
        }
    }

    # Send whatever is still buffered; without this the tail of the queued
    # actions would never reach Elasticsearch.
    $bulk->flush;
}

sub update_release_contirbutors {
my ( $self, $data, $timeout ) = @_;
return unless $data and is_arrayref($data);

my $bulk = $self->es->bulk_helper( es_doc_path('contributor'),
timeout => $timeout || '5m', );

for my $d ( @{$data} ) {
my $id = digest( $d->{pauseid}, $d->{release_name} );
$bulk->update( {
id => $id,
doc => {
pauseid => $d->{pauseid},
release_name => $d->{release_name},
release_author => $d->{release_author},
distribution => $d->{distribution},
# Build the bulk actions that replace the contributor documents of one
# release.
#
# $release - hashref with the release's name, author, distribution and
#            metadata (the _source of a release document).
#
# Returns an arrayref of single-key hashrefs suitable for
# $bulk->add_action(%$action): first a delete for every contributor document
# currently indexed for the release, then a create for every contributor
# returned by get_contributors().
sub release_contributor_update_actions {
    my ( $self, $release ) = @_;
    my @actions;

    # Look up the ids of the documents already indexed for this release.
    # NOTE(review): only the first 500 hits are fetched, so a release with
    # more existing contributor documents than that would keep the rest.
    my $res = $self->es->search(
        es_doc_path('contributor'),
        body => {
            query => {
                bool => {
                    must => [
                        { term => { release_name   => $release->{name} } },
                        { term => { release_author => $release->{author} } },
                    ],
                }
            },
            sort    => ['_doc'],
            size    => 500,
            _source => false,
        },
    );
    my @ids = map $_->{_id}, @{ $res->{hits}{hits} };
    push @actions, map +{ delete => { id => $_ } }, @ids;

    my $contribs = $self->get_contributors($release);
    my @docs     = map {
        my $contrib = $_;

        # Explicit +{ so the braces are parsed as a hash constructor.
        +{
            release_name   => $release->{name},
            release_author => $release->{author},
            distribution   => $release->{distribution},

            # Only copy the contributor fields that are actually known.
            map +( defined $contrib->{$_} ? ( $_ => $contrib->{$_} ) : () ),
            qw(pauseid name email)
        };
    } @$contribs;
    push @actions, map +{ create => { _source => $_ } }, @docs;
    return \@actions;
}

# Derive the list of contributors of a release from its metadata.
#
# $release - hashref with `author` (PAUSE ID of the releaser) and
#            `metadata` holding optional `author` and `x_contributors`
#            entries, each a "Name <email>" string or an array of them.
#
# Returns an arrayref of hashrefs with `name`, `email` (arrayref) and, when
# it could be determined, `pauseid`. Entries that share a name or an email
# address with an earlier entry are merged into it, and entries matching the
# release author (by name, by one of the author's emails, or by the
# lowercased "<author>@cpan.org" address) are left out. Returns an empty
# arrayref when the author document cannot be fetched.
sub get_contributors {
    my ( $self, $release ) = @_;

    my $author_name = $release->{author};
    my $contribs    = $release->{metadata}{x_contributors} || [];
    my $authors     = $release->{metadata}{author}         || [];

    for ( \( $contribs, $authors ) ) {

        # If a sole contributor is a string upgrade it to an array...
        $$_ = [$$_]
            if !ref $$_;

        # but if it's any other kind of value don't die trying to parse it.
        $$_ = []
            unless Ref::Util::is_arrayref($$_);
    }
    $authors = [ grep { $_ ne 'unknown' } @$authors ];

    my $author = eval {
        $self->es->get_source( es_doc_path('author'), id => $author_name );
    }
        or return [];

    # The author's email may be a single string, a list, or missing; never
    # let an undefined value into the list (it would be used as a hash key
    # and compared with `eq` below).
    my $author_email  = $author->{email};
    my @author_emails
        = Ref::Util::is_arrayref($author_email) ? @{$author_email}
        : defined $author_email                 ? ($author_email)
        :                                         ();

    my $author_info = {
        email => [ lc "$author_name\@cpan.org", @author_emails ],
        name  => $author_name,
    };

    # %seen maps every known name and email to its contributor record.
    # Seeding it with the release author makes the author count as already
    # seen, so they are never returned as their own contributor.
    my %seen = map { $_ => $author_info }
        ( @{ $author_info->{email} }, $author_info->{name}, );

    my @contribs = map {
        my $name = $_;
        my $email;

        # Split "Name <email>" into its parts; $name keeps the remainder.
        if ( $name =~ s/\s*<([^<>]+@[^<>]+)>// ) {
            $email = $1;
        }
        my $info;
        my $dupe;
        if ( $email and $info = $seen{$email} ) {
            $dupe = 1;
        }
        elsif ( $info = $seen{$name} ) {
            $dupe = 1;
        }
        else {
            $info = {
                name  => $name,
                email => [],
            };
        }
        $seen{$name} ||= $info;
        if ($email) {
            push @{ $info->{email} }, $email
                unless grep { $_ eq $email } @{ $info->{email} };
            $seen{$email} ||= $info;
        }

        # A duplicate only enriches the existing record; emit new ones once.
        $dupe ? () : $info;
    } ( @$authors, @$contribs );

    my %want_email;
    for my $contrib (@contribs) {

        # heuristic to autofill pause accounts
        if ( !$contrib->{pauseid} ) {
            my ($pauseid)
                = map { /^(.*)\@cpan\.org$/ ? $1 : () }
                @{ $contrib->{email} };
            $contrib->{pauseid} = uc $pauseid
                if $pauseid;

        }

        push @{ $want_email{$_} }, $contrib for @{ $contrib->{email} };
    }

    # Resolve remaining contributors through the author index: any author
    # document listing one of the collected emails supplies the PAUSE ID
    # (overriding the heuristic above).
    # NOTE(review): at most 100 author documents are fetched here.
    if (%want_email) {
        my $check_author = $self->es->search(
            es_doc_path('author'),
            body => {
                query => { terms => { email => [ sort keys %want_email ] } },
                _source => [ 'email', 'pauseid' ],
                size    => 100,
            },
        );

        for my $author ( @{ $check_author->{hits}{hits} } ) {
            my $emails = $author->{_source}{email};
            $emails = [$emails]
                if !ref $emails;
            my $pauseid = uc $author->{_source}{pauseid};
            for my $email (@$emails) {
                for my $contrib ( @{ $want_email{$email} } ) {
                    $contrib->{pauseid} = $pauseid;
                }
            }
        }
    }

    return \@contribs;
}

no Moose::Role;
Expand Down