forked from Ensembl/VEP_plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RefSeqHGVS.pm
208 lines (147 loc) · 5.7 KB
/
RefSeqHGVS.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
=head1 NAME
RefSeqHGVS -- provide RefSeq-based HGVS tags for VEP output
=head1 SYNOPSIS
cp RefSeqHGVS.pm ~/.vep/Plugins/ (or elsewhere in PERL5LIB)
perl variant_effect_predictor.pl -i variations.vcf --plugin RefSeqHGVS
Output:
Variant lines with the following addtiional tags:
HGVSc-RefSeq=NM_198156.2:c.403delA;HGVSp-RefSeq=NP_937799.1:p.Arg135GlyfsX26
=head1 DESCRIPTION
RefSeqHGVS is a plugin for Ensembl's Variant Effect Predictor that
provides variant annotatoins in HGVS format [1] using RefSeq accessions
(typically NM and NP). It provides the analog to VEP's HGVSc and HGVSp
annotations, which use Ensembl ENST and ENSP accessions. This module
relies RefSeq data in the OtherFeatures database.
Converting ENST HGVS tags to RefSeq tags is confounded subtle differences
between exon structures for the same CDS. The plugin is intended to be
conservative by requiring exact matches of both CDS and exon structure
when converting variants.
[1] http://www.hgvs.org/mutnomen/
=head1 BUGS AND LIMITATIONS
=over
=item * RefSeq transcripts limited to those in the OtherFeatures database.
Archived RefSeq transcripts are not available.
=item * Discrepancies between RefSeq and GRCh37
Approximately 20% of RefSeqs differ from the GRCh37 due to substitution
(16.2%), insertion/deletion (3.5%), or both (1.2%) differences. [1]
Variants are annotated by difference with respect to the genome, not the
RefSeq transcript.
[1] MU2A--reconciling the genome and transcriptome to determine the
effects of base substitutions.
Garla, V., Kong, Y., Szpakowski, S., & Krauthammer, M. (2011).
Bioinformatics, 27(3), 416-8. doi:10.1093/bioinformatics/btq658
http://www.ncbi.nlm.nih.gov/pubmed/21149339
=item * Conservative selection of equivalent RefSeq transcripts.
Only NM transcripts which are exactly identical to the ENST transcript in
both CDS and exon structure are used. In the future, this might be
relaxed to exclude only regions of mismatch. Currently, this implies that
HGVS tags are not constructued when the RefSeq differs from the reference
geneome.
=back
=head1 CONTACT
Reece Hart <[email protected]>
=head1 LICENSE
This code and all rights to it are hereby donated to EnsEMBL project by
Reece Hart and Locus Development.
Copyright (c) 1999-2011 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.
This software is distributed under a modified Apache license.
For license details, please see
http://www.ensembl.org/info/about/code_licence.html
=cut
package RefSeqHGVS;
use strict;
use warnings;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin);
use Data::Dumper;
my %mt_cache; # cache of ENST -> NM mappings
sub version {
return '2.3';
}
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
my $reg = 'Bio::EnsEMBL::Registry';
$self->{ofsa} = $reg->get_adaptor('Human', 'otherfeatures', 'Slice')
or die "Failed to create transcript adaptor in human otherfeatures database\n";
$self->{ofta} = $reg->get_adaptor('Human', 'otherfeatures', 'Transcript')
or die "Failed to create transcript adaptor in human otherfeatures database\n";
return $self;
}
sub feature_types {
return ['Transcript'];
}
sub variant_feature_types {
return ['VariationFeature'];
}
sub get_header_info {
return {
'HGVSc-RefSeq' => "HGVSc using RefSeq transcripts accessions",
'HGVSp-RefSeq' => "HGVSp using RefSeq protein accessions",
};
}
sub run {
my ($self, $tva) = @_;
my $t = $tva->transcript;
my %rv;
my $ofsa = $self->{ofsa};
my $ofta = $self->{ofta};
# mappabale_transcripts have identical CDS and exon structure
my @mappable_transcripts = $self->get_mappable_transcripts($t);
my @transcript_acs = map { $_->display_id() } @mappable_transcripts;
my @protein_acs = grep {defined $_} map { $_->translation()->display_id() } @mappable_transcripts;
# substitute accessions for those in mappable transcripts/proteins
my @hgvsc_rs;
if (defined (my $hgvsc = $tva->hgvs_coding())) {
@hgvsc_rs = map {__subst_hgvs_ac($hgvsc ,$_)} @transcript_acs;
}
my @hgvsp_rs;
if (defined (my $hgvsp = $tva->hgvs_protein())) {
@hgvsp_rs = map {__subst_hgvs_ac($hgvsp,$_)} @protein_acs;
}
$rv{'HGVSc-RefSeq'} = join(';',@hgvsc_rs) if @hgvsc_rs;
$rv{'HGVSp-RefSeq'} = join(';',@hgvsp_rs) if @hgvsp_rs;
return \%rv;
}
sub get_mappable_transcripts {
my ($self,$t) = @_;
my $key = $t->display_id();
if (not exists $mt_cache{$key}) {
@{$mt_cache{$key}} = $self->_get_mappable_transcripts($t);
}
return @{$mt_cache{$key}};
}
############################################################################
## internal methods
sub _get_mappable_transcripts {
my ($self,$t) = @_;
my $cds_seq = $t->translateable_seq();
my $exon_structure = __tx_exon_str($t);
# cds_seq is empty for pseudogenes; no mapping possible
return () if ($cds_seq eq '');
# get overlapping transcripts from other features
my @tx = @{ $self->{ofta}->fetch_all_by_Slice($t->feature_Slice()) };
# limit to NMs with standard-format
@tx = grep { $_->display_id() =~ m/^NM_\d+\.\d+$/ } @tx;
# limit to transcripts with same CDS
@tx = grep { $_->translateable_seq() eq $cds_seq } @tx;
# limit to transcripts with identical chromosomal exon structure
@tx = grep { __tx_exon_str($_->transform('chromosome')) eq $exon_structure } @tx;
return @tx;
}
############################################################################
## function (not methods)
sub __tx_exon_str {
my $t = shift;
return join(',', map {sprintf('[%s,%s]', $_->start(), $_->end())}
@{ $t->get_all_translateable_Exons() } );
}
sub __subst_hgvs_ac {
my ($hgvs,$ac) = @_;
$hgvs =~ s/^[^:]+:/$ac:/;
return $hgvs;
}
sub __uniq {
return keys %{{ map {$_=>1} @_ }};
}
1;