forked from vikas0633/perl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblat2gff.pl
executable file
·195 lines (152 loc) · 7.95 KB
/
blat2gff.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/perl
# blat2gff.pl
use strict;
use Getopt::Long;
my ($swap,$addmatch);
my $matchtype=undef;
my $matchpart="match_part"; # no other SO choice ?
my $source="BLAT";
my %matchids;
my $prefixmatch="_mid";
my $gffver=3;
my $percentscore= 1; # default yes?
my $optok= GetOptions(
"swapquerytarget!",\$swap,
"source=s", \$source,
"version:i", \$gffver,
"matchtype:s", \$matchtype,
"prefixmatch=s", \$prefixmatch,
"percentscore!", \$percentscore,
);
die "
usage: blat2gff < blat.psl > blat.gff
options: -source $source -matchtype EST_match -swapQueryTarget -nopercentscore -version [3|2]
" unless($optok);
if(defined $matchtype) { $addmatch=1; $matchtype="match" unless($matchtype); }
print "##gff-version $gffver\n";
my $ispsl=0;
while(<>) {
$ispsl=1 ; ## if m/^psLayout/; # should do but want some slack here?
next unless(/^\d/);
chomp;
my @v= split "\t";
if(@v==22) { shift(@v);} # ucsc psl starts with a 'bin' field?
unless($ispsl and @v==21){ die "# error: doesnt look like psl format I know" ; }
my( $matchscore, $mismatches, $rep_matches, $orient,
$qid, $qsize, $qstart, $qend,
$tid, $tsize, $tstart, $tend,
$blocksizes, $qstarts, $tstarts,
)= @v[0..2, 8..16, 18..20];
$qstart++; $tstart++; # move to 1-origin
if($swap) {
($tid, $tsize, $tstart, $tend, $qid, $qsize, $qstart, $qend,)
= ($qid, $qsize, $qstart, $qend, $tid, $tsize, $tstart, $tend,);
($qstarts,$tstarts)= ($tstarts,$qstarts);
}
my $matchid = $qid .$prefixmatch. ++$matchids{$qid};
my @blocksizes = split( /,/ , $blocksizes );
my @qstarts = split( /,/ , $qstarts );
my @tstarts = split( /,/ , $tstarts );
my $npart = @qstarts;
$matchscore = sprintf "%.2f", (100 * ( $matchscore ) / $qsize) if($percentscore);
## BioPerl Tools/Blat.pm does this, which seems wrong: $matchscore + $mismatches + $rep_matches
if($addmatch) {
## with - orient need to back-calculate Query location (gff target)
if($orient eq '-') { ($qstart,$qend)= ($qend,$qstart); } # print reversed for Target
gffput( 0, $tid, $source, $matchtype, $tstart, $tend, $matchscore, $orient, ".",
$matchid, "$qid $qstart $qend");
$matchscore="."; # dont duplicate in match_parts; score is for entire feature
}
for(my $p=0; $p<$npart; $p++) {
my($qstart,$tstart,$len)= ($qstarts[$p], $tstarts[$p], $blocksizes[$p]);
my $qend= $qstart+$len;
my $tend= $tstart+$len;
$qstart++; $tstart++; # move to 1-origin
if($orient eq '-') { # reverse all exons for Target
$len = $blocksizes[$npart-1-$p];
$qend = $qstarts[$npart-1-$p]; # lower
$qstart= $qend + $len; # higher
$qend++;
}
gffput( 1, $tid, $source, $matchpart, $tstart, $tend, $matchscore, $orient, ".",
$matchid, "$qid $qstart $qend");
}
}
sub gffput {
my($part,$r,$s,$t,$b,$e,$p,$o,$f,$id,$tg)=@_;
if($gffver<3){
my $IDtag="gene" ;
print join("\t", $r, $s, $t, $b, $e, $p, $o, $f, "$IDtag \"$id\"; Target \"$tg\""),"\n";
}else{
my $IDtag=($part) ? "Parent" : "ID";
print join("\t", $r, $s, $t, $b, $e, $p, $o, $f, "$IDtag=$id;Target=$tg"),"\n";
}
}
=head1 ABOUT blat2gff
Convert blat psl to gff v3, preserving exon/match_part info, and
separate distinct matches for same query id onto target. This is
the usual case for EST/mRNA matches onto genome backbone, with multiple
duplicate but distinct genes, each with exon structure. Blat provides
this info and it is important for genome analyses.
Note that bioperl SearchIO/psl.pm doesn't do this, it just uses the psl
start,end fields, and it lumps any same query-id into one common match group,
even for distinct locations (separate rows from psl output).
I missed the other BioPerl Bio/Tools/Blat.pm which does the qstarts,tstarts exon
calculations. It however gives a common ID and superfeature to all same-query-id
matches, which is a problem (same as below combining multiple gene hits into one).
Someone should rewrite BioPerl SearchIO/psl.pm to use the Tools/Blat.pm package.
=head1 USAGE
$td/blat2gff.pl < dgri-gnoest.blat > dgri-gnoest.blat2gff
=head1 AUTHOR
don gilbert, 2007, [email protected]
=item Blat/PSL header
0..8
match mis- rep. N's Q gap Q gap T gap T gap strand
match match count bases count bases
9..16
Q Q Q Q T T T T
name size start end name size start end
17..20
block blockSizes qStarts tStarts
count
Parse qStarts, tStarts for match_part structure
=item examples
blat -fastMap $em/dgri1/dgri_caf060210.fa dgri-est.fa dgri-gnoest.blat
cat dgri-gnoest.blat | sort -k14,14 -k10,10 -k16,16n | more
grep EB637050.1 dgri-gnoest.blat
563 0 0 0 0 0 3 1912 -
gi|93002893|gb|EB637050.1 830 72 635 scaffold_15110 24565398 10715419 10717894
4 162,231,97,73, 195,357,588,685, 10715419,10716978,10717307,10717821,
grep EB637050.1 dgri-gnoest.blat | perl -pe's/gi.\d+.(gb.\w+).\d+.//;' | $td/blat2gff.pl -match EST_match
scaffold_15110 BLAT EST_match 10715419 10717894 563 - . ID=EB637050_mid1;Target=EB637050 72 635
scaffold_15110 BLAT match_part 10715419 10715581 563 - . Parent=EB637050_mid1;Target=EB637050 195 357
scaffold_15110 BLAT match_part 10716978 10717209 563 - . Parent=EB637050_mid1;Target=EB637050 357 588
scaffold_15110 BLAT match_part 10717307 10717404 563 - . Parent=EB637050_mid1;Target=EB637050 588 685
scaffold_15110 BLAT match_part 10717821 10717894 563 - . Parent=EB637050_mid1;Target=EB637050 685 758
# example tandem match pair
grep EB634440.1 dgri-gnoest.blat
670 .. + gi|93000283|gb|EB634440.1|EB634440 753 0 683 scaffold_14830 6267026 2489511 2490194 2 398,272, 0,411, 2489511,2489922,
683 .. - gi|93000283|gb|EB634440.1|EB634440 753 0 683 scaffold_14830 6267026 2484805 2485488 1 683, 70, 2484805,
646 .. - gi|93000283|gb|EB634440.1|EB634440 753 0 683 scaffold_14830 6267026 2480511 2481194 3 272,71,305, 70,355,448, 2480511,2480796,2480889,
grep EB634440.1 dgri-gnoest.blat | perl -pe's/gi.\d+.(gb.\w+).\d+.//;' | \
$td/blat2gff.pl -match EST_match
##gff-version 3
scaffold_14830 BLAT EST_match 2489512 2490194 670 + . ID=EB634440_mid1;Target=EB634440 1 683
scaffold_14830 BLAT match_part 2489512 2489909 670 + . Parent=EB634440_mid1;Target=EB634440 1 398
scaffold_14830 BLAT match_part 2489923 2490194 670 + . Parent=EB634440_mid1;Target=EB634440 412 683
scaffold_14830 BLAT EST_match 2484806 2485488 683 - . ID=EB634440_mid2;Target=EB634440 1 683
scaffold_14830 BLAT match_part 2484806 2485488 683 - . Parent=EB634440_mid2;Target=EB634440 71 753
scaffold_14830 BLAT EST_match 2480512 2481194 646 - . ID=EB634440_mid3;Target=EB634440 1 683
scaffold_14830 BLAT match_part 2480512 2480783 646 - . Parent=EB634440_mid3;Target=EB634440 71 342
scaffold_14830 BLAT match_part 2480797 2480867 646 - . Parent=EB634440_mid3;Target=EB634440 356 426
scaffold_14830 BLAT match_part 2480890 2481194 646 - . Parent=EB634440_mid3;Target=EB634440 449 753
=item equivalent Bioperl bp_search2gff3.pl
grep EB634440.1 dgri-gnoest.blat | perl -pe's/gi.\d+.(gb.\w+).\d+.//;' | \
$in/lib/Bio/script/bp_search2gff3.pl -f psl -m -ver 3 -t hit -i -
##gff-version 3
scaffold_14830 BLAT match_part 2489512 2490194 . + 0 Parent=EB634440;Target=Sequence:EB634440 1 683
scaffold_14830 BLAT match_part 2484806 2485488 . - 0 Parent=EB634440;Target=Sequence:EB634440 1 683
scaffold_14830 BLAT match_part 2480512 2481194 . - 0 Parent=EB634440;Target=Sequence:EB634440 1 683
scaffold_14830 BLAT match 2480512 2490194 . - . ID=EB634440
cat dgri-gnoest.blat | sort -k14,14 -k10,10 -k16,16n | more
=cut