-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpslScore.pl
93 lines (86 loc) · 3.18 KB
/
pslScore.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env perl
#
# pslScore.pl - a reproduction of the C library calculations from
# src/lib/psl.c
#
use strict;
use warnings;
my $argc = scalar(@ARGV);
if ($argc < 1) {
printf STDERR "pslScore - calculate web blat score from psl files\n";
printf STDERR "usage:\n";
printf STDERR " pslScore.pl <file.psl> [moreFiles.psl]\n";
printf STDERR "options:\n";
printf STDERR " none at this time\n";
exit 255;
}
# is psl a protein psl (are it's blockSizes and scores in protein space)
# 1 == not protein, return 3 == protein
sub pslIsProtein($$$$$$$) {
my ($blockCount, $strand, $tStart, $tEnd, $tSize, $tStarts, $blockSizes) = @_;
my @starts = split(',', $tStarts);
my @sizes = split(',', $blockSizes);
my $lastBlock = $blockCount - 1;
my $answer = 1;
if ($strand =~ m/^\+/) {
my $test = $starts[$lastBlock] + (3 * $sizes[$lastBlock]);
$answer = 3 if ($tEnd == $test);
} elsif ($strand =~ m/^-/) {
my $test = $tSize - ($starts[$lastBlock] + (3 * $sizes[$lastBlock]));
$answer = 3 if ($tStart == $test);
}
return $answer;
} # pslIsProtein()
sub pslCalcMilliBad($$$$$$$$$$$) {
my ($sizeMul, $qEnd, $qStart, $tEnd, $tStart, $qNumInsert, $tNumInsert, $matches, $repMatches, $misMatches, $isMrna) = @_;
my $milliBad = 0;
my $qAliSize = $sizeMul * ($qEnd - $qStart);
my $tAliSize = $tEnd - $tStart;
my $aliSize = $qAliSize;
$aliSize = $tAliSize if ($tAliSize < $qAliSize);
if ($aliSize <= 0) {
return $milliBad;
}
my $sizeDif = $qAliSize - $tAliSize;
if ($sizeDif < 0) {
if ($isMrna) {
$sizeDif = 0;
} else {
$sizeDif = -$sizeDif;
}
}
my $insertFactor = $qNumInsert;
if (0 == $isMrna) {
$insertFactor += $tNumInsert;
}
my $total = ($sizeMul * ($matches + $repMatches + $misMatches));
if ($total != 0) {
my $roundAwayFromZero = 3*log(1+$sizeDif);
if ($roundAwayFromZero < 0) {
$roundAwayFromZero = int($roundAwayFromZero - 0.5);
} else {
$roundAwayFromZero = int($roundAwayFromZero + 0.5);
}
$milliBad = (1000 * ($misMatches*$sizeMul + $insertFactor + $roundAwayFromZero)) / $total;
}
return $milliBad;
} # sub pslCalcMilliBad()
while (my $file = shift) {
if ($file =~ m/.gz$/) {
open (FH, "zcat $file|") or die "can not read $file";
} else {
open (FH, "<$file") or die "can not read $file";
}
while (my $line = <FH>) {
next if ($line =~ m/^#/ or $line !~ /^\d/);
chomp $line;
my ($matches, $misMatches, $repMatches, $nCount, $qNumInsert, $qBaseInsert, $tNumInsert, $tBaseInsert, $strand, $qName, $qSize, $qStart, $qEnd, $tName, $tSize, $tStart, $tEnd, $blockCount, $blockSizes, $qStarts, $tStarts) = split('\t', $line);
my $sizeMul = pslIsProtein($blockCount, $strand, $tStart, $tEnd, $tSize, $tStarts, $blockSizes);
my $pslScore = $sizeMul * ($matches + ( $repMatches >> 1) ) -
$sizeMul * $misMatches - $qNumInsert - $tNumInsert;
my $milliBad = int(pslCalcMilliBad($sizeMul, $qEnd, $qStart, $tEnd, $tStart, $qNumInsert, $tNumInsert, $matches, $repMatches, $misMatches, 1));
my $percentIdentity = 100.0 - $milliBad * 0.1;
printf "%s\t%d\t%d\t%s\t%d\t%d\t%.2f\t%d\t%.2f\n", $tName, $tStart, $tEnd, $qName, $qStart, $qEnd, ($qEnd-$qStart)/$qSize * 100, $pslScore, $percentIdentity;
}
close (FH);
}