-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtaxonomic_specificity.sh
92 lines (79 loc) · 3.32 KB
/
taxonomic_specificity.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/bin/sh
#Count the number of bases aligned to the correct genus, species, and strain, as well as the incorrect genus
count_bases(){
fp=$1
term=$2
type=$3
[[ "$type" -eq "blasr" ]] && readcol=1
[[ "$type" -eq "last" ]] && readcol=7
for read in $(egrep $term $fp | tr ' ' '\t' | cut -f $readcol | sort -u); do
echo $read
#[[ "$type" -eq "blasr" ]] && egrep $term $fp | grep $read | tr ' ' '\t' | cut -f 6,7 | awk '{print $2-$1}'|sort -u | sort -nrk1 | head -1
#[[ "$type" -eq "last" ]] && egrep $term $fp | grep $read | tr ' ' '\t' | cut -f 12 | sort -nrk1 | head -1
done #| tr -d '-' | awk '{sum+=$1}END{print sum}'
}
count_bases_absent(){
fp=$1
term=$2
type=$3
[[ "$type" -eq "blasr" ]] && readcol=1
[[ "$type" -eq "last" ]] && readcol=7
for read in $(egrep -v $term $fp | grep -v readname | tr ' ' '\t' | cut -f $readcol | sort -u); do
[[ "$type" -eq "blasr" ]] && egrep -v $term $fp | grep $read | tr ' ' '\t' | cut -f 6,7 | awk '{print $3-$2}' | sort -nrk1 | head -1
[[ "$type" -eq "last" ]] && egrep -v $term $fp | grep $read | tr ' ' '\t' | cut -f 12 | sort -nrk1 | head -1
done | tr -d '-' | awk '{sum+=$1}END{print sum}'
}
count_reads(){
fp=$1
term=$2
type=$3
[[ "$type" -eq "blasr" ]] && egrep $term $fp | tr ' ' '\t' | cut -f 1 | sort -u | wc -l
[[ "$type" -eq "last" ]] && egrep $term $fp | tr ' ' '\t' | cut -f 7 | sort -u | wc -l
}
count_reads_absent(){
fp=$1
term=$2
type=$3
[[ "$type" -eq "blasr" ]] && readcol=1
[[ "$type" -eq "last" ]] && readcol=7
egrep -v $term $fp | fgrep -v readname | tr ' ' '\t' | cut -f $readcol | sort -u | wc -l
}
longest_alignment(){
fp=$1
term=$2
type=$3
egrep $term $fp |tr ' ' '\n' | tr '\t' '\n' | grep channel | sort -u | while read readname; do
[[ $type == "blasr" ]] && egrep $term $fp | grep $readname | tr ' ' '\t' | cut -f 6,7 | awk '{print $2-$1}' | sort -nrk1 | head -1 | tr -d '-'
[[ $type == "last" ]] && egrep $term $fp | grep $readname | cut -f 12 | sort -nrk1 | head -1
done
}
longest_alignment_absent(){
fp=$1
term=$2
type=$3
egrep -v $term $fp |tr ' ' '\n' | tr '\t' '\n' | grep channel | sort -u | while read readname; do
[[ $type == "blasr" ]] && egrep -v $term $fp | grep $readname | tr ' ' '\t' | cut -f 6,7 | awk '{print $2-$1}' | sort -nrk1 | head -1 | tr -d '-'
[[ $type == "last" ]] && egrep -v $term $fp | grep $readname | cut -f 12 | sort -nrk1 | head -1
done
}
read_num_and_align_len(){
echo "$#/$(for var in "$@"; do echo $var; done | awk '{sum+=$1}END{print sum}')"
}
taxon(){
fp=$1
type=$4
genus_dat=$(longest_alignment $fp $2 $type)
genus=$(read_num_and_align_len $genus_dat)
species_dat=$(longest_alignment $fp $3 $type)
species=$(read_num_and_align_len $species_dat)
nongenus_dat=$(longest_alignment_absent $fp $2 $type)
nongenus=$(read_num_and_align_len $nongenus_dat)
echo "$fp $genus $species $nongenus"
}
for p in last blasr; do
echo "Dataset GenusReads/GenusBases SpeciesReads/SpeciesBases NonGenusReads/NonGenusBases" > taxonomic_specificity.$p.tsv
taxon ecoli_reads.fa.$p.read.stats 'Escherichia|Shigella' coli $p >> taxonomic_specificity.$p.tsv
taxon MVA_reads.fa.$p.read.stats 'Vaccinia|Cowpox' 'MVA|Cowpox' $p >> taxonomic_specificity.$p.tsv
taxon lister_reads.fa.$p.read.stats Vaccinia Lister $p >> taxonomic_specificity.$p.tsv
taxon cowpox_reads.fa.$p.read.stats Cowpox Cowpox $p >> taxonomic_specificity.$p.tsv
done