forked from caaswxb/SRY
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SRY_k
134 lines (109 loc) · 5.26 KB
/
SRY_k
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/bin/bash
script_abs=$(readlink -f "$0")
script_dir=$(dirname $script_abs)
export PATH=$PATH:$script_dir/script
export LC_ALL=C
usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>* Specific-kmer file\n-l <string>* Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>* Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int> K-mer length (default: 21)\n-p <int> CPU number (default: 1)\n-h Help and exit."
printHelpAndExit() {
echo -e "$usageHelp"
exit;
}
while getopts "i:l:g:k:p:h" opt; do
case $opt in
i) KMER=$OPTARG ;;
l) LREAD=$OPTARG ;;
g) GSIZE=$OPTARG ;;
k) KLEN=$OPTARG ;;
p) CPU=$OPTARG ;;
h) printHelpAndExit 0;;
esac
done
if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
printHelpAndExit
exit
fi
if [ -z "$KLEN" ];then
KLEN=21
fi
if [ -z "$CPU" ];then
CPU=1
fi
if [ ! -d "output" ];then
mkdir output
fi
#judge if the softwares needed are installed.
if ! type jellyfish >/dev/null 2>&1; then
echo 'jellyfish is not installed!'
exit
fi
if ! type samtools >/dev/null 2>&1; then
echo 'samtools is not installed!'
exit
fi
if ! type seqtk >/dev/null 2>&1; then
echo 'seqtk is not installed!'
exit
fi
if ! type parallel >/dev/null 2>&1; then
echo 'parallel is not installed!'
exit
fi
#obtain specific-Y kmers
echo "***********************************Preparation*************************************"
#calculate specific k-mer density
NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
echo "The number of specific k-mers is $NUM."
CORRACT_RATE=0.85 #the correct rate of long reads from third sequencing
DENSITY=`perl -e '$c=sprintf ("%.1f",'${CORRACT_RATE}'**'$KLEN'*'$NUM'*1000/('$GSIZE'*1000000)); print $c;'`
echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of long reads."
echo -e "\n***********************************Process*****************************************"
#pre-process the input file
create_reverse_complement.pl $KMER > output/kmer_rc.txt
LREAD=${LREAD//,/ }
for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
samtools faidx output/changeid.fa
#split kmer
PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
split -l 200000 output/kmer_rc.txt output/x
echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."
#sort kmer
for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
parallel -j $CPU <temp.sh
echo "Sort kmer has been done -> output/x*.st"
#search long reads contained specific kmers
for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
parallel -j $CPU <temp.sh
echo "Kmer searching on long reads has been done -> output/x*.kmercount"
#obtain the position of specific kmers on each read
for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmer position on long reads has been done -> output/x*.loci"
#get the extended sequences including specific kmers
for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"
#acquire the kmers in sequences
for f in `ls output/x* | grep -v '\.'`;do echo "jellyfish count -C -m $KLEN -s 10000000 -o $f.loci.jf $f.loci.fa && jellyfish dump -ct $f.loci.jf | sort -k1,1 > $f.loci.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"
#filter kmers of last step
for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"
#the relationship between kmer and read id
for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN | sort -k1,1 -k2,2 -u > $lf.loci.kmer2id";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"
#get the id
for li in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $li.loci.kmer2id:dup=Y $li.need.kmer | cut -f 2 > $li.seqid";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
rm temp.sh
#get reads
echo -e "\n***************************************Final****************************************"
cat output/x*.seqid >output/ALL.seqid
count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
echo "DONE!"
echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa"
echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa"