SRY_k

#!/bin/bash

script_abs=$(readlink -f "$0")
script_dir=$(dirname $script_abs)
export PATH=$PATH:$script_dir/script
export LC_ALL=C

usageHelp="\n#################### Sorting pacbio or nanopore reads based on kmer file #############################\n\nUsage: ./${0##*/} (parameters with * must be supplied!)\n\n-i <string>*       Specific-kmer file\n-l <string>*       Long-read files of targeted genomes with comma separated (support for both fa and fq)\n-g <number>*       Approximate genome size of targeted Y/W chromosome. such as 52 for human Y(The unit is Megabase)\n-k <int>           K-mer length (default: 21)\n-p <int>           CPU number (default: 1)\n-h                 Help and exit."
printHelpAndExit() {
	echo -e "$usageHelp"
	exit;
}

while getopts "i:l:g:k:p:h" opt; do
	case $opt in
		i) KMER=$OPTARG ;;
		l) LREAD=$OPTARG ;;
		g) GSIZE=$OPTARG ;;
		k) KLEN=$OPTARG ;;
		p) CPU=$OPTARG ;;
		h) printHelpAndExit 0;;
	esac
done

if [ -z "$KMER" ] || [ -z "$LREAD" ] || [ -z "$GSIZE" ];then
	printHelpAndExit
	exit
fi

if [ -z "$KLEN" ];then
	KLEN=21
fi

if [ -z "$CPU" ];then
        CPU=1
fi

if [ ! -d "output" ];then
	mkdir output
fi

#judge if the softwares needed are installed.
if ! type jellyfish >/dev/null 2>&1; then
        echo 'jellyfish is not installed!'
        exit
fi

if ! type samtools >/dev/null 2>&1; then
        echo 'samtools is not installed!'
        exit
fi

if ! type seqtk >/dev/null 2>&1; then
        echo 'seqtk is not installed!'
        exit
fi

if ! type parallel >/dev/null 2>&1; then
        echo 'parallel is not installed!'
        exit
fi

#obtain specific-Y kmers

echo "***********************************Preparation*************************************"
#calculate specific k-mer density
NUM=`wc -l $KMER |awk 'BEGIN{ORS=""}{print $1}'`
echo "The number of specific k-mers is $NUM."
CORRACT_RATE=0.85 #the correct rate of long reads from third sequencing
DENSITY=`perl -e '$c=sprintf ("%.1f",'${CORRACT_RATE}'**'$KLEN'*'$NUM'*1000/('$GSIZE'*1000000)); print $c;'`
echo "The kmer length is $KLEN;Average specific k-mer density is $DENSITY per kb of long reads."

echo -e "\n***********************************Process*****************************************"

#pre-process the input file
create_reverse_complement.pl $KMER > output/kmer_rc.txt
LREAD=${LREAD//,/ }
for f in $( ls $LREAD ); do seqtk seq -CA $f; done | tee >(grep '>' - |awk '{gsub(/^>/,"");print $0"\t"NR}'> output/origin2newid.txt) | seqtk rename - > output/changeid.fa
samtools faidx output/changeid.fa

#split kmer
PART=`perl -e '$p=(int('$NUM'/100000)+1);print $p;'`
split -l 200000 output/kmer_rc.txt output/x
echo "Dividing the kmer file into $PART parts to decrease the memory. xaa,xab,xac..."

#sort kmer
for k in `ls output/x* | grep -v '\.'`;do echo "sort $k >$k.st";done > temp.sh
parallel -j $CPU <temp.sh
echo "Sort kmer has been done -> output/x*.st"

#search long reads contained specific kmers
for s in `ls output/x* | grep -v '\.'`;do echo "grep -F -b -n -o -f $s output/changeid.fa > $s.kmercount";done >temp.sh
parallel -j $CPU <temp.sh
echo "Kmer searching on long reads has been done -> output/x*.kmercount"

#obtain the position of specific kmers on each read
for c in `ls output/x* | grep -v '\.'`;do echo "get_loci.pl output/changeid.fa.fai $c.kmercount $KLEN > $c.loci";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmer position on long reads has been done -> output/x*.loci"

#get the extended sequences including specific kmers
for l in `ls output/x* | grep -v '\.'`;do echo "seqtk subseq output/changeid.fa $l.loci >$l.loci.fa";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining extended sequences of kmer on long reads has been done -> output/x*.loci.fa"

#acquire the kmers in sequences
for f in `ls output/x* | grep -v '\.'`;do echo "jellyfish count -C -m $KLEN -s 10000000 -o $f.loci.jf $f.loci.fa && jellyfish dump -ct $f.loci.jf | sort -k1,1 > $f.loci.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining kmers of extended sequences has been done -> output/x*.loci.kmer"

#filter kmers of last step
for lk in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $lk.loci.kmer $lk.st |cut -f 1 > $lk.need.kmer";done >temp.sh
parallel -j $CPU <temp.sh
echo "Searching specific kmers from last step has been done -> output/x*.need.kmer"

#the relationship between kmer and read id
for lf in `ls output/x* | grep -v '\.'`;do echo "kmer2id.pl $lf.loci.fa $KLEN | sort -k1,1 -k2,2 -u > $lf.loci.kmer2id";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the relationship bewteen kmer and read has been done -> output/x*.loci.kmer2id"

#get the id
for li in `ls output/x* | grep -v '\.'`;do echo "filterx -k s $li.loci.kmer2id:dup=Y $li.need.kmer | cut -f 2 > $li.seqid";done >temp.sh
parallel -j $CPU <temp.sh
echo "Obtaining the IDs of reads including specific kmer -> output/x*.seqid"
rm temp.sh

#get reads
echo -e "\n***************************************Final****************************************"
cat output/x*.seqid >output/ALL.seqid
count_density.pl output/ALL.seqid output/changeid.fa.fai > output/ALL.density
select_reads.pl output/ALL.density $DENSITY output/origin2newid.txt output/changeid.fa
echo "DONE!"
echo "The final filtered reads file of Y/W chromosome -> output/candidate_target.fa"
echo "The final filtered reads file of autosome and X/Z chromosome -> output/candidate_other.fa"