compute_codonRatio.sh

#!/bin/bash
# 
# script of the https://github.com/i2bc/n_terminal_lysine deposit
# compute codon ratios of interest of cds sequences 
#
# rely on cusp from the emboss tools package 
# use ratio_for_2codons.awk, ratio_for_Rcodons.awk
#
# fix: 
# ${1} integer : size in nt of the sequences to analyse
# ${2} integer : minimal threshold for denominator to compute a ratio, "NA" otherwise
# ${list_sp} chain : list of species identifier giving acces to the input nucleotidic sequence file Tmp/${SP}_${1}nt_cfg.fna
#
# input : 
# - *.list: list (one ID by line) of identifiers of genome sequence (eg. GCA_000005845.2 from ncbi genbank)
# - Tmp/${SP}_${1}nt_cfg.fna: multiple sequence fna files, format: one header line followed by ONE sequence line ; Tmp/${SP}_${1}nt_cfg.fna can be generated by the compute.sh, see https://github.com/i2bc/n_terminal_lysine deposit
#
# temporary outputs : 
# - Tmp/cut_${SP}_${i}.fna: codon extraction by nucleotidic position range
# - Tmp/${SP}_${i}.cusp: codon usage table from the codons of Tmp/cut_${SP}_${i}.fna
# - Tmp/${SP}_${c}.txt: column "Number" of Tmp/${SP}_${i}.cusp for codon ${c} chosen in AAA AAG TAT TAC CAT CAC CAA CAG AAT AAC GAT GAC GAA GAG AGA AGG CGA CGC CGG CGT
# - Tmp/${SP}_ratio_s${d}.txt: incremental table of codon number and ratio
# - Tmp/ratio_${SP}_s${d}_: parts of the table of codon number and ratio depending of the size (in nt) of the sequence to analyse
# output : 
# - ${SP}_codon.txt: table of codon number and ratio (33 columns x ${1}/3 lines)
#
# command line eg.:
#   list_sp="GCA_000005845.2"
#   bash n_terminal_lysine/compute_ratios.sh 60 10
#
mkdir -p Tmp
position_range=`seq 2 ${1} | awk '{if(NR%3==0){if($1<=9){if($1+2<=9){print "0"$1"-0"$1+2}else{print "0"$1"-"$1+2}}else{print $1"-"$1+2}}}'`
for SP in `cut -f1 ${list_sp}` ; do
    # extract codon by position
    for i in ${position_range} ; do
         # cut -c ${i} Tmp/${SP}_150nt_cfg.fna | awk '{if(NR%2==0){print}else{print ">seq_"NR/2+0.5}}' > Tmp/cut_${SP}_${i}.fna ; # for analysis on 150 nt
         cut -c ${i} Tmp/${SP}_${1}nt_cfg.fna | awk '{if(NR%2==0){print}else{print ">seq_"NR/2+0.5}}' > Tmp/cut_${SP}_${i}.fna ;
         cusp -sequence Tmp/cut_${SP}_${i}.fna -outfile Tmp/${SP}_${i}.cusp ;
    done;
    # extract the Number column in file.cusp for some codons:
    for c in AAA AAG TAT TAC CAT CAC CAA CAG AAT AAC GAT GAC GAA GAG AGA AGG CGA CGC CGG CGT ; do
       echo ${c} > Tmp/${SP}_${c}.txt;
       for i in ${position_range} ; do
             grep ${c} Tmp/${SP}_${i}.cusp | awk '{print $5}' >> Tmp/${SP}_${c}.txt;
       done;
    done;
    # compute ratios:
    for d in ${2} ; do
        rm Tmp/${SP}_ratio_s${d}.txt ;
        for f in "Tmp/${SP}_AAA.txt Tmp/${SP}_AAG.txt" ; do
            paste ${f} | awk -v md=${d} -f n_terminal_lysine/ratio_for_Kcodons.awk >> Tmp/${SP}_ratio_s${d}.txt ;
        done ; 
        for f in "Tmp/${SP}_TAT.txt Tmp/${SP}_TAC.txt" "Tmp/${SP}_CAT.txt Tmp/${SP}_CAC.txt" "Tmp/${SP}_CAA.txt Tmp/${SP}_CAG.txt" "Tmp/${SP}_AAT.txt Tmp/${SP}_AAC.txt" "Tmp/${SP}_GAT.txt Tmp/${SP}_GAC.txt" "Tmp/${SP}_GAA.txt Tmp/${SP}_GAG.txt" ; do
            paste ${f} | awk -v md=${d} -f n_terminal_lysine/ratio_for_2codons.awk >> Tmp/${SP}_ratio_s${d}.txt ;
        done;
        for f in "Tmp/${SP}_AGA.txt Tmp/${SP}_AGG.txt Tmp/${SP}_CGA.txt Tmp/${SP}_CGC.txt Tmp/${SP}_CGG.txt Tmp/${SP}_CGT.txt" ; do
            paste ${f} | awk -v md=${d} -f n_terminal_lysine/ratio_for_Rcodons.awk >> Tmp/${SP}_ratio_s${d}.txt ;
        done ;
        positions=`echo "" | awk -v pos=${1} 'BEGIN{print int(pos/3)}'`
        split -l ${positions} Tmp/${SP}_ratio_s${d}.txt Tmp/ratio_${SP}_s${d}_;
        paste Tmp/ratio_${SP}_* > ${SP}_codon.txt;
    done ;
done