-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_codonRatio.sh
63 lines (63 loc) · 3.64 KB
/
compute_codonRatio.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/bash
#
# script of the https://github.com/i2bc/n_terminal_lysine deposit
# compute codon ratios of interest of cds sequences
#
# rely on cusp from the emboss tools package
# use ratio_for_2codons.awk, ratio_for_Rcodons.awk
#
# fix:
# ${1} integer : size in nt of the sequences to analyse
# ${2} integer : minimal threshold for denominator to compute a ratio, "NA" otherwise
# ${list_sp} chain : list of species identifier giving acces to the input nucleotidic sequence file Tmp/${SP}_${1}nt_cfg.fna
#
# input :
# - *.list: list (one ID by line) of identifiers of genome sequence (eg. GCA_000005845.2 from ncbi genbank)
# - Tmp/${SP}_${1}nt_cfg.fna: multiple sequence fna files, format: one header line followed by ONE sequence line ; Tmp/${SP}_${1}nt_cfg.fna can be generated by the compute.sh, see https://github.com/i2bc/n_terminal_lysine deposit
#
# temporary outputs :
# - Tmp/cut_${SP}_${i}.fna: codon extraction by nucleotidic position range
# - Tmp/${SP}_${i}.cusp: codon usage table from the codons of Tmp/cut_${SP}_${i}.fna
# - Tmp/${SP}_${c}.txt: column "Number" of Tmp/${SP}_${i}.cusp for codon ${c} chosen in AAA AAG TAT TAC CAT CAC CAA CAG AAT AAC GAT GAC GAA GAG AGA AGG CGA CGC CGG CGT
# - Tmp/${SP}_ratio_s${d}.txt: incremental table of codon number and ratio
# - Tmp/ratio_${SP}_s${d}_: parts of the table of codon number and ratio depending of the size (in nt) of the sequence to analyse
# output :
# - ${SP}_codon.txt: table of codon number and ratio (33 columns x ${1}/3 lines)
#
# command line eg.:
# list_sp="GCA_000005845.2"
# bash n_terminal_lysine/compute_ratios.sh 60 10
#
mkdir -p Tmp
position_range=`seq 2 ${1} | awk '{if(NR%3==0){if($1<=9){if($1+2<=9){print "0"$1"-0"$1+2}else{print "0"$1"-"$1+2}}else{print $1"-"$1+2}}}'`
for SP in `cut -f1 ${list_sp}` ; do
# extract codon by position
for i in ${position_range} ; do
# cut -c ${i} Tmp/${SP}_150nt_cfg.fna | awk '{if(NR%2==0){print}else{print ">seq_"NR/2+0.5}}' > Tmp/cut_${SP}_${i}.fna ; # for analysis on 150 nt
cut -c ${i} Tmp/${SP}_${1}nt_cfg.fna | awk '{if(NR%2==0){print}else{print ">seq_"NR/2+0.5}}' > Tmp/cut_${SP}_${i}.fna ;
cusp -sequence Tmp/cut_${SP}_${i}.fna -outfile Tmp/${SP}_${i}.cusp ;
done;
# extract the Number column in file.cusp for some codons:
for c in AAA AAG TAT TAC CAT CAC CAA CAG AAT AAC GAT GAC GAA GAG AGA AGG CGA CGC CGG CGT ; do
echo ${c} > Tmp/${SP}_${c}.txt;
for i in ${position_range} ; do
grep ${c} Tmp/${SP}_${i}.cusp | awk '{print $5}' >> Tmp/${SP}_${c}.txt;
done;
done;
# compute ratios:
for d in ${2} ; do
rm Tmp/${SP}_ratio_s${d}.txt ;
for f in "Tmp/${SP}_AAA.txt Tmp/${SP}_AAG.txt" ; do
paste ${f} | awk -v md=${d} -f n_terminal_lysine/ratio_for_Kcodons.awk >> Tmp/${SP}_ratio_s${d}.txt ;
done ;
for f in "Tmp/${SP}_TAT.txt Tmp/${SP}_TAC.txt" "Tmp/${SP}_CAT.txt Tmp/${SP}_CAC.txt" "Tmp/${SP}_CAA.txt Tmp/${SP}_CAG.txt" "Tmp/${SP}_AAT.txt Tmp/${SP}_AAC.txt" "Tmp/${SP}_GAT.txt Tmp/${SP}_GAC.txt" "Tmp/${SP}_GAA.txt Tmp/${SP}_GAG.txt" ; do
paste ${f} | awk -v md=${d} -f n_terminal_lysine/ratio_for_2codons.awk >> Tmp/${SP}_ratio_s${d}.txt ;
done;
for f in "Tmp/${SP}_AGA.txt Tmp/${SP}_AGG.txt Tmp/${SP}_CGA.txt Tmp/${SP}_CGC.txt Tmp/${SP}_CGG.txt Tmp/${SP}_CGT.txt" ; do
paste ${f} | awk -v md=${d} -f n_terminal_lysine/ratio_for_Rcodons.awk >> Tmp/${SP}_ratio_s${d}.txt ;
done ;
positions=`echo "" | awk -v pos=${1} 'BEGIN{print int(pos/3)}'`
split -l ${positions} Tmp/${SP}_ratio_s${d}.txt Tmp/ratio_${SP}_s${d}_;
paste Tmp/ratio_${SP}_* > ${SP}_codon.txt;
done ;
done