-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_x_nt.awk
63 lines (63 loc) · 1.45 KB
/
get_x_nt.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
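# extract the first n nucleotides of each sequence of a FASTA (.fna) file
#
# change 1st codon to ATG
#   NOTE(review): no rule in this script rewrites the first codon - confirm
#   whether this step is still intended
# exclude sequences whose header is tagged "pseudo=true"
# exclude sequences whose length is < the input size value
#
# e.g. of command line:
# rm get_x_nt.log ; awk -v size=60 -f get_x_nt.awk *.fna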
#
# Start-up: reset the counters and report buffers, then validate the
# mandatory "size" variable (number of nucleotides to keep, given on the
# command line with -v size=N).
BEGIN{
    # The counter must be spelled "tooshort" (all lowercase): that is the
    # name the other rules increment and print. awk variable names are
    # case-sensitive, so initialising "tooShort" left the real counter
    # unset and the log showed an empty count when nothing was too short.
    tooshort=0; pseudo=0 ;
    # seq: nucleotides collected for the current record
    # pseu / shor: newline-separated names of the pseudo / too-short records
    seq=""; pseu="" ; shor="";
    # size+0 forces a numeric test, so a missing, non-numeric or negative
    # value is rejected as well as an explicit 0.
    if(size+0 <= 0){
        # Diagnostics go to stderr so they never end up in the sequence
        # output written on stdout.
        print "abort due to no fixed size (need -v size=60)" > "/dev/stderr";
        # Really abort: skip reading the input files and report failure
        # to the calling shell.
        exit 1;
    }
}
# Per-line rule.
#   header line (first field starts with ">"):
#       settle the record that just ended (print it when exactly "size"
#       nucleotides were collected, otherwise remember its name as too
#       short), then start a new record or skip it when it is a pseudo one.
#   any other line:
#       append nucleotides to the current record until "size" is reached.
{
    # Nothing is processed when no positive size was supplied.
    if (!(size > 0)) next;

    if ($1 !~ /^>/) {
        # --- sequence line ---
        # getSeq is off before the first header, inside pseudo records and
        # once the current record has been truncated.
        if (!getSeq) next;

        have = length(seq);
        if (have + length($1) <= size) {
            # the whole line still fits
            seq = seq $1;
        } else {
            # keep only what is missing to reach "size", then stop collecting
            seq = seq substr($1, 1, size - have);
            getSeq = 0;
        }
        next;
    }

    # --- header line ---
    if (length(seq) == size) {
        print comment "\n" seq;
    } else if (length(comment) > 1) {
        # pseudo records have their comment reset to "", so they are not
        # reported a second time as too short
        tooshort++;
        shor = shor "\n" comment;
    }

    seq = "";
    if ($0 ~ /pseudo=true/) {
        # pseudo record: count it, remember its name, ignore its sequence
        pseudo++;
        pseu = pseu "\n" $1;
        getSeq = 0;
        comment = "";
    } else {
        comment = $1;
        getSeq = 1;
    }
}
# End of input: settle the last record, then append the run summary
# (too-short and pseudo counts with the matching record names) to
# get_x_nt.log.
END{
    # Without a positive size nothing was processed by the per-line rule,
    # so there is neither a record to flush nor a summary to write.
    if(size > 0){
        if(length(seq)==size){
            print comment"\n"seq;
        } else if(length(comment)>1){
            # Same guard as the per-line rule: a pseudo record (or an empty
            # input) leaves comment empty and must not be counted as a
            # too-short sequence nor add a blank name to the list.
            tooshort=tooshort+1;
            shor=shor"\n"comment;
        }
        # %d prints 0 for a counter that was never incremented or set,
        # instead of an empty field.
        print sprintf("%d too_short: %s", tooshort, shor) >>"get_x_nt.log" ;
        print sprintf("%d pseudo: %s", pseudo, pseu) >>"get_x_nt.log"
    }
}