-
Notifications
You must be signed in to change notification settings - Fork 0
/
uf-circut
executable file
·173 lines (145 loc) · 6.58 KB
/
uf-circut
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/bin/sh
#
# uf-circut - Take a section from circular sequences in unfasta format.
# Copyright (C) 2016 Marco van Zwetselaar <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This utility is part of http://io.zwets.it/unfasta
# Function to exit this script with an error message on stderr
err_exit() {
echo "$(basename "$0"): $*" >&2
exit 1
}
# Function to show usage information and exit
usage_exit() {
echo "
Usage: $(basename $0) [OPTIONS] FROM:END [FILE ...]
$(basename $0) [OPTIONS] FROM/LEN [FILE ...]
$(basename $0) [OPTIONS] MID~DIST [FILE ...]
Cut a segment from each sequence in each unfasta FILE and write to standard
output. If no FILE is present or when FILE is '-', read standard input.
Sequences are taken to be circular and cuts wrap around. For straight cuts
use uf-cut. Sequences for which the cut fails are not written to standard
output unless option -z|--zero is present.
Options
-q, --quiet Do not emit messages about failed cuts.
-z, --zero Output a zero length sequence for a failed cut.
-m, --mark Document the cut by appending '(uf:circut:...)' to header.
There are three ways to specify the section:
1. FROM:END cuts the section starting at position FROM and ending at poition
END. If END is left of FROM, then the section returned wraps around the
end of the sequence.
2. FROM/LEN cuts a section of length LEN starting at position FROM, wrapping
around if FROM+LEN extends beyond the end of the sequence.
3. MID~DIST cuts a section starting DIST left from MID and ending DIST right
of MID, wrapping around if either end extends beyond en of sequence.
The following constraints must be met. Failing cuts lead to a diagnostic
message on standard error. The botched cut will not be output.
Constraints:
- Positions (FROM, END, MID) are 1-based; negative values count backward from
the rightmost element. FROM, END and MID must be on the sequence and hence
values must be in range 1..LENGTH or -1..-LENGTH.
- Lengths (LEN, DIST) are positive numbers; LEN must be in range 1..LENGTH;
DIST must be in range 0..(LENGTH-1)/2; these constraints prevent the cut
from being longer than the sequence and 'wrapping onto itself'.
- In the FROM:END case, when FROM is positive and END is negative, then END
must not resolve to a position left of FROM. Inversely, when FROM is
negative and END is positive then END must end up left of FROM. In short:
M:-N never wraps, -M:N always wrap (to guarantee least surprise behaviour.)
Examples, tip and tricks:
- $(basename "$0") -10:11 Select 21 bases around pos 1 of cyclic seq
- $(basename "$0") 1~10 Same result as above
- $(basename "$0") -10/21 Same result as above
- $(basename "$0") 100:99 rotates the sequence left by 99 positions
" >&2
exit ${1:-1}
}
# Parse options
QUIET=0
ZERO=0
MARK=0
while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do
case $1 in
-m|--mark) MARK=1 ;;
-z|--zero) ZERO=1 ;;
-q|--quiet) QUIET=1 ;;
-h|--help) usage_exit 0 ;;
*) usage_exit ;;
esac
shift || usage_exit
done
# Parse the cut specification - arrr walk the regex plank matey
[ $# -ge 1 -a "$(expr "$1" : '^\(-\?[1-9][0-9]*\(:-\?[1-9][0-9]*\|\(/[1-9]\|~[0-9]\)[0-9]*\)\)$')" = "$1" ] || usage_exit
CUT_SPEC=$1
shift
# Parse the bits out of the cut specification
LHS="$(expr "$CUT_SPEC" : '\(.*\)[/:~].*')"
OP="$(expr "$CUT_SPEC" : '.*\([/:~]\).*')"
RHS="$(expr "$CUT_SPEC" : '.*[/:~]\(.*\)')"
# Delegate to gawk
gawk -bO -v P="$(basename "$0")" -v FROM=$LHS -v MID=$LHS -v UPTO=$RHS -v LEN=$RHS -v DIST=$RHS -v OP="$OP" -v Z=$ZERO -v Q=$QUIET -v M=$MARK '
function bump0(p1,p2) { return p1 < 0 && p2 >= 0 ? 1 : 0 } # return 1 if from p1 to p2 crosses 0
BEGIN {
# Translate the three possible cut specs into POS0 (left) and POS1 (right)
if (OP == ":") {
POS0 = FROM; POS1 = UPTO
} else if (OP == "/") {
POS0 = FROM; POS1 = FROM+LEN-1; POS1 += bump0(FROM,POS1)
} else if (OP == "~") {
POS0 = MID-DIST; POS0 -= bump0(POS0,MID)
POS1 = MID+DIST; POS1 += bump0(MID,POS1)
}
}
NR%2==1 { # Store header, will be printed only when sequence processes alright
HDR = $0
}
function map_on_seq(n,l) { return n < 0 ? (n < -l ? n+l+l : n+l) + 1 : (n > l ? n - l : n) }
NR%2==0 { # Process sequence line
N = length($0)
if ( validate() ) {
X = map_on_seq(POS0,N); Y = map_on_seq(POS1,N)
print HDR (M ? " (uf:circut:" X ":" Y ")" : "")
print X <= Y ? substr($0,X,Y-X+1) : (substr($0,X) substr($0,1,Y))
}
else if (Z) { # invalid cut, if flag zero then output a zero length seq
print HDR (M ? " (uf:circut:" POS0 ":" POS1 ":failed)" : "")
print ""
}
}
END {
close("/dev/stderr")
}
function abs(n) { return n<0 ? -n : n }
function val_error(s) { if (!Q) print P ": line " NR ": " s > "/dev/stderr"; return 0; }
function validate() {
if (OP == ":") {
if (abs(FROM) > N) return val_error("FROM position (" FROM ") outside sequence length (" N ")")
if (abs(UPTO) > N) return val_error("END position (" UPTO ") outside sequence length (" N ")")
# Check the cases specified in the constraint for guaranteeing least surprise
if (FROM < 0 && UPTO > 0 && UPTO-FROM > N) return val_error("sequence too short (" N ") for wrap -" abs(FROM) ":" UPTO)
if (FROM > 0 && UPTO < 0 && FROM-UPTO > N+1) return val_error("sequence too short (" N ") for cut " FROM ":-" abs(UPTO))
}
else if (OP == "/") {
if (abs(FROM) > N) return val_error("FROM position (" FROM ") outside sequence length (" N ")")
if (LEN > N) return val_error("requested length (" LEN ") exceeds sequence length (" N ")")
}
else if (OP == "~") {
if (abs(MID) > N) return val_error("MID position (" MID ") outside sequence length (" N ")")
if (2*DIST+1 > N) return val_error("requested width (" 2*DIST+1 ") exceeds sequence length (" N ")")
}
return 1
}
' "$@"
# vim: sts=4:sw=4:et:si:ai