-
Notifications
You must be signed in to change notification settings - Fork 0
/
uf-valid
executable file
·173 lines (153 loc) · 5.99 KB
/
uf-valid
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/bin/sh
#
# uf-valid - Validate the sequences in an unfasta stream.
# Copyright (C) 2016 Marco van Zwetselaar <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This utility is part of http://io.zwets.it/unfasta
# ALPHABETS taken from http://blast.ncbi.nlm.nih.gov/blastcgihelp.shtml
# Nucleic Acids / Base
# A adenosine/adenine C cytidine/cytosine G guanosine/guanine
# T thymidine/thymine N A/G/C/T (any) U uracil/uridine
# K G/T (keto) S G/C (strong) Y T/C (pyrimidine)
# M A/C (amino) W A/T (weak) R G/A (purine)
# B G/T/C D G/A/T H A/C/T
# V G/C/A - gap of indeterminate length (EXCLUDED)
DNA_ALPHABET="ACGTN"
RNA_ALPHABET="ACGUN"
NUCL_ALPHABET="ACGTNUKSYMWRBDHV"
# Amino Acids
# A Ala Alanine P Pro Proline
# B Asx D or N Q Gln Glutamine
# C Cys Cystine R Arg Arginine
# D Asp Aspartate S Ser Serine
# E Glu Glutamate T Thr Threonine
# F Phe Phenylalanine U Selenocysteine (EXCLUDED)
# G Gly Glycine V Val Valine
# H His Histidine W Trp Tryptophan
# I Ile Isoleucine Y Tyr Tyrosine
# K Lys Lysine Z Glx E or Q
# L Leu Leucine X any
# M Met Methionine * translation stop (EXCLUDED)
# N Asn Asparagine - gap of indeterminate length (EXCLUDED)
AMINO_ALPHABET="ABCDEFGHIKLMNPQRSTVXWYZ"
# Function to exit this script with an error message on stderr
err_exit() {
echo "$(basename "$0"): $*" >&2
exit 1
}
# Function to show usage information and exit
usage_exit() {
echo "
Usage: $(basename $0) [OPTIONS] [FILE ...]
Validate the sequences in each FILE against the allowed alphabet. If no FILE
is present or FILE is '-', read from standard input. Only valid sequences are
copied to standard output. For invalid sequences, an error message is printed
to standard error.
OPTIONS
-a, --allow CHARS Validate sequences against the alphabet consisting of CHARS.
May be specified in addition to the synonyms listed below.
-i, --ignore-case Ignore case for allowed CHARS (default for the synonyms).
-v, --headers Validate the syntax of the headers against NCBI conditions.
-s, --stop Stop processing after the first invalid sequence.
-k, --keep Do not drop invalid sequences but copy them to standard out.
-q, --quiet Do not copy standard input to standard output.
Synonyms for the usual alphabets (mutually exclusive):
--dna Equivalent to: -i -a '$DNA_ALPHABET'
--rna Equivalent to: -i -a '$RNA_ALPHABET'
--nucl Equivalent to: -i -a '$NUCL_ALPHABET'
--amino, --prot Equivalent to: -i -a '$AMINO_ALPHABET'
CAVEAT: when using --allow with symbols other than characters: the current
implementation is a simple regex search for the complement of the alphabet,
so symbols which are regex meta-characters like ., ?, and * need escaping.
" >&2
exit ${1:-1}
}
# Set defaults
ALLOW=""
IGNORE_CASE=0
VAL_HEADERS=0
STOP_ON_ERR=0
KEEP_ERRORS=0
QUIET=0
# Parse options
while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do
case $1 in
--dna)
[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
ALLOW="$DNA_ALPHABET"
IGNORE_CASE=1
;;
--rna)
[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
ALLOW="$RNA_ALPHABET"
IGNORE_CASE=1
;;
--nucl)
[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
ALLOW="$NUCL_ALPHABET"
IGNORE_CASE=1
;;
--amino|--prot)
[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
ALLOW="$AMINO_ALPHABET"
IGNORE_CASE=1
;;
--allow=*) ALLOW="${ALLOW}${1#"--allow="}" ;;
-a|--allow) shift; ALLOW="${ALLOW}$1" ;;
-i|--ignore*) IGNORE_CASE=1 ;;
-v|--headers) VAL_HEADERS=1 ;;
-s|--stop) STOP_ON_ERR=1 ;;
-k|--keep*) KEEP_ERRORS=1 ;;
-q|--quiet) QUIET=1 ;;
--help) usage_exit 0 ;;
*) usage_exit ;;
esac
shift || usage_exit
done
# Check that at least some alphabet was selected
[ -n "$ALLOW" ] || err_exit "no valid alphabet specified"
# Do the work
gawk -bO -v P="$(basename "$0")" -v WRONG="[^$ALLOW]" -v H=$VAL_HEADERS -v Q=$QUIET -v K=$KEEP_ERRORS -v S=$STOP_ON_ERR -v IGNORECASE=$IGNORE_CASE '{
ERR = 0
HDR = $0
if ( HDR !~ /^>/ ) {
print P ": no FASTA header found at line " NR
ERR = 1
}
else if ( H && HDR !~ /^>[[:alpha:]]+\|[[:alnum:]._]+(\|[[:alnum:]._]+)*\|?(\s+.*)?$/ ) { # rudimentary syntax check of header
print P ": invalid header syntax at line " NR ": " HDR > "/dev/stderr"
ERR = 1
}
if ( getline SEQ != 1 ) {
print P ": read error or end of file, no sequence read at line " NR
ERR = 1
}
else {
POS = match (SEQ, WRONG)
if ( POS ) {
print P ": invalid character in sequence at line " NR ", pos " POS ": " substr(SEQ,POS,1) > "/dev/stderr"
ERR = 1
}
}
if ( (!ERR || K) && !Q ) {
print HDR
print SEQ
}
if ( ERR && S ) {
exit 1
}
}' "$@"
# vim: sts=4:sw=4:et:si:ai