Add uf-valid for validation

zwets · Jan 26, 2016 · 0677a22 · 0677a22
1 parent 99b6a0d
commit 0677a22
Show file tree

Hide file tree

Showing 2 changed files with 186 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -88,7 +88,7 @@ The header line must start with `>`, immediately followed by the _sequence ident
 
 ##### Sequence line syntax
 
-The sequence line can syntactically contain any character except newline (which terminates it).  However to be semantically valid it must contain only characters defined by IUPAC, and listed in the [NCBI Blast Specification](http://blast.ncbi.nlm.nih.gov/blastcgihelp.shtml).  Note that `uf` does not check the validity of the characters in the input FASTA file but copies them verbatim to stdout, removing only whitespace.  Use the `uf-validate` filter to check sequence validity.
+The sequence line can syntactically contain any character except newline (which terminates it).  However to be semantically valid it must contain only characters defined by IUPAC, and listed in the [NCBI Blast Specification](http://blast.ncbi.nlm.nih.gov/blastcgihelp.shtml).  Note that `uf` does not check the validity of the characters in the input FASTA file but copies them verbatim to stdout, removing only whitespace.  Use the `uf-valid` filter to check sequence validity.
 
 
 #### Unfasta *is* FASTA
@@ -137,7 +137,9 @@ As in FASTA, an unfasta file contains a list of zero (does FASTA support this?)
 
 #### Requirements for filters
 
-Filters must read standard input and write to standard output.  Filters must produce valid unfasta output, that is pairs of lines with the first starting with `>` and the second having sequence data.  For constraints on the sequence data, refer to `uf-validate`.  Filters must be self-documenting, that is support at least the `--help` option.
+Filters must read standard input and write to standard output.  Filters must produce valid unfasta output, that is pairs of lines with the first starting with `>` and the second having sequence data.  For constraints on the sequence data, refer to `uf-valid`.  Filters must be self-documenting, that is support at least the `--help` option.  
+
+The self-documentation requirement is the reason why several of the `uf-*` tools have been implemented as bash scripts while a simple alias would suffice.  E.g. `uf-bare` could have just been `alias uf-bare='awk "NR%2==1"'`, but then there would be no `uf-bare --help`.
 
 #### Zero-length sequences
 

diff --git a/uf-valid b/uf-valid
@@ -0,0 +1,182 @@
+#!/bin/sh
+#
+#  uf-valid - Validate the sequences in an unfasta stream.
+#  Copyright (C) 2016  Marco van Zwetselaar <[email protected]>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation, either version 3 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+#  Created on 2016-01-25
+
+# ALPHABETS taken from http://blast.ncbi.nlm.nih.gov/blastcgihelp.shtml
+
+# Nucleic Acids
+#  A  adenosine          C  cytidine             G  guanine
+#  T  thymidine          N  A/G/C/T (any)        U  uridine 
+#  K  G/T (keto)         S  G/C (strong)         Y  T/C (pyrimidine) 
+#  M  A/C (amino)        W  A/T (weak)           R  G/A (purine)        
+#  B  G/T/C              D  G/A/T                H  A/C/T      
+#  V  G/C/A              -  gap of indeterminate length (EXCLUDED)
+
+DNA_ALPHABET="ACGTN"
+RNA_ALPHABET="ACGUN"
+NUCL_ALPHABET="ACGTNUKSYMWRBDHV"
+
+# Amino Acids	
+#  A  alanine               P  proline       
+#  B  aspartate/asparagine  Q  glutamine      
+#  C  cystine               R  arginine      
+#  D  aspartate             S  serine      
+#  E  glutamate             T  threonine      
+#  F  phenylalanine         U* selenocysteine (EXCLUDED)
+#  G  glycine               V  valine        
+#  H  histidine             W  tryptophan        
+#  I  isoleucine            Y  tyrosine
+#  K  lysine                Z  glutamate/glutamine
+#  L  leucine               X  any
+#  M  methionine            *  translation stop (EXCLUDED)
+#  N  asparagine            -  gap of indeterminate length (EXCLUDED)
+
+AMINO_ALPHABET="ABCDEFGHIKLMNPQRSTVXWYZ"
+
+# Function to exit this script with an error message on stderr
+err_exit() {
+	echo "$(basename "$0"): $*" >&2
+	exit 1
+}
+
+# Function to show usage information and exit
+usage_exit() {
+	echo
+	echo "Usage: $(basename $0) [OPTIONS] [FILE] ..."
+	echo
+	echo "  Validate the sequences in each FILE against the allowed alphabet.  If no FILE"
+	echo "  is present or FILE is '-', read from standard input.  Only valid sequences are"
+	echo "  copied to standard output.  For invalid sequences, an error message is printed"
+	echo "  to standard error."
+	echo 
+	echo "  Options"
+	echo "   -a|--allow CHARS  Validate sequences against the alphabet consisting of CHARS."
+	echo "                     May be specified in addition to the synonyms listed below."
+	echo "   -i|--ignore-case  Ignore case for allowed CHARS (default for the synonyms)."
+	echo "   -v|--headers      Also validate the syntax of the headers (default no)."
+	echo "   -s|--stop         Stop processing after the first invalid sequence."
+	echo "   -k|--keep         Do not drop invalid sequences but copy them to standard out."
+	echo "   -q|--quiet        Do not copy standard input to standard output."
+	echo
+	echo "  Synonyms for the usual alphabets (mutually exclusive):"
+	echo "   --dna             Equivalent to: -i -a '$DNA_ALPHABET'"
+	echo "   --rna             Equivalent to: -i -a '$RNA_ALPHABET'"
+	echo "   --nucl            Equivalent to: -i -a '$NUCL_ALPHABET'"
+	echo "   --amino|--prot    Equivalent to: -i -a '$AMINO_ALPHABET'"
+	echo
+	echo "  CAVEAT: when using --allow with symbols other than characters: the current"
+	echo "  implementation is a simple regex search for the complement of the alphabet,"
+	echo "  so symbols which are regex meta-characters like ., ?, and * need escaping."
+	echo 
+	exit ${1:-1}
+}
+
+# Set defaults
+
+ALLOW=""
+IGNORE_CASE=0
+VAL_HEADERS=0
+STOP_ON_ERR=0
+KEEP_ERRORS=0
+QUIET=0
+
+# Parse options
+
+while [ $# -ne 0 -a "$(expr "$1" : '\(.\).*')" = "-" ]; do
+	case $1 in
+	--allow=*)
+		ALLOW="${ALLOW}${1#"--allow="}"
+		;;
+	-a|--allow)
+		shift
+		ALLOW="${ALLOW}$1"
+		;;
+	--dna)
+		[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
+		ALLOW="$DNA_ALPHABET"
+		IGNORE_CASE=1
+		;;
+	--rna)
+		[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
+		ALLOW="$RNA_ALPHABET"
+		IGNORE_CASE=1
+		;;
+	--nucl)
+		[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
+		ALLOW="$NUCL_ALPHABET"
+		IGNORE_CASE=1
+		;;
+	--amino|--prot)
+		[ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
+		ALLOW="$AMINO_ALPHABET"
+		IGNORE_CASE=1
+		;;
+	-i|--ignore*)
+		IGNORE_CASE=1
+		;;
+	-v|--headers)
+		VAL_HEADERS=1
+		;;
+	-s|--stop)
+		STOP_ON_ERR=1
+		;;
+	-k|--keep-errors)
+		KEEP_ERRORS=1
+		;;
+	-q|--quiet)
+		QUIET=1
+		;;
+	--help)
+		usage_exit 0
+		;;
+	*) usage_exit
+		;;
+	esac
+	shift
+done
+
+# Check that at least some alphabet was selected
+
+[ -n "$ALLOW" ] || err_exit "no valid alphabet specified"
+
+# Do the work
+
+awk -v P="$(basename "$0")" -v WRONG="[^$ALLOW]" -v H=$VAL_HEADERS -v Q=$QUIET -v K=$KEEP_ERRORS -v S=$STOP_ON_ERR -v IGNORECASE=$IGNORE_CASE '
+	{ 	ERR = 0
+		if ( H && $0 !~ /^>[[:alpha:]]+\|\w+(\|\w+)*(\s+.*)?$/ ) {
+			print P ": invalid header at line " NR ": " $0 > "/dev/stderr"
+			ERR = 1
+		}
+		HDR = $0
+		getline SEQ
+		#POS = SEQ ~ WRONG
+		POS = match (SEQ, WRONG)
+		if ( POS ) {
+			print P ": invalid character in sequence at line " NR ", pos " POS ": " substr(SEQ,POS,1) > "/dev/stderr"
+			ERR = 1
+		}
+		if ( (!ERR || K) && !Q ) {
+			print HDR
+			print SEQ
+		}
+		if ( ERR && S ) {
+			exit 1
+		}
+	}' "$@"
+