uf-valid

#!/bin/sh
#
#  uf-valid - Validate the sequences in an unfasta stream.
#  Copyright (C) 2016  Marco van Zwetselaar <io@zwets.it>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#  This utility is part of http://io.zwets.it/unfasta

# ALPHABETS taken from http://blast.ncbi.nlm.nih.gov/blastcgihelp.shtml

# Nucleic Acids / Base
#  A  adenosine/adenine  C  cytidine/cytosine    G  guanosine/guanine
#  T  thymidine/thymine  N  A/G/C/T (any)        U  uracil/uridine
#  K  G/T (keto)         S  G/C (strong)         Y  T/C (pyrimidine)
#  M  A/C (amino)        W  A/T (weak)           R  G/A (purine)
#  B  G/T/C              D  G/A/T                H  A/C/T
#  V  G/C/A              -  gap of indeterminate length (EXCLUDED)

DNA_ALPHABET="ACGTN"
RNA_ALPHABET="ACGUN"
NUCL_ALPHABET="ACGTNUKSYMWRBDHV"

# Amino Acids
#  A Ala Alanine               P Pro Proline
#  B Asx D or N                Q Gln Glutamine
#  C Cys Cystine               R Arg Arginine
#  D Asp Aspartate             S Ser Serine
#  E Glu Glutamate             T Thr Threonine
#  F Phe Phenylalanine         U     Selenocysteine (EXCLUDED)
#  G Gly Glycine               V Val Valine
#  H His Histidine             W Trp Tryptophan
#  I Ile Isoleucine            Y Tyr Tyrosine
#  K Lys Lysine                Z Glx E or Q
#  L Leu Leucine               X any
#  M Met Methionine            * translation stop (EXCLUDED)
#  N Asn Asparagine            - gap of indeterminate length (EXCLUDED)

AMINO_ALPHABET="ABCDEFGHIKLMNPQRSTVXWYZ"

# Function to exit this script with an error message on stderr
err_exit() {
    echo "$(basename "$0"): $*" >&2
    exit 1
}

# Function to show usage information and exit
usage_exit() {
    echo "
Usage: $(basename $0) [OPTIONS] [FILE ...]

  Validate the sequences in each FILE against the allowed alphabet.  If no FILE
  is present or FILE is '-', read from standard input.  Only valid sequences are
  copied to standard output.  For invalid sequences, an error message is printed
  to standard error.

  OPTIONS
   -a, --allow CHARS  Validate sequences against the alphabet consisting of CHARS.
                      May be specified in addition to the synonyms listed below.
   -i, --ignore-case  Ignore case for allowed CHARS (default for the synonyms).
   -v, --headers      Validate the syntax of the headers against NCBI conditions.
   -s, --stop         Stop processing after the first invalid sequence.
   -k, --keep         Do not drop invalid sequences but copy them to standard out.
   -q, --quiet        Do not copy standard input to standard output.

  Synonyms for the usual alphabets (mutually exclusive):
   --dna             Equivalent to: -i -a '$DNA_ALPHABET'
   --rna             Equivalent to: -i -a '$RNA_ALPHABET'
   --nucl            Equivalent to: -i -a '$NUCL_ALPHABET'
   --amino, --prot   Equivalent to: -i -a '$AMINO_ALPHABET'

  CAVEAT: when using --allow with symbols other than characters: the current
  implementation is a simple regex search for the complement of the alphabet,
  so symbols which are regex meta-characters like ., ?, and * need escaping.
" >&2
    exit ${1:-1}
}

# Set defaults

ALLOW=""
IGNORE_CASE=0
VAL_HEADERS=0
STOP_ON_ERR=0
KEEP_ERRORS=0
QUIET=0

# Parse options

while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do
    case $1 in
    --dna)
        [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
        ALLOW="$DNA_ALPHABET"
        IGNORE_CASE=1
        ;;
    --rna)
        [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
        ALLOW="$RNA_ALPHABET"
        IGNORE_CASE=1
        ;;
    --nucl)
        [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
        ALLOW="$NUCL_ALPHABET"
        IGNORE_CASE=1
        ;;
    --amino|--prot)
        [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined"
        ALLOW="$AMINO_ALPHABET"
        IGNORE_CASE=1
        ;;
    --allow=*)        ALLOW="${ALLOW}${1#"--allow="}" ;;
    -a|--allow)       shift; ALLOW="${ALLOW}$1" ;;
    -i|--ignore*)     IGNORE_CASE=1 ;;
    -v|--headers)     VAL_HEADERS=1 ;;
    -s|--stop)        STOP_ON_ERR=1 ;;
    -k|--keep*)       KEEP_ERRORS=1 ;;
    -q|--quiet)       QUIET=1 ;;
    --help) usage_exit 0 ;;
    *) usage_exit ;;
    esac
    shift || usage_exit
done

# Check that at least some alphabet was selected

[ -n "$ALLOW" ] || err_exit "no valid alphabet specified"

# Do the work

gawk -bO -v P="$(basename "$0")" -v WRONG="[^$ALLOW]" -v H=$VAL_HEADERS -v Q=$QUIET -v K=$KEEP_ERRORS -v S=$STOP_ON_ERR -v IGNORECASE=$IGNORE_CASE '{
    ERR = 0
    HDR = $0
    if ( HDR !~ /^>/ ) {
        print P ": no FASTA header found at line " NR
        ERR = 1
    }
    else if ( H && HDR !~ /^>[[:alpha:]]+\|[[:alnum:]._]+(\|[[:alnum:]._]+)*\|?(\s+.*)?$/ ) {  # rudimentary syntax check of header
        print P ": invalid header syntax at line " NR ": " HDR > "/dev/stderr"
        ERR = 1
    }
    if ( getline SEQ != 1 ) {
        print P ": read error or end of file, no sequence read at line " NR
        ERR = 1
    }
    else {
        POS = match (SEQ, WRONG)
        if ( POS ) {
            print P ": invalid character in sequence at line " NR ", pos " POS ": " substr(SEQ,POS,1) > "/dev/stderr"
            ERR = 1
        }
    }
    if ( (!ERR || K) && !Q ) {
        print HDR
        print SEQ
    }
    if ( ERR && S ) {
        exit 1
    }
}' "$@"

# vim: sts=4:sw=4:et:si:ai