diff --git a/uf b/uf index 4d59455..cf7e632 100755 --- a/uf +++ b/uf @@ -77,3 +77,4 @@ else END { print "\n" }' "$@" fi +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-bare b/uf-bare index e424ede..ecc639b 100755 --- a/uf-bare +++ b/uf-bare @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Output only the bare sequences from each FILE to standard output, dropping the @@ -39,7 +39,7 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] By default headers go to /dev/null. Use option -w to store them in a file. The file can be merged back in with bare sequences using 'uf-dress'. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -47,21 +47,21 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] unset HDRSFILE FORCE while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - -w) shift - [ $# -ge 1 ] || usage_exit - HDRSFILE="$1" - ;; - -f|--force) - FORCE="yes" - ;; - --help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + -w) shift + [ $# -ge 1 ] || usage_exit + HDRSFILE="$1" + ;; + -f|--force) + FORCE="yes" + ;; + --help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Check options validity @@ -71,6 +71,7 @@ done # Do the work awk -b -O -v F="$HDRSFILE" ' - NR%2==1 && F { print > F } - NR%2==0' "$@" + NR%2==1 && F { print > F } + NR%2==0' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-circut b/uf-circut index 5a9e3d2..e4704c6 100755 --- a/uf-circut +++ b/uf-circut @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] FROM:END [FILE ...] $(basename $0) [OPTIONS] FROM/LEN [FILE ...] $(basename $0) [OPTIONS] MID~DIST [FILE ...] @@ -72,7 +72,7 @@ Usage: $(basename $0) [OPTIONS] FROM:END [FILE ...] - $(basename "$0") -10/21 Same result as above - $(basename "$0") 100:99 rotates the sequence left by 99 positions " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -81,24 +81,24 @@ QUIET=0 ZERO=0 MARK=0 -while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\)..*')" = "-" ]; do - case $1 in - -m|--mark) - MARK=1 - ;; - -z|--zero) - ZERO=1 - ;; - -q|--quiet) - QUIET=1 - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + -m|--mark) + MARK=1 + ;; + -z|--zero) + ZERO=1 + ;; + -q|--quiet) + QUIET=1 + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Parse the cut specification - arrr walk the regex plank matey @@ -117,7 +117,7 @@ RHS="$(expr "$CUT_SPEC" : '.*[/:~]\(.*\)')" # Delegate to awk awk -b -O -v P="$(basename "$0")" -v FROM=$LHS -v MID=$LHS -v UPTO=$RHS -v LEN=$RHS -v DIST=$RHS -v OP="$OP" -v Z=$ZERO -v Q=$QUIET -v M=$MARK ' -function bump0(p1,p2) { return p1 < 0 && p2 >= 0 ? 1 : 0 } # return 1 if from p1 to p2 crosses 0 +function bump0(p1,p2) { return p1 < 0 && p2 >= 0 ? 1 : 0 } # return 1 if from p1 to p2 crosses 0 BEGIN { # Translate the three possible cut specs into POS0 (left) and POS1 (right) @@ -179,3 +179,4 @@ function validate() { } ' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-cut b/uf-cut index 0477bb3..fad5aa7 100755 --- a/uf-cut +++ b/uf-cut @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] FROM:END [FILE ...] $(basename $0) [OPTIONS] FROM/LEN [FILE ...] $(basename $0) [OPTIONS] MID~DIST [FILE ...] @@ -69,7 +69,7 @@ Usage: $(basename $0) [OPTIONS] FROM:END [FILE ...] - '$(basename "$0") x/1' Same result as previous - '$(basename "$0") x~0' Same result as previous " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -79,27 +79,27 @@ ZERO=0 CLIP=0 MARK=0 -while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\)..*')" = "-" ]; do - case $1 in - -m|--mark) - MARK=1 - ;; - -c|--clip) - CLIP=1 - ;; - -q|--quiet) - QUIET=1 - ;; - -z|--zero) - ZERO=1 - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + -m|--mark) + MARK=1 + ;; + -c|--clip) + CLIP=1 + ;; + -q|--quiet) + QUIET=1 + ;; + -z|--zero) + ZERO=1 + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Parse the cut specification - arrr walk the regex plank matey @@ -200,3 +200,4 @@ function validate_and_map() { } ' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-dress b/uf-dress index 39999e3..ac850f0 100755 --- a/uf-dress +++ b/uf-dress @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Insert headers to turn bare sequences into valid unfasta. Reads each line from @@ -45,7 +45,7 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] See also: 'uf-map' which implements this idiom in a single command. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -53,18 +53,18 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] unset HDRSFILE while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - -r) shift - [ $# -ge 1 ] || usage_exit - HDRSFILE="$1" - ;; - --help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + -r) shift + [ $# -ge 1 ] || usage_exit + HDRSFILE="$1" + ;; + --help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Check options validity @@ -74,11 +74,12 @@ done # Do the work awk -b -O -v F="$HDRSFILE" '{ - HDR = "" - if (F) getline HDR < F - if (!HDR) HDR = ">lcl|" NR " Dummy header " NR - print HDR - print - } - END { if (F) close (F) }' "$@" + HDR = "" + if (F) getline HDR < F + if (!HDR) HDR = ">lcl|" NR " Dummy header " NR + print HDR + print + } + END { if (F) close (F) }' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-drop b/uf-drop index 5e49d8b..15427f2 100755 --- a/uf-drop +++ b/uf-drop @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] NUM [FILE ...] For each sequence in each unfasta format FILE drop the first NUM characters @@ -38,7 +38,7 @@ Usage: $(basename $0) [OPTIONS] NUM [FILE ...] -b|--but Drop everything BUT the final NUM characters from each sequence. -m|--mark Document the edit by appending '(uf:drop:[but:]NUM)' to headers. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -47,20 +47,20 @@ BUT=0 MARK=0 while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - -m|--mark) - MARK=1 - ;; - -b|--but) - BUT=1 - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + -m|--mark) + MARK=1 + ;; + -b|--but) + BUT=1 + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Check first argument is positive number @@ -73,7 +73,8 @@ shift # Delegate to awk awk -b -O -v P="$(basename "$0")" -v N=$NUM -v M=$MARK -v B=$BUT ' - NR%2==1 { print $0 (M ? " (uf:drop:" (B?"but:":"") N ")" : "") } - NR%2==0 { print B ? substr ($0,length($0)-N+1) : substr ($0,N+1) } - ' "$@" + NR%2==1 { print $0 (M ? " (uf:drop:" (B?"but:":"") N ")" : "") } + NR%2==0 { print B ? substr ($0,length($0)-N+1) : substr ($0,N+1) } + ' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-freqs b/uf-freqs index 7914f04..0039392 100755 --- a/uf-freqs +++ b/uf-freqs @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Count frequencies of elements in each sequence in each unfasta FILE and write @@ -38,7 +38,7 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] -l|--length-only Length of sequence only, no frequencies per element -t|--totals Totals across all sequences at bottom " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -47,46 +47,47 @@ BARE=0 LENGTH_ONLY=0 TOTALS=0 -while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\)..*')" = "-" ]; do - case $1 in - -b|--bare) - BARE=1 - ;; - -l|--length*) - LENGTH_ONLY=1 - ;; - -t|--totals) - TOTALS=1 - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + -b|--bare) + BARE=1 + ;; + -l|--length*) + LENGTH_ONLY=1 + ;; + -t|--totals) + TOTALS=1 + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done awk -b -O -v P="$(basename "$0")" -v BARE=$BARE -v LENGTH_ONLY=$LENGTH_ONLY -v TOTALS=$TOTALS ' BEGIN { PROCINFO["sorted_in"] = "@ind_str_asc" } NR%2==1 && !BARE NR%2==0 { - for (i=1; i<=length($0); ++i) freqs[toupper(substr($0,i,1))] += 1 - printf length($0) - if (!LENGTH_ONLY) for (j in freqs) printf " " j "=" freqs[j] - printf "\n" - if (TOTALS) { - for (j in freqs) totals[j] += freqs[j] - grand_total += length($0) - } + for (i=1; i<=length($0); ++i) freqs[toupper(substr($0,i,1))] += 1 + printf length($0) + if (!LENGTH_ONLY) for (j in freqs) printf " " j "=" freqs[j] + printf "\n" + if (TOTALS) { + for (j in freqs) totals[j] += freqs[j] + grand_total += length($0) + } } END { - if (TOTALS) { - if (!BARE) print ">TOTALS" - printf grand_total; - if (!LENGTH_ONLY) for (j in totals) printf " " j "=" totals[j] - printf "\n" - } + if (TOTALS) { + if (!BARE) print ">TOTALS" + printf grand_total; + if (!LENGTH_ONLY) for (j in totals) printf " " j "=" totals[j] + printf "\n" + } } ' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-hash b/uf-hash index 6d5507d..f5ff89f 100755 --- a/uf-hash +++ b/uf-hash @@ -23,13 +23,13 @@ ALGO="md5" # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Computes a hash over the sequence content of unfasta FILEs, or stdin if @@ -42,28 +42,28 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] OPTIONS -a|--algo ALG Use algorithm md5/sha1/224/256/384/512, default: $ALGO " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options unset REORDERABLE -while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\)..*')" = "-" ]; do - case $1 in - --algo=*) - ALGO=${1#--algo=} - ;; - -a|--algo*) - shift - ALGO=$1 - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + --algo=*) + ALGO=${1#--algo=} + ;; + -a|--algo*) + shift + ALGO=$1 + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Check options and arguments @@ -80,3 +80,4 @@ FILE="-" awk -b -O 'NR % 2 == 0' "$FILE" | $HASH_PGM | cut -d' ' -f1 +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-headers b/uf-headers index 76935c4..2806072 100755 --- a/uf-headers +++ b/uf-headers @@ -20,33 +20,34 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Drop the sequence data and output only the headers from each FILE. If no FILE is present or FILE is '-', read from standard input. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - --help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + --help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done awk -b -O 'NR%2==1' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-random b/uf-random index b55cba6..3e0b473 100755 --- a/uf-random +++ b/uf-random @@ -24,13 +24,13 @@ SEQ_LENGTH=100 # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [CHARS] Write to standard output an unfasta file with randomly generated sequences. @@ -49,48 +49,48 @@ Usage: $(basename $0) [OPTIONS] [CHARS] NOTE: the current implementation uses the \$RANDOM builtin to obtain random numbers. This is bash specific. TODO: change to read from /dev/urandom. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options -while [ $# -ne 0 -a "$(expr "$1" : '\(.\).*')" = "-" ]; do - case $1 in - --count=*) - SEQ_COUNT=${1#--count=} - ;; - -c|--count) - shift - SEQ_COUNT=$1 - ;; - --length=*) - SEQ_LENGTH=${1#--length=} - ;; - -l|--length) - shift - SEQ_LENGTH=$1 - ;; - --range=*) - RANGE=${1#--range=} - ;; - -r|--range) - shift - RANGE=$1 - ;; - --help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + --count=*) + SEQ_COUNT=${1#--count=} + ;; + -c|--count) + shift + SEQ_COUNT=$1 + ;; + --length=*) + SEQ_LENGTH=${1#--length=} + ;; + -l|--length) + shift + SEQ_LENGTH=$1 + ;; + --range=*) + RANGE=${1#--range=} + ;; + -r|--range) + shift + RANGE=$1 + ;; + --help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Parse arguments if [ $# -eq 1 ]; then - SEQ_CHARS="$1" - shift + SEQ_CHARS="$1" + shift fi [ $# -eq 0 ] || usage_exit @@ -98,10 +98,10 @@ fi # Parse MIN and MAX if random lengths are required if [ -n "$RANGE" ]; then - MIN_LENGTH="$(expr "$RANGE" : '\([0-9]\+\),[0-9]\+$')" || err_exit "not a valid range specification: $RANGE" - MAX_LENGTH="$(expr "$RANGE" : '[0-9]\+,\([0-9]\+\)$')" || err_exit "not a valid range specification: $RANGE" - RANGE_SIZE=$((MAX_LENGTH - MIN_LENGTH + 1)) - [ $RANGE_SIZE -gt 0 ] || err_exit "invalid range: $RANGE" + MIN_LENGTH="$(expr "$RANGE" : '\([0-9]\+\),[0-9]\+$')" || err_exit "not a valid range specification: $RANGE" + MAX_LENGTH="$(expr "$RANGE" : '[0-9]\+,\([0-9]\+\)$')" || err_exit "not a valid range specification: $RANGE" + RANGE_SIZE=$((MAX_LENGTH - MIN_LENGTH + 1)) + [ $RANGE_SIZE -gt 0 ] || err_exit "invalid range: $RANGE" fi # Check that we have RANDOM, which is a bash extension - @TODO migrate to /dev/urandom @@ -111,11 +111,12 @@ expr "$RANDOM" : '[0-9]\+$' >/dev/null || err_exit "\$RANDOM not supported by sh # Generate the sequences for (( S = 1 ; S <= $SEQ_COUNT ; S += 1 )); do - [ -z "$RANGE" ] || SEQ_LENGTH=$((MIN_LENGTH + ($RANDOM % RANGE_SIZE))) - echo ">lcl|$S Random sequence $S (length $SEQ_LENGTH)" - for (( C = 1 ; C <= SEQ_LENGTH ; C += 1 )); do - echo -n ${SEQ_CHARS:$(($RANDOM % ${#SEQ_CHARS})):1} - done - echo + [ -z "$RANGE" ] || SEQ_LENGTH=$((MIN_LENGTH + ($RANDOM % RANGE_SIZE))) + echo ">lcl|$S Random sequence $S (length $SEQ_LENGTH)" + for (( C = 1 ; C <= SEQ_LENGTH ; C += 1 )); do + echo -n ${SEQ_CHARS:$(($RANDOM % ${#SEQ_CHARS})):1} + done + echo done +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-rc b/uf-rc index 20b8a9e..a2782ae 100755 --- a/uf-rc +++ b/uf-rc @@ -31,27 +31,27 @@ COMP_ALPHABET="TtGgCcAaNnMmKkWwSsRrYyVvBbHhDd" # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Write to standard output the reverse complement of every sequence in each FILE. If no FILE is present or FILE is '-', read from standard input. - Options - -r|--reverse-only Reverse only, do not complement. - -c|--complement-only Complement only, do not reverse. - -m|--mark[-header] Document operation by attaching '(uf:...)' to headers. + OPTIONS + -r, --reverse-only Reverse only, do not complement. + -c, --complement-only Complement only, do not reverse. + -m, --mark[-header] Document operation by attaching '(uf:...)' to headers. Case is preserved. Will also work for the degenerate nucleotide letters. E.g. complement of Y (pyrimidine, T/C) is R (purine, A/T). " >&2 - exit ${1:-1} + exit ${1:-1} } # Defaults @@ -63,44 +63,45 @@ MARK=0 # Parse options while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - -r|--reverse-only) - COMPLEMENT=0 - ;; - -c|--complement-only) - REVERSE=0 - ;; - -m|--mark*) - MARK=1 - ;; - --help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + -r|--reverse-only) + COMPLEMENT=0 + ;; + -c|--complement-only) + REVERSE=0 + ;; + -m|--mark*) + MARK=1 + ;; + --help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Do the work awk -b -O -v P="$(basename "$0")" -v C=$COMPLEMENT -v R=$REVERSE -v M=$MARK ' - NR%2==1 { print $0 (M ? " (uf:" (R?"reverse":"") (C?"complement":"") ")" : "") } - NR%2==0 { - if (R) for (i = length($0); i >= 1; --i) print_maybe_comp() - else for (i = 1; i <= length($0); ++i) print_maybe_comp() - printf "\n" - } - function print_maybe_comp( c,p) { # Haha, globals rule, no need to pass i - c = substr ($0,i,1) - if (C) { - p = index ("'$NUCL_ALPHABET'", c) - if (!p) { - print P ": invalid character, cannot complement: " c - exit 1 - } - c = substr ("'$COMP_ALPHABET'", p, 1) - } - printf "%c", c - }' "$@" + NR%2==1 { print $0 (M ? " (uf:" (R?"reverse":"") (C?"complement":"") ")" : "") } + NR%2==0 { + if (R) for (i = length($0); i >= 1; --i) print_maybe_comp() + else for (i = 1; i <= length($0); ++i) print_maybe_comp() + printf "\n" + } + function print_maybe_comp( c,p) { # Haha, globals rule, no need to pass i + c = substr ($0,i,1) + if (C) { + p = index ("'$NUCL_ALPHABET'", c) + if (!p) { + print P ": invalid character, cannot complement: " c + exit 1 + } + c = substr ("'$COMP_ALPHABET'", p, 1) + } + printf "%c", c + }' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-select b/uf-select index cb34617..894e069 100755 --- a/uf-select +++ b/uf-select @@ -20,71 +20,71 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS|NUMBERS|REGEX] [FILE ...] Select sequences from unfasta FILEs and write to standard output. If no FILE is present or when FILE is '-', read standard input. OPTIONS - -n|--nth N[,..] Select the nth sequence(s) for each file. - -g|--grep REGEX Select the sequence(s) whose header matches REGEX. + -n, --nth N[,..] Select the nth sequence(s) for each file. + -g, --grep REGEX Select the sequence(s) whose header matches REGEX. As a convenience, when no option is specified then if the first argument parses as a valid list of NUMBERS then --nth is assumed, else --grep is assumed. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options unset NTH REGEX -while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\)..*')" = "-" ]; do - case $1 in - --nth=*) - NTH=${1#--nth=} - ;; - -n|--nth) - shift - NTH=$1 - ;; - --grep=*) - REGEX=${1#--grep=} - ;; - -g|--grep) - shift - REGEX=$1 - ;; - -h|--help) - usage_exit 0 - ;; - -) - break - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + --nth=*) + NTH=${1#--nth=} + ;; + -n|--nth) + shift + NTH=$1 + ;; + --grep=*) + REGEX=${1#--grep=} + ;; + -g|--grep) + shift + REGEX=$1 + ;; + -h|--help) + usage_exit 0 + ;; + -) + break + ;; + *) usage_exit + ;; + esac + shift done # Handle option-less case if [ -z "$NTH" ] && [ -z "$GREP" ]; then - [ -n "$1" ] || usage_exit - if expr "$1" : '\([0-9]\+\(,[0-9]\+\)*$\)' >/dev/null; then - NTH="$1" - else - REGEX="$1" - fi - shift + [ -n "$1" ] || usage_exit + if expr "$1" : '\([0-9]\+\(,[0-9]\+\)*$\)' >/dev/null; then + NTH="$1" + else + REGEX="$1" + fi + shift fi # Check options @@ -95,16 +95,17 @@ fi # Do the work if [ -n "$NTH" ]; then - #awk -b -O -v NTH=$NTH 'BEGIN { split(NTH,lines,/,/) } int((NR+1)/2) in lines' "$@" - awk -b -O -v NTH=$NTH ' - BEGIN { split(NTH,lines,/,/) } - { for (x in lines) if (int((NR+1)/2) == lines[x]) print } - ' "$@" + #awk -b -O -v NTH=$NTH 'BEGIN { split(NTH,lines,/,/) } int((NR+1)/2) in lines' "$@" + awk -b -O -v NTH=$NTH ' + BEGIN { split(NTH,lines,/,/) } + { for (x in lines) if (int((NR+1)/2) == lines[x]) print } + ' "$@" elif [ -n "$REGEX" ]; then - awk -b -O ' + awk -b -O ' NR % 2 == 1 && /'$REGEX'/ { print; getline; print } - ' "$@" + ' "$@" else - usage_exit + usage_exit fi +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-sort b/uf-sort index a8fbf4c..d8dded7 100755 --- a/uf-sort +++ b/uf-sort @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Sort the sequences in unfasta FILEs in order of decreasing length, and @@ -36,25 +36,25 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] OPTIONS -r|--reverse Reverse the order of the sort " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options unset REVERSE -while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\)..*')" = "-" ]; do - case $1 in - -r|--reverse) - REVERSE="-r" - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift +while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do + case $1 in + -r|--reverse) + REVERSE="-r" + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Do the work @@ -67,12 +67,13 @@ done # Pre-process into single records awk -b -O -v OFS='\t' ' - NR % 2 == 1 { HDR = $0; } - NR % 2 == 0 { print length(), $0, HDR } - ' "$@" | + NR % 2 == 1 { HDR = $0; } + NR % 2 == 0 { print length(), $0, HDR } + ' "$@" | # Sort in order -#LC_ALL=C sort $REVERSE --buffer-size=1G --key='1rn,2' -t ' ' - | -LC_ALL=C sort $REVERSE --key='1rn,2' -t ' ' - | +#LC_ALL=C sort $REVERSE --buffer-size=1G --key='1rn,2' -t ' ' - | +LC_ALL=C sort $REVERSE --key='1rn,2' -t ' ' - | # And unpack again - let's hope there are no tabs in headers ... awk -b -O -F '\t' '{ print $3; print $2; }' +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-take b/uf-take index 9e033f2..4762738 100755 --- a/uf-take +++ b/uf-take @@ -20,13 +20,13 @@ # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] NUM [FILE ...] For each sequence in each unfasta format FILE write the first NUM characters @@ -34,10 +34,10 @@ Usage: $(basename $0) [OPTIONS] NUM [FILE ...] input. With modifier -b|--but, take all except the final NUM characters. Options - -b|--but Copy everything BUT the final NUM characters from each sequence. - -m|--mark Document the edit by appending '(uf:take:[but:]NUM)' to headers. + -b, --but Copy everything BUT the final NUM characters from each sequence. + -m, --mark Document the edit by appending '(uf:take:[but:]NUM)' to headers. " >&2 - exit ${1:-1} + exit ${1:-1} } # Parse options @@ -46,20 +46,20 @@ BUT=0 MARK=0 while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - -m|--mark) - MARK=1 - ;; - -b|--but) - BUT=1 - ;; - -h|--help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + -m|--mark) + MARK=1 + ;; + -b|--but) + BUT=1 + ;; + -h|--help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Check first argument is positive number @@ -72,7 +72,8 @@ shift # Delegate to awk awk -b -O -v P="$(basename "$0")" -v N=$NUM -v M=$MARK -v B=$BUT ' - NR%2==1 { print $0 (M ? " (uf:take:" (B?"but:":"") N ")" : "") } - NR%2==0 { print B ? substr ($0,1,length($0)-N) : substr ($0,1,N) } - ' "$@" + NR%2==1 { print $0 (M ? " (uf:take:" (B?"but:":"") N ")" : "") } + NR%2==0 { print B ? substr ($0,1,length($0)-N) : substr ($0,1,N) } + ' "$@" +# vim: sts=4:sw=4:et:si:ai diff --git a/uf-valid b/uf-valid index 7b132cb..2fa3c9b 100755 --- a/uf-valid +++ b/uf-valid @@ -32,7 +32,7 @@ DNA_ALPHABET="ACGTN" RNA_ALPHABET="ACGUN" NUCL_ALPHABET="ACGTNUKSYMWRBDHV" -# Amino Acids +# Amino Acids # A alanine P proline # B aspartate/asparagine Q glutamine # C cystine R arginine @@ -46,18 +46,18 @@ NUCL_ALPHABET="ACGTNUKSYMWRBDHV" # L leucine X any # M methionine * translation stop (EXCLUDED) # N asparagine - gap of indeterminate length (EXCLUDED) - + AMINO_ALPHABET="ABCDEFGHIKLMNPQRSTVXWYZ" # Function to exit this script with an error message on stderr err_exit() { - echo "$(basename "$0"): $*" >&2 - exit 1 + echo "$(basename "$0"): $*" >&2 + exit 1 } # Function to show usage information and exit usage_exit() { - echo " + echo " Usage: $(basename $0) [OPTIONS] [FILE ...] Validate the sequences in each FILE against the allowed alphabet. If no FILE @@ -65,26 +65,26 @@ Usage: $(basename $0) [OPTIONS] [FILE ...] copied to standard output. For invalid sequences, an error message is printed to standard error. - Options - -a|--allow CHARS Validate sequences against the alphabet consisting of CHARS. - May be specified in addition to the synonyms listed below. - -i|--ignore-case Ignore case for allowed CHARS (default for the synonyms). - -v|--headers Validate the syntax of the headers against NCBI conditions. - -s|--stop Stop processing after the first invalid sequence. - -k|--keep Do not drop invalid sequences but copy them to standard out. - -q|--quiet Do not copy standard input to standard output. + OPTIONS + -a, --allow CHARS Validate sequences against the alphabet consisting of CHARS. + May be specified in addition to the synonyms listed below. + -i, --ignore-case Ignore case for allowed CHARS (default for the synonyms). + -v, --headers Validate the syntax of the headers against NCBI conditions. + -s, --stop Stop processing after the first invalid sequence. + -k, --keep Do not drop invalid sequences but copy them to standard out. + -q, --quiet Do not copy standard input to standard output. Synonyms for the usual alphabets (mutually exclusive): --dna Equivalent to: -i -a '$DNA_ALPHABET' --rna Equivalent to: -i -a '$RNA_ALPHABET' --nucl Equivalent to: -i -a '$NUCL_ALPHABET' - --amino|--prot Equivalent to: -i -a '$AMINO_ALPHABET' + --amino, --prot Equivalent to: -i -a '$AMINO_ALPHABET' CAVEAT: when using --allow with symbols other than characters: the current implementation is a simple regex search for the complement of the alphabet, so symbols which are regex meta-characters like ., ?, and * need escaping. " >&2 - exit ${1:-1} + exit ${1:-1} } # Set defaults @@ -99,56 +99,56 @@ QUIET=0 # Parse options while [ $# -ne 0 -a "$(expr "$1" : '\(.\)..*')" = "-" ]; do - case $1 in - --allow=*) - ALLOW="${ALLOW}${1#"--allow="}" - ;; - -a|--allow) - shift - ALLOW="${ALLOW}$1" - ;; - --dna) - [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" - ALLOW="$DNA_ALPHABET" - IGNORE_CASE=1 - ;; - --rna) - [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" - ALLOW="$RNA_ALPHABET" - IGNORE_CASE=1 - ;; - --nucl) - [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" - ALLOW="$NUCL_ALPHABET" - IGNORE_CASE=1 - ;; - --amino|--prot) - [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" - ALLOW="$AMINO_ALPHABET" - IGNORE_CASE=1 - ;; - -i|--ignore*) - IGNORE_CASE=1 - ;; - -v|--headers) - VAL_HEADERS=1 - ;; - -s|--stop) - STOP_ON_ERR=1 - ;; - -k|--keep-errors) - KEEP_ERRORS=1 - ;; - -q|--quiet) - QUIET=1 - ;; - --help) - usage_exit 0 - ;; - *) usage_exit - ;; - esac - shift + case $1 in + --allow=*) + ALLOW="${ALLOW}${1#"--allow="}" + ;; + -a|--allow) + shift + ALLOW="${ALLOW}$1" + ;; + --dna) + [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" + ALLOW="$DNA_ALPHABET" + IGNORE_CASE=1 + ;; + --rna) + [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" + ALLOW="$RNA_ALPHABET" + IGNORE_CASE=1 + ;; + --nucl) + [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" + ALLOW="$NUCL_ALPHABET" + IGNORE_CASE=1 + ;; + --amino|--prot) + [ -z "$ALLOW" ] || err_exit "option $1 may be followed by --allow but not otherwise combined" + ALLOW="$AMINO_ALPHABET" + IGNORE_CASE=1 + ;; + -i|--ignore*) + IGNORE_CASE=1 + ;; + -v|--headers) + VAL_HEADERS=1 + ;; + -s|--stop) + STOP_ON_ERR=1 + ;; + -k|--keep-errors) + KEEP_ERRORS=1 + ;; + -q|--quiet) + QUIET=1 + ;; + --help) + usage_exit 0 + ;; + *) usage_exit + ;; + esac + shift done # Check that at least some alphabet was selected @@ -158,33 +158,34 @@ done # Do the work awk -b -O -v P="$(basename "$0")" -v WRONG="[^$ALLOW]" -v H=$VAL_HEADERS -v Q=$QUIET -v K=$KEEP_ERRORS -v S=$STOP_ON_ERR -v IGNORECASE=$IGNORE_CASE '{ - ERR = 0 - HDR = $0 - if ( HDR !~ /^>/ ) { - print P ": no FASTA header found at line " NR - ERR = 1 - } - else if ( H && HDR !~ /^>[[:alpha:]]+\|[[:alnum:]._]+(\|[[:alnum:]._]+)*\|?(\s+.*)?$/ ) { # rudimentary syntax check of header - print P ": invalid header syntax at line " NR ": " HDR > "/dev/stderr" - ERR = 1 - } - if ( getline SEQ != 1 ) { - print P ": read error or end of file, no sequence read at line " NR - ERR = 1 - } - else { - POS = match (SEQ, WRONG) - if ( POS ) { - print P ": invalid character in sequence at line " NR ", pos " POS ": " substr(SEQ,POS,1) > "/dev/stderr" - ERR = 1 - } - } - if ( (!ERR || K) && !Q ) { - print HDR - print SEQ - } - if ( ERR && S ) { - exit 1 - } + ERR = 0 + HDR = $0 + if ( HDR !~ /^>/ ) { + print P ": no FASTA header found at line " NR + ERR = 1 + } + else if ( H && HDR !~ /^>[[:alpha:]]+\|[[:alnum:]._]+(\|[[:alnum:]._]+)*\|?(\s+.*)?$/ ) { # rudimentary syntax check of header + print P ": invalid header syntax at line " NR ": " HDR > "/dev/stderr" + ERR = 1 + } + if ( getline SEQ != 1 ) { + print P ": read error or end of file, no sequence read at line " NR + ERR = 1 + } + else { + POS = match (SEQ, WRONG) + if ( POS ) { + print P ": invalid character in sequence at line " NR ", pos " POS ": " substr(SEQ,POS,1) > "/dev/stderr" + ERR = 1 + } + } + if ( (!ERR || K) && !Q ) { + print HDR + print SEQ + } + if ( ERR && S ) { + exit 1 + } }' "$@" +# vim: sts=4:sw=4:et:si:ai