#!/bin/sh
#
#  uf-to-phy - Convert unfasta to phylip format
#  Copyright (C) 2022  Marco van Zwetselaar <io@zwets.it>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#  This utility is part of http://io.zwets.it/unfasta

# err_exit MESSAGE...
#   Write "<program name>: MESSAGE" to stderr and terminate the script with
#   exit status 1.  All arguments are joined into one message ("$*", i.e.
#   separated by the first character of IFS, a space by default).
#   printf '%s' is used instead of echo so that backslashes in the message
#   are emitted literally, whichever shell happens to provide /bin/sh.
err_exit() {
    printf '%s: %s\n' "$(basename "$0")" "$*" >&2
    exit 1
}

# usage_exit [STATUS]
#   Print the usage text on stderr and terminate the script with exit
#   status STATUS (default 1).  The text goes to stderr in all cases, also
#   when help was requested explicitly and STATUS is 0.
usage_exit() {
    # "$0" is quoted so a script path containing whitespace or glob
    # characters still yields the right program name.
    cat >&2 <<EOF

Usage: $(basename "$0") [OPTIONS] [FILE ...]

  Convert one or more unfasta FILEs to relaxed Phylip format.  If no FILE
  is present or FILE is '-' read from standard input.

  OPTIONS
   -r, --recode  Convert invalid characters in the sequence ID to underscores
   -h, --help    Display this message

  The sequence names in the output are the sequence identifiers in the input.
  Invalid characters in the identifier abort the conversion, unless option -r
  is used to convert them to underscores.

  The program aborts when a duplicate sequence identifier is encountered.


EOF
    exit "${1:-1}"
}

# Parse options
#
# Leading arguments that start with "-" and have at least one more character
# are consumed as options; parsing stops at the first argument that does not
# (so a lone "-" is left in place as a FILE operand).  "--" ends option
# parsing explicitly, so that FILE names starting with "-" can follow it.
# Matching is done with case patterns only: no external expr is spawned, so
# operands that expr would read as operators or keywords cannot trip it up.
#
# Result: RECODE is 1 when -r/--recode was given, else 0; "$@" holds the
# remaining FILE operands.

RECODE=0
while [ $# -gt 0 ]; do
    case "$1" in
    --)      shift; break ;;
    -r|--r*) RECODE=1 ;;
    -h|--h*) usage_exit 0 ;;
    -?*)     usage_exit ;;      # unknown option
    *)       break ;;           # first FILE operand (includes a lone "-")
    esac
    shift
done

# Do the work
#
# The gawk program reads the input as pairs of lines: every odd line is a
# header (">ID ...") and every even line is the sequence belonging to it.
# All records are buffered, because the Phylip header line (number of
# sequences and sequence length) must be written before the first record.
#
# Variables handed to gawk:
#   P       program name, used as the prefix of error messages
#   RECODE  1 to replace invalid identifier characters by underscores,
#           0 to abort when an identifier contains one
#
# The exit status of gawk (1 on any error) is the exit status of the script.

gawk -bO -v P="$(basename "$0")" -v RECODE="$RECODE" '
    # SN is the 1-based sequence number: lines 2k-1 and 2k make up sequence k
    { SN = int((NR+1) / 2) }

    # Header line: take the identifier, which is everything between the
    # leading ">" and the first space
    NR % 2 == 1 {
        OID = gensub(/^>([^ ]+).*$/, "\\1", "g", $0)
        NID = gensub(/[()\[\];:,]/, "_", "g", OID)
        # NID differs from OID only if OID has characters that are invalid
        # in a Phylip sequence name; this is fatal unless recoding is on
        if (NID != OID && !RECODE) {
            print P ": invalid character in identifier: \"" OID "\" (use option -r to recode)" >"/dev/stderr"
            ERR = 1; exit 1
        }
        # The "in" test does not create an array element as a side effect
        if (NID in CHK) {
            print P ": duplicate identifier: " NID (OID==NID?"":" (recoded from \"" OID "\")") >"/dev/stderr"
            ERR = 1; exit 1
        }
        IDS[SN] = NID; CHK[NID] = 1
        # PAD tracks the longest identifier, to align the sequence column
        if (length(NID) > PAD) PAD = length(NID)
    }

    # Sequence line: all sequences must have the length of the first one
    NR % 2 == 0 {
        if (LEN && length() != LEN) {
            print P ": sequence with id \"" NID "\" has deviant length: " length() " != " LEN >"/dev/stderr"
            ERR = 1; exit 1
        }
        else LEN = length()
        SEQ[SN] = $0
    }

    # An exit in a main rule still runs END, so ERR guards against writing
    # partial output after an error was reported
    END {
        if (ERR) exit(1)
        print length(CHK) " " LEN
        N = length(IDS)
        for (SN = 1; SN <= N; ++SN) printf("%-" PAD "s %s\n", IDS[SN], SEQ[SN])
    }' "$@"

# vim: sts=4:sw=4:et:si:ai