-
Notifications
You must be signed in to change notification settings - Fork 0
/
uf-select
executable file
·102 lines (88 loc) · 3.37 KB
/
uf-select
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/bin/sh
#
# uf-select - Select sequences from an unfasta file
# Copyright (C) 2016 Marco van Zwetselaar <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This utility is part of http://io.zwets.it/unfasta
# err_exit MESSAGE...
#
# Write "<program name>: MESSAGE" to standard error and exit with status 1.
# All arguments are joined with single spaces to form MESSAGE.
#
# printf is used rather than echo because MESSAGE may carry user input (for
# instance a rejected option value); echo implementations that interpret
# backslash escapes would alter such text, printf '%s' prints it verbatim.
err_exit() {
    printf '%s: %s\n' "$(basename "$0")" "$*" >&2
    exit 1
}
# usage_exit [STATUS]
#
# Print the usage message to standard error and exit with STATUS
# (default 1; --help passes 0).
usage_exit() {
    # "$0" is quoted so that a script path containing whitespace is handed to
    # basename as one argument instead of being split into NAME and SUFFIX.
    printf '%s\n' "
Usage: $(basename "$0") [OPTIONS] [FILE ...]
Select sequences from unfasta FILEs and write to standard output.
If no FILE is present or when FILE is '-', read standard input.
OPTIONS
-n, --nth N[,..] Select the nth sequence(s) for each file
-i, --id SEQID Select the sequence whose ID equals SEQID
-g, --grep REGEX Select sequences whose header matches REGEX
-l, --len MIN Select sequences whose length is at least MIN
Options -n, -i, -g are mutually exclusive. One of -n, -i, -g, -l
must be specified.
" >&2
    exit "${1:-1}"
}
# Parse options
#
# Consume the leading option arguments of "$@" and record their values in
# NTH, SEQID, REGEX and MINLEN; whatever remains afterwards are the FILE
# operands.  Parsing stops at the first argument that is not an option; a
# lone '-' is a FILE operand (standard input), not an option.
#
# Arguments are classified with case patterns instead of expr: expr treats
# operands such as 'length', 'match' or 'index' as keywords and fails on them.
unset NTH SEQID REGEX
MINLEN=0
while [ $# -ne 0 ]; do
    case "$1" in
        --nth=*)    NTH="${1#--nth=}" ;;
        --id=*)     SEQID="${1#--id=}" ;;
        --grep=*)   REGEX="${1#--grep=}" ;;
        --len=*)    MINLEN="${1#--len=}" ;;
        # Options taking a separate value: make sure the value is present
        # before shifting, since a failing shift aborts some shells outright.
        -n|--nth)   [ $# -ge 2 ] || usage_exit; shift; NTH="$1" ;;
        -i|--id)    [ $# -ge 2 ] || usage_exit; shift; SEQID="$1" ;;
        -g|--grep)  [ $# -ge 2 ] || usage_exit; shift; REGEX="$1" ;;
        -l|--len)   [ $# -ge 2 ] || usage_exit; shift; MINLEN="$1" ;;
        -h|--help)  usage_exit 0 ;;
        -?*)        usage_exit ;;   # unknown option
        *)          break ;;        # first FILE operand (or a lone '-')
    esac
    shift
done
# Check options
#
# --nth, --id and --grep are mutually exclusive: count how many were given
# and reject any combination of two or more.
SELECTORS=0
[ -z "$NTH" ]   || SELECTORS=$((SELECTORS + 1))
[ -z "$SEQID" ] || SELECTORS=$((SELECTORS + 1))
[ -z "$REGEX" ] || SELECTORS=$((SELECTORS + 1))
[ "$SELECTORS" -le 1 ] || err_exit "options --nth, --id, and --grep cannot be combined"

# NTH, when given, must be a comma-separated list of unsigned integers: only
# digits and commas, no leading or trailing comma, no empty element.
case "$NTH" in
    '') ;;
    *[!0-9,]*|,*|*,|*,,*) err_exit "not a valid number list: $NTH" ;;
esac

# MINLEN is compared numerically further on, so it must be an unsigned
# integer; an unset MINLEN is treated as 0, an empty one is rejected.
case "${MINLEN-0}" in
    ''|*[!0-9]*) err_exit "not a valid minimum length: $MINLEN" ;;
esac
# Do the work
#
# Every FILE is reduced to the header/sequence line pairs picked by the
# active selector (--nth, --id or --grep; with only --len every pair is
# passed on).  The combined stream of all FILEs is then filtered on minimum
# sequence length by the final gawk.  Input is read as alternating header
# and sequence lines.

# Pick the selection mode before starting the pipeline: a usage error raised
# inside the loop would only end the loop's subshell, and the script would
# still finish with the exit status of the final gawk.
MODE=
if [ -n "$NTH" ]; then
    MODE=nth
elif [ -n "$SEQID" ]; then
    MODE=id
elif [ -n "$REGEX" ]; then
    MODE=grep
elif [ "$MINLEN" -ne 0 ]; then
    MODE=all
else
    usage_exit
fi

FILE="${1:--}"
while [ -n "$FILE" ]; do
    case "$MODE" in
        nth)
            # Record number of a line is int((NR+1)/2): lines 1,2 -> 1; 3,4 -> 2; ...
            gawk -bO -v NTH="$NTH" '
                BEGIN { split(NTH, wanted, /,/) }
                { for (i in wanted) if (int((NR+1)/2) == wanted[i]) print }
            ' "$FILE"
            ;;
        id)
            # The value travels through ENVIRON, which gawk takes verbatim;
            # a -v assignment would interpret backslash escapes in it.
            SEQID="$SEQID" gawk -bO '
                NR % 2 == 1 && $1 == ">" ENVIRON["SEQID"] { print; getline; print }
            ' "$FILE"
            ;;
        grep)
            # The regex is matched as data and never spliced into the program
            # text, so it may contain "/" and cannot inject awk code.
            REGEX="$REGEX" gawk -bO '
                NR % 2 == 1 && $0 ~ ENVIRON["REGEX"] { print; getline; print }
            ' "$FILE"
            ;;
        all)
            cat "$FILE"
            ;;
    esac
    # Advance to the next FILE operand; an empty FILE ends the loop.
    if [ $# -gt 1 ]; then
        shift
        FILE="$1"
    else
        FILE=""
    fi
done | gawk -bO -v MINLEN="$MINLEN" '
    NR%2 == 1 { H=$0 }
    NR%2 == 0 && length() >= MINLEN { print H; print }'
# vim: sts=4:sw=4:et:si:ai