Skip to content

Commit

Permalink
rfilter: cli fix and new flag -n
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Feb 5, 2021
1 parent a2d0ef8 commit 24c0c81
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 45 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

- v0.17.2
- `unikmer`: slight speedup for computing LCA.
- `unikmer rfilter:`
- flag `-E/--equal-to` supports multiple values.
- new flag `-n/--save-predictable-norank`: when using -L, do not discard some special ranks without order, as long as the rank of the closest higher node is still lower than the rank cutoff.
- v0.17.1
- `unikmer rfilter:` change handling of black list.
- v0.17.0
Expand Down
148 changes: 104 additions & 44 deletions unikmer/cmd/rfilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,11 @@ Attentions:
2. A list of pre-ordered ranks is in ~/.taxonkit/ranks.txt, you can use
your list by -r/--rank-file, the format specification is below.
3. All ranks in taxonomy database should be defined in rank file.
4. TaxIDs with no rank can be optionally discarded by -N/--discard-noranks.
5. Futher ranks can be removed with black list via -B/--black-list.
4. Ranks can be removed with black list via -B/--black-list.
5. TaxIDs with no rank can be optionally discarded by -N/--discard-noranks.
6. But when filtering with -L/--lower-than, you can use
-n/--save-predictable-norank to save some special ranks without order,
where rank of the closest higher node is still lower than rank cutoff.
Rank file:
1. Blank lines or lines starting with "#" are ignored.
Expand Down Expand Up @@ -86,14 +89,20 @@ Rank file:
rankFile := getFlagString(cmd, "rank-file")

discardNoRank := getFlagBool(cmd, "discard-noranks")
saveNorank := getFlagBool(cmd, "save-predictable-norank")

blackListRanks := getFlagStringSlice(cmd, "black-list")

rootTaxid := getFlagUint32(cmd, "root-taxid")
discardRoot := getFlagBool(cmd, "discard-root")

higher := strings.ToLower(getFlagString(cmd, "higher-than"))
lower := strings.ToLower(getFlagString(cmd, "lower-than"))
equal := strings.ToLower(getFlagString(cmd, "equal-to"))
equalsS := getFlagStringSlice(cmd, "equal-to")
equals := make([]string, 0, len(equalsS))
for _, val := range equalsS {
equals = append(equals, strings.ToLower(val))
}

listOrder := getFlagBool(cmd, "list-order")
listRanks := getFlagBool(cmd, "list-ranks")
Expand All @@ -102,6 +111,14 @@ Rank file:
checkError(fmt.Errorf("-H/--higher-than and -L/--lower-than can't be simultaneous given"))
}

if saveNorank {
discardNoRank = true

if lower == "" {
checkError(fmt.Errorf("flag -n/--save-predictable-norank only works along with -L/--lower-than"))
}
}

rankOrder, noRanks, err := readRankOrder(opt, rankFile)
checkError(errors.Wrap(err, rankFile))

Expand Down Expand Up @@ -193,7 +210,7 @@ Rank file:
}
}

filter, err := newRankFilter(taxondb.Ranks, rankOrder, noRanks, lower, higher, equal, blackListRanks, discardNoRank)
filter, err := newRankFilter(taxondb, rankOrder, noRanks, lower, higher, equals, blackListRanks, discardNoRank, saveNorank)
checkError(err)

if !isStdout(outFile) {
Expand Down Expand Up @@ -269,12 +286,7 @@ Rank file:
continue
}

rank = taxondb.Rank(taxid)
if rank == "" {
continue
}

pass, err = filter.isPassed(rank)
pass, err = filter.isPassed(taxid)
if err != nil {
checkError(errors.Wrapf(err, "file: %s, rank: %s", file, rank))
}
Expand Down Expand Up @@ -312,24 +324,27 @@ func init() {
rfilterCmd.Flags().BoolP("list-order", "", false, "list defined ranks in order")
rfilterCmd.Flags().BoolP("list-ranks", "", false, "list ordered ranks in taxonomy database")

filterCmd.Flags().BoolP("discard-noranks", "N", false, `discard ranks without order, type "taxonkit filter --help" for details`)
filterCmd.Flags().StringSliceP("black-list", "B", []string{}, `black list of ranks to discard, e.g., '"no rank", "clade"'`)
rfilterCmd.Flags().BoolP("discard-noranks", "N", false, `discard ranks without order, type "unikmer filter --help" for details`)
rfilterCmd.Flags().BoolP("save-predictable-norank", "n", false, `do not discard some special ranks without order when using -L, where rank of the closest higher node is still lower than rank cutoff`)
rfilterCmd.Flags().StringSliceP("black-list", "B", []string{}, `black list of ranks to discard, e.g., '"no rank", "clade"'`)

rfilterCmd.Flags().BoolP("discard-root", "R", false, `discard root taxid, defined by --root-taxid`)
rfilterCmd.Flags().Uint32P("root-taxid", "", 1, `root taxid`)

rfilterCmd.Flags().StringP("lower-than", "L", "", "output ranks lower than a rank, exclusive with --higher-than")
rfilterCmd.Flags().StringP("higher-than", "H", "", "output ranks higher than a rank, exclusive with --lower-than")
rfilterCmd.Flags().StringP("equal-to", "E", "", "output ranks equal to a rank")
rfilterCmd.Flags().StringSliceP("equal-to", "E", []string{}, `output taxIDs with rank equal to some ranks, multiple values can be separated with comma "," (e.g., -E "genus,species"), or give multiple times (e.g., -E genus -E species)`)
}

type rankFilter struct {
taxondb *unikmer.Taxonomy

dbRanks map[string]interface{}
rankOrder map[string]int

lower string
higher string
equal string
equals []string

oLower int
oHigher int
Expand All @@ -341,14 +356,16 @@ type rankFilter struct {

noRanks map[string]interface{}
blackLists map[string]interface{}
oEquals map[int]interface{}

discardNorank bool
discardNorank bool
saveKnownNoRank bool

cache map[string]bool
cache map[uint32]bool
}

func newRankFilter(dbRanks map[string]interface{}, rankOrder map[string]int, noRanks map[string]interface{},
lower, higher, equal string, blackList []string, discardNorank bool) (*rankFilter, error) {
func newRankFilter(taxondb *unikmer.Taxonomy, rankOrder map[string]int, noRanks map[string]interface{},
lower string, higher string, equals []string, blackList []string, discardNorank bool, saveKnownNoRank bool) (*rankFilter, error) {

if lower != "" && higher != "" {
return nil, fmt.Errorf("higher and lower can't be simultaneous given")
Expand All @@ -358,16 +375,19 @@ func newRankFilter(dbRanks map[string]interface{}, rankOrder map[string]int, noR
for _, r := range blackList {
blackListMap[r] = struct{}{}
}
dbRanks := taxondb.Ranks
f := &rankFilter{
dbRanks: dbRanks,
rankOrder: rankOrder,
lower: lower,
higher: higher,
equal: equal,
noRanks: noRanks,
blackLists: blackListMap,
discardNorank: discardNorank,
cache: make(map[string]bool, 1024),
taxondb: taxondb,
dbRanks: dbRanks,
rankOrder: rankOrder,
lower: lower,
higher: higher,
equals: equals,
noRanks: noRanks,
blackLists: blackListMap,
discardNorank: discardNorank,
saveKnownNoRank: saveKnownNoRank,
cache: make(map[uint32]bool, 1024),
}
var err error
if lower != "" {
Expand All @@ -384,10 +404,15 @@ func newRankFilter(dbRanks map[string]interface{}, rankOrder map[string]int, noR
}
f.limitHigher = true
}
if equal != "" {
f.oEqual, err = getRankOrder(dbRanks, rankOrder, equal)
if err != nil {
return nil, err
if len(equals) > 0 {
f.oEquals = make(map[int]interface{}, len(equals))
var oe int
for _, equal := range equals {
oe, err = getRankOrder(dbRanks, rankOrder, equal)
if err != nil {
return nil, err
}
f.oEquals[oe] = struct{}{}
}
f.limitEqual = true
}
Expand All @@ -406,36 +431,71 @@ func getRankOrder(dbRanks map[string]interface{}, rankOrder map[string]int, rank
return rankOrder[rank], nil
}

func (f *rankFilter) isPassed(rank string) (bool, error) {
func (f *rankFilter) isPassed(taxid uint32) (bool, error) {
rank := f.taxondb.Rank(taxid)
if rank == "" {
return false, nil
}

rank = strings.ToLower(rank)

if v, ok := f.cache[rank]; ok {
if v, ok := f.cache[taxid]; ok {
return v, nil
}

if f.discardNorank {
if _, ok := f.noRanks[rank]; ok {
f.cache[rank] = false
return false, nil
}
}

if _, ok := f.blackLists[rank]; ok {
f.cache[rank] = false
f.cache[taxid] = false
return false, nil
}

var isNoRank bool
_, ok := f.noRanks[rank]
if ok {
if f.discardNorank {
isNoRank = true
if !f.saveKnownNoRank {
f.cache[taxid] = false
return false, nil
}
}
}

var pass bool

if isNoRank && f.limitLower && f.saveKnownNoRank {
nodes := f.taxondb.Nodes
var _rank string
var _ok bool
var _order int

parent := nodes[taxid]
for {
if parent == 1 {
f.cache[taxid] = false
return false, nil
}

_rank = f.taxondb.Rank(parent)
_order, _ok = f.rankOrder[_rank]
if _ok {
pass = _order <= f.oLower

f.cache[taxid] = pass
return pass, nil
}
parent = nodes[parent]
}
}

order, _ := f.rankOrder[rank]
// order, ok := f.rankOrder[rank]
// if !ok {
// return false, fmt.Errorf("rank order not defined in rank file: %s", rank)
// }

if f.limitEqual {
if f.oEqual == order {
pass = true
if _, pass = f.oEquals[order]; pass {
// pass = true
} else if f.limitLower {
pass = order < f.oLower
} else if f.limitHigher {
Expand All @@ -451,7 +511,7 @@ func (f *rankFilter) isPassed(rank string) (bool, error) {
pass = true // no any filter
}

f.cache[rank] = pass
f.cache[taxid] = pass
return pass, nil
}

Expand Down
2 changes: 1 addition & 1 deletion unikmer/cmd/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import (
)

// VERSION is the version
var VERSION = "0.16.1"
var VERSION = "0.17.2"

// versionCmd represents the version command
var versionCmd = &cobra.Command{
Expand Down

0 comments on commit 24c0c81

Please sign in to comment.