Skip to content

Commit

Permalink
edit README
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Aug 9, 2018
1 parent 0425b99 commit 67cd2d4
Show file tree
Hide file tree
Showing 10 changed files with 215 additions and 45 deletions.
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,9 @@
*.out

*.directory
unikmer/unikmer
unikmer/unikmer*
unikmer/binaries*
doc/site/*

*ssshtest
testdata/*.unik
139 changes: 138 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,138 @@

# unikmer

unikmer (unique Kmer) is a golang package and a command-line toolkit for
manipulating [Kmers](https://en.wikipedia.org/wiki/K-mer) while NOT recording
Kmer frequencies.

## The package

[![GoDoc](https://godoc.org/github.com/shenwei356/unikmer?status.svg)](https://godoc.org/github.com/shenwei356/unikmer)
[![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/unikmer)](https://goreportcard.com/report/github.com/shenwei356/unikmer)

The unikmer package provides basic manipulations of unique Kmers (NOT including
Kmer frequencies) and of their binary file format.

### Installation

go get -u github.com/shenwei356/unikmer

### Benchmark

$ go test . -bench=Bench*
goos: linux
goarch: amd64
pkg: github.com/shenwei356/unikmer
BenchmarkEncodeK32-4 20000000 98.1 ns/op
BenchmarkDecodeK32-4 20000000 102 ns/op
BenchmarkRevK32-4 20000000 64.2 ns/op
BenchmarkCompK32-4 20000000 54.8 ns/op
BenchmarkRevCompK32-4 10000000 116 ns/op


## The toolkit

`unikmer` is a command-line toolkit that provides functions including counting,
format conversion, set operations and searching on unique Kmers.

### Installation

1. Download [binary files](https://github.com/shenwei356/unikmer/releases).

1. Bioconda (not available now)

conda install unikmer

### Commands

1. Counting

count count Kmer from FASTA/Q sequences
subset extract smaller Kmers from binary file

1. Format conversion

view read and output binary format to plain text
dump convert plain Kmer text to binary format

1. Set operations

concat concatenate multiple binary files
diff set difference of multiple binary files
inter intersection of multiple binary files
union union of multiple binary files

1. Searching

grep search Kmer from binary file

1. Misc

genautocomplete generate shell autocompletion script
help Help about any command
version print version information and check for update

### Quick Start

# counting
$ time unikmer count -k 31 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz
real 0m5.209s
user 0m6.864s
sys 0m0.169s

$ ls -lh Ecoli-MG1655.fasta.gz*
-rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz
-rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.unik


# view
$ unikmer view Ecoli-MG1655.fasta.gz.unik | head -n 3
AGCTTTTCATTCTGACTGCAACGGGCAATAT
GCTTTTCATTCTGACTGCAACGGGCAATATG
CTTTTCATTCTGACTGCAACGGGCAATATGT

$ unikmer view Ecoli-MG1655.fasta.gz.unik | wc -l
9108538


# union
$ unikmer union Ecoli-MG1655.fasta.gz.unik Ecoli-IAI39.fasta.gz.unik -o union


# intersection
$ unikmer inter Ecoli-MG1655.fasta.gz.unik Ecoli-IAI39.fasta.gz.unik -o inter


# difference
$ unikmer diff -t 4 Ecoli-MG1655.fasta.gz.unik Ecoli-IAI39.fasta.gz.unik -o diff


# -------------------------------------------------------------------------

$ ls -lh
-rw-rw-r--. 1 shenwei shenwei 1.6M Aug 9 23:19 Ecoli-IAI39.fasta.gz
-rw-rw-r--. 1 shenwei shenwei 25M Aug 9 23:29 Ecoli-IAI39.fasta.gz.unik
-rw-rw-r--. 1 shenwei shenwei 1.4M Aug 9 23:19 Ecoli-MG1655.fasta.gz
-rw-rw-r--. 1 shenwei shenwei 23M Aug 9 23:29 Ecoli-MG1655.fasta.gz.unik
-rw-rw-r--. 1 shenwei shenwei 38M Aug 9 23:32 union.unik
-rw-rw-r--. 1 shenwei shenwei 35M Aug 9 23:33 inter.unik
-rw-rw-r--. 1 shenwei shenwei 35M Aug 9 23:34 diff.unik

$ unikmer view Ecoli-MG1655.fasta.gz.unik | wc -l
9108538
$ unikmer view Ecoli-IAI39.fasta.gz.unik | wc -l
9821960
$ unikmer view union.unik | wc -l
14402956
$ unikmer view inter.unik | wc -l
4527542
$ unikmer view diff.unik | wc -l
4580996


## Contributing

We welcome pull requests, bug fixes and issue reports.

## License

[MIT License](https://github.com/shenwei356/unikmer/blob/master/LICENSE)
28 changes: 14 additions & 14 deletions file.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,22 @@ import (
"io"
)

// MainVersion is the main version number
// MainVersion is the main version number.
const MainVersion int64 = 0

// MinorVersion is the minor version number
// MinorVersion is the minor version number.
const MinorVersion int64 = 1

// Magic number of binary file
// Magic number of binary file.
var Magic = [8]byte{'.', 'u', 'n', 'i', 'k', 'm', 'e', 'r'}

// ErrInvalidFileFormat means invalid file format
// ErrInvalidFileFormat means invalid file format.
var ErrInvalidFileFormat = errors.New("unikmer: invalid binary format")

// ErrBrokenFile means the file is not complete
// ErrBrokenFile means the file is not complete.
// var ErrBrokenFile = errors.New("unikmer: broken file")

// ErrKMismatch means K size mismatch
// ErrKMismatch means K size mismatch.
var ErrKMismatch = errors.New("unikmer: K mismatch")

var be = binary.BigEndian
Expand All @@ -57,7 +57,7 @@ func (h Header) String() string {
return fmt.Sprintf("unikmer binary kmer data file v%s, K=%d", h.Version, h.K)
}

// Reader is for reading KmerCode
// Reader is for reading KmerCode.
type Reader struct {
Header
r io.Reader
Expand All @@ -66,7 +66,7 @@ type Reader struct {
size uint64
}

// NewReader returns a Reader
// NewReader returns a Reader.
func NewReader(r io.Reader) (*Reader, error) {
reader := &Reader{r: r}
reader.err = reader.readHeader()
Expand Down Expand Up @@ -106,7 +106,7 @@ func (reader *Reader) readHeader() error {
return nil
}

// Read reads one KmerCode
// Read reads one KmerCode.
func (reader *Reader) Read() (KmerCode, error) {
reader.err = binary.Read(reader.r, be, &reader.code)
if reader.err != nil {
Expand All @@ -116,7 +116,7 @@ func (reader *Reader) Read() (KmerCode, error) {
return KmerCode{Code: reader.code, K: reader.Header.K}, nil
}

// Writer writes KmerCode
// Writer writes KmerCode.
type Writer struct {
Header
w io.Writer
Expand All @@ -126,7 +126,7 @@ type Writer struct {
size int64
}

// NewWriter creates a Writer
// NewWriter creates a Writer.
func NewWriter(w io.Writer, k int) *Writer {
return &Writer{
Header: Header{Version: fmt.Sprintf("%d.%d", MainVersion, MinorVersion), K: k},
Expand All @@ -148,7 +148,7 @@ func (writer *Writer) writeHeader() error {
return nil
}

// WriteKmer writes one Kmer
// WriteKmer writes one Kmer.
func (writer *Writer) WriteKmer(mer []byte) error {
writer.kcode, writer.err = NewKmerCode(mer)
if writer.err != nil {
Expand All @@ -157,7 +157,7 @@ func (writer *Writer) WriteKmer(mer []byte) error {
return writer.Write(writer.kcode)
}

// Write writes one KmerCode
// Write writes one KmerCode.
func (writer *Writer) Write(kcode KmerCode) error {
if writer.Header.K != kcode.K {
writer.err = ErrKMismatch
Expand All @@ -181,7 +181,7 @@ func (writer *Writer) Write(kcode KmerCode) error {
return nil
}

// Flush writes the size to the end
// Flush is not used actually.
func (writer *Writer) Flush() error {
// writer.err = binary.Write(writer.w, be, writer.size)
// if writer.err != nil {
Expand Down
2 changes: 1 addition & 1 deletion file_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func genKmers(k int, num int) [][]byte {
for i := 0; i < num; i++ {
mers[i] = make([]byte, k)
for j = 0; j < k; j++ {
mers[i][j] = code2base[rand.Intn(4)]
mers[i][j] = bit2base[rand.Intn(4)]
}
}
return mers
Expand Down
59 changes: 34 additions & 25 deletions kmer.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,34 @@ import (
"errors"
)

// ErrIllegalBase means that base beyond "ACGTU" was detected
// ErrIllegalBase means that a base beyond the IUPAC symbols was detected.
var ErrIllegalBase = errors.New("unikmer: illegal base")

// ErrKOverflow means K > 32
// ErrKOverflow means K > 32.
var ErrKOverflow = errors.New("unikmer: K (1-32) overflow")

// Encode converts byte slice to bits.
//
// M AC
// V ACG
// H ACT
// R AG
// D AGT
// W AT
// S CG
// B CGT
// Y CT
// K GT
// Codes:
//
// A 00
// C 01
// G 10
// T 11
//
// For degenerate bases, only the first base is kept.
//
// M AC A
// V ACG A
// H ACT A
// R AG A
// D AGT A
// W AT A
// S CG C
// B CGT C
// Y CT C
// K GT G
// N ACGT A
//
func Encode(mer []byte) (code uint64, err error) {
size := len(mer)
Expand All @@ -66,7 +75,7 @@ func Encode(mer []byte) (code uint64, err error) {
return code, nil
}

// Reverse returns code of reversed sequence
// Reverse returns code of the reversed sequence.
func Reverse(code uint64, k int) (c uint64) {
if k <= 0 || k > 32 {
panic(ErrKOverflow)
Expand All @@ -78,7 +87,7 @@ func Reverse(code uint64, k int) (c uint64) {
return
}

// Complement return code of complement sequence
// Complement returns code of complement sequence.
func Complement(code uint64, k int) (c uint64) {
if k <= 0 || k > 32 {
panic(ErrKOverflow)
Expand All @@ -90,29 +99,29 @@ func Complement(code uint64, k int) (c uint64) {
return
}

// code2base is for mapping code to base
var code2base = [4]byte{'A', 'C', 'G', 'T'}
// bit2base is for mapping bit to base.
var bit2base = [4]byte{'A', 'C', 'G', 'T'}

// Decode converts the bits to origional seq
// Decode converts a 2-bit-per-base code back into the original sequence of
// k bases. The lowest two bits of code hold the last base, so the result is
// filled from the end of the slice towards the start. It panics with
// ErrKOverflow when k is outside the range 1-32.
func Decode(code uint64, k int) []byte {
if k <= 0 || k > 32 {
panic(ErrKOverflow)
}
mer := make([]byte, k)
for i := 0; i < k; i++ {
// NOTE(review): this write is immediately overwritten by the next line;
// the pair looks like the before/after lines of the code2base -> bit2base
// rename. Confirm and keep only one.
mer[k-1-i] = code2base[code&3]
mer[k-1-i] = bit2base[code&3]
// Drop the two bits just consumed so the next base is in the low bits.
code >>= 2
}
return mer
}

// KmerCode is a struct representing a kmer in 64-bits
// KmerCode is a struct representing a kmer in 64 bits, together with its
// length.
type KmerCode struct {
// Code is the encoded kmer, two bits per base (A=00, C=01, G=10, T=11).
Code uint64
// K is the number of bases in the kmer.
K int
}

// NewKmerCode returns a new KmerCode from byte slice
// NewKmerCode returns a new KmerCode struct from byte slice.
func NewKmerCode(mer []byte) (KmerCode, error) {
code, err := Encode(mer)
if err != nil {
Expand All @@ -121,27 +130,27 @@ func NewKmerCode(mer []byte) (KmerCode, error) {
return KmerCode{code, len(mer)}, err
}

// Equal checks wether two KmerCodes are the same
// Equal reports whether two KmerCodes are the same, i.e. they have both the
// same K and the same Code.
func (kcode KmerCode) Equal(kcode2 KmerCode) bool {
	if kcode.K != kcode2.K {
		return false
	}
	return kcode.Code == kcode2.Code
}

// Rev returns KmerCode of the reverse sequence
// Rev returns the KmerCode of the reversed sequence. K is unchanged.
// The underlying Reverse call panics with ErrKOverflow if K is not in 1-32.
func (kcode KmerCode) Rev() KmerCode {
	reversed := Reverse(kcode.Code, kcode.K)
	return KmerCode{Code: reversed, K: kcode.K}
}

// Comp returns KmerCode of the complement sequence
// Comp returns the KmerCode of the complement sequence. K is unchanged.
// The underlying Complement call panics with ErrKOverflow if K is not in 1-32.
func (kcode KmerCode) Comp() KmerCode {
	complemented := Complement(kcode.Code, kcode.K)
	return KmerCode{Code: complemented, K: kcode.K}
}

// RevComp returns KmerCode of the reverse complement sequence
// RevComp returns the KmerCode of the reverse complement sequence, obtained
// by reversing the kmer first and then complementing the result.
func (kcode KmerCode) RevComp() KmerCode {
	reversed := kcode.Rev()
	return reversed.Comp()
}

// Bytes returns kmer in []byte
// Bytes returns the kmer as a []byte of bases by decoding Code with length K.
func (kcode KmerCode) Bytes() []byte {
	mer := Decode(kcode.Code, kcode.K)
	return mer
}
Expand Down
Loading

0 comments on commit 67cd2d4

Please sign in to comment.