Skip to content

Commit

Permalink
r52: support approximate long k-mers
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Mar 11, 2020
1 parent f9be57f commit ab78d88
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 28 deletions.
22 changes: 21 additions & 1 deletion count.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@ static void count_seq_buf(ch_buf_t *buf, int k, int p, int len, const char *seq)
}
}

static void count_seq_buf_long(ch_buf_t *buf, int k, int p, int len, const char *seq) // insert k-mers in $seq to linear buffer $buf
{
int i, l;
uint64_t x[4], mask = (1ULL<<k) - 1, shift = k - 1;
for (i = l = 0, x[0] = x[1] = x[2] = x[3] = 0; i < len; ++i) {
int c = seq_nt4_table[(uint8_t)seq[i]];
if (c < 4) { // not an "N" base
x[0] = (x[0] << 1 | (c&1)) & mask;
x[1] = (x[1] << 1 | (c>>1)) & mask;
x[2] = x[2] >> 1 | (uint64_t)(1 - (c&1)) << shift;
x[3] = x[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift;
if (++l >= k)
ch_insert_buf(buf, p, yak_hash_long(x));
} else l = 0, x[0] = x[1] = x[2] = x[3] = 0; // if there is an "N", restart
}
}

typedef struct { // global data structure for kt_pipeline()
const yak_copt_t *opt;
int create_new;
Expand Down Expand Up @@ -101,7 +118,10 @@ static void *worker_pipeline(void *data, int step, void *in) // callback for kt_
MALLOC(s->buf[i].a, m);
}
for (i = 0; i < s->n; ++i) {
count_seq_buf(s->buf, p->opt->k, p->opt->pre, s->len[i], s->seq[i]);
if (p->opt->k < 32)
count_seq_buf(s->buf, p->opt->k, p->opt->pre, s->len[i], s->seq[i]);
else
count_seq_buf_long(s->buf, p->opt->k, p->opt->pre, s->len[i], s->seq[i]);
free(s->seq[i]);
}
free(s->seq); free(s->len);
Expand Down
23 changes: 6 additions & 17 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ int main_count(int argc, char *argv[])
fprintf(stderr, "ERROR: -p should be at least %d\n", YAK_COUNTER_BITS);
return 1;
}
if (opt.k >= 64) {
fprintf(stderr, "ERROR: -k must be smaller than 64\n");
return 1;
} else if (opt.k >= 32) {
fprintf(stderr, "WARNING: counts are inexact if -k is greater than 31\n");
}
h = yak_count(argv[o.ind], &opt, 0);
if (opt.bf_shift > 0) {
yak_ch_destroy_bf(h);
Expand Down Expand Up @@ -120,22 +126,6 @@ int main_qv(int argc, char *argv[])
return 0;
}

int main_test(int argc, char *argv[])
{
yak_ch_t *h;
int64_t cnt[YAK_N_COUNTS];
int i;
if (argc == 1) {
fprintf(stderr, "Usage: yaks test <in.yak>\n");
return 1;
}
h = yak_ch_restore(argv[1]);
yak_ch_hist(h, cnt, 4);
for (i = 1; i < YAK_N_COUNTS; ++i)
printf("%d\t%ld\n", i, (long)cnt[i]);
return 0;
}

int main(int argc, char *argv[])
{
extern int main_triobin(int argc, char *argv[]);
Expand All @@ -156,7 +146,6 @@ int main(int argc, char *argv[])
else if (strcmp(argv[1], "qv") == 0) ret = main_qv(argc-1, argv+1);
else if (strcmp(argv[1], "triobin") == 0) ret = main_triobin(argc-1, argv+1);
else if (strcmp(argv[1], "inspect") == 0) ret = main_inspect(argc-1, argv+1);
else if (strcmp(argv[1], "test") == 0) ret = main_test(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0) {
puts(YAKS_VERSION);
return 0;
Expand Down
1 change: 1 addition & 0 deletions qv.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ static void worker_qv(void *_data, long k, int tid)
int i, l, tot, non0, shift = 2 * (qs->ch->k - 1);
uint64_t x[2], mask = (1ULL<<2*qs->ch->k) - 1;

assert(qs->ch->k < 32);
if (s->l_seq < qs->opt->min_len) return;
if (b->max < s->l_seq) {
b->max = s->l_seq;
Expand Down
39 changes: 30 additions & 9 deletions triobin.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,24 +44,41 @@ static void tb_worker(void *_data, long k, int tid)
tb_shared_t *aux = t->aux;
bseq1_t *s = &t->seq[k];
tb_buf_t *b = &aux->buf[tid];
uint64_t x[2], mask = (1ULL<<2*aux->ch->k) - 1;
int i, l, shift = 2 * (aux->ch->k - 1);
uint64_t x[4], mask;
int i, l, shift;
if (aux->ch->k < 32) {
mask = (1ULL<<2*aux->ch->k) - 1;
shift = 2 * (aux->ch->k - 1);
} else {
mask = (1ULL<<aux->ch->k) - 1;
shift = aux->ch->k - 1;
}
if (s->l_seq > b->max) {
b->max = s->l_seq;
kroundup32(b->max);
b->s = (uint32_t*)realloc(b->s, b->max * sizeof(uint32_t));
}
memset(b->s, 0, s->l_seq * sizeof(uint32_t));
for (i = l = 0, x[0] = x[1] = 0; i < s->l_seq; ++i) {
for (i = l = 0, x[0] = x[1] = x[2] = x[3] = 0; i < s->l_seq; ++i) {
int flag, c = seq_nt4_table[(uint8_t)s->seq[i]];
if (c < 4) {
x[0] = (x[0] << 2 | c) & mask;
x[1] = x[1] >> 2 | (uint64_t)(3 - c) << shift;
if (aux->ch->k < 32) {
x[0] = (x[0] << 2 | c) & mask;
x[1] = x[1] >> 2 | (uint64_t)(3 - c) << shift;
} else {
x[0] = (x[0] << 1 | (c&1)) & mask;
x[1] = (x[1] << 1 | (c>>1)) & mask;
x[2] = x[2] >> 1 | (uint64_t)(1 - (c&1)) << shift;
x[3] = x[3] >> 1 | (uint64_t)(1 - (c>>1)) << shift;
}
if (++l >= aux->k) {
int type = 0, c1, c2;
uint64_t y = x[0] < x[1]? x[0] : x[1];
uint64_t y;
++t->cnt[k].nk;
y = yak_hash64(y, mask);
if (aux->ch->k < 32)
y = yak_hash64(x[0] < x[1]? x[0] : x[1], mask);
else
y = yak_hash_long(x);
flag = yak_ch_get(aux->ch, y);
if (flag < 0) flag = 0;
c1 = flag&3, c2 = flag>>2&3;
Expand All @@ -72,7 +89,7 @@ static void tb_worker(void *_data, long k, int tid)
if (aux->print_diff && (flag>>2&3) != (flag&3))
printf("D\t%s\t%d\t%d\t%d\n", s->name, i, flag&3, flag>>2&3);
}
} else l = 0, x[0] = x[1] = 0;
} else l = 0, x[0] = x[1] = x[2] = x[3] = 0;
}
for (l = 0, i = 1; i <= s->l_seq; ++i) {
if (i == s->l_seq || b->s[i] != b->s[l]) {
Expand Down Expand Up @@ -156,7 +173,7 @@ int main_triobin(int argc, char *argv[])
fprintf(stderr, " -c INT min occurrence [%d]\n", min_cnt);
fprintf(stderr, " -d INT mid occurrence [%d]\n", mid_cnt);
fprintf(stderr, " -t INT number of threads [%d]\n", aux.n_threads);
fprintf(stderr, "Output: ctg err strongMixed sPat sMat weakMixed wPat1 wMat1 wPat2 wMat2\n");
// fprintf(stderr, "Output: ctg err strongMixed sPat sMat weakMixed wPat1 wMat1 wPat2 wMat2\n");
return 1;
}

Expand All @@ -165,6 +182,10 @@ int main_triobin(int argc, char *argv[])

aux.k = ch->k;
aux.fp = bseq_open(argv[o.ind+2]);
if (aux.fp == 0) {
fprintf(stderr, "ERROR: fail to open file '%s'\n", argv[o.ind+2]);
exit(1);
}
aux.ch = ch;
aux.buf = (tb_buf_t*)calloc(aux.n_threads, sizeof(tb_buf_t));
kt_pipeline(2, tb_pipeline, &aux, 2);
Expand Down
18 changes: 18 additions & 0 deletions yak-priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,24 @@ static inline uint64_t yak_hash64(uint64_t key, uint64_t mask) // invertible int
return key;
}

static inline uint64_t yak_hash64_64(uint64_t key)
{
key = ~key + (key << 21);
key = key ^ key >> 24;
key = (key + (key << 3)) + (key << 8);
key = key ^ key >> 14;
key = (key + (key << 2)) + (key << 4);
key = key ^ key >> 28;
key = key + (key << 31);
return key;
}

static inline uint64_t yak_hash_long(uint64_t x[4])
{
int j = x[1] < x[3]? 0 : 1;
return yak_hash64_64(x[j<<1|0]) + yak_hash64_64(x[j<<1|1]);
}

double yak_cputime(void);
void yak_reset_realtime(void);
double yak_realtime(void);
Expand Down
2 changes: 1 addition & 1 deletion yak.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef YAK_H
#define YAK_H

#define YAKS_VERSION "r51"
#define YAKS_VERSION "r52"

#include <stdint.h>

Expand Down

0 comments on commit ab78d88

Please sign in to comment.