forked from lh3/samtools
-
Notifications
You must be signed in to change notification settings - Fork 1
/
errmod.c
134 lines (125 loc) · 3.46 KB
/
errmod.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <math.h>
#include "errmod.h"
#include "ksort.h"
#include "glibc_win64_flat/_math_.h"
#define lgamma __lgamma
KSORT_INIT_GENERIC(uint16_t)
typedef struct __errmod_coef_t {
double *fk, *beta, *lhet;
} errmod_coef_t;
typedef struct {
double fsum[16], bsum[16];
uint32_t c[16];
} call_aux_t;
static errmod_coef_t *cal_coef(double depcorr, double eta)
{
int k, n, q;
long double sum, sum1;
double *lC;
errmod_coef_t *ec;
ec = calloc(1, sizeof(errmod_coef_t));
// initialize ->fk
ec->fk = (double*)calloc(256, sizeof(double));
ec->fk[0] = 1.0;
for (n = 1; n != 256; ++n)
ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
// initialize ->coef
ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
lC = (double*)calloc(256 * 256, sizeof(double));
for (n = 1; n != 256; ++n) {
double lgn = lgamma(n+1);
for (k = 1; k <= n; ++k)
lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1);
}
for (q = 1; q != 64; ++q) {
double e = pow(10.0, -q/10.0);
double le = log(e);
double le1 = log(1.0 - e);
for (n = 1; n <= 255; ++n) {
double *beta = ec->beta + (q<<16|n<<8);
sum1 = sum = 0.0;
for (k = n; k >= 0; --k, sum1 = sum) {
sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
beta[k] = -10. / M_LN10 * logl(sum1 / sum);
}
}
}
// initialize ->lhet
ec->lhet = (double*)calloc(256 * 256, sizeof(double));
for (n = 0; n < 256; ++n)
for (k = 0; k < 256; ++k)
ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
free(lC);
return ec;
}
errmod_t *errmod_init(float depcorr)
{
errmod_t *em;
em = (errmod_t*)calloc(1, sizeof(errmod_t));
em->depcorr = depcorr;
em->coef = cal_coef(depcorr, 0.03);
return em;
}
void errmod_destroy(errmod_t *em)
{
if (em == 0) return;
free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
free(em->coef); free(em);
}
// qual:6, strand:1, base:4
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
call_aux_t aux;
int i, j, k, w[32];
if (m > m) return -1;
memset(q, 0, m * m * sizeof(float));
if (n == 0) return 0;
// calculate aux.esum and aux.fsum
if (n > 255) { // then sample 255 bases
ks_shuffle(uint16_t, n, bases);
n = 255;
}
ks_introsort(uint16_t, n, bases);
memset(w, 0, 32 * sizeof(int));
memset(&aux, 0, sizeof(call_aux_t));
for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
uint16_t b = bases[j];
int q = b>>5 < 4? 4 : b>>5;
if (q > 63) q = 63;
k = b&0x1f;
aux.fsum[k&0xf] += em->coef->fk[w[k]];
aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]];
++aux.c[k&0xf];
++w[k];
}
// generate likelihood
for (j = 0; j != m; ++j) {
float tmp1, tmp3;
int tmp2, bar_e;
// homozygous
for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) {
if (k == j) continue;
tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
}
if (tmp2) {
bar_e = (int)(tmp1 / tmp3 + 0.499);
if (bar_e > 63) bar_e = 63;
q[j*m+j] = tmp1;
}
// heterozygous
for (k = j + 1; k < m; ++k) {
int cjk = aux.c[j] + aux.c[k];
for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
if (i == j || i == k) continue;
tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
}
if (tmp2) {
bar_e = (int)(tmp1 / tmp3 + 0.499);
if (bar_e > 63) bar_e = 63;
q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
} else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
}
for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
}
return 0;
}