-
Notifications
You must be signed in to change notification settings - Fork 1
/
make.summary.data.R
executable file
·209 lines (167 loc) · 7.36 KB
/
make.summary.data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/env Rscript
argv <- commandArgs(trailingOnly = TRUE)
if(length(argv) < 6) {
q()
}
options(stringsAsFactors = FALSE)
source('Util.R')
library(fqtl)
library(dplyr)
library(readr)
library(methods)
chr <- as.integer(argv[1]) # e.g., chr = 19
gene.idx.str <- argv[2] # e.g., gene.idx.str = '1:33'
tis.name <- argv[3] # e.g., tis.name = '10_Brain_Cerebellar_Hemisphere'
n.ctrl.per.gene <- as.integer(argv[4]) # e.g., n.ctrl.per.gene = 3
data.type <- argv[5] # e.g., data.type = 'rsem-count'
out.hdr <- argv[6] # e.g., out.hdr = 'temp'
out.resid.file <- glue(out.hdr, '.resid.gz')
out.qtl.file <- glue(out.hdr, '.qtl.gz')
out.sample.file <- glue(out.hdr, '.samples.gz')
out.gene.file <- glue(out.hdr, '.genes.gz')
gene.idx <- eval(parse(text = gene.idx.str))
################################################################
## scratch pad
temp.dir <- system(glue('mkdir -p ', out.hdr, '; mktemp -d ', out.hdr, '/temp.XXXXXXXX'),
intern = TRUE, ignore.stderr = TRUE)
################################################################
## take expression matrices
data.file <- glue('data/', tis.name, '/chr', chr, '/', data.type, '.txt.gz')
gene.file <- glue('data/', tis.name, '/chr', chr, '/genes.txt.gz')
sample.file <- glue('data/', tis.name, '/samples.txt.gz')
other.data.files <- glue('data/', tis.name, '/chr', setdiff(1:22, chr), '/rsem-count.txt.gz')
size.factor.file <- glue('data/', tis.name, '/rsem-size-factor.txt.gz')
.read.tsv <- function(...) { readr::read_tsv(..., col_names = FALSE) %>% as.matrix() }
################################################################
## transformation of raw data into
size.factor <- .read.tsv(size.factor.file)
adjust.size <- function(mat, sf = size.factor[, 1]) { sweep(mat, 1, sf, `/`) }
rm.zero <- function(mat) { mat[mat == 0] <- NA; return(mat) }
stdize.count <- function(xx) {
xx[xx < .5] <- NA
xx.med <- apply(xx, 2, median, na.rm = TRUE)
xx.scaled <- sweep(xx, 2, xx.med, `/`)
ret <- xx.scaled * 10
}
.trans.normal <- function(..., .data.type = c('rsem-count', 'vsn')) {
.data.type <- match.arg(.data.type)
if(.data.type == 'rsem-count') {
ret <- log2(1/2 + rm.zero(...) %>% adjust.size()) %>% scale()
} else if(.data.type == 'vsn') {
ret <- scale(...)
}
return(ret)
}
.pre <- function(..., .data.type = c('rsem-count', 'vsn')) {
.data.type <- match.arg(.data.type)
if(.data.type == 'rsem-count') {
ret <- rm.zero(...) %>% adjust.size() %>% stdize.count()
} else if(.data.type == 'vsn') {
ret <- scale(...)
}
return(ret)
}
.model.name <- function(.data.type = c('rsem-count', 'vsn')) {
.data.type <- match.arg(.data.type)
if(.data.type == 'rsem-count') {
ret <- 'nb'
} else if(.data.type == 'vsn') {
ret <- 'gaussian'
}
return(ret)
}
trans.normal <- function(...) .trans.normal(..., .data.type = data.type)
model.name <- .model.name(data.type)
preprocess <- function(...) .pre(..., .data.type = data.type)
find.most.cor.genes <- function(.others, Y, n.ctrl = n.ctrl.per.gene, na.permit = 0) {
valid.others <- which(apply(is.na(.others), 2, sum) <= na.permit)
.others <- .others %c% valid.others
.cor <- abs(fast.cor(.others, Y))
.ret <- lapply(1:ncol(.cor), function(j) order(.cor[, j], decreasing = TRUE)[1:n.ctrl])
.ret <- valid.others[unique(do.call(c, .ret))]
return(.ret)
}
take.cor.genes.chr <- function(other.chr, Y, n.ctrl = n.ctrl.per.gene) {
.others <- .read.tsv(other.data.files[other.chr])
.ret <- find.most.cor.genes(.others %>% trans.normal(), Y %>% trans.normal(), n.ctrl)
return(.others %c% .ret)
}
subset.plink <- function(plink.hdr, chr, plink.lb, plink.ub, temp.dir) {
require(fqtl)
require(dplyr)
plink.cmd <- sprintf('./bin/plink --bfile %s --make-bed --geno 0.05 --maf 0.05 --chr %d --from-bp %d --to-bp %d --out %s', plink.hdr, chr, plink.lb, plink.ub, glue(temp.dir, '/plink'))
system(plink.cmd)
plink <- read.plink(glue(temp.dir, '/plink'))
colnames(plink$BIM) <- c('chr', 'rs', 'missing', 'snp.loc', 'plink.a1', 'plink.a2')
colnames(plink$FAM) <- c('fam', 'iid', 'father', 'mother', 'sex.code', '.pheno')
plink$FAM <- plink$FAM %>% mutate(iid = sapply(iid, gsub, pattern = 'GTEX-', replacement = ''))
return(plink)
}
################################################################
gene.cols <- c('chr', 'tss', 'tes', 'strand', 'ensg', 'hgnc', 'remove')
genes <- read.table(gene.file, col.names = gene.cols) %>%
dplyr::select(-remove) %r% gene.idx
sample.cols<-c('sampid','iid','data.pos','RIN','TSISCH','tis.name',
'SEX','AGE','HGHT','WGHT','BMI','SEX.std','AGE.std','HGHT.std',
'WGHT.std','BMI.std','ntis','tis.idx','tis.dir')
samples <- read.table(sample.file, col.names = sample.cols) %>%
select(-data.pos, -tis.name, -tis.idx, -tis.dir,
-SEX, -AGE, -HGHT, -WGHT, -BMI, -ntis, -RIN, -TSISCH)
cis.dist <- 1e6
lb <- pmax(min(genes$tss) - cis.dist, 0)
ub <- pmax(max(genes$tes) + cis.dist, 0)
plink <- subset.plink(glue('geno/chr', chr), chr, lb, ub, temp.dir)
x.fam <- plink$FAM %>% select(iid) %>% mutate(x.pos = 1:n())
x.bim <- plink$BIM
samples.matched <- samples %>% mutate(y.pos = 1:n()) %>%
left_join(x.fam, by = 'iid') %>% na.omit()
x.pos <- samples.matched$x.pos
y.pos <- samples.matched$y.pos
################################################################
Y <- .read.tsv(data.file) %c% gene.idx
Y.conf <- do.call(cbind, lapply(1:length(other.data.files), take.cor.genes.chr, Y = Y))
gc()
.conf <- find.most.cor.genes(Y.conf %>% trans.normal(), Y %>% trans.normal(), n.ctrl.per.gene)
Y.conf <- Y.conf %c% .conf
################################################################
Y.std <- Y %>% preprocess() %r% y.pos
Y.conf.std <- Y.conf %>% trans.normal() %r% y.pos
X.std <- plink$BED %r% x.pos %>% scale()
C <- samples.matched %>%
dplyr::select(contains('std')) %>%
mutate(intercept = 1) %>%
as.matrix()
vb.opt <- list(vbiter = 5000,
gammax = 1e4,
out.residual = TRUE,
tol = 1e-8,
rate = 1e-2,
decay = -1e-2,
pi = -2,
tau = -4,
do.hyper = FALSE,
model = model.name,
svd.init = TRUE,
jitter = 0.01,
k = 5)
fqtl.out <- fqtl.regress(y = Y.std, x.mean = Y.conf.std, c.mean = C,
factored = TRUE, options = vb.opt)
## generate eQTL summary statistics
R <- fqtl.out$resid$theta
R[is.na(Y.std)] <- NA
R <- R %>% scale()
colnames(X.std) <- x.bim$rs
colnames(R) <- genes$ensg
rownames(R) <- samples.matched$iid
rm.ensg <- genes$ensg[apply(is.na(R), 2, mean) > .5]
marginal.qtl <- get.marginal.qtl(X.std, R) %>%
rename(rs = snp, ensg = gene) %>%
mutate(rs = as.character(rs), ensg = as.character(ensg)) %>%
left_join(genes, by = 'ensg') %>%
left_join(x.bim %>% select(-chr), by = 'rs') %>%
filter((tss - cis.dist) < snp.loc & snp.loc < (tes + cis.dist)) %>%
filter(!ensg %in% rm.ensg)
write.mat(marginal.qtl, file = gzfile(out.qtl.file))
write.mat(samples.matched, file = gzfile(out.sample.file))
write.mat(genes, file = gzfile(out.gene.file))
write.mat(R, file = gzfile(out.resid.file))