forked from matsengrp/PREAST
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.r
377 lines (318 loc) · 14.1 KB
/
utils.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#!/usr/bin/env Rscript
# plot distances form founder infered by prank to actual founder.
#
# Works of files created by a simulation process to mimic some of
# those that we obtain from the RV217 project. The RV217 sample ahave
# around 10-12 sequences per sample, all taken from the c2v3c3 region
# gp-120 from the HIV-1 genome.
# install any missing packages
# http://stackoverflow.com/a/19873732/1135316
if (!suppressMessages(require("pacman"))) install.packages("pacman")
pacman::p_load(ape, dplyr, ggplot2, stringr, readr, tidyr, seqinr, functional)
# turn off annoying progress bar
options(dplyr.show_progress=FALSE)
# cribbed from http://manuals.bioinformatics.ucr.edu/home/ht-seq#TOC-Multiple-Sequence-Alignments-MSAs-
removeGaps <- function(align, gap="-") {
myseq <- maskMotif(align, gap)
myseq <- paste(as(myseq, "XStringViews"), collapse="")
return(myseq)
}
# Extract the identifier of the root node from a tree.
# Typically used to extract the name of the root node from a phylogeny inferred by PRANK.
root.id <- function(file) {
tree <- ape::read.tree(file, comment.char = "")
return(tree$node.label[1])
}
# Extract the inferred sequence at the root node of PRANK output
#
# 'dir' parameter is a character string naming a directory where the PRANK
# output files can be found. The directory is expected to have a file
# 'prank.best.anc.dnd' containing a tree generated by PRANK, and a
# file 'prank.best.anc.fas' containing the sequences at each node of
# Read the founder sequences used to seed all the simulated RV!27 samples.
# If our simulation environment gets more complex we will need to
# parameterize this routine, but for now there is only one founder.
founder.sequence <- function(path=file.path('..', '..', 'templates', 'HIV1C2C3.fasta')) {
fasta <- read.dna(path, format = "fasta", comment.char = "")
return(paste(fasta[1,],collapse=''))
}
# Calculate the majority-rule consensus of sequences from the BEAST
# posterior traits after a burning period.
#
# NOTE This consensus is unrelated to the score used to assess the
# quality of the founder inferred by BEAST.
# That score is the mean of all pairwise alignments
# between sequences in the posterior and the actual founder,
# NOT of the alignment between a single consensus and the
# founder.
beast.sequence <- function(dir, burnin=0.10) {
# burning in the percentage of samples to skip at the beginning of the posterior.
stopifnot(burnin >= 0 && burnin <= 1.0)
consensus <- function(seqs) {
seqinr::consensus(as.matrix(seqs), method = "majority", type="DNA")
}
tryCatch({
read_tsv(file.path(dir, 'ancestralSequences.log'), col_names=c("state", "trait"), skip=3) %>%
add_rownames() %>%
filter(as.numeric(rowname) > (n() * burnin)) %>%
summarize(trait=consensus(trait)) %>% as.character()
}, error = function(e) {
as.character(NA)
})
}
# NOTE: MULTIPLE RETURN TYPES!
# if scoreOnly is FALSE, create a DNAbin object representing a pairwise alignment between two sequences
# otherwise just return the N-W alignment score.
pwalign <- function(s1, s2, scoreOnly=FALSE) {
## Nucleotide global, local, and overlap alignments
pattern <- DNAString(gsub('-', '', s1))
subject <- DNAString(gsub('-', '', s2))
# First use a fixed substitution matrix
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = FALSE)
globalAlign <- pairwiseAlignment(pattern, subject, type="global", scoreOnly=scoreOnly, substitutionMatrix = mat,
gapOpening = 10, gapExtension = 4)
## # extracting the aligned sequences from a pairwiseAlignment object is unncessarily complex.
## # see https://support.bioconductor.org/p/70913/#70966
## seqset <- Biostrings:::.makePostalignedSeqs(globalAlign)[1]
## return(do.call(rbind, lapply(seqset, as.DNAbin)))
return(globalAlign)
}
pwscore <- function(s1,s2) {
tryCatch({pwalign(s1,s2,scoreOnly=TRUE)}, error = function(e) {NA})
}
# NOTE This distance calculation does not use the beast consensus sequence.
# Instead it is based on the mean score of all pairwise alignments
# between sequences in the posterior and the actual founder
beast.distance <- function(dir) {
burnin_pct = 0.99
n <- try({
read_tsv(file.path(dir, 'ancestralSequences.log'), col_names=c("state", "trait"), skip=3) %>%
add_rownames() %>%
filter(as.numeric(rowname) > (n() * burnin_pct)) %>%
select(c(trait)) %>%
rowwise() %>% mutate(dist=pwscore(trait, founder)) %>%
ungroup() %>% summarize(dist=mean(dist))
}, error = function(e) {
NA
})
return(as.numeric(n))
}
# calculate a majority-rule consensus from the input sequences.
# This is the simplest, most obvious founder inference method that works surprisingly well early in infection.
consensus.sequence <- function(dir, allowIUPAC=FALSE) {
tryCatch({
if (allowIUPAC) {
readDNAStringSet(file.path(dir, 'sample_aln.fa')) %>%
(function(s) {chartr("-", "N", s)}) %>%
consensusMatrix() %>%
consensusString(ambiguityMap=IUPAC_CODE_MAP, threshold=0.25)
} else {
readDNAStringSet(file.path(dir, 'sample_aln.fa')) %>%
as.matrix() %>% seqinr::consensus(method = "majority", type="DNA") %>%
as.character() %>% paste0(collapse='')
}
}, error = function(e) {
as.character(NA)
})
}
# proportion of ambiguous positions in the consensus of all input sequences
nAmbig <- function(dir) {
s <- consensus.sequence(dir, allowIUPAC=TRUE)
t <- table(strsplit(toupper(s), '')[[1]])
sum(t[!(names(t) %in% c('A','C','G','T'))])/nchar(s)
}
# proportion of sites in multiple alignment of all input sequences that are gaps
nGaps <- function(dir) {
file.path(dir, 'sample_aln.fa') %>%
read.dna(format = 'fasta', comment.char = '') %>%
sapply(as.character) %>%
(function(s) grepl('-',s)) %>%
(function(b) sum(b)/length(b))
}
# total number of input sequences
nSeqs <- function(dir) {
file.path(dir, 'sample_aln.fa') %>%
read.dna(format = 'fasta', comment.char = '') %>%
nrow()
}
# pairwiseGaps - total number of gaps in a pairwise alignment between a sequence and the an inferred prank-codon founder sequence
pairwiseGaps <- function(s, dir) {
f <- prank.sequence(dir, prank.model='codon')
n <- sum(grepl('-',sapply(pwalign(s, f), as.character)))
return(n)
}
# Calculate the distance between the consensus of the INPUT sequences and the actual founder.
consensus.distance <- function(dir, model='K80', founder=founder) {
# calculate a consensus of the input sequences
tryCatch({
consensus.sequence(dir) %>%
sapply(function(s) removeGaps(DNAString(s))) %>%
pwscore(founder)
}, error = function(e) {
as.numeric(NA)
})
}
sample_seq <- function(dir, n=2) {
readDNAStringSet(file.path(dir, 'sample_aln.fa')) %>%
sample(n, replace=FALSE) %>%
as.character() %>%
(function(s) list(sname=names(s),seq=s)) %>% data.frame(stringsAsFactors=FALSE)
}
# Extract the inferred ancestral sequence at the root of the prank tree.
#
# This routine is parameterized by the name of the prank model, which
# is used to construct the name of the subdirectory when the prank
# results are expected.
prank.sequence <- function(dir, prank.model='codon') {
tryCatch({
dir <- file.path(dir, paste0('prank_', prank.model))
treefile <- file.path(dir, 'prank.best.anc.dnd')
rootid <- root.id(treefile)
fastafile <- file.path(dir, 'prank.best.anc.fas')
fasta <- read.dna(fastafile, format = 'fasta', comment.char = '')
root <- fasta[which(labels(fasta)==rootid),]
paste(root,collapse='')
}, error = function(e) {
NA
})
}
prank.distance <- function(dir, prank.model='codon', model='K80', founder=founder) {
#print(file)
tryCatch({
prank.sequence(dir, prank.model) %>% pwscore(founder)
}, error = function(e) {prank.seq
as.numeric(NA)
})
}
get_sequences <- function(dir) {
c(beast=beast.sequence(dir),
pcodon=prank.sequence(dir,'codon'),
pdna=prank.sequence(dir, 'dna'),
unguided=prank.sequence(dir, 'unguided'),
consensus=consensus.sequence(dir))
}
# save a multiple alignment to a file, and plot the pw alignment scores to the founder.
alignment <- function(dir) {
DNAStringSet(get_sequences(dir)) %>%
sapply(function(x) removeGaps(align=x)) %>%
DNAStringSet() %>%
AlignSeqs(verbose=FALSE)
}
# pairwise alignment distances between the actual founder and each of the inferred founders in a directory
distances.dist.dna <- function(dir) {
d <- alignment(dir) %>%
as.matrix() %>% ape::as.alignment() %>% as.DNAbin() %>%
dist.dna(as.matrix = TRUE, pairwise.deletion=TRUE)
t(d['founder',2:ncol(d)])
}
# pairwise alignment scores between the actual founder and each of the inferred founders in a directory
distances.pwscore <- function(dir) {
get_sequences(dir) %>%
sapply(function(s) pwscore(founder,s)) %>%
sapply(function(s) -(s-nchar(founder))/nchar(founder)) %>%
as.list() %>% as_data_frame() %>%
do({names(.) <- paste0('dist.',names(.)); .})
}
# Measure the diversity of the aligned input sample.
# taken as the median of all the pairwise distance between individual sequences
diversity <- function(dir) {
tryCatch({
file.path(dir, 'sample_aln.fa') %>%
readDNAStringSet() %>%
as.matrix() %>% ape::as.alignment() %>% as.DNAbin() %>%
dist.dna(pairwise.deletion=TRUE) %>% median()
}, error = function(e) {
as.numeric(NA)
})
}
## : the number of ambiguous bases in some consensus allowing ambiguity
# save a multiple alignment to a file, and plot the pw alignment scores to the founder.
write.alignment <- function(df) {
print(class(df))
print(dim(df))
with(df, {
print(dir)
## plotfile <- sprintf('plot.%s.%s.%s.replicate_%d.generation_%s.png',rate,fitness,ifelse(indel,"indel","noindel"),replicate,wpi)
## print(plotfile)
## png(file=plotfile, width=18, height=12, units = "in", res=300)
## print(barplot(data.frame(df)))
## dev.off()
alnfile <- sprintf('alignment.%s.%s.%s.replicate_%d.generation_%s.fasta',rate,fitness,ifelse(indel,"indel","noindel"),replicate,wpi)
aln <- alignment(dir)
print(aln)
writeXStringSet(aln, alnfile)
})
}
# plot lines across the meanvalue of each inference methid, grouped by rate, fitness, indel, wpi, and method
lines_by_method <- function(tmp) {
tmp %>%
gather(method, dist, starts_with("dist.")) %>%
group_by(rate, fitness, indel, wpi, method) %>%
summarize(dist=mean(dist)) %>%
mutate(method=sub('dist.','',method)) %>%
rowwise() %>%
mutate(method=sprintf("%s_%s", method,rate)) %>%
ungroup() %>%
mutate(method=factor(method)) %>%
# filter(wpi==1200) %>%
#filter(indel & fitness=='none') %>%
ggplot(aes(x=wpi, y=dist, group=method, color=method)) +
facet_grid(indel ~ fitness, labeller=facet_labeller) +
geom_line() +
#geom_point() +
scale_color_discrete(name="Method",
breaks=c("beast", "consensus", 'pcodon', 'pdna', 'unguided'),
labels=c("Beast", "Consensus", 'Prank codon', 'Prank dna', 'Prank unguided'))
}
facet_labeller <- function(variable,value) {
switch(variable,
indel=ifelse(value, "With indels", "Without indels"),
as.character(value))
}
scatterplot <- function(tmp) {
tmp %>% gather(method, dist, starts_with("dist.")) %>%
mutate(method=sub('dist.','',method)) %>%
mutate(method=factor(method)) %>%
filter(as.integer(as.character(wpi)) < 5000) %>%
ggplot(aes(x=wpi, y=dist, group=method, color=method)) +
facet_grid(indel ~ fitness, labeller=facet_labeller) +
geom_point(position = position_jitter(w = 0.1)) +
scale_color_discrete(name="Method",
breaks=c("beast", "consensus", 'pcodon', 'pdna', 'unguided'),
labels=c("Beast", "Consensus", 'Prank codon', 'Prank dna', 'Prank unguided'))
# geom_smooth(method="lm", fill=NA)
}
boxplot <- function(tmp) {
tmp %>% gather(method, dist, starts_with("dist.")) %>%
mutate(method=sub('dist.','',method)) %>%
mutate(method=factor(method)) %>%
filter(as.integer(as.character(wpi)) < 20000) %>%
mutate(wpi=factor(wpi)) %>%
#filter(indel) %>%
ggplot(aes(x=wpi, y=dist, fill=method)) +
facet_grid(indel ~ fitness, labeller=facet_labeller) +
theme(axis.text.x = element_text(size = 8, colour = "red", angle = 45)) +
geom_boxplot() +
xlab("Generations") +
ylab("N-W Pairwise Distance") +
scale_fill_discrete(name="Method",
breaks=c("beast", "consensus", 'pcodon', 'pdna', 'unguided', 'relaxed', 'strict'),
labels=c("Beast", "Consensus", 'Prank codon', 'Prank dna', 'Prank unguided', 'Beast relaxed', 'Beast strict'))
}
barplot <- function(tmp) {
tmp %>% gather(method, dist, starts_with("dist.")) %>%
mutate(method=sub('dist.','',method)) %>%
mutate(method=factor(method)) %>%
filter(as.integer(as.character(wpi)) < 20000) %>%
mutate(wpi=factor(wpi)) %>%
#filter(indel) %>%
ggplot(aes(x=factor(wpi), y=dist, fill=method)) +
facet_grid(indel ~ fitness, labeller=facet_labeller) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(y=dist, ymax=dist, label=method), position= position_dodge(width=0.9), vjust=-.5, color="black") +
scale_y_continuous("Distance") +
scale_x_discrete("Generations") +
scale_fill_discrete(name="Method",
breaks=c("beast", "consensus", 'pcodon', 'pdna', 'unguided'),
labels=c("Beast", "Consensus", 'Prank codon', 'Prank dna', 'Prank unguided'))
}