-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute_expression_meanF.Rmd
executable file
·172 lines (140 loc) · 9.25 KB
/
compute_expression_meanF.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
---
title: "Compute per-variant expression functional score"
author: "Tyler Starr"
date: "8/6/2021"
output:
github_document:
html_preview: false
editor_options:
chunk_output_type: inline
---
This notebook reads in per-barcode counts from `count_variants.ipynb` for expression Sort-seq experiments, computes functional scores for scFv expression levels, and does some basic QC on variant expression functional scores.
```{r setup, message=FALSE, warning=FALSE, error=FALSE}
# Configure knitr, install/load required packages, read the pipeline config,
# and create the output directory for expression Sort-seq results.
library(knitr)  # library() errors on failure; require() only returns FALSE
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))
options(repos = c(CRAN = "https://cran.r-project.org"))
#list of packages to install/load
packages <- c("yaml","data.table","tidyverse","fitdistrplus","gridExtra")
#install any packages not already installed
installed_packages <- packages %in% rownames(installed.packages())
if(!all(installed_packages)){
  install.packages(packages[!installed_packages])
}
#load packages (invisible() suppresses the attach output)
invisible(lapply(packages, library, character.only=TRUE))
#read in config file with input/output paths and filtering thresholds
config <- read_yaml("config.yaml")
#make output directory if it does not already exist
#(dir.exists is the correct test for a directory path)
if(!dir.exists(config$expression_sortseq_dir)){
  dir.create(file.path(config$expression_sortseq_dir))
}
```
Session info for reproducing environment:
```{r print_sessionInfo}
# Record R version and attached package versions so the environment
# used for this analysis can be reproduced later.
sessionInfo()
```
## Setup
```{r test_prepped}
# Read prepped per-variant counts (output of the upstream prep stage) into a data.table
pdt <- data.table(read.csv(file=config$prepped_variant_counts_file,stringsAsFactors=F))
# retain all rows where 'SortSeq' is in the 'sample' string
pdt <- pdt[grepl("SortSeq", sample)]
# rename columns:
# read_count -> count,
# estimated_cell_count -> count.norm,
# variant -> aa_substitutions,
# barcode -> n_bc_expr
setnames(
pdt,
c("read_count", "estimated_cell_count", "variant", "barcode"),
c("count", "count.norm", "aa_substitutions", "n_bc_expr")
)
# assign a column "target" with all values equal to "CGG_naive"
pdt[, target := "CGG_naive"]
# classify each variant by its amino-acid substitution content
pdt[,variant_class:=as.character(NA)]
pdt[n_aa_substitutions==0, variant_class := "wildtype"]
# note that we've dropped all synonymous variants with silent mutations in the prep stage.
# fixed=T matches "*" literally (stop codon marker), not as a regex quantifier
pdt[n_aa_substitutions>0 & grepl("*",aa_substitutions,fixed=T), variant_class := "stop"]
pdt[n_aa_substitutions == 1 & !grepl("*",aa_substitutions,fixed=T), variant_class := "1 nonsynonymous"]
pdt[n_aa_substitutions > 1 & !grepl("*",aa_substitutions,fixed=T), variant_class := ">1 nonsynonymous"]
```
```{r }
#cast the data frame into wide format: one row per library x variant, with one
#column of normalized cell counts (count.norm) per Sort-seq bin sample
pdt <- dcast(pdt, library + target + variant_class + aa_substitutions + n_aa_substitutions + n_bc_expr ~ sample, value.var="count.norm")
```
```{r }
#add total count corresponding to count across the four bins for each variant
pdt[,expr_count := sum(SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4),by=c("library","aa_substitutions")]
#add the number of bins in which the variant has a nonzero cell count
#(use c(), not .(): data.table only treats .() as an alias for list() at the
#top level of j, so sum(.(...) > 0) fails to evaluate inside the expression)
pdt[,total_bins_w_count := sum(c(SortSeq_bin1,SortSeq_bin2,SortSeq_bin3,SortSeq_bin4)>0),by=c("library","aa_substitutions")]
```
## Calculating mean fluorescence
Next, for each variant, calculate its mean fluorescence as an indicator of scFv expression level. We will use a maximum likelihood approach to determine the mean and standard deviation of fluorescence for a variant, given its distribution of cell counts across sort bins, and the known fluorescence boundaries of those sort bins from the sorting log. The function `fitdistcens` from the `fitdistrplus` package enables this ML estimation for this type of *censored* observation, where we know we observed a cell within some fluorescence interval but do not know the exact fluorescence value attributed to that observation. The counts are multiplied by 20 so that there is not a large rounding effect when they are rounded to integers.
```{r calculate_meanF, error=FALSE, message=FALSE, warning=FALSE, results=F}
#define function to calculate ML meanF
#b1-b4: observed cell counts in bins 1-4
#min.b1..min.b4, max.b4: log-fluorescence boundaries of the respective bins
#min.count: minimum number of total observations needed across bins in order
#           to calculate meanF (default 1)
#returns list(mean, sd) of the ML-fit normal, or list(NA, NA) if not fittable
calc.MLmean <- function(b1,b2,b3,b4,min.b1,min.b2,min.b3,min.b4,max.b4,min.count=1){
  #each counted cell becomes one interval-censored observation spanning its
  #bin's [lower, upper) fluorescence boundary -- the input format fitdistcens requires
  data <- data.frame(left=c(rep(min.b1,round(b1)),rep(min.b2,round(b2)),rep(min.b3,round(b3)),rep(min.b4,round(b4))),
                     right=c(rep(min.b2,round(b1)),rep(min.b3,round(b2)),rep(min.b4,round(b3)),rep(max.b4,round(b4))))
  #only fit if above user-specified min.count, and if the data satisfy the
  #fitdistcens requirement that cells are observed in at least two distinct
  #censored partitions (otherwise the normal's parameters are not identifiable);
  #&& (scalar, short-circuit) is the appropriate operator for these length-one conditions
  if(nrow(unique(data))>1 && nrow(data)>min.count){
    fit <- fitdistcens(data,"norm")
    return(list(as.numeric(summary(fit)$estimate["mean"]),as.numeric(summary(fit)$estimate["sd"])))
  } else {
    return(list(as.numeric(NA),as.numeric(NA)))
  }
}
#fit ML mean and sd fluorescence for each variant, per library. Multiply cell
#counts by a factor of 20 to minimize rounding errors since fitdistcens requires
#rounding to integer inputs. tryCatch returns (NA, NA) for any group whose fit errors.
invisible(pdt[library=="lib1",c("expression","ML_sdF") := tryCatch(calc.MLmean(b1=SortSeq_bin1*20,b2=SortSeq_bin2*20,
                                                                              b3=SortSeq_bin3*20,b4=SortSeq_bin4*20,
                                                                              min.b1=log(20),min.b2=log(645.5),min.b3=log(15584.5),
                                                                              min.b4=log(33302.5),max.b4=log(229000)),
                                                                  error=function(e){return(list(as.numeric(NA),as.numeric(NA)))}),by=c("library","aa_substitutions")])
invisible(pdt[library=="lib2",c("expression","ML_sdF") := tryCatch(calc.MLmean(b1=SortSeq_bin1*20,b2=SortSeq_bin2*20,
                                                                              b3=SortSeq_bin3*20,b4=SortSeq_bin4*20,
                                                                              min.b1=log(20),min.b2=log(645.5),min.b3=log(15584.5),
                                                                              min.b4=log(33302.5),max.b4=log(229000)),
                                                                  error=function(e){return(list(as.numeric(NA),as.numeric(NA)))}),by=c("library","aa_substitutions")])
#save temp data file for downstream troubleshooting since the ML meanF took >1hr
#to calculate -- don't use these for final analysis though, for reproducibility!
save(pdt,file=paste(config$expression_sortseq_dir,"/dt.temp.Rda",sep=""))
```
## Basic plotting and QC
Let's look at the distribution of expression scores by variant class for each library.
```{r unfiltered_expression_distribution, echo=T, fig.width=7, fig.height=5, fig.align="center", dpi=300,dev="png"}
#histogram of mean F, separated by class
#draw nonsynonymous variants first, then overlay wildtype and stop classes
#(add=T) on the same axes; overlay order matters for what is visible on top
hist(pdt[variant_class %in% (c("1 nonsynonymous",">1 nonsynonymous")),expression],col="gray40",main="",breaks=50,xlab="ML mean fluorescence (a.u.)")
hist(pdt[variant_class %in% (c("wildtype")),expression],col="#92278F",add=T,breaks=50)
hist(pdt[variant_class %in% (c("stop")),expression],col="#BE1E2D",add=T,breaks=50)
```
Next let's look at the distribution of cell counts across the four bins for each variant, shown for all variants on top and, on the bottom, only for those for which estimates were generated.
```{r cell_count_coverage, echo=T, fig.width=10, fig.height=10, fig.align="center", dpi=300,dev="png"}
#histograms of per-variant total cell counts, 2x2 grid:
#top row = all variants per library; bottom row = only variants for which
#an expression estimate was successfully fit (expression not NA)
par(mfrow=c(2,2))
hist(log10(pdt[library=="lib1",expr_count]+0.1),xlab="cell count (log10, plus 0.1 pseudocount)",main="lib1, all bc",col="gray50")
hist(log10(pdt[library=="lib2",expr_count]+0.1),xlab="cell count (log10, plus 0.1 pseudocount)",main="lib2, all bc",col="gray50")
hist(log10(pdt[library=="lib1" & !is.na(expression),expr_count]+0.1),xlab="cell count (log10, plus 0.1 pseudocount)",main="lib1, determined",col="gray50")
hist(log10(pdt[library=="lib2" & !is.na(expression),expr_count]+0.1),xlab="cell count (log10, plus 0.1 pseudocount)",main="lib2, determined",col="gray50")
```
Filter out expression measurements determined from <10 estimated cells
```{r filter_min_cfu}
# Censor (set to NA) expression estimates built from too few cells
# NOTE(review): the config key is named min_Sortseq_*reads*_per_variant but it
# is compared against expr_count, a cell-count sum -- confirm intended units
min_count <- config$min_Sortseq_reads_per_variant
pdt[expr_count<min_count, c("expression","ML_sdF","expr_count") := NA]
```
Do as violin plots, faceted by each target. In next notebook, we'll evaluate count depth and possibly apply further filtering to remove low-count expression estimates
```{r expression_distribution_vioplot, echo=T, fig.width=6, fig.height=4.5, fig.align="center", dpi=300,dev="png"}
#violin plot of expression by variant class, faceted by target;
#point marks the per-class median
p1 <- ggplot(pdt[!is.na(expression),],aes(x=variant_class,y=expression))+
geom_violin(scale="width")+stat_summary(fun=median,geom="point",size=1)+
ggtitle("expression")+xlab("variant class")+theme(axis.text.x=element_text(angle=-90,hjust=0))+
facet_wrap(~target,nrow=4)
grid.arrange(p1,ncol=1)
#save pdf copy of the currently displayed plot
invisible(dev.print(pdf, paste(config$expression_sortseq_dir,"/violin-plot_meanF-by-target.pdf",sep="")))
```
We have generated expression measurements for `r round(nrow(pdt[!is.na(expression)])/nrow(pdt)*100,digits=2)`% of the barcodes in our libraries.
## Data Output
Finally, let's output our measurements for downstream analyses.
```{r output_data}
# Select the per-variant columns used downstream, round numeric columns to
# 6 decimal places (keeps the CSV stable across runs), and write to disk.
# across(where(...)) replaces the superseded mutate_if().
pdt[,.(library,target,variant_class,aa_substitutions,n_aa_substitutions,n_bc_expr,
       expr_count,expression)] %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 6))) %>%
  write.csv(file=config$expression_sortseq_file, row.names=F)
```