-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepocessing.Rmd
266 lines (208 loc) · 10.4 KB
/
prepocessing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
---
title: "Preprocessing for Forst et al"
author: Matthew Chung,Meg Hockman and David Gresham
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output:
html_document:
toc: true
theme: united
---
# Preanalysis
This script organizes all metadata and RNAseq data for the UGA4 Day 0 study.
Table S1 is generated from this script for inclusion in the publication
All variables for subsequent analysis and figure generation are generated by this script and saved as a R data object called `data.Rdata`.
## Load packages and view sessionInfo
```{r,warning=FALSE,message = FALSE}
library(biomaRt)
library(cowplot)
library(clusterProfiler)
library(cowplot)
#library(DESeq2)
library(edgeR)
library(enrichplot)
library(ggplot2)
library(gprofiler2)
library(openxlsx)
#library(pathview)
library(reshape2)
library(see)
library(splines)
library(SuperExactTest)
library(UpSetR)
```
## Read inputs
read in the different files containing information on participants and samples
```{r}
counts <- read.delim("../data/rnaseq-merged-counts.tsv",
row.names = 1)
runinfo <- read.xlsx("../data/CIVR-HRP_SequencingRuns.xlsx", sheet = 2)
#Read in file from UGA with metadata and HAI information. File is the same as that on Synapse
raw_metadata1 <- read.xlsx("../data/2019-20 UGA Data.xlsx",
sheet = 1,
rows = seq(4,378),
cols = seq(1,107))
raw_metadata2 <- read.xlsx("../data/2019-20 UGA Data.xlsx",
sheet = 2,
rows = seq(6,467),
cols = seq(1,107))
#Confirm the origin of this file
raw_metadata3 <- read.xlsx("../data/20201019_UGA4_Metadata.xlsx")
#file not available
#raw_metadata4 <- read.xlsx("../data/mtdt_modelingoutput_210805.xlsx",
# sheet= 1 )
```
## Create combined metadata table from the four metadata inputs
Removes undetermined column from RNAseq counts data frame and combines the different tables.
```{r}
counts <- counts[,grep("undetermined",colnames(counts),invert=T)]
runinfo$UGA.Sample.ID <- as.numeric(as.character(runinfo$UGA.Sample.ID))
raw_metadata1$Study.ID <- as.numeric(as.character(gsub(".*-","",raw_metadata1$Study.ID)))
#raw_metadata4 <- raw_metadata4[raw_metadata4$Cohort == "UGA4",]
metadata <- as.data.frame(cbind(runinfo,
raw_metadata1[match(runinfo$UGA.Sample.ID,raw_metadata1$Study.ID),9:ncol(raw_metadata1)],
raw_metadata2[match(runinfo$UGA.Sample.ID,raw_metadata2$ID),8:ncol(raw_metadata3)],
raw_metadata3[match(runinfo$UGA.Sample.ID,raw_metadata3$UGA_ID),] ))
#metadata$CorrectedSeroconversion_composite <- #raw_metadata4$CorrectedSeroconversion_composite[match(runinfo$UGA.Sample.ID,raw_metadata4$ID)]
metadata <- metadata[match(colnames(counts),metadata$File.Name),]
```
## Add additional metadata columns
These columns include:
1) Composite_Baseline, which is the average baseline HAI using 4 strains
2) Composite_D28, which is the average response HAI using 3 or 4 strains depending on dose
3) average_seroconversion, which is the average of 3 or 4 seroconversion values depending on dose
4) Prevacc.naive
5) Month.vaccinated
Also fixes typos in Race_Ethnicity in metadata
Simplified approach without transforming data
```{r}
metadata <- metadata[,!(duplicated(colnames(metadata)))]
metadata$Race_Ethnicity[which(metadata$Race_Ethnicity == "White_Cauasian")] <- "White_Caucasian"
metadata$Race_Ethnicity[which(metadata$Race_Ethnicity == "White_Caucasain")] <- "White_Caucasian"
metadata$`Vaccine.2.seasons.ago.(fall.2017-spring.2018)`[is.na(metadata$`Vaccine.2.seasons.ago.(fall.2017-spring.2018)`)] <- "Unsure"
#define prevaccination status
metadata$Prevacc.naive <- ifelse(rowSums(metadata[,grep("Vaccine[.]", colnames(metadata))] == "N") == 3, F, T)
metadata$Month.vaccinated <- month.name[as.numeric(unlist(strsplit(metadata$DateVaccinated,"_"))[c(T,F,F)])]
#################################################
## Compute composite values for Day 0 and Day 28
#sum all four untransformed values for Day 0 HAI and divide by 4 to obtain average
#include Yamagata for both standard dose and high dose participants as this assay is performed to define pre-vaccination serology
metadata$Composite_Baseline <- (rowSums(metadata[,grep("D0_Baseline",colnames(metadata))])/4)
#Sum all four values if received standard dose and divide by 4 to get average
#Exclude Yamagata if received high dose and sum three values then divide by 3 to get average
#There are two "unsure" cases - we assume high dose and therefore exclude Yamagata for them as well.
metadata$Composite_D28 <- ifelse(metadata$`Standard.or.high-dose.vaccine?.(S/H)` == "S",
(rowSums(metadata[,grep("D28_HAI",colnames(metadata))])/4),
(rowSums(metadata[,grep("D28_HAI", colnames(metadata))]) - metadata[,grep("D28_HAI_Yamagata", colnames(metadata))])/3)
###########################
#Compute seroconversion for each strain by dividing Day28 HAI value by Day0 HAI value
#MRH_h denotes used for high dose calculation only (3 strains)
#MRH* denotes used for standard dose calculation (4 strains)
#A/H1N1 (A/Brisbane/02/2018)
metadata$H1N1Seroconversion_MRH_h <- metadata$D28_HAI_H1N1/metadata$D0_Baseline_HAI_H1N1
#A/H3N2 (A/Kansas/14/2017)
metadata$H3N2Seroconversion_MRH_h <- metadata$D28_HAI_H3N2/metadata$D0_Baseline_HAI_H3N2
#B/Victoria (B/Colorado/6/2017-like strain)
metadata$VictoriaSeroconversion_MRH_h <- metadata$D28_HAI_Victoria/metadata$D0_Baseline_Victoria
#(B/Phuket/3073/2013)
metadata$YamagataSeroconversion_MRH <- metadata$D28_HAI_Yamagata/metadata$D0_Baseline_HAI_Yamagata
###########################
#Compute average seroconversion for all strains.
#For standard dose use sum of all four strains and divide by 4 to get average.
#For high dose exclude Yamagata and compute average for 3 strains.
metadata$average_seroconversion <- ifelse(metadata$`Standard.or.high-dose.vaccine?.(S/H)` == "S",
rowSums(metadata[,grep("MRH",colnames(metadata))])/4,
rowSums(metadata[,grep("MRH_h",colnames(metadata))])/3
)
```
## Filters samples to only keep only 1 sample per individual based on the highest number of reads
This must results in 275 samples.
```{r}
samples <- c()
for(i in 1:length(unique(metadata$UGA.Sample.ID))){
counts.subset <- counts[,metadata$File.Name[metadata$UGA.Sample.ID == unique(metadata$UGA.Sample.ID)[i]],drop = F]
samples <- c(samples, colnames(counts.subset)[colSums(counts.subset) == max(colSums(counts.subset))])
}
counts <- counts[,samples]
metadata <- metadata[match(samples,metadata$File.Name),]
```
## Pull annotation for human genes in counts table
define the subset of genes that will be included in analyses by 1) excluding pseudogenes and 2) excluding the three hemoglobin genes that are extremely highly expressed.
```{r}
mart = useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
genes <- getBM(c("ensembl_gene_id","hgnc_symbol","gene_biotype","description",
"start_position","end_position","transcript_length"),
"ensembl_gene_id",
rownames(counts), mart)
genes.subset <- unique(genes$ensembl_gene_id[grep("pseudogene",genes$gene_biotype,invert = T)])
genes.subset <- genes.subset[!(genes.subset %in% c("ENSG00000188536","ENSG00000244734","ENSG00000206172"))]
genes.subset.length <- unlist(lapply(genes.subset,function(x){mean(genes$transcript_length[genes$ensembl_gene_id == x])}))
table(grep("pseudogene",genes$gene_biotype,invert = T,value = T))
```
## Download human annotations for GSEA
```{r,warning=FALSE,message = FALSE}
organism = "org.Hs.eg.db"
BiocManager::install(organism, character.only = TRUE)
library(organism, character.only = TRUE)
```
## Calculate TPM for gene subset
TPM (transcripts per million) is the gene length normalized proportion of reads mapping to the gene in a sample and can be used to directly compare between samples.
TPM is used for visualization purposes only.
```{r}
calculate_tpm <- function(counts,gene_length){
tpm <- counts
for(i in 1:ncol(tpm)){
tpm[,i] <- tpm[,i]/gene_length
tpm[,i] <- tpm[,i]/(sum(tpm[,i])/1000000)
}
return(tpm)
}
counts.subset <- counts[rownames(counts) %in% genes.subset,]
tpm.subset <- calculate_tpm(counts.subset,genes.subset.length)
tpm.subset <- tpm.subset[rowSums(tpm.subset) != 0,]
#write.csv(tpm.subset, "tpm.hemoglobin.csv")
```
## Create lists of gene sets of interests and list of covariants of interest
```{r}
gene.sets <- list(all = rownames(counts),
no_high_exp_hemo = rownames(counts)[!(rownames(counts) %in% c("ENSG00000188536","ENSG00000244734","ENSG00000206172"))],
final_set = rownames(counts)[rownames(counts) %in% unique(genes$ensembl_gene_id[grep("pseudogene",genes$gene_biotype,invert = T)])])
gene.sets$final_set <- gene.sets$final_set[!(gene.sets$final_set %in% c("ENSG00000188536","ENSG00000244734","ENSG00000206172"))]
```
## Remove unnused columns from metadata
metadata is the data frame used in all subsequent scripts
```{r}
metadata <- metadata[,c(1,2,3,5:8,11,13:15,17:21,23,29:30,57:66,72,74,77,79,82,84,87,89,92,93:100)]
```
## write supplementary Table 1
remove irrelevant columns and rename and reorder columns to generate Table S1 for publication
```{r}
tableS1 <- metadata
#remove additional unnecessary columns for table for publication purposes
tableS1 <- tableS1[,-c(2,3,5,7,17,19,20,21,22,23,24)]
#Tidy up column headers
names(tableS1)[1] <- "Participant ID"
names(tableS1)[2] <- "Current smoker"
names(tableS1)[3] <- "Prior smoker"
names(tableS1)[4] <- "Sleep apnea"
names(tableS1)[15] <- "Sex"
names(tableS1)[25] <- "D0_Baseline_HAI_Victoria"
names(tableS1)[27] <- "Vaccinated in prior 3 seasons"
names(tableS1)[29] <- "Average Day 0 HAI"
names(tableS1)[30] <- "Average Day 28 HAI"
names(tableS1)[31] <- "H1N1 Seroconversion"
names(tableS1)[32] <- "H3N2 Seroconversion"
names(tableS1)[33] <- "Victoria Seroconversion"
names(tableS1)[34] <- "Yamagata Seroconversion"
names(tableS1)[35] <- "Average Seroconversion"
#Reorder the columns
tableS1 <- tableS1[,c(1,14,15,16,11,12,17,18,2,3,4,5,6,7,8,9,10,27,13,28,19,21,23,25,29,20,22,24,26,30,31,32,33,34,35)]
write.table(tableS1, file = "Table S1.txt", sep = "\t")
```
```{r}
save(counts,
counts.subset,
tpm.subset,
genes,
gene.sets,
metadata, file = "data.RData")
```