prepocessing.Rmd

---
title: "Preprocessing for Forst et al"
author: Matthew Chung,Meg Hockman and David Gresham
date: "Last compiled on `r format(Sys.time(), '%d %B, %Y')`"
output: 
  html_document:
    toc: true
    theme: united
---

# Preanalysis
This script organizes all metadata and RNAseq data for the UGA4 Day 0 study.
Table S1 is generated from this script for inclusion in the publication
All variables for subsequent analysis and figure generation are generated by this script and saved as a R data object called `data.Rdata`.

## Load packages and view sessionInfo

```{r,warning=FALSE,message = FALSE}
library(biomaRt)
library(cowplot)
library(clusterProfiler)
library(cowplot)
#library(DESeq2)
library(edgeR)
library(enrichplot)
library(ggplot2)
library(gprofiler2)
library(openxlsx)
#library(pathview)
library(reshape2)
library(see)
library(splines)
library(SuperExactTest)
library(UpSetR)
```

## Read inputs
read in the different files containing information on participants and samples 
```{r}
counts <- read.delim("../data/rnaseq-merged-counts.tsv",
                     row.names = 1)

runinfo <- read.xlsx("../data/CIVR-HRP_SequencingRuns.xlsx", sheet = 2)

#Read in file from UGA with metadata and HAI information.  File is the same as that on Synapse
raw_metadata1 <- read.xlsx("../data/2019-20 UGA Data.xlsx", 
                            sheet = 1, 
                            rows = seq(4,378),
                            cols = seq(1,107))
raw_metadata2 <- read.xlsx("../data/2019-20 UGA Data.xlsx", 
                            sheet = 2, 
                            rows = seq(6,467),
                            cols = seq(1,107))

#Confirm the origin of this file
raw_metadata3 <- read.xlsx("../data/20201019_UGA4_Metadata.xlsx")

#file not available
#raw_metadata4 <- read.xlsx("../data/mtdt_modelingoutput_210805.xlsx",
#                           sheet= 1 )
```

## Create combined metadata table from the four metadata inputs

Removes undetermined column from RNAseq counts data frame and combines the different tables. 
```{r}
counts <- counts[,grep("undetermined",colnames(counts),invert=T)]

runinfo$UGA.Sample.ID <- as.numeric(as.character(runinfo$UGA.Sample.ID))

raw_metadata1$Study.ID <- as.numeric(as.character(gsub(".*-","",raw_metadata1$Study.ID)))
#raw_metadata4 <- raw_metadata4[raw_metadata4$Cohort == "UGA4",]

metadata <- as.data.frame(cbind(runinfo,
                                raw_metadata1[match(runinfo$UGA.Sample.ID,raw_metadata1$Study.ID),9:ncol(raw_metadata1)],
                                raw_metadata2[match(runinfo$UGA.Sample.ID,raw_metadata2$ID),8:ncol(raw_metadata3)],
                                raw_metadata3[match(runinfo$UGA.Sample.ID,raw_metadata3$UGA_ID),]                               ))
#metadata$CorrectedSeroconversion_composite <- #raw_metadata4$CorrectedSeroconversion_composite[match(runinfo$UGA.Sample.ID,raw_metadata4$ID)]

metadata <- metadata[match(colnames(counts),metadata$File.Name),]
```

## Add additional metadata columns

These columns include:

1) Composite_Baseline, which is the average baseline HAI using 4 strains
2) Composite_D28, which is the average response HAI using 3 or 4 strains depending on dose
3) average_seroconversion, which is the average of 3 or 4 seroconversion values depending on dose
4) Prevacc.naive
5) Month.vaccinated

Also fixes typos in Race_Ethnicity in metadata

Simplified approach without transforming data
```{r}
metadata <- metadata[,!(duplicated(colnames(metadata)))]

metadata$Race_Ethnicity[which(metadata$Race_Ethnicity == "White_Cauasian")] <- "White_Caucasian"
metadata$Race_Ethnicity[which(metadata$Race_Ethnicity == "White_Caucasain")] <- "White_Caucasian"

metadata$`Vaccine.2.seasons.ago.(fall.2017-spring.2018)`[is.na(metadata$`Vaccine.2.seasons.ago.(fall.2017-spring.2018)`)] <- "Unsure"

#define prevaccination status
metadata$Prevacc.naive <- ifelse(rowSums(metadata[,grep("Vaccine[.]", colnames(metadata))] == "N") == 3, F, T)
metadata$Month.vaccinated <- month.name[as.numeric(unlist(strsplit(metadata$DateVaccinated,"_"))[c(T,F,F)])]

#################################################
## Compute composite values for Day 0 and Day 28

#sum all four untransformed values for Day 0 HAI and divide by 4 to obtain average
#include Yamagata for both standard dose and high dose participants as this assay is performed to define pre-vaccination serology
metadata$Composite_Baseline <- (rowSums(metadata[,grep("D0_Baseline",colnames(metadata))])/4)

#Sum all four values if received standard dose and divide by 4 to get average
#Exclude Yamagata if received high dose and sum three values then divide by 3 to get average
#There are two "unsure" cases - we assume high dose and therefore exclude Yamagata for them as well.
metadata$Composite_D28 <- ifelse(metadata$`Standard.or.high-dose.vaccine?.(S/H)` == "S", 
                                      (rowSums(metadata[,grep("D28_HAI",colnames(metadata))])/4),
                                      (rowSums(metadata[,grep("D28_HAI", colnames(metadata))]) - metadata[,grep("D28_HAI_Yamagata", colnames(metadata))])/3)

###########################
#Compute seroconversion for each strain by dividing Day28 HAI value by Day0 HAI value

#MRH_h denotes used for high dose calculation only (3 strains)
#MRH* denotes used for standard dose calculation (4 strains)

#A/H1N1 (A/Brisbane/02/2018)
metadata$H1N1Seroconversion_MRH_h <- metadata$D28_HAI_H1N1/metadata$D0_Baseline_HAI_H1N1

#A/H3N2 (A/Kansas/14/2017)
metadata$H3N2Seroconversion_MRH_h <- metadata$D28_HAI_H3N2/metadata$D0_Baseline_HAI_H3N2
  
#B/Victoria (B/Colorado/6/2017-like strain)
metadata$VictoriaSeroconversion_MRH_h <- metadata$D28_HAI_Victoria/metadata$D0_Baseline_Victoria
  
#(B/Phuket/3073/2013)
metadata$YamagataSeroconversion_MRH <- metadata$D28_HAI_Yamagata/metadata$D0_Baseline_HAI_Yamagata
  
###########################
#Compute average seroconversion for all strains.
#For standard dose use sum of all four strains and divide by 4 to get average. 
#For high dose exclude Yamagata and compute average for 3 strains.

metadata$average_seroconversion <- ifelse(metadata$`Standard.or.high-dose.vaccine?.(S/H)` == "S", 
                                           rowSums(metadata[,grep("MRH",colnames(metadata))])/4,
                                           rowSums(metadata[,grep("MRH_h",colnames(metadata))])/3
)

```

## Filters samples to only keep only 1 sample per individual based on the highest number of reads
This must results in 275 samples.
```{r}
samples <- c()
for(i in 1:length(unique(metadata$UGA.Sample.ID))){
  counts.subset <- counts[,metadata$File.Name[metadata$UGA.Sample.ID == unique(metadata$UGA.Sample.ID)[i]],drop = F]
  samples <- c(samples, colnames(counts.subset)[colSums(counts.subset) == max(colSums(counts.subset))])
}

counts <- counts[,samples]
metadata <- metadata[match(samples,metadata$File.Name),]
```

## Pull annotation for human genes in counts table
define the subset of genes that will be included in analyses by 1) excluding pseudogenes and 2) excluding the three hemoglobin genes that are extremely highly expressed. 
```{r}
mart = useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")

genes <- getBM(c("ensembl_gene_id","hgnc_symbol","gene_biotype","description",
                 "start_position","end_position","transcript_length"),
               "ensembl_gene_id",
               rownames(counts), mart)


genes.subset <- unique(genes$ensembl_gene_id[grep("pseudogene",genes$gene_biotype,invert = T)])
genes.subset <- genes.subset[!(genes.subset %in% c("ENSG00000188536","ENSG00000244734","ENSG00000206172"))]
genes.subset.length <- unlist(lapply(genes.subset,function(x){mean(genes$transcript_length[genes$ensembl_gene_id == x])}))

table(grep("pseudogene",genes$gene_biotype,invert = T,value = T))
```

## Download human annotations for GSEA

```{r,warning=FALSE,message = FALSE}
organism = "org.Hs.eg.db"
BiocManager::install(organism, character.only = TRUE)
library(organism, character.only = TRUE)
```

## Calculate TPM for gene subset
TPM (transcripts per million) is the gene length normalized proportion of reads mapping to the gene in a sample and can be used to directly compare between samples. 
TPM is used for visualization purposes only.
```{r}
calculate_tpm <- function(counts,gene_length){
  tpm <- counts
  for(i in 1:ncol(tpm)){
    tpm[,i] <- tpm[,i]/gene_length
    tpm[,i] <- tpm[,i]/(sum(tpm[,i])/1000000)
  }
  return(tpm)
}
counts.subset <- counts[rownames(counts) %in% genes.subset,]
tpm.subset <- calculate_tpm(counts.subset,genes.subset.length)

tpm.subset <- tpm.subset[rowSums(tpm.subset) != 0,]

#write.csv(tpm.subset, "tpm.hemoglobin.csv")
```

## Create lists of gene sets of interests and list of covariants of interest

```{r}
gene.sets <- list(all = rownames(counts),
                  no_high_exp_hemo = rownames(counts)[!(rownames(counts) %in% c("ENSG00000188536","ENSG00000244734","ENSG00000206172"))],
                  final_set = rownames(counts)[rownames(counts) %in% unique(genes$ensembl_gene_id[grep("pseudogene",genes$gene_biotype,invert = T)])])

gene.sets$final_set <- gene.sets$final_set[!(gene.sets$final_set %in% c("ENSG00000188536","ENSG00000244734","ENSG00000206172"))]
```

## Remove unnused columns from metadata
metadata is the data frame used in all subsequent scripts
```{r}
metadata <- metadata[,c(1,2,3,5:8,11,13:15,17:21,23,29:30,57:66,72,74,77,79,82,84,87,89,92,93:100)]
```

## write supplementary Table 1
remove irrelevant columns and rename and reorder columns to generate Table S1 for publication
```{r}
tableS1 <- metadata

#remove additional unnecessary columns for table for publication purposes
tableS1 <- tableS1[,-c(2,3,5,7,17,19,20,21,22,23,24)]

#Tidy up column headers
names(tableS1)[1] <- "Participant ID"
names(tableS1)[2] <- "Current smoker"
names(tableS1)[3] <- "Prior smoker"
names(tableS1)[4] <- "Sleep apnea"
names(tableS1)[15] <- "Sex"
names(tableS1)[25] <- "D0_Baseline_HAI_Victoria"
names(tableS1)[27] <- "Vaccinated in prior 3 seasons"
names(tableS1)[29] <- "Average Day 0 HAI"
names(tableS1)[30] <- "Average Day 28 HAI"
names(tableS1)[31] <- "H1N1 Seroconversion"
names(tableS1)[32] <- "H3N2 Seroconversion"
names(tableS1)[33] <- "Victoria Seroconversion"
names(tableS1)[34] <- "Yamagata Seroconversion"
names(tableS1)[35] <- "Average Seroconversion"

#Reorder the columns
tableS1 <- tableS1[,c(1,14,15,16,11,12,17,18,2,3,4,5,6,7,8,9,10,27,13,28,19,21,23,25,29,20,22,24,26,30,31,32,33,34,35)]

write.table(tableS1, file = "Table S1.txt", sep = "\t")
```


```{r}
save(counts,
     counts.subset,
     tpm.subset,
     genes,
     gene.sets,
     metadata, file = "data.RData")
```