-
Notifications
You must be signed in to change notification settings - Fork 1
/
A2p.Rmd
386 lines (278 loc) · 11.8 KB
/
A2p.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
---
title: "BCB420 - Computational Systems Biology Assignment 2 Differential Gene expression and Preliminary ORA"
author: "Joshua Efe"
date: "r Sys.Date()"
output:
html_document:
toc: yes
toc_depth: 2
bibliography: a.bib
---
# Material from A1
```{r a1, message=FALSE, warning=FALSE, child='A1t.Rmd', include=FALSE, echo=FALSE, eval=TRUE}
```
Data was from GEO with ID 'GSE72055'
```{r}
library(ComplexHeatmap)
library(circlize)
library(dplyr)
library(magrittr)
library(knitr)
library(kableExtra)
(NormalizedData)
```
# Differential Gene Expression
The MDS Plot from A1
```{r}
plotMDS(d, labels=rownames(samples),
col = c("gold4","blue")[factor(samples$treatment)])
```
```{r}
normalized_count_data <- NormalizedData
kable(normalized_count_data[1:5,1:5], type="html")
heatmap_matrix <- normalized_count_data[,3:ncol(normalized_count_data)]
rownames(heatmap_matrix) <- normalized_count_data$ensembl_gene_id
colnames(heatmap_matrix) <- colnames(normalized_count_data[,3:ncol(normalized_count_data)])
```
```{r}
limma::plotMDS(heatmap_matrix,
col = rep(c("darkgreen","blue"),10))
samples <- data.frame(lapply(colnames(normalized_count_data)[3:8],
FUN=function(x){unlist(strsplit(x, split = "_"))[c(1,2)]}))
colnames(samples) <- colnames(normalized_count_data)[3:8]
rownames(samples) <- c("treatment","repID")
samples <- data.frame(t(samples))
samples[1:6,]
```
## Create our data matrix
```{r}
model_design <- model.matrix(~ samples$treatment)
kable(model_design, type="html")
expressionMatrix <- as.matrix(normalized_count_data[,3:8])
rownames(expressionMatrix) <- normalized_count_data$ensembl_gene_id
colnames(expressionMatrix) <- colnames(normalized_count_data)[3:8]
minimalSet <- ExpressionSet(assayData=expressionMatrix)
#Fit our data to the above model
fit <- lmFit(minimalSet, model_design)
```
## Getting the P values
We will be applying empircal Bayes to compute differential expression for the data model we created above to get the P values
```{r}
fit2 <- eBayes(fit,trend=TRUE)
topfit <- topTable(fit2,
coef=ncol(model_design),
adjust.method = "BH",
number = nrow(expressionMatrix))
#merge hgnc names to topfit table
output_hits <- merge(normalized_count_data[,1:2],
topfit,
by.y=0,by.x=1,
all.y=TRUE)
#sort by pvalue
output_hits <- output_hits[order(output_hits$P.Value),]
kable(output_hits[1:10,],type="html")
```
## How many gene pass the threshold p-value < 0.05 in both the unadjusted and adjusted p-values?
```{r}
length(which(output_hits$P.Value < 0.05))
length(which(output_hits$adj.P.Val < 0.05))
```
```{r}
model_design_pat <- model.matrix(
~ samples$repID + samples$treatment)
kable(model_design_pat,type="html")
```
## Finding the Pvalue with a different model using the Limma package
```{r}
#Fit our data to the above model
fit_pat <- lmFit(minimalSet, model_design_pat)
#Apply empircal Bayes to compute differential expression for the above described model.
fit2_pat <- eBayes(fit_pat,trend=TRUE)
topfit_pat <- topTable(fit2_pat,
coef=ncol(model_design_pat),
adjust.method = "BH",
number = nrow(expressionMatrix))
#merge hgnc names to topfit table
output_hits_pat <- merge(normalized_count_data[,1:2],
topfit_pat,by.y=0,by.x=1,all.y=TRUE)
#sort by pvalue
output_hits_pat <- output_hits_pat[order(output_hits_pat$P.Value),]
kable(output_hits_pat[1:10,],type="html")
```
```{r}
#How many gene pass the threshold p-value < 0.05?
length(which(output_hits_pat$P.Value < 0.05))
#How many genes pass correction?
length(which(output_hits_pat$adj.P.Val < 0.05))
```
## Compare the results from the two different models
```{r}
simple_model_pvalues <- data.frame(ensembl_id = output_hits$ensembl_gene_id,
simple_pvalue=output_hits$P.Value)
pat_model_pvalues <- data.frame(ensembl_id = output_hits_pat$ensembl_gene_id,
replicate_pvalue = output_hits_pat$P.Value)
two_models_pvalues <- merge(simple_model_pvalues,
pat_model_pvalues,by.x=1,by.y=1)
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue<0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$replicate_pvalue<0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue<0.05 & two_models_pvalues$replicate_pvalue<0.05] <- "red"
plot(two_models_pvalues$simple_pvalue,two_models_pvalues$replicate_pvalue,
col = two_models_pvalues$colour,
xlab = "simple model p-values",
ylab ="replicate model p-values",
main="Simple vs replicate Limma")
```
```{r}
ensembl_of_interest <- normalized_count_data$ensembl_gene_id[
which(normalized_count_data$hgnc_symbol == "CYP1A1")]
two_models_pvalues$colour <- "grey"
two_models_pvalues$colour[two_models_pvalues$ensembl_id==ensembl_of_interest] <- "red"
plot(two_models_pvalues$simple_pvalue,two_models_pvalues$replicate_pvalue,
col = two_models_pvalues$colour,
xlab = "simple model p-values",
ylab ="replicate model p-values",
main="Simple vs replicate Limma")
```
--
## Set up our edgeR objects
```{r}
d = DGEList(counts=filtered_data_matrix, group=samples$treatment)
#Estimate Dispersion - our model design.
d <- estimateDisp(d, model_design_pat)
#Fit the model
fit <- glmQLFit(d, model_design_pat)
kable(model_design_pat[1:6,1:4], type="html") %>%
row_spec(0, angle = -40)
## Calculating the differential expression using the Quasi liklihood model
qlf.pos_vs_neg <- glmQLFTest(fit, coef='samples$treatmentiso12')
kable(topTags(qlf.pos_vs_neg), type="html")
```
```{r}
qlf_output_hits <- topTags(qlf.pos_vs_neg,sort.by = "PValue",
n = nrow(normalized_count_data))
#How many gene pass the threshold p-value < 0.05?
length(which(qlf_output_hits$table$PValue < 0.05))
#How many genes pass correction?
length(which(qlf_output_hits$table$FDR < 0.05))
```
## Compare the results from the two different models
```{r}
#Limma vs Quasi liklihood
qlf_pat_model_pvalues <- data.frame(
ensembl_id = rownames(qlf_output_hits$table),
qlf_replicate_pvalue=qlf_output_hits$table$PValue)
limma_pat_model_pvalues <- data.frame(
ensembl_id = output_hits_pat$ensembl_gene_id,
limma_replicate_pvalue = output_hits_pat$P.Value)
two_models_pvalues <- merge(qlf_pat_model_pvalues,
limma_pat_model_pvalues,
by.x=1,by.y=1)
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$qlf_replicate_pvalue<0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$limma_replicate_pvalue<0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$qlf_replicate_pvalue<0.05 & two_models_pvalues$limma_replicate_pvalue<0.05] <- "red"
plot(two_models_pvalues$qlf_replicate_pvalue,
two_models_pvalues$limma_replicate_pvalue,
col = two_models_pvalues$colour,
xlab = "QLF replicate model p-values",
ylab ="Limma replicate model p-values",
main="QLF vs Limma")
```
# MA PLot
```{r}
plotMA(fit2, main="hTR DMSO vs ISO")
```
Differential Gene Expression
Conduct differential expression analysis with your normalized expression set from Assignment #1. Define your model design to be used to calculate differential expression - revisit your MDS plot from Assignment #1 to demonstrate your choice of factors in your model.
Calculate p-values for each of the genes in your expression set. How many genes were significantly differentially expressed? What thresholds did you use and why?
Based of the P values 12739 genes where signifcantly diffentailly expressed
Multiple hypothesis testing - correct your p-values using a multiple hypothesis correction method. Which method did you use? And Why? How many genes passed correction?
I used the Emprical Bayes method, Based of the adjusted P values 12643 genes where signifcantly diffentailly expressed
Based off the adjusted P values 12643 genes where signifcantly diffentailly expressed I used
# Thresholded over-representation analysis
```{r}
#estimate dispersion
d <- estimateDisp(d, model_design_pat)
#calculate normalization factors
d <- calcNormFactors(d)
#fit model
fit <- glmQLFit(d, model_design_pat)
#calculate differential expression
qlf.pos_vs_neg <- glmQLFTest(fit, coef='samples$treatmentiso12')
qlf_output_hits <- topTags(qlf.pos_vs_neg,sort.by = "PValue",
n = nrow(filtered_data_matrix))
length(which(qlf_output_hits$table$PValue < 0.05))
kable(topTags(qlf.pos_vs_neg), type="html")
```
## Upreglauted and downregulated genes
```{r}
#How many genes are up regulated?
length(which(qlf_output_hits$table$PValue < 0.05
& qlf_output_hits$table$logFC > 0))
#How many genes are down regulated?
length(which(qlf_output_hits$table$PValue < 0.05
& qlf_output_hits$table$logFC < 0))
```
## Threshold number of genes
```{r}
#Create thresholded lists of genes.
#merge essabmle IDs with the top hits
qlf_output_hits_withgn <- merge(tel_exp[,1:2],qlf_output_hits, by.x=1, by.y = 0)
qlf_output_hits_withgn[,"rank"] <- -log(qlf_output_hits_withgn$PValue,base =10) * sign(qlf_output_hits_withgn$logFC)
qlf_output_hits_withgn <- qlf_output_hits_withgn[order(qlf_output_hits_withgn$rank),]
upregulated_genes <- qlf_output_hits_withgn$ID[
which(qlf_output_hits_withgn$PValue < 0.05
& qlf_output_hits_withgn$logFC > 0)]
downregulated_genes <- qlf_output_hits_withgn$ID[
which(qlf_output_hits_withgn$PValue < 0.05
& qlf_output_hits_withgn$logFC < 0)]
write.table(x=upregulated_genes,
file=file.path("data","tel_exp_upregulated_genes.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
write.table(x=downregulated_genes,
file=file.path("data","tel_exp_downregulated_genes.txt"),sep = "\t",
row.names = FALSE,col.names = FALSE,quote = FALSE)
```
# Running thresholded gene set enrichment analysis with gProfiler
I ran an analysis of the upregualed genes the downregulated genes and finally all of them together using gprofiler
https://biit.cs.ut.ee/gprofiler/gost
Here are the downregulated gene results
![](downreg1.png)
![](downreg2.png)
![](downreg3.png)
Here are the upregulataed gene results
![](upreg1.png)
![](upreg2.png)
![](upreg3.png)
The upregulated genes seem to to have fewer terms show up with as many hits and when run together the results seem to favor the genes that have been downregulated
Q:Which method did you choose and why?
I used g:profiler since it worked really well with the homework assigment and has a large number of datasets it can pull from
Q:What annotation data did you use and why? What version of the annotation are you using?
I used the following annoted data sets:
GO molecular function
GO biological process
Reactome
WikiPathways
Q:How many genesets were returned with what thresholds?
Do the over-representation results support conclusions or mechanism discussed in the original paper?
These results seem to support the findings in the paper many of the terms that were returned for the downregulated gene sets seeme to invlvoved in RNA regualtion such as RNA Polymerase II Transcription, ATP binding,ribonucleotide binding suggesting that isoginkgetin does indeed mimic the effects of RNA exosome inhibition
# References
[@tseng_wang_burns_schroeder_gaspari_baumann_2015]
[@tseng_wang_burns_schroeder_gaspari_baumann_2015]
[@hgnc]
[@database]
[@edgeR]
[@limma]
[@GEOmetadb]
[@BiocManager]
[@knitr]
[@GEOmetadb]
[@GSA]
[@RCurl]
[@futilelogger]
[@VennDiagram]
[@Zuguang_Roland]
[@Zuguang_Lei]
[@gable_gaysinskaya_atik_talbot_kang_stanley_pugh_amat_codina_schenk_arcasoy_et_al_2019]
[@uniprot_consortiumeuropean_bioinformatics_instituteprotein_information_resourcesib_swiss_institute_of_bioinformatics_2019]