-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path12_Gene_set_testing.Solutions.Rmd
231 lines (196 loc) · 6.6 KB
/
12_Gene_set_testing.Solutions.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
---
title: "Introduction to Bulk RNAseq data analysis"
subtitle: "Gene Set Testing for RNA-seq - Solutions"
output:
html_document:
toc: yes
toc_float: yes
pdf_document:
toc: yes
layout: page
always_allow_html: true
---
```{r setup, include=FALSE}
library(msigdbr)
library(clusterProfiler)
library(pathview)
library(org.Mm.eg.db)
library(tidyverse)
knitr::opts_knit$set(cache=TRUE)
options(bitmapType='cairo')
knitr::opts_chunk$set(dev = c("png"))
```
```{r prepareORAData, include=FALSE}
shrink.d11 <- readRDS("RObjects/Shrunk_Results.d11.rds")
# Kegg data
sigGenes <- shrink.d11 %>%
drop_na(Entrez, FDR) %>%
filter(FDR < 0.01 & abs(logFC) > 1) %>%
pull(Entrez)
kk <- enrichKEGG(gene = sigGenes, organism = 'mmu')
```
## Exercise 1 - pathview
> 1. Use `pathview` to export a figure for "mmu04659", but this time only
> use genes that are statistically significant at FDR < 0.01
```{r solution1}
logFC <- shrink.d11 %>%
drop_na(FDR, Entrez) %>%
filter(FDR < 0.01) %>%
dplyr::select(Entrez, logFC) %>%
deframe()
pathview(gene.data = logFC,
pathway.id = "mmu04659",
species = "mmu",
limit = list(gene=5, cpd=1))
```
mmu04659.pathview.png:
![mmu04659 - Th17 cell differentiation](images/mmu04659_pathview.png)
## Exercise 2 - GO term enrichment analysis
> `clusterProfiler` can also perform over-representation analysis on GO terms.
> using the commmand `enrichGO`. Look at the help page for the command
> `enrichGO` (`?enrichGO`) and have a look at the instructions in the
> [clusterProfiler book](http://yulab-smu.top/clusterProfiler-book/chapter5.html#go-over-representation-test).
>
> 1. Run the over-representation analysis for GO terms
> - Use genes that have an adjusted p-value (FDR) of less than 0.01 and
> an absolute fold change greater than 2.
> - For this analysis you can use Ensembl IDs rather then Entrez
> - You'll need to provide the background (`universe`) genes, this should be
> all the genes in our analysis.
> - The mouse database package is called `org.Mm.eg.db`. You'll need to load
> it using `library` before running the analysis.
> - As we are using Ensembl IDs, you'll need to set the `keyType`
> parameter in the `enrichGO` command to indicate this.
> - Only test terms in the "Biological Processes" ontology
> 2. Use the `dotplot` function to visualise the results.
```{r solution2}
sigGenes <- shrink.d11 %>%
drop_na(FDR) %>%
filter(FDR < 0.01 & abs(logFC) > 1) %>%
pull(GeneID)
universe <- shrink.d11$GeneID
ego <- enrichGO(gene = sigGenes,
universe = universe,
OrgDb = org.Mm.eg.db,
keyType = "ENSEMBL",
ont = "BP",
pvalueCutoff = 0.01,
readable = TRUE)
dotplot(ego,
font.size = 8,
)
```
```{r}
barplot(ego,
drop = TRUE,
showCategory = 10,
label_format = 20,
title = "GO Biological Pathways",
font.size = 8)
```
## Exercise 3 - GSEA
> Another common way to rank the genes is to order by pvalue, but also, sorting
> so that upregulated genes are at the start and downregulated at the end -
> you can do this combining the sign of the fold change and the pvalue.
>
> 1. Rank the genes by statisical significance - you will need to create
> a new ranking value using `-log10({p value}) * sign({Fold Change})`
> 2. Run `fgsea` using the new ranked genes and the H pathways
> 3. Conduct the same analysis for the d33 vs control contrast.
### Exercise 3 - d11 new rank
```{r solution3_GSEA_1}
# 1. Rank the genes by statistical significance - you will need to create
# a new ranking value using `-log10({p value}) * sign({Fold Change})`
# obtain the H(allmarks) catalog for mouse:
m_H_t2g <- msigdbr(species = "Mus musculus", category = "H") %>%
dplyr::select(gs_name, entrez_gene, gene_symbol)
# rank genes
rankedGenes.e1 <- shrink.d11 %>%
drop_na(Entrez, pvalue, logFC) %>%
# rank genes by strength of significance,
# keeping the direction of the fold change
mutate(rank = -log10(pvalue) * sign(logFC)) %>%
# sort genes by decreasing rank.
arrange(-rank) %>%
# keep ranks and Entrez IDs
pull(rank,Entrez)
# conduct analysis:
gseaRes.e1 <- GSEA(rankedGenes.e1,
TERM2GENE = m_H_t2g[,c("gs_name", "entrez_gene")],
#pvalueCutoff = 0.05,
pvalueCutoff = 1.00, # to retrieve whole output
minGSSize = 15,
maxGSSize = 500)
```
```{r}
# have function to format in scientific notation
format.e1 <- function(x) (sprintf("%.1e", x))
# format table:
gseaRes.e1 %>%
# sort in decreasing order of absolute NES
arrange(desc(abs(NES))) %>%
# only keep the 10 entries with the lowest p.adjust
top_n(10, -p.adjust) %>%
# remove columns 'core_enrichment' and 'Description'
dplyr::select(-core_enrichment) %>%
dplyr::select(-Description) %>%
# convert to data.frame
data.frame() %>%
# remove row names
remove_rownames() %>%
# format score
mutate(NES=formatC(NES, digits = 3)) %>%
mutate(ES=formatC(enrichmentScore, digits = 3)) %>%
relocate(ES, .before=NES) %>%
dplyr::select(-enrichmentScore) %>%
# format p-values
modify_at(
c("pvalue", "p.adjust", "qvalues"),
format.e1
) %>%
# display
DT::datatable(options = list(dom = 't'))
```
### Exercise 3 - d33
With d33 and H catalog:
```{r solution3_GSEA_3}
# read d33 data in:
shrink.d33 <- readRDS("RObjects/Shrunk_Results.d33.rds")
# get mouse H(allmarks) catalog
m_H_t2g <- msigdbr(species = "Mus musculus", category = "H") %>%
dplyr::select(gs_name, entrez_gene, gene_symbol)
# rank genes
rankedGenes.e3 <- shrink.d33 %>%
drop_na(Entrez, pvalue, logFC) %>%
mutate(rank = -log10(pvalue) * sign(logFC)) %>%
arrange(-rank) %>%
pull(rank,Entrez)
# perform analysis
gseaRes.e3 <- GSEA(rankedGenes.e3,
TERM2GENE = m_H_t2g[,c("gs_name", "entrez_gene")],
#pvalueCutoff = 0.05,
pvalueCutoff = 1.00, # to retrieve whole output
minGSSize = 15,
maxGSSize = 500)
```
Check outcome:
```{r}
gseaRes.e3 %>%
arrange(desc(abs(NES))) %>%
top_n(10, -p.adjust) %>%
dplyr::select(-core_enrichment) %>%
dplyr::select(-Description) %>%
data.frame() %>%
remove_rownames() %>%
# format score
mutate(NES=formatC(NES, digits = 3)) %>%
mutate(ES=formatC(enrichmentScore, digits = 3)) %>%
relocate(ES, .before=NES) %>%
dplyr::select(-enrichmentScore) %>%
# format p-values
modify_at(
c("pvalue", "p.adjust", "qvalues"),
format.e1
) %>%
DT::datatable(options = list(dom = 't'))
```