-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPrEP_Study_Microarray_Analysis.Rmd
822 lines (474 loc) · 25.6 KB
/
PrEP_Study_Microarray_Analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
---
title: "PreP Study Microarray Analysis"
author: "Claire Levy"
output: github_document
---
##Experimental set up
We isolated RNA from four different sample types from 8 donors pre- and post- initiation of PreP. The pre-initation was called "Enrollment" and the post-initiation visit was called "Visit2"
Sample Types
* Duodenal biopsy
* Rectal biopsy
* PAXgene (whole blood collected into RNA preservative)
* PBMC (PBMC isolated from whole blood)
NOTE: PTID BG2305 may not have been adherent, left them in for now. May have had problems refilling prescription on time.
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
library(dplyr)
library(lumi)
library(limma)
library(pander)
library(stringr)
library(reshape2)
library(ggplot2)
library(ggrepel)
library(RColorBrewer)
```
```{r read in raw data, cache = TRUE, results = 'hide'}
lumibatch<-lumiR(fileName = "raw_data/2017.04.18.smhughesFinalReport.txt ",
detectionTh = 0.05,
annotationColumn=c('ENTREZ_GENE_ID','ACCESSION', 'SYMBOL', 'PROBE_SEQUENCE', 'PROBE_START', 'CHROMOSOME', 'PROBE_CHR_ORIENTATION', 'PROBE_COORDINATES', 'DEFINITION'))
#read in pData
#Note that my pData file has rownames that match the sampleNames used in the array AND I have a column called sampleNames containing the same data because I'll want to use it later when I make MDS plots and need to merge on sampleNames. This is annoying but kind of necessary.
pData <-read.table("raw_data/pData.txt", header = TRUE, stringsAsFactors = FALSE)
#put pData into an adf
adf<-AnnotatedDataFrame(data = pData)
complete_lumibatch<-new("LumiBatch",
assayData = assayData(lumibatch),
phenoData = adf,
detection = detection(lumibatch),
controlData = controlData(lumibatch),
featureData = featureData(lumibatch))
```
```{r background correction, results = 'hide'}
#the data we got from the core has no background correction (I don't think it did anyway...) so I will do it here.
B_complete_lumibatch<-lumiB(complete_lumibatch, method = "bgAdjust")
```
### Plots of background-corrected but non-normalized data
```{r nonnormalized data}
#MDS
dim_data<- plotMDS(B_complete_lumibatch)
d<-data.frame(data.frame(Dim1 = dim_data$x,
Dim2 = dim_data$y,
sampleNames = names(dim_data$x)))
d<-merge(d, pData, by = "sampleNames")
ggplot(d, aes(x=Dim1, y = Dim2))+
geom_point(aes(color = Tissue), size = 3)
#A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin to be specified on the four sides of the plot. The default is c(5, 4, 4, 2) + 0.1.
par(cex= 0.7, mar = c(10,10,10,10))
boxplot(B_complete_lumibatch, col = c("#1b9e77","#80cdc1","#984ea3","#e7298a"))
#Ryan Basom looked at this box plot with me and we talked about how it shows that the paxgene samples almost always have low expression and the IQR is smaller. If I normalize all samples together, apparently non-random variation might cause problems. RB's recommendation was to normalize and analyze the tissues separately.
```
```{r split up data into tissue types}
duod_batch<-complete_lumibatch[, complete_lumibatch$Tissue =="Duodenal"]
rec_batch<-complete_lumibatch[, complete_lumibatch$Tissue =="Rectal"]
pb_batch<-complete_lumibatch[, complete_lumibatch$Tissue =="PBMC"]
px_batch<-complete_lumibatch[, complete_lumibatch$Tissue =="PAXgene"]
```
```{r vst transform androbust spline normalization, results = 'hide'}
#decided to do the transformation separately for each tissue since the function uses the variance to transform. I get a slightly different number of up and down DE probes than when I transformed all together (together = 76 up, 15 down, separate = 70 up and 11 down)
duod_trans_batch<- lumiT(duod_batch)
duod_norm_batch<-lumiN(duod_trans_batch, method = "rsn")
rec_trans_batch<- lumiT(rec_batch)
rec_norm_batch<-lumiN(rec_trans_batch, method = "rsn")
pb_trans_batch<- lumiT(pb_batch)
pb_norm_batch<-lumiN(pb_trans_batch, method = "rsn")
px_trans_batch<- lumiT(px_batch)
px_norm_batch<-lumiN(px_trans_batch, method = "rsn")
```
These normalized samples all look good.
```{r MDS plot function}
# a function to make nice MDS plots
#plotMDS invisibly returns a large object that has the actual dimensions data in matrices called x and y (one for each dimension)
#Dim1 is a column containing the values from the x matrix
#Dim2 is a column containing the values from the y matrix
#sampleNames is a column that holds the column names from the x matrix, which happen to be the microarray wells.
#add a plot title to this function!!!
plot_MDS_by_Visit<-function(batch){
dim_data<- plotMDS(batch)
d<-data.frame(data.frame(Dim1 = dim_data$x,
Dim2 = dim_data$y,
sampleNames = names(dim_data$x)))
d<-merge(d, pData, by = "sampleNames")
ggplot(d, aes(x=Dim1, y = Dim2))+
geom_point(aes(color = Visit), size = 3)
}
```
```{r norm plots}
#duod
plotDensities(duod_norm_batch,legend = FALSE, main = "Normalized Duodenal Samples")
boxplot(duod_norm_batch,main = "Normalized Duodenal Samples")
plot_MDS_by_Visit(duod_norm_batch)
# rectal
plotDensities(rec_norm_batch,legend = FALSE, main = "Normalized Rectal Samples")
boxplot(rec_norm_batch, main =" Normalized Rectal Samples" )
plot_MDS_by_Visit(rec_norm_batch)
#PBMC
plotDensities(pb_norm_batch,legend = FALSE, main = "Normalized PBMC Samples")
boxplot(pb_norm_batch, main ="Normalized PBMC Samples" )
plot_MDS_by_Visit(pb_norm_batch)
#PAXgene
plotDensities(px_norm_batch,legend = FALSE, main = "Normalized PAXgene Samples")
boxplot(px_norm_batch, main = "Normalized PAXgene Samples" )
plot_MDS_by_Visit(px_norm_batch)
```
## Non-specific filtering
There are 8 donors x 1 tissue type x 2 timepoints = 16 samples in this set.
I will filter out any probes that are not expressed above the 0.05 p-value cut-off in at least 7 samples. I chose 7 because a probe may not be expressed at all in the 8 Enrollment samples (so 0/16 total), but may show up at Visit2 (8 possibilities). It would still be biologically interesting if a probe was expressed at Visit2 (and not at enrollment) even if it wasn't in all donors, so I'll require it to show up in at least 7 of them.
```{r ns filtering}
#count probes where det pval <0.05 in at least 7 samples
#extract those probes from the batch
find_expressed <-function(batch){
x <-rowSums(detection(batch)<0.05)>= 7
batch[x,]
}
duod_expressed<-find_expressed(duod_norm_batch)
rec_expressed<-find_expressed(rec_norm_batch)
pb_expressed<-find_expressed(pb_norm_batch)
px_expressed<-find_expressed(px_norm_batch)
```
Number of probes removed from the data sets after filtering for expression. All started with 47323 probes.
```{r probes pre and post filter}
#here are the probes that were removed by NS filtering for each tissue
probes_removed<- data.frame(
"Duod" = nrow(fData(duod_norm_batch))-nrow(fData(duod_expressed)),
"Rectal" = nrow(fData(rec_norm_batch))-nrow(fData(rec_expressed)),
"PBMC" =nrow(fData(pb_norm_batch))-nrow(fData(pb_expressed)),
"PAXgene" = nrow(fData(px_norm_batch))-nrow(fData(px_expressed)))
pander(probes_removed)
```
```{r design and fit function}
#When I have ~0 for the intercept, each coeff represents the avg of the samples at each level of Condition and Donor. If I use ~1 instead, the intercept would represent the avg of the samples for the first factor and the next coeff would represent the increase in the average of the next factor OVER the first. See here:https://support.bioconductor.org/p/67395/ and here https://support.bioconductor.org/p/12145/
fit_model<-function(expressedProbes){
Ptid <- pData(expressedProbes)$Ptid #get pData just for that tissue
Visit <- pData(expressedProbes)$Visit
design <- model.matrix(~0 + Visit + Ptid)
fit<-lmFit(expressedProbes, design = design)
contrasts<-makeContrasts(
Visit2vsEnrollment = VisitVisit2-VisitEnrollment, levels = design)
fit2 <- contrasts.fit(contrasts, fit = fit)
eBayes(fit2)
}
#fit model to each tissue
duod_fit2<-fit_model(duod_expressed)
ttDuod<-topTable(duod_fit2, coef = "Visit2vsEnrollment", adjust.method = "BH", number = Inf)
write.csv(ttDuod, file = "data_out/PrEP-Truvada-Oral-60Days-Daily-Duodenum-complete.csv" )
rec_fit2<-fit_model(rec_expressed)
ttRec<-topTable(rec_fit2, coef = "Visit2vsEnrollment", adjust.method = "BH", number = Inf)
# write.csv(ttRec, file = "data_out/PrEP-Truvada-Oral-60Days-Daily-15cmRectum-complete.csv" )
pb_fit2<-fit_model(pb_expressed)
# ttPb<-topTable(pb_fit2, coef = "Visit2vsEnrollment", adjust.method = "BH", number = Inf)
#
# write.csv(ttPb, file = "data_out/PrEP-Truvada-Oral-60Days-Daily-PBMC-complete.csv" )
px_fit2<-fit_model(px_expressed)
# ttPx<-topTable(px_fit2, coef = "Visit2vsEnrollment", adjust.method = "BH", number = Inf)
#
# write.csv(ttPx, file = "data_out/PrEP-Truvada-Oral-60Days-Daily-PAXgene-complete.csv" )
```
## Volcano plots of each tissue
The log odds of differential expression is a statistic about the chances that a probe is differentially expressed. Log odds of 0 means that there is a 50/50 chance that the probe is differentially expressed (if log odds = 0, odds = 10^0^. Probability = odds/1+odds. 10^0^/1+10^0^ = 1/2 = 50%) This statistic is already adjusted for multiple testing. Negative odds indicate the probability that the probes is *not* differentially expressed.
```{r volcano plot function}
load("R_output/rec_expressed.Rdata")
load("R_output/pb_expressed.Rdata")
load("R_output/px_expressed.Rdata")
load("R_output/duod_expressed.Rdata")
#list of exprs
expressed_probes<-list(duod_expressed,rec_expressed,pb_expressed,px_expressed)
#names to use for the list of toptables
ttNames<-c("Duodenum","15cm Rectum","PBMC","PAXgene")
ttVolcano<- function(expressed_list, ttNames, contrast){
x<-lapply(expressed_list, FUN = fit_model)
y<-lapply(x, FUN = topTable,coef = contrast,
adjust.method = "BH", number = Inf)
names(y)<-ttNames
z<-bind_rows(y, .id = "Tissue")
z<-z%>%
mutate(Significance = ifelse(adj.P.Val <= 0.05, "<=0.05", ">0.05"))
ggplot(z,aes(x = logFC, y = B))+
geom_point(aes(),size = 1.7, alpha= 0.5) +
theme_bw(base_size = 16) +
geom_text_repel(
data = subset(z, adj.P.Val < 3E-10),
aes(label = TargetID),
size = 3,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines"))+
facet_wrap(~Tissue)+
labs(y = "Log Odds of Differential Expression")
}
ttVolcano(expressed_list = expressed_probes, ttNames = ttNames, contrast = "Visit2vsEnrollment" )
#From http://vassarstats.net/tabs.html#odds1
# Given p, an observed proportion or probability:
# Odds = p/(1-p)
# Log Odds: LO = loge[Odds]= loge[p/(1-p)]
# Given the Log Odds: Odds = exp[LO]
# Given the Odds: p = Odds/(1+Odds)
#so 80% probability = odds of 4:1
#log e (4) =
```
```{r results summary function}
#results summary function
#SEE LIMMA DOCUMENTATION SECTION 13.3 for explaination of which method to use for decideTests function (adjust results for multiple testing).
#In short, method = "separate" means that testing contrasts to gether will give the same results as when each contrast is tested alone. Does NOT do any adjusting betweeen contrasts and the raw p-value cutoff corresponding to significance can be different for different contrasts. Use when different contrasts are being analyzed to answer independant questions. Gives the same results as when you do topTable.
#use method = "global" when closely related contrasts are being tested and is recommended if you want to compare the number of DE genes found for different contrasts. All tests are treated as equivalent regardless of which probe or contrast they relate to, so raw p-value cutoff is consistant across contrasts. I guess you won't necessarily get the same numbers of DE probes when you do topTable after using method = "global"??
#here I'm using a contrast matrix with only one coeff, so it doesn't make a difference if I use separate or global. Also limma says you should avoid unnessary contrasts...
# It says here, https://support.bioconductor.org/p/66818/#66831, and in the limma guide apparently, that you should only have pval cutoff in decideTests(), not both pval AND lfc
fit_results<- function(fit2){
results <- decideTests(fit2,method="separate", adjust.method="BH",
p.value=0.05)
resultsDF<-as.data.frame(results)
resultsDF$ProbeID<-rownames(resultsDF)
rownames(resultsDF)<-NULL
#melt the df for easy summarizing
resultsDFmelt<-melt(resultsDF, id.vars="ProbeID")
summary<-resultsDFmelt %>%
group_by(variable)%>%
summarize(down=sum(value=="-1"),up=sum(value=="1"))
}
#summarise each tissue
duod_summary<-fit_results(duod_fit2)
rec_summary<-fit_results(rec_fit2)
pb_summary<-fit_results(pb_fit2)
px_summary<-fit_results(px_fit2)
```
##Number of DE probes for each contrast
The duodenal samples were the only ones with any differentially expressed probes. For Visit2 relative to enrollment, there were `r duod_summary$down` down-regulated probes and `r duod_summary$up` up-regulated probes.
## top 10 changes in the Duodenum pre vs post-PreP: p-value cut-off = 0.05
```{r ttduod}
#from ?topTable()
# Although topTable enables users to set p-value and lfc cutoffs simultaneously, this is not generally recommended. If the fold changes and p-values are not highly correlated, then the use of a fold change cutoff can increase the false discovery rate above the nominal level. Users wanting to use fold change thresholding are usually recommended to use treat and topTreat instead.
#In general, the adjusted p-values returned by adjust.method=BH remain #valid as FDR bounds only when the genes remain sorted by p-value. #Resorting the table by log-fold-change can increase the false discovery #rate above the nominal level for genes at the top of resorted table.
#this gives the same number of up and down probes as decideTests(), but this won't always be the case I don't think, depending on the choice of methods in decideTests() and number of coefs.
ttDuod <- topTable(duod_fit2, coef = "Visit2vsEnrollment", adjust.method = "BH", number = Inf, p.value = 0.05)
rownames(ttDuod)<-NULL
pander(ttDuod %>%
select (TargetID, logFC, DEFINITION, adj.P.Val)%>%
head(., n= 10))
```
## CAMERA testing
From the CAMERA documentation:
"camera performs a competitive test in the sense defined by Goeman and Buhlmann (2007). It tests whether the genes in the set are highly ranked in terms of differential expression relative to genes not in the set."
### Hallmark gene sets: top 10 results
```{r camera test setup}
#I'm going to do a CAMERA test using the MSigDB hallmark sets on all of my contrasts
#downloaded Rdata file 22Apr16 from http://bioinf.wehi.edu.au/software/MSigDB/
load("geneSets_for_CAMERA/human_Hallmark_v5.rdata")
#Eset to use
eSet<- exprs(duod_expressed)
#IDs from eSet: entrezIDs from the data I'm analyzing (not the pre-determined gene set).
identifiers <- as.character(fData(duod_expressed)[,"ENTREZ_GENE_ID"])
```
```{r camera function}
#a function to get the appropriate design and contrast matrices for expressed probes from each tissue, then run camera on them.
camera_function<-function(expressedProbes, geneSet){
Ptid <- pData(expressedProbes)$Ptid #get pData just for that tissue
Visit <- pData(expressedProbes)$Visit
design <- model.matrix(~0+ Visit + Ptid)
contrast <-makeContrasts(
Visit2vsEnrollment = VisitVisit2-VisitEnrollment, levels = design)
#IDs from eSet that I'll use for camera: entrezIDs *from the data I'm analyzing* (not the pre-determined gene set).
identifiers <- as.character(fData(expressedProbes)[,"ENTREZ_GENE_ID"])
#make the index of the geneSet using identifiers from my eSet
index <- ids2indices(gene.sets = geneSet, identifiers = identifiers)
camera(y = exprs(expressedProbes), index = index, design = design, contrast = contrast)
}
```
```{r camera results function}
# a function to make the camera results look better
camera_results<-function(camera_result){
camera_result$Gene_Set <- rownames(camera_result)
rownames(camera_result)<-NULL
camera_result%>%
mutate("Gene_Set" = tolower(Gene_Set))%>%
head(n = 10)%>%
mutate(Gene_Set = str_replace(Gene_Set,"hallmark_",""))%>%
mutate(Gene_Set = str_replace_all(Gene_Set,"_"," "))%>%
dplyr::select(Gene_Set, Direction, FDR, NGenes)%>%
pander()
}
```
####Duodenal sample results
```{r hallmark duod}
#I want to use the "Hs.H" hallmark genesets
camera_results(camera_function(duod_expressed, Hs.H))
```
####Rectal sample results
```{r hallmark rec}
#I want to use the "Hs.H" hallmark genesets
camera_results(camera_function(rec_expressed, Hs.H))
```
####PBMC sample results
```{r hallmark pb}
#I want to use the "Hs.H" hallmark genesets
camera_results(camera_function(pb_expressed, Hs.H))
```
####PAXgene sample results
```{r hallmark px}
#I want to use the "Hs.H" hallmark genesets
camera_results(camera_function(px_expressed, Hs.H))
```
### GO biological process gene sets: top 10 results
Sean wrote some code (https://github.com/seaaan/Bioinformatics/tree/master/GOTermMappingsForCamera) to extract just the Biological Process gene sets out of all the GO gene sets.
####Duodenal sample results
```{r }
load("geneSets_for_CAMERA/Hs.c5.BP.rdata")
#I want to use the "Hs.c5.BP" genesets
camera_results(camera_function(duod_expressed, Hs.c5.BP))
```
####Rectal sample results
```{r }
#I want to use the "Hs.c5.BP" genesets
camera_results(camera_function(rec_expressed, Hs.c5.BP))
```
####PBMC sample results
```{r }
#I want to use the "Hs.c5.BP" genesets
camera_results(camera_function(pb_expressed, Hs.c5.BP))
```
####PAXgene sample results
```{r}
#I want to use the "Hs.c5.BP" genesets
camera_results(camera_function(px_expressed, Hs.c5.BP))
```
### MTN-007 gene sets
Here I am comparing the probes in this experiment to significantly differentially expressed probes from MTN-007 9cm biopsies at 7 days.
```{r }
load("geneSets_for_CAMERA/mtn_007_sets.Rda")
#I want to use the "mtn_007_sets" genesets
mtn_007<-rbind(camera_function(duod_expressed, mtn_007_sets),
camera_function(rec_expressed, mtn_007_sets),
camera_function(pb_expressed, mtn_007_sets),
camera_function(px_expressed, mtn_007_sets))
mtn_007$Gene_Sets<- rownames(mtn_007)
rownames(mtn_007)<-NULL
mtn_007<-mtn_007 %>%
mutate(Gene_Sets = rep(c("MTN_007 Down", "MTN_007 Up"), times = 4))%>%
mutate(Tissue = c("Duodenal","Duodenal", "Rectal","Rectal", "PBMC", "PBMC", "PAXgene", "PAXgene"))%>%
select(Gene_Sets, Tissue, Direction, FDR, NGenes)
pander(mtn_007)
```
##Which DE probes from the PreP study overlap with those from MTN-007?
```{r mtn007 overlaps}
#read in mtn007 DEG data 7 days 9cm
mtn007_7d_9cm<-read.csv("raw_data/mtn-007-7d-9cm-DEGs.csv")
overlap<-ttDuod[ttDuod$TargetID %in% mtn007_7d_9cm$TargetId,]%>%
select(TargetID, DEFINITION, logFC)
rownames(overlap)<- NULL
names(overlap)[3]<- "logFC in PreP Study"
pander(overlap)
```
## CAMERA test of Hallmark interferon alpha geneset founder sets
```{r inf alpha founder sets }
#to drill down into the hallmark results, I'll run camera on the founder sets of the hallmark interferon alpha set.
library(GSEABase)
library(illuminaHumanv4.db)
#I'm using the illuminaHumanv4.db annotation database because that's the kind of chip we used. http://bioconductor.org/packages/release/data/annotation/html/illuminaHumanv4.db.html but you can us a different annotation if you want to.
inf_a_founders<-getBroadSets("geneSets_for_CAMERA/HALLMARK_INTERFERON_ALPHA_RESPONSE.xml",geneIdType(EntrezIdentifier(annotation = "illuminaHumanv4.db")))
#map the symbols to entrez IDs because that's the identifier I used in my eSet.
inf_a_founders_entrez<-mapIdentifiers(inf_a_founders, EntrezIdentifier(annotation = "illuminaHumanv4.db"))
#here is my gene set, a list of 82 named elements and in each element are the entrez Ids in the set.
inf_a_founders_set<-geneIds(inf_a_founders_entrez)
```
####Duodenal sample results
```{r test inf alpha}
camera_results(camera_function(duod_expressed, inf_a_founders_set))
```
####Rectal sample results
```{r }
camera_results(camera_function(rec_expressed, inf_a_founders_set))
```
####PBMC sample results
```{r }
camera_results(camera_function(pb_expressed, inf_a_founders_set))
```
####PAXgene sample results
```{r}
camera_results(camera_function(px_expressed, inf_a_founders_set))
```
```{r CAMERA with tenofovir studies genesets}
#load the expression sets to test
load("R_output/duod_expressed.Rdata")
load("R_output/px_expressed.Rdata")
load("R_output/rec_expressed.Rdata")
load("R_output/pb_expressed.Rdata")
load("R_output/pData.Rdata")
#run the camera function and load libraries!
#load the genesets
load("../../Tenofovir R01/Cross-StudyMicroarrayAnalysis/data-out/up_and_down_gsets.Rda")
```
### CAMERA tests using DE probes from all Tenofovir studies as genesets
```{r compilation camera function}
#here is a function that takes an expression set and a geneset, makes the appropriate contrast, generates identifiers (the entrez ids from the data to be tested) and makes an index of the geneset using the identifiers AND runs the camera function.
#Here you need to provide an expression set to the "expressedProbes" argument and the name of the data that you are doing the camera test on, using the format we designated for the cross study analyis.
cross_study_camera_function<-function(expressedProbes, Condition, geneSets){
Ptid <- pData(expressedProbes)$Ptid #get pData just for that tissue
Visit <- pData(expressedProbes)$Visit
design <- model.matrix(~0+ Visit + Ptid)
contrast <-makeContrasts(
Visit2vsEnrollment = VisitVisit2-VisitEnrollment, levels = design)
#IDs from eSet that I'll use for camera: entrezIDs *from the data I'm analyzing* (not the pre-determined gene set).
identifiers <- as.character(fData(expressedProbes)[,"ENTREZ_GENE_ID"])
#make the index of the geneSet using identifiers from my eSet
index <- ids2indices(gene.sets = geneSets, identifiers = identifiers)
result<-camera(y = exprs(expressedProbes), index = index, design = design, contrast = contrast)
result$Gene_Set<- rownames(result)
rownames(result)<-NULL
result$Data_Tested<-Condition
result<- select(result, Data_Tested, Gene_Set, Direction, NGenes, PValue, FDR)
return(result)
}
```
```{r camera test with tenfovir study genesets}
# a vector of expression sets to camera test
eSets<-c(duod_expressed, rec_expressed, pb_expressed, px_expressed)
#a vector of the official cross-study analysis names for each condition in this study that I'm testing.
conds<-c("PrEP-Truvada-60Days-Daily-Oral-Duodenum","PrEP-Truvada-60Days-Daily-Oral-Rectum","PrEP-Truvada-60Days-Daily-Oral-PBMC","PrEP-Truvada-60Days-Daily-Oral-PAXgene")
#use mapply to iterate over both the eSets vector and the conditions vector, SIMPLFY = FALSE or else it makes it into a weird matrix thing. NOTE use MoreArgs to include another argument that I don't want to iterate over.
x<-mapply(FUN = cross_study_camera_function, expressedProbes = eSets, Condition = conds, MoreArgs = list(geneSets = up_and_down_gsets), SIMPLIFY = FALSE)
#make the list into a data frame
camera_PrEP_Truvada_60Days_Daily_Oral<-bind_rows(x)
#save in cross study analysis
save(camera_PrEP_Truvada_60Days_Daily_Oral, file = "../../Tenofovir R01/Cross-StudyMicroarrayAnalysis/data-in/camera_PrEP_Truvada_60Days_Daily_Oral.Rda")
#save in PrEP folder
save(camera_PrEP_Truvada_60Days_Daily_Oral,file = "R_output/camera_PrEP_Truvada_60Days_Daily_Oral.Rda")
pander(camera_PrEP_Truvada_60Days_Daily_Oral)
```
```{r hallmark for cross study analysis}
#I can use the same cross study analysi camera function I made above, but put in the hallmark genesets(Hs.H) for the geneSets argument.
camera_Hallmark_PrEP_Truvada_60Days_Daily_Oral<-mapply(FUN = cross_study_camera_function, expressedProbes = eSets, Condition = conds, MoreArgs = list(geneSets = Hs.H), SIMPLIFY = FALSE)
#make the list into a df.
camera_Hallmark_PrEP_Truvada_60Days_Daily_Oral<-bind_rows(camera_Hallmark_PrEP_Truvada_60Days_Daily_Oral)
#save in cross study analysis
save(camera_Hallmark_PrEP_Truvada_60Days_Daily_Oral, file = "../../Tenofovir R01/Cross-StudyMicroarrayAnalysis/data-in/camera_Hallmark_PrEP_Truvada_60Days_Daily_Oral.Rda")
#save in PrEP folder
save(camera_Hallmark_PrEP_Truvada_60Days_Daily_Oral,file = "R_output/camera_Hallmar_PrEP_Truvada_60Days_Daily_Oral.Rda")
```
```{r exprs for cross-study analysis function }
library(stringr)
library(reshape2)
get_exprs_for_cross_study_analysis<- function(expressedProbes, Tissue){
x<-exprs(expressedProbes)%>%
melt(x, id.vars = rownames(x), varnames = c("ProbeId", "Ptid"),value.name = "Expression")%>%
mutate(Sample = Tissue)%>%
mutate(Visit = ifelse(str_detect(Ptid, "Enrollment"), "Baseline", "60 days"))%>%
mutate(Treatment = ifelse(str_detect(Ptid, "Enrollment"), "Baseline", "Truvada"))%>%
mutate(TreatmentLocation = ifelse(str_detect(Ptid, "Enrollment"), "Baseline", "Oral"))%>%
mutate(TreatmentInterval = ifelse(str_detect(Ptid, "Enrollment"), "Baseline", "Daily"))%>%
mutate(TreatmentLength = ifelse(str_detect(Ptid, "Enrollment"), "Baseline", "60 days"))
y<-select(fData(expressedProbes),ProbeID, TargetID, ENTREZ_GENE_ID)
z<-merge(x, y, by.x = "ProbeId", by.y = "ProbeID")
names(z)[10:11]<-c("Gene", "EntrezId")
return(z)
}
```
```{r get exprs}
#already have eSets vector of esets for each tissue
#here is a vector of tissue types to loop over:
tissue<-c("Duodenum","Rectum","PBMC","PAXgene")
all_exprs<-mapply(get_exprs_for_cross_study_analysis, expressedProbes = eSets,
Tissue = tissue, SIMPLIFY = FALSE)
prep<-bind_rows(all_exprs)
prep <- prep %>%
mutate(Ptid = as.integer(str_sub(Ptid, 5, 5)), Study = "PrEP")
save(prep, file = "../../Tenofovir R01/Cross-StudyMicroarrayAnalysis/data-in/PrEP-filtered-expression-values.Rda")
save(prep, file = "R_output/PrEP-filtered-expression-values.Rda")
```