-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelli_transf.R
646 lines (487 loc) · 30.1 KB
/
helli_transf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
# Hellinger transformation: RDA, MRT
ASV_count
dim(ASV_count)
cyano_counts
dim(cyano_counts)
#Hellinger transformation
#https://wiki.qcbs.ca/r_workshop9
tASV_count <- t(ASV_count) #species should be in columns
dim(tASV_count)
vir_hel<-decostand(tASV_count, method="hellinger")
tcyano_counts <- t(cyano_counts)
bact_hel<-decostand(tcyano_counts, method="hellinger")
# OR
bactps_helli <- transform(bact_physeq, transform = "hellinger", target = "OTU")
bactps_helli %>% otu_table() %in% bact_hel #both methods are the same! yay!
cyano_ps_helli <- subset_taxa(bactps_helli, Phylum == "p__Cyanobacteria")
cyano_helli <- cyano_ps_helli %>% otu_table()
##set vir_hel to same format as bact_hel
#remove sample ID at beginning
row.names(vir_hel) <- sub("*._*._*._*._*._*._*._","", row.names(vir_hel))
#change "_" to "."
rownames(vir_hel) <- gsub("_", ".", row.names(vir_hel))
#CLR transformation
library(zCompositions)
abund.no0 = tASV_count[ ,colSums(tASV_count)!=0 ]
abund.count <- t(cmultRepl(t(abund.no0), method="CZM", output="p-counts"))
abund.prop.rows <- decostand(abund.count,"total",MARGIN=1)
abund.clr <- t(apply(abund.prop.rows, 1, function(x){log(x) - mean(log(x))}))
vir.abund.clr <- as.data.frame(abund.clr)
bact.abund.no0 = tcyano_counts[ ,colSums(tcyano_counts)!=0 ]
bact.abund.count <- t(cmultRepl(t(bact.abund.no0), method="CZM", output="p-counts"))
bact.abund.prop.rows <- decostand(bact.abund.count,"total",MARGIN=1)
bact.abund.clr <- t(apply(bact.abund.prop.rows, 1, function(x){log(x) - mean(log(x))}))
bact.abund.clr <- as.data.frame(bact.abund.clr)
##### RDA TRANSFORMED HELLINGER ######
library(vegan)
transposed_asv_count_meta <- t(asv_count_meta)
meta
vir_helli2 <- decostand(transposed_asv_count_meta, method="hellinger")
env <- meta
names(env)
#remove catagorical data from env (do RDA without sites and time -- see PERMANOVA (far) below)
str(env)
temp_vars <- env[,!(colnames(env) %in% c("Total_Phosphorus_ug", "Total_Nitrogen_mg", "Dissolved_P", "Dissolved_N", "Cumulative_precipitation_t1_t7_mm", "Mean_temperature_t0_t7",
"Microcystin", "Dolicho.Abundance", "Micro.Abundance", "Cyano.Abundance", "cyano.sum.helli",
"micro.sum.helli" , "doli.sum.helli", "description", "Date"))]
env_vars <- env[,!(colnames(env) %in% c("description", "Date", "Months", "Years", "Site", "Period", "bloom2",
"Microcystin", "Dolicho.Abundance", "Micro.Abundance", "Cyano.Abundance"))]
#"cyano.sum.helli", "doli.sum.helli", "micro.sum.helli"))]
colnames(env_vars)
#standardize environmental data
library(vegan)
env_vars.std <- env_vars
env_vars.std[1:6] <-decostand(env_vars.std[1:6], method="standardize")
head(env_vars.std, n=2)
# apply(env_vars[,8:16], 2, mean) #data are now centered (mean~0)
# apply(env_vars[,8:16], 2, sd) #data are now scaled (st. dev =1)
#check how many rows there are without any NAs
complete.cases(env_vars.std)
#rm NAs
env_keep <- env_vars.std[complete.cases(env_vars.std), ]
env_keep %>% dplyr::glimpse()
summary(env_keep)
temp_keep <- temp_vars[complete.cases(temp_vars), ]
#### Remove viral asvs that are not present (due to removal of NA from env vars)
sp.asv <- t(vir_helli)
#rm sample rows that are not present in env_keep/temp_keep
vir.rm <- sp.asv[rownames(sp.asv) %in% rownames(env_keep),]
dim(vir.rm)
#which rows are the same
#intersect(abund_name, env_row_name)
# #specific samples that are not the same
# (row_remove <- setdiff(row.names(sp.asv), row.names(env_keep)))
# #count how many are different
# length(setdiff(row.names(sp.asv), row.names(env_keep)))
#
# #remove rows (samples) that aren't in env_var from abundance
# sp.asv.rm <- sp.asv[!(row.names(sp.asv) %in% row_remove), ]
# dim(sp.asv.rm)
# species and enviro data without NAs needed for RDA
head(vir.rm)
head(env_keep)
length(env_keep) #number of env vars
length(vir.rm) #nnumber asv
nrow(env_keep) #number samples
nrow(vir.rm) #numer of samples
vir.rda <- rda(vir.rm~., data = env_keep)
summary(vir.rda, display = NULL)
#These results contain: (1) the proportion of variance of Y explained by the X variables (constrained proportion, 16.17% here)
#(2) the unexplained variance of Y (unconstrained proportion, 83.83% here) and
#(3) then summarize the eigenvalues, the proportions explained and the cumulative proportion of each canonical axis
# (each canonical axis = each constraining variable, in this case, the environmental variables from env).
# Select the significant explanatory variables by forward selection
ordiR2step(rda(vir.rm~1, data = env_keep), scope=formula(vir.rda), direction="forward", R2scope = T, pstep=1000)
anova.cca(vir.rda, by ="terms")
temp_sig <- adonis(vir.rm~., data = temp_keep)
str(temp_keep)
temp_sig
env_sig <- adonis(vir.rm~., data = env_keep)
env_sig
#get adjusted R2
(R2adj <- RsquareAdj(vir.rda)$adj.r.squared)
#test significance of the model and of each canonical axis
anova.cca(vir.rda, step=1000) #model
anova.cca(vir.rda, step=1000, by="axis") #canonical axes
#significant env vars
env.signif <- subset(env_keep, select = c("Cumulative_precipitation_t1_t7_mm", "doli.sum.helli",
"Total_Phosphorus_ug", "Dissolved_P", "Dissolved_N"))
env.signif <- subset(env_keep, select = c("doli.sum.helli", "Cumulative_precipitation_t1_t7_mm"))
#the proportion of variation explained by the three constraining variables being 0.058
rda.signif <- rda(vir.rm~., data=env.signif)
summary(rda.signif, display=NULL)
#The explanatory variables now explain 11.67% of the variance in Y (species).
(R2adj <- RsquareAdj(rda.signif)$adj.r.squared)
#Here the strength of the relationship between X and Y corrected for the number of X variables is 0.0227.
#The significance of the model and of each canonical axis can be tested using the function anova
#(note this is different from retaining significant variables as was done with forward selection, now we're testing the significance of the RDA axes):
anova.cca(rda.signif, step=1000)
#the RDA model is highly significant (p=0.006)
anova.cca(rda.signif, step=1000, by="axis")
# as well as both canonical axes.
#To visualize the results of the RDA, triplots can be drawn using the plot().
#Note that as in PCA, users can create scaling 1 and scaling 2 triplots.
#In scaling 1, distance among objects approximate their Euclidean distances
# in scaling 2, angles between variables X and Y reflect their correlation.
#Thus, scaling 1 triplots can be used to interpret distances among objects
#and scaling 2 triplots to interpret the relationships between X and Y.
#Quick plots scaling 1 and 2
plot(rda.signif, scaling=1, main="Triplot RDA (scaling 1)")
plot(rda.signif, scaling=2, main="Triplot RDA (scaling 2)")
# #Advanced plots scaling 1
# plot(rda.signif, scaling=1, main="Triplot RDA - scaling 1", type="none", xlab=c("RDA1"), ylab=c("RDA2"), xlim=c(-1,1), ylim=c(-1,1))
# points(scores(rda.signif, display="sites", choices=c(1,2), scaling=1),
# pch=21, col="black", bg="steelblue", cex=1.2)
# arrows(0,0,
# scores(rda.signif, display="species", choices=c(1), scaling=1),
# scores(rda.signif, display="species", choices=c(2), scaling=1),
# col="black",length=0)
# text(scores(rda.signif, display="species", choices=c(1), scaling=1),
# scores(rda.signif, display="species", choices=c(2), scaling=1),
# labels=rownames(scores(rda.signif, display="species", scaling=1)),
# col="black", cex=0.8)
# arrows(0,0,
# scores(rda.signif, display="bp", choices=c(1), scaling=1),
# scores(rda.signif, display="bp", choices=c(2), scaling=1),
# col="red")
# text(scores(rda.signif, display="bp", choices=c(1), scaling=1)+0.05,
# scores(rda.signif, display="bp", choices=c(2), scaling=1)+0.05,
# labels=rownames(scores(rda.signif, display="bp", choices=c(2), scaling=1)),
# col="red", cex=1)
#
# #Advanced plots scaling 2
# plot(rda.signif, scaling=2, main="Triplot RDA - scaling 2", type="none", xlab=c("RDA1"), ylab=c("RDA2"), xlim=c(-1,1), ylim=c(-1,1))
# points(scores(rda.signif, display="sites", choices=c(1,2), scaling=2),
# pch=20, col="steelblue", cex=1.2)
# arrows(0,0,
# scores(rda.signif, display="species", choices=c(1), scaling=2)*2,
# scores(rda.signif, display="species", choices=c(2), scaling=2)*2,
# col="black",length=0)
# text(scores(rda.signif, display="species", choices=c(1), scaling=2)*2.1,
# scores(rda.signif, display="species", choices=c(2), scaling=2)*2.1,
# labels=rownames(scores(rda.signif, display="species", scaling=2)),
# col="black", cex=0.8)
# arrows(0,0,
# scores(rda.signif, display="bp", choices=c(1), scaling=2),
# scores(rda.signif, display="bp", choices=c(2), scaling=2),
# col="red")
# text(scores(rda.signif, display="bp", choices=c(1), scaling=2)+0.05,
# scores(rda.signif, display="bp", choices=c(2), scaling=2)+0.05,
# labels=rownames(scores(rda.signif, display="bp", choices=c(2), scaling=2)),
# col="red", cex=1)
##### db-RDA #####
# https://sites.ualberta.ca/~ahamann/teaching/renr690/Lab9b.pdf
#decide which distance measure to use by looking at the rank correlations between dissimilarity indices and gradient separation (the higher the value the better)
rankindex(env_keep, sp.asv.rm, indices = c("euc", "man", "gow", "bra", "kul"), stepacross = F, method = "spearman")
#remove site and temporal data from env_keep (do RDA without sites and time -- see PERMANOVA (far) below)
colnames(env_keep)
env_vars <- env_keep[,!(colnames(env_keep) %in% c("Months", "Years", "Site", "Period", "bloom2"))]
colnames(env_vars)
dbRDA <- capscale(vir.rm ~ ., data = env_keep, distance="bray")
plot(dbRDA)
#get the R2 for model fit for teh constrained ordinations
dbR2 <- RsquareAdj(dbRDA)$r.squared
dbR2
#adjusted R^2: adjusted R2 measures the unbiased amount of explained variation
dbR2adj <- RsquareAdj(dbRDA)$adj.r.squared
dbR2adj
# plot the RDA using ggplot (ggord package)
library(ggord)
ggord(dbRDA, env_keep$bloom2) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) # looking at the raw code, this is plotting the 'wa scores', the blue dots are different species
# overall test of the significance of the analysis
anova(dbRDA)
#test axes for significance
anova(dbRDA, by="axis", perm.max=500)
#test for sig. environmental variables
anova(dbRDA, by="terms", permu=200)
# Df SumOfSqs F Pr(>F)
# Months 6 3.7920 5.0998 0.001 ***
# Years 7 5.8803 6.7786 0.001 ***
# Site 1 0.1032 0.8324 0.630
# Period 2 0.4735 1.9104 0.005 **
# bloom2 1 0.1635 1.3195 0.170
# Total_Phosphorus_ug 1 0.2037 1.6436 0.068 .
# Total_Nitrogen_mg 1 0.1820 1.4683 0.111
# Dissolved_P 1 0.1254 1.0122 0.414
# Dissolved_N 1 0.2277 1.8374 0.035 *
# Cumulative_precipitation_t1_t7_mm 1 0.1861 1.5018 0.086 .
# Mean_temperature_t0_t7 1 0.2035 1.6418 0.056 .
# Dolicho.Abundance 1 0.0633 0.5108 0.968
# Micro.Abundance 1 0.0736 0.5936 0.916
# Cyano.Abundance 1 0.1260 1.0170 0.437
# Residual 23 2.8503
#transforming negtaive eigenvalues (get rid of the -ve eigenvalues when doing the analysis)
#1) add a constant:
dbRDA_constant <- capscale(sp.asv.rm ~., env_vars, distance = "bray", add=TRUE)
plot(dbRDA_constant)
anova(dbRDA_constant)
#2) take the sqrt of dissimilarities
dbRDA_sqrt <- capscale(sp.asv.rm ~., env_vars, dist="bray", sqrt.dist=T)
plot(dbRDA_sqrt)
anova(dbRDA_sqrt)
#3) Do a square root transformation, Wisconsin double standardization (this emphasizes the environmental variables):
dbRDA_metaMDS=capscale(sp.asv.rm ~ ., env_vars, dist="bray", metaMDS=TRUE, sqrt.dist=TRUE)
plot(dbRDA_metaMDS)
anova(dbRDA_metaMDS)
#select the significant explanatory variables
?ordiR2step
ordiR2step(rda(sp.asv.rm~1, data=env_vars), scope=formula(dbRDA), direction= "forward", R2scope=TRUE, pstep=1000)
#the proportion of variation explained by the constraining variables being 0.3542
#retain significant variables only
signif_env <- subset(env_vars, select = c("Total_Nitrogen_mg", "Mean_temperature_t0_t7", "Dissolved_N"))
#rda with significant variables
dbrda_env_signif <- capscale(sp.asv.rm~., data=signif_env, dist="bray", sqrt.dist=T)
summary(dbrda_env_signif, display=NULL)
screeplot(dbrda_env_signif)
#The proportion of the variance of Y (species) explained by the X (env) variables = 37.93% (constrained)
#the unexplained variance of Y = 62.07% (unconstrained)
#adjusted R^2
(adjR2 <- RsquareAdj(dbrda_env_signif)$adj.r.squared)
#strength of the relationship between X and Y corrected for the number of X variables is 27.94%
#the explanatory variables explain 38.32% of the variance in Y (species)
#test significance of the RDA canonical axis (ie. constraining variables, ie. env var)
anova.cca(dbrda_env_signif, step=1000)
#RDA model is highly significant (p=0.001)
anova.cca(dbrda_env_signif, step=1000, by="axis")
#axis' 1-2 v significant(p=0.001), 3 (p=0.026)
#plot dbRDA
#Quick plots scaling 1 and 2
plot(dbrda_env_signif, scaling=2, main="Triplot RDA (scaling 2)")
#with ellipses
ggord(dbrda_env_signif, signif_env$bloom2) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) # looking at the raw code, this is plotting the 'wa scores', the blue dots are different species
#https://fukamilab.github.io/BIO202/06-B-constrained-ordination.html
#simple RDA
simpleRDA <- rda(vir.rm ~., env_keep)
summary(simpleRDA)
screeplot(simpleRDA)
# The total variance of the data set, partitioned into constrained and unconstrained variances, is a standard result. This result shows how much variation in your response variables was redundant with the variation in your explanatory variables. If the constrained variance is much higher than your unconstrained variance, the analysis suggests that much of the variation in the response data may be accounted for by your explanatory variables. If, however, there is a large proportion of unconstrained variation (i.e. variation in your response matrix that is non-redundant with the variation in the explanatory matrix), then the results should be interpreted with caution as only a small amount of the variation in your response matrix is displayed
# constrained axes (RDA axes)
# unconstrained axes (PCA axes)
#Object and response variable scores are often reported as "site" and "species" scores, respectively. These scores are the coordinates used to ordinate points and vectors. The coordinates of variables should be understood as the "tip" of their vector with the origin as its "tail". The direction of the vector is the direction of increase for that variable.
#Explanatory variable scores, also referred to as constraining variable scores, may be interpreted as response variable scores when the explanatory variable in question is quantitative. Scores for each state of nominal or factorial variables are the coordinates of these states' centroids and show the average position of the sites that have that state.
# first 2 axes explain most of the variation (RDA1: 0.3452 & RDA2: 0.2721) so plotting by these two axes represent the data well
# the unconstrained eigenvalue (PC1) is 0.09707, which is comparatively small, which means it does not display any important residenual structure of teh response data
# each RDA axis has an eigenvalue, which is the proportion of the variance explained by each axis
# The species and site scores = where the sites and species fall along the axes
# Partition of variance: the overall variance is partitioned into constrained and unconstrained fractions.
# constrained: the amount of variance the species by site matrix is explained by the explanatory variables (expressed as a proportion--is equivalent to R2 in a multiole regression). This R2 is biased so you have to look at the adjusted R2
# Eigenvalues and their contribute to variance: RDA1 -> RDA26 for the canonical axes, and unconstrained axes. The cumulative contribute of the variance is the proportion of the total variance of the response data explained by the RDA. The results also give the eigenvalues.
# Eigenvalues: the canonical eigenvalues are decreasing in value (in the order they are presented). But sometimes the residual structure (residual eigenvalue PC1) can be larger than the last RDA eigenvalue, which means that the residual structure has more variance than some of the structures that can be explained by the explan variables.
# Canonical eigenvalues: measure the amount of variance explained by the RDA model
# Residual eigenvalues: measure the amount of variance represented by the residual axes
# Accumulated constrained eigenvalues: cumulative amounts of variance expressed as proportions of the total explained variance
# Species scores: coordinates of the tips of vectors representing the response variables in bi or triplots
# Site scores (weighted sums of species scores): coordinates of the sites as expressed in the space of the response variables
# Site constraints (linear combinations of constraining variables): coordinates of the sites in space of the explanatory variables
# Biplot scores for constraining variables: coordinates of the tips of the vectors represnting explanatory variables
# Centroids for factor constraints: coordinates of centroids of levels of factor variables
#getting the canonical coefficient (ie. the equivalent of regression coefficients for each explanatory variable on each canonical axis)
coef(simpleRDA)
#get the R2 for model fit for teh constrained ordinations
R2 <- RsquareAdj(simpleRDA)$r.squared
R2
#adjusted R^2: adjusted R2 measures the unbiased amount of explained variation
R2adj <- RsquareAdj(simpleRDA)$adj.r.squared
R2adj #0.5084 ==> this model explains 50.8% of the variation in the data (if used the biased R2, any variable included in the explanatory responses would increase the R2, so the R2 needs to be adjusted for the number of eplanatory variables -- esp since we have 8)
#plot RDA
# two ways: using "wa" (weighted sums of species) and "lc" (fitted site scores)
# wa: more robust to noise in the environmental variables but are a step between constrained towards unconstrained. Default because more robust to noise in data
# lc: orthogonal linear combinations of the explanatory variable
#The interpretation of these plots depends on what scaling has been chosen.
#In general, consider type I scaling if the distances between objects are of particular value or if most explanatory variables are binary or nominal.
# Consider type II scaling if the correlative relationships between variables are of more interest.
# https://mb3is.megx.net/gustame/constrained-analyses/rda
### Type I Scaling - Distance plots (object focused) ###
#Distances between object points approximate Euclidean distances. Thus, objects ordinated closer together can be expected to have similar variable values. This will not always hold true, as RDA only recovers part of the variation in the data set.
#Right-angled projections of object points onto vectors representing response variables approximate variable values for a given object.
#The angles between vectors representing response variables are meaningless.
#The angles between vectors representing response variables and those representing explanatory variables reflect their (linear) correlation.
#Note, that binary explanatory variables may be represented as points. These points are the centroids of objects which have a state "1" for a given binary variable. Projecting centroid points onto a vector representing a response variable reflects the relationship between these variables.
#Distances between centroids and between centroids and object points approximates Euclidean distances.
# Triplot: three different entities in the plot: sites, response variables and explanatory variables (arrowheads are on the explanatory variables)
# Scaling 1
plot(simpleRDA, scaling=1, main="Triplot RDA matrix ~ env - scaling 1 - wa scores")
# arrows for species are missing, so lets add them without heads so they look different than the explanatory variables
spe.sc <- scores(simpleRDA, choices=1:2, scaling=1, display="sp")
arrows(0,0,spe.sc[,1], spe.sc[,2], length=0, lty=1, col='red')
### Type II Scaling - Correlation plots (response variable focused) ###
#Distances between object points should not be considered to approximate Euclidean distances.
#Right-angled projections of object points onto vectors representing response variables approximate variable values for a given object.
#The angles between all vectors reflect their (linear) correlation. The correlation is equal to the cosine of the angle between vectors (e.g. a vector pair describing an angle of 90° are uncorrelated as cos(90) = 0), those describing an angle of 20° have strong, positive correlation as cos(20) = 0.94)
# Note, that binary or nominal explanatory variables may be represented as points. These points are the centroids of objects which have a state "1" for a given binary variable or realise a particular level of a nominal explanatory variable. Projecting centroid points onto a vector representing a response variable reflects the relationship between these variables.
# Scaling 2
plot(simpleRDA, main="Triplot RDA matrix ~ env - scaling 2 - wa scores")
spe2.sc <- scores(simpleRDA, choices=1:2, display="sp") # scores() choices= indicates which axes are to be selected, make sure to specify the scaling if its different than 2
arrows(0,0,spe2.sc[,1], spe2.sc[,2], length=0, lty=1, col='red')
# plot the RDA using ggplot (ggord package)
library(ggord)
ggord(simpleRDA, env_vars$bloom2) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) # looking at the raw code, this is plotting the 'wa scores', the blue dots are different species
#plot results of RDA using "lc"
# site scores as linear combinations of the environmental variables
# Scaling 1
plot(simpleRDA, scaling=1, display=c("lc", "sp", "cn"), main="Triplot RDA matrix ~ env -scaling 1- lc scores")
arrows(0,0, spe.sc[,1], spe.sc[,2], length=0, lty=1, col='red')
# scaling 2
plot(simpleRDA, display=c("sp", "lc", "cn"), main="Triplot RDA matrix ~ env -scaling2-lc scores", col=env_vars$bloom2)
arrows(0,0,spe2.sc[,1],spe2.sc[,2], length=0, lty=1,col='red')
# To choose the elements that are plotted, use the argument display=c(), sp=species, wa= site scores in the species space (weighted averages), lc= fitted site scores (linear combinations of explanatory variables) and cn= constraints (the explanatory variables)
## Fwd selection of variables ##
# variance inflation factors in the RDA
vif.cca(simpleRDA) #Anything above 10 should be examined or avoided
# RDA with all explanatory variables
spe.rda.all <- rda(sp.asv.rm ~ Years + Site + Period + bloom2 + Total_Phosphorus_ug + Total_Nitrogen_mg + Dissolved_P + Dissolved_N +
Cumulative_precipitation_t1_t7_mm + Mean_temperature_t0_t7 + Micro.Abundance, data=env_vars)
# Forward selection using ordistep
ordistep(rda(sp.asv.rm ~., data=env_vars), direction="forward", pstep=1000, R2scop=TRUE) #R2scope only accepts models with lower adjusted R2
# db-RDA
dbRDA2 <- capscale(sp.asv.rm ~ ., data=env_vars, distance = "bray")
dbRDA2
plot(dbRDA2)
summary(dbRDA2)
screeplot(dbRDA2)
#why use CAP: it allows for any dissimilarity measure & takes into account any correlation structure among the response variables (i.e. this is why its called ‘canonical’)
gen.imp <- t(vir_abund_helli)
env <- meta
#RDA requires complete data frames (i.e., no missing data)
sum(is.na(gen.imp))
#look at the structure of the df
str(env)
#confirm that asv and enviro data are in the same order
identical(rownames(gen.imp), rownames(env))
#RDA is a regression-based method, and so can be subject to problems when using highly correlated predictors (Dormann et al., 2013). Generally, the |r| > 0.7 “rule of thumb” is a good guideline for removing correlated predictors. We will also check for multicollinearity using Variance Inflation Factors (VIF)
#Variable reduction should be guided by an ecological interpretation of the relevance of possible predictors. Here, we use the function pairs.panels to visualize correlations among our predictors. Correlation coefficients are in the upper right diagonal, with their size scaled to their |r|. The lower left shows scatter plots, while the diagonal shows histograms of the data
library(psych)
pairs.panels(env[,7:15], scale=T)
#remove total phsophorus and dissolved p
pred <- subset(env, select=-c(Dissolved_P, Total_Phosphorus_ug))
#rda
vir.rda <- rda(formula=gen.imp ~ Months + Years + Site + Period + bloom2 +
Total_Nitrogen_mg + Dissolved_N + Cumulative_precipitation_t1_t7_mm + Mean_temperature_t0_t7+Cyano.Abundance, data = pred, scale=T)
# *no RDA* for site and time. use PERMANOVA (combining adonis2 and betadisper)
##### MRT #####
# library(devtools)
# devtools::install_github("cran/mvpart")
# install_github("cran/MVPARTwrap", force=TRUE)
library(MVPARTwrap)
vir_helli #from phyloseq
vir_helli2 #from decostand
names(env)
#remove catagorical data from env (do RDA without sites and time -- see PERMANOVA (far) below)
str(env)
mrt_vars <- env[,(colnames(env) %in% c("Months", "Years", "Site", "Period"))] #test with nico
dim(mrt_vars)
env_vars <- mrt_vars
colnames(env_vars)
#rm NAs
env_keep <- env_vars[complete.cases(env_vars), ]
env_keep %>% dplyr::glimpse()
summary(env_keep)
#### Remove viral asvs that are not present (due to removal of NA from env vars)
sp.asv <- t(vir_helli)
sp.asv <- vir_helli2
#rm sample rows that are not present in env_keep
vir.rm <- sp.asv[rownames(sp.asv) %in% rownames(env_keep),]
# species and enviro data without NAs needed for RDA
head(vir.rm)
dim(vir.rm)
head(env_keep)
dim(env_keep)
#use all data to identify which group accurately predicts
mrt <- mvpart(as.matrix(vir.rm) ~ ., env_keep,
legend=F,
margin=0.01,
cp=0,
xv = "pick",
xval=nrow(vir.rm),
xvmult=100,
which = 4)
mrt_month <- mvpart(vir.rm~ Months, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.975
mrt_year <- mvpart(as.matrix(vir.rm)~ Years, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.962
mrt_site <- mvpart(as.matrix(vir.rm)~ Site, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
#error = 0.98
mrt_period <- mvpart(as.matrix(vir.rm)~ Period, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
#error = 0.976
mrt_bloom <- mvpart(as.matrix(vir.rm)~ bloom2, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.983
mrt_totP <- mvpart(as.matrix(vir.rm)~ Total_Phosphorus_ug, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.884
mrt_totN <- mvpart(as.matrix(vir.rm)~ Total_Nitrogen_mg, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
#error = 0.958
mrt_DisP <- mvpart(as.matrix(vir.rm)~ Dissolved_P, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.726
(DisP_R2 <- RsquareAdj(mrt_DisP)$adj.r.squared)
rpart.pca(mrt_DisP)
mrt_DisN <- mvpart(as.matrix(vir.rm)~ Dissolved_N, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.835
mrt_precip <- mvpart(as.matrix(vir.rm)~ Cumulative_precipitation_t1_t7_mm, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.889
mrt_temp <- mvpart(as.matrix(vir.rm)~ Mean_temperature_t0_t7, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.962
mrt_cyano <- mvpart(as.matrix(vir.rm)~ cyano.sum.helli, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
#error = 0.836
mrt_doli <- mvpart(as.matrix(vir.rm)~ doli.sum.helli, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# error = 0.937
mrt_micro <- mvpart(as.matrix(vir.rm)~ micro.sum.helli, env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
#error = 0.947
#significant vars based off of MRT scores
mrt_signif <- mvpart(as.matrix(vir.rm) ~ cyano.sum.helli+Cumulative_precipitation_t1_t7_mm+Dissolved_N+Dissolved_P+Total_Phosphorus_ug,
env_keep,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
# significant vars based off of ordi2step
signif_env
mrt_ordi <- mvpart(as.matrix(vir.rm) ~ Years+Months+Dissolved_N+Period+Dissolved_P, signif_env,
legend=T, margin=0.01, cp=0, xv="pick",
xval=nrow(vir.rm), xvmult=100, which=4, big.pts=T, bars=F)
#view tree details
printcp(tree)
str(tree)
summary(tree)
printcp(tree_month)
# obtain the path to the leaf nodes
tree$frame
leafnodeRows <- grepl("leaf", tree$frame$var)
nodevals <- as.numeric(rownames(tree$frame)[leafnodeRows])
rules <- path.rpart(tree, nodevals)
rulesdf <- do.call(
"rbind",
lapply(rules, function(x) paste(x, collapse = " -AND- "))
)
rulesdf <- data.frame(
nodeNumber = rownames(rulesdf),
rule = rulesdf[, 1],
stringsAsFactors = FALSE
)
rulesdf
#PCA
rpart.pca(tree, interact=T, wgt.ave = T, add.tree=T, speclabs = F)