Multivariate_Ordinations.rmd

---
title: "Multivariate Fish Analysis"
author: Nick Toimieri
date: 
output: 
  html_document:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    fig_caption: true
fontsize: 11pt

---

```{r SETUP, include=FALSE}
# Initial set-up: project directories used throughout the document.
#
# The workspace is deliberately NOT cleared with rm(list = ls()): doing so
# deletes every object in the environment the document is evaluated in,
# including anything the caller supplied when rendering it.

##### knitr stuff and libraries ####

# All locations hang off the directory the document is run from. Each path
# keeps its trailing "/" because file names are appended with paste0() below.
HomeFile <- getwd()
Fig_Loc <- paste0(HomeFile, "/Figures/")
Data_Loc <- paste0(HomeFile, "/Data/")
Results_Loc <- paste0(HomeFile, "/Results/")
Other_Files <- paste0(HomeFile, "/Other Files/")

# tools in use 
library(knitr)
# library(tidyr)
# library(dplyr)
library(tidyverse)
library(stringr)
library(tinytex)
library(RColorBrewer)
# display.brewer.all(colorblindFriendly = TRUE)
library(readxl)

# stats packages etc
library(vegan)
library(BiodiversityR)
library(pracma)
library(factoextra)

# Default chunk options for the rest of the document
knitr::opts_chunk$set(
  fig.path   = Fig_Loc,
  cache.path = 'cache/graphics-',
  echo       = FALSE,
  error      = FALSE,
  include    = TRUE,
  dev        = 'png',
  dpi        = 300,
  warning    = FALSE,
  # out.width = '100%',
  fig.align  = 'center'
)
# opts_knit$set(eval.after = "fig.cap")

# Record when the document was last run. The first two ":" in the time stamp
# are swapped for "." one at a time, then the result is written to Last_Run.txt.
Last_Run0 <- Sys.time()
Last_Run <- str_replace(str_replace(Last_Run0, ":", "."), ":", ".")
capture.output(Last_Run, file = 'Last_Run.txt')

# base-graphics defaults: 10 pt text, no extra character expansion
par(ps = 10, cex = 1)

```


```{r Triggers_and_Settings}
# FALSE re-uses the previously processed data file (faster);
# TRUE rebuilds it from the raw files.
process.data <- FALSE

# FISH settings ####

# groups shown in the univariate plots
fish.include <- c(
  "OPEL", "HEXA", "EMBI", "AUFL", "BAIT",
  "SECA", "SCMA", "SENE", "SEFL", "SEME", "SEMY",
  "SECAy", "SEPIy", "SEMEy", "SEBYTy", "RYOY"
)

# groups used in the multivariate analyses
cap.fish <- c(
  "OPEL", "HEXA", "EMBI", "AUFL", "BAIT",
  "SECA", "SCMA", "SENE", "SEFL", "SEME"
)

# rockfish at or below this size (cm) are re-classed as YOY when
# correcting some files -- higher?
min_rockfish_size <- 6

# include 2015 or not
Include.2015 <- FALSE

# Delete low vis transects: NA uses all the data, otherwise the minimum
# visibility a transect must have to be kept
Min.Vis <- 2.0

# common transform for all multivariate analyses:
# 'none', 'log', 'sqrt' , '4th-root'
data.transform <- 'none'

# INVERT SETTINGS ####

# every invertebrate grouping
invert.spp <- c(
  "bivalve", "blood_star", "brood_sea_star", "Cal_cuc",
  "chiton", "crabs", "green_urchin", "hermit_crabs",
  "kelp_crab", "large_anemone", "large_barnacle", "large_nudibranch",
  "large_sea_star", "large_seastar", "leather_star", "med_nudibranch",
  "med_sea_star", "med_seastar", "mobile_invert", "no_org",
  "octopus", "orange_cucumber", "P_ochraceous", "purple_urchin",
  "red_urchin", "sea_cucumber", "sea_star_YOY", "shelled_gastropod",
  "shelled_mollusk", "small_anemone", "sponge", "tunicate"
)

# groupings used in the ordination: everything above except
# octopus, mobile_invert and no_org (order is preserved)
cap.inverts <- setdiff(invert.spp, c("mobile_invert", "no_org", "octopus"))

# transform applied to the invert data: 'none', 'log', 'sqrt' , '4th-root'
invert.transform <- '4th-root'

## Common Plotting Settings #####

# one plotting symbol per survey year; 2015 (symbol 8) is only listed
# when that year is part of the analysis
if (Include.2015 == FALSE) {
  years <- 2016:2019
  PCH <- c(21, 22, 24, 25)
} else {
  years <- 2015:2019
  PCH <- c(8, 21, 22, 24, 25)
}
year.pch <- data.frame(cbind(years, PCH))

# one colour per site; 'site' ends up as a look-up table with columns site, col
site <- c("Neah Bay", "Tatoosh Island", "Cape Alava", "Cape Johnson", "Destruction Island")
col <- brewer.pal(n = 5, name = "Dark2")

site <- data.frame(cbind(site, col))
colnames(site) <- c('site', 'col')

#########################

# sentences quoted in the text to document the settings in force
year.statement <- if (Include.2015 == TRUE) {
  "NOTE: These analyses include 2015."
} else {
  "NOTE: These analyses DO NOT include 2015."
}

vis.statement <- if (is.na(Min.Vis)) {
  "NOTE: The data were not corrected for visibility."
} else {
  paste0("NOTE: Transects with visibility < ", Min.Vis, " m were deleted from the analysis.")
}

###########################
```

# FUNCTIONS  

```{r Functions}
# Summary statistics of one response variable by one or more grouping columns.
#
# Arguments:
#   form1     - formula written as text, e.g. "Y ~ SITE + YEAR": the response
#               to the left of "~", grouping columns joined by "+" to the
#               right. No spaces are allowed inside column names.
#   data.file - data frame holding the response and the grouping columns.
#
# Returns a data frame with one row per combination of the grouping columns:
# the grouping columns followed by 'mean', 'var', 'n', 'sd' and 'se'.

Sum_Stats <- function( form1 , data.file ){

  stat.form <- as.formula( form1 )

  mn <- aggregate( stat.form , data = data.file , FUN = mean )
  s2 <- aggregate( stat.form , data = data.file , FUN = var )
  # counted through the same formula so that n is based on exactly the rows
  # that went into the mean and the variance
  n  <- aggregate( stat.form , data = data.file , FUN = length )

  # grouping column names: split the right-hand side on "+" and strip spaces
  rhs <- strsplit( form1 , "~" , fixed = TRUE )[[1]][2]
  BY  <- gsub( " " , "" , strsplit( rhs , "+" , fixed = TRUE )[[1]] , fixed = TRUE )

  a <- merge( mn , s2 , by = BY , all = TRUE )
  z <- merge( a , n , by = BY , all = TRUE )
  # after the two merges the last three columns are, in order, the mean,
  # the variance and the count
  colnames( z )[(ncol(z)-2):ncol(z)] <- c( 'mean' , 'var' , 'n' )

  z$sd <- sqrt(z$var)
  # standard error of the mean is sd / sqrt(n); var() already uses the
  # n - 1 divisor, so dividing by sqrt(n - 1) would overstate the error
  z$se <- z$sd / sqrt(z$n)

  return( z )

}

# Add plotting info (colour, symbol, fill) to a data file.
# Column names are specific to these files: SITE and YEAR must be listed in
# cnames, and ZONE is used when it is listed.
#
# Arguments:
#   data.file - data frame to annotate.
#   cnames    - names of the identifying columns (e.g. 'YEAR','SITE','ZONE').
#   site.info - look-up table with columns 'site' and 'col'; defaults to the
#               global 'site' table.
#   year.info - look-up table with columns 'years' and 'PCH'; defaults to the
#               global 'year.pch' table.
#
# Returns the identifying columns, then 'col', 'pch' and (when ZONE is one of
# cnames) 'bgcol', then every remaining column of data.file. Rows are returned
# one-for-one in their original order.

Add_Info <- function(data.file, cnames, site.info = site, year.info = year.pch){
  df_info <- data.file[ , cnames, drop = FALSE]
  df_info$col <- site.info$col[ match( df_info$SITE , site.info$site ) ]
  df_info$pch <- year.info$PCH[ match( df_info$YEAR , year.info$years ) ]

  # fill colour: white for the 5-m zone, the site colour otherwise
  if( length( grep("ZONE", cnames) ) == 1 ){
    df_info$bgcol <- ifelse( df_info$ZONE == 5 , 'white' , df_info$col )
  }

  # The new columns are bound on row by row. Merging the table with itself on
  # cnames would multiply rows whenever cnames does not identify a row
  # uniquely (k rows sharing a key would come back as k * k rows).
  other.cols <- setdiff( colnames(data.file) , cnames )
  cbind( df_info , data.file[ , other.cols, drop = FALSE] )
}

### Plot CAP ordinations/summarize data
#
# Plots group centroids of the first two ordination axes with +/- 1 s.e. bars
# on both axes and, optionally, the species scores.
#
# Arguments:
#   data.file    - data frame with the site scores in columns <Yform>1 and
#                  <Yform>2 plus every column named in Xform.
#   ord.file     - ordination object; scores(ord.file)$species supplies the
#                  species scores.
#   Yform        - axis-name prefix, e.g. 'CAP' for columns CAP1 and CAP2.
#   Xform        - grouping part of the summary formula as text, tilde
#                  included, e.g. ' ~ SITE + YEAR + col + pch'. 'col' and
#                  'pch' must be among the terms: they are used for drawing.
#   Xlim, Ylim   - limits of the centroid plot; if either is NA plot() chooses.
#   Xlim2, Ylim2 - limits of the separate species plot; if either is NA
#                  plot() chooses.
#   Xlab, Ylab   - axis labels.
#   min.score    - a species is labelled only if its absolute score on axis 1
#                  or axis 2 is greater than this value.
#   plot.species - TRUE overlays species labels on the centroid plot.
#   spp.separate - TRUE draws the species labels in a new plot of their own.
#
# Called for its plots; returns NULL invisibly.

Plot_Ordination <- function( data.file , ord.file, Yform, Xform, 
                             Xlim=NA, Ylim=NA, Xlim2 = NA, Ylim2=NA, Xlab = "Axis 1", Ylab = "Axis 2", 
                             min.score = 0.0, plot.species = TRUE, spp.separate = FALSE){

  # dotted guides through zero across the current plot region
  zero_guides <- function(){
    usr <- par()$usr
    segments( usr[1], 0, usr[2], 0, lty = 'dotted', lwd = 0.5)
    segments( 0, usr[3], 0, usr[4], lty = 'dotted', lwd = 0.5)
  }

  # species scores passing min.score; drop = FALSE keeps a matrix even when
  # a single species passes, so the column indexing below still works
  kept_species <- function(){
    spp_scores <- scores(ord.file)$species
    keep <- abs(spp_scores[,1]) > min.score | abs(spp_scores[,2]) > min.score
    spp_scores[ keep , , drop = FALSE]
  }

  # centroids and standard errors of axes 1 and 2 by group
  df_1 <- Sum_Stats( paste0(Yform, '1', Xform) , data.file )
  df_2 <- Sum_Stats( paste0(Yform, '2', Xform) , data.file )

  if( is.na(Xlim[1]) || is.na(Ylim[1]) ){
    plot( df_1$mean , df_2$mean, pch = df_1$pch , col = df_1$col, bg = df_1$col ,
          xlab = Xlab, ylab = Ylab )
  }else{
    plot( df_1$mean , df_2$mean, pch = df_1$pch , col = df_1$col, bg = df_1$col ,
          xlim = Xlim, ylim = Ylim , xlab = Xlab, ylab = Ylab )
  }

  # error bars (zero-length arrow heads, i.e. plain lines)
  arrows( df_1$mean+df_1$se , df_2$mean,  df_1$mean-df_1$se , df_2$mean, col = df_1$col, length = 0)
  arrows( df_1$mean, df_2$mean+df_2$se , df_1$mean , df_2$mean-df_2$se, col = df_1$col, length = 0)
  zero_guides()

  # species scores on top of the centroids
  if( plot.species == TRUE ){
    spp_scores1 <- kept_species()
    text( spp_scores1[,1] , spp_scores1[,2], rownames(spp_scores1), cex = 0.8 , col = 'red')
  }

  # species scores in their own plot
  if( spp.separate == TRUE ){
    spp_scores1 <- kept_species()
    if( is.na(Xlim2[1]) || is.na(Ylim2[1]) ){
      plot( spp_scores1[,1] , spp_scores1[,2], pch = "" , xlab = Xlab, ylab = Ylab )
    }else{
      plot( spp_scores1[,1] , spp_scores1[,2], pch = "" , xlim = Xlim2, ylim = Ylim2 ,
            xlab = Xlab, ylab = Ylab )
    }
    text( spp_scores1[,1] , spp_scores1[,2], rownames(spp_scores1), cex = 0.8 , col = 'red')
    zero_guides()
  }

  invisible(NULL)
}

```

# FISH ORDINATION

This is an up-dated fish file with only the "final" fish multivariate analyses for the **Flagstone MS**.  

## Import and manipulate data

For data processing: 

* Corrected some spp codes
* Make a 'group' variable for lumping taxa/species
* Combined 2015 and 2016-2019 data
  * 2015 data are not necessarily used in analyses. See below.
  * 2015 data lack visibility estimates, so are usually deleted.
* No depth data for the 2015 data. 
  * I took the average of the quadrats.
  * I assigned transects with an average depth of >20 ft to the 10-m group.

`r vis.statement`  

`r year.statement`

DATA TRANSFORM: `r data.transform`

I set the data transform to 'none' because there wasn't a lot of variation in species density among groups (some, but not a lot). Leaving the data untransformed emphasizes the more abundant species, which I think is OK.

```{r Import_Data , include=FALSE }
#

# species look-up table; columns code, group, spp and com_name are used below
spp_code <- data.frame(read.csv("spp_codes.csv", header = TRUE))

if(process.data == TRUE){

fishx <- data.frame(read.csv(paste0(Data_Loc, "NWFSC_FISH_ALLYEARS_data_2019.csv"), header = TRUE))

# Bring in 2015 data and combine
df1 <- data.frame(read_excel( paste0(Data_Loc, "2015_OCNMSDataComplete_standardized_122116.xlsx"),
                              sheet = "Master data"))

# use quadrats to apply depth to transects and get zones:
# mean of the recorded depths for each site x transect x date
id <- paste(df1$Site, df1$Transect, df1$Date, sep='_')
x <- data.frame(id = id, depth.ft = as.numeric(df1$Depth.ft))
depth <- aggregate( depth.ft ~ id, data = x, FUN = mean, na.action = na.omit)
colnames(depth) <- c('id','ft')

# get just transects
df2 <- df1[ df1$data.type == 'swath' ,]
# get just fish
df3 <- df2[ df2$PISCO.Classcode %in% spp_code$code , ]
# year = first four characters of the date; stored as a number so that YEAR
# has the same type whether the data are processed here or re-read from csv
df3$YEAR <- as.numeric(substring(df3$Date, 1 , 4))

id3 <- paste(df3$Site, df3$Transect, df3$Date, sep='_')
df3$Depth.ft <- depth$ft[ match(id3, depth$id)]

# subset columns to match fish0
df4 <- df3[ , c('YEAR', 'Site','Observer','Depth.ft', 'Transect', 'PISCO.Classcode' , 'Count', 'Size.cm')]

# change to fish0 colnames
colnames(df4) <- c('YEAR', 'SITE', 'OBSERVER', 'DEPTH_FT', 'TRANSECT', 'SPECIES' , 'QUANTITY', 'Size_Min')
# only one size is available for 2015, so it serves as both min and max
df4$Size_Max <- df4$Size_Min

# populate data frame for 2015 that matches 2016-2019:
# all-NA frame with the 2016-2019 columns, then copy over what 2015 has
dfx <- data.frame(array(NA, dim=c(nrow(df4),ncol(fishx))))
colnames(dfx) <- colnames(fishx)
x <- colnames(df4)

for(nm in x){
  dfx[[nm]] <- df4[[nm]]
}
# correct zone for 2015: mean depth > 20 ft goes to the 10-m zone, else 5-m
dfx$ZONE <- ifelse(dfx$DEPTH_FT > 20, 10, 5)

fish_all <- rbind(dfx,fishx)

# transfer some names
fish_all$group <- spp_code$group[ match( fish_all$SPECIES , spp_code$code)]
fish_all$spp <- spp_code$spp[ match( fish_all$SPECIES , spp_code$code)]
fish_all$com_name <- spp_code$com_name[ match( fish_all$SPECIES , spp_code$code)]

# re-class small rockfish recorded under an adult code as YOY

fish_all$Size_Max <- as.numeric(fish_all$Size_Max)

# adult group -> YOY group
yoy.codes <- c(SECA = "SECAy", SEPI = "SEPIy", SEME = "SEMEy", SEFL = "SEBYTy", SEMA = "SEMAy")

# Fish without a recorded size keep their group. Comparing a missing size
# directly would give NA and wipe out the group of those rows.
is.small <- !is.na(fish_all$Size_Max) & fish_all$Size_Max <= min_rockfish_size
is.yoy <- is.small & fish_all$group %in% names(yoy.codes)
fish_all$group[is.yoy] <- unname(yoy.codes[ as.character(fish_all$group[is.yoy]) ])

write.csv(fish_all , paste0(Data_Loc,"Data_Fish_All_2015_2019.csv") , row.names = FALSE)

} # END IMPORT AND PROCESS DATA

if(process.data == FALSE){ # bring in data file again
  # just to get sites
  fishx <- data.frame(read.csv(paste0(Data_Loc, "NWFSC_FISH_ALLYEARS_data_2019.csv"), header = TRUE))
  # data for analysis
  fish_all <- data.frame(read.csv(paste0(Data_Loc,"Data_Fish_All_2015_2019.csv"), header = TRUE))
}

# keep the groups and sites used in this document
fish0 <- fish_all[ fish_all$group %in% fish.include, ]
fish0 <- fish0[ fish0$SITE %in% site[,1], ]

# Rows with a missing visibility or year are dropped explicitly. Indexing
# with a logical NA would return a row filled with NA instead of removing it.
if( !is.na(Min.Vis) ){
  fish0 <- fish0[ !is.na(fish0$VIS_M) & fish0$VIS_M >= Min.Vis , ]
}
if( Include.2015 == FALSE ){
  fish0 <- fish0[ !is.na(fish0$YEAR) & fish0$YEAR != 2015 , ]
}

```

NOTES

* Some rockfishes were recorded in the 'adult' category (eg SECA instead of SECAy) but were small.  
* Set the minimum size for rockfishes to `r min_rockfish_size` cm. Fish <= `r min_rockfish_size` cm were classified as YOY.
* Maintained their original designation in SPECIES column.  New designation in the 'group' column.

Because there were a lot of zeros for some species, we lumped some species or taxa into higher taxonomic categories. The species and the higher groupings used in the analyses are:

```{r Species_Included}


# Look-up rows whose group is among the included fish groups. Columns 1, 2
# and 4 of spp_code are kept and relabelled; the table prints when the
# chunk runs.
spp.inlcluded <- spp_code[ is.element(spp_code$group, fish.include) , c(1, 2, 4) ]
names(spp.inlcluded) <- c('Spp_Code', 'Spp_Group', 'Common_Name')
spp.inlcluded


```

This list may have been sub-setted again. See below.

### Year x Site x Depth  

No text here. Just calculating averages by year x site x depth and outputting data. 

```{r Group_Year_Site_Depth , include = FALSE}
# mean quantity per year x site x zone x group (long format)
fish0$QUANTITY <- as.numeric(fish0$QUANTITY)

l.fish.ysz <- aggregate( QUANTITY ~ YEAR + SITE + ZONE + group ,
                         data = fish0 ,
                         FUN = mean )

# wide format: one column per group; combinations with no record become 0
w.fish.ysz <- spread( l.fish.ysz , key = group , value = QUANTITY )
w.fish.ysz[ is.na(w.fish.ysz) ] <- 0

# data.file = w.fish.ysz
# cnames = c('YEAR','SITE','ZONE')
# cnames = c('YEAR','SITE')

# attach the plotting columns
w.fish.ysz <- Add_Info( data.file = w.fish.ysz , cnames = c('YEAR','SITE','ZONE') )

# # for plotting
# # fish_info = w.fish.ysz[,c('YEAR','SITE','ZONE')]
# fish_info$color = site$col[match(fish_info$SITE, site$site)]
# fish_info$pch = ifelse(fish_info$ZONE == 5, 19, 21)
# fish_info$id = paste(fish_info$SITE, fish_info$YEAR, fish_info$ZONE, sep="_")
# w.fish.ysz = cbind(fish_info[,c('color','pch','id')] , w.fish.ysz)

# write both versions to the data folder
write.csv( l.fish.ysz , file = paste0(Data_Loc, "D_Fish_Group_Site_Year_Zone_long.csv") , row.names = FALSE )
write.csv( w.fish.ysz , file = paste0(Data_Loc, "D_Fish_Group_Site_Year_Zone_wide.csv") , row.names = FALSE )

```

### Year x Site

No text here. Just calculating averages by year x site and outputting data.  

```{r Group_Year_Site , include = FALSE}
# mean quantity per year x site x group (long format)
l.fish.ys <- aggregate( QUANTITY ~ YEAR + SITE + group ,
                        data = fish0 ,
                        FUN = mean )

# wide format: one column per group; combinations with no record become 0
w.fish.ys <- spread( l.fish.ys , key = group , value = QUANTITY )
w.fish.ys[ is.na(w.fish.ys) ] <- 0

w.fish.ys <- Add_Info( data.file = w.fish.ys , cnames = c('YEAR','SITE') )

# extra plotting columns: site colour and a SITE_YEAR label, placed in front
fish_info <- w.fish.ys[ , c('YEAR','SITE') ]
fish_info$color <- site$col[ match(fish_info$SITE, site$site) ]
fish_info$id <- with( fish_info , paste(SITE, YEAR , sep = "_") )
w.fish.ys <- cbind( fish_info[ , c('color', 'id') ] , w.fish.ys )

# write both versions to the data folder
write.csv( l.fish.ys , file = paste0(Data_Loc, "D_Fish_Group_Site_Year_long.csv") , row.names = FALSE )
write.csv( w.fish.ys , file = paste0(Data_Loc, "D_Fish_Group_Site_Year_wide.csv") , row.names = FALSE )

```

## Constrained Ordination: Year + Site + Depth (zone)

The ordinations here use transect-level information in constrained ordinations.  The PerMANOVA uses Year, Site, and Depth as factors.  The ordination is an RDA-type approach to Canonical Analysis of Principal Coordinates (not exactly the same) using the `capscale` function from the 'vegan' package.  For clarity, I present centroids and s.e. in the figures, not individual transects.

```{r Set_Fish_Transform}
# spread to WIDE
# transform data

# one row per year x site x zone x transect, one column per group
l.fish  <- aggregate( QUANTITY ~ YEAR + SITE + ZONE + TRANSECT + group , data = fish0 , FUN = mean )
# change to wide format
w.fish1 <- spread(l.fish , group , QUANTITY )
w.fish1[is.na(w.fish1)] <- 0
rownames(w.fish1) <- paste0("Tran_", seq_len(nrow(w.fish1)))

# remove transects with zero fish (of the analysed groups)
x <- rowSums(w.fish1[ , cap.fish])
w.fish <- w.fish1[ x != 0 , ]

# TRANSECT is part of the key so that every row is identified uniquely.
# With only YEAR, SITE and ZONE the transects of a group share one key.
w.fish <- Add_Info(w.fish, c('YEAR', 'SITE', 'ZONE', 'TRANSECT'))

# community matrix under the chosen transform; an unrecognised setting stops
# here instead of leaving df_fish undefined
df_fish <- switch( data.transform,
                   "none"     = w.fish[, cap.fish],
                   "sqrt"     = w.fish[, cap.fish]^(1/2),
                   "4th-root" = w.fish[, cap.fish]^(1/4),
                   "log"      = log(w.fish[, cap.fish]+1),
                   stop("data.transform must be 'none', 'log', 'sqrt' or '4th-root', not '",
                        data.transform, "'", call. = FALSE) )

# sentence for the text: YOY groups are in fish.include but not in cap.fish,
# so equal lengths mean nothing was left out of the analysis
if(length(cap.fish) == length(fish.include)){
  YOY.statement <- "This analysis includes rockfish YOY."
}else{
  YOY.statement <- "This analysis does NOT include rockfish YOY.  We deleted YOY from the analyses because they were highly variable and often appeared in only one year. Univariate plots do show YOY."
}


```

Note: `r YOY.statement`

DATA TRANSFORM: `r data.transform`

Taxa Included: `r cap.fish`

### Permanova  

In the PerMANOVA everything is significant.  I think this is OK.  Things are messy. I don't actually think it is necessary for our paper.  

The associated ordination is more clear. This ordination uses site x year x depth as groups and transects as replicates.  I have, however, calculated the centroids for plotting.

FYI removing BAIT doesn't do much.

Bray-Curtis dissimilarity.

```{r PerManova_Transect}

# PerMANOVA on the transect-level community matrix (Bray-Curtis).
# SITE, ZONE and YEAR are converted to factors before fitting. YEAR and ZONE
# hold numbers, and a numeric term in the formula is fitted as a continuous
# covariate rather than as a grouping factor.
# NOTE(review): adonis() is, as far as I know, deprecated in recent vegan
# releases in favour of adonis2() -- confirm against the installed version.
# It is kept because adonis2() returns a different object (no $aov.tab).
pm.factors <- data.frame( SITE = as.factor(w.fish$SITE),
                          ZONE = as.factor(w.fish$ZONE),
                          YEAR = as.factor(w.fish$YEAR) )

pm1 <- adonis( df_fish ~ SITE*ZONE*YEAR, data = pm.factors, method = 'bray')

# summary(pm1)
pm1$aov.tab

# keep the fitted object and a printed copy
saveRDS(pm1, file = paste0(Results_Loc,'Fish_PerMANOVA_transect_all.rds'))
capture.output(pm1, file =  paste0(Results_Loc,'Fish_PerMANOVA_transect_all.txt'))

```

### capscale - site x year x depth

To match PerMANOVA. RDA type analysis with permutations.  This analysis has the same Factor groupings as the PerMANOVA.

```{r capscale1 , include = FALSE}

# one constraint level per site x zone x year combination
CAPid <- as.factor(paste(w.fish$SITE , w.fish$ZONE , w.fish$YEAR, sep = "_") )
CAPfish <- data.frame(cbind(CAPid, w.fish))
# fish.start = grep('AUFL', colnames(CAPfish))
# df_fish <- (CAPfish[, fish.start:ncol(CAPfish) ] )^(1/4)
df_fish <- as.matrix(df_fish)

# constrained ordination on Bray-Curtis distances ('distance' spelled out
# rather than relying on partial matching of 'dist')
cap1 <- capscale( df_fish  ~  as.factor(CAPid) , data = CAPfish , distance = 'bray' )
# cap2

# site (transect) scores bound to the transect information
cap1_scores <- scores(cap1)
cap1_df <- data.frame(cbind(  w.fish , cap1_scores$sites ) )

# The path must be passed as file =. An unnamed second argument to
# capture.output() is treated as one more thing to print and no file is written.
capture.output( cap1, file = paste0(Results_Loc,"R_capscale_site_depth_year.txt") )
saveRDS( cap1, paste0(Results_Loc,"R_capscale_site_depth_year.rds") )
# the scores table goes out as a real csv
write.csv( cap1_df , paste0(Data_Loc, "D_Fish_CAP_YSD.csv") , row.names = FALSE )
```


```{r Fish_capscale1_plot_YSD , fig.width=6, fig.height= 6, fig.cap="Ordination plot for CAP based on site x year x depth.  Error bars are +/- 1.0 s.e. Lower-right pane zooms in on the species scores."}

# 2 x 2 layout of square panels
par( mfrow = c(2,2), pty = 's' , mar = c(4,4,1,1) )

# panel 1: every transect, coloured by site
Xlim <- c( -3 , 3 )
Ylim <- c( -3 , 3 )

plot(cap1_df$CAP1 , cap1_df$CAP2, pch = 19 , col = cap1_df$col, xlim = Xlim, ylim = Ylim )

# panel 2: site x year centroids with error bars, no species labels
xy <- c(-2,2)
Plot_Ordination( data.file = cap1_df , ord.file = cap1,
                 Yform = 'CAP', Xform =  ' ~ SITE + YEAR + col + pch',
                 Xlim = xy, Ylim = xy,
                 plot.species = FALSE )

# panels 3 and 4: species scores, first on wide axes (+/- 4) and then
# zoomed in (+/- 0.5), each with dotted guides through zero
spp_scores <- data.frame(cap1_scores$species)
for( half.range in c(4, 0.5) ){
  plot(spp_scores[,1] , spp_scores[,2], pch="", xlim = c(-half.range, half.range), ylim = c(-half.range, half.range) )
  text(spp_scores[,1] , spp_scores[,2], rownames(spp_scores), cex=0.8)
  segments( par()$usr[1],0,par()$usr[2],0, lty = 'dotted', lwd = 0.5)
  segments( 0, par()$usr[3],0,par()$usr[4], lty = 'dotted', lwd = 0.5)
}

# legends are added to the last panel drawn
# plot(1:10, 1:10, pch="", xlab=NA, ylab=NA, xaxt='n', yaxt='n', bty='n')
legend( 'bottomright' , legend = site$site, col = site$col , pch = 19, bty = 'n', cex = 0.8 )
legend( 'bottomleft' , legend = years, pch = PCH, bty = 'n', cex = 0.8 )
```


```{r Fish_capscale1_plot_YSD_1pane , fig.width=5, fig.height= 5, fig.cap="Ordination plot for CAP based on site x year x depth.  Error bars are +/- 1.0 s.e. BAIT is not shown but is to the lower right outside the current axes limits."}

# centroids, error bars and species labels on a single +/- 2 panel
xy <- c(-2, 2)
Plot_Ordination( cap1_df , cap1 ,
                 Yform = 'CAP' ,
                 Xform = ' ~ SITE + YEAR + col + pch' ,
                 Xlim  = xy ,
                 Ylim  = xy )

# keys: colour = site, symbol = year
legend( 'topleft' , legend = site$site , col = site$col , pch = 19 , bty = 'n' , cex = 0.8 )
legend( 'topright' , legend = years , pch = PCH , bty = 'n' , cex = 0.8 )
```

Same data replotted on one pane. We can see some clear site differences:

* Cape Alava and Cape Johnson do not overlap
* Tatoosh and Neah Bay are intermediate between the former two
* Destruction is all over the place

# INVERT ORDINATION

```{r Inverts_cap_data_manip}

# (carried over from the fish chunk) sentence describing whether YOY groups
# are part of the fish analysis
if(length(cap.fish) == length(fish.include)){
  YOY.statement <- "This analysis includes rockfish YOY."
}else{
  YOY.statement <- "This analysis does NOT include rockfish YOY.  We deleted YOY from the analyses because they were highly variable and often appeared in only one year. Univariate plots do show YOY."
}

# transect-level invert densities; header = TRUE belongs to read.csv()
# (given to data.frame() it only adds a column called 'header')
l.inverts <- data.frame(read.csv(paste0(Data_Loc, "invert_swath_transectlevel.csv"), header = TRUE))
# wide format: one row per year x site x zone x transect, one column per lump
w.inverts <- spread( l.inverts[, c('year','site','zone','transect','lump','dens_weighted')] , lump , dens_weighted )
colnames(w.inverts)[1:3] <- c("YEAR", "SITE" , "ZONE")
w.inverts[is.na(w.inverts)] <- 0
# 'transect' is part of the key so that every row is identified uniquely
w.inverts <- Add_Info( w.inverts , c("YEAR", "SITE" , "ZONE", "transect") )
if(Include.2015 == FALSE){ w.inverts <- w.inverts[ w.inverts$YEAR !=2015 , ] }

# drop zero rows: transects with none of the analysed groups are removed
# before the community matrix is built, as is done for the fish data
w.inverts <- w.inverts[ rowSums(w.inverts[, cap.inverts]) != 0 , ]

# community matrix under the chosen transform; an unrecognised setting stops
# here instead of leaving df_inverts undefined
df_inverts <- switch( invert.transform,
                      "none"     = w.inverts[, cap.inverts],
                      "sqrt"     = w.inverts[, cap.inverts]^(1/2),
                      "4th-root" = w.inverts[, cap.inverts]^(1/4),
                      "log"      = log(w.inverts[, cap.inverts]+1),
                      stop("invert.transform must be 'none', 'log', 'sqrt' or '4th-root', not '",
                           invert.transform, "'", call. = FALSE) )

```

```{r invert_capscale, include=FALSE}

# one constraint level per site x zone x year combination
CAPid <- as.factor(paste(w.inverts$SITE , w.inverts$ZONE , w.inverts$YEAR, sep = "_") )
CAPinverts <- data.frame(cbind(CAPid, w.inverts))
df_inverts <- as.matrix(df_inverts)

# constrained ordination on Bray-Curtis distances
cap2 <- capscale( df_inverts  ~  as.factor(CAPid) , data = CAPinverts , distance = 'bray' )

# site (transect) scores bound to the transect information
cap2_scores <- scores(cap2)
cap2_df <- data.frame(cbind(  w.inverts , cap2_scores$sites ) )

# Save the INVERT results (cap2, cap2_df) under the invert file names.
# The path must be passed as file =, otherwise capture.output() writes nothing.
capture.output( cap2, file = paste0(Results_Loc,"R_Inverts_capscale_site_depth_year.txt") )
saveRDS( cap2, paste0(Results_Loc,"R_Inverts_capscale_site_depth_year.rds") )
write.csv( cap2_df , paste0(Data_Loc, "D_Inverts_CAP_YSD.csv") , row.names = FALSE )

```

Groupings used in the Invert Ordination were: 

`r colnames(df_inverts)`

The invert ordination was pretty clear. Tatoosh, for example, was characterized by large numbers of all urchins. Destruction, Alava, and Tatoosh all had some *Pisaster* and leather stars.


```{r invert_capscale_plot_YSD , fig.width=6.5, fig.height= 5, fig.cap="Ordination plot for CAP based on site x year x depth.  Error bars are +/- 1.0 s.e."}
# two square panels side by side: centroids on the left, species on the right
par( pty = 's' , mfrow = c(1,2) , mar = c(4,3,1,1) , ps = 10 , cex = 1 )

xy  <- c(-2, 2)   # limits for the centroid panel
xy2 <- c(-6, 6)   # limits for the species panel

Plot_Ordination( cap2_df , cap2 ,
                 Yform = 'CAP' ,
                 Xform = ' ~ SITE + YEAR + col + pch' ,
                 Xlim  = xy ,  Ylim  = xy ,
                 Xlim2 = xy2 , Ylim2 = xy2 ,
                 plot.species = FALSE ,
                 spp.separate = TRUE )

# keys (added to the panel drawn last)
legend( 'bottomleft' , legend = site$site , col = site$col , pch = 19 , bty = 'n' , cex = 0.8 )
legend( 'topleft' , legend = years , pch = PCH , bty = 'n' , cex = 0.8 )

# some additional plotting: re-label the urchins and P_ochraceous in black
dfx <- cap2_scores$species[ c('purple_urchin', 'red_urchin', 'green_urchin','P_ochraceous') , ]
text( dfx[,1] , dfx[,2] , rownames(dfx) , col = 'black' , cex = 0.8 )
  
```

***  

# HABITAT VARIABLES


```{r Import_UPC_Data, include=FALSE}

# bring in data

# algae ####
# zone-level densities, spread to one column per classcode
algae_zone_long <- data.frame(read.csv( paste0(Data_Loc , "algae_swath_zonelevel_lump.csv"), header = TRUE))
algae_zone <- spread(algae_zone_long[, c('year','site','zone','classcode','dens_weighted')], classcode, dens_weighted)
colnames(algae_zone)[1:3] <- c('YEAR', 'SITE','ZONE')
algae_zone <- Add_Info( algae_zone , c('YEAR','SITE','ZONE'))
if(Include.2015 == FALSE){algae_zone <- algae_zone[ algae_zone$YEAR != 2015, ]}


# UPC ####
upc_zone_long <- data.frame(read.csv( paste0(Data_Loc , "UPC_Zone_Level_Summary.csv"), header = TRUE))

# convert to wide 
upc_zone <- spread(upc_zone_long[ ,c("SITE", "YEAR", "ZONE", "Functional_Group","MEAN")], Functional_Group, MEAN)

# every space and hyphen in a functional-group name becomes "_"
# (replacing only the first one leaves names with several of them half-converted)
colnames(upc_zone) <- stringr::str_replace_all(colnames(upc_zone), "[ -]" , "_")
upc_zone <- Add_Info(upc_zone , c("SITE",'YEAR','ZONE'))
# same 2015 rule as the other habitat tables
if(Include.2015 == FALSE){upc_zone <- upc_zone[ upc_zone$YEAR != 2015, ]}

# Substrate ####

rock_zone_long <- data.frame(read.csv( paste0(Data_Loc , "SUBSTRATE summary by YEAR-SITE-DEPTH 2016-2019.csv"), header = TRUE))
# convert to wide

rock_zone <- spread( rock_zone_long[, c("YEAR", "SITE", "ZONE", "CLASSCODE", "MEAN")] , CLASSCODE , MEAN)
rock_zone$pch <- year.pch$PCH[ match(rock_zone$YEAR , year.pch$years)]
rock_zone$col <- site$col[ match(rock_zone$SITE , site$site)]  
if(Include.2015 == FALSE){rock_zone <- rock_zone[ rock_zone$YEAR != 2015, ]}

# Relief diversity ####

relief.div_zone_long <- data.frame(read.csv( paste0(Data_Loc , "RELIEF_DIVERSITY_Zone_Level_Summary.csv"), header = TRUE))
if(Include.2015 == FALSE){relief.div_zone_long <- relief.div_zone_long[ relief.div_zone_long$YEAR != 2015, ]}

# Relief  ####

relief_zone_long <- data.frame(read.csv( paste0(Data_Loc , "RELIEF_Zone_Level_Summary.csv"), header = TRUE))
# convert to wide

relief_zone <- spread( relief_zone_long[, c("YEAR", "SITE", "ZONE", "CLASSCODE", "MEAN")] , CLASSCODE , MEAN)
if(Include.2015 == FALSE){relief_zone <- relief_zone[ relief_zone$YEAR != 2015, ]}

#### combine all substrate data for pca
# substrate + relief + relief diversity, matched on year x site x zone;
# the relief-diversity MEAN is the last column and is renamed rel.div

hab1 <- merge(rock_zone, relief_zone , by = c("YEAR", "SITE", "ZONE"))
hab <- merge(hab1, relief.div_zone_long[,c("YEAR", "SITE", "ZONE","MEAN")], by = c("YEAR", "SITE", "ZONE"))
colnames(hab)[ncol(hab)] <- "rel.div"

```

I used PCA for the habitat variable ordinations instead of a constrained ordination or nMDS in order to do data reduction and produce variables to include in future ordinations or other analyses (see below).  Constrained ordinations are not really data reductions, and nMDS doesn't really produce usable axes.  Thus PCA seemed the best approach.  

* PCA the various habitat variables to reduce dimensions
* Do separately for each variable type (substrate, UPC)
* Initial analysis by depth zone to maintain the most replicates
  * not necessarily interested in depth for the analyses
* Did NOT include kelp to maintain a separation between biotic and abiotic habitat

## Substrate PCA - YEAR x SITE x DEPTH means

PC1 mostly distinguishes between bedrock and boulder. Boulder areas had higher relief (>2m) and higher relief diversity. PC2 tends to distinguish between the two mid-complexity categories.

While there is some variation among years within sites (spread), it isn't bad.  Mostly, there are some obvious differences among sites with regard to habitat. 

This might be a good initial ordination figure just describing the sites and showing that the physical habitat differs among sites, and among depths to a lesser extent.

* Tatoosh and Destruction Island have more bedrock and higher relief than other areas. 
* Neah Bay is basically intermediate for everything.
* Cape Alava and Cape Johnson have more boulder and kind of intermediate relief.


```{r pca_rock , fig.width=5, fig.height = 5, fig.cap= "Substrate PCA. Open circles are 5-m depth zone; closed circles 10-m depth zone."}  

# substrate, relief and relief-diversity columns that enter the PCA
Vars <- c("BEDRK","BOULD","COB","SAND",">2m","0-10cm","10cm-1m","1m-2m","rel.div")

hab_pca <- princomp( hab[ , Vars] )

par( pty = 's' , mar = c(4,3,2,1) )

# eigenvalue plot
# screeplot(rock_pca)
fviz_eig(hab_pca)

# good old ordiplot

# fill colour: none (NA) for the 5-m zone, the site colour otherwise
hab$col2 <- ifelse( hab$ZONE == 5 , NA , hab$col )

par( pty = 's' , ps = 10 , cex = 1.0 )
ordiplot( hab_pca , display = 'species' , type = 'text' , xlim = c(-1 , 1) , ylim = c(-1 , 1) )

# observation scores on the first two components
pcs_hab <- scores(hab_pca)
points( pcs_hab[,1] , pcs_hab[,2] , pch = hab$pch , col = hab$col , bg = hab$col2 , lwd = 1.5 )

# dotted guides through zero
segments( par()$usr[1] , 0 , par()$usr[2] , 0 , lty = 'dotted' , lwd = 0.5 )
segments( 0 , par()$usr[3] , 0 , par()$usr[4] , lty = 'dotted' , lwd = 0.5 )

legend( 'topright' , legend = site$site , pch = 19 , col = site$col , bty = 'n' , cex = 0.8 )
legend( 'topleft' , legend = years , pch = PCH , bty = 'n' , cex = 0.8 )

# identifying columns + component scores, saved for the later analyses
x <- data.frame( hab[ , c('YEAR','SITE','ZONE','pch','col')] )
abiotic <- data.frame( cbind(x , pcs_hab) )

write.csv( abiotic , paste0(Data_Loc , "PCs_Abiotic_Habitat.csv") , row.names = TRUE )

```
## KELP PCA

The kelp ordination isn't great and we can't really reduce the axes much more than the four axes in the original data.  Also, given only four groups and three actual species, I think it is better to maintain the identity and use the original data not PCs in the following analyses.

You can see some depth pattern with deeper sites (filled circles) off to the upper left. This is hardly surprising.  


```{r pca_kelp}

# PCA of the four algae columns
pca_algae <- princomp( algae_zone[ , c("MACPYR", 'NERLUE', 'OTHER', 'PTECAL')] )

par( pty = 's' , ps = 10 , cex = 1.0 )

# eigenvalue plot
fviz_eig(pca_algae)

# variable labels first, then the observations on top
ordiplot( pca_algae , display = 'species' , type = 'text' , xlim = c(-3 , 3) , ylim = c(-3 , 3) )

pcs_alg <- scores(pca_algae)

points( pcs_alg[,1] , pcs_alg[,2] , pch = algae_zone$pch , col = algae_zone$col , bg = algae_zone$bgcol , lwd = 1.5 )

# dotted guides through zero
segments( par()$usr[1] , 0 , par()$usr[2] , 0 , lty = 'dotted' , lwd = 0.5 )
segments( 0 , par()$usr[3] , 0 , par()$usr[4] , lty = 'dotted' , lwd = 0.5 )

# keys: colour = site, symbol = year
legend( 'topright' , legend = site$site , pch = 19 , col = site$col , bty = 'n' , cex = 0.8 )
legend( 'topleft' , legend = years , pch = PCH , bty = 'n' , cex = 0.8 )


```


## UPC - PCA

There is some separation of points based on site:

```{r pca_upc , fig.width=5, fig.height = 5, fig.cap= "UPC PCA analysis. Open circles are 5-m depth zone; closed circles 10-m depth zone."}  

# UPC functional groups that enter the PCA
Vars <- c("Brown_Algae","Diatom_Layer","Eelgrass_Surfgrass","Encrusting_Species","Green_algae","Mobile_Invertebrate","Non_living_Substrate","Non_mobile_Invertebrate", "Red_Algae")

upc_pca <- princomp( upc_zone[, Vars] )

par(pty='s', mar = c(4,3,2,1))

# eigenvalue plot
# screeplot(upc_pca)
fviz_eig(upc_pca)

# good old ordiplot
par( pty = 's', ps = 10, cex = 1.0)
ordiplot(upc_pca, display = 'species' , type = 'text' , xlim = c(-1.5 , 1.5) , ylim = c(-1.5 , 1.5))

# Observation scores of the UPC PCA. These must come from upc_pca. Scores
# taken from the substrate PCA would plot and save the wrong ordination.
pcs_upc <- scores(upc_pca)
points(pcs_upc[,1] , pcs_upc[,2], pch = upc_zone$pch , col =upc_zone$col, bg = upc_zone$bgcol , lwd = 1.5 )

# dotted guides through zero
segments( par()$usr[1],0,par()$usr[2],0, lty = 'dotted', lwd = 0.5)
segments( 0, par()$usr[3],0,par()$usr[4], lty = 'dotted', lwd = 0.5)

legend('topright' , legend = site$site, pch=19, col=site$col, bty='n', cex = 0.8)
legend('topleft' , legend = years, pch=PCH , bty = 'n', cex= 0.8)

# identifying columns + UPC component scores, saved for the later analyses
x <- data.frame(upc_zone[,c('YEAR','SITE','ZONE','pch','col')])
upc <- data.frame(cbind(x,pcs_upc))

write.csv(upc , paste0(Data_Loc,"PCs_UPC.csv") , row.names = TRUE)

```

# FISH VS ABIOTIC HABITAT AND KELP

I combined the PCs from the habitat analysis with the kelp data.  I then ran an RDA-style constrained analysis.    

This constrained habitat vs. fish analysis is ugly. In fact, it is non-significant! So while there are differences among sites x depth x year in fish abundance, they don't seem related to habitat directly.  

This actually makes sense when you compare the separate fish and habitat ordinations. For example, Cape Alava and Cape Johnson have different fish fauna, but similar habitat. 


```{r Can_Cor_Type_RDA}

# merge data
# Build one table of habitat predictors per YEAR x SITE x ZONE: the first two
# components from the abiotic/algae merge (renamed Subs_1/Subs_2), the four
# kelp columns, and the first two UPC components (renamed upc_1/upc_2).

key_cols <- c("YEAR", "SITE", "ZONE")

hab_vars0 = data.frame(merge(abiotic, algae_zone , by = key_cols))
hab_vars1 = hab_vars0[, c(key_cols, "Comp.1","Comp.2","MACPYR","NERLUE","PTECAL","OTHER") ]
colnames(hab_vars1) <- c(key_cols, "Subs_1","Subs_2","MACPYR","NERLUE","PTECAL","OTHER")
upc1 = upc[ , c(key_cols, 'Comp.1','Comp.2')]
colnames(upc1)[4:5] <- c('upc_1','upc_2')
hab_vars2 = merge( hab_vars1 , upc1 , by = key_cols)

# standardize variables (centre and scale each predictor column)

hab_pred <- c('Subs_1','Subs_2','upc_1','upc_2','MACPYR','NERLUE','PTECAL','OTHER')
hab_vars3 = scale( hab_vars2[, hab_pred] )

# Take the identifiers from hab_vars2, the same table the scaled columns came
# from. The second merge can drop or reorder rows relative to hab_vars0, which
# would silently attach predictors to the wrong YEAR/SITE/ZONE.
hab_vars = cbind(hab_vars2[, key_cols] , hab_vars3)
colnames(hab_vars)

cap_data = data.frame(merge(w.fish.ysz , hab_vars, by = key_cols))

write.csv( cap_data , paste0(Data_Loc, 'D_cap_data_test.csv'), row.names = FALSE)

# Constrained ordination of the fish groups in cap.fish on all eight habitat
# predictors. The formula previously listed Subs_1 twice, leaving Subs_2 out.
cap4 = capscale( cap_data[,cap.fish] ~ Subs_1 + Subs_2 + upc_1 + upc_2 + MACPYR + NERLUE + PTECAL + OTHER, 
                 data = cap_data)
cap4
cap_sum <- summary(cap4)

# scores1 is used by the plotting chunk that follows
scores1 = scores(cap4)
anova(cap4)

```


```{r Hab_Analysis_capscale_plot}


# Plot the fish capscale result: species as blue text, samples as points, and
# the habitat constraints (biplot scores) as red labels.
df = data.frame(scores1$sites)
# Symmetric axis limits: use the largest absolute score so that a negative
# extreme larger than the positive maximum is not clipped.
xy = max(abs(df))
ordiplot(cap4 , display = 'species' , type = 'text' , xlim = c(-xy,xy) , ylim = c(-xy,xy) , col = 'blue' )
points( df$CAP1 , df$CAP2 , pch = cap_data$pch , col = cap_data$col, bg = cap_data$bgcol) 
# Read the biplot scores straight from the fitted model; this does not depend
# on the summary object carrying a $biplot element.
hab_scores = data.frame(scores(cap4, display = "bp"))
# y-coordinate is CAP2 (CAP1 was used for both axes, placing every label on
# the 1:1 line).
text(hab_scores$CAP1, hab_scores$CAP2 , rownames(hab_scores) , col = 'red')

```
## INVERTS


```{r Invert_Habitat_CAP}
# invert data by zone
# Read the zone-level invertebrate swath file, reshape it to one column per
# `lump` group, attach the standardized habitat variables (hab_vars) and run
# the same constrained ordination as for fish.
l.inverts = read.csv(paste0(Data_Loc,"invert_swath_zonelevel.csv") , header=TRUE)
if (!Include.2015) {
  l.inverts <- l.inverts[ l.inverts$year != 2015 , ]
}
w.inverts.ysz <- spread( l.inverts[ , c('year','site','zone','lump','dens_weighted')] , lump , dens_weighted )
colnames(w.inverts.ysz)[1:3] <- c("YEAR", "SITE", "ZONE")

cap_data_inverts = data.frame(merge(w.inverts.ysz , hab_vars, by=c("YEAR", "SITE", "ZONE")))
# Add_Info is defined elsewhere; the plotting chunk relies on the pch/col/bgcol
# columns being present afterwards -- presumably added here (TODO confirm).
cap_data_inverts <- Add_Info( cap_data_inverts , c("YEAR", "SITE", "ZONE") )
write.csv( cap_data_inverts , paste0(Data_Loc, 'D_Inverts_cap_data_test.csv'), row.names = FALSE)

# The formula previously listed Subs_1 twice, which left Subs_2 out of the model.
cap_inv = capscale( cap_data_inverts[, cap.inverts] ~ Subs_1 + Subs_2 + upc_1 + upc_2 + MACPYR + NERLUE + PTECAL + OTHER, 
                 data = cap_data_inverts)
cap_inv
# NOTE: this overwrites cap_sum from the fish analysis
cap_sum <- summary(cap_inv)


anova(cap_inv)


```

The Invert vs Habitat relationship was significant.  The main difference separated Tatoosh from other areas. Interestingly:

* Urchins in general were both positively and negatively associated with Nero.  
  * On Axis 1 Nero, Ptero and urchins are all off to the right. 
  * On Axis 2 the urchins are positive while the kelps are more negative. 

* The positive relationship between UPC_1 and urchins in general puts the urchins at sites with more brown algae and less red algae. 

* The positive relationship with Subs_1 on the x-axis suggests more urchins at areas with bedrock.
  * The negative relationship with Subs_1 on the y-axis suggests the relationship is more complex
  * Perhaps urchins like areas with both bedrock and boulder.  Broad areas to eat, some crevices to hide.

```{r Invert_Analysis_capscale_plot , fig.width=6, fig.height=6, fig.cap="Ordination of invertebrate density vs. habitat characteristics. Note, the second panel zooms in on the central cluster of points and excludes some points seen in the first panel."}
# Four-panel figure for the invertebrate capscale result:
#   1) full ordination with samples, 2) samples zoomed to +/-1,
#   3) taxon labels on the same +/-1 window, 4) legends only.
inv_scores = scores(cap_inv)

df = data.frame(inv_scores$sites)
# symmetric limits from the largest absolute score (max(df) alone ignores a
# negative extreme that exceeds the positive maximum)
xy = max(abs(df))

par( pty='s', ps=10, cex=1 , mfrow = c(2,2) , mar=c(4,3,1,1))

ordiplot(cap_inv , xlim = c(-xy, xy), ylim=c(-xy,xy) )
points( df$CAP1 , df$CAP2 , pch = cap_data_inverts$pch , col = cap_data_inverts$col, bg = cap_data_inverts$bgcol) 
# habitat (biplot) scores taken directly from the model
hab_scores = data.frame(scores(cap_inv, display = "bp"))
# text(hab_scores$CAP1, hab_scores$CAP2 , rownames(hab_scores) , col = 'red', cex = 0.8)

plot( df$CAP1 , df$CAP2 , 
      pch = cap_data_inverts$pch , col = cap_data_inverts$col, bg = cap_data_inverts$bgcol, 
      xlim = c(-1, 1), ylim=c(-1,1)) 

spp_scores = data.frame(inv_scores$species)
plot(spp_scores$CAP1 , spp_scores$CAP2 , pch="", xlim=c(-1,1), ylim=c(-1,1))

# y-coordinate is CAP2 (CAP1 was used for both axes, putting every label on
# the diagonal instead of at its ordination position)
text(spp_scores$CAP1, spp_scores$CAP2 , rownames(spp_scores) , col = 'red', cex = 0.8)
abline(h = 0, v = 0, lty = 'dotted', lwd = 0.5)

# empty panel used only to hold the legends
plot(1:10, 1:10, pch="", xlab=NA, ylab=NA, xaxt='n', yaxt='n', bty='n')
legend('left' , legend = site$site, col = site$col , pch = 19, bty = 'n', cex = 1)
legend('right' , legend = years, pch = PCH, bty = 'n', cex = 1)

```

***

# UNIVARIATE PLOTS

These plots include all the "groups" in the data.  These taxa-groups are not all in the ordinations above. They are plotted here for reference.  

## FISH

```{r Fish_Spp_Plots_Site_Year , fig.width=6, fig.height=8, fig.cap="Fish abundance by site and year."}


# One panel per group in fish.include: mean by YEAR, one line per site,
# using the summaries returned by Sum_Stats (defined elsewhere in the file).
par(mfrow = c(4, 2), mar = c(3, 4, 1, 1))
# the site legend goes on the first panel of every page
panels_per_page <- prod(par("mfrow"))

for (i in seq_along(fish.include)) {
  form1 <- paste0(fish.include[i], " ~ YEAR + SITE")
  df1 <- Sum_Stats(form1, w.fish)
  df1$seUP <- df1$mean + df1$se
  df1$seLO <- df1$mean - df1$se
  # include the means and drop NAs so the axis limit stays finite when an se
  # is missing
  Ymax <- max(df1$seUP, df1$mean, na.rm = TRUE)

  # blank plot
  plot(as.integer(years), seq(0, Ymax, length.out = length(years)), pch = "",
       ylim = c(0, Ymax), xlab = NA, ylab = "Number", xaxt = 'n')
  axis(side = 1, at = years, label = years, tck = -0.05)
  cn <- spp.inlcluded$Common_Name[match(fish.include[i], spp.inlcluded$Spp_Group)]
  legend('topleft', legend = paste0(fish.include[i], ": ", cn), bty = 'n')

  for (k in seq_len(nrow(site))) {
    df2 <- df1[df1$SITE == site$site[k], ]
    points(df2$YEAR, df2$mean, type = "b", pch = 19, col = site$col[k])
  }

  # drawn once per panel, after the lines (it used to be redrawn for every site)
  if ((i - 1) %% panels_per_page == 0) {
    legend('topright', legend = site$site, pch = 19, col = site$col, bty = 'n', cex = 0.8)
  }
}

```
### YOY plots

Two heat plots for rockfish YOY. They are the same data, just ordered differently to emphasize either species or site.  

The first heat plot clearly shows the different recruitment patterns among species, with:

* Blacks and Black/YT recruiting heavily in 2016
* Yelloweye (SEPI) in 2018
* Copper/quill and unidentified RYOY in 2019.

NOTE: colors are scaled across rows.

```{r YOY_heatmap, fig.width=6.5, fig.height=5, fig.cap="Heat-map plot of YOY abundance"}
# Two heat maps of the YOY groups in yoy.spp, one row per site x group and one
# column per year. Same data in both; the first is ordered by group, the
# second by site.
par(ps = 10, cex = 1)
yoy.spp = c("SEBYTy", "SEMEy", "SECAy", "SEPIy", "RYOY")

# df_yoy = w.fish.ys[ , c('YEAR','SITE', yoy.spp)]

df_yoy = l.fish.ys[l.fish.ys$group %in% yoy.spp, ]
w.yoy = spread(df_yoy, YEAR, QUANTITY)
w.yoy[is.na(w.yoy)] <- 0
rownames(w.yoy) <- paste(w.yoy$SITE, w.yoy$group, sep = "_")

# Columns created by spread() are named after the YEAR values. Select them by
# name rather than by position (3:6) so the matrices follow the years actually
# present in the data.
year_cols <- intersect(colnames(w.yoy), as.character(unique(df_yoy$YEAR)))

# Group-major table: for each group, one row per site in the order of
# site$site. Sites with no record for a group get NA.
yoy_by_spp <- lapply(yoy.spp, function(sp) {
  dfx <- w.yoy[w.yoy$group == sp, ]
  yrs <- dfx[match(site$site, dfx$SITE), year_cols, drop = FALSE]
  rownames(yrs) <- NULL
  data.frame(site.site = site$site, group = sp, yrs, check.names = FALSE)
})
dfz <- do.call(rbind, yoy_by_spp)
rownames(dfz) <- paste(dfz$group, dfz$site.site, sep = "_")

yoy.matrix.spp = as.matrix(dfz[, year_cols, drop = FALSE])
yoy.matrix.site = as.matrix(w.yoy[, year_cols, drop = FALSE])

# heatmap() sets up its own layout(), so par(mfrow = ) has no effect here and
# each heat map is drawn on its own page.

# alternative palette: brewer.pal(9, 'YlOrRd')
COL = brewer.pal(9, 'Blues')

heatmap(yoy.matrix.spp, Rowv = NA, Colv = NA, scale = "row", col = COL, cexRow = 0.8, cexCol = 1)

heatmap(yoy.matrix.site, Rowv = NA, Colv = NA, scale = "row", col = COL, cexRow = 0.8, cexCol = 1)

```


## INVERTS

```{r Invert_Spp_Plots_Site_Year , fig.width=6, fig.height=8, fig.cap="Invertebrate abundance by site and year."}


par(  mfrow = c(4,2) , mar=c(3,4,1,1))

# One panel per group in invert.spp: mean by YEAR, one line per site, using
# the summaries returned by Sum_Stats (defined elsewhere in the file).
# The site legend goes on the first panel of every page; the page size is read
# from the current par("mfrow") layout.
panels_per_page <- prod(par("mfrow"))

for (i in seq_along(invert.spp)) {
  form1 <- paste0(invert.spp[i], " ~ YEAR + SITE")
  df1 <- Sum_Stats(form1, w.inverts)
  df1$seUP <- df1$mean + df1$se
  df1$seLO <- df1$mean - df1$se
  # include the means and drop NAs so the axis limit stays finite when an se
  # is missing
  Ymax <- max(df1$seUP, df1$mean, na.rm = TRUE)

  # blank plot
  plot(years, seq(0, Ymax, length.out = length(years)), pch = "",
       ylim = c(0, Ymax), xlab = NA, ylab = "Number", xaxt = 'n')
  axis(side = 1, at = years, label = years, tck = -0.05)
  legend('topleft', legend = invert.spp[i], bty = 'n')

  for (k in seq_len(nrow(site))) {
    df2 <- df1[df1$SITE == site$site[k], ]
    points(df2$YEAR, df2$mean, type = "b", pch = 19, col = site$col[k])
  }

  # drawn once per panel, after the lines (it used to be redrawn for every site)
  if ((i - 1) %% panels_per_page == 0) {
    legend('topright', legend = site$site, pch = 19, col = site$col, bty = 'n', cex = 0.8)
  }
}

```