Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Loudenback Week 5 Assignments #17

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 79 additions & 5 deletions 01_Tidying_WQ_data.Rmd → Loudenback_01_Tidying_WQ_data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,68 @@ data frame to `conc_daily` (i.e., a data frame of daily average ion
concentrations).

```{r}
#<<<< YOUR CODE HERE >>>>#
# Template site table for WQ_tidying_data().
# NOTE(review): the "?" values are placeholders — fill in real WQP site ids
# (e.g. "USGS-09034500") and basin labels before calling the function, or
# pass a fully populated tibble such as `colorado` / `additional_data`.
general_WQ <- tibble(siteid = c("?"),
basin = c("?"))

# Pull, tidy, and daily-average major-ion water-quality data from the Water
# Quality Portal (WQP) for the sites listed in `general_WQ`.
#
# Args:
#   general_WQ: tibble/data frame with columns `siteid` (WQP site identifiers
#     such as "USGS-09034500") and `basin` (a human-readable basin label).
# Returns:
#   A tibble of daily mean ion concentrations (mg/l) with columns
#   date, parameter, siteid, basin, conc.
WQ_tidying_data <- function(general_WQ) {

  # WQP characteristic names that correspond to the major ions of interest.
  parameters <- c("Calcium", "Magnesium", "Sodium", "Potassium", "Sulfate",
                  "Sulfate as SO4", "Sulfur Sulfate", "Total Sulfate",
                  "Chloride", "Alkalinity, bicarbonate", "Bicarbonate")

  conc_wide_USGS <- readWQPdata(siteid = general_WQ$siteid,   # siteid column becomes a vector
                                sampleMedia = "Water",        # WQP also has biological and sediment data
                                startDateLo = "1980-10-01",   # must be formatted as YYYY-MM-DD
                                startDateHi = "2023-11-01",   # must be formatted as YYYY-MM-DD
                                characteristicName = parameters)

  # Grab and rename the most important columns.
  # BUG FIX: this pipeline previously read from the un-suffixed objects
  # (`conc_wide`, `conc_small`, `conc_meta`, `conc_tidy`), which are not
  # defined inside this function and would error (or silently pick up stale
  # globals) when the function runs.
  conc_small_USGS <- conc_wide_USGS %>%
    select(date = ActivityStartDate,
           parameter = CharacteristicName,
           units = ResultMeasure.MeasureUnitCode,
           siteid = MonitoringLocationIdentifier,
           org = OrganizationFormalName,
           org_id = OrganizationIdentifier,
           time = ActivityStartTime.Time,
           value = ResultMeasureValue,
           sample_method = SampleCollectionMethod.MethodName,
           analytical_method = ResultAnalyticalMethod.MethodName,
           particle_size = ResultParticleSizeBasisText,
           date_time = ActivityStartDateTime,
           media = ActivityMediaName,
           sample_depth = ActivityDepthHeightMeasure.MeasureValue,
           sample_depth_unit = ActivityDepthHeightMeasure.MeasureUnitCode,
           fraction = ResultSampleFractionText,
           status = ResultStatusIdentifier) %>%
    # Remove stray white space so the mg/l filter below matches reliably.
    mutate(units = trimws(units))

  # Attach basin labels and collapse the WQP parameter names to ion symbols.
  conc_meta_USGS <- conc_small_USGS %>%
    left_join(general_WQ, by = "siteid") %>%
    dplyr::mutate(ion = dplyr::case_when(
      parameter == "Calcium" ~ "Ca",
      parameter == "Magnesium" ~ "Mg",
      parameter == "Sodium" ~ "Na",
      parameter == "Potassium" ~ "K",
      parameter %in% c("Sulfate", "Sulfate as SO4",
                       "Sulfur Sulfate", "Total Sulfate") ~ "SO4",
      parameter == "Chloride" ~ "Cl",
      parameter %in% c("Alkalinity, bicarbonate", "Bicarbonate") ~ "HCO3")) %>%
    select(siteid, basin, ion, parameter, date, everything())

  # Keep only mg/l records and trim down to the analysis columns.
  conc_tidy_USGS <- conc_meta_USGS %>%
    filter(units == 'mg/l') %>%
    mutate(date = ymd(date)) %>%
    select(date,
           parameter,
           ion,
           siteid,
           basin,
           conc = value)

  # Daily mean concentration for each date / parameter / site / basin combo.
  conc_daily_USGS <- conc_tidy_USGS %>%
    group_by(date, parameter, siteid, basin) %>%
    summarize(conc = mean(conc, na.rm = TRUE))

  # BUG FIX: return the result explicitly. The function previously ended with
  # an assignment, whose value is returned invisibly, so top-level calls
  # printed nothing.
  conc_daily_USGS
}

```

```{r}
# Run the tidying workflow on the original Colorado sites (the `colorado`
# tibble is defined earlier in this document).
WQ_tidying_data(general_WQ = colorado)
```

## Question 2
Expand All @@ -265,6 +326,8 @@ quality data for the site IDs listed below:
```{r}
# Site-id / basin lookup table for the three additional sites.
additional_data <- tibble(siteid = c('USGS-09180000', 'USGS-09180500', 'USGS-09380000'),
                          basin = c('dolores', 'colorado4', 'colorado5'))

# BUG FIX: store the tidied result instead of discarding it — Question 3
# needs these daily concentrations to combine with the original data.
# Printing it keeps the chunk output identical to before.
additional_daily <- WQ_tidying_data(general_WQ = additional_data)
additional_daily
```

This output of running the function should look identical in format to
Expand All @@ -277,10 +340,11 @@ Combine the data pulled in Question 2 with the original data from
combined data as `tidied_full_wq.RDS` in a folder called data.

```{r}
library(dplyr)

# BUG FIX: `additional_data` is just the 3-row site-id lookup table (siteid,
# basin) — binding it to the daily concentrations appended rows with NA date
# and NA conc. Combine the *tidied daily data* for the additional sites with
# the original `conc_daily` instead.
tidied_full_wq <- bind_rows(conc_daily,
                            WQ_tidying_data(general_WQ = additional_data))

# Make sure the output folder exists before writing to it.
if (!dir.exists('data')) dir.create('data', recursive = TRUE)
saveRDS(tidied_full_wq, file = 'data/tidied_full_wq.RDS')
```

## Question 4
Expand All @@ -297,6 +361,10 @@ you need to remove `USGS-` from each site. Discharge is
automatically make the column names a little less annoying.

```{r}
library(dataRetrieval)

# BUG FIX: removed `saveRDS(tidied_full_wq, file = 'data/Q.RDS')` from here.
# It ran *before* any discharge data was downloaded and overwrote the
# discharge file with water-quality data. The discharge data (`q_data`) is
# saved to data/Q.RDS after it is downloaded further down in this chunk.
# Reminder! you can use ?readNWISdv to read about how the function works.
sites <- colorado %>%
#Bind the two datasets to get all 8 sites
Expand All @@ -306,8 +374,14 @@ sites <- colorado %>%
#Remove the USGS- prefix
gsub('USGS-', '', .)

#<<<< YOUR CODE HERE >>>>#

# Download daily discharge data for all eight sites.
# Parameter code "00060" is daily discharge (cubic feet per second).
q_data <- readNWISdv(siteNumbers = sites,
                     parameterCd = "00060",
                     startDate = "1980-10-01",
                     endDate = "2023-11-01") %>%
  # Rename the raw NWIS columns to friendlier names (e.g. Flow)
  renameNWISColumns()

# BUG FIX: was `saveRDS(q-data, file = 'data/tidied_full_wq.RDS')`.
# `q-data` is the subtraction of two undefined objects (the object is named
# `q_data`), and writing to tidied_full_wq.RDS would have overwritten the
# water-quality data saved in Question 3. Save the discharge data as Q.RDS.
saveRDS(q_data, file = 'data/Q.RDS')
```
66 changes: 53 additions & 13 deletions 02_Modelling_WQ_data.Rmd → Loudenback_02_Modelling_WQ_data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -269,59 +269,99 @@ The above workflow really focuses on trend detection with Sens slope, but here w
Use `inner_join` to join our daily discharge data (`Q.RDS`) to our raw water quality data (`tidied_full_wq.RDS`). You want to join by both date and siteid. Remember! the discharge data has site IDs that we had to drop the `USGS-` from, so you will need to add that back in using `paste0`.

```{r}
# Quick structure checks on the two inputs to the join.
# NOTE(review): `discharge_data` and `water_quality_data` are created in the
# *next* chunk — on a fresh session this chunk errors with "object not found".
# Run the loading chunk first, or move these checks below it.
names(discharge_data)
head(discharge_data)

names(water_quality_data)
head(water_quality_data)
```

```{r}
library(dplyr)

# BUG FIX: the RDS files were read from machine-specific absolute paths
# (~/Desktop/Classes/...), which breaks knitting on any other machine. Use
# the project-relative 'data/' paths that the earlier chunks save to.
discharge_data <- readRDS('data/Q.RDS')
names(discharge_data) <- tolower(names(discharge_data))
discharge_data <- rename(discharge_data, siteid = site_no)

water_quality_data <- readRDS('data/tidied_full_wq.RDS')

# The NWIS download stripped the 'USGS-' prefix from the site numbers;
# add it back so the keys match the water-quality site ids.
discharge_data <- discharge_data %>%
  mutate(siteid = paste0('USGS-', siteid))

# Keep only site/date combinations present in BOTH datasets.
joined_data <- inner_join(water_quality_data, discharge_data, by = c('date', 'siteid'))

# BUG FIX: View() is interactive-only and fails (or does nothing) when the
# document is knit non-interactively; head() shows the same sanity check.
head(joined_data)
```

### Pick any site and ion combination and plot discharge versus ion concentration

```{r}
# Example site/ion pair: magnesium at USGS-09034500.
selected_data <- filter(joined_data, siteid == "USGS-09034500", ion == "Mg")

# Scatter of daily discharge against ion concentration for that pair.
ggplot(selected_data, aes(x = flow, y = conc)) +
  geom_point() +
  labs(
    title = paste("Discharge vs", "Mg", "Concentration at", "USGS-09034500"),
    x = "Discharge",
    y = paste("Mg", "Concentration")
  )
```

#### What do you see in this relationship?
#### What do you see in this relationship? When looking at discharge vs magnesium concentration, I see that concentrations at this site are greatest when water flow (discharge) is lowest.

## Models everywhere

Group your data by basin and water quality parameter and nest the data.

```{r}
# One row per basin x parameter combination; all remaining columns are
# tucked into a nested `data` list-column for per-group modelling.
grouped_data <- joined_data %>%
  group_by(basin, parameter) %>%
  nest()

# Show the nested structure.
print(grouped_data)
```

## Apply a linear model to the data

You will need to use a `map` command like this: `map(data, ~lm(conc ~ q, data = .x))`

```{r}
library(purrr)

# Fit one linear model per nested group: ion concentration as a function of
# daily discharge (`flow`). The list of fits is stored in a list-column that
# (for compatibility with the chunks below) shares the tibble's name.
grouped_datalm <- grouped_data %>%
  mutate(grouped_datalm = map(data, function(d) lm(conc ~ flow, data = d)))

# Show the tibble with its new model list-column.
print(grouped_datalm)
```

## Summarize your data using `tidy`

You should have a new column called `mods` or something similar, and you need to `tidy` those mods and store this new, tidier data in another column.

```{r}
library(broom)

# Tidy each fitted model into a coefficient table (term, estimate, p.value…).
# NOTE: inside mutate(), `grouped_datalm` resolves to the *list-column* of lm
# fits created in the previous chunk — it confusingly shares its name with
# the tibble itself — so map(grouped_datalm, tidy) tidies each model.
grouped_datalm <- grouped_datalm %>%
mutate(mods = map(grouped_datalm, tidy))

print(grouped_datalm)
```

## Make a visual of your models' summaries that shows both which sites have significant relationships between discharge and concentration and the slope of that relationship.

```{r}

# Unnest the tidied coefficient tables: one row per group per model term.
tidy_data <- grouped_datalm %>%
  unnest(mods)

# BUG FIX: keep only the slope on discharge ("flow"). The original plotted
# every coefficient, so intercept rows were mixed in with the
# discharge-concentration slopes we actually want to display.
slope_data <- tidy_data %>%
  filter(term == "flow") %>%
  mutate(significant = p.value < 0.05)

# Subset of groups with a statistically significant slope (p < 0.05).
significant_data <- filter(slope_data, significant)

# BUG FIX: `significant_data` was computed but never used, so the plot did
# not show which relationships are significant. Shape now encodes
# significance and the dashed line marks the p = 0.05 threshold.
ggplot(slope_data, aes(x = estimate, y = p.value, color = basin, shape = significant)) +
  geom_point() +
  geom_hline(yintercept = 0.05, linetype = "dashed") +
  labs(
    title = "Relationship Between Discharge and Concentration",
    x = "Slope of Relationship",
    y = "P-value"
  ) +
  theme_minimal()
```

## Bonus
Expand Down
72 changes: 72 additions & 0 deletions compiledcode_WQfunctions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Lookup table linking the five original USGS monitoring sites to their
# basin labels; used as the default input to WQ_tidying_data() below.
colorado <- tibble(
  siteid = c("USGS-09034500", "USGS-09069000", "USGS-09085000",
             "USGS-09095500", "USGS-09152500"),
  basin = c("colorado1", "eagle", "roaring", "colorado3", "gunnison")
)

# Pull, tidy, and daily-average major-ion water-quality data from the Water
# Quality Portal (WQP).
#
# Args:
#   site_table: tibble with columns `siteid` (WQP site identifiers) and
#     `basin`. Defaults to the `colorado` table defined above, so existing
#     zero-argument calls keep working (backward compatible).
# Returns:
#   A tibble of daily mean ion concentrations (mg/l) with columns
#   date, parameter, siteid, basin, conc.
WQ_tidying_data <- function(site_table = colorado) {
  # Compile all the WQP characteristic names into a single vector:
  parameters <- c("Calcium", "Magnesium", "Sodium", "Potassium", "Sulfate",
                  "Sulfate as SO4", "Sulfur Sulfate", "Total Sulfate",
                  "Chloride", "Alkalinity, bicarbonate", "Bicarbonate")

  conc_wide <- readWQPdata(siteid = site_table$siteid,   # siteid column becomes a vector
                           sampleMedia = "Water",        # WQP also has biological and sediment data
                           startDateLo = "1980-10-01",   # must be formatted as YYYY-MM-DD
                           startDateHi = "2023-11-01",   # must be formatted as YYYY-MM-DD
                           characteristicName = parameters)

  # BUG FIX: removed the View() calls and the discarded table() call that
  # were inside this function — View() is interactive-only (it errors or
  # does nothing when run via Rscript), and table(conc_meta$units) computed
  # a result that was thrown away. Inspect intermediates interactively
  # outside the function instead.

  # This code mostly just grabs and renames the most important data columns
  conc_small <- conc_wide %>%
    select(date = ActivityStartDate,
           parameter = CharacteristicName,
           units = ResultMeasure.MeasureUnitCode,
           siteid = MonitoringLocationIdentifier,
           org = OrganizationFormalName,
           org_id = OrganizationIdentifier,
           time = ActivityStartTime.Time,
           value = ResultMeasureValue,
           sample_method = SampleCollectionMethod.MethodName,
           analytical_method = ResultAnalyticalMethod.MethodName,
           particle_size = ResultParticleSizeBasisText,
           date_time = ActivityStartDateTime,
           media = ActivityMediaName,
           sample_depth = ActivityDepthHeightMeasure.MeasureValue,
           sample_depth_unit = ActivityDepthHeightMeasure.MeasureUnitCode,
           fraction = ResultSampleFractionText,
           status = ResultStatusIdentifier) %>%
    # Remove trailing white space in labels
    mutate(units = trimws(units))

  # Attach basin labels and collapse the parameter names to ion symbols.
  # GENERALIZATION: join against the `site_table` argument rather than the
  # hard-coded global `colorado`, so the function works for any site list.
  conc_meta <- conc_small %>%
    left_join(site_table, by = "siteid") %>%
    dplyr::mutate(ion = dplyr::case_when(
      parameter == "Calcium" ~ "Ca",
      parameter == "Magnesium" ~ "Mg",
      parameter == "Sodium" ~ "Na",
      parameter == "Potassium" ~ "K",
      parameter %in% c("Sulfate", "Sulfate as SO4",
                       "Sulfur Sulfate", "Total Sulfate") ~ "SO4",
      parameter == "Chloride" ~ "Cl",
      parameter %in% c("Alkalinity, bicarbonate", "Bicarbonate") ~ "HCO3")) %>%
    select(siteid, basin, ion, parameter, date, everything())

  # Keep only mg/l records and trim down to the analysis columns.
  conc_tidy <- conc_meta %>%
    filter(units == 'mg/l') %>%
    mutate(date = ymd(date)) %>%
    select(date,
           parameter,
           ion,
           siteid,
           basin,
           conc = value)

  # The amazing group_by function groups all the data so that the summary
  # only applies to each subgroup (siteid, date, and parameter combination).
  # So in the end you get a daily average concentration for each siteid and parameter type.
  conc_daily <- conc_tidy %>%
    group_by(date, parameter, siteid, basin) %>%
    summarize(conc = mean(conc, na.rm = TRUE))

  # BUG FIX: return the result explicitly; the function previously ended in
  # an assignment, whose value is returned invisibly.
  conc_daily
}
Binary file modified data/Q.RDS
Binary file not shown.
Binary file modified data/tidied_full_wq.RDS
Binary file not shown.