prediction-disease-modeling_windowpane.qmd

---
title: "correl"
format: html
editor_options: 
  chunk_output_type: console
---

```{r}
library(gsheet)
library(tidyverse)
library(nasapower)
library(lubridate)
library(progress)
library(r4pde)
# Load data from Google Sheets
```

## load the data

```{r}
# Start from the BlastWheat trial table.
trials <- BlastWheat

# Data preparation: parse `heading` (written as day-month-year) into Date values.
# Optional subset, currently disabled: filter(study %in% c(1, 2, 3, 4))
trials2 <- mutate(trials, heading = as.Date(heading, format = "%d-%m-%Y"))


```

## Get weather


```{r}
# Example usage with a specified number of days around a date
# Requests weather records for the trials in `trials2`, passing 28 as the
# number of days around the date stored in the `heading` column.
# NOTE(review): presumably this queries the NASA POWER service over the
# network and can be slow for many trials -- confirm against the r4pde docs.
weather_data <- get_nasapower(
  data = trials2,
  days_around = 28,
  date_col = "heading"
)
```

## Join trial and weather

```{r}
# Combine trial rows with weather rows, keeping unmatched rows from both sides.
# No `by` is supplied, so dplyr joins on every column name the two tables
# share and reports the chosen keys in a message.
# NOTE(review): check that message to confirm the join keys are the intended ones.
trials3 <- full_join(trials2, weather_data)
# Quick look at the first rows of the joined table
head(trials3)
```


## windowpane single variable and window

```{r}
# Window-pane summary for a single variable: mean T2M over window lengths of
# 7 and 28 days, direction "both", grouped by `study`.
wp1 <- windowpane(
  data = trials3,
  end_date_col = heading,
  date_col = YYYYMMDD,
  variable = T2M,
  summary_type = "mean",
  threshold = NULL,
  window_lengths = c(7, 28),
  direction = "both",
  group_by_cols = "study"
)

# Attach the response to the window-pane table. The original code piped from
# `wp_1`, which did not exist yet; the windowpane() result lives in `wp1`.
# `inc2` is a binary indicator: 1 when incidence exceeds 20, otherwise 0.
# NOTE(review): `trials$inc_mean` is attached by position, so this assumes
# windowpane() returns one row per trial in the same order as `trials` --
# confirm. Rows with missing `inc` fall through to 0.
wp_1 <- wp1 |>
  mutate(
    inc = trials$inc_mean,
    inc2 = case_when(
      inc > 20 ~ 1,
      TRUE ~ 0
    )
  )
```


## window pane multiple variables

```{r}
## Window-pane summaries for several weather variables

# Weather variables summarised as window means (extend the vector as needed)
variables <- c("T2M", "T2M_MIN", "T2M_MAX", "RH2M")

# One windowpane() table per variable, all with identical window settings
wp_means <- map(variables, \(var) {
  windowpane(
    data = trials3,
    end_date_col = heading,
    date_col = YYYYMMDD,
    variable = !!sym(var),
    summary_type = "mean",
    threshold = NULL,
    window_lengths = c(7, 14, 21, 28),
    direction = "both",
    group_by_cols = "study"
  )
})

# Merge the per-variable tables on the grouping columns
wp_means_df <- wp_means |>
  reduce(left_join, by = c("study", "heading"))


## Rainfall totals per window

# The key columns are dropped because this table is column-bound to
# wp_means_df further down, which already carries them
wp_sums_df <- windowpane(
  data = trials3,
  end_date_col = heading,
  date_col = YYYYMMDD,
  variable = PRECTOTCORR,
  summary_type = "sum",
  threshold = NULL,
  window_lengths = c(7, 14, 21, 28),
  direction = "both",
  group_by_cols = "study"
) |>
  select(-heading, -study)


## Per-window summary of T2M_MIN against a threshold of 15

wp_count_df <- windowpane(
  data = trials3,
  end_date_col = heading,
  date_col = YYYYMMDD,
  variable = T2M_MIN,
  summary_type = "below_threshold",
  threshold = 15,
  window_lengths = c(7, 14, 21, 28),
  direction = "both",
  group_by_cols = "study"
) |>
  select(-heading, -study)


# Side-by-side bind of the three tables (rows are matched by position)
wp_all <- cbind(wp_means_df, wp_sums_df, wp_count_df)

# Print a flag for any column name that occurs more than once after the bind
duplicated(names(wp_all))

# Add the response: raw incidence and a binary indicator (1 when inc > 20)
wp_all <- mutate(
  wp_all,
  inc = trials$inc_mean,
  inc2 = case_when(inc > 20 ~ 1, TRUE ~ 0)
)

# Columns whose names start with "length7_T2M_mean", plus the same response
T2M_7day <- wp_all |>
  select(starts_with("length7_T2M_mean")) |>
  mutate(
    inc = trials$inc_mean,
    inc2 = case_when(inc > 20 ~ 1, TRUE ~ 0)
  )


```


## function correlations and simes method

```{r}


# Define the simplified function for correlation analysis
# Screen predictors by their correlation with a response and flag the
# significant ones with Simes-type thresholds and Benjamini-Hochberg FDR.
#
# Arguments:
#   data          Data frame holding the response and the candidate predictors.
#   response_var  Name (string) of the response column; every other column is
#                 treated as a predictor and coerced with as.numeric().
#   corr_type     Correlation method passed to cor.test() ("spearman" default).
#   alpha         Significance level used for both procedures.
#
# Returns a list with:
#   results         One row per predictor, sorted by ascending p-value
#                   (untested predictors, with NA p-values, come last).
#   selected_simes  Predictors with p_value <= alpha * rank / m.
#   selected_fdr    Predictors whose BH-adjusted p-value is below alpha.
#
# A predictor is left untested (NA) when it has 2 or fewer complete cases or
# is constant over its complete cases.
variable_selection <- function(data, response_var, corr_type = "spearman", alpha = 0.05) {
  predictors <- setdiff(names(data), response_var)
  response <- data[[response_var]]

  # Coerce predictors to numeric so cor.test() can be applied uniformly
  data[predictors] <- lapply(data[predictors], as.numeric)

  # rep() keeps the columns the same length even when there are no predictors
  n_predictors <- length(predictors)
  results <- data.frame(
    variable = predictors,
    correlation = rep(NA_real_, n_predictors),
    p_value = rep(NA_real_, n_predictors)
  )

  for (i in seq_len(n_predictors)) {
    var_data <- data[[predictors[i]]]
    complete_cases <- complete.cases(var_data, response)
    if (sum(complete_cases) <= 2) {
      next
    }

    x <- var_data[complete_cases]
    y <- response[complete_cases]
    # A predictor that is constant over the usable rows has no correlation
    if (length(unique(x)) < 2) {
      next
    }

    # One cor.test() call provides both the estimate and the p-value
    test <- cor.test(x, y, method = corr_type)
    results$correlation[i] <- unname(test$estimate)
    results$p_value[i] <- test$p.value
  }

  # Simes-type thresholds. `m` counts only the tests that were carried out,
  # so skipped predictors do not make the thresholds more conservative; this
  # matches p.adjust(), which also ignores NA p-values.
  results <- results %>%
    arrange(p_value) %>%
    mutate(
      rank = row_number(),
      m = sum(!is.na(p_value)),
      threshold = alpha * rank / m,
      significant_simes = p_value <= threshold
    )

  selected_simes_variables <- results %>%
    filter(significant_simes == TRUE) %>%
    pull(variable)

  # Benjamini-Hochberg correction; `fdr_threshold` holds the adjusted p-value
  # (the column name is kept for backward compatibility)
  results <- results %>%
    mutate(
      fdr_threshold = p.adjust(p_value, method = "BH"),
      significant_fdr = fdr_threshold < alpha
    )

  selected_fdr_variables <- results %>%
    filter(significant_fdr == TRUE) %>%
    pull(variable)

  list(
    results = results,
    selected_simes = selected_simes_variables,
    selected_fdr = selected_fdr_variables
  )
}

# Example usage
# --- Run 1: Spearman screening against the binary response ------------------
data <- select(wp_all, -study, -heading, -inc)
response_var <- "inc2"

results <- variable_selection(
  data,
  response_var,
  corr_type = "spearman",
  alpha = 0.05
)

# Full table first, then the two selections
print(results$results)
cat("Selected variables by Simes method:", results$selected_simes, "\n")
cat("Selected variables by FDR method:", results$selected_fdr, "\n")

# --- Run 2: Kendall screening, with one column additionally dropped ---------
names(wp_all)
data <- select(wp_all, -study, -heading, -inc, -`T2M_MIN_0_-27`)
response_var <- "inc2"

results <- variable_selection(data, response_var, corr_type = "kendall")

# Open the table in the viewer, then print it and the two selections
view(results$results)
print(results$results)
cat("Selected variables by Simes method:", results$selected_simes, "\n")
cat("Selected variables by FDR method:", results$selected_fdr, "\n")

```

## Function bootstrapping correlations

```{r}

# Define the function for bootstrapping correlation analysis with refined Simes method
# Correlation screening with bootstrap summaries and a Simes global test.
#
# Arguments:
#   data              Data frame holding the response and candidate predictors.
#   response_var      Name (string) of the response column; all other columns
#                     are treated as predictors and coerced with as.numeric().
#   corr_type         Correlation method for cor()/cor.test().
#   R                 Number of bootstrap replicates per predictor.
#   global_alpha      Level for the Simes thresholds and the global test.
#   individual_alpha  Level for the unadjusted per-predictor flag.
#
# Returns a list with:
#   results              One row per predictor sorted by ascending p-value:
#                        observed correlation and p-value, bootstrap mean, sd
#                        and median of the correlation, rank, m, Simes
#                        threshold and the two significance flags.
#   selected_simes       Predictors with p_value <= global_alpha * rank / m.
#   selected_individual  Predictors with p_value <= individual_alpha.
#   global_significant   TRUE when Pg < global_alpha.
#   Pg                   min over tested predictors of p_value * m / rank.
#
# A predictor is left untested (all NA) unless it has more than 5 complete
# cases and is not constant over them. Requires the boot package.
variable_selection_with_refined_simes <- function(data, response_var, corr_type = "spearman", R = 1000, global_alpha = 0.05, individual_alpha = 0.005) {
  predictors <- setdiff(names(data), response_var)
  response <- data[[response_var]]

  # Coerce predictors to numeric so the correlation functions apply uniformly
  data[predictors] <- lapply(data[predictors], as.numeric)

  # rep() keeps the columns the same length even when there are no predictors
  n_predictors <- length(predictors)
  na_column <- rep(NA_real_, n_predictors)
  results <- data.frame(
    variable = predictors,
    correlation = na_column,
    p_value = na_column,
    mean_corr = na_column,
    sd_corr = na_column,
    median_corr = na_column
  )
  stat_cols <- c("correlation", "p_value", "mean_corr", "sd_corr", "median_corr")

  # Bootstrap statistic: only the estimate is summarised, so cor() is enough.
  # It equals the estimate reported by cor.test() for the same method.
  boot_correlation <- function(d, indices) {
    cor(d$var_data[indices], d$response[indices], method = corr_type)
  }

  for (i in seq_len(n_predictors)) {
    var_data <- data[[predictors[i]]]
    complete_cases <- complete.cases(var_data, response)
    # Needs more than 5 complete cases
    if (sum(complete_cases) <= 5) {
      next
    }

    data_boot <- data.frame(
      var_data = var_data[complete_cases],
      response = response[complete_cases]
    )
    if (length(unique(data_boot$var_data)) < 2) {
      next
    }

    # Observed correlation and p-value on the full complete-case sample
    observed <- cor.test(data_boot$var_data, data_boot$response, method = corr_type)

    # boot::boot() works whether or not library(boot) has been attached yet
    boot_result <- boot::boot(data = data_boot, statistic = boot_correlation, R = R)
    boot_corrs <- as.numeric(boot_result$t)

    results[i, stat_cols] <- c(
      unname(observed$estimate),
      observed$p.value,
      mean(boot_corrs, na.rm = TRUE),
      sd(boot_corrs, na.rm = TRUE),
      median(boot_corrs, na.rm = TRUE)
    )
  }

  # Simes thresholds. `m` counts only the tests that were carried out, so
  # skipped predictors (NA p-value) do not inflate the denominator.
  results <- results %>%
    arrange(p_value) %>%
    mutate(
      rank = row_number(),
      m = sum(!is.na(p_value)),
      simes_threshold = global_alpha * rank / m,
      significant_simes = p_value <= simes_threshold,
      individual_significant = p_value <= individual_alpha
    )

  # Global p-value: smallest rank-adjusted p-value, p_(i) * m / i
  Pg <- min(results$p_value / (results$rank / results$m), na.rm = TRUE)
  global_significant <- Pg < global_alpha

  selected_simes_variables <- results %>%
    filter(significant_simes == TRUE) %>%
    pull(variable)

  selected_individual_variables <- results %>%
    filter(individual_significant == TRUE) %>%
    pull(variable)

  list(
    results = results,
    selected_simes = selected_simes_variables,
    selected_individual = selected_individual_variables,
    global_significant = global_significant,
    Pg = Pg
  )
}

# Example usage
data <- wp_all |> select(-study, -heading, -inc2)  # Example data selection
response_var <- 'inc'

library(boot)
# Call the function with refined Simes adjustment
results <- variable_selection_with_refined_simes(data, response_var, corr_type = "spearman", R = 1000, global_alpha = 0.05, individual_alpha = 0.005)

# Print the results
view(results$results)
cat("Selected variables by Simes method:", results$selected_simes, "\n")
cat("Selected variables by individual significance (alpha=0.005):", results$selected_individual, "\n")
cat("Global significance (Pg):", results$Pg, "\n")
cat("Is globally significant?", results$global_significant, "\n")

# Example usage
data <- wp_all |> select(-study, -heading, -inc2)  # Example data selection
response_var <- 'inc'

library(boot)
# Call the function with refined Simes adjustment
results <- variable_selection_with_refined_simes(data, response_var, corr_type = "spearman", R = 1000, global_alpha = 0.05, individual_alpha = 0.005)

# Print the results
view(results$results)
cat("Selected variables by Simes method:", results$selected_simes, "\n")
cat("Selected variables by individual significance (alpha=0.005):", results$selected_individual, "\n")
cat("Global significance (Pg):", results$Pg, "\n")
cat("Is globally significant?", results$global_significant, "\n")

```

## bootstrapping correlation simes in table

```{r}
# Keep only the columns whose names start with "length7_T2M_MAX_mean"
T2M_MAX_7 <- select(wp_all, starts_with("length7_T2M_MAX_mean"))
```


```{r}
# Window-pane correlation tests: observed correlation and p-value per
# predictor, bootstrap summaries of the correlation, Simes thresholds, and a
# small summary table with the global p-value and the largest correlation.
#
# Arguments:
#   data              Data frame holding the response and candidate predictors.
#   response_var      Name (string) of the response column; all other columns
#                     are treated as predictors and coerced with as.numeric().
#   corr_type         Correlation method for cor()/cor.test().
#   R                 Number of bootstrap replicates per predictor.
#   global_alpha      Level for the Simes thresholds and the global test.
#   individual_alpha  Level for the unadjusted per-predictor flag.
#
# Returns a list with:
#   results             One row per predictor sorted by ascending p-value.
#   summary_table       Two rows: "Global P-value (Pg)" and "Max Correlation"
#                       (the largest signed observed correlation).
#   global_significant  TRUE when Pg < global_alpha.
#
# A predictor is left untested (all NA) unless it has more than 5 complete
# cases and is not constant over them. Requires the boot package.
windowpane_tests <- function(data, response_var, corr_type = "spearman", R = 1000, global_alpha = 0.05, individual_alpha = 0.005) {
  predictors <- setdiff(names(data), response_var)
  response <- data[[response_var]]

  # Coerce predictors to numeric so the correlation functions apply uniformly
  data[predictors] <- lapply(data[predictors], as.numeric)

  # rep() keeps the columns the same length even when there are no predictors
  n_predictors <- length(predictors)
  na_column <- rep(NA_real_, n_predictors)
  results <- data.frame(
    variable = predictors,
    correlation = na_column,
    p_value = na_column,
    mean_corr = na_column,
    sd_corr = na_column,
    median_corr = na_column
  )
  stat_cols <- c("correlation", "p_value", "mean_corr", "sd_corr", "median_corr")

  # Bootstrap statistic: only the estimate is summarised, so cor() is enough.
  # It equals the estimate reported by cor.test() for the same method.
  boot_correlation <- function(d, indices) {
    cor(d$var_data[indices], d$response[indices], method = corr_type)
  }

  for (i in seq_len(n_predictors)) {
    var_data <- data[[predictors[i]]]
    complete_cases <- complete.cases(var_data, response)
    # Needs more than 5 complete cases
    if (sum(complete_cases) <= 5) {
      next
    }

    data_boot <- data.frame(
      var_data = var_data[complete_cases],
      response = response[complete_cases]
    )
    if (length(unique(data_boot$var_data)) < 2) {
      next
    }

    # Observed test on the full complete-case sample; exact = FALSE requests
    # the asymptotic p-value, as in the original implementation
    observed <- cor.test(
      data_boot$var_data,
      data_boot$response,
      method = corr_type,
      exact = FALSE
    )

    # boot::boot() works whether or not library(boot) has been attached yet
    boot_result <- boot::boot(data = data_boot, statistic = boot_correlation, R = R)
    boot_corrs <- as.numeric(boot_result$t)

    results[i, stat_cols] <- c(
      unname(observed$estimate),
      observed$p.value,
      mean(boot_corrs, na.rm = TRUE),
      sd(boot_corrs, na.rm = TRUE),
      median(boot_corrs, na.rm = TRUE)
    )
  }

  # Simes thresholds. `m` counts only the tests that were carried out, so
  # skipped predictors (NA p-value) do not inflate the denominator.
  results <- results %>%
    arrange(p_value) %>%
    mutate(
      rank = row_number(),
      m = sum(!is.na(p_value)),
      simes_threshold = global_alpha * rank / m,
      significant_simes = p_value <= simes_threshold,
      individual_significant = p_value <= individual_alpha
    )

  # Global p-value: smallest rank-adjusted p-value, p_(i) * m / i
  Pg <- min(results$p_value / (results$rank / results$m), na.rm = TRUE)
  global_significant <- Pg < global_alpha

  max_correlation <- max(results$correlation, na.rm = TRUE)

  summary_table <- data.frame(
    Metric = c("Global P-value (Pg)", "Max Correlation"),
    Value = c(Pg, max_correlation)
  )

  list(
    results = results,
    summary_table = summary_table,
    global_significant = global_significant
  )
}

# Example usage
# Example usage: 7-day T2M_MAX window columns against raw incidence
data <- T2M_MAX_7
# NOTE(review): incidence is attached by position; assumes T2M_MAX_7 rows are
# in the same order as `trials` -- confirm.
data$inc <- trials$inc_mean
response_var <- 'inc'


library(boot)

# Run the window-pane tests. The original call named
# `variable_selection_with_refined_simes_table`, which is not defined in this
# document; the function defined in this chunk is `windowpane_tests`.
results <- windowpane_tests(data, response_var, corr_type = "spearman", R = 1000)

# View the results

view(results$results)
print(results$summary_table)
cat("Is globally significant?", results$global_significant, "\n")


# Define a function to identify clusters of significant correlations
# Find runs of consecutive significant rows ("clusters") in a results table.
#
# Arguments:
#   results_df        Data frame with a logical `individual_significant`
#                     column; rows are scanned in their current order.
#   min_cluster_size  Smallest run length reported as a cluster.
#
# Returns a data frame with one row per cluster and the columns
# Cluster_Start, Cluster_End (row positions, inclusive) and Cluster_Size.
# When no run is long enough, the result has zero rows but keeps all three
# columns.
#
# NA flags (predictors that were not tested) count as not significant, so they
# end a run instead of raising an error.
windowpane_clusters <- function(results_df, min_cluster_size = 5) {
  is_significant <- results_df$individual_significant
  flags <- !is.na(is_significant) & as.logical(is_significant)

  # Run-length encoding gives every maximal run of equal flags in one pass
  runs <- rle(flags)
  run_ends <- cumsum(runs$lengths)
  run_starts <- run_ends - runs$lengths + 1L

  is_cluster <- runs$values & runs$lengths >= min_cluster_size

  data.frame(
    Cluster_Start = run_starts[is_cluster],
    Cluster_End = run_ends[is_cluster],
    Cluster_Size = runs$lengths[is_cluster]
  )
}

```


## Elastic net 


```{r}
# Elastic net (logistic) with alpha and lambda tuned by cross-validation
library(glmnet)   # For Elastic Net model
library(caret)    # For data splitting and cross-validation
library(pROC)     # For ROC curve and AUC

# Load your dataset
data <- wp_all |> select(-study, -heading, -inc)

# Define predictors and response ('inc2' is the binary response)
response_var <- 'inc2'
predictors <- setdiff(names(data), response_var)
response <- data[[response_var]]

# Convert predictors to numeric matrix
X <- as.matrix(data[predictors])
y <- as.numeric(response)

# Remove rows with missing values; drop = FALSE keeps X a matrix even when
# only one predictor column is present
complete_cases <- complete.cases(X, y)
X <- X[complete_cases, , drop = FALSE]
y <- y[complete_cases]

set.seed(123)  # For reproducibility

# Create training and testing sets. The outcome is passed as a factor so that
# createDataPartition() samples within each class; a numeric 0/1 vector would
# be binned by percentiles instead.
train_index <- createDataPartition(factor(y), p = 0.8, list = FALSE)
X_train <- X[train_index, , drop = FALSE]
y_train <- y[train_index]
X_test <- X[-train_index, , drop = FALSE]
y_test <- y[-train_index]

# Alpha ranges from 0 (Ridge) to 1 (Lasso)
alpha_values <- seq(0, 1, by = 0.1)

# Use the same 10 folds for every alpha so the cross-validated errors are
# comparable; without `foldid`, cv.glmnet() draws new folds on each call
n_folds <- 10
foldid <- sample(rep(seq_len(n_folds), length.out = nrow(X_train)))

# Cross-validate lambda for each alpha (classification error)
cv_results <- lapply(alpha_values, function(a) {
  cv.glmnet(
    X_train, y_train,
    alpha = a,
    family = "binomial",
    foldid = foldid,
    type.measure = "class"
  )
})
names(cv_results) <- paste0("alpha_", alpha_values)

# Pick the alpha with the smallest minimum CV error (first one on ties) and
# the lambda that attains it
cv_errors <- vapply(cv_results, function(fit) min(fit$cvm), numeric(1))
best_index <- which.min(cv_errors)
min_error <- cv_errors[[best_index]]
best_alpha <- alpha_values[best_index]
best_lambda <- cv_results[[best_index]]$lambda.min

cat("Best alpha:", best_alpha, "\n")
cat("Best lambda:", best_lambda, "\n")

# Fit the final Elastic Net model with optimal alpha and lambda
final_fit <- glmnet(X_train, y_train,
                    alpha = best_alpha,
                    lambda = best_lambda,
                    family = "binomial")

# Coefficients at the chosen lambda, as a plain matrix for easier subsetting
selected_coefficients <- as.matrix(coef(final_fit, s = best_lambda))

# Names of the variables with non-zero coefficients, intercept excluded
selected_variables <- rownames(selected_coefficients)[selected_coefficients != 0]
selected_variables <- selected_variables[selected_variables != "(Intercept)"]

cat("Selected variables by Elastic Net:", selected_variables, "\n")

# Predict on the test set (probabilities). predict() returns a one-column
# matrix; a plain vector is what roc() expects.
y_pred_prob <- as.numeric(
  predict(final_fit, newx = X_test, s = best_lambda, type = "response")
)

# Convert probabilities to binary predictions
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(y_pred == y_test)
cat("Accuracy on Test Set:", accuracy, "\n")

# Calculate confusion matrix
conf_matrix <- table(Predicted = y_pred, Actual = y_test)
print(conf_matrix)

# Calculate AUC. Levels and direction are fixed (controls = 0, cases = 1,
# higher score = case) so pROC cannot flip the direction automatically.
roc_obj <- roc(y_test, y_pred_prob, levels = c(0, 1), direction = "<")
auc_value <- as.numeric(auc(roc_obj))
cat("AUC on Test Set:", auc_value, "\n")

# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Logistic Elastic Net Model")


```

## Elastic net top 15
```{r}
# Elastic net (logistic), then a refit on the 15 largest coefficients
library(glmnet)   # For Elastic Net model
library(caret)    # For data splitting and cross-validation
library(pROC)     # For ROC curve and AUC

# Load your dataset
data <- wp_all |> select(-study, -heading, -inc)

# Define predictors and response ('inc2' is the binary response)
response_var <- 'inc2'
predictors <- setdiff(names(data), response_var)
response <- data[[response_var]]

# Convert predictors to numeric matrix
X <- as.matrix(data[predictors])
y <- as.numeric(response)

# Remove rows with missing values; drop = FALSE keeps X a matrix even when
# only one predictor column is present
complete_cases <- complete.cases(X, y)
X <- X[complete_cases, , drop = FALSE]
y <- y[complete_cases]

set.seed(123)  # For reproducibility

# Create training and testing sets. The outcome is passed as a factor so that
# createDataPartition() samples within each class; a numeric 0/1 vector would
# be binned by percentiles instead.
train_index <- createDataPartition(factor(y), p = 0.8, list = FALSE)
X_train <- X[train_index, , drop = FALSE]
y_train <- y[train_index]
X_test <- X[-train_index, , drop = FALSE]
y_test <- y[-train_index]

# Alpha ranges from 0 (Ridge) to 1 (Lasso)
alpha_values <- seq(0, 1, by = 0.1)

# Use the same 10 folds for every alpha so the cross-validated errors are
# comparable; without `foldid`, cv.glmnet() draws new folds on each call
n_folds <- 10
foldid <- sample(rep(seq_len(n_folds), length.out = nrow(X_train)))

# Cross-validate lambda for each alpha (classification error)
cv_results <- lapply(alpha_values, function(a) {
  cv.glmnet(
    X_train, y_train,
    alpha = a,
    family = "binomial",
    foldid = foldid,
    type.measure = "class"
  )
})
names(cv_results) <- paste0("alpha_", alpha_values)

# Pick the alpha with the smallest minimum CV error (first one on ties) and
# the lambda that attains it
cv_errors <- vapply(cv_results, function(fit) min(fit$cvm), numeric(1))
best_index <- which.min(cv_errors)
min_error <- cv_errors[[best_index]]
best_alpha <- alpha_values[best_index]
best_lambda <- cv_results[[best_index]]$lambda.min

cat("Best alpha:", best_alpha, "\n")
cat("Best lambda:", best_lambda, "\n")

# Fit the final Elastic Net model with optimal alpha and lambda
final_fit <- glmnet(X_train, y_train,
                    alpha = best_alpha,
                    lambda = best_lambda,
                    family = "binomial")

# Coefficients at the chosen lambda as a plain matrix, intercept row removed
selected_coefficients <- as.matrix(coef(final_fit, s = best_lambda))
selected_coefficients <- selected_coefficients[-1, , drop = FALSE]

# Up to 15 variables with the largest absolute coefficients. head() returns
# fewer than 15 when fewer predictors exist, whereas `[1:15]` would pad the
# index vector with NA and break the column subsetting below.
coefficient_order <- order(abs(selected_coefficients[, 1]), decreasing = TRUE)
top_15_indices <- head(coefficient_order, 15)
top_15_variables <- rownames(selected_coefficients)[top_15_indices]

cat("Top 15 selected variables:", top_15_variables, "\n")

# Refit the model using only the top predictors
X_train_top15 <- X_train[, top_15_variables, drop = FALSE]
X_test_top15 <- X_test[, top_15_variables, drop = FALSE]

final_fit_top15 <- glmnet(X_train_top15, y_train,
                          alpha = best_alpha,
                          lambda = best_lambda,
                          family = "binomial")

# Predict on the test set (probabilities). predict() returns a one-column
# matrix; a plain vector is what roc() expects.
y_pred_prob <- as.numeric(
  predict(final_fit_top15, newx = X_test_top15, s = best_lambda, type = "response")
)

# Convert probabilities to binary predictions
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(y_pred == y_test)
cat("Accuracy on Test Set:", accuracy, "\n")

# Calculate confusion matrix
conf_matrix <- table(Predicted = y_pred, Actual = y_test)
print(conf_matrix)

# Calculate AUC. Levels and direction are fixed (controls = 0, cases = 1,
# higher score = case) so pROC cannot flip the direction automatically.
roc_obj <- roc(y_test, y_pred_prob, levels = c(0, 1), direction = "<")
auc_value <- as.numeric(auc(roc_obj))
cat("AUC on Test Set:", auc_value, "\n")

# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Logistic Elastic Net Model with Top 15 Predictors")

```


## best glm

```{r}

## Best glm
# Load the necessary libraries
library(bestglm)

# Prepare the data frame for bestglm
data_subset <- data.frame(data[, top_15_variables], inc2 = response)

# Remove rows with missing values
data_subset <- na.omit(data_subset)
data_subset <- data_subset 
names(data_subset)
# Convert the response variable to a factor for logistic regression
data_subset$inc2 <- as.factor(data_subset$inc2)


# Fit the Best Subset Selection model with bestglm
bestglm_fit <- bestglm(
  data_subset,
  family = binomial,   # Logistic regression
  IC = "BIC"  ,
  method = "exhaustive"# Use BIC as the information criterion
)

# Print the summary of the best model
summary(bestglm_fit)

# Extract the names of the selected variables
selected_bestglm_variables <- names(coef(bestglm_fit$BestModel))[-1]  # Exclude intercept

cat("Variables selected by Best Subset Selection (bestglm):", selected_bestglm_variables, "\n")

# Predict probabilities on the training set
y_pred_prob <- predict(bestglm_fit$BestModel, type = "response")

# Convert probabilities to binary predictions
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(y_pred == data_subset$inc2)
cat("Accuracy of Best Subset Model:", accuracy, "\n")

# Calculate confusion matrix
conf_matrix <- table(Predicted = y_pred, Actual = data_subset$inc2)
print(conf_matrix)

# Calculate AUC
library(pROC)
roc_obj <- roc(data_subset$inc2, y_pred_prob)
auc <- auc(roc_obj)
cat("AUC of Best Subset Model:", auc, "\n")

# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Best Subset Logistic Model")


```

## Logistic after bestglm

```{r}
# Get the selected variables from the bestglm model
selected_vars <- names(bestglm_fit$BestModel$coefficients)[-1]  # Exclude intercept
cat("Selected variables by bestglm:", selected_vars, "\n")

# Split the data into training and testing sets
# NOTE(review): the variables were selected by bestglm on all rows of
# `data_subset`, including those held out here, so the test-set figures below
# are likely optimistic.
set.seed(123)  # For reproducibility
train_index <- createDataPartition(data_subset$inc2, p = 0.8, list = FALSE)
train_data <- data_subset[train_index, ]
test_data <- data_subset[-train_index, ]

# Fit a logistic regression model using the selected variables. If bestglm
# kept no predictor, fall back to an intercept-only model instead of building
# an invalid formula.
rhs <- if (length(selected_vars) > 0) paste(selected_vars, collapse = " + ") else "1"
formula <- as.formula(paste("inc2 ~", rhs))
logistic_model <- glm(formula, data = train_data, family = binomial)

# Print model summary
summary(logistic_model)

# Predict probabilities on the test set
y_pred_prob <- predict(logistic_model, newdata = test_data, type = "response")

# Observed classes as 0/1 (factor codes are 1/2, hence the "- 1")
actual_binary <- as.numeric(as.factor(test_data$inc2)) - 1

# Convert probabilities to binary predictions
y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(y_pred == actual_binary)
cat("Accuracy on Test Set:", accuracy, "\n")

# Calculate confusion matrix
conf_matrix <- table(Predicted = y_pred, Actual = actual_binary)
print(conf_matrix)

# Calculate AUC. Levels and direction are fixed (controls = 0, cases = 1,
# higher score = case) so pROC cannot flip the direction automatically.
library(pROC)
roc_obj <- roc(actual_binary, as.numeric(y_pred_prob), levels = c(0, 1), direction = "<")
auc_value <- as.numeric(auc(roc_obj))
cat("AUC on Test Set:", auc_value, "\n")

# Plot ROC curve
plot(roc_obj, main = "ROC Curve for Logistic Regression Model")


## Youden value

# Predicted probabilities as a plain numeric vector. The `rf_` prefix is a
# historical name; these are the logistic-model probabilities.
rf_pred_prob <- as.numeric(y_pred_prob)

# Find the threshold that maximizes Youden's Index (sensitivity + specificity - 1).
# Specificity and sensitivity are requested too so the optimum can be marked
# on the ROC plot; only the first row is used if several thresholds tie.
optimal_coords <- coords(
  roc_obj, "best",
  best.method = "youden",
  ret = c("threshold", "specificity", "sensitivity"),
  transpose = FALSE
)
optimal_threshold <- optimal_coords[1, "threshold"]

# Apply the optimal threshold to generate binary predictions
optimal_pred <- ifelse(rf_pred_prob >= optimal_threshold, 1, 0)

# Calculate confusion matrix
conf_matrix <- table(Predicted = optimal_pred, Actual = actual_binary)
print(conf_matrix)

# Calculate sensitivity and specificity
sensitivity <- sum(optimal_pred == 1 & actual_binary == 1) / sum(actual_binary == 1)
specificity <- sum(optimal_pred == 0 & actual_binary == 0) / sum(actual_binary == 0)
cat("Sensitivity:", sensitivity, "\n")
cat("Specificity:", specificity, "\n")

# Report the AUC again alongside the threshold-based figures
cat("AUC on Test Set:", auc_value, "\n")

# Plot the ROC curve. The x-axis of a pROC plot is specificity, so the
# vertical line is drawn at the specificity reached at the optimal threshold
# (the threshold itself is a probability and does not live on this axis).
plot(roc_obj, main = "ROC Curve with Optimal Threshold (Youden's Index)")
abline(v = optimal_coords[1, "specificity"], col = "red", lty = 2)


# Convert vectors to factors to use with confusionMatrix(). Levels are fixed
# so both factors agree even when only one class is predicted.
optimal_pred_factor <- factor(optimal_pred, levels = c(0, 1))
actual_binary_factor <- factor(actual_binary, levels = c(0, 1))

# Create a confusion matrix and calculate all statistics
conf_matrix <- confusionMatrix(data = optimal_pred_factor,
                               reference = actual_binary_factor,
                               positive = "1",  # Set the positive class as "1"
                               mode = "everything")

# Print the confusion matrix
print(conf_matrix)


## Using ROCR

library(ROCR)

# Create prediction object
pred_obj <- prediction(rf_pred_prob, actual_binary)

# True-positive rate against false-positive rate at every cutoff
perf_obj <- performance(pred_obj, measure = "tpr", x.measure = "fpr")

# Youden's Index = tpr + (1 - fpr) - 1; pick the cutoff where it peaks
youden_index <- perf_obj@y.values[[1]] + (1 - perf_obj@x.values[[1]]) - 1
optimal_threshold_rocr <- pred_obj@cutoffs[[1]][which.max(youden_index)]
cat("Optimal threshold (ROCR):", optimal_threshold_rocr, "\n")

# Generate binary predictions based on the optimal threshold
optimal_pred <- ifelse(rf_pred_prob >= optimal_threshold_rocr, 1, 0)

# Calculate confusion matrix using caret
conf_matrix <- confusionMatrix(
  factor(optimal_pred, levels = c(0, 1)),
  factor(actual_binary, levels = c(0, 1)),
  positive = "1"
)
print(conf_matrix)

```


## Random forest model

```{r}

library(randomForest)  # For Random Forest model
library(caret)         # For data partitioning and model evaluation
library(pROC)

# Subset the data to the variables kept by the elastic net
# (`selected_variables`) plus the binary response as a factor.
# NOTE(review): the original comment and `mtry` referred to the top-15
# variables while the data used `selected_variables`; confirm which predictor
# set is intended.
data_rf <- data.frame(data[selected_variables], inc2 = as.factor(data$inc2))

# randomForest() stops on missing values by default, so drop incomplete rows
data_rf <- na.omit(data_rf)

# Split the data into training and testing sets
set.seed(123)  # For reproducibility
train_index <- createDataPartition(data_rf$inc2, p = 0.8, list = FALSE)
train_data <- data_rf[train_index, ]
test_data <- data_rf[-train_index, ]

# Number of variables tried at each split: floor(sqrt(p)) with p taken from
# the predictors actually present in the training data
n_predictors <- ncol(train_data) - 1
mtry_value <- max(1, floor(sqrt(n_predictors)))

# Fit a Random Forest model using the training data
rf_model <- randomForest(inc2 ~ ., data = train_data,
                         ntree = 500,          # Number of trees
                         mtry = mtry_value,    # Number of variables tried at each split
                         importance = TRUE)    # Measure variable importance

# Print the Random Forest model summary
print(rf_model)

# Plot variable importance
varImpPlot(rf_model, main = "Variable Importance in Random Forest Model")

# Predict on the test set: probability of class "1", selected by column name
rf_pred_prob <- predict(rf_model, newdata = test_data, type = "prob")[, "1"]
rf_pred <- ifelse(rf_pred_prob > 0.5, 1, 0)

# Observed classes as 0/1 (factor codes are 1/2, hence the "- 1")
actual_binary <- as.numeric(as.factor(test_data$inc2)) - 1

# Calculate accuracy
accuracy <- mean(rf_pred == actual_binary)
cat("Accuracy on Test Set:", accuracy, "\n")

# Calculate confusion matrix. Levels are fixed so both factors agree even when
# only one class is predicted, and "1" is declared the positive class
# (confusionMatrix() otherwise treats the first level, "0", as positive).
conf_matrix <- confusionMatrix(data = factor(rf_pred, levels = c(0, 1)),
                               reference = factor(actual_binary, levels = c(0, 1)),
                               positive = "1",
                               mode = "everything")
print(conf_matrix)


## using PRROC

library(PRROC)

# Create an ROC curve using PRROC; the 0/1 labels are passed as class weights
prroc_obj <- roc.curve(scores.class0 = rf_pred_prob, weights.class0 = actual_binary, curve = TRUE)

# Print AUC
cat("AUC on Test Set (PRROC):", prroc_obj$auc, "\n")

# Plot the ROC curve
plot(prroc_obj, main = "ROC Curve for Random Forest Model (PRROC)")


## using ROCR

# Install ROCR package if not already installed
# install.packages("ROCR")

library(ROCR)

# Convert predicted probabilities to a prediction object
pred_obj <- prediction(rf_pred_prob, actual_binary)

# Calculate the ROC curve
perf_obj <- performance(pred_obj, "tpr", "fpr")

# Calculate the AUC
auc_value <- performance(pred_obj, "auc")@y.values[[1]]
cat("AUC on Test Set (ROCR):", auc_value, "\n")

# Plot the ROC curve
plot(perf_obj, main = "ROC Curve for Random Forest Model (ROCR)")


```

## Boruta
```{r}
library(Boruta)

# Run Boruta feature selection on the random-forest data set
boruta.bank_train <- Boruta(inc2 ~ ., data = data_rf, doTrace = 2)
print(boruta.bank_train)

# Resolve attributes left as "tentative" by the main run
boruta.bank <- TentativeRoughFix(boruta.bank_train)
print(boruta.bank)

# Importance plot with the default x-axis suppressed; labels are added below
plot(boruta.bank, xlab = "", xaxt = "n")

# Finite importance values per attribute, used to order the axis labels by
# median importance
imp_history <- boruta.bank$ImpHistory
lz <- lapply(seq_len(ncol(imp_history)), function(i) {
  imp_history[is.finite(imp_history[, i]), i]
})
names(lz) <- colnames(imp_history)
Labels <- sort(vapply(lz, median, numeric(1)))
axis(side = 1, las = 2, labels = names(Labels),
     at = seq_len(ncol(imp_history)), cex.axis = 0.4)


# Up to five confirmed attributes. head() returns fewer when fewer were
# confirmed, whereas `[1:5]` would pad the result with NA.
boruta_selected <- head(getSelectedAttributes(boruta.bank, withTentative = FALSE), 5)
boruta_selected
```


## plot window pane

```{r}
library(tidyverse)
# Load the required library
library(ggplot2)

# Define total days and window lengths
max_days <- 28
window_lengths <- c(7, 14, 21, 28)

# A window longer than the whole period would make 0:(max_days - len) count
# downwards and create bogus windows
stopifnot(all(window_lengths <= max_days))

# One row per sliding window: every start day at which a window of the given
# length still fits inside the period. Built per length and bound once,
# instead of growing a data frame inside a nested loop.
window_data <- lapply(window_lengths, function(len) {
  starts <- 0:(max_days - len)
  data.frame(start = starts, end = starts + len, window_length = len)
}) |>
  bind_rows()

# Order the data by the start day (ascending) and create a new variable ID
window_data <- window_data |>
  arrange(start, window_length) |>
  mutate(var_id = row_number())

# Convert window_length to a factor for correct ordering in the legend
window_data$window_length <- factor(window_data$window_length, levels = sort(unique(window_data$window_length)))

# Plotting the sliding windows using ggplot2.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
ggplot(window_data, aes(x = start, xend = end, y = var_id, yend = var_id, color = window_length)) +
  geom_segment(linewidth = 1.5) +  # Line segments for each window
  scale_x_continuous(breaks = 0:max_days, limits = c(0, max_days)) +
  scale_y_continuous(breaks = seq_len(nrow(window_data))) +
  labs(
    title = "Sliding Windows: Y-axis Ordered by Start Day",
    x = "Days",
    y = "Variable ID (Ordered by Start Day)",
    color = "Window Length (days)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "top"
  )

```

```{r}
# Load the required library
library(ggplot2)

# Define total days and window lengths
max_days <- 28
window_lengths <- c(7, 14, 21, 28)

# A window longer than the whole period would make 0:(max_days - len) count
# downwards and create bogus windows
stopifnot(all(window_lengths <= max_days))

# One row per sliding window, shortest windows first, start days ascending
# within each length. Built per length and bound once, instead of growing a
# data frame inside a nested loop.
window_data <- do.call(
  rbind,
  lapply(sort(window_lengths), function(len) {
    starts <- 0:(max_days - len)
    data.frame(start = starts, end = starts + len, window_length = len)
  })
)

# Variable ID for each window, in the order the windows were generated
window_data$var_id <- seq_len(nrow(window_data))
window_data <- window_data[, c("start", "end", "var_id", "window_length")]

# Convert window_length to a factor for correct ordering in the legend
window_data$window_length <- factor(window_data$window_length, levels = sort(unique(window_data$window_length)))

# Plotting the sliding windows using ggplot2.
# `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated for lines.
# The y breaks stop at the last window; the original loop counter ended one
# past it.
ggplot(window_data, aes(x = start, xend = end, y = var_id, yend = var_id, color = window_length)) +
  geom_segment(linewidth = 1.5) +  # Line segments for each window
  scale_x_continuous(breaks = 0:max_days, limits = c(0, max_days)) +
  scale_y_continuous(breaks = seq_len(nrow(window_data))) +
  labs(
    title = "Sliding Windows: Each Variable Over 28 Days (Ordered by Length)",
    x = "Days",
    y = "Variable ID",
    color = "Window Length (days)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "top"
  )

```