Skip to content

Commit

Permalink
Update 08.1-cluster-accessions.R
Browse files Browse the repository at this point in the history
  • Loading branch information
kauedesousa committed Mar 25, 2024
1 parent a283a42 commit 6ab62e4
Showing 1 changed file with 234 additions and 43 deletions.
277 changes: 234 additions & 43 deletions script/08.1-cluster-accessions.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,27 @@ library("terra")
library("geodata")
library("tidyverse")
library("magrittr")
library("car")
library("patchwork")
library("broom")
library("sf")
source("script/helper-01-functions.r")
# ....................................
# ....................................
# Input data #####
# read file with crop parameters
output = "output/cluster-accessions/"
dir.create(output, recursive = TRUE, showWarnings = FALSE)

list.files("data")
plant_spp = read.csv("data/calib-eco-pars.csv")

biovif =
adm = st_read("data/world-map/world_borders_adm0.shp")
adm = st_as_sf(adm)
adm = adm[adm$CONTINENT != "Antarctica", ]

biovif = read.csv("data/vif-bioclim-selection.csv")
biovif = gsub("_", "", biovif$bio)
biovif = paste0("bio", 1:19)[paste0("bio", 1:19) %in% biovif]

sel = list.files("data/cwr-processed", pattern = ".csv")
sel = gsub(".csv", "", sel)
Expand All @@ -40,6 +53,8 @@ plant_spp = plant_spp[plant_spp$NAME %in% sel, ]

plant_spp = plant_spp[!duplicated(plant_spp$NAME), ]

# ....................................
# ....................................
# read crop wild relatives data
# run over crop files
cwr = list()
Expand Down Expand Up @@ -77,7 +92,8 @@ table(cwr$NAME)
# ggplot(cwr, aes(x = x, y = y, color = SPAM_Code)) +
# geom_point()


# ....................................
# ....................................
# read landrace data
# run over crop files
landrace = list()
Expand Down Expand Up @@ -123,7 +139,6 @@ rm(landrace, cwr)

head(dat)


# ggplot(dat, aes(x = x, y = y, color = GR)) +
# geom_point()

Expand All @@ -150,6 +165,8 @@ bio = worldclim_global("bio", res = 5, wcpath)
bionames = paste0("bio", 1:19)
names(bio) = bionames

all(biovif %in% names(bio))

# ....................................
# ....................................
# Extract climate data #####
Expand All @@ -163,47 +180,221 @@ dat2 = cbind(dat, bio_e)

head(dat2)

dat2$group = paste(dat$GR, dat$SPAM_Code, sep = " - ")

table(dat2$group)

groups = unique(dat2$group)

i = 5

groups[i]

keep = dat2$group == groups[i]
dat2$group = paste(dat$NAME, dat$GR, sep = " - ")

d1 = dat2[keep, bionames]
dat2$group = gsub("African rice", "Rice", dat2$group)

d = scale(d1)

d = dist(d)

#hc = hclust(d, method = "ward.D2")
#plot(hc, cex = 0.5, hang = -1)
#k = cutree(hc, h = 50)
#table(k)


set.seed(2035)
hc = kmeans(d, 7, nstart = 25)

print(hc)

fviz_cluster(hc, data = d)

longlat = dat[keep, c("x", "y")]

longlat$clust = as.factor(hc$cluster)

ggplot(longlat, aes(x = x, y = y, color = clust)) +
geom_point()



aggregate(dat2[keep, bionames], by=list(cluster = hc$cluster), mean)
table(dat2$group)

groups = sort(unique(dat2$group))

pca_list = list()
map_list = list()

for (i in seq_along(groups)){

print(groups[i])

keep = dat2$group == groups[i]

di = dat2[keep, ]

di[di == 0] = NA

di = na.omit(di)

d = scale(di[bionames])

d = dist(d)

hc = hclust(d, method = 'ward.D2')
# plot(hc, cex = 0.5, hang = -1)

# select optimal cutoff point using lm
false = FALSE
nk = 8
while(isFALSE(false)) {

k = cutree(hc, k = nk)
di$clust = as.factor(k)

dil = di[c("clust", biovif)] %>%
pivot_longer(!clust , names_to = "bio", values_to = "value") %>%
group_by(clust, bio)

dil$value = ifelse(dil$value <= 0, dil$value + 100, dil$value)

dil$value = log(dil$value)

mod = lm(value ~ clust, data = dil)

false = all(tidy(mod)[,5] < 0.001)

if(isTRUE(false)) {
false = TRUE
}

if(isFALSE(false)) {
nk = nk - 1
}

if(isTRUE(nk < 3)){
false = TRUE
nk = 3
}

}

cat("using", nk, "clusters for", groups[i], "\n")

pc = scale(di[bionames])

PCA2 = princomp(pc)

pc_plot = plot_pca(PCA2, labels = di$clust, scale = 10) +
scale_color_brewer(palette = "Set1", direction = -1) +
labs(title = groups[i])

pca_list[[i]] = pc_plot

ggsave(gsub(" ", "", paste0(output, groups[i], "-pca-clust.pdf")),
plot = pc_plot,
width = 20,
height = 20,
units = "cm")

map_clust = ggplot() +
geom_sf(adm$geometry,
mapping = aes(),
colour = "white",
fill = "#828282") +
geom_jitter(data = di, aes(x = x, y = y, color = clust)) +
theme_void() +
scale_color_brewer(palette = "Set1") +
theme(legend.text = element_text(size = 14),
legend.position = c(0.11, 0.45),
plot.title = element_text(hjust = 0.5)) +
labs(color = groups[i])

map_list[[i]] = map_clust

ggsave(gsub(" ", "", paste0(output, groups[i], "-map-clust.pdf")),
plot = map_clust,
width = 40,
height = 20,
units = "cm")

# take descriptive stats by cluster
sums = di[c("clust", biovif)] %>%
pivot_longer(!clust , names_to = "bio", values_to = "value") %>%
group_by(clust, bio) %>%
summarise(mean = mean(value),
median = median(value),
sd = sd(value),
min = min(value),
max = max(value)) %>%
ungroup()

write.csv(sums,
gsub(" ", "", paste0(output, groups[i], "-summaries.csv")),
row.names = FALSE)

# plot points by clusters
boxp = di[c("clust", biovif)] %>%
pivot_longer(!clust , names_to = "bio", values_to = "value") %>%
ggplot(aes(x = bio, y = value, fill = clust)) +
geom_boxplot() +
facet_wrap(bio ~ ., scale = "free") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(x = "",
y = "Value",
fill = groups[i])

ggsave(gsub(" ", "", paste0(output, groups[i], "-boxplot-clust.pdf")),
plot = boxp,
width = 25,
height = 25,
units = "cm")

densp = di[c("clust", biovif)] %>%
pivot_longer(!clust , names_to = "bio", values_to = "value") %>%
ggplot(aes(x = value, fill = clust)) +
geom_density(alpha = 0.2) +
facet_wrap(bio ~ ., scale = "free") +
scale_fill_brewer(palette = "Set1") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(x = "",
y = "Density",
fill = groups[i])

ggsave(gsub(" ", "", paste0(output, groups[i], "-densityplot-clust.pdf")),
plot = densp,
width = 25,
height = 25,
units = "cm")

}


groups

grep("CWR", groups)

map_cwr = map_list[[1]] + map_list[[3]] +
map_list[[5]] + map_list[[7]] +
map_list[[9]] + map_list[[11]] +
map_list[[13]] + map_list[[15]] +
map_list[[17]] + map_list[[19]] +
plot_layout(ncol = 2)

ggsave(paste0(output, "all-cwr-map.png"),
plot = map_cwr,
width = 50,
height = 70,
units = "cm",
dpi = 500)

map_landr = map_list[[2]] + map_list[[4]] +
map_list[[6]] + map_list[[8]] +
map_list[[10]] + map_list[[12]] +
map_list[[14]] + map_list[[16]] +
map_list[[18]] + map_list[[20]] +
plot_layout(ncol = 2)

ggsave(paste0(output, "all-landrace-map.png"),
plot = map_landr,
width = 50,
height = 70,
units = "cm",
dpi = 500)

pca_cwr = pca_list[[1]] + pca_list[[3]] +
pca_list[[5]] + pca_list[[7]] +
pca_list[[9]] + pca_list[[11]] +
pca_list[[13]] + pca_list[[15]] +
pca_list[[17]] + pca_list[[19]] +
plot_layout(ncol = 2)

ggsave(paste0(output, "all-cwr-pca.png"),
plot = pca_cwr,
width = 40,
height = 65,
units = "cm",
dpi = 500)

pca_landr = pca_list[[2]] + pca_list[[4]] +
pca_list[[6]] + pca_list[[8]] +
pca_list[[10]] + pca_list[[12]] +
pca_list[[14]] + pca_list[[16]] +
pca_list[[18]] + pca_list[[20]] +
plot_layout(ncol = 2)

ggsave(paste0(output, "all-landrace-pca.png"),
plot = pca_landr,
width = 40,
height = 65,
units = "cm",
dpi = 500)

0 comments on commit 6ab62e4

Please sign in to comment.