Skip to content

Commit

Permalink
Merge pull request #7 from Eco-Flow/Fix_database_autmation_if_different
Browse files Browse the repository at this point in the history
Fix database automation if different in R processing
  • Loading branch information
chriswyatt1 authored Jun 12, 2024
2 parents 2068e8a + 51a7e18 commit d5883a9
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 76 deletions.
69 changes: 69 additions & 0 deletions bin/r_processing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/Rscript
library(dplyr)
library(hash)

args <- commandArgs(trailingOnly = TRUE)
small<- strsplit(args[1], "[.]")[[1]][2]

data <- read.table(args[1], header=F, sep="\t")
data <- data %>% mutate_all(na_if,"")

id <- data.frame(do.call('rbind', strsplit(as.character(data$V1), ';', fixed=TRUE)))[1]

size <- data.frame(do.call('rbind', strsplit(as.character(data$V1), '=', fixed=TRUE)))[2]
size$X2 <- as.numeric(size$X2)

classif <- data.frame(do.call('rbind', strsplit(as.character(data$V2),',',fixed=TRUE)))

h <- hash(k="kingdom", p="phylum", c="class", o="order", f="family", g="genus", s="species")

tax_vars = c()
outdf = c()
outdf <- cbind(id)
tablecolnames <- c("sample")

for (x in 1:length(classif) ) {
tvar=paste0("X", x)
my_res<-data.frame(do.call('rbind', strsplit(gsub(")", "",classif[[tvar]] ),'(',fixed=TRUE)))
my_tax <- strsplit(my_res$X1[1],':')[[1]][1]
full_name<- h[[paste0(strsplit(my_res$X1[1],':')[[1]][1])]]
my_res$X1 <- strsplit(my_res$X1[1],':')[[1]][2]
my_res$X2 <- as.numeric(my_res$X2)
tax_vars <- c(tax_vars, my_tax)
outdf <- cbind(outdf, my_res)
tablecolnames <- c(tablecolnames, full_name )
prob_name=paste0("prob_", full_name)
tablecolnames <- c(tablecolnames, prob_name )
}

outdf <- cbind(outdf, size)
tablecolnames <- c(tablecolnames, "size" )
colnames(outdf)<-tablecolnames

write.table(outdf, file=paste(small, ".classified.tsv", sep=""), quote=FALSE, sep='\t', row.names = FALSE)

#Function for making pie charts
pie_plots <- function(r){
sums <- aggregate(size ~ outdf[[r]], outdf, sum)
colnames(sums) <- c("tax", "size")
sums <- sums[order(sums$size, decreasing = TRUE),]
sums <- sums %>% mutate(perc = size/sum(size))
cutoff <- sum(sums$perc > 0.03)
top_val <- head(sums$tax, n = cutoff)
sums <- sums %>% mutate(legend_value = case_when(tax %in% top_val ~ tax, !(tax %in% top_val) ~ "OTHER" ))
pie_table <- aggregate(size~legend_value,sums,sum)
pdf (paste(small, ".", r, ".pdf", sep = ""), width=6, height=6)
pie(pie_table$size, pie_table$legend_value, clockwise = T)
dev.off()
}

#Make pie charts for order, family and genus
for (tax in c("family", "order", "genus")) {
pie_plots(tax)
}

#Generate summary stats
total_mapped_reads <- sum(aggregate(size~genus, outdf, sum)[2]) #just need total so does not matter which column used
unique_samples= nrow(outdf)
output <- cbind(unique_samples, total_mapped_reads)
write.table(output,"summary.tsv", quote=F, sep="\t", row.names = FALSE)
2 changes: 0 additions & 2 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@ params {
database = "s3://pollen-metabarcoding-test-data/data/viridiplantae_all_2014.sintax.fa"
FW_primer = "ATGCGATACTTGGTGTGAAT"
RV_primer = "GCATATCAATAAGCGGAGGA"
retain_untrimmed = true
fastq_maxee = 0.5
fastq_minlen = 250
fastq_maxns = 0
fasta_width = 0
minuniquesize = 2
derep_strand = "plus"
sizeout = true
sintax_strand = "both"
sintax_cutoff = 0.95
// Max resources for test input data can be reduced
Expand Down
2 changes: 0 additions & 2 deletions conf/test_small.config
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@ params {
database = "s3://pollen-metabarcoding-test-data/data/viridiplantae_all_2014.sintax.fa"
FW_primer = "ATGCGATACTTGGTGTGAAT"
RV_primer = "GCATATCAATAAGCGGAGGA"
retain_untrimmed = true
fastq_maxee = 0.5
fastq_minlen = 250
fastq_maxns = 0
fasta_width = 0
minuniquesize = 2
derep_strand = "plus"
sizeout = true
sintax_strand = "both"
sintax_cutoff = 0.95
// Max resources for test input data can be reduced
Expand Down
75 changes: 3 additions & 72 deletions modules/local/r_processing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process R_PROCESSING {
tag "${meta.id}"
label 'process_low'

container = 'ecoflowucl/rocker-r_base:r-base-4.3.3_dplyr-1.1.4'
container = 'ecoflowucl/rocker-r_base:r-base-4.3.3_dplyr-1.1.4_hash'

input:
tuple val(meta), path(sintax_tsv)
Expand All @@ -14,76 +14,7 @@ process R_PROCESSING {

script:
"""
#!/usr/bin/Rscript
library(dplyr)
data <- read.table("${sintax_tsv}", header=F, sep="\t")
data <- data %>% mutate_all(na_if,"")
id <- data.frame(do.call('rbind', strsplit(as.character(data\$V1), ';', fixed=TRUE)))[1]
size <- data.frame(do.call('rbind', strsplit(as.character(data\$V1), '=', fixed=TRUE)))[2]
size\$X2 <- as.numeric(size\$X2)
classif <- data.frame(do.call('rbind', strsplit(as.character(data\$V2),',',fixed=TRUE)))
k <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X1),'(',fixed=TRUE)))
k\$X1 <- gsub("k:", "", k\$X1)
k\$X2 <- as.numeric(k\$X2)
p <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X2),'(',fixed=TRUE)))
p\$X1 <- gsub("p:", "", p\$X1)
p\$X2 <- as.numeric(p\$X2)
c <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X3),'(',fixed=TRUE)))
c\$X1 <- gsub("c:", "", c\$X1)
c\$X2 <- as.numeric(c\$X2)
o <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X4),'(',fixed=TRUE)))
o\$X1 <- gsub("o:", "", o\$X1)
o\$X2 <- as.numeric(o\$X2)
f <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X5),'(',fixed=TRUE)))
f\$X1 <- gsub("f:", "", f\$X1)
f\$X2 <- as.numeric(f\$X2)
g <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X6),'(',fixed=TRUE)))
g\$X1 <- gsub("g:", "", g\$X1)
g\$X2 <- as.numeric(g\$X2)
s <- data.frame(do.call('rbind', strsplit(gsub(")", "",classif\$X7),'(',fixed=TRUE)))
s\$X1 <- gsub("s:", "", s\$X1)
s\$X2 <- as.numeric(s\$X2)
outdf <- cbind(id, k, p, c, o, f, g, s, size)
colnames(outdf)[c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)] <- c("sample", "kingdom", "prob_kingdom", "division", "prob_division", "clade", "prob_clade", "order", "prob_order", "family", "prob_family", "genus", "prob_genus", "species", "prob_species", "size")
write.table(outdf, file=paste("${meta.id}", ".classified.tsv", sep=""), quote=FALSE, sep='\t', row.names = FALSE)
#Function for making pie charts
pie_plots <- function(r){
sums <- aggregate(size ~ outdf[[r]], outdf, sum)
colnames(sums) <- c("tax", "size")
sums <- sums[order(sums\$size, decreasing = TRUE),]
sums <- sums %>% mutate(perc = size/sum(size))
cutoff <- sum(sums\$perc > 0.03)
top_val <- head(sums\$tax, n = cutoff)
sums <- sums %>% mutate(legend_value = case_when(tax %in% top_val ~ tax, !(tax %in% top_val) ~ "OTHER" ))
pie_table <- aggregate(size~legend_value,sums,sum)
pdf (paste("${meta.id}", ".", r, ".pdf", sep = ""), width=6, height=6)
pie(pie_table\$size, pie_table\$legend_value, clockwise = T)
dev.off()
}
#Make pie charts for order, family and genus
for (tax in c("family", "order", "genus")) {
pie_plots(tax)
}
#Generate summary stats
total_mapped_reads <- sum(aggregate(size~genus, outdf, sum)[2]) #just need total so does not matter which column used
unique_samples= nrow(outdf)
output <- cbind(unique_samples, total_mapped_reads)
write.table(output,"summary.tsv", quote=F, sep="\t", row.names = FALSE)
#Run the R script in \$projectDir/bin/r_processing.R
r_processing.R ${sintax_tsv}
"""
}

0 comments on commit d5883a9

Please sign in to comment.