Merge pull request #135 from TORCH-Consortium/develop

Add samplesheet validation
TORCH-Consortium · Nov 6, 2022 · c9fa840 · c9fa840
2 parents fa4a5c4 + 10e1aea
commit c9fa840
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 52 deletions.
diff --git a/bin/samplesheet_validation.py b/bin/samplesheet_validation.py
@@ -27,7 +27,8 @@
         fail = True
 
 if not fail:
-    print('No errors found')
+    print('Samplesheet validation checks passed')
     exit(0)
 else:
+    print('Samplesheet validation checks failed')
     exit(1)
diff --git a/conf/conda_local.config b/conf/conda_local.config
@@ -9,7 +9,7 @@ conda {
 process {
 
     withName:
-    'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*' {
+    'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*|SAMPLESHEET.*' {
         //environment does exist:
         conda = "${params.conda_envs_location}/xbs-nf-env-1"
 

diff --git a/conf/docker.config b/conf/docker.config
@@ -5,7 +5,7 @@ process {
     //----------------------------------------------
 
     withName:
-    'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*' {
+    'GATK.*|LOFREQ.*|DELLY.*|TBPROFILER.*|MULTIQC.*|FASTQC.*|UTILS.*|FASTQ.*|SAMPLESHEET.*' {
         container = "rg.nl-ams.scw.cloud/xbs-nf-containers/xbs-nf-container-1:0.9.8"
     }
 

diff --git a/main.nf b/main.nf
@@ -13,50 +13,6 @@ include { MERGE_WF } from './workflows/merge_wf.nf'
 include { QUALITY_CHECK_WF } from './workflows/quality_check_wf.nf'
 include { REPORTS_WF } from './workflows/reports_wf.nf'
 
-//================================================================================
-// Prepare channels
-//================================================================================
-
-
-//NOTE: Expected structure of input CSV samplesheet
-//   0     1       2       3    4  5     6      7       8
-// Study,Sample,Library,Attempt,R1,R2,Flowcell,Lane,Index Sequence
-
-reads_ch = Channel.fromPath(params.input_samplesheet)
-        .splitCsv(header: false, skip: 1)
-        .map { row -> {
-                    study           = row[0]
-                    sample          = row[1]
-                    library         = row[2]
-                    attempt         = row[3]
-                    read1           = row[4]
-                    read2           = row[5]
-                    flowcell        = row[6]
-                    lane            = row[7]
-                    index_sequence  = row[8]
-
-            //NOTE: Platform is hard-coded to illumina
-            bam_rg_string ="@RG\\tID:${flowcell}.${lane}\\tSM:${study}.${sample}\\tPL:illumina\\tLB:lib${library}\\tPU:${flowcell}.${lane}.${index_sequence}"
-
-            unique_sample_id = "${study}.${sample}.L${library}.A${attempt}.${flowcell}.${lane}.${index_sequence}"
-
-            //Accomodate single/multi reads
-            if (read1 && read2) {
-
-                return tuple(unique_sample_id, bam_rg_string, tuple(file(read1), file(read2)))
-
-            } else if (read1) {
-
-                return tuple(unique_sample_id, bam_rg_string, tuple(file(read1)))
-
-            } else {
-
-                return tuple(unique_sample_id, bam_rg_string, tuple(file(read2)))
-
-            }
-        }
-    }
-
 
 //================================================================================
 // Main workflow
@@ -66,11 +22,11 @@ workflow {
 
     if (params.only_validate_fastqs) {
 
-        VALIDATE_FASTQS_WF(reads_ch)
+        VALIDATE_FASTQS_WF(params.input_samplesheet)
 
     } else {
 
-        validated_reads_ch = VALIDATE_FASTQS_WF(reads_ch)
+        validated_reads_ch = VALIDATE_FASTQS_WF(params.input_samplesheet)
 
         QUALITY_CHECK_WF(validated_reads_ch)
 
@@ -79,7 +35,6 @@ workflow {
 
         MULTIPLE_INFECTIONS_WF(MAP_WF.out.rejected_sorted_reads_ch)
 
-
         CALL_WF(MAP_WF.out.approved_sorted_reads_ch)
 
         collated_gvcfs_ch = CALL_WF.out.gvcf_ch

diff --git a/modules/utils/samplesheet_validation.nf b/modules/utils/samplesheet_validation.nf
@@ -0,0 +1,14 @@
+process SAMPLESHEET_VALIDATION {
+    // Gate step: runs samplesheet_validation.py on the given samplesheet and passes the same file through as output.
+    input:
+        path(samplesheet)   // samplesheet file handed to the validation script
+    // The output re-declares the input path, so downstream consumers receive the samplesheet only once this task has completed.
+    output:
+        path(samplesheet)
+    // NOTE(review): the script is invoked by bare name, so it must be resolvable on PATH inside the task (presumably via the pipeline's bin/ directory) - confirm. A non-zero exit from the script fails this task.
+    script:
+
+        """
+        samplesheet_validation.py ${samplesheet}
+        """
+}
diff --git a/workflows/validate_fastqs_wf.nf b/workflows/validate_fastqs_wf.nf
@@ -1,13 +1,53 @@
 include { FASTQ_VALIDATOR } from '../modules/fastq_utils/validator.nf' addParams ( params.FASTQ_VALIDATOR  )
 include { UTILS_FASTQ_COHORT_VALIDATION } from '../modules/utils/fastq_cohort_validation.nf' addParams ( params.UTILS_FASTQ_COHORT_VALIDATION  )
+include { SAMPLESHEET_VALIDATION } from '../modules/utils/samplesheet_validation.nf' 
 
 workflow VALIDATE_FASTQS_WF {
     take:
-         reads_ch
+         samplesheet
 
     main:
+        SAMPLESHEET_VALIDATION(samplesheet)
+
+        //NOTE: Expected structure of input CSV samplesheet
+        //   0     1       2       3    4  5     6      7       8
+        // Study,Sample,Library,Attempt,R1,R2,Flowcell,Lane,Index Sequence
+
+        reads_ch = SAMPLESHEET_VALIDATION.out
+                    .splitCsv(header: false, skip: 1)
+                    .map { row -> {
+                                study           = row[0]
+                                sample          = row[1]
+                                library         = row[2]
+                                attempt         = row[3]
+                                read1           = row[4]
+                                read2           = row[5]
+                                flowcell        = row[6]
+                                lane            = row[7]
+                                index_sequence  = row[8]
+
+                    //NOTE: Platform is hard-coded to illumina
+                    bam_rg_string ="@RG\\tID:${flowcell}.${lane}\\tSM:${study}.${sample}\\tPL:illumina\\tLB:lib${library}\\tPU:${flowcell}.${lane}.${index_sequence}"
+
+                    unique_sample_id = "${study}.${sample}.L${library}.A${attempt}.${flowcell}.${lane}.${index_sequence}"
+
+                    //Accommodate single/multi reads
+                    if (read1 && read2) {
+
+                        return tuple(unique_sample_id, bam_rg_string, tuple(file(read1), file(read2)))
+
+                    } else if (read1) {
+
+                        return tuple(unique_sample_id, bam_rg_string, tuple(file(read1)))
+
+                    } else {
+
+                        return tuple(unique_sample_id, bam_rg_string, tuple(file(read2)))
+
+                    }
+                }
+            }
 
-        //FIXME: Add the samplesheet validator process for samplesheet_validation.py script
 
         FASTQ_VALIDATOR(reads_ch)