From fec19623a51d3d080aedf50df509cd4aba7148b4 Mon Sep 17 00:00:00 2001 From: Dongze He <171858310+an-altosian@users.noreply.github.com> Date: Mon, 20 Jan 2025 11:34:03 -0800 Subject: [PATCH 01/11] update simpleaf subworkflow --- modules/local/alevinqc.nf | 20 +++++----- nextflow.config | 2 +- subworkflows/local/alevin.nf | 69 -------------------------------- subworkflows/local/simpleaf.nf | 72 ++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 79 deletions(-) delete mode 100644 subworkflows/local/alevin.nf create mode 100644 subworkflows/local/simpleaf.nf diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf index 777a1371..10613867 100644 --- a/modules/local/alevinqc.nf +++ b/modules/local/alevinqc.nf @@ -1,20 +1,22 @@ process ALEVINQC { // - // This module executes alevinfry QC reporting tool on alevin results + // This module executes alevinfry QC reporting tool on alevin-fry results // tag "$meta.id" label 'process_low' - //The alevinqc 1.14.0 container is broken, missing some libraries - thus reverting this to previous 1.12.1 version - conda "bioconda::bioconductor-alevinqc=1.12.1" + conda "bioconda::bioconductor-alevinqc=1.18.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' : - 'biocontainers/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-alevinqc:1.18.0--r43hf17093f_0' : + 'biocontainers/bioconductor-alevinqc:1.18.0--r43hf17093f_0' }" + // all metas are the same input: - tuple val(meta), path(alevin_results) + tuple val(meta), path(quant_dir) + tuple val(meta1), path(permit_dir) + tuple val(meta2), path(map_dir) output: tuple val(meta), path("alevin_report_${meta.id}.html"), emit: report @@ -29,9 +31,9 @@ process ALEVINQC { #!/usr/bin/env Rscript require(alevinQC) alevinFryQCReport( - mapDir = "${alevin_results}/af_map", - quantDir = "${alevin_results}/af_quant", - permitDir= "${alevin_results}/af_quant", + mapDir = "${map_dir}", + permitDir= "${permit_dir}", + quantDir = "${quant_dir}", sampleId = "${prefix}", outputFile = "alevin_report_${meta.id}.html", outputFormat = "html_document", diff --git a/nextflow.config b/nextflow.config index e25156ea..2ef490a7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,7 +26,7 @@ params { // alevin-fry parameters (simpleaf) simpleaf_rlen = 91 barcode_whitelist = null - salmon_index = null + simpleaf_index = null // kallisto bustools parameters kallisto_index = null diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf deleted file mode 100644 index ae98cc85..00000000 --- a/subworkflows/local/alevin.nf +++ /dev/null @@ -1,69 +0,0 @@ -/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ -include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptome' -include { ALEVINQC } from '../../modules/local/alevinqc' -include { SIMPLEAF_INDEX } from '../../modules/local/simpleaf_index' -include { SIMPLEAF_QUANT } from '../../modules/local/simpleaf_quant' - -/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ -include { GUNZIP } from '../../modules/nf-core/gunzip/main' -include { GFFREAD as GFFREAD_TXP2GENE } from '../../modules/nf-core/gffread/main' - -def multiqc_report = [] - -workflow SCRNASEQ_ALEVIN { - - take: - genome_fasta - gtf - transcript_fasta - salmon_index - txp2gene - barcode_whitelist - protocol - ch_fastq - - - main: - ch_versions = Channel.empty() - - assert (genome_fasta && gtf && salmon_index && txp2gene) || (genome_fasta && gtf) || (genome_fasta && gtf && transcript_fasta && txp2gene): - """Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf'), or a genome fasta file - and a transcriptome fasta file ('--transcript_fasta`) if no index and txp2gene is given!""".stripIndent() - - /* - * Build salmon index - */ - if (!salmon_index) { - SIMPLEAF_INDEX( genome_fasta, transcript_fasta, gtf ) - salmon_index = SIMPLEAF_INDEX.out.index.collect() - transcript_tsv = SIMPLEAF_INDEX.out.transcript_tsv.collect() - ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) - - if (!txp2gene) { - txp2gene = SIMPLEAF_INDEX.out.transcript_tsv - } - } - - /* - * Perform quantification with salmon alevin - */ - SIMPLEAF_QUANT ( - ch_fastq, - salmon_index, - txp2gene, - protocol, - barcode_whitelist - ) - ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) - - /* - * Run alevinQC - */ - ALEVINQC( SIMPLEAF_QUANT.out.alevin_results ) - ch_versions = ch_versions.mix(ALEVINQC.out.versions) - - emit: - ch_versions - alevin_results = SIMPLEAF_QUANT.out.alevin_results.map{ meta, files -> [meta + [input_type: 'raw'], files] } - alevinqc = ALEVINQC.out.report -} diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf new file mode 100644 index 00000000..f43aad4d --- /dev/null +++ b/subworkflows/local/simpleaf.nf @@ -0,0 +1,72 @@ +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { ALEVINQC } from '../../modules/local/alevinqc' +include { SIMPLEAF_INDEX } from '../../../modules/modules/nf-core/simpleaf/index' +include { SIMPLEAF_QUANT } from '../../../modules/modules/nf-core/simpleaf/quant' + +def multiqc_report = [] + +workflow SCRNASEQ_SIMPLEAF { + + take: + genome_fasta + gtf + transcript_fasta + simpleaf_index + txp2gene + barcode_whitelist + resolution + ch_fastq + map_dir + + main: + ch_versions = Channel.empty() + + // we have four types of input: + // 1. genome fasta and gtf -> build augmented index including spliced transcripts and intronic information + // 2. transcript fasta and t2g file -> build index directly from transcript fasta + // 3. simpleaf index and t2g file -> use the index directly + // 4. mapping results -> skip mapping + assert ( txp2gene && simpleaf_index && !transcript_fasta && !genome_fasta && !gtf&& !map_dir ) || // existing index + ( txp2gene && !simpleaf_index && transcript_fasta && !genome_fasta && !gtf && !map_dir ) || // transcript fasta + ( !simpleaf_index && !transcript_fasta && genome_fasta && gtf && !map_dir ) || // genome fasta and gtf + ( !simpleaf_index && !transcript_fasta && !genome_fasta && !gtf && map_dir ) : // existing mapping + """Must provide either of the followings: (i) --genome_fasta + --gtf, (ii) --transcript_fasta + --txp2gene, (iii) --simpleaf_index + --txp2gene, and (iv) --map_dir""".stripIndent() + + /* + * Build salmon index + */ + if ( !simpleaf_index && !map_dir ) { + SIMPLEAF_INDEX( genome_fasta, gtf, transcript_fasta ) + simpleaf_index = SIMPLEAF_INDEX.out.index.collect() + transcript_tsv = SIMPLEAF_INDEX.out.transcript_tsv.collect() + ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) + + if (!txp2gene) { + txp2gene = transcript_tsv + } + } + + /* + * Perform quantification with salmon alevin + */ + SIMPLEAF_QUANT ( + ch_fastq, + simpleaf_index, + txp2gene, + resolution, + barcode_whitelist, + map_dir // this takes a directory of mapping result. Not applicable here + ) + ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) + + /* + * Run alevinQC + */ + ALEVINQC( SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.map ) + ch_versions = ch_versions.mix(ALEVINQC.out.versions) + + emit: + ch_versions + simpleaf_results = SIMPLEAF_QUANT.out.simpleaf.map{ meta, files -> [meta + [input_type: 'raw'], files] } + alevinqc = ALEVINQC.out.report +} From c6b87c4b5d356734fa0cef8d3fd951452171a8d1 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Tue, 21 Jan 2025 17:06:24 +0000 Subject: [PATCH 02/11] adopt new simpleaf modules --- nextflow.config | 2 + nextflow_schema.json | 11 ++++- subworkflows/local/simpleaf.nf | 74 +++++++++++++++++++++++----------- workflows/scrnaseq.nf | 19 +++++---- 4 files changed, 72 insertions(+), 34 deletions(-) diff --git a/nextflow.config b/nextflow.config index 2ef490a7..31e5d784 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,6 +25,8 @@ params { // alevin-fry parameters (simpleaf) simpleaf_rlen = 91 + simpleaf_index = null + simpleaf_resolution = "cr-like" barcode_whitelist = null simpleaf_index = null diff --git a/nextflow_schema.json b/nextflow_schema.json index d4a24abe..3d4aaae0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -163,9 +163,9 @@ "description": "", "default": "", "properties": { - "salmon_index": { + "simpleaf_index": { "type": "string", - "description": "This can be used to specify a precomputed Salmon index in the pipeline, in order to skip the generation of required indices by Salmon itself.", + "description": "This can be used to specify a precomputed Simpleaf index, either generated by Salmon or Piscem, in the pipeline, in order to skip the generation of required indices by Simpleaf itself.", "fa_icon": "fas fa-fish", "format": "path", "exists": true @@ -182,6 +182,13 @@ "default": 91, "description": "It is the target read length the index will be built for, using simpleaf.", "fa_icon": "fas fa-map-marked-alt" + }, + "simpleaf_resolution": { + "type": "string", + "default": "cr-like", + "enum": ["cr-like", "cr-like-em", "parsimony", "parsimony-em", "parsimony-gene", "parsimony-gene-em"], + "description": "UMI resolution strategy to deduplicate UMIs.", + "fa_icon": "fas fa-map-marked-alt" } } }, diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index f43aad4d..68604a00 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -8,54 +8,77 @@ def multiqc_report = [] workflow SCRNASEQ_SIMPLEAF { take: - genome_fasta - gtf + genome_fasta // channel + genome_gtf // channel transcript_fasta simpleaf_index txp2gene barcode_whitelist + chemistry resolution - ch_fastq + ch_fastq // channel map_dir main: ch_versions = Channel.empty() - // we have four types of input: - // 1. genome fasta and gtf -> build augmented index including spliced transcripts and intronic information - // 2. transcript fasta and t2g file -> build index directly from transcript fasta - // 3. simpleaf index and t2g file -> use the index directly - // 4. mapping results -> skip mapping - assert ( txp2gene && simpleaf_index && !transcript_fasta && !genome_fasta && !gtf&& !map_dir ) || // existing index - ( txp2gene && !simpleaf_index && transcript_fasta && !genome_fasta && !gtf && !map_dir ) || // transcript fasta - ( !simpleaf_index && !transcript_fasta && genome_fasta && gtf && !map_dir ) || // genome fasta and gtf - ( !simpleaf_index && !transcript_fasta && !genome_fasta && !gtf && map_dir ) : // existing mapping - """Must provide either of the followings: (i) --genome_fasta + --gtf, (ii) --transcript_fasta + --txp2gene, (iii) --simpleaf_index + --txp2gene, and (iv) --map_dir""".stripIndent() - /* * Build salmon index */ - if ( !simpleaf_index && !map_dir ) { - SIMPLEAF_INDEX( genome_fasta, gtf, transcript_fasta ) + if ( !simpleaf_index || !map_dir ) { + // define input channels for index building + // we can either use the genome fasta and gtf files or the transcript fasta file + if ( transcript_fasta ) { + ch_genome_fasta_gtf = [ [:],[],[] ] + ch_transcript_fasta = Channel.of( [ [id: transcript_fasta.baseName], transcript_fasta ] ) + } else { + ch_genome_fasta_gtf = genome_fasta.combine( genome_gtf ).map{ fasta, gtf -> [[id: gtf.baseName], fasta, gtf] } + ch_transcript_fasta = Channel.of( [ [:], [] ] ) + } + + SIMPLEAF_INDEX( + ch_genome_fasta_gtf, + ch_transcript_fasta + ) + // Channel of tuple(meta, index dir) simpleaf_index = SIMPLEAF_INDEX.out.index.collect() - transcript_tsv = SIMPLEAF_INDEX.out.transcript_tsv.collect() + // Channel of t2g path or empty + t2g = SIMPLEAF_INDEX.out.t2g.collect() ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) + // ensure txp2gene is a Channel if (!txp2gene) { - txp2gene = transcript_tsv + txp2gene = t2g + } else { + txp2gene = Channel.of( txp2gene ) } + } else { + // ensure simpleaf index and txp2gene are Channels + simpleaf_index = Channel.of( [ [:], simpleaf_index ] ) + txp2gene = Channel.of( txp2gene ) + } + + // define input channels for quantification + // we can either use the mapping results or the reads and index files + if ( map_dir ) { + ch_chemistry_reads = Channel.of( [ [:],[],[] ] ) + ch_index_t2g = Channel.of( [ [:],[],[] ] ) + ch_map_dir = Channel.of( [ [id: map_dir.baseName], map_dir ] ) + } else { + ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } + ch_index_t2g = simpleaf_index.combine( txp2gene ) + ch_map_dir = [ [:],[],[] ] } /* * Perform quantification with salmon alevin */ SIMPLEAF_QUANT ( - ch_fastq, - simpleaf_index, - txp2gene, + ch_chemistry_reads, + ch_index_t2g, + [[:], "unfiltered-pl", [], barcode_whitelist ], resolution, - barcode_whitelist, - map_dir // this takes a directory of mapping result. Not applicable here + ch_map_dir ) ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) @@ -65,8 +88,11 @@ workflow SCRNASEQ_SIMPLEAF { ALEVINQC( SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.map ) ch_versions = ch_versions.mix(ALEVINQC.out.versions) + emit: ch_versions - simpleaf_results = SIMPLEAF_QUANT.out.simpleaf.map{ meta, files -> [meta + [input_type: 'raw'], files] } + index = simpleaf_index + map = SIMPLEAF_QUANT.out.map + quant = SIMPLEAF_QUANT.out.quant.map{ meta, files -> [meta + [input_type: meta["count_type"]], files] } alevinqc = ALEVINQC.out.report } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 791b3713..5f75e79a 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -11,7 +11,7 @@ include { methodsDescriptionText } from '../subworkfl include { getGenomeAttribute } from '../subworkflows/local/utils_nfcore_scrnaseq_pipeline' include { FASTQC_CHECK } from '../subworkflows/local/fastqc' include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' -include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' +include { SCRNASEQ_SIMPLEAF } from '../subworkflows/local/simpleaf' include { STARSOLO } from '../subworkflows/local/starsolo' include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" include { CELLRANGER_MULTI_ALIGN } from "../subworkflows/local/align_cellrangermulti" @@ -64,7 +64,7 @@ workflow SCRNASEQ { kb_t2c = params.kb_t2c ? file(params.kb_t2c, checkIfExists: true) : [] //salmon params - ch_salmon_index = params.salmon_index ? file(params.salmon_index, checkIfExists: true) : [] + ch_simpleaf_index = params.simpleaf_index ? file(params.simpleaf_index, checkIfExists: true) : [] //star params star_index = params.star_index ? file(params.star_index, checkIfExists: true) : null @@ -135,19 +135,22 @@ workflow SCRNASEQ { // Run salmon alevin pipeline if (params.aligner == "alevin") { - SCRNASEQ_ALEVIN( + + SCRNASEQ_SIMPLEAF( ch_genome_fasta, ch_filter_gtf, ch_transcript_fasta, - ch_salmon_index, + ch_simpleaf_index, ch_txp2gene, ch_barcode_whitelist, protocol_config['protocol'], - ch_fastq + params.simpleaf_resolution, + ch_fastq, + [] // for existing map dir; not applicable ) - ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) - ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_ALEVIN.out.alevin_results.map{ meta, it -> it }) - ch_mtx_matrices = ch_mtx_matrices.mix( SCRNASEQ_ALEVIN.out.alevin_results ) + ch_versions = ch_versions.mix(SCRNASEQ_SIMPLEAF.out.ch_versions) + ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_SIMPLEAF.out.quant.map{ meta, it -> it }) + ch_mtx_matrices = ch_mtx_matrices.mix( SCRNASEQ_SIMPLEAF.out.quant ) } // Run STARSolo pipeline From f3e2977a51bba00218d261231546ce6ca9831862 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Wed, 22 Jan 2025 03:33:35 +0000 Subject: [PATCH 03/11] tested changes --- conf/modules.config | 6 +++-- core.1739377 | 0 docs/output.md | 2 +- main.nf | 4 ++-- modules/local/alevinqc.nf | 4 ++-- modules/local/mtx_to_h5ad.nf | 2 ++ ...h5ad_alevin.py => mtx_to_h5ad_simpleaf.py} | 2 +- nextflow.config | 6 ++--- nextflow_schema.json | 8 +------ subworkflows/local/simpleaf.nf | 24 +++++++++---------- workflows/scrnaseq.nf | 19 ++++++++++++--- 11 files changed, 43 insertions(+), 34 deletions(-) create mode 100644 core.1739377 rename modules/local/templates/{mtx_to_h5ad_alevin.py => mtx_to_h5ad_simpleaf.py} (97%) diff --git a/conf/modules.config b/conf/modules.config index 5b2dd9e7..8aff4655 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -41,6 +41,7 @@ process { } withName: 'ANNDATA_BARCODES' { ext.prefix = { "${meta.id}_${meta.input_type}_matrix" } + // ext.prefix = { "${meta.id}_filtered_matrix" } publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" }, mode: params.publish_dir_mode, @@ -139,7 +140,9 @@ if (params.aligner == "alevin") { mode: params.publish_dir_mode, enabled: params.save_reference ] - ext.args = { "--rlen ${params.simpleaf_rlen}" } + // because piscem create a large number of intermediate files, + // we set scratch to true to avoid IO issues + scratch = true } withName: 'SIMPLEAF_QUANT' { publishDir = [ @@ -147,7 +150,6 @@ if (params.aligner == "alevin") { mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - ext.args = "-r cr-like" } // Fix for issue 196 // Modified for issue 334 diff --git a/core.1739377 b/core.1739377 new file mode 100644 index 00000000..e69de29b diff --git a/docs/output.md b/docs/output.md index 10553cba..2f81eae1 100644 --- a/docs/output.md +++ b/docs/output.md @@ -93,7 +93,7 @@ This pipeline uses the simplified and flexible modules in [Simpleaf](https://sim **Output directory: `results/reference_genome`** -- `salmon_index` +- `simpleaf_index` - Contains the indexed reference transcriptome for the Salmon mapper - `alevin/txp2gene.tsv` - The transcriptome to gene mapping TSV file utilized by Alevin-fry diff --git a/main.nf b/main.nf index b63dc43f..e1528143 100644 --- a/main.nf +++ b/main.nf @@ -30,8 +30,8 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_scrn // Thus, manually provided files are not overwritten by the genome attributes params.fasta = getGenomeAttribute('fasta') params.gtf = getGenomeAttribute('gtf') -params.salmon_index = getGenomeAttribute('simpleaf') -params.txp2gene = getGenomeAttribute('simpleaf_tx2pgene') +params.simpleaf_index = getGenomeAttribute('simpleaf') +params.txp2gene = getGenomeAttribute('simpleaf_txp2gene') params.cellranger_index = params.aligner == 'cellrangerarc' ? getGenomeAttribute('cellrangerarc') : getGenomeAttribute('cellranger') diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf index 10613867..f9096839 100644 --- a/modules/local/alevinqc.nf +++ b/modules/local/alevinqc.nf @@ -14,8 +14,8 @@ process ALEVINQC { // all metas are the same input: - tuple val(meta), path(quant_dir) - tuple val(meta1), path(permit_dir) + tuple val(meta), path(quant_dir, stageAs: "quant_dir") + tuple val(meta1), path(permit_dir, stageAs: "permit_dir") tuple val(meta2), path(map_dir) output: diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 424580f6..7e1bdd0a 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -28,6 +28,8 @@ process MTX_TO_H5AD { script: def aligner = (input_aligner in [ 'cellranger', 'cellrangerarc', 'cellrangermulti' ]) ? 'cellranger' : input_aligner + aligner = input_aligner == "alevin" ? "simpleaf" : aligner + template "mtx_to_h5ad_${aligner}.py" stub: diff --git a/modules/local/templates/mtx_to_h5ad_alevin.py b/modules/local/templates/mtx_to_h5ad_simpleaf.py similarity index 97% rename from modules/local/templates/mtx_to_h5ad_alevin.py rename to modules/local/templates/mtx_to_h5ad_simpleaf.py index 492defd3..f9b5bf01 100755 --- a/modules/local/templates/mtx_to_h5ad_alevin.py +++ b/modules/local/templates/mtx_to_h5ad_simpleaf.py @@ -85,7 +85,7 @@ def input_to_adata( # input_type comes from NF module input_to_adata( - input_data="${meta.id}_alevin_results/af_quant/alevin/", + input_data="${inputs}/alevin/", output="${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) diff --git a/nextflow.config b/nextflow.config index 31e5d784..b44a1676 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,11 +24,9 @@ params { gtf = null // alevin-fry parameters (simpleaf) - simpleaf_rlen = 91 - simpleaf_index = null - simpleaf_resolution = "cr-like" - barcode_whitelist = null simpleaf_index = null + barcode_whitelist = null + simpleaf_umi_resolution = "cr-like" // kallisto bustools parameters kallisto_index = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 3d4aaae0..28c3c65a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -177,13 +177,7 @@ "format": "file-path", "exists": true }, - "simpleaf_rlen": { - "type": "integer", - "default": 91, - "description": "It is the target read length the index will be built for, using simpleaf.", - "fa_icon": "fas fa-map-marked-alt" - }, - "simpleaf_resolution": { + "simpleaf_umi_resolution": { "type": "string", "default": "cr-like", "enum": ["cr-like", "cr-like-em", "parsimony", "parsimony-em", "parsimony-gene", "parsimony-gene-em"], diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index 68604a00..e6832c27 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -3,13 +3,11 @@ include { ALEVINQC } from '../../modules/local/alevinqc' include { SIMPLEAF_INDEX } from '../../../modules/modules/nf-core/simpleaf/index' include { SIMPLEAF_QUANT } from '../../../modules/modules/nf-core/simpleaf/quant' -def multiqc_report = [] - workflow SCRNASEQ_SIMPLEAF { take: - genome_fasta // channel - genome_gtf // channel + ch_genome_fasta // channel + ch_genome_gtf // channel transcript_fasta simpleaf_index txp2gene @@ -30,9 +28,9 @@ workflow SCRNASEQ_SIMPLEAF { // we can either use the genome fasta and gtf files or the transcript fasta file if ( transcript_fasta ) { ch_genome_fasta_gtf = [ [:],[],[] ] - ch_transcript_fasta = Channel.of( [ [id: transcript_fasta.baseName], transcript_fasta ] ) + ch_transcript_fasta = Channel.of( [ [id: "${transcript_fasta.getBaseName()}"], transcript_fasta ] ) } else { - ch_genome_fasta_gtf = genome_fasta.combine( genome_gtf ).map{ fasta, gtf -> [[id: gtf.baseName], fasta, gtf] } + ch_genome_fasta_gtf = ch_genome_fasta.combine( ch_genome_gtf ).map{ fasta, gtf -> [[id: "${fasta.getBaseName()}"], fasta, gtf] } ch_transcript_fasta = Channel.of( [ [:], [] ] ) } @@ -67,7 +65,7 @@ workflow SCRNASEQ_SIMPLEAF { } else { ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } ch_index_t2g = simpleaf_index.combine( txp2gene ) - ch_map_dir = [ [:],[],[] ] + ch_map_dir = [ [:],[] ] } /* @@ -82,17 +80,19 @@ workflow SCRNASEQ_SIMPLEAF { ) ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) + ch_af_map = map_dir ? ch_map_dir : SIMPLEAF_QUANT.out.map /* * Run alevinQC */ - ALEVINQC( SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.map ) + ALEVINQC( SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.quant, ch_af_map ) ch_versions = ch_versions.mix(ALEVINQC.out.versions) emit: ch_versions - index = simpleaf_index - map = SIMPLEAF_QUANT.out.map - quant = SIMPLEAF_QUANT.out.quant.map{ meta, files -> [meta + [input_type: meta["count_type"]], files] } - alevinqc = ALEVINQC.out.report + txp2gene + index = simpleaf_index + map = SIMPLEAF_QUANT.out.map + quant = SIMPLEAF_QUANT.out.quant + alevinqc = ALEVINQC.out.report } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 5f75e79a..d3afce15 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -144,13 +144,23 @@ workflow SCRNASEQ { ch_txp2gene, ch_barcode_whitelist, protocol_config['protocol'], - params.simpleaf_resolution, + params.simpleaf_umi_resolution, ch_fastq, [] // for existing map dir; not applicable ) ch_versions = ch_versions.mix(SCRNASEQ_SIMPLEAF.out.ch_versions) ch_multiqc_files = ch_multiqc_files.mix(SCRNASEQ_SIMPLEAF.out.quant.map{ meta, it -> it }) - ch_mtx_matrices = ch_mtx_matrices.mix( SCRNASEQ_SIMPLEAF.out.quant ) + ch_mtx_matrices = ch_mtx_matrices.mix( + SCRNASEQ_SIMPLEAF.out.quant.map{ + meta, files -> [ + meta + + [input_type: meta["filtered"] ? "filtered" : "raw" ], + files + ] + } + ) + + ch_txp2gene = SCRNASEQ_SIMPLEAF.out.txp2gene } // Run STARSolo pipeline @@ -285,9 +295,12 @@ workflow SCRNASEQ { // SUBWORKFLOW: Run cellbender emptydrops filter // if ( !params.skip_emptydrops && !(params.aligner in ['cellrangerarc']) ) { + // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it H5AD_REMOVEBACKGROUND_BARCODES_CELLBENDER_ANNDATA ( - ch_h5ads.filter { meta, mtx_files -> meta.input_type == 'raw' } + ch_h5ads + .filter { meta, mtx_files -> meta.input_type == 'raw' } + .map { meta, mtx_files -> [ meta + [input_type: 'filtered'], mtx_files ]} // to avoid name collision ) ch_h5ads = ch_h5ads.mix( H5AD_REMOVEBACKGROUND_BARCODES_CELLBENDER_ANNDATA.out.h5ad From bec0bfa990d78ef4083698afb8a7f46860d5a010 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 23 Jan 2025 02:20:53 +0000 Subject: [PATCH 04/11] adopt new t2g format in simpleaf index out --- subworkflows/local/simpleaf.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index e6832c27..02a5fa5b 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -48,12 +48,12 @@ workflow SCRNASEQ_SIMPLEAF { if (!txp2gene) { txp2gene = t2g } else { - txp2gene = Channel.of( txp2gene ) + txp2gene = Channel.of( [ [:], txp2gene ] ) } } else { // ensure simpleaf index and txp2gene are Channels simpleaf_index = Channel.of( [ [:], simpleaf_index ] ) - txp2gene = Channel.of( txp2gene ) + txp2gene = Channel.of( [ [:], txp2gene ] ) } // define input channels for quantification @@ -64,7 +64,8 @@ workflow SCRNASEQ_SIMPLEAF { ch_map_dir = Channel.of( [ [id: map_dir.baseName], map_dir ] ) } else { ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } - ch_index_t2g = simpleaf_index.combine( txp2gene ) + + ch_index_t2g = simpleaf_index.combine( txp2gene.map{ _m, t -> t } )} ch_map_dir = [ [:],[] ] } From 0e4c2426a6d2e80dea9077edd74564a0f0fa2aef Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 23 Jan 2025 16:38:15 +0000 Subject: [PATCH 05/11] fix typos --- subworkflows/local/simpleaf.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index 02a5fa5b..b793f68c 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -65,7 +65,7 @@ workflow SCRNASEQ_SIMPLEAF { } else { ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } - ch_index_t2g = simpleaf_index.combine( txp2gene.map{ _m, t -> t } )} + ch_index_t2g = simpleaf_index.combine( txp2gene.map{ _m, t -> t } ) ch_map_dir = [ [:],[] ] } From 2513ef0285934638906548733240a57c1ecf04bb Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 23 Jan 2025 17:45:38 +0000 Subject: [PATCH 06/11] fix typos --- subworkflows/local/simpleaf.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index b793f68c..7ae22e75 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -41,19 +41,19 @@ workflow SCRNASEQ_SIMPLEAF { // Channel of tuple(meta, index dir) simpleaf_index = SIMPLEAF_INDEX.out.index.collect() // Channel of t2g path or empty - t2g = SIMPLEAF_INDEX.out.t2g.collect() + t2g = SIMPLEAF_INDEX.out.t2g.collect().map { _meta, it -> it } ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) // ensure txp2gene is a Channel if (!txp2gene) { txp2gene = t2g } else { - txp2gene = Channel.of( [ [:], txp2gene ] ) + txp2gene = Channel.of( [ txp2gene ] ) } } else { // ensure simpleaf index and txp2gene are Channels - simpleaf_index = Channel.of( [ [:], simpleaf_index ] ) - txp2gene = Channel.of( [ [:], txp2gene ] ) + simpleaf_index = Channel.of( [ simpleaf_index ] ) + txp2gene = Channel.of( [ txp2gene ] ) } // define input channels for quantification @@ -65,7 +65,7 @@ workflow SCRNASEQ_SIMPLEAF { } else { ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } - ch_index_t2g = simpleaf_index.combine( txp2gene.map{ _m, t -> t } ) + ch_index_t2g = simpleaf_index.combine( txp2gene ) ch_map_dir = [ [:],[] ] } From 6f3834bca775d8fcfd1b9156b6fbbc5540da3aa4 Mon Sep 17 00:00:00 2001 From: Dongze He <171858310+an-altosian@users.noreply.github.com> Date: Mon, 27 Jan 2025 12:50:59 -0800 Subject: [PATCH 07/11] update doc --- conf/modules.config | 2 +- docs/output.md | 10 +- modules.json | 10 ++ .../nf-core/simpleaf/index/environment.yml | 9 + modules/nf-core/simpleaf/index/main.nf | 96 +++++++++++ modules/nf-core/simpleaf/index/meta.yml | 91 ++++++++++ .../nf-core/simpleaf/index/tests/main.nf.test | 105 ++++++++++++ .../simpleaf/index/tests/main.nf.test.snap | 105 ++++++++++++ modules/nf-core/simpleaf/index/tests/tags.yml | 2 + .../nf-core/simpleaf/quant/environment.yml | 9 + modules/nf-core/simpleaf/quant/main.nf | 158 ++++++++++++++++++ modules/nf-core/simpleaf/quant/meta.yml | 118 +++++++++++++ .../nf-core/simpleaf/quant/tests/main.nf.test | 110 ++++++++++++ .../simpleaf/quant/tests/main.nf.test.snap | 89 ++++++++++ modules/nf-core/simpleaf/quant/tests/tags.yml | 2 + nextflow_schema.json | 19 ++- subworkflows/local/simpleaf.nf | 12 +- 17 files changed, 930 insertions(+), 17 deletions(-) create mode 100644 modules/nf-core/simpleaf/index/environment.yml create mode 100644 modules/nf-core/simpleaf/index/main.nf create mode 100644 modules/nf-core/simpleaf/index/meta.yml create mode 100644 modules/nf-core/simpleaf/index/tests/main.nf.test create mode 100644 modules/nf-core/simpleaf/index/tests/main.nf.test.snap create mode 100644 modules/nf-core/simpleaf/index/tests/tags.yml create mode 100644 modules/nf-core/simpleaf/quant/environment.yml create mode 100644 modules/nf-core/simpleaf/quant/main.nf create mode 100644 modules/nf-core/simpleaf/quant/meta.yml create mode 100644 modules/nf-core/simpleaf/quant/tests/main.nf.test create mode 100644 modules/nf-core/simpleaf/quant/tests/main.nf.test.snap create mode 100644 modules/nf-core/simpleaf/quant/tests/tags.yml diff --git a/conf/modules.config b/conf/modules.config index 38845b0b..75596e8f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -128,7 +128,7 @@ if(params.aligner == "cellrangerarc") { } } -if (params.aligner == "alevin") { +if (params.aligner == "simpleaf") { process { withName: GFFREAD_TXP2GENE { ext.args = "--table transcript_id,gene_id" diff --git a/docs/output.md b/docs/output.md index c1af8b06..04fc12ea 100644 --- a/docs/output.md +++ b/docs/output.md @@ -15,7 +15,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [FastQC](#fastqc) - [Kallisto \& Bustools Results](#kallisto--bustools-results) - [STARsolo](#starsolo) -- [Salmon \& Alevin-fry \& AlevinQC](#salmon--alevin-fry--alevinqc) +- [Simpleaf \& AlevinQC](#simpleaf--alevinqc) - [Cellranger](#cellranger) - [Cellranger ARC](#cellranger-arc) - [Cellranger multi](#cellranger-multi) @@ -80,14 +80,14 @@ For details on how to load these into R and perform further downstream analysis, - `star_index` - Contains the index of the supplied genome fasta file -## Salmon & Alevin-fry & AlevinQC +## Simpleaf & AlevinQC -This pipeline uses the simplified and flexible modules in [Simpleaf](https://simpleaf.readthedocs.io/en/latest/) for processing single-cell data with [Salmon](https://salmon.readthedocs.io/en/latest/) as the underlying mapper and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) as the quantification tool. For detailed examples of using the quantification results generated by Alevin-fry in downstream analyses, such as RNA-velocity, please refer to [Alevin-fry/simpleaf tutorials](https://combine-lab.github.io/alevin-fry-tutorials/#blog). +This pipeline uses the simplified and flexible modules in [Simpleaf](https://simpleaf.readthedocs.io/en/latest/) for processing single-cell data with [Salmon](https://salmon.readthedocs.io/en/latest/) or [Piscem](https://github.com/COMBINE-lab/piscem) as the underlying mapper and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) as the quantification tool. For detailed examples of using the quantification results generated by Alevin-fry in downstream analyses, such as RNA-velocity, please refer to [Alevin-fry/simpleaf tutorials](https://combine-lab.github.io/alevin-fry-tutorials/#blog). **Output directory: `results/alevin`** -- `alevin` - - Contains the count matrix created by Alevin-fry +- `simpleaf` + - Contains the count matrix created by Simpleaf - `alevinqc` - Contains the QC report for the aforementioned Alevin-fry output data diff --git a/modules.json b/modules.json index da7de5f0..ce2094f6 100644 --- a/modules.json +++ b/modules.json @@ -85,6 +85,16 @@ "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] }, + "simpleaf/index": { + "branch": "master", + "git_sha": "5b33c12fb7b488b39311fd3e8e63d4a73e7475c2", + "installed_by": ["modules"] + }, + "simpleaf/quant": { + "branch": "master", + "git_sha": "5b33c12fb7b488b39311fd3e8e63d4a73e7475c2", + "installed_by": ["modules"] + }, "star/genomegenerate": { "branch": "master", "git_sha": "46eca555142d6e597729fcb682adcc791796f514", diff --git a/modules/nf-core/simpleaf/index/environment.yml b/modules/nf-core/simpleaf/index/environment.yml new file mode 100644 index 00000000..7e7a1020 --- /dev/null +++ b/modules/nf-core/simpleaf/index/environment.yml @@ -0,0 +1,9 @@ +channels: + - bioconda + - conda-forge + +dependencies: + - bioconda::alevin-fry=0.11.1 + - bioconda::piscem=0.11.0 + - bioconda::salmon=1.10.3 + - bioconda::simpleaf=0.18.4 diff --git a/modules/nf-core/simpleaf/index/main.nf b/modules/nf-core/simpleaf/index/main.nf new file mode 100644 index 00000000..5959c7dd --- /dev/null +++ b/modules/nf-core/simpleaf/index/main.nf @@ -0,0 +1,96 @@ +// NOTE because the default indexer, piscem, needs to frequently read and write a large number of intermediate files, if your use case involves the situations where the CPU and storage are not physically connected, we recommend setting `--work-dir /path/to/a/local/dir` or in the `ext.args` in nextflow.config, or `scratch = true`, to avoid runtime issues. +process SIMPLEAF_INDEX { + tag "${meta.id ?: meta2.id}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/simpleaf:0.18.4--ha6fb395_1': + 'biocontainers/simpleaf:0.18.4--ha6fb395_1' }" + + input: + tuple val(meta), path(genome_fasta), path(genome_gtf) + tuple val(meta2), path(transcript_fasta) + + output: + tuple val(meta), path("${prefix}/index") , emit: index + tuple val(meta), path("${prefix}/ref") , emit: ref, optional: true + tuple val(meta), path("${prefix}/ref/{t2g,t2g_3col}.tsv") , emit: t2g, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def seq_inputs = input_args(genome_fasta, genome_gtf, transcript_fasta)//, probes_csv, features_csv) + + // Output meta needs to correspond to the input used + meta = (transcript_fasta) ? meta2 : meta + prefix = task.ext.prefix ?: "${meta.id}" + """ + # export required var + export ALEVIN_FRY_HOME=. + + # set maximum number of file descriptors for temp files + ulimit -n 2048 + + # prep simpleaf + simpleaf set-paths + + # run simpleaf index + simpleaf \\ + index \\ + --threads $task.cpus \\ + $seq_inputs \\ + $args \\ + -o ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") + salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : "${meta2.id}") + + """ + mkdir -p ${prefix}/index + mkdir -p ${prefix}/ref + touch ${prefix}/index/piscem_idx_cfish.json + touch ${prefix}/index/piscem_idx.ectab + touch ${prefix}/index/piscem_idx.sshash + touch ${prefix}/ref/t2g_3col.tsv + touch ${prefix}/ref/roers_ref.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") + salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") + END_VERSIONS + """ +} + +def input_args(genome_fasta, genome_gtf, transcript_fasta) { //, probes_csv, features_csv) { + // if (probe_csv) { + // args = "--probe_csv ${probe_csv}" + // } else if (feature_csv) { + // args = "--feature_csv ${feature_csv}" + // } else + if (transcript_fasta) { + return "--ref-seq ${transcript_fasta}" + } else if (genome_fasta && genome_gtf) { + return "--fasta ${genome_fasta} --gtf ${genome_gtf}" + } else { + error "No valid input provided; please provide either a genome fasta + gtf set or a transcript fasta file. ${genome_fasta} ${genome_gtf} ${transcript_fasta}" + // error "No valid input provided; please provide one of the followings: (i) a genome fasta + gtf set, (ii) a transcript fasta file, (iii) a probes csv file (iv) a features csv file." + } + +} diff --git a/modules/nf-core/simpleaf/index/meta.yml b/modules/nf-core/simpleaf/index/meta.yml new file mode 100644 index 00000000..a9c5e66b --- /dev/null +++ b/modules/nf-core/simpleaf/index/meta.yml @@ -0,0 +1,91 @@ +name: simpleaf_index +description: Indexing of transcriptome for gene expression quantification using SimpleAF +keywords: + - indexing + - transcriptome + - gene expression + - SimpleAF +tools: + - simpleaf: + description: | + SimpleAF is a tool for quantification of gene expression from RNA-seq data + homepage: https://github.com/COMBINE-lab/simpleaf + licence: ["BSD-3-Clause"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing information on genome_fasta and genome_gtf + - genome_fasta: + type: file + description: | + FASTA file containing the genome sequence. + It conflicts with transcript_fasta. + When transcript_fasta is provided, it must be empty (provided as []). + When transcript_fasta is empty, it must be provided together with its corresponding genome_gtf file. + - genome_gtf: + type: file + description: | + GTF file containing gene annotations. + It conflicts with transcript_fasta. + When transcript_fasta is provided, it must be empty (provided as []). + When transcript_fasta is empty, it must be provided together with its corresponding genome_fasta file. + - - meta2: + type: map + description: | + Groovy Map containing information on transcript_fasta + - transcript_fasta: + type: file + description: | + FASTA file containing the transcript sequences to build index directly on. + It conflicts with genome_gtf and genome_fasta. + When genome_gtf and genome_fasta are provided, it must be empty (provided as []). +output: + - index: + - meta: + type: map + description: | + Groovy Map containing information on the index generated by simpleaf + - ${prefix}/index: + type: map + description: | + Groovy Map containing information on the index generated by simpleaf + - ref: + - meta: + type: map + description: | + Groovy Map containing information on the transcriptomic reference constructed by simpleaf. + - ${prefix}/ref: + type: map + description: | + Groovy Map containing information on the transcriptomic reference constructed by simpleaf. + - t2g: + - meta: + type: file + description: | + Path to the tsv file containing the transcript-to-gene mapping information generated by simpleaf. This is used as --t2g-map when invoking simpleaf quant. + - ${prefix}/ref/{t2g,t2g_3col}.tsv: + type: file + description: | + Path to the tsv file containing the transcript-to-gene mapping information generated by simpleaf. This is used as --t2g-map when invoking simpleaf quant. + - versions: + - versions.yml: + type: file + description: | + File containing software versions + pattern: "versions.yml" +authors: + - "@fmalmeida" + - "@maxulysse" + - "@Khajidu" + - "@apeltzer" + - "@pinin4fjords" + - "@dongzehe" +maintainers: + - "@fmalmeida" + - "@maxulysse" + - "@Khajidu" + - "@apeltzer" + - "@pinin4fjords" + - "@dongzehe" diff --git a/modules/nf-core/simpleaf/index/tests/main.nf.test b/modules/nf-core/simpleaf/index/tests/main.nf.test new file mode 100644 index 00000000..d546967d --- /dev/null +++ b/modules/nf-core/simpleaf/index/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process SIMPLEAF_INDEX" + script "../main.nf" + process "SIMPLEAF_INDEX" + + tag "modules" + tag "modules_nfcore" + tag "simpleaf" + tag "simpleaf/index" + + // test piscem + test("Homo sapiens - genome index - expanded - fasta + gtf") { + + when { + process { + """ + genome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + gtf = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) + meta = [ 'id': 'human_genome'] + + input[0] = Channel.of([ meta, genome_fasta, gtf ]) + input[1] = Channel.of([[],[]]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() }, + { assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/roers_ref.fa").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/t2g_3col.tsv").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/gene_id_to_name.tsv").exists() }, + { assert file("${process.out.ref.get(0).get(1)}/roers_make-ref.json").exists() }, + { assert file("${process.out.t2g.get(0).get(1)}").exists() }, + ) + } + + } + + test("Homo sapiens - transcriptome index - direct - transcriptome fasta") { + + when { + process { + """ + transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true) + meta = [ 'id': 'human_transcriptome'] + + input[0] = Channel.of([[],[],[]]) + input[1] = Channel.of([ meta, transcriptome_fasta ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ctab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.ectab").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.json").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.refinfo").exists() }, + { assert file("${process.out.index.get(0).get(1)}/piscem_idx.sshash").exists() }, + { assert file("${process.out.index.get(0).get(1)}/simpleaf_index.json").exists() } + // { assert snapshot( + // path("${process.out.index.get(0).get(1)}/piscem_idx.ctab"), + // path("${process.out.index.get(0).get(1)}/piscem_idx.json"), + // path("${process.out.index.get(0).get(1)}/piscem_idx_cfish.json"), + // process.out.versions) + // .match() } + ) + } + } + + test("Homo sapiens - transcriptome index - direct - transcriptome fasta - stub") { + options "-stub-run" + when { + process { + """ + transcriptome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/transcriptome.fasta', checkIfExists: true) + meta = [ 'id': 'human_transcriptome'] + + input[0] = Channel.of([[],[],[]]) + input[1] = Channel.of([ meta, transcriptome_fasta ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/simpleaf/index/tests/main.nf.test.snap b/modules/nf-core/simpleaf/index/tests/main.nf.test.snap new file mode 100644 index 00000000..5725a935 --- /dev/null +++ b/modules/nf-core/simpleaf/index/tests/main.nf.test.snap @@ -0,0 +1,105 @@ +{ + "Homo sapiens - transcriptome index - direct - transcriptome fasta": { + "content": [ + [ + "versions.yml:md5,bd96efe900339c637533c40b37fa5cfc" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T00:40:55.088252924" + }, + "Homo sapiens - transcriptome index - direct - transcriptome fasta - stub": { + "content": [ + { + "0": [ + [ + [ + + ], + [ + "piscem_idx.ectab:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx.sshash:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx_cfish.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + [ + + ], + [ + "roers_ref.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "2": [ + [ + [ + + ], + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,78f7da1109cf98d7b9107222704848e1" + ], + "index": [ + [ + [ + + ], + [ + "piscem_idx.ectab:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx.sshash:md5,d41d8cd98f00b204e9800998ecf8427e", + "piscem_idx_cfish.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "ref": [ + [ + [ + + ], + [ + "roers_ref.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "t2g": [ + [ + [ + + ], + "t2g_3col.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,78f7da1109cf98d7b9107222704848e1" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T02:08:51.588975264" + }, + "Homo sapiens - genome index - expanded - fasta + gtf": { + "content": [ + [ + "versions.yml:md5,bd96efe900339c637533c40b37fa5cfc" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T00:40:41.692166586" + } +} \ No newline at end of file diff --git a/modules/nf-core/simpleaf/index/tests/tags.yml b/modules/nf-core/simpleaf/index/tests/tags.yml new file mode 100644 index 00000000..db7af084 --- /dev/null +++ b/modules/nf-core/simpleaf/index/tests/tags.yml @@ -0,0 +1,2 @@ +simpleaf/index: + - "modules/nf-core/simpleaf/index/**" diff --git a/modules/nf-core/simpleaf/quant/environment.yml b/modules/nf-core/simpleaf/quant/environment.yml new file mode 100644 index 00000000..7e7a1020 --- /dev/null +++ b/modules/nf-core/simpleaf/quant/environment.yml @@ -0,0 +1,9 @@ +channels: + - bioconda + - conda-forge + +dependencies: + - bioconda::alevin-fry=0.11.1 + - bioconda::piscem=0.11.0 + - bioconda::salmon=1.10.3 + - bioconda::simpleaf=0.18.4 diff --git a/modules/nf-core/simpleaf/quant/main.nf b/modules/nf-core/simpleaf/quant/main.nf new file mode 100644 index 00000000..818f514b --- /dev/null +++ b/modules/nf-core/simpleaf/quant/main.nf @@ -0,0 +1,158 @@ +process SIMPLEAF_QUANT { + tag "${meta.id ?: meta4.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/simpleaf:0.18.4--ha6fb395_1': + 'biocontainers/simpleaf:0.18.4--ha6fb395_1' }" + + input: + // + // Input reads are expected to come as: [ meta, [ pair1_read1, pair1_read2, pair2_read1, pair2_read2 ] ] + // Input array for a sample is created in the same order reads appear in samplesheet as pairs from replicates are appended to array. + // + tuple val(meta), val(chemistry), path(reads) // chemistry and reads + tuple val(meta2), path(index), path(txp2gene) // index and t2g mapping + tuple val(meta3), val(cell_filter), val(number_cb), path(cb_list) // cell filtering strategy + val resolution // UMI resolution + tuple val(meta4), path(map_dir) // mapping results + + output: + tuple val(meta), path("${prefix}/af_map") , emit: map, optional: true // missing if map_dir is provided + tuple val(meta), path("${prefix}/af_quant") , emit: quant + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + // The first required input is either a mapping result directory, or the reads and index files for mapping. + mapping_args = mappingArgs(chemistry, reads, index, txp2gene, map_dir) + + // The second required input is a cell filtering strategy. + cf_option = cellFilteringArgs(cell_filter, number_cb, cb_list) + + meta = map_dir ? meta4 : meta2 + meta3 + meta + meta = meta + [ "filtered": cell_filter != "unfiltered-pl" ] + + // separate forward from reverse pairs + """ + # export required var + export ALEVIN_FRY_HOME=. + + # prep simpleaf + simpleaf set-paths + + # run simpleaf quant + simpleaf quant \\ + $mapping_args \\ + --resolution ${resolution} \\ + --output ${prefix} \\ + --threads ${task.cpus} \\ + ${cf_option} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") + salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + export ALEVIN_FRY_HOME=. + + mkdir -p ${prefix}/af_map + mkdir -p ${prefix}/af_quant/alevin + + touch ${prefix}/af_map/map.rad + touch ${prefix}/af_map/unmapped_bc_count.bin + touch ${prefix}/af_quant/alevin/quants_mat_rows.txt + touch ${prefix}/af_quant/map.collated.rad + touch ${prefix}/af_quant/permit_freq.bin + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + alevin-fry: \$(alevin-fry --version | sed -e "s/alevin-fry //g") + piscem: \$(piscem --version | sed -e "s/piscem //g") + salmon: \$(salmon --version | sed -e "s/salmon //g") + simpleaf: \$(simpleaf --version | sed -e "s/simpleaf //g") + END_VERSIONS + """ +} + +// We have mutual exclusive options for permit list generation. +// 1. 'k' (knee), which is a flag for the knee method and any value provided will be ignored; +// 2. 'f' (forced-cells), which takes an integer indicating the exact number of cells to recover; +// 3. 'e' (expect-cells), which takes an integer indicating the expected number of cells to recover; +// 4. 'x' (explicit-pl), which takes a string indicating the path to a valid permit list; +// 5. 'u' (unfiltered-pl), which takes an empty string (if `chemistry` is defined as "10xv2" or "10xv3"), or a string indicating the path to a valid white list file. +// The difference between (4) and (5) is that (4) contains the exact permit list to filter the observed barcodes, while (5) will use the white list to generate a permit list via barcode correction. + +// We have two ways to take these options. `-u` is implied by the presence of the input `whitelist` channel. The options can also be passed as arguments to ext.args. Therefore, we must check two things: +// 1. if there is at least one of the options in the args list, and +// 2. if none of the four options are in the args list, there must be a non-empty whitelist channel. + +def cellFilteringArgs(cell_filter_method, number_cb, cb_list) { + def pl_options = ["knee", "forced-cells", "explicit-pl", "expect-cells", "unfiltered-pl"] + + // try catch unintentional underscore in method name + def method = cell_filter_method.replaceAll('_','-') + + def number = number_cb + if (!method) { + error "No cell filtering method was provided; cannot proceed." + } else if (! method in pl_options) { + error "Invalid cell filtering method, '${method}', was provided; cannot proceed. possible options are ${pl_options.join(',')}." + } + + if (method == "unfiltered-pl") { + return "--${method} ${cb_list}" + } else if (method == "explicit-pl") { + return "--${method} ${cb_list}" + } else if (method == "knee") { + return "--${method}" + } else { + if (!number) { + error "Could not find the corresponding 'number' field for the cell filtering method '${method}'; please use the following format: [method:'${method}',number:3000]." + } + return "--${method} ${number}" + } +} + +def mappingArgs(chemistry, reads, index, txp2gene, map_dir) { + if ( map_dir ) { + if (reads) { + error "Found both reads and map_dir. Please provide only one of the two." + } + return "--map-dir ${map_dir}" + } else { + if (!reads) { + error "Missing read files; could not proceed." + } + if (!index) { + error "Missing index files; could not proceed." + } + if (!chemistry) { + error "Missing chemistry; could not proceed." + } + + def (forward, reverse) = reads.collate(2).transpose() + + def t2g = txp2gene ? "--t2g-map ${txp2gene}" : "" + def mapping_args = """${t2g} \\ + --chemistry ${chemistry} \\ + --index ${index} \\ + --reads1 ${forward.join( "," )} \\ + --reads2 ${reverse.join( "," )}""" + return mapping_args + } +} diff --git a/modules/nf-core/simpleaf/quant/meta.yml b/modules/nf-core/simpleaf/quant/meta.yml new file mode 100644 index 00000000..7d51dc8c --- /dev/null +++ b/modules/nf-core/simpleaf/quant/meta.yml @@ -0,0 +1,118 @@ +name: simpleaf_quant +description: simpleaf is a program to simplify and customize the running and configuration + of single-cell processing with alevin-fry. +keywords: + - quantification + - gene expression + - SimpleAF +tools: + - simpleaf: + description: | + SimpleAF is a program to simplify and customize the running and configuration of single-cell processing with alevin-fry. + homepage: https://github.com/COMBINE-lab/simpleaf + licence: ["BSD-3-Clause"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - chemistry: + type: string + description: | + Chemistry used for library preparation. It can be a string describing the specific chemistry or the geometry of the barcode, UMI, and mappable read. For example, "10xv2" and "10xv3" will apply the appropriate settings for 10x Chromium v2 and v3 protocols, respectively. Alternatively, you can provide a general geometry string if your chemistry is not pre-registered. For example, instead of "10xv2", you could use "1{b[16]u[10]x:}2{r:}", or instead of "10xv3", you could use "1{b[16]u[12]x:}2{r:}". Details at https://hackmd.io/@PI7Og0l1ReeBZu_pjQGUQQ/rJMgmvr13 + - reads: + type: file + description: | + List of input FastQ files for paired-end data. + Reads should be grouped by pairs. + For example, [ [R1_1.fastq.gz, R2_1.fastq.gz], [R1_2.fastq.gz, R2_2.fastq.gz] ] + - - meta2: + type: map + description: | + Groovy Map containing index information + e.g. [ tool:'piscem' ] + - index: + type: directory + description: Folder containing the index files. For a *salmon* index that is + not generated by simpleaf to be taken, '--no-piscem' MUST be specified in + ext.args. + - txp2gene: + type: file + description: | + File mapping transcripts to genes. It can be either a two-column TSV file for a standard transcriptomic index containing the transcript-to-gene ID mapping information, or a three-column TSV file for an augmented transcriptomic index with the third column representing the splicing status of each transcript. + - - meta3: + type: map + description: | + Groovy Map containing txp2gene information + e.g. [ mode:'usa' ] + - cell_filter: + type: string + enum: ["knee", "forced-cells", "explicit-pl", "expect-cells", "unfiltered-pl"] + description: | + Cell filtering mode. Possible values are 'usa' and 'whitelist'. 'usa' will use the default cell filtering mode, while 'whitelist' will use the whitelist file provided in the 'whitelist' input. + - number_cb: + type: integer + description: | + Number of cell barcodes to use for cell filtering. Set as empty ('[]') unless 'cell_filter' is set to 'forced-cells' or 'expect-cells'. + - cb_list: + type: file + description: | + File containing a list of cell barcodes to use for cell filtering. Set as empty ('[]') unless 'cell_filter' is set to 'unfiltered-pl' or 'explicit-pl'. + - - resolution: + type: string + description: | + UMI resolution (https://alevin-fry.readthedocs.io/en/latest/quant.html). Possible values are 'cr-like', 'cr-like-em', 'parsimony', 'parsimony-em', 'parsimony-gene', and 'parsimony-gene-em'. + - - meta4: + type: map + description: | + Groovy Map containing existing mapping results. + e.g. [ tool:'piscem' ] + - map_dir: + type: directory + description: Folder containing the existing mapping results. It must be generated + by simpleaf or alevin-fry, and contain the mapping file named map.rad. +output: + - map: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + pattern: "simpleaf/af_map" + - ${prefix}/af_map: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - quant: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/af_quant: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@fmalmeida" + - "@maxulysse" + - "@Khajidu" + - "@apeltzer" + - "@pinin4fjords" + - "@dongzehe" +maintainers: + - "@fmalmeida" + - "@maxulysse" + - "@Khajidu" + - "@apeltzer" + - "@pinin4fjords" + - "@dongzehe" diff --git a/modules/nf-core/simpleaf/quant/tests/main.nf.test b/modules/nf-core/simpleaf/quant/tests/main.nf.test new file mode 100644 index 00000000..f4cdfc1e --- /dev/null +++ b/modules/nf-core/simpleaf/quant/tests/main.nf.test @@ -0,0 +1,110 @@ +nextflow_process { + + name "Test Process SIMPLEAF_QUANT" + script "../main.nf" + process "SIMPLEAF_QUANT" + + tag "modules" + tag "modules_nfcore" + tag "simpleaf" + tag "simpleaf/quant" + tag "simpleaf/index" + + + //Requires SIMPLEAF_INDEX to run prior to quant + setup { + run("SIMPLEAF_INDEX") { + script "../../index/main.nf" + process { + """ + genome_fasta = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + gtf = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) + meta = [ 'id': 'human'] + + input[0] = Channel.of([meta, genome_fasta, gtf]) + input[1] = Channel.of([[],[]]) + """ + } + } + } + + test("test_simpleaf_quant") { + when { + // config "./nextflow.config" + process { + """ + meta = [id:'test_10x', single_end:false, strandedness:'auto'] + files = [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/cellranger/5k_cmvpos_tcells/fastqs/gex_1/subsampled_5k_human_antiCMV_T_TBNK_connect_GEX_1_S1_L001_R1_001.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/cellranger/5k_cmvpos_tcells/fastqs/gex_1/subsampled_5k_human_antiCMV_T_TBNK_connect_GEX_1_S1_L001_R2_001.fastq.gz', checkIfExists: true) + ] + input[0] = Channel.of([meta, '10xv3', files]) + input[1] = SIMPLEAF_INDEX.out.index.combine(SIMPLEAF_INDEX.out.t2g, by: 0) + input[2] = [[],"knee",[],[]] + input[3] = Channel.of('cr-like') + input[4] = Channel.of([[],[]]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert file("${process.out.map.get(0).get(1)}/map.rad").exists() }, + { assert file("${process.out.map.get(0).get(1)}/map_info.json").exists() }, + { assert file("${process.out.map.get(0).get(1)}/unmapped_bc_count.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/permit_map.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/collate.json").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/generate_permit_list.json").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/quant.json").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/featureDump.txt").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/permit_freq.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/unmapped_bc_count_collated.bin").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/alevin/quants_mat.mtx").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/alevin/quants_mat_cols.txt").exists() }, + { assert file("${process.out.quant.get(0).get(1)}/alevin/quants_mat_rows.txt").exists() }, + // { assert snapshot( + // process.out.versions, + // path("${process.out.map.get(0).get(1)}/map.rad"), + // path("${process.out.map.get(0).get(1)}/unmapped_bc_count.bin"), + // path("${process.out.quant.get(0).get(1)}/map.collated.rad"), + // path("${process.out.quant.get(0).get(1)}/featureDump.txt")) + // .match() } + ) + } + + } + + test("test_simpleaf_quant stub") { + options "-stub-run" + // config "./nextflow.config" + + when { + process { + """ + meta = [id:'test_10x', single_end:false, strandedness:'auto'] + files = [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/cellranger/5k_cmvpos_tcells/fastqs/gex_1/subsampled_5k_human_antiCMV_T_TBNK_connect_GEX_1_S1_L001_R1_001.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/10xgenomics/cellranger/5k_cmvpos_tcells/fastqs/gex_1/subsampled_5k_human_antiCMV_T_TBNK_connect_GEX_1_S1_L001_R2_001.fastq.gz', checkIfExists: true) + ] + input[0] = Channel.of([meta, '10xv3', files]) + input[1] = SIMPLEAF_INDEX.out.index.combine(SIMPLEAF_INDEX.out.t2g, by: 0) + input[2] = [[],"knee",[],[]] + input[3] = Channel.of('cr-like') + input[4] = Channel.of([[],[]]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} + diff --git a/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap b/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap new file mode 100644 index 00000000..874b8151 --- /dev/null +++ b/modules/nf-core/simpleaf/quant/tests/main.nf.test.snap @@ -0,0 +1,89 @@ +{ + "test_simpleaf_quant stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + "map.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "unmapped_bc_count.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + [ + "quants_mat_rows.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "map.collated.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "permit_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "2": [ + "versions.yml:md5,c9a934ed7c246bef3ccccab002db043b" + ], + "map": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + "map.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "unmapped_bc_count.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "quant": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + [ + "quants_mat_rows.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "map.collated.rad:md5,d41d8cd98f00b204e9800998ecf8427e", + "permit_freq.bin:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,c9a934ed7c246bef3ccccab002db043b" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T02:00:35.55447474" + }, + "test_simpleaf_quant": { + "content": [ + [ + "versions.yml:md5,c9a934ed7c246bef3ccccab002db043b" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-23T02:00:28.925349117" + } +} \ No newline at end of file diff --git a/modules/nf-core/simpleaf/quant/tests/tags.yml b/modules/nf-core/simpleaf/quant/tests/tags.yml new file mode 100644 index 00000000..fe7004d5 --- /dev/null +++ b/modules/nf-core/simpleaf/quant/tests/tags.yml @@ -0,0 +1,2 @@ +simpleaf/quant: + - "modules/nf-core/simpleaf/quant/**" diff --git a/nextflow_schema.json b/nextflow_schema.json index d5c666c7..ca8268af 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -58,10 +58,10 @@ "aligner": { "type": "string", "description": "Name of the tool to use for scRNA (pseudo-) alignment.", - "default": "alevin", - "help_text": "The workflow can handle three types of methods:\n\n- Kallisto/Bustools\n- Salmon + Alevin-fry + AlevinQC\n- STARsolo\n\nTo choose which one to use, please specify either `alevin`, `star` or `kallisto` as a parameter option for `--aligner`. By default, the pipeline runs the `alevin` option. Note that specifying another aligner option also requires choosing appropriate parameters (see below) for the selected option.", + "default": "simpleaf", + "help_text": "The workflow can handle three types of methods:\n\n- Kallisto/Bustools\n- SimpleAF \n- STARsolo\n\nTo choose which one to use, please specify either `simpleaf`, `star` or `kallisto` as a parameter option for `--aligner`. By default, the pipeline runs the `simpleaf` option. Note that specifying another aligner option also requires choosing appropriate parameters (see below) for the selected option.", "fa_icon": "fas fa-align-center", - "enum": ["kallisto", "star", "alevin", "cellranger", "cellrangerarc", "cellrangermulti"] + "enum": ["kallisto", "star", "simpleaf", "cellranger", "cellrangerarc", "cellrangermulti"] }, "protocol": { "type": "string", @@ -139,7 +139,7 @@ }, "save_reference": { "type": "boolean", - "description": "Specify this parameter to save the indices created (STAR, Kallisto, Salmon) to the results.", + "description": "Specify this parameter to save the indices created (STAR, Kallisto, Simpleaf) to the results.", "fa_icon": "fas fa-bookmark" }, "save_align_intermeds": { @@ -157,22 +157,24 @@ } } }, - "alevin_options": { - "title": "Alevin-fry Options", + "simpleaf_options": { + "title": "Simpleaf Options", "type": "object", "description": "", "default": "", "properties": { "simpleaf_index": { "type": "string", - "description": "This can be used to specify a precomputed Simpleaf index, either generated by Salmon or Piscem, in the pipeline, in order to skip the generation of required indices by Simpleaf itself.", + "help_text": "This can be used to specify a precomputed Simpleaf index in the pipeline, in order to skip the generation of required indices by Simpleaf itself.", + "description": "Path to pre-built Simpleaf index.", "fa_icon": "fas fa-fish", "format": "path", "exists": true }, "txp2gene": { "type": "string", - "description": "Path to transcript to gene mapping file. This allows the specification of a transcript to gene mapping file for Alevin-fry and AlevinQC.", + "description": "Path to transcript to gene mapping file.", + "help_text": "This allows the specification of a transcript to gene mapping file for Simpleaf. Only required if the index is generated from a transcriptomic reference FASTA file (`transcript_fasta`).", "fa_icon": "fas fa-map-marked-alt", "format": "file-path", "exists": true @@ -182,6 +184,7 @@ "default": "cr-like", "enum": ["cr-like", "cr-like-em", "parsimony", "parsimony-em", "parsimony-gene", "parsimony-gene-em"], "description": "UMI resolution strategy to deduplicate UMIs.", + "help_text": "This parameter specifies the [UMI resolution strategy](https://simpleaf.readthedocs.io/en/latest/quant-command.html) to use for deduplicating UMIs. The default is `cr-like`, which mimics the behavior of Cell Ranger, i.e., discarding UMIs equally compatible to multiple genes. Strategies that resolve multi-mapped UMIs by expectation maximization are available, like `cr-like-em`, `parsimony-em`, and `parsimony-gene-em`.", "fa_icon": "fas fa-map-marked-alt" } } diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index 7ae22e75..a63f369e 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -1,7 +1,7 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ include { ALEVINQC } from '../../modules/local/alevinqc' -include { SIMPLEAF_INDEX } from '../../../modules/modules/nf-core/simpleaf/index' -include { SIMPLEAF_QUANT } from '../../../modules/modules/nf-core/simpleaf/quant' +include { SIMPLEAF_INDEX } from '../../modules/modules/nf-core/simpleaf/index' +include { SIMPLEAF_QUANT } from '../../modules/modules/nf-core/simpleaf/quant' workflow SCRNASEQ_SIMPLEAF { @@ -59,13 +59,18 @@ workflow SCRNASEQ_SIMPLEAF { // define input channels for quantification // we can either use the mapping results or the reads and index files if ( map_dir ) { + // meta, chemistry, files ch_chemistry_reads = Channel.of( [ [:],[],[] ] ) + // meta, index, t2g file ch_index_t2g = Channel.of( [ [:],[],[] ] ) + // meta, map dir ch_map_dir = Channel.of( [ [id: map_dir.baseName], map_dir ] ) } else { + // meta, chemistry, files ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } - + // meta, index, t2g file ch_index_t2g = simpleaf_index.combine( txp2gene ) + // meta, map dir ch_map_dir = [ [:],[] ] } @@ -75,6 +80,7 @@ workflow SCRNASEQ_SIMPLEAF { SIMPLEAF_QUANT ( ch_chemistry_reads, ch_index_t2g, + // meta, cell filtering method, cell filtering params, whitelist [[:], "unfiltered-pl", [], barcode_whitelist ], resolution, ch_map_dir From afcc3a90bb3f7a899561a7fd117919b3e9ed94e3 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Fri, 31 Jan 2025 03:41:02 +0000 Subject: [PATCH 08/11] avoid using channel.of --- .github/workflows/awsfulltest.yml | 2 +- .github/workflows/awstest.yml | 2 +- .github/workflows/ci.yml | 2 +- README.md | 4 ++-- assets/protocols.json | 2 +- bin/alevin_qc.r | 2 +- conf/modules.config | 6 +++++- core.1739377 | 0 docs/output.md | 2 +- modules/local/alevinqc.nf | 4 ++-- modules/local/mtx_to_h5ad.nf | 2 -- nextflow.config | 4 ++-- nextflow_schema.json | 2 +- ro-crate-metadata.json | 3 ++- subworkflows/local/simpleaf.nf | 31 ++++++++++++++++-------------- tests/main_pipeline_alevin.nf.test | 2 +- workflows/scrnaseq.nf | 4 ++-- 17 files changed, 40 insertions(+), 34 deletions(-) delete mode 100644 core.1739377 diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 2023c5ba..fdfb2cc4 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - aligner: ["alevin", "kallisto", "star", "cellranger"] + aligner: ["simpleaf", "kallisto", "star", "cellranger"] steps: - name: Get PR reviews uses: octokit/request-action@v2.x diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index fccb7fc0..7467f463 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - aligner: ["alevin", "kallisto", "star", "cellranger"] + aligner: ["simpleaf", "kallisto", "star", "cellranger"] steps: # Launch workflow using Seqera Platform CLI tool action - name: Launch workflow via Seqera Platform diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0dc0e51..20a82f67 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: NXF_VER: - "24.04.2" - "latest-everything" - profile: ["alevin", "cellranger", "cellrangermulti", "kallisto", "star"] + profile: ["simpleaf", "cellranger", "cellrangermulti", "kallisto", "star"] steps: - name: Disk space cleanup diff --git a/README.md b/README.md index 3eec9319..d461bd42 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ This is a community effort in building a pipeline capable to support: -- Alevin-Fry + AlevinQC +- SimpleAF(Alevin-Fry) + AlevinQC - STARSolo - Kallisto + BUStools - Cellranger @@ -65,7 +65,7 @@ nextflow run nf-core/scrnaseq \ --fasta GRCm38.p6.genome.chr19.fa \ --gtf gencode.vM19.annotation.chr19.gtf \ --protocol 10XV2 \ - --aligner \ + --aligner \ --outdir ``` diff --git a/assets/protocols.json b/assets/protocols.json index 613e9773..34979228 100644 --- a/assets/protocols.json +++ b/assets/protocols.json @@ -1,5 +1,5 @@ { - "alevin": { + "simpleaf": { "10XV1": { "protocol": "10xv1", "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" diff --git a/bin/alevin_qc.r b/bin/alevin_qc.r index ec1362f9..441a037e 100755 --- a/bin/alevin_qc.r +++ b/bin/alevin_qc.r @@ -15,6 +15,6 @@ sampleId <- args[2] outDir <- args[3] alevinQCReport(baseDir = baseDir, sampleId = sampleId, - outputFile = "alevinReport.html", + outputFile = "simpleafQCReport.html", outputFormat = "html_document", outputDir = outDir, forceOverwrite = TRUE) diff --git a/conf/modules.config b/conf/modules.config index 75596e8f..2f729616 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -142,7 +142,9 @@ if (params.aligner == "simpleaf") { ] // because piscem create a large number of intermediate files, // we set scratch to true to avoid IO issues - scratch = true + // scratch = true + ext.prefix = { "simpleaf_index" } + } withName: 'SIMPLEAF_QUANT' { publishDir = [ @@ -150,6 +152,8 @@ if (params.aligner == "simpleaf") { mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + ext.prefix = { "simpleaf_quant" } + } // Fix for issue 196 // Modified for issue 334 diff --git a/core.1739377 b/core.1739377 deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/output.md b/docs/output.md index 04fc12ea..703666da 100644 --- a/docs/output.md +++ b/docs/output.md @@ -84,7 +84,7 @@ For details on how to load these into R and perform further downstream analysis, This pipeline uses the simplified and flexible modules in [Simpleaf](https://simpleaf.readthedocs.io/en/latest/) for processing single-cell data with [Salmon](https://salmon.readthedocs.io/en/latest/) or [Piscem](https://github.com/COMBINE-lab/piscem) as the underlying mapper and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) as the quantification tool. For detailed examples of using the quantification results generated by Alevin-fry in downstream analyses, such as RNA-velocity, please refer to [Alevin-fry/simpleaf tutorials](https://combine-lab.github.io/alevin-fry-tutorials/#blog). -**Output directory: `results/alevin`** +**Output directory: `results/simpleaf`** - `simpleaf` - Contains the count matrix created by Simpleaf diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf index f9096839..645121dc 100644 --- a/modules/local/alevinqc.nf +++ b/modules/local/alevinqc.nf @@ -19,7 +19,7 @@ process ALEVINQC { tuple val(meta2), path(map_dir) output: - tuple val(meta), path("alevin_report_${meta.id}.html"), emit: report + tuple val(meta), path("simpleaf_qc_report_${meta.id}.html"), emit: report path "versions.yml", emit: versions when: @@ -35,7 +35,7 @@ process ALEVINQC { permitDir= "${permit_dir}", quantDir = "${quant_dir}", sampleId = "${prefix}", - outputFile = "alevin_report_${meta.id}.html", + outputFile = "simpleaf_qc_report_${meta.id}.html", outputFormat = "html_document", outputDir = "./", forceOverwrite = TRUE diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 7e1bdd0a..424580f6 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -28,8 +28,6 @@ process MTX_TO_H5AD { script: def aligner = (input_aligner in [ 'cellranger', 'cellrangerarc', 'cellrangermulti' ]) ? 'cellranger' : input_aligner - aligner = input_aligner == "alevin" ? "simpleaf" : aligner - template "mtx_to_h5ad_${aligner}.py" stub: diff --git a/nextflow.config b/nextflow.config index 90e083aa..87729ca9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,7 +10,7 @@ params { // generic options - aligner = 'alevin' + aligner = 'simpleaf' input = null save_reference = false save_align_intermeds = true @@ -23,7 +23,7 @@ params { fasta = null gtf = null - // alevin-fry parameters (simpleaf) + // simpleaf parameters simpleaf_index = null barcode_whitelist = null simpleaf_umi_resolution = "cr-like" diff --git a/nextflow_schema.json b/nextflow_schema.json index ca8268af..ae4f852e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -519,7 +519,7 @@ "$ref": "#/$defs/reference_genome_options" }, { - "$ref": "#/$defs/alevin_options" + "$ref": "#/$defs/simpleaf_options" }, { "$ref": "#/$defs/starsolo_options" diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 03ca8466..bb370c59 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -139,7 +139,8 @@ "nextflow", "10x-genomics", "10xgenomics", - "alevin", + "simpleaf", + "alevi-fry", "bustools", "cellranger", "kallisto", diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index a63f369e..b0b0f797 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -1,7 +1,7 @@ /* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ include { ALEVINQC } from '../../modules/local/alevinqc' -include { SIMPLEAF_INDEX } from '../../modules/modules/nf-core/simpleaf/index' -include { SIMPLEAF_QUANT } from '../../modules/modules/nf-core/simpleaf/quant' +include { SIMPLEAF_INDEX } from '../../../modules/modules/nf-core/simpleaf/index' +include { SIMPLEAF_QUANT } from '../../../modules/modules/nf-core/simpleaf/quant' workflow SCRNASEQ_SIMPLEAF { @@ -27,16 +27,18 @@ workflow SCRNASEQ_SIMPLEAF { // define input channels for index building // we can either use the genome fasta and gtf files or the transcript fasta file if ( transcript_fasta ) { - ch_genome_fasta_gtf = [ [:],[],[] ] - ch_transcript_fasta = Channel.of( [ [id: "${transcript_fasta.getBaseName()}"], transcript_fasta ] ) + ch_genome_fasta_gtf = [ [:],[],[] ] // meta, genome fasta, genome gtf + ch_transcript_fasta = [ [id: "${transcript_fasta.getName()}"], transcript_fasta ] // meta, transcript fasta } else { - ch_genome_fasta_gtf = ch_genome_fasta.combine( ch_genome_gtf ).map{ fasta, gtf -> [[id: "${fasta.getBaseName()}"], fasta, gtf] } - ch_transcript_fasta = Channel.of( [ [:], [] ] ) + ch_genome_fasta_gtf = ch_genome_fasta.combine( ch_genome_gtf ).map{ fasta, gtf -> [[id: "${fasta.getName()}"], fasta, gtf] } + ch_transcript_fasta = [ [:], [] ] // meta, transcript fasta } SIMPLEAF_INDEX( ch_genome_fasta_gtf, - ch_transcript_fasta + ch_transcript_fasta, + [[:], []], // meta, probe CSV + [[:], []] // meta, feature CSV ) // Channel of tuple(meta, index dir) simpleaf_index = SIMPLEAF_INDEX.out.index.collect() @@ -60,16 +62,16 @@ workflow SCRNASEQ_SIMPLEAF { // we can either use the mapping results or the reads and index files if ( map_dir ) { // meta, chemistry, files - ch_chemistry_reads = Channel.of( [ [:],[],[] ] ) + ch_chemistry_reads = [ [:],[],[] ] // meta, index, t2g file - ch_index_t2g = Channel.of( [ [:],[],[] ] ) + ch_index_t2g = [ [:],[],[] ] // meta, map dir - ch_map_dir = Channel.of( [ [id: map_dir.baseName], map_dir ] ) + ch_map_dir = [ [id: map_dir.baseName], map_dir ] } else { // meta, chemistry, files ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } // meta, index, t2g file - ch_index_t2g = simpleaf_index.combine( txp2gene ) + ch_index_t2g = simpleaf_index.combine( txp2gene ).collect() // meta, map dir ch_map_dir = [ [:],[] ] } @@ -88,10 +90,11 @@ workflow SCRNASEQ_SIMPLEAF { ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) ch_af_map = map_dir ? ch_map_dir : SIMPLEAF_QUANT.out.map + ch_af_quant = SIMPLEAF_QUANT.out.quant /* * Run alevinQC */ - ALEVINQC( SIMPLEAF_QUANT.out.quant, SIMPLEAF_QUANT.out.quant, ch_af_map ) + ALEVINQC( ch_af_quant, ch_af_quant, ch_af_map ) ch_versions = ch_versions.mix(ALEVINQC.out.versions) @@ -99,7 +102,7 @@ workflow SCRNASEQ_SIMPLEAF { ch_versions txp2gene index = simpleaf_index - map = SIMPLEAF_QUANT.out.map - quant = SIMPLEAF_QUANT.out.quant + map = ch_af_map + quant = ch_af_quant alevinqc = ALEVINQC.out.report } diff --git a/tests/main_pipeline_alevin.nf.test b/tests/main_pipeline_alevin.nf.test index 7ed23a61..da2df188 100644 --- a/tests/main_pipeline_alevin.nf.test +++ b/tests/main_pipeline_alevin.nf.test @@ -8,7 +8,7 @@ nextflow_pipeline { when { // the rest is taken from shared config params { - aligner = 'alevin' + aligner = 'simpleaf' outdir = "${outputDir}/results_alevin" } } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 9bde4996..bc6f6e5c 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -133,8 +133,8 @@ workflow SCRNASEQ { ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } - // Run salmon alevin pipeline - if (params.aligner == "alevin") { + // Run salmon simpleaf pipeline + if (params.aligner == "simpleaf") { SCRNASEQ_SIMPLEAF( ch_genome_fasta, From 2bd54eab20bec95dcaadb8eb1da2d149a9b43993 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Fri, 31 Jan 2025 04:52:11 +0000 Subject: [PATCH 09/11] back compatibility --- main.nf | 2 +- subworkflows/local/simpleaf.nf | 9 ++++----- workflows/scrnaseq.nf | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index e1528143..26f8ac1f 100644 --- a/main.nf +++ b/main.nf @@ -30,7 +30,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_scrn // Thus, manually provided files are not overwritten by the genome attributes params.fasta = getGenomeAttribute('fasta') params.gtf = getGenomeAttribute('gtf') -params.simpleaf_index = getGenomeAttribute('simpleaf') +params.simpleaf_index = getGenomeAttribute('simpleaf') ?: getGenomeAttribute('salmon') params.txp2gene = getGenomeAttribute('simpleaf_txp2gene') params.cellranger_index = params.aligner == 'cellrangerarc' ? getGenomeAttribute('cellrangerarc') : diff --git a/subworkflows/local/simpleaf.nf b/subworkflows/local/simpleaf.nf index b0b0f797..dde70184 100644 --- a/subworkflows/local/simpleaf.nf +++ b/subworkflows/local/simpleaf.nf @@ -44,7 +44,7 @@ workflow SCRNASEQ_SIMPLEAF { simpleaf_index = SIMPLEAF_INDEX.out.index.collect() // Channel of t2g path or empty t2g = SIMPLEAF_INDEX.out.t2g.collect().map { _meta, it -> it } - ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) + ch_versions = ch_versions.mix( SIMPLEAF_INDEX.out.versions ) // ensure txp2gene is a Channel if (!txp2gene) { @@ -54,8 +54,7 @@ workflow SCRNASEQ_SIMPLEAF { } } else { // ensure simpleaf index and txp2gene are Channels - simpleaf_index = Channel.of( [ simpleaf_index ] ) - txp2gene = Channel.of( [ txp2gene ] ) + simpleaf_index = Channel.of( [ [ id: simpleaf_index.getName() ], simpleaf_index ] ) } // define input channels for quantification @@ -66,10 +65,10 @@ workflow SCRNASEQ_SIMPLEAF { // meta, index, t2g file ch_index_t2g = [ [:],[],[] ] // meta, map dir - ch_map_dir = [ [id: map_dir.baseName], map_dir ] + ch_map_dir = [ [ id: map_dir.baseName ], map_dir ] } else { // meta, chemistry, files - ch_chemistry_reads = ch_fastq.map{ meta, files -> tuple(meta + ["chemistry": chemistry], chemistry, files) } + ch_chemistry_reads = ch_fastq.map{ meta, files -> [meta + ["chemistry": chemistry], chemistry, files] } // meta, index, t2g file ch_index_t2g = simpleaf_index.combine( txp2gene ).collect() // meta, map dir diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index bc6f6e5c..e60c8e74 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -134,7 +134,7 @@ workflow SCRNASEQ { } // Run salmon simpleaf pipeline - if (params.aligner == "simpleaf") { + if (params.aligner == "simpleaf" || params.aligner == "alevin") { SCRNASEQ_SIMPLEAF( ch_genome_fasta, From 5ad172762dac34d6ba3c197010066c361a6c1bd2 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Fri, 31 Jan 2025 15:58:05 +0000 Subject: [PATCH 10/11] rewrite mtx_to_h5ad_simpleaf to be aware of USA mode --- conf/modules.config | 2 +- .../local/templates/mtx_to_h5ad_simpleaf.py | 345 +++++++++++++++++- 2 files changed, 335 insertions(+), 12 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 2f729616..187d18e3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -142,7 +142,7 @@ if (params.aligner == "simpleaf") { ] // because piscem create a large number of intermediate files, // we set scratch to true to avoid IO issues - // scratch = true + scratch = true ext.prefix = { "simpleaf_index" } } diff --git a/modules/local/templates/mtx_to_h5ad_simpleaf.py b/modules/local/templates/mtx_to_h5ad_simpleaf.py index f9b5bf01..576d955e 100755 --- a/modules/local/templates/mtx_to_h5ad_simpleaf.py +++ b/modules/local/templates/mtx_to_h5ad_simpleaf.py @@ -11,19 +11,330 @@ import anndata from anndata import AnnData import platform +import json -def _mtx_to_adata( - input: str, - sample: str, +def say(quiet, words): + if not quiet: + print(words) + +def load_fry( + frydir, + output_format="snRNA", + aux_columns=["X", "Y"], + gene_id_to_name=None, + quiet=False, ): + """ + load alevin-fry quantification result into an AnnData object - adata = sc.read_mtx(f"{input}/quants_mat.mtx") - adata.obs_names = pd.read_csv(f"{input}/quants_mat_rows.txt", header=None, sep="\\t")[0].values - adata.var_names = pd.read_csv(f"{input}/quants_mat_cols.txt", header=None, sep="\\t")[0].values - adata.obs["sample"] = sample + Required Parameters + ---------- + frydir : `str` + The path to a output directory returned by alevin-fry quant command. \\ + The directory containing the alevin-fry quantification (i.e. the the quant.json file & alevin subdirectory). + + Optional Parameters + ---------- + output_format : `str` or `dict` + A string represents one of the pre-defined output formats, which are "scRNA", "S+A", "snRNA", "all", "U+S+A" and "velocity". \\ + If a customized format of the returned `AnnData` is needed, one can pass a Dictionary.\\ + See Notes section for details. + + aux_columns : `list[str]` (default: `["X", "Y"]`) + A list of strings contains the column names of the auxiliary information in the barcodes file starting from the second column. \\ + The first column is assumed to be the barcodes and is named as "barcodes". \\ + Extra auxiliary columns in the barcodes file without a specified name will be ignored. + + gene_id_to_name : `str` or `None` (default: `None`) + The path to a file that contains the mapping from gene names to gene ids. \\ + It is only needed if \\ + 1. you are not using the simpleaf pipeline (`simpleaf index` + `simpleaf quant`), \\ + 2. you have such a file, and, + 3. you want to add this information to the coldata of your anndata. + If you do, please ensure it is a tab-separated, two-column file without a header, and the first column is the gene ids and the second column is the gene names. + + nonzero : `bool` (default: `False`) + True if cells with non-zero expression value across all genes should be filtered in each layer. + False if unexpressed genes should be kept. + + quiet : `bool` (default: `False`) + True if function should be quiet. + False if messages (including error messages) should be printed out. + + Notes + ---------- + The `output_format` argument takes either a dictionary that defines the customized format or + a string that represents one of the pre-defined format of the returned `AnnData` object. + + Each of the pre-defined formats contains a `X` field and some optional extra `AnnData.layers` + obtained from the submatrices representing unspliced (U), spliced (S) and ambiguous (A) counts + returned by alevin-fry. + + The following formats are defined: + + * "scRNA" and "S+A": \\ + This format is recommended for single cell RNA-sequencing experiments. + It returns a `X` field that contains the S+A count of each gene in each cell, + and a `unspliced` field that contains the U count of each gene. + + * "snRNA", "all" and "U+S+A": \\ + These three formats are the same. They return a `X` field that contains the U+S+A + count of each gene in each cell without any extra layers. + It is recommended for single-nucleus RNA-sequencing experiments. + CellRanger 7 returns this format for both single-cell and single-nucleus experiments. + + * "raw": \\ + This format uses the S count matrix as the `X` field and put the U, S, and A counts into three + separate layers, which are "unspliced", "spliced" and "ambiguous". + + * "velocity": \\ + This format is the same as "scRNA", except it contains two extra layers: the "spliced" layer, + which contains the S+A counts, and the "unspliced" layer, which contains the U counts. + + A custom output format can be defined using a Dictionary specifying the desired format of the output `Anndata` object. + If the input is not a USA mode quantification directory, this parameter is ignored + and the count matrix is returned in the `X` field of the returned `AnnData` object. If the input + quantification directory contains a USA mode quantification, then there are 3 sub-matrices that can + be referenced in the dictionary; 'U', 'S', 'A' containing, respectively, unspliced, spliced and + ambiguous counts. The dictionary should have entries of the form `key` (str) : `value` (list[str]). + The following constraints apply : there should be one key-value pair with the key `X`, the resulting + counts will be returned in the `X` field of the AnnData object. There can be an arbitrary number + of other key-value pairs, but each will be returned as a layer of the resulting AnnData object. + Within the key-value pairs, the key refers to the layer name that will be given to the combined + count matrix upon output, and the value should be a subset of `['U', 'S', 'A']` that defines + which sub-matrices should be summed. For example: + `{'X' : ['S', 'A'], 'unspliced' : ['U']}` + will result in a return AnnData object where the X field has a matrix in which each entry + corresponds to the summed spliced and ambiguous counts for each gene in each cell, and there + is an additional "unspliced" layer, whose counts are taken directly from the unspliced sub-matrix. + + Returns: + ---------- + An AnnData object with X and layers corresponding to the requested `output_format`. + + """ + + # since alevin-fry 0.4.1 the generic "meta_info.json" + # has been replaced by a more informative name for each + # sub-command. For quantification, it is "quant.json". + # we check for both files here, in order. + meta_info_files = ["quant.json", "meta_info.json"] + + fpath = os.path.sep.join([frydir, meta_info_files[0]]) + # first, check for the new file, if we don't find it, check + # for the old one. + if not os.path.exists(fpath): + say( + quiet, + f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.", + ) + + fpath = os.path.sep.join([frydir, meta_info_files[1]]) + # if we don't find the old one either, then return None + if not os.path.exists(fpath): + raise IOError( + "The profvided `frydir` doesn't contain required meta info file; cannot proceed." + ) + + # if we got here then we had a valid json file, so + # use it to get the number of genes, and if we are + # in USA mode or not. + meta_info = json.load(open(fpath)) + ng = meta_info["num_genes"] + usa_mode = meta_info["usa_mode"] + + say(quiet, f"USA mode: {usa_mode}") + + # if we are in USA mode + if usa_mode: + # preparation + # each gene has 3 splicing statuses, so the actual number of distinct + # genes is ng/3. + ng = int(ng / 3) + output_assays = process_output_format(output_format, quiet) + else: + say( + quiet, + "Processing input in standard mode, the count matrix will be stored in field 'X'.", + ) + if output_format != "scRNA": + say(quiet, "Output_format will be ignored.") - return adata + # read the gene ids + afg_df = pd.read_table( + os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"]), + names=["gene_id"], + nrows=ng + ) + # if we have a gene name to id mapping, use it + # Otherwise, we see if there is a default mapping file + # We first hope the user gives one + gene_id_to_name_path = gene_id_to_name + # make sure the file exists + if gene_id_to_name_path is not None: + if not os.path.exists(gene_id_to_name_path): + say( + quiet, + f"The provided gene id to name mapping file {gene_id_to_name_path} does not exist; ignored.", + ) + gene_id_to_name_path = None + + # we then check if we can find a default mapping file + if gene_id_to_name_path is None: + default_gene_id_to_name_path = os.path.sep.join([frydir, "gene_id_to_name.tsv"]) + if os.path.exists(default_gene_id_to_name_path): + gene_id_to_name_path = default_gene_id_to_name_path + say( + quiet, + f"Using simpleaf gene id to name mapping file: {gene_id_to_name_path}", + ) + + # read the file if we find it + if gene_id_to_name_path is not None: + gene_id_to_name_df = pd.read_table( + gene_id_to_name_path, names=["gene_id", "gene_symbol"] + ) + afg_df = pd.merge(afg_df, gene_id_to_name_df, on="gene_id", how="left") + + afg_df = afg_df.set_index("gene_id", drop=False) + + # read the barcodes file + abc_df = pd.read_table( + os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"]), header=None + ) + + # if column names and num columns don't match, match them + # we have only barcode + if abc_df.shape[1] == 1: + columns = ["barcodes"] + else: + say(quiet, f"Found {abc_df.shape[1] - 1} auxiliary columns in barcodes file.") + columns = ["barcodes"] + aux_columns + + if abc_df.shape[1] != len(columns): + ncol = min(abc_df.shape[1], len(columns)) + + say( + quiet, + f"Number of auxiliary columns in barcodes file does not match the provided column names. Using the first {ncol} columns in the barcode file with names: {columns[:ncol]}.", + ) + + abc_df.columns = columns[:ncol] + columns = columns[:ncol] + + abc_df = abc_df.set_axis(columns, axis=1) + abc_df = abc_df.set_index(columns[0], drop=False) + + say(quiet, "Reading the quantification matrix.") + af_raw = sc.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"])) + x = af_raw.X + # if we're not in USA mode, just combine this info into + # an AnnData object + if not usa_mode: + say(quiet, "Constructing the AnnData object.") + af = sc.AnnData(x.T, var=abc_df, obs=afg_df) + af = af.T + + else: # USA mode + say(quiet, "Converting USA mode quantification into specified layers.") + # otherwise, combine the sub-matrices into the output object as + # specified by `output_assays` + rd = {"S": range(0, ng), "U": range(ng, 2 * ng), "A": range(2 * ng, 3 * ng)} + xcounts = output_assays["X"] + o = x[:, rd[xcounts[0]]] + for wc in xcounts[1:]: + o += x[:, rd[wc]] + af = sc.AnnData(o.T, var=abc_df, obs=afg_df) + af = af.T + + # now, if there are other layers requested, populate those + for other_layer in output_assays.keys() - "X": + xcounts = output_assays[other_layer] + o = x[:, rd[xcounts[0]]] + for wc in xcounts[1:]: + o += x[:, rd[wc]] + af.layers[other_layer] = o + + say(quiet, "Done.") + return af + + +def process_output_format(output_format, quiet): + # make sure output_format isn't empty + if not output_format: + raise ValueError("output_format cannot be empty") + + if isinstance(output_format, (str, dict)): + if isinstance(output_format, str): + predefined_format = { + "scrna": {"X": ["S", "A"], "unspliced": ["U"]}, + "S+A": {"X": ["S", "A"]}, + "snrna": {"X": ["U", "S", "A"]}, + "all": {"X": ["U", "S", "A"]}, + "U+S+A": {"X": ["U", "S", "A"]}, + "velocity": { + "X": ["S", "A"], + "spliced": ["S", "A"], + "unspliced": ["U"], + }, + "raw": { + "X": ["S"], + "spliced": ["S"], + "unspliced": ["U"], + "ambiguous": ["A"], + }, + } + + output_format = output_format.lower() + if output_format not in predefined_format.keys(): + # invalid output_format string + say(quiet, "A undefined Provided output_format string provided.") + say(quiet, "See function help message for details.") + raise ValueError("Invalid output_format.") + say(quiet, f"Using pre-defined output format: {output_format}") + say( + quiet, + f"Will populate output field X with sum of counts from {predefined_format[output_format]['X']}.", + ) + for k, v in predefined_format[output_format].items(): + if k != "X": + say(quiet, f"Will combine {v} into output layer {k}.") + + return predefined_format[output_format] + else: + say(quiet, "Processing user-defined output format.") + # make sure the X is there + if "X" not in output_format.keys(): + raise ValueError( + 'In USA mode some sub-matrices must be assigned to the "X" (default) output.' + ) + say( + quiet, + f"Will populate output field X with sum of counts frorm {output_format['X']}.", + ) + + for k, v in output_format.items(): + if not v: + # empty list + raise ValueError( + f"The element list of key '{k}' in output_format is empty. Please remove it." + ) + + # v contains Non-USA element + if len(set(v) - set(["U", "S", "A"])) != 0: + # invalid value + raise ValueError( + f"Found non-USA element in output_format element list '{v}' for key '{k}'; cannot proceed." + ) + if k != "X": + say(quiet, f"Will combine {v} into output layer {k}.") + + return output_format + else: + raise ValueError( + "Provided invalid output_format. See function help message for details" + ) def format_yaml_like(data: dict, indent: int = 0) -> str: """Formats a dictionary to a YAML-like string. @@ -63,11 +374,23 @@ def input_to_adata( print(f"Reading in {input_data}") # open main data - adata = _mtx_to_adata(input_data, sample) + simpleaf_h5ad_path = f"{input_data}/alevin/quant.h5ad" + + if os.path.exists(simpleaf_h5ad_path): + adata = sc.read_h5ad(simpleaf_h5ad_path) + else: + output_format={ + "X": ["S", "U", "A"], + "spliced": ["S"], + "unspliced": ["U"], + "ambiguous": ["A"], + } + adata = load_fry(input_data, output_format=output_format, quiet=False) + + adata.obs["sample"] = sample # standard format # index are gene IDs and symbols are a column - # TODO: how to get gene_symbols for alevin? adata.var['gene_versions'] = adata.var.index adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values adata.var_names_make_unique() @@ -85,7 +408,7 @@ def input_to_adata( # input_type comes from NF module input_to_adata( - input_data="${inputs}/alevin/", + input_data="${inputs}", output="${meta.id}_${meta.input_type}_matrix.h5ad", sample="${meta.id}" ) From 1b6dc9b5bdc196915e42befa2a2157ba4940e861 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Fri, 31 Jan 2025 16:09:14 +0000 Subject: [PATCH 11/11] rewrite mtx_to_h5ad_simpleaf to be aware of USA mode --- modules/local/templates/mtx_to_h5ad_simpleaf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/local/templates/mtx_to_h5ad_simpleaf.py b/modules/local/templates/mtx_to_h5ad_simpleaf.py index 576d955e..27f373ba 100755 --- a/modules/local/templates/mtx_to_h5ad_simpleaf.py +++ b/modules/local/templates/mtx_to_h5ad_simpleaf.py @@ -17,6 +17,8 @@ def say(quiet, words): if not quiet: print(words) +# From pyroe https://github.com/COMBINE-lab/pyroe/blob/develop/src/pyroe/load_fry.py +# this can be removed because the simpleaf quant module on nf-core/modules with simpleaf 0.19.0 exports an h5ad file directly. def load_fry( frydir, output_format="snRNA",