nf-core · DongzeHE · Jan 20, 2025 · Jan 21, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
@@ -20,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        aligner: ["alevin", "kallisto", "star", "cellranger"]
+        aligner: ["simpleaf", "kallisto", "star", "cellranger"]
     steps:
       - name: Get PR reviews
         uses: octokit/request-action@v2.x

diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        aligner: ["alevin", "kallisto", "star", "cellranger"]
+        aligner: ["simpleaf", "kallisto", "star", "cellranger"]
     steps:
       # Launch workflow using Seqera Platform CLI tool action
       - name: Launch workflow via Seqera Platform

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -38,7 +38,7 @@ jobs:
         NXF_VER:
           - "24.04.2"
           - "latest-everything"
-        profile: ["alevin", "cellranger", "cellrangermulti", "kallisto", "star"]
+        profile: ["simpleaf", "cellranger", "cellrangermulti", "kallisto", "star"]
 
     steps:
       - name: Disk space cleanup

diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@
 
 This is a community effort in building a pipeline capable to support:
 
-- Alevin-Fry + AlevinQC
+- Simpleaf (Alevin-fry) + AlevinQC
 - STARSolo
 - Kallisto + BUStools
 - Cellranger
@@ -65,7 +65,7 @@ nextflow run nf-core/scrnaseq \
    --fasta GRCm38.p6.genome.chr19.fa \
    --gtf gencode.vM19.annotation.chr19.gtf \
    --protocol 10XV2 \
-   --aligner <alevin/kallisto/star/cellranger> \
+   --aligner <simpleaf/kallisto/star/cellranger> \
    --outdir <OUTDIR>
 ```
 

diff --git a/assets/protocols.json b/assets/protocols.json
@@ -1,5 +1,5 @@
 {
-    "alevin": {
+    "simpleaf": {
         "10XV1": {
             "protocol": "10xv1",
             "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz"

diff --git a/bin/alevin_qc.r b/bin/alevin_qc.r
@@ -15,6 +15,6 @@ sampleId <- args[2]
 outDir <- args[3]
 
 alevinQCReport(baseDir = baseDir, sampleId = sampleId,
-                outputFile = "alevinReport.html",
+                outputFile = "simpleafQCReport.html",
                 outputFormat = "html_document",
                 outputDir = outDir, forceOverwrite = TRUE)
diff --git a/conf/modules.config b/conf/modules.config
@@ -41,6 +41,7 @@ process {
         }
         withName: 'ANNDATA_BARCODES' {
             ext.prefix = { "${meta.id}_${meta.input_type}_matrix" }
+            // ext.prefix = { "${meta.id}_filtered_matrix" }
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" },
                 mode: params.publish_dir_mode,
@@ -127,31 +128,36 @@ if(params.aligner == "cellrangerarc") {
     }
 }
 
-if (params.aligner == "alevin") {
+if (params.aligner == "simpleaf" || params.aligner == "alevin") {
     process {
-        withName: GFFREAD_TXP2GENE {
-            ext.args = "--table transcript_id,gene_id"
-            ext.prefix = { "${gff.baseName}_gffread" }
-        }
         withName: 'SIMPLEAF_INDEX' {
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}" },
                 mode: params.publish_dir_mode,
                 enabled: params.save_reference
             ]
-            ext.args = { "--rlen ${params.simpleaf_rlen}" }
+            // Because piscem creates a large number of intermediate files,
+            // we set scratch to true to avoid I/O issues.
+            scratch = true
+            ext.prefix = { "simpleaf_index" }
+
         }
         withName: 'SIMPLEAF_QUANT' {
             publishDir = [
                 path: { "${params.outdir}/${params.aligner}/${meta.id}" },
                 mode: params.publish_dir_mode,
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
-            ext.args = "-r cr-like"
+            ext.prefix = { "simpleaf_quant" }
+
         }
         // Fix for issue 196
         // Modified for issue 334
         withName: 'ALEVINQC' {
+            publishDir = [
+                path: { "${params.outdir}/${params.aligner}/${meta.id}" },
+                mode: params.publish_dir_mode,
+            ]
             time = { 120.h }
         }
     }

diff --git a/docs/images/nf-core-scrnaseq_logo_light.png b/docs/images/nf-core-scrnaseq_logo_light.png
diff --git a/docs/output.md b/docs/output.md
@@ -15,7 +15,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [FastQC](#fastqc)
 - [Kallisto \& Bustools Results](#kallisto--bustools-results)
 - [STARsolo](#starsolo)
-- [Salmon \& Alevin-fry \& AlevinQC](#salmon--alevin-fry--alevinqc)
+- [Simpleaf \& AlevinQC](#simpleaf--alevinqc)
 - [Cellranger](#cellranger)
 - [Cellranger ARC](#cellranger-arc)
 - [Cellranger multi](#cellranger-multi)
@@ -80,23 +80,22 @@ For details on how to load these into R and perform further downstream analysis,
 - `star_index`
   - Contains the index of the supplied genome fasta file
 
-## Salmon & Alevin-fry & AlevinQC
+## Simpleaf & AlevinQC
 
-This pipeline uses the simplified and flexible modules in [Simpleaf](https://simpleaf.readthedocs.io/en/latest/) for processing single-cell data with [Salmon](https://salmon.readthedocs.io/en/latest/) as the underlying mapper and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) as the quantification tool. For detailed examples of using the quantification results generated by Alevin-fry in downstream analyses, such as RNA-velocity, please refer to [Alevin-fry/simpleaf tutorials](https://combine-lab.github.io/alevin-fry-tutorials/#blog).
+This pipeline uses the simplified and flexible modules in [Simpleaf](https://simpleaf.readthedocs.io/en/latest/) for processing single-cell data with [Salmon](https://salmon.readthedocs.io/en/latest/) or [Piscem](https://github.com/COMBINE-lab/piscem) as the underlying mapper and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) as the quantification tool. For detailed examples of using the quantification results generated by Alevin-fry in downstream analyses, such as RNA-velocity, please refer to [Alevin-fry/simpleaf tutorials](https://combine-lab.github.io/alevin-fry-tutorials/#blog).
 
-**Output directory: `results/alevin`**
+**Output directory: `results/simpleaf`**
 
-- `alevin`
-  - Contains the count matrix created by Alevin-fry
-- `alevinqc`
-  - Contains the QC report for the aforementioned Alevin-fry output data
-
-**Output directory: `results/reference_genome`**
-
-- `salmon_index`
-  - Contains the indexed reference transcriptome for the Salmon mapper
-- `alevin/txp2gene.tsv`
-  - The transcriptome to gene mapping TSV file utilized by Alevin-fry
+- `${meta.id}/simpleaf_quant/af_quant/alevin`
+  - Contains the quantification results -- the count matrix -- generated by simpleaf for each sample.
+- `${meta.id}/simpleaf_quant/af_quant`
+  - Contains the logs and other intermediate results generated during the quantification stage.
+- `${meta.id}/simpleaf_quant/af_map`
+  - Contains the logs and other intermediate results generated during the mapping stage.
+- `${meta.id}/simpleaf_qc_report_${meta.id}.html`
+  - The QC report generated by AlevinQC for each sample.
+- `simpleaf_index`
+  - Contains the indexed reference transcriptome generated by Simpleaf. This folder will be generated if the index was generated by the pipeline and the `save_reference` parameter is set to `true`. Note that, because the simpleaf index generated from a specific reference is fixed, this folder can be passed to the pipeline via the `simpleaf_index` parameter, or saved in `igenome.config` under `genomes[ params.genome ][ "simpleaf" ]`, to avoid re-indexing the reference.
 
 ## Cellranger
 
@@ -161,11 +160,11 @@ So, the conversion modules generate data with the following syntax: **`*_{raw,fi
 
 | suffix            | meaning                                                                                                                                  |
 | :---------------- | :--------------------------------------------------------------------------------------------------------------------------------------- |
-| raw               | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as alevin. |
+| raw               | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as simpleaf. |
 | filtered          | Conversion of the filtered/processed matrix generated by the tool                                                                        |
 | cellbender_filter | Conversion of the matrix that was generated by the cellbender remove background filtering module                                         |
 
-> Some aligners, like `alevin` do not produce both raw&filtered matrices. When aligners give only one output, they are treated with the `raw` suffix.
+> Some aligners, like `simpleaf`, do not produce both raw and filtered matrices. When an aligner gives only one output, it is treated with the `raw` suffix.
 > Some aligners may have an option to give both raw&filtered and only one, like `kallisto`. Be aware when using the tools.
 
 ## MultiQC

diff --git a/docs/usage.md b/docs/usage.md
@@ -47,7 +47,7 @@ Note that since cellranger v7, it is **not recommended** anymore to supply the `
 
 ## Aligning options
 
-By default (i.e. `--aligner alevin`), the pipeline uses [Salmon](https://salmon.readthedocs.io/en/latest/) to perform pseudo-alignment of reads to the reference genome and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) to perform the downstream BAM-level quantification. Then QC reports are generated with AlevinQC.
+By default (i.e. `--aligner simpleaf`), the pipeline uses [Salmon](https://salmon.readthedocs.io/en/latest/) to perform pseudo-alignment of reads to the reference genome and [Alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) to perform the downstream BAM-level quantification. Then QC reports are generated with AlevinQC.
 
 Other aligner options for running the pipeline are:
 

diff --git a/main.nf b/main.nf
@@ -30,8 +30,8 @@ include { PIPELINE_COMPLETION     } from './subworkflows/local/utils_nfcore_scrn
 // Thus, manually provided files are not overwritten by the genome attributes
 params.fasta            = getGenomeAttribute('fasta')
 params.gtf              = getGenomeAttribute('gtf')
-params.salmon_index     = getGenomeAttribute('simpleaf')
-params.txp2gene         = getGenomeAttribute('simpleaf_tx2pgene')
+params.simpleaf_index   = getGenomeAttribute('simpleaf') ?: getGenomeAttribute('salmon')
+params.txp2gene         = getGenomeAttribute('simpleaf_txp2gene')
 params.cellranger_index = params.aligner == 'cellrangerarc' ?
                             getGenomeAttribute('cellrangerarc') :
                             getGenomeAttribute('cellranger')

diff --git a/modules.json b/modules.json
@@ -85,6 +85,16 @@
                         "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
                         "installed_by": ["modules"]
                     },
+                    "simpleaf/index": {
+                        "branch": "master",
+                        "git_sha": "094299f4e5460f014fcc637e069adc33e420cf8e",
+                        "installed_by": ["modules"]
+                    },
+                    "simpleaf/quant": {
+                        "branch": "master",
+                        "git_sha": "094299f4e5460f014fcc637e069adc33e420cf8e",
+                        "installed_by": ["modules"]
+                    },
                     "star/genomegenerate": {
                         "branch": "master",
                         "git_sha": "46eca555142d6e597729fcb682adcc791796f514",

diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf
@@ -1,23 +1,25 @@
 process ALEVINQC {
 
     //
-    // This module executes alevinfry QC reporting tool on alevin results
+    // This module executes alevinfry QC reporting tool on alevin-fry results
     //
 
     tag "$meta.id"
     label 'process_low'
 
-    //The alevinqc 1.14.0 container is broken, missing some libraries - thus reverting this to previous 1.12.1 version
-    conda "bioconda::bioconductor-alevinqc=1.12.1"
+    conda "bioconda::bioconductor-alevinqc=1.18.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' :
-        'biocontainers/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' }"
+        'https://depot.galaxyproject.org/singularity/bioconductor-alevinqc:1.18.0--r43hf17093f_0' :
+        'biocontainers/bioconductor-alevinqc:1.18.0--r43hf17093f_0' }"
 
+    // meta, meta1 and meta2 are expected to be identical; only meta is used in the output
     input:
-    tuple val(meta), path(alevin_results)
+    tuple val(meta), path(quant_dir, stageAs: "quant_dir")
+    tuple val(meta1), path(permit_dir, stageAs: "permit_dir")
+    tuple val(meta2), path(map_dir)
 
     output:
-    tuple val(meta), path("alevin_report_${meta.id}.html"), emit: report
+    tuple val(meta), path("simpleaf_qc_report_${meta.id}.html"), emit: report
     path  "versions.yml", emit: versions
 
     when:
@@ -29,11 +31,11 @@ process ALEVINQC {
     #!/usr/bin/env Rscript
     require(alevinQC)
     alevinFryQCReport(
-        mapDir = "${alevin_results}/af_map",
-        quantDir = "${alevin_results}/af_quant",
-        permitDir= "${alevin_results}/af_quant",
+        mapDir = "${map_dir}",
+        permitDir= "${permit_dir}",
+        quantDir = "${quant_dir}",
         sampleId = "${prefix}",
-        outputFile = "alevin_report_${meta.id}.html",
+        outputFile = "simpleaf_qc_report_${meta.id}.html",
         outputFormat = "html_document",
         outputDir = "./",
         forceOverwrite = TRUE

diff --git a/...les/local/templates/mtx_to_h5ad_alevin.py → ...s/local/templates/mtx_to_h5ad_simpleaf.py b/...les/local/templates/mtx_to_h5ad_alevin.py → ...s/local/templates/mtx_to_h5ad_simpleaf.py
@@ -11,19 +11,7 @@
 import anndata
 from anndata import AnnData
 import platform
-
-def _mtx_to_adata(
-    input: str,
-    sample: str,
-):
-
-    adata = sc.read_mtx(f"{input}/quants_mat.mtx")
-    adata.obs_names = pd.read_csv(f"{input}/quants_mat_rows.txt", header=None, sep="\\t")[0].values
-    adata.var_names = pd.read_csv(f"{input}/quants_mat_cols.txt", header=None, sep="\\t")[0].values
-    adata.obs["sample"] = sample
-
-    return adata
-
+import json
 
 def format_yaml_like(data: dict, indent: int = 0) -> str:
     """Formats a dictionary to a YAML-like string.
@@ -63,15 +51,26 @@ def input_to_adata(
     print(f"Reading in {input_data}")
 
     # open main data
-    adata = _mtx_to_adata(input_data, sample)
+    simpleaf_h5ad_path = f"{input_data}/alevin/quants.h5ad"
+
+    # the simpleaf quant module exports an h5ad file.
+    adata = sc.read_h5ad(simpleaf_h5ad_path)
+    adata.obs["sample"] = sample
 
     # standard format
     # index are gene IDs and symbols are a column
-    # TODO: how to get gene_symbols for alevin?
-    adata.var['gene_versions'] = adata.var.index
+    if "gene_symbol" in adata.var.columns:
+        adata.var['gene_ids'] = adata.var['gene_symbol']
+    else:
+        adata.var['gene_ids'] = adata.var['gene_id']
+
+    adata.var['gene_versions'] = adata.var['gene_ids']
     adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values
     adata.var_names_make_unique()
 
+    # sort adata row- and column-wise to avoid positional differences
+    adata = adata[adata.obs_names.sort_values(), adata.var_names.sort_values()]
+
     # write results
     adata.write_h5ad(f"{output}")
     print(f"Wrote h5ad file to {output}")
@@ -85,7 +84,7 @@ def input_to_adata(
 
 # input_type comes from NF module
 input_to_adata(
-    input_data="${meta.id}_alevin_results/af_quant/alevin/",
+    input_data="${inputs}",
     output="${meta.id}_${meta.input_type}_matrix.h5ad",
     sample="${meta.id}"
 )

diff --git a/modules/nf-core/simpleaf/index/environment.yml b/modules/nf-core/simpleaf/index/environment.yml