Merge pull request #31 from Mye-InfoBank/solo-batchwise
Improve doublet removal
nictru authored Jan 29, 2024
2 parents 577f678 + 7d0484c commit 027d9d4
Showing 9 changed files with 109 additions and 25 deletions.
bin/celltypist_majority.py (2 additions, 0 deletions)

```diff
@@ -17,6 +17,8 @@
 clustering = pd.read_pickle(args.input_clustering)
 df_celltypist = pd.read_pickle(args.input_celltypist)
 
+df_celltypist = df_celltypist.reindex(clustering.index)
+
 predictions = df_celltypist["celltypist_prediction"]
 
 majority_key = args.clustering_key + "_celltypist_majority"
```
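The added `reindex` aligns the CellTypist predictions to the clustering frame by cell barcode before the majority vote, rather than trusting row order. A minimal sketch of what this guards against, using hypothetical barcodes rather than pipeline data:

```python
import pandas as pd

# Hypothetical barcodes; the real frames are loaded from pickles above.
clustering = pd.DataFrame({"leiden": [0, 1, 0]},
                          index=["AAACCC", "CCCGGG", "GGGTTT"])
df_celltypist = pd.DataFrame(
    {"celltypist_prediction": ["T cell", "B cell", "NK cell"]},
    index=["CCCGGG", "GGGTTT", "AAACCC"],  # same cells, different order
)

# Align by index: each prediction now sits next to the right cell, and a
# cell missing from df_celltypist becomes NaN instead of a shifted label.
df_celltypist = df_celltypist.reindex(clustering.index)
print(df_celltypist["celltypist_prediction"].tolist())
# ['NK cell', 'T cell', 'B cell']
```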
conf/base.config (1 addition, 1 deletion)

```diff
@@ -38,7 +38,7 @@ process {
         time   = { check_max( 40.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_high_memory {
-        memory = { check_max( 300.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 150.GB * task.attempt, 'memory' ) }
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
```
main.nf (13 additions, 10 deletions)

```diff
@@ -7,14 +7,14 @@ nextflow.enable.dsl = 2
 
 // Modules
 include { CELLTYPIST } from "./modules/celltypist.nf"
-include { SOLO } from "./modules/solo.nf"
 include { CELL_CYCLE } from "./modules/cell_cycle.nf"
 include { MERGE } from "./modules/merge.nf"
 
 // Workflows
 include { PREPROCESSING } from "./workflows/preprocessing.nf"
 include { COUNTS } from "./workflows/counts.nf"
 include { INTEGRATION } from "./workflows/integration.nf"
+include { DOUBLETS } from "./workflows/doublets.nf"
 include { CLUSTERING } from "./workflows/clustering.nf"
 
 if (params.samplesheet) { ch_samplesheet = file(params.samplesheet) } else { exit 1, 'Samplesheet not specified!' }
@@ -24,6 +24,7 @@ workflow {
 
     ch_preprocessed = PREPROCESSING.out.simple
     ch_hvgs = PREPROCESSING.out.hvgs
+    ch_batches = PREPROCESSING.out.batches
 
     COUNTS(ch_preprocessed, params.normalization_method)
 
@@ -43,31 +44,33 @@ workflow {
         Channel.value(params.benchmark_hvgs)
     )
 
-    SOLO(
+    DOUBLETS(
         ch_hvgs,
-        INTEGRATION.out.scanvi_model
+        INTEGRATION.out.scanvi_model,
+        INTEGRATION.out.integrated,
+        ch_preprocessed,
+        ch_batches.collect()
     )
 
     CLUSTERING(
-        INTEGRATION.out.integrated,
+        DOUBLETS.out.integrations,
         Channel.from(params.leiden_resolutions),
         CELLTYPIST.out,
         Channel.value(params.entropy_initial_smoothness)
     )
 
     ch_obs = CLUSTERING.out.obs.mix(
-        SOLO.out, CELL_CYCLE.out, CELLTYPIST.out
+        CELL_CYCLE.out, CELLTYPIST.out, DOUBLETS.out.solo
    )
 
     ch_obsm = CLUSTERING.out.obsm.mix(
-        INTEGRATION.out.obsm
+        DOUBLETS.out.obsm
     )
 
     MERGE(
-        ch_preprocessed,
+        DOUBLETS.out.raw,
         COUNTS.out,
         ch_obsm.map{ meta, obsm -> obsm}.collect(),
         ch_obs.map{ meta, obs -> obs}.collect()
     )
 }
```
modules/dedoublet_adata.nf (new file, 30 additions)

```diff
@@ -0,0 +1,30 @@
+process DEDOUBLET_ADATA {
+    tag "${meta.id}"
+    container "bigdatainbiomedicine/sc-rpy:1.0"
+
+    label "process_medium"
+
+    input:
+    tuple val(meta), path(adata)
+    tuple val(meta2), path(solo)
+
+    output:
+    tuple val(meta), path("${meta.id}.dedup.h5ad")
+
+    script:
+    """
+    #!/opt/conda/bin/python
+
+    import pandas as pd
+    import anndata as ad
+
+    adata = ad.read_h5ad("${adata}")
+    solo = pd.read_pickle("${solo}")
+
+    # Keep only cells labeled "singlet" in the "doublet_label" column
+    adata = adata[solo["doublet_label"] == "singlet", :]
+
+    # Save the filtered AnnData object
+    adata.write_h5ad("${meta.id}.dedup.h5ad")
+    """
+}
```
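The filtering rule in isolation: a self-contained toy example (hypothetical barcodes and labels, not pipeline files) of subsetting an AnnData by the SOLO label column. It assumes `solo` is indexed by the same barcodes as `adata.obs_names`, which the reindex added in modules/solo.nf further down guarantees:

```python
import anndata as ad
import numpy as np
import pandas as pd

# Toy stand-ins for the pipeline's inputs: 4 cells, 2 genes.
adata = ad.AnnData(
    X=np.zeros((4, 2), dtype=np.float32),
    obs=pd.DataFrame(index=["c1", "c2", "c3", "c4"]),
)
solo = pd.DataFrame(
    {"doublet_label": ["singlet", "doublet", "singlet", "doublet"]},
    index=adata.obs_names,
)

# Boolean mask over observations: True keeps the cell, False drops it.
adata = adata[solo["doublet_label"] == "singlet", :]
print(adata.obs_names.tolist())  # ['c1', 'c3']
```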
modules/merge.nf (4 additions, 1 deletion)

```diff
@@ -28,6 +28,8 @@ process MERGE {
     adata = ad.read_h5ad("${original_adata}")
     counts_adata = ad.read_h5ad("$counts")
     counts_adata = counts_adata[adata.obs_names, :]
 
+    for layer in counts_adata.layers.keys():
+        adata.layers[layer] = csc_matrix(counts_adata.layers[layer]).astype(np.float32)
 
     adata.X = csc_matrix(counts_adata.X).astype(np.float32)
@@ -44,8 +46,9 @@ process MERGE {
     for obs_path in obs_paths:
         df = pd.read_pickle(obs_path)
+        df = df.reindex(adata.obs_names)
         adata.obs = pd.concat([adata.obs, df], axis=1)
 
     for col in adata.obs.columns:
         if adata.obs[col].dtype == np.float64:
             adata.obs[col] = adata.obs[col].astype(np.float32)
 
```
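Two patterns are added here: count layers are copied across as sparse float32, and each obs frame is reindexed before concatenation so annotations attach to the right cells. A compact sketch with toy objects (not the pipeline's real inputs):

```python
import anndata as ad
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

# Toy merged object and counts object carrying one extra layer.
adata = ad.AnnData(X=np.ones((2, 3), dtype=np.float64))
counts_adata = ad.AnnData(X=np.arange(6, dtype=np.float64).reshape(2, 3))
counts_adata.layers["spliced"] = counts_adata.X.copy()

# Copy every counts layer across, stored sparse in float32 to save memory.
for layer in counts_adata.layers.keys():
    adata.layers[layer] = csc_matrix(counts_adata.layers[layer]).astype(np.float32)
adata.X = csc_matrix(counts_adata.X).astype(np.float32)

# Attach a per-cell annotation frame: reindex first so rows line up by
# barcode, then downcast float64 columns, mirroring the merge step.
df = pd.DataFrame({"score": [0.5, 0.25]}, index=adata.obs_names[::-1])
df = df.reindex(adata.obs_names)
adata.obs = pd.concat([adata.obs, df], axis=1)
for col in adata.obs.columns:
    if adata.obs[col].dtype == np.float64:
        adata.obs[col] = adata.obs[col].astype(np.float32)
```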
modules/solo.nf (20 additions, 9 deletions)

```diff
@@ -1,16 +1,14 @@
-#!/usr/bin/env nextflow
-nextflow.enable.dsl = 2
-
-
 process SOLO {
     tag "${meta.id}"
     container "bigdatainbiomedicine/sc-scib:1.0"
 
-    label "process_medium"
+    label "process_high_memory"
 
     input:
     tuple val(meta), path(adata)
    tuple val(meta2), path(scvi_model)
+    val batches
 
     output:
     tuple val(meta), path("${meta.id}.solo.pkl")
@@ -21,6 +19,7 @@ process SOLO {
     import scvi
     import scanpy as sc
     import pandas as pd
+    from threadpoolctl import threadpool_limits
 
     threadpool_limits(${task.cpus})
@@ -30,11 +29,23 @@ process SOLO {
     scvi.model.SCANVI.setup_anndata(adata_hvg, batch_key="batch", labels_key="cell_type", unlabeled_category="Unknown")
     scvi_model = scvi.model.SCANVI.load("${scvi_model}", adata=adata_hvg)
 
-    solo = scvi.external.SOLO.from_scvi_model(scvi_model)
-    solo.train()
-    res = solo.predict()
-    res["doublet_label"] = solo.predict(False)
-    res.to_pickle("${meta.id}.solo.pkl")
+    results = []
+    batches = "${batches.join(" ")}".split(" ")
+
+    for batch in batches:
+        solo = scvi.external.SOLO.from_scvi_model(scvi_model, restrict_to_batch=batch)
+        solo.train()
+        batch_res = solo.predict()
+        batch_res["doublet_label"] = solo.predict(False)
+        results.append(batch_res)
+
+    solo_res = pd.concat(results)
+
+    # Reorder the cells to match the original adata
+    solo_res = solo_res.reindex(adata.obs_names)
+    solo_res.to_pickle("${meta.id}.solo.pkl")
     """
 }
```
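This is the heart of the PR: SOLO is now trained once per batch via `restrict_to_batch` instead of once over all batches together, with soft doublet probabilities from `solo.predict()` and hard labels from `solo.predict(False)` collected per batch. Because `pd.concat` stacks the frames in batch order rather than the original cell order, the final `reindex` is what puts rows back in step with the AnnData. A toy illustration with hypothetical barcodes:

```python
import pandas as pd

# Per-batch prediction frames, as produced inside the loop above.
res_batch1 = pd.DataFrame({"doublet": [0.9]}, index=["CCC"])
res_batch2 = pd.DataFrame({"doublet": [0.1, 0.2]}, index=["AAA", "GGG"])

# Concatenation stacks frames in batch order, scrambling cell order.
solo_res = pd.concat([res_batch1, res_batch2])
print(solo_res.index.tolist())  # ['CCC', 'AAA', 'GGG']

# Reindexing against the source AnnData's obs_names restores it.
obs_names = ["AAA", "CCC", "GGG"]  # hypothetical adata.obs_names
solo_res = solo_res.reindex(obs_names)
print(solo_res.index.tolist())  # ['AAA', 'CCC', 'GGG']
```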
workflows/doublets.nf (new file, 39 additions)

```diff
@@ -0,0 +1,39 @@
+include { SOLO } from "../modules/solo.nf"
+include { DEDOUBLET_ADATA as DEDOUBLET_INTEGRATIONS } from "../modules/dedoublet_adata.nf"
+include { DEDOUBLET_ADATA as DEDOUBLET_RAW } from "../modules/dedoublet_adata.nf"
+include { EXTRACT_EMBEDDING } from "../modules/extract_embedding.nf"
+
+
+workflow DOUBLETS {
+    take:
+    ch_hvgs
+    ch_scanvi_model
+    ch_integrations
+    ch_raw
+    ch_batches
+
+    main:
+    SOLO(
+        ch_hvgs,
+        ch_scanvi_model,
+        ch_batches
+    )
+
+    DEDOUBLET_INTEGRATIONS(
+        ch_integrations,
+        SOLO.out
+    )
+
+    EXTRACT_EMBEDDING(DEDOUBLET_INTEGRATIONS.out)
+
+    DEDOUBLET_RAW(
+        ch_raw,
+        SOLO.out
+    )
+
+    emit:
+    solo = SOLO.out
+    integrations = DEDOUBLET_INTEGRATIONS.out
+    raw = DEDOUBLET_RAW.out
+    obsm = EXTRACT_EMBEDDING.out
+}
```
workflows/integration.nf (0 additions, 4 deletions)

```diff
@@ -2,7 +2,6 @@ include { INTEGRATE } from "../modules/integrate.nf"
 include { INTEGRATE as INTEGRATE_GPU } from "../modules/integrate.nf"
 include { INTEGRATE as INTEGRATE_SCVI } from "../modules/integrate.nf"
 include { INTEGRATE_SCANVI } from "../modules/integrate_scanvi.nf"
-include { EXTRACT_EMBEDDING } from "../modules/extract_embedding.nf"
 include { BENCHMARKING } from "./benchmarking.nf"
@@ -71,8 +70,6 @@ workflow INTEGRATION {
     ch_integrated_types = ch_integrated
         .map{ meta, adata -> [meta, adata, integration_types[meta.integration]] }
 
-    EXTRACT_EMBEDDING(ch_integrated)
-
     BENCHMARKING(
         ch_preprocessed,
         ch_integrated_types,
@@ -82,5 +79,4 @@ workflow INTEGRATION {
     emit:
     integrated = ch_integrated
     scanvi_model = INTEGRATE_SCANVI.out.model
-    obsm = EXTRACT_EMBEDDING.out
 }
```
www/SIMBA.png (binary file modified)