Merge pull request #31 from Mye-InfoBank/solo-batchwise
Improve doublet removal
nictru authored Jan 29, 2024
2 parents 577f678 + 7d0484c commit 027d9d4
Showing 9 changed files with 109 additions and 25 deletions.
bin/celltypist_majority.py (2 additions, 0 deletions)

```diff
@@ -17,6 +17,8 @@
 clustering = pd.read_pickle(args.input_clustering)
 df_celltypist = pd.read_pickle(args.input_celltypist)
 
+df_celltypist = df_celltypist.reindex(clustering.index)
+
 predictions = df_celltypist["celltypist_prediction"]
 
 majority_key = args.clustering_key + "_celltypist_majority"
```
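The added `reindex` aligns the CellTypist predictions to the clustering frame by cell barcode before the majority vote, rather than trusting row order. A minimal sketch of what this guards against, using hypothetical barcodes rather than pipeline data:

```python
import pandas as pd

# Hypothetical barcodes; the real frames are loaded from pickles above.
clustering = pd.DataFrame({"leiden": [0, 1, 0]},
                          index=["AAACCC", "CCCGGG", "GGGTTT"])
df_celltypist = pd.DataFrame(
    {"celltypist_prediction": ["T cell", "B cell", "NK cell"]},
    index=["CCCGGG", "GGGTTT", "AAACCC"],  # same cells, different order
)

# Align by index: each prediction now sits next to the right cell, and a
# cell missing from df_celltypist becomes NaN instead of a shifted label.
df_celltypist = df_celltypist.reindex(clustering.index)
print(df_celltypist["celltypist_prediction"].tolist())
# ['NK cell', 'T cell', 'B cell']
```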
conf/base.config (1 addition, 1 deletion)

```diff
@@ -38,7 +38,7 @@ process {
         time   = { check_max( 40.h   * task.attempt, 'time'    ) }
     }
     withLabel:process_high_memory {
-        memory = { check_max( 300.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 150.GB * task.attempt, 'memory' ) }
     }
     withLabel:error_ignore {
         errorStrategy = 'ignore'
```
main.nf (13 additions, 10 deletions)

```diff
@@ -7,14 +7,14 @@ nextflow.enable.dsl = 2
 
 // Modules
 include { CELLTYPIST } from "./modules/celltypist.nf"
-include { SOLO } from "./modules/solo.nf"
 include { CELL_CYCLE } from "./modules/cell_cycle.nf"
 include { MERGE } from "./modules/merge.nf"
 
 // Workflows
 include { PREPROCESSING } from "./workflows/preprocessing.nf"
 include { COUNTS } from "./workflows/counts.nf"
 include { INTEGRATION } from "./workflows/integration.nf"
+include { DOUBLETS } from "./workflows/doublets.nf"
 include { CLUSTERING } from "./workflows/clustering.nf"
 
 if (params.samplesheet) { ch_samplesheet = file(params.samplesheet) } else { exit 1, 'Samplesheet not specified!' }
@@ -24,6 +24,7 @@ workflow {
 
     ch_preprocessed = PREPROCESSING.out.simple
     ch_hvgs = PREPROCESSING.out.hvgs
+    ch_batches = PREPROCESSING.out.batches
 
     COUNTS(ch_preprocessed, params.normalization_method)
 
@@ -43,31 +44,33 @@ workflow {
         Channel.value(params.benchmark_hvgs)
     )
 
-    SOLO(
+    DOUBLETS(
         ch_hvgs,
-        INTEGRATION.out.scanvi_model
+        INTEGRATION.out.scanvi_model,
+        INTEGRATION.out.integrated,
+        ch_preprocessed,
+        ch_batches.collect()
     )
 
     CLUSTERING(
-        INTEGRATION.out.integrated,
+        DOUBLETS.out.integrations,
         Channel.from(params.leiden_resolutions),
         CELLTYPIST.out,
         Channel.value(params.entropy_initial_smoothness)
     )
 
     ch_obs = CLUSTERING.out.obs.mix(
-        SOLO.out, CELL_CYCLE.out, CELLTYPIST.out
+        CELL_CYCLE.out, CELLTYPIST.out, DOUBLETS.out.solo
    )
 
     ch_obsm = CLUSTERING.out.obsm.mix(
-        INTEGRATION.out.obsm
+        DOUBLETS.out.obsm
     )
 
     MERGE(
-        ch_preprocessed,
+        DOUBLETS.out.raw,
         COUNTS.out,
         ch_obsm.map{ meta, obsm -> obsm}.collect(),
         ch_obs.map{ meta, obs -> obs}.collect()
     )
 }
```
modules/dedoublet_adata.nf (new file, 30 additions)

```diff
@@ -0,0 +1,30 @@
+process DEDOUBLET_ADATA {
+    tag "${meta.id}"
+    container "bigdatainbiomedicine/sc-rpy:1.0"
+
+    label "process_medium"
+
+    input:
+    tuple val(meta), path(adata)
+    tuple val(meta2), path(solo)
+
+    output:
+    tuple val(meta), path("${meta.id}.dedup.h5ad")
+
+    script:
+    """
+    #!/opt/conda/bin/python
+
+    import pandas as pd
+    import anndata as ad
+
+    adata = ad.read_h5ad("${adata}")
+    solo = pd.read_pickle("${solo}")
+
+    # Keep only cells labeled "singlet" in the "doublet_label" column
+    adata = adata[solo["doublet_label"] == "singlet", :]
+
+    # Save the filtered AnnData object
+    adata.write_h5ad("${meta.id}.dedup.h5ad")
+    """
+}
```
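The filtering rule in isolation: a self-contained toy example (hypothetical barcodes and labels, not pipeline files) of subsetting an AnnData by the SOLO label column. It assumes `solo` is indexed by the same barcodes as `adata.obs_names`, which the reindex added in modules/solo.nf further down guarantees:

```python
import anndata as ad
import numpy as np
import pandas as pd

# Toy stand-ins for the pipeline's inputs: 4 cells, 2 genes.
adata = ad.AnnData(
    X=np.zeros((4, 2), dtype=np.float32),
    obs=pd.DataFrame(index=["c1", "c2", "c3", "c4"]),
)
solo = pd.DataFrame(
    {"doublet_label": ["singlet", "doublet", "singlet", "doublet"]},
    index=adata.obs_names,
)

# Boolean mask over observations: True keeps the cell, False drops it.
adata = adata[solo["doublet_label"] == "singlet", :]
print(adata.obs_names.tolist())  # ['c1', 'c3']
```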
modules/merge.nf (4 additions, 1 deletion)

```diff
@@ -28,6 +28,8 @@ process MERGE {
     adata = ad.read_h5ad("${original_adata}")
     counts_adata = ad.read_h5ad("$counts")
     counts_adata = counts_adata[adata.obs_names, :]
 
+    for layer in counts_adata.layers.keys():
+        adata.layers[layer] = csc_matrix(counts_adata.layers[layer]).astype(np.float32)
 
     adata.X = csc_matrix(counts_adata.X).astype(np.float32)
@@ -44,8 +46,9 @@ process MERGE {
     for obs_path in obs_paths:
         df = pd.read_pickle(obs_path)
+        df = df.reindex(adata.obs_names)
         adata.obs = pd.concat([adata.obs, df], axis=1)
 
     for col in adata.obs.columns:
         if adata.obs[col].dtype == np.float64:
             adata.obs[col] = adata.obs[col].astype(np.float32)
 
```
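Two patterns are added here: count layers are copied across as sparse float32, and each obs frame is reindexed before concatenation so annotations attach to the right cells. A compact sketch with toy objects (not the pipeline's real inputs):

```python
import anndata as ad
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

# Toy merged object and counts object carrying one extra layer.
adata = ad.AnnData(X=np.ones((2, 3), dtype=np.float64))
counts_adata = ad.AnnData(X=np.arange(6, dtype=np.float64).reshape(2, 3))
counts_adata.layers["spliced"] = counts_adata.X.copy()

# Copy every counts layer across, stored sparse in float32 to save memory.
for layer in counts_adata.layers.keys():
    adata.layers[layer] = csc_matrix(counts_adata.layers[layer]).astype(np.float32)
adata.X = csc_matrix(counts_adata.X).astype(np.float32)

# Attach a per-cell annotation frame: reindex first so rows line up by
# barcode, then downcast float64 columns, mirroring the merge step.
df = pd.DataFrame({"score": [0.5, 0.25]}, index=adata.obs_names[::-1])
df = df.reindex(adata.obs_names)
adata.obs = pd.concat([adata.obs, df], axis=1)
for col in adata.obs.columns:
    if adata.obs[col].dtype == np.float64:
        adata.obs[col] = adata.obs[col].astype(np.float32)
```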
modules/solo.nf (20 additions, 9 deletions)

```diff
@@ -1,16 +1,14 @@
-#!/usr/bin/env nextflow
-nextflow.enable.dsl = 2
-
-
 process SOLO {
     tag "${meta.id}"
     container "bigdatainbiomedicine/sc-scib:1.0"
 
-    label "process_medium"
+    label "process_high_memory"
 
     input:
     tuple val(meta), path(adata)
    tuple val(meta2), path(scvi_model)
+    val batches
 
     output:
     tuple val(meta), path("${meta.id}.solo.pkl")
@@ -21,6 +19,7 @@ process SOLO {
     import scvi
     import scanpy as sc
     import pandas as pd
+    from threadpoolctl import threadpool_limits
 
     threadpool_limits(${task.cpus})
@@ -30,11 +29,23 @@ process SOLO {
     scvi.model.SCANVI.setup_anndata(adata_hvg, batch_key="batch", labels_key="cell_type", unlabeled_category="Unknown")
     scvi_model = scvi.model.SCANVI.load("${scvi_model}", adata=adata_hvg)
 
-    solo = scvi.external.SOLO.from_scvi_model(scvi_model)
-    solo.train()
-    res = solo.predict()
-    res["doublet_label"] = solo.predict(False)
-    res.to_pickle("${meta.id}.solo.pkl")
+    results = []
+    batches = "${batches.join(" ")}".split(" ")
+
+    for batch in batches:
+        solo = scvi.external.SOLO.from_scvi_model(scvi_model, restrict_to_batch=batch)
+        solo.train()
+        batch_res = solo.predict()
+        batch_res["doublet_label"] = solo.predict(False)
+        results.append(batch_res)
+
+    solo_res = pd.concat(results)
+
+    # Reorder the cells to match the original adata
+    solo_res = solo_res.reindex(adata.obs_names)
+    solo_res.to_pickle("${meta.id}.solo.pkl")
     """
 }
```
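This is the heart of the PR: SOLO is now trained once per batch via `restrict_to_batch` instead of once over all batches together, with soft doublet probabilities from `solo.predict()` and hard labels from `solo.predict(False)` collected per batch. Because `pd.concat` stacks the frames in batch order rather than the original cell order, the final `reindex` is what puts rows back in step with the AnnData. A toy illustration with hypothetical barcodes:

```python
import pandas as pd

# Per-batch prediction frames, as produced inside the loop above.
res_batch1 = pd.DataFrame({"doublet": [0.9]}, index=["CCC"])
res_batch2 = pd.DataFrame({"doublet": [0.1, 0.2]}, index=["AAA", "GGG"])

# Concatenation stacks frames in batch order, scrambling cell order.
solo_res = pd.concat([res_batch1, res_batch2])
print(solo_res.index.tolist())  # ['CCC', 'AAA', 'GGG']

# Reindexing against the source AnnData's obs_names restores it.
obs_names = ["AAA", "CCC", "GGG"]  # hypothetical adata.obs_names
solo_res = solo_res.reindex(obs_names)
print(solo_res.index.tolist())  # ['AAA', 'CCC', 'GGG']
```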
workflows/doublets.nf (new file, 39 additions)

```diff
@@ -0,0 +1,39 @@
+include { SOLO } from "../modules/solo.nf"
+include { DEDOUBLET_ADATA as DEDOUBLET_INTEGRATIONS } from "../modules/dedoublet_adata.nf"
+include { DEDOUBLET_ADATA as DEDOUBLET_RAW } from "../modules/dedoublet_adata.nf"
+include { EXTRACT_EMBEDDING } from "../modules/extract_embedding.nf"
+
+
+workflow DOUBLETS {
+    take:
+    ch_hvgs
+    ch_scanvi_model
+    ch_integrations
+    ch_raw
+    ch_batches
+
+    main:
+    SOLO(
+        ch_hvgs,
+        ch_scanvi_model,
+        ch_batches
+    )
+
+    DEDOUBLET_INTEGRATIONS(
+        ch_integrations,
+        SOLO.out
+    )
+
+    EXTRACT_EMBEDDING(DEDOUBLET_INTEGRATIONS.out)
+
+    DEDOUBLET_RAW(
+        ch_raw,
+        SOLO.out
+    )
+
+    emit:
+    solo = SOLO.out
+    integrations = DEDOUBLET_INTEGRATIONS.out
+    raw = DEDOUBLET_RAW.out
+    obsm = EXTRACT_EMBEDDING.out
+}
```
workflows/integration.nf (0 additions, 4 deletions)

```diff
@@ -2,7 +2,6 @@ include { INTEGRATE } from "../modules/integrate.nf"
 include { INTEGRATE as INTEGRATE_GPU } from "../modules/integrate.nf"
 include { INTEGRATE as INTEGRATE_SCVI } from "../modules/integrate.nf"
 include { INTEGRATE_SCANVI } from "../modules/integrate_scanvi.nf"
-include { EXTRACT_EMBEDDING } from "../modules/extract_embedding.nf"
 include { BENCHMARKING } from "./benchmarking.nf"
@@ -71,8 +70,6 @@ workflow INTEGRATION {
     ch_integrated_types = ch_integrated
         .map{ meta, adata -> [meta, adata, integration_types[meta.integration]] }
 
-    EXTRACT_EMBEDDING(ch_integrated)
-
     BENCHMARKING(
         ch_preprocessed,
         ch_integrated_types,
@@ -82,5 +79,4 @@ workflow INTEGRATION {
     emit:
     integrated = ch_integrated
     scanvi_model = INTEGRATE_SCANVI.out.model
-    obsm = EXTRACT_EMBEDDING.out
 }
```
www/SIMBA.png (binary file modified)