Remove st_ prefix from names/variables/etc.

nf-core · Mar 19, 2024 · 8238a32 · 8238a32
1 parent 9e907df
commit 8238a32
Show file tree

Hide file tree

Showing 22 changed files with 300 additions and 299 deletions.
diff --git a/bin/st_clustering.qmd → bin/clustering.qmd b/bin/st_clustering.qmd → bin/clustering.qmd
@@ -11,12 +11,12 @@ jupyter: python3
 ```{python}
 #| tags: [parameters]
 #| echo: false
-input_sdata = "st_sdata_filtered.zarr"  # Input: SpatialData file
+input_sdata = "sdata_filtered.zarr"  # Input: SpatialData file
 cluster_resolution = 1  # Resolution for Leiden clustering
 n_hvgs = 2000  # Number of HVGs to use for analyses
 artifact_dir = "artifacts" # Output directory
-output_adata = "st_adata_processed.h5ad"  # Output: AnnData file
-output_sdata = "st_sdata_processed.zarr"  # Output: SpatialData file
+output_adata = "adata_processed.h5ad"  # Output: AnnData file
+output_sdata = "sdata_processed.zarr"  # Output: SpatialData file
 ```
 
 The data has already been filtered in the _quality controls_ reports and is
@@ -62,11 +62,11 @@ def to_legacy_anndata(sdata: spatialdata.SpatialData) -> AnnData:
 ```
 
 ```{python}
-st_sdata = spatialdata.read_zarr(input_sdata, ["images", "table", "shapes"])
-st_adata = to_legacy_anndata(st_sdata)
+sdata = spatialdata.read_zarr(input_sdata, ["images", "table", "shapes"])
+adata = to_legacy_anndata(sdata)
 
 print("Content of the AnnData object:")
-print(st_adata)
+print(adata)
 ```
 
 # Normalization
@@ -76,8 +76,8 @@ use the built-in `normalize_total` method from [Scanpy](https://scanpy.readthedo
 followed by a log-transformation.
 
 ```{python}
-sc.pp.normalize_total(st_adata, inplace=True)
-sc.pp.log1p(st_adata)
+sc.pp.normalize_total(adata, inplace=True)
+sc.pp.log1p(adata)
 ```
 
 # Feature selection
@@ -90,13 +90,13 @@ regards to yielding a good separation of clusters.
 ```{python}
 # layout-nrow: 1
 # Find top HVGs and print results
-sc.pp.highly_variable_genes(st_adata, flavor="seurat", n_top_genes=n_hvgs)
-var_genes_all = st_adata.var.highly_variable
+sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=n_hvgs)
+var_genes_all = adata.var.highly_variable
 print("Extracted highly variable genes: %d"%sum(var_genes_all))
 
 # Plot the HVGs
 plt.rcParams["figure.figsize"] = (4.5, 4.5)
-sc.pl.highly_variable_genes(st_adata)
+sc.pl.highly_variable_genes(adata)
 ```
 
 # Clustering
@@ -108,10 +108,10 @@ Manifold Approximation and Projection) is used for visualization. The Leiden
 algorithm is employed for clustering with a given resolution.
 
 ```{python}
-sc.pp.pca(st_adata)
-sc.pp.neighbors(st_adata)
-sc.tl.umap(st_adata)
-sc.tl.leiden(st_adata, key_added="clusters", resolution=cluster_resolution)
+sc.pp.pca(adata)
+sc.pp.neighbors(adata)
+sc.tl.umap(adata)
+sc.tl.leiden(adata, key_added="clusters", resolution=cluster_resolution)
 Markdown(f"Resolution for Leiden clustering: `{cluster_resolution}`")
 ```
 
@@ -122,7 +122,7 @@ We then generate UMAP plots to visualize the distribution of clusters:
 ```{python}
 #| warning: false
 plt.rcParams["figure.figsize"] = (7, 7)
-sc.pl.umap(st_adata, color="clusters")
+sc.pl.umap(adata, color="clusters")
 ```
 
 ## Counts and genes
@@ -133,7 +133,7 @@ the UMAP:
 ```{python}
 # Make plots of UMAP of ST spots clusters
 plt.rcParams["figure.figsize"] = (3.5, 3.5)
-sc.pl.umap(st_adata, color=["total_counts", "n_genes_by_counts"])
+sc.pl.umap(adata, color=["total_counts", "n_genes_by_counts"])
 ```
 
 ## Individual clusters
@@ -142,8 +142,8 @@ An additional visualisation is to show where the various spots are in each
 individual cluster while ignoring all other cluster:
 
 ```{python}
-sc.tl.embedding_density(st_adata, basis="umap", groupby="clusters")
-sc.pl.embedding_density(st_adata, groupby="clusters", ncols=2)
+sc.tl.embedding_density(adata, basis="umap", groupby="clusters")
+sc.pl.embedding_density(adata, groupby="clusters", ncols=2)
 ```
 
 # Spatial visualisation
@@ -154,8 +154,8 @@ spatial coordinates by overlaying the spots on the tissue image itself.
 ```{python}
 #| layout-nrow: 2
 plt.rcParams["figure.figsize"] = (8, 8)
-sc.pl.spatial(st_adata, img_key="hires", color="total_counts", size=1.25)
-sc.pl.spatial(st_adata, img_key="hires", color="n_genes_by_counts", size=1.25)
+sc.pl.spatial(adata, img_key="hires", color="total_counts", size=1.25)
+sc.pl.spatial(adata, img_key="hires", color="n_genes_by_counts", size=1.25)
 ```
 
 To gain insights into tissue organization and potential inter-cellular
@@ -167,13 +167,13 @@ organization of cells.
 ```{python}
 # TODO: Can the colour bar on this figure be fit to the figure?
 plt.rcParams["figure.figsize"] = (7, 7)
-sc.pl.spatial(st_adata, img_key="hires", color="clusters", size=1.25)
+sc.pl.spatial(adata, img_key="hires", color="clusters", size=1.25)
 ```
 
 ```{python}
 #| echo: false
-del st_sdata.table
-st_sdata.table = st_adata
-st_adata.write(os.path.join(artifact_dir, output_adata))
-st_sdata.write(os.path.join(artifact_dir, output_sdata))
+del sdata.table
+sdata.table = adata
+adata.write(os.path.join(artifact_dir, output_adata))
+sdata.write(os.path.join(artifact_dir, output_sdata))
 ```
diff --git a/bin/st_quality_controls.qmd → bin/quality_controls.qmd b/bin/st_quality_controls.qmd → bin/quality_controls.qmd
@@ -26,16 +26,16 @@ analysis tools and facilitates seamless integration into existing workflows.
 ```{python}
 #| tags: [parameters]
 #| echo: false
-input_sdata = "st_sdata_raw.zarr"  # Input: SpatialData file
+input_sdata = "sdata_raw.zarr"  # Input: SpatialData file
 min_counts = 500  # Min counts per spot
 min_genes = 250  # Min genes per spot
 min_spots = 1  # Min spots per gene
 mito_threshold = 20  # Mitochondrial content threshold (%)
 ribo_threshold = 0  # Ribosomal content threshold (%)
 hb_threshold = 100  # content threshold (%)
 artifact_dir = "artifacts"
-output_adata = "st_adata_filtered.h5ad"  # Output: AnnData file
-output_sdata = "st_sdata_filtered.zarr"  # Output: SpatialData file
+output_adata = "adata_filtered.h5ad"  # Output: AnnData file
+output_sdata = "sdata_filtered.zarr"  # Output: SpatialData file
 ```
 
 ```{python}
@@ -78,18 +78,18 @@ def to_legacy_anndata(sdata: spatialdata.SpatialData) -> AnnData:
 
 ```{python}
 # Read the data
-st_sdata = spatialdata.read_zarr(input_sdata, ["images", "table", "shapes"])
-st_adata = to_legacy_anndata(st_sdata)
+sdata = spatialdata.read_zarr(input_sdata, ["images", "table", "shapes"])
+adata = to_legacy_anndata(sdata)
 
 # Convert X matrix from csr to csc dense matrix for output compatibility:
-st_adata.X = scipy.sparse.csc_matrix(st_adata.X)
+adata.X = scipy.sparse.csc_matrix(adata.X)
 
 # Store the raw data so that it can be used for analyses from scratch if desired
-st_adata.layers['raw'] = st_adata.X.copy()
+adata.layers['raw'] = adata.X.copy()
 
 # Print the anndata object for inspection
 print("Content of the AnnData object:")
-print(st_adata)
+print(adata)
 ```
 
 # Quality controls
@@ -102,14 +102,14 @@ percentage of counts from mitochondrial, ribosomal and haemoglobin genes
 
 ```{python}
 # Calculate mitochondrial, ribosomal and haemoglobin percentages
-st_adata.var['mt'] = st_adata.var_names.str.startswith('MT-')
-st_adata.var['ribo'] = st_adata.var_names.str.contains(("^RP[LS]"))
-st_adata.var['hb'] = st_adata.var_names.str.contains(("^HB[AB]"))
-sc.pp.calculate_qc_metrics(st_adata, qc_vars=["mt", "ribo", "hb"],
+adata.var['mt'] = adata.var_names.str.startswith('MT-')
+adata.var['ribo'] = adata.var_names.str.contains(("^RP[LS]"))
+adata.var['hb'] = adata.var_names.str.contains(("^HB[AB]"))
+sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"],
     inplace=True, log1p=False)
 
 # Save a copy of data as a restore-point if filtering results in 0 spots left
-st_adata_before_filtering = st_adata.copy()
+adata_before_filtering = adata.copy()
 ```
 
 ## Violin plots
@@ -120,9 +120,9 @@ mitochondrial, ribosomal and haemoglobin genes:
 
 ```{python}
 #| layout-nrow: 2
-sc.pl.violin(st_adata, ['n_genes_by_counts', 'total_counts'],
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
     multi_panel=True, jitter=0.4, rotation= 45)
-sc.pl.violin(st_adata, ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'],
+sc.pl.violin(adata, ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'],
     multi_panel=True, jitter=0.4, rotation= 45)
 ```
 
@@ -133,8 +133,8 @@ spatial patterns may be discerned:
 
 ```{python}
 #| layout-nrow: 2
-sc.pl.spatial(st_adata, color = ["total_counts", "n_genes_by_counts"], size=1.25)
-sc.pl.spatial(st_adata, color = ["pct_counts_mt", "pct_counts_ribo", "pct_counts_hb"], size=1.25)
+sc.pl.spatial(adata, color = ["total_counts", "n_genes_by_counts"], size=1.25)
+sc.pl.spatial(adata, color = ["pct_counts_mt", "pct_counts_ribo", "pct_counts_hb"], size=1.25)
 ```
 
 ## Scatter plots
@@ -145,8 +145,8 @@ counts versus the number of genes:
 
 ```{python}
 #| layout-ncol: 2
-sc.pl.scatter(st_adata, x='pct_counts_ribo', y='pct_counts_mt')
-sc.pl.scatter(st_adata, x='total_counts', y='n_genes_by_counts')
+sc.pl.scatter(adata, x='pct_counts_ribo', y='pct_counts_mt')
+sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
 ```
 
 ## Top expressed genes
@@ -155,7 +155,7 @@ It can also be informative to see which genes are the most expressed in the
 dataset; the following figure shows the top 20 most expressed genes.
 
 ```{python}
-sc.pl.highest_expr_genes(st_adata, n_top=20)
+sc.pl.highest_expr_genes(adata, n_top=20)
 ```
 
 # Filtering
@@ -167,16 +167,16 @@ are uninformative and are thus removed.
 
 ```{python}
 # Create a string observation "obs/in_tissue_str" with "In tissue" and "Outside tissue":
-st_adata.obs["in_tissue_str"] = ["In tissue" if x == 1 else "Outside tissue" for x in st_adata.obs["in_tissue"]]
+adata.obs["in_tissue_str"] = ["In tissue" if x == 1 else "Outside tissue" for x in adata.obs["in_tissue"]]
 
 # Plot spots inside tissue
-sc.pl.spatial(st_adata, color=["in_tissue_str"], title="Spots in tissue", size=1.25)
-del st_adata.obs["in_tissue_str"]
+sc.pl.spatial(adata, color=["in_tissue_str"], title="Spots in tissue", size=1.25)
+del adata.obs["in_tissue_str"]
 
 # Remove spots outside tissue and print results
-n_spots = st_adata.shape[0]
-st_adata = st_adata[st_adata.obs["in_tissue"] == 1]
-n_spots_in_tissue = st_adata.shape[0]
+n_spots = adata.shape[0]
+adata = adata[adata.obs["in_tissue"] == 1]
+n_spots_in_tissue = adata.shape[0]
 Markdown(f"""A total of `{n_spots_in_tissue}` spots are situated inside the
 tissue, out of `{n_spots}` spots in total.""")
 ```
@@ -190,18 +190,18 @@ your knowledge of the specific tissue at hand.
 ```{python}
 #| warning: false
 # Filter spots based on counts
-n_spots = st_adata.shape[0]
-n_genes = st_adata.shape[1]
-sc.pp.filter_cells(st_adata, min_counts=min_counts)
-n_spots_filtered_min_counts = st_adata.shape[0]
+n_spots = adata.shape[0]
+n_genes = adata.shape[1]
+sc.pp.filter_cells(adata, min_counts=min_counts)
+n_spots_filtered_min_counts = adata.shape[0]
 
 # Filter spots based on genes
-sc.pp.filter_cells(st_adata, min_genes=min_genes)
-n_spots_filtered_min_genes = st_adata.shape[0]
+sc.pp.filter_cells(adata, min_genes=min_genes)
+n_spots_filtered_min_genes = adata.shape[0]
 
 # Filter genes based on spots
-sc.pp.filter_genes(st_adata, min_cells=min_spots)
-n_genes_filtered_min_spots = st_adata.shape[1]
+sc.pp.filter_genes(adata, min_cells=min_spots)
+n_genes_filtered_min_spots = adata.shape[1]
 
 # Print results
 Markdown(f"""
@@ -220,16 +220,16 @@ ribosomal nor haemoglobin content is filtered by default.
 
 ```{python}
 # Filter spots
-st_adata = st_adata[st_adata.obs["pct_counts_mt"] <= mito_threshold]
-n_spots_filtered_mito = st_adata.shape[0]
-st_adata = st_adata[st_adata.obs["pct_counts_ribo"] >= ribo_threshold]
-n_spots_filtered_ribo = st_adata.shape[0]
-st_adata = st_adata[st_adata.obs["pct_counts_hb"] <= hb_threshold]
-n_spots_filtered_hb = st_adata.shape[0]
+adata = adata[adata.obs["pct_counts_mt"] <= mito_threshold]
+n_spots_filtered_mito = adata.shape[0]
+adata = adata[adata.obs["pct_counts_ribo"] >= ribo_threshold]
+n_spots_filtered_ribo = adata.shape[0]
+adata = adata[adata.obs["pct_counts_hb"] <= hb_threshold]
+n_spots_filtered_hb = adata.shape[0]
 
 # Print results
 Markdown(f"""
-- Removed `{st_adata.shape[0] - n_spots_filtered_mito}` spots with more than `{mito_threshold}%` mitochondrial content.
+- Removed `{adata.shape[0] - n_spots_filtered_mito}` spots with more than `{mito_threshold}%` mitochondrial content.
 - Removed `{n_spots_filtered_mito - n_spots_filtered_ribo}` spots with less than `{ribo_threshold}%` ribosomal content.
 - Removed `{n_spots_filtered_ribo - n_spots_filtered_hb}` spots with more than `{hb_threshold}%` haemoglobin content.
 """)
@@ -238,8 +238,8 @@ Markdown(f"""
 ```{python}
 #| echo: false
 # Restore non-filtered data if filtering results in 0 spots left
-if (st_adata.shape[0] == 0 or st_adata.shape[1] == 0):
-    st_adata = st_adata_before_filtering
+if (adata.shape[0] == 0 or adata.shape[1] == 0):
+    adata = adata_before_filtering
     display(
         Markdown(dedent(
             """
@@ -269,21 +269,21 @@ if (st_adata.shape[0] == 0 or st_adata.shape[1] == 0):
 Markdown(f"""
 The final results of all the filtering is as follows:
 
-- A total of `{st_adata.shape[0]}` spots out of `{n_spots}` remain after filtering.
-- A total of `{st_adata.shape[1]}` genes out of `{n_genes}` remain after filtering.
+- A total of `{adata.shape[0]}` spots out of `{n_spots}` remain after filtering.
+- A total of `{adata.shape[1]}` genes out of `{n_genes}` remain after filtering.
 """)
 ```
 
 ```{python}
 #| layout-nrow: 2
-sc.pl.violin(st_adata, ['n_genes_by_counts', 'total_counts'],
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
     multi_panel=True, jitter=0.4, rotation= 45)
-sc.pl.violin(st_adata, ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'],
+sc.pl.violin(adata, ['pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb'],
     multi_panel=True, jitter=0.4, rotation= 45)
 ```
 
 ```{python}
-del st_sdata.table
-st_sdata.table = st_adata
-st_sdata.write(os.path.join(artifact_dir, output_sdata))
+del sdata.table
+sdata.table = adata
+sdata.write(os.path.join(artifact_dir, output_sdata))
 ```
diff --git a/bin/read_st_data.py → bin/read_data.py b/bin/read_st_data.py → bin/read_data.py
@@ -2,6 +2,7 @@
 
 # Load packages
 import argparse
+
 import spatialdata_io
 
 if __name__ == "__main__":
@@ -28,9 +29,9 @@
     args = parser.parse_args()
 
     # Read Visium data
-    st_spatialdata = spatialdata_io.visium(
+    spatialdata = spatialdata_io.visium(
         args.SRCountDir, counts_file="raw_feature_bc_matrix.h5", dataset_id="visium"
     )
 
     # Write raw spatialdata to file
-    st_spatialdata.write(args.output_sdata, overwrite=True)
+    spatialdata.write(args.output_sdata, overwrite=True)