add docs

ToryDeng · Dec 22, 2023 · 0c8c000 · 0c8c000
1 parent 6cc3f1b
commit 0c8c000
Show file tree

Hide file tree

Showing 59 changed files with 3,601 additions and 231 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,163 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
 
+# C extensions
+*.so
 
-/scGeneClust/.idea/
-/data/.ipynb_checkpoints/
-/test.py
-/LEGEND/.idea/
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+# dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other info into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+
+data/*
diff --git a/LEGEND/__init__.py b/LEGEND/__init__.py
@@ -4,4 +4,4 @@
 # @File : __init__.py
 # @Software: PyCharm
 from ._model import GeneClust, integrate
-from ._utils import load_PBMC3k, load_simulated_data, load_mouse_brain
+from ._utils import load_PBMC3k, load_simulated_data, load_mouse_brain, load_mouse_cortex
diff --git a/LEGEND/_model.py b/LEGEND/_model.py
@@ -4,7 +4,7 @@
 # @File : _model.py
 # @Software: PyCharm
 import os
-from typing import Literal, Optional, Union, Tuple
+from typing import Literal, Optional, Tuple, Union
 
 import anndata as ad
 import numpy as np
@@ -13,28 +13,29 @@
 
 import LEGEND.pp as pp
 import LEGEND.tl as tl
+
 from ._utils import set_logger
-from ._validation import check_args, check_all_genes_selected
+from ._validation import check_all_genes_selected, check_args
 
 
 def GeneClust(
-        adata: ad.AnnData,
-        image: np.ndarray = None,
-        n_var_clusters: int = None,
-        n_obs_clusters: int = None,
-        n_components: int = 10,
-        relevant_gene_pct: int = 20,
-        post_hoc_filtering: bool = True,
-        version: Literal['fast', 'ps'] = 'fast',
-        modality: Literal['sc', 'st'] = 'sc',
-        shape: Literal['hexagon', 'square'] = 'hexagon',
-        alpha: float = 0.3,
-        return_info: bool = False,
-        subset: bool = False,
-        max_workers: int = os.cpu_count() - 1,
-        log_path: Optional[Union[os.PathLike, str]] = None,
-        verbosity: Literal[0, 1, 2] = 1,
-        random_state: int = 0
+    adata: ad.AnnData,
+    image: np.ndarray = None,
+    n_var_clusters: int = None,
+    n_obs_clusters: int = None,
+    n_components: int = 10,
+    relevant_gene_pct: int = 20,
+    post_hoc_filtering: bool = True,
+    version: Literal["fast", "ps"] = "fast",
+    modality: Literal["sc", "st"] = "sc",
+    shape: Literal["hexagon", "square"] = "hexagon",
+    alpha: float = 0.3,
+    return_info: bool = False,
+    subset: bool = False,
+    max_workers: int = os.cpu_count() - 1,
+    log_path: Optional[Union[os.PathLike, str]] = None,
+    verbosity: Literal[0, 1, 2] = 1,
+    random_state: int = 0,
 ) -> Optional[Union[Tuple[ad.AnnData, np.ndarray], np.ndarray]]:
     """
     This function is the common interface for *GeneClust-fast* and *GeneClust-ps*.
@@ -51,7 +52,7 @@ def GeneClust(
     n_var_clusters : int
         The number of clusters in gene clustering. Only valid in GeneClust-fast.
     n_obs_clusters : int
-        The number of clusters in cell clustering used to find high-confidence cells. Only valid in GeneClust-ps.
+        The number of clusters in cell/spot clustering used to find high-confidence cells/spots. Only valid in GeneClust-ps.
     n_components : int, default=10
         The number of principal components used along with the first component. Only valid in GeneClust-ps.
     relevant_gene_pct: int, default=20
@@ -102,6 +103,7 @@ def GeneClust(
         Genes relevance values are in `copied_adata.var['relevance']`. Irrelevant genes are filtered.
         Gene redundancy values are in `copied_adata.varp['redundancy']`.
         MST of relevant genes is in `copied_adata.uns['MST']`.
+        Gene outlier scores are in `copied_adata.var['outlier_score']`.
         Representative genes are indicated by `copied_adata.var['representative']`.
     selected_genes : ndarray
         Names of selected genes.
@@ -120,8 +122,21 @@ def GeneClust(
 
     # check arguments
     do_norm = check_args(
-        adata, image, version, n_var_clusters, n_obs_clusters, n_components, relevant_gene_pct, post_hoc_filtering,
-        modality, shape, alpha, return_info, subset, max_workers, random_state
+        adata,
+        image,
+        version,
+        n_var_clusters,
+        n_obs_clusters,
+        n_components,
+        relevant_gene_pct,
+        post_hoc_filtering,
+        modality,
+        shape,
+        alpha,
+        return_info,
+        subset,
+        max_workers,
+        random_state,
     )
 
     # feature selection starts
@@ -139,16 +154,30 @@ def GeneClust(
     pp.reduce_dim(copied_adata, version, random_state)
     # gene clustering
     tl.cluster_genes(
-        copied_adata, image, version, modality, shape, alpha, n_var_clusters, n_obs_clusters, n_components,
-        relevant_gene_pct, max_workers, random_state
+        copied_adata,
+        image,
+        version,
+        modality,
+        shape,
+        alpha,
+        n_var_clusters,
+        n_obs_clusters,
+        n_components,
+        relevant_gene_pct,
+        max_workers,
+        random_state,
     )
     # select features from gene clusters
-    selected_genes = tl.select_from_clusters(copied_adata, version, modality, 20, post_hoc_filtering, random_state)
+    selected_genes = tl.select_from_clusters(
+        copied_adata, version, modality, 20, post_hoc_filtering, random_state
+    )
     check_all_genes_selected(copied_adata, selected_genes)
 
     if subset:
         adata._inplace_subset_var(selected_genes)
-        logger.opt(colors=True).info(f"<magenta>GeneClust-{version}</magenta> finished.")
+        logger.opt(colors=True).info(
+            f"<magenta>GeneClust-{version}</magenta> finished."
+        )
         return None
 
     logger.opt(colors=True).info(f"<magenta>GeneClust-{version}</magenta> finished.")
@@ -159,16 +188,16 @@ def GeneClust(
 
 
 def integrate(
-        adata_rna: ad.AnnData,
-        adata_st: ad.AnnData,
-        rna_weight: float = 0.5,
-        rel_pct: int = 20,
-        post_hoc_filtering: bool = True,
-        return_info: bool = False,
-        max_workers: int = os.cpu_count() - 1,
-        log_path: Optional[Union[os.PathLike, str]] = None,
-        verbosity: Literal[0, 1, 2] = 1,
-        random_state: int = 0
+    adata_rna: ad.AnnData,
+    adata_st: ad.AnnData,
+    rna_weight: float = 0.5,
+    rel_pct: int = 20,
+    post_hoc_filtering: bool = True,
+    return_info: bool = False,
+    max_workers: int = os.cpu_count() - 1,
+    log_path: Optional[Union[os.PathLike, str]] = None,
+    verbosity: Literal[0, 1, 2] = 1,
+    random_state: int = 0,
 ):
     """
     Integrate information from multimodal data to identify co-expressed genes.
@@ -220,23 +249,39 @@ def integrate(
     pseudo_adata = ad.AnnData(np.zeros((1, common_genes.shape[0])), dtype=float)
     pseudo_adata.var_names = common_genes
 
-    comb_redundancy = rna_weight * adata_rna.varp['redundancy'] + (1 - rna_weight) * adata_st.varp['redundancy']
-    comb_relevance = rna_weight * adata_rna.var['relevance'] + (1 - rna_weight) * adata_st.var['relevance']
+    comb_redundancy = (
+        rna_weight * adata_rna.varp["redundancy"]
+        + (1 - rna_weight) * adata_st.varp["redundancy"]
+    )
+    comb_relevance = (
+        rna_weight * adata_rna.var["relevance"]
+        + (1 - rna_weight) * adata_st.var["relevance"]
+    )
     comb_MST = tl.information.build_MST(-comb_redundancy)
-    adata_st.uns['MST'], adata_rna.uns['MST'] = comb_MST, comb_MST
-    logger.opt(colors=True).info(f"Start to compute complementarity on <magenta>SRT</magenta> data...")
-    st_complm = tl.information.compute_gene_complementarity(adata_st, max_workers, random_state)
-    logger.opt(colors=True).info(f"Start to compute complementarity on <magenta>scRNA-seq</magenta> data...")
-    rna_complm = tl.information.compute_gene_complementarity(adata_rna, max_workers, random_state)
-    comb_MST.es['complm'] = rna_weight * st_complm + (1 - rna_weight) * rna_complm
-
-    pseudo_adata.uns['MST'] = comb_MST
-    pseudo_adata.var['relevance'] = comb_relevance
-    pseudo_adata.var['relevance_rna'] = adata_rna.var['relevance']
-    pseudo_adata.var['relevance_st'] = adata_st.var['relevance']
+    adata_st.uns["MST"], adata_rna.uns["MST"] = comb_MST, comb_MST
+    logger.opt(colors=True).info(
+        f"Start to compute complementarity on <magenta>SRT</magenta> data..."
+    )
+    st_complm = tl.information.compute_gene_complementarity(
+        adata_st, max_workers, random_state
+    )
+    logger.opt(colors=True).info(
+        f"Start to compute complementarity on <magenta>scRNA-seq</magenta> data..."
+    )
+    rna_complm = tl.information.compute_gene_complementarity(
+        adata_rna, max_workers, random_state
+    )
+    comb_MST.es["complm"] = rna_weight * st_complm + (1 - rna_weight) * rna_complm
+
+    pseudo_adata.uns["MST"] = comb_MST
+    pseudo_adata.var["relevance"] = comb_relevance
+    pseudo_adata.var["relevance_rna"] = adata_rna.var["relevance"]
+    pseudo_adata.var["relevance_st"] = adata_st.var["relevance"]
 
     tl.cluster.generate_gene_clusters(pseudo_adata)
-    selected_genes = tl.select_from_clusters(pseudo_adata, 'ps', 'st', rel_pct, post_hoc_filtering, random_state)
+    selected_genes = tl.select_from_clusters(
+        pseudo_adata, "ps", "st", rel_pct, post_hoc_filtering, random_state
+    )
     check_all_genes_selected(pseudo_adata, selected_genes)
 
     if return_info: