From 30db1f8a455a7bd17d2c85193e8d3393118fb4bc Mon Sep 17 00:00:00 2001
From: sichao <sichao_y@outlook.com>
Date: Thu, 20 Apr 2023 16:21:48 -0400
Subject: [PATCH 1/5] combine neighbors and k_nearest_neighbors

---
 dynamo/tools/connectivity.py | 53 +++++++++---------------------------
 dynamo/tools/utils.py        | 51 +++++++++++++++++++++++++++++-----
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py
index f0b939d9a..ca38ed43e 100755
--- a/dynamo/tools/connectivity.py
+++ b/dynamo/tools/connectivity.py
@@ -21,7 +21,7 @@
 from ..configuration import DynamoAdataKeyManager
 from ..docrep import DocstringProcessor
 from ..dynamo_logger import LoggerManager, main_info, main_warning
-from .utils import fetch_X_data, log1p_
+from .utils import fetch_X_data, k_nearest_neighbors, log1p_
 
 docstrings = DocstringProcessor()
 
@@ -587,45 +587,18 @@ def neighbors(
             logger.info("fetching X data from layer:%s, basis:%s" % (str(layer), str(basis)))
             genes, X_data = fetch_X_data(adata, genes, layer, basis)
 
-    if method is None:
-        logger.info("method arg is None, choosing methods automatically...")
-        if X_data.shape[0] > 200000 and X_data.shape[1] > 2:
-
-            from pynndescent import NNDescent
-
-            method = "pynn"
-        elif X_data.shape[1] > 10:
-            method = "ball_tree"
-        else:
-            method = "kd_tree"
-        logger.info("method %s selected" % (method), indent_level=2)
-
-    # may distinguish between umap and pynndescent -- treat them equal for now
-    if method.lower() in ["pynn", "umap"]:
-        index = NNDescent(
-            X_data,
-            metric=metric,
-            n_neighbors=n_neighbors,
-            n_jobs=cores,
-            random_state=seed,
-            **kwargs,
-        )
-        knn, distances = index.query(X_data, k=n_neighbors)
-    elif method in ["ball_tree", "kd_tree"]:
-        from sklearn.neighbors import NearestNeighbors
-
-        # print("***debug X_data:", X_data)
-        nbrs = NearestNeighbors(
-            n_neighbors=n_neighbors,
-            metric=metric,
-            metric_params=metric_kwads,
-            algorithm=method,
-            n_jobs=cores,
-            **kwargs,
-        ).fit(X_data)
-        distances, knn = nbrs.kneighbors(X_data)
-    else:
-        raise ImportError(f"nearest neighbor search method {method} is not supported")
+    knn, distances = k_nearest_neighbors(
+        X_data,
+        k=n_neighbors - 1,
+        method=method,
+        metric=metric,
+        metric_kwads=metric_kwads,
+        exclude_self=False,
+        pynn_rand_state=seed,
+        n_jobs=cores,
+        logger=logger,
+        **kwargs,
+    )
 
     conn_key, dist_key, neighbor_key = _gen_neighbor_keys(result_prefix)
     logger.info_insert_adata(conn_key, adata_attr="obsp")
diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py
index c03d5161b..45ba439ab 100755
--- a/dynamo/tools/utils.py
+++ b/dynamo/tools/utils.py
@@ -25,6 +25,7 @@
 from tqdm import tqdm
 
 from ..dynamo_logger import (
+    Logger,
     LoggerManager,
     main_critical,
     main_debug,
@@ -193,6 +194,9 @@ def nearest_neighbors(coord: np.ndarray, coords: Union[np.ndarray, sp.csr_matrix
 def k_nearest_neighbors(
     X: np.ndarray,
     k: int,
+    method: Optional[str] = None,
+    metric: Union[str, Callable] = "euclidean",
+    metric_kwads: Dict[str, Any] = None,
     exclude_self: bool = True,
     knn_dim: int = 10,
     pynn_num: int = int(2e5),
@@ -200,12 +204,21 @@ def k_nearest_neighbors(
     pynn_rand_state: int = 19491001,
     n_jobs: int = -1,
     return_nbrs: bool = False,
+    logger: Logger = None,
+    **kwargs,
 ) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, NearestNeighbors]]:
     """Compute k nearest neighbors for a given space.
 
     Args:
         X: the space to find nearest neighbors on.
         k: the number of neighbors to be found, excluding the point itself.
+        method: the method used for nearest neighbor search. If it is None, will choose algorithm based on the size of
+            the input data.
+        metric: the distance metric to use for the tree. The default metric is euclidean, and with p=2 is equivalent to
+            the standard Euclidean metric. See the documentation of `DistanceMetric` for a list of available metrics. If
+            metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a
+            `sparse graph`, in which case only "nonzero" elements may be considered neighbors. Defaults to "euclidean".
+        metric_kwads: additional keyword arguments for the metric function. Defaults to None.
         exclude_self: whether to exclude the point itself from the result. Defaults to True.
         knn_dim: the lowest threshold of dimensions of data to use `ball_tree` algorithm. If dimensions of the data is
             smaller than this value, `kd_tree` algorithm would be used. Defaults to 10.
@@ -216,6 +229,8 @@ def k_nearest_neighbors(
         pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001.
         n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1.
         return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False.
+        logger: the Logger object to display the log information.
+        kwargs: additional arguments that will be passed to each nearest neighbor search algorithm.
 
     Returns:
         A tuple (nbrs_idx, dists, [nbrs]), where nbrs_idx contains the indices of nearest neighbors found for each
@@ -223,22 +238,44 @@ def k_nearest_neighbors(
         object and it would be returned only if `return_nbrs` is True.
     """
 
-    n, d = np.atleast_2d(X).shape
-    if n > int(pynn_num) and d > pynn_dim:
-        from pynndescent import NNDescent
+    if method is None:
+        if logger is None:
+            logger = LoggerManager.gen_logger("neighbors")
+        logger.info("method arg is None, choosing methods automatically...")
+        if X.shape[0] > pynn_num and X.shape[1] > pynn_dim:
+            method = "pynn"
+        else:
+            if X.shape[1] > knn_dim:
+                method = "ball_tree"
+            else:
+                method = "kd_tree"
+        logger.info("method %s selected" % (method), indent_level=2)
 
+    if method.lower() in ["pynn", "umap"]:
+        from pynndescent import NNDescent
         nbrs = NNDescent(
             X,
-            metric="euclidean",
+            metric=metric,
             n_neighbors=k + 1,
             n_jobs=n_jobs,
             random_state=pynn_rand_state,
+            **kwargs,
         )
         nbrs_idx, dists = nbrs.query(X, k=k + 1)
-    else:
-        alg = "ball_tree" if d > knn_dim else "kd_tree"
-        nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=n_jobs).fit(X)
+    elif method in ["ball_tree", "kd_tree"]:
+        from sklearn.neighbors import NearestNeighbors
+        # print("***debug X_data:", X_data)
+        nbrs = NearestNeighbors(
+            n_neighbors=k + 1,
+            metric=metric,
+            metric_params=metric_kwads,
+            algorithm=method,
+            n_jobs=n_jobs,
+            **kwargs,
+        ).fit(X)
         dists, nbrs_idx = nbrs.kneighbors(X)
+    else:
+        raise ImportError(f"nearest neighbor search method {method} is not supported")
 
     nbrs_idx = np.array(nbrs_idx)
     if exclude_self:

From baea41861b1b6203e7bf7fa7d499d2946ef05066 Mon Sep 17 00:00:00 2001
From: sichao <sichao_y@outlook.com>
Date: Fri, 21 Apr 2023 11:36:24 -0400
Subject: [PATCH 2/5] add hnswlib option

---
 dynamo/tools/connectivity.py   | 141 ++++++++++++++++++++++++++++++++-
 dynamo/tools/graph_calculus.py |   2 +-
 dynamo/tools/utils.py          |  95 ----------------------
 3 files changed, 140 insertions(+), 98 deletions(-)

diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py
index ca38ed43e..d62a00b2b 100755
--- a/dynamo/tools/connectivity.py
+++ b/dynamo/tools/connectivity.py
@@ -15,13 +15,14 @@
 from pynndescent.distances import true_angular
 from scipy.sparse import coo_matrix, csr_matrix, issparse
 from sklearn.decomposition import TruncatedSVD
+from sklearn.neighbors import NearestNeighbors
 from sklearn.utils import sparsefuncs
 from umap import UMAP
 
 from ..configuration import DynamoAdataKeyManager
 from ..docrep import DocstringProcessor
-from ..dynamo_logger import LoggerManager, main_info, main_warning
-from .utils import fetch_X_data, k_nearest_neighbors, log1p_
+from ..dynamo_logger import Logger, LoggerManager, main_info, main_warning
+from .utils import fetch_X_data, log1p_
 
 docstrings = DocstringProcessor()
 
@@ -514,6 +515,142 @@ def _gen_neighbor_keys(result_prefix: str = "") -> Tuple[str, str, str]:
     return conn_key, dist_key, neighbor_key
 
 
+def _correct_hnsw_neighbors(knn_hn: np.ndarray, distances_hn: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """Corrects the indices and corresponding distances obtained from a hnswlib by manually adding self neighbors.
+
+    Args:
+        knn_hn: array containing the k-NN indices obtained from the hnswlib.
+        distances_hn: array containing the distances corresponding to the k-NN indices obtained from the HNSW index.
+
+    Returns:
+        A tuple containing the corrected indices and distances.
+    """
+    mask = knn_hn[:, 0] == np.arange(knn_hn.shape[0])
+    target_indices = np.where(mask)[0]
+    def roll(arr, value=0):
+        arr = np.roll(arr, 1, axis=0)
+        arr[0] = value
+        return arr
+
+    knn_corrected = [knn_hn[i] if i in target_indices else roll(knn_hn[i], i) for i in range(knn_hn.shape[0])]
+    distances_corrected = [
+        distances_hn[i] if i in target_indices else roll(distances_hn[i]) for i in range(distances_hn.shape[0])
+    ]
+    return np.vstack(knn_corrected), np.vstack(distances_corrected)
+
+
+def k_nearest_neighbors(
+    X: np.ndarray,
+    k: int,
+    method: Optional[str] = None,
+    metric: Union[str, Callable] = "euclidean",
+    metric_kwads: Dict[str, Any] = None,
+    exclude_self: bool = True,
+    knn_dim: int = 10,
+    pynn_num: int = 5000,
+    pynn_dim: int = 2,
+    hnswlib_num: int = int(2e5),
+    pynn_rand_state: int = 19491001,
+    n_jobs: int = -1,
+    return_nbrs: bool = False,
+    logger: Logger = None,
+    **kwargs,
+) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, NearestNeighbors]]:
+    """Compute k nearest neighbors for a given space.
+
+    Args:
+        X: the space to find nearest neighbors on.
+        k: the number of neighbors to be found, excluding the point itself.
+        method: the method used for nearest neighbor search. If it is None, will choose algorithm based on the size of
+            the input data.
+        metric: the distance metric to use for the tree. The default metric is euclidean, and with p=2 is equivalent to
+            the standard Euclidean metric. See the documentation of `DistanceMetric` for a list of available metrics. If
+            metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a
+            `sparse graph`, in which case only "nonzero" elements may be considered neighbors. Defaults to "euclidean".
+        metric_kwads: additional keyword arguments for the metric function. Defaults to None.
+        exclude_self: whether to exclude the point itself from the result. Defaults to True.
+        knn_dim: the lowest threshold of dimensions of data to use `ball_tree` algorithm. If dimensions of the data is
+            smaller than this value, `kd_tree` algorithm would be used. Defaults to 10.
+        pynn_num: the lowest threshold of features to use NNDescent package. If number of features less than/equal to
+            this value, `sklearn` package would be used. Defaults to 5000.
+        pynn_dim: the lowest threshold of dimensions to use NNDescent package. If number of features less than/equal to
+            this value, `sklearn` package would be used. Defaults to 2.
+        hnswlib_num: the lowest threshold of features to use `hnswlib` nearest neighbors. Defaults to int(2e5).
+        pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001.
+        n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1.
+        return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False.
+        logger: the Logger object to display the log information.
+        kwargs: additional arguments that will be passed to each nearest neighbor search algorithm.
+
+    Returns:
+        A tuple (nbrs_idx, dists, [nbrs]), where nbrs_idx contains the indices of nearest neighbors found for each
+        point and dists contains the distances between neighbors and the point. nbrs is the fitted nearest neighbor
+        object, and it would be returned only if `return_nbrs` is True.
+    """
+
+    if method is None:
+        if logger is None:
+            logger = LoggerManager.gen_logger("neighbors")
+        logger.info("method arg is None, choosing methods automatically...")
+        if X.shape[0] > hnswlib_num:
+            method = "hnswlib"
+        elif X.shape[0] > pynn_num and X.shape[1] > pynn_dim:
+            method = "pynn"
+        else:
+            if X.shape[1] > knn_dim:
+                method = "ball_tree"
+            else:
+                method = "kd_tree"
+        logger.info("method %s selected" % (method), indent_level=2)
+
+    if method.lower() in ["pynn", "umap"]:
+        from pynndescent import NNDescent
+        nbrs = NNDescent(
+            X,
+            metric=metric,
+            n_neighbors=k + 1,
+            n_jobs=n_jobs,
+            random_state=pynn_rand_state,
+            **kwargs,
+        )
+        nbrs_idx, dists = nbrs.query(X, k=k + 1)
+    elif method in ["ball_tree", "kd_tree"]:
+        from sklearn.neighbors import NearestNeighbors
+        # print("***debug X_data:", X_data)
+        nbrs = NearestNeighbors(
+            n_neighbors=k + 1,
+            metric=metric,
+            metric_params=metric_kwads,
+            algorithm=method,
+            n_jobs=n_jobs,
+            **kwargs,
+        ).fit(X)
+        dists, nbrs_idx = nbrs.kneighbors(X)
+    elif method == "hnswlib":
+        import hnswlib
+        space = "l2" if metric == "euclidean" else metric
+        if space not in ["l2", "cosine", "ip"]:
+            raise ImportError(f"hnswlib nearest neighbors with space {space} is not supported")
+        nbrs = hnswlib.Index(space=space, dim=X.shape[1])
+        nbrs.init_index(max_elements=X.shape[0], random_seed=pynn_rand_state, **kwargs)
+        nbrs.set_num_threads(n_jobs)
+        nbrs.add_items(X)
+        nbrs_idx, dists = nbrs.knn_query(X, k=k + 1)
+        if space == "l2":
+            dists = np.sqrt(dists)
+        nbrs_idx, dists = _correct_hnsw_neighbors(nbrs_idx, dists)
+    else:
+        raise ImportError(f"nearest neighbor search method {method} is not supported")
+
+    nbrs_idx = np.array(nbrs_idx)
+    if exclude_self:
+        nbrs_idx = nbrs_idx[:, 1:]
+        dists = dists[:, 1:]
+    if return_nbrs:
+        return nbrs_idx, dists, nbrs
+    return nbrs_idx, dists
+
+
 def neighbors(
     adata: AnnData,
     X_data: np.ndarray = None,
diff --git a/dynamo/tools/graph_calculus.py b/dynamo/tools/graph_calculus.py
index bf4f52182..fa2dfe9ec 100644
--- a/dynamo/tools/graph_calculus.py
+++ b/dynamo/tools/graph_calculus.py
@@ -11,13 +11,13 @@
 from scipy.optimize import lsq_linear, minimize
 from sklearn.neighbors import NearestNeighbors
 
+from .connectivity import k_nearest_neighbors
 from ..dynamo_logger import main_info, main_warning
 from ..tools.utils import projection_with_transition_matrix
 from .utils import (
     elem_prod,
     flatten,
     index_condensed_matrix,
-    k_nearest_neighbors,
     nbrs_to_dists,
     symmetrize_symmetric_matrix,
 )
diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py
index 45ba439ab..5bdef59a6 100755
--- a/dynamo/tools/utils.py
+++ b/dynamo/tools/utils.py
@@ -191,101 +191,6 @@ def nearest_neighbors(coord: np.ndarray, coords: Union[np.ndarray, sp.csr_matrix
     return neighs
 
 
-def k_nearest_neighbors(
-    X: np.ndarray,
-    k: int,
-    method: Optional[str] = None,
-    metric: Union[str, Callable] = "euclidean",
-    metric_kwads: Dict[str, Any] = None,
-    exclude_self: bool = True,
-    knn_dim: int = 10,
-    pynn_num: int = int(2e5),
-    pynn_dim: int = 2,
-    pynn_rand_state: int = 19491001,
-    n_jobs: int = -1,
-    return_nbrs: bool = False,
-    logger: Logger = None,
-    **kwargs,
-) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, NearestNeighbors]]:
-    """Compute k nearest neighbors for a given space.
-
-    Args:
-        X: the space to find nearest neighbors on.
-        k: the number of neighbors to be found, excluding the point itself.
-        method: the method used for nearest neighbor search. If it is None, will choose algorithm based on the size of
-            the input data.
-        metric: the distance metric to use for the tree. The default metric is euclidean, and with p=2 is equivalent to
-            the standard Euclidean metric. See the documentation of `DistanceMetric` for a list of available metrics. If
-            metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a
-            `sparse graph`, in which case only "nonzero" elements may be considered neighbors. Defaults to "euclidean".
-        metric_kwads: additional keyword arguments for the metric function. Defaults to None.
-        exclude_self: whether to exclude the point itself from the result. Defaults to True.
-        knn_dim: the lowest threshold of dimensions of data to use `ball_tree` algorithm. If dimensions of the data is
-            smaller than this value, `kd_tree` algorithm would be used. Defaults to 10.
-        pynn_num: the lowest threshold of features to use NNDescent package. If number of features less than/equal to
-            this value, `sklearn` package would be used. Defaults to int(2e5).
-        pynn_dim: the lowest threshold of dimensions to use NNDescent package. If number of features less than/equal to
-            this value, `sklearn` package would be used. Defaults to 2.
-        pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001.
-        n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1.
-        return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False.
-        logger: the Logger object to display the log information.
-        kwargs: additional arguments that will be passed to each nearest neighbor search algorithm.
-
-    Returns:
-        A tuple (nbrs_idx, dists, [nbrs]), where nbrs_idx contains the indices of nearest neighbors found for each
-        point and dists contains the distances between neighbors and the point. nbrs is the fitted nearest neighbor
-        object and it would be returned only if `return_nbrs` is True.
-    """
-
-    if method is None:
-        if logger is None:
-            logger = LoggerManager.gen_logger("neighbors")
-        logger.info("method arg is None, choosing methods automatically...")
-        if X.shape[0] > pynn_num and X.shape[1] > pynn_dim:
-            method = "pynn"
-        else:
-            if X.shape[1] > knn_dim:
-                method = "ball_tree"
-            else:
-                method = "kd_tree"
-        logger.info("method %s selected" % (method), indent_level=2)
-
-    if method.lower() in ["pynn", "umap"]:
-        from pynndescent import NNDescent
-        nbrs = NNDescent(
-            X,
-            metric=metric,
-            n_neighbors=k + 1,
-            n_jobs=n_jobs,
-            random_state=pynn_rand_state,
-            **kwargs,
-        )
-        nbrs_idx, dists = nbrs.query(X, k=k + 1)
-    elif method in ["ball_tree", "kd_tree"]:
-        from sklearn.neighbors import NearestNeighbors
-        # print("***debug X_data:", X_data)
-        nbrs = NearestNeighbors(
-            n_neighbors=k + 1,
-            metric=metric,
-            metric_params=metric_kwads,
-            algorithm=method,
-            n_jobs=n_jobs,
-            **kwargs,
-        ).fit(X)
-        dists, nbrs_idx = nbrs.kneighbors(X)
-    else:
-        raise ImportError(f"nearest neighbor search method {method} is not supported")
-
-    nbrs_idx = np.array(nbrs_idx)
-    if exclude_self:
-        nbrs_idx = nbrs_idx[:, 1:]
-        dists = dists[:, 1:]
-    if return_nbrs:
-        return nbrs_idx, dists, nbrs
-    return nbrs_idx, dists
-
-
 def nbrs_to_dists(X: np.ndarray, nbrs_idx: np.ndarray) -> List[np.ndarray]:
     """Calculate the distances between neighbors of a given space.
 

From 0003aa96297cd32c1ec80aa8f68b696889d1ec80 Mon Sep 17 00:00:00 2001
From: sichao <sichao_y@outlook.com>
Date: Fri, 21 Apr 2023 19:43:48 -0400
Subject: [PATCH 3/5] update k nearest neighbors reference

---
 dynamo/prediction/fate.py           | 80 +++++++++++++------------
 dynamo/tools/Markov.py              | 91 +++++++++--------------------
 dynamo/tools/connectivity.py        | 19 +++---
 dynamo/tools/graph_calculus.py      |  2 +-
 dynamo/tools/growth.py              | 34 +++++------
 dynamo/tools/metric_velocity.py     | 23 +++-----
 dynamo/tools/sampling.py            | 23 +++-----
 dynamo/vectorfield/scVectorField.py | 43 +++++---------
 dynamo/vectorfield/topography.py    | 23 +++-----
 9 files changed, 136 insertions(+), 202 deletions(-)

diff --git a/dynamo/prediction/fate.py b/dynamo/prediction/fate.py
index 5f96bfb13..131944895 100755
--- a/dynamo/prediction/fate.py
+++ b/dynamo/prediction/fate.py
@@ -15,6 +15,7 @@
     main_info_insert_adata,
     main_warning,
 )
+from ..tools.connectivity import correct_hnsw_neighbors, k_nearest_neighbors
 from ..tools.utils import fetch_states, getTseq
 from ..vectorfield import vector_field_function
 from ..vectorfield.utils import vecfld_from_adata, vector_transformation
@@ -402,24 +403,17 @@ def fate_bias(
 
     X = adata.obsm[basis_key] if basis_key != "X" else adata.X
 
-    if X.shape[0] > 5000 and X.shape[1] > 2:
-        alg = "NNDescent"
-        from pynndescent import NNDescent
-
-        nbrs = NNDescent(
-            X,
-            metric=metric,
-            metric_kwds=metric_kwds,
-            n_neighbors=30,
-            n_jobs=cores,
-            random_state=seed,
-            **kwargs,
-        )
-        knn, distances = nbrs.query(X, k=30)
-    else:
-        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-        nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg, n_jobs=cores).fit(X)
-        distances, knn = nbrs.kneighbors(X)
+    knn, distances, nbrs, alg = k_nearest_neighbors(
+        X,
+        k=29,
+        metric=metric,
+        metric_kwads=metric_kwds,
+        exclude_self=False,
+        pynn_rand_state=seed,
+        return_nbrs=True,
+        n_jobs=cores,
+        **kwargs,
+    )
 
     median_dist = np.median(distances[:, 1])
 
@@ -451,8 +445,13 @@ def fate_bias(
             main_info("using all steps data")
             indices = np.arange(0, n_steps)
 
-        if alg == "NNDescent":
+        if alg == "pynn":
             knn, distances = nbrs.query(prediction[:, indices].T, k=30)
+        elif alg == "hnswlib":
+            knn, distances = nbrs.knn_query(prediction[:, indices].T, k=30)
+            if metric == "euclidean":
+                distances = np.sqrt(distances)
+            knn, distances = correct_hnsw_neighbors(knn, distances)
         else:
             distances, knn = nbrs.kneighbors(prediction[:, indices].T)
 
@@ -466,6 +465,9 @@ def fate_bias(
                 # cells with indices are all close to some random progenitor cells.
                 if hasattr(nbrs, "query"):
                     knn, _ = nbrs.query(X[knn.flatten(), :], k=30)
+                elif hasattr(nbrs, "knn_query"):
+                    knn, distances_hn = nbrs.knn_query(X[knn.flatten(), :], k=30)
+                    knn, _ = correct_hnsw_neighbors(knn, distances_hn)
                 else:
                     _, knn = nbrs.kneighbors(X[knn.flatten(), :])
 
@@ -492,6 +494,11 @@ def fate_bias(
 
                 if hasattr(nbrs, "query"):
                     knn, distances = nbrs.query(prediction[:, indices - 1].T, k=30)
+                elif hasattr(nbrs, "knn_query"):
+                    knn, distances = nbrs.knn_query(prediction[:, indices - 1].T, k=30)
+                    if metric == "euclidean":
+                        distances = np.sqrt(distances)
+                    knn, distances = correct_hnsw_neighbors(knn, distances)
                 else:
                     distances, knn = nbrs.kneighbors(prediction[:, indices - 1].T)
 
@@ -590,22 +597,18 @@ def andecestor(
     X = adata.obsm[basis_key].copy()
 
     main_info("build a kNN graph structure so we can query the nearest cells of the predicted states.")
-    if X.shape[0] > 5000 and X.shape[1] > 2:
-        alg = "NNDescent"
-        from pynndescent import NNDescent
-
-        nbrs = NNDescent(
-            X,
-            metric=metric,
-            metric_kwds=metric_kwds,
-            n_neighbors=n_neighbors,
-            n_jobs=cores,
-            random_state=seed,
-            **kwargs,
-        )
-    else:
-        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=cores).fit(X)
+    _, _, nbrs, alg = k_nearest_neighbors(
+        X,
+        k=n_neighbors - 1,
+        metric=metric,
+        metric_kwads=metric_kwds,
+        exclude_self=False,
+        pynn_rand_state=seed,
+        n_jobs=cores,
+        return_nbrs=True,
+        logger=logger,
+        **kwargs,
+    )
 
     if init_states is None:
         init_states = adata[init_cells, :].obsm[basis_key]
@@ -635,8 +638,13 @@ def andecestor(
         last_indices = [0, -1] if direction == "both" else [-1]
         queries = pred[j].T[last_indices] if last_point_only else pred[j].T
 
-        if alg == "NNDescent":
+        if alg == "pynn":
             knn, distances = nbrs.query(queries, k=n_neighbors)
+        elif alg == "hnswlib":
+            knn, distances = nbrs.knn_query(queries, k=n_neighbors)
+            if metric == "euclidean":
+                distances = np.sqrt(distances)
+            knn, distances = correct_hnsw_neighbors(knn, distances)
         else:
             distances, knn = nbrs.kneighbors(queries)
 
diff --git a/dynamo/tools/Markov.py b/dynamo/tools/Markov.py
index 20689fc39..8effac6f2 100755
--- a/dynamo/tools/Markov.py
+++ b/dynamo/tools/Markov.py
@@ -8,6 +8,7 @@
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
 
+from .connectivity import k_nearest_neighbors
 from ..dynamo_logger import LoggerManager
 from ..simulation.utils import directMethod
 from .utils import append_iterative_neighbor_indices, flatten
@@ -164,25 +165,16 @@ def makeTransitionMatrix(Qnn, I_vec, tol=0.0):  # Qnn, I, tol=0.0
     return M
 
 
-@jit(nopython=True)
+
 def compute_tau(X, V, k=100, nbr_idx=None):
     if nbr_idx is None:
-        if X.shape[0] > 200000 and X.shape[1] > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=k,
-                n_jobs=-1,
-                random_state=19491001,
-            )
-            _, dist = nbrs.query(X, k=k)
-        else:
-            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
-            dists, _ = nbrs.kneighbors(X)
-
+        _, dists = k_nearest_neighbors(
+            X,
+            k=k - 1,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+            n_jobs=-1,
+        )
     else:
         dists = np.zeros(nbr_idx.shape)
         for i in range(nbr_idx.shape[0]):
@@ -225,22 +217,13 @@ def prepare_velocity_grid_data(
     if n_neighbors is None:
         n_neighbors = np.max([10, int(n_obs / 50)])
 
-    if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2:
-        from pynndescent import NNDescent
-
-        nn = NNDescent(
-            X_emb,
-            metric="euclidean",
-            n_neighbors=n_neighbors,
-            n_jobs=-1,
-            random_state=19491001,
-        )
-        neighs, dists = nn.query(X_grid, k=n_neighbors)
-    else:
-        alg = "ball_tree" if X_emb.shape[1] > 10 else "kd_tree"
-        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1, algorithm=alg)
-        nn.fit(X_emb)
-        dists, neighs = nn.kneighbors(X_grid)
+    neighs, dists = k_nearest_neighbors(
+        X_emb,
+        query_X=X_grid,
+        k=n_neighbors - 1,
+        exclude_self=False,
+        pynn_rand_state=19491001,
+    )
 
     weight = norm.pdf(x=dists, scale=scale)
     p_mass = weight.sum(1)
@@ -390,21 +373,12 @@ def graphize_velocity(V, X, nbrs_idx=None, k=30, normalize_v=False, E_func=None)
 
     nbrs = None
     if nbrs_idx is None:
-        if n > 200000 and d > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=k + 1,
-                n_jobs=-1,
-                random_state=19491001,
-            )
-            nbrs_idx, _ = nbrs.query(X, k=k + 1)
-        else:
-            alg = "ball_tree" if d > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
-            _, nbrs_idx = nbrs.kneighbors(X)
+        nbrs_idx, _ = k_nearest_neighbors(
+            X,
+            k=k,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+        )
 
     if type(E_func) is str:
         if E_func == "sqrt":
@@ -591,21 +565,12 @@ def fit(
     ):
         # compute connectivity
         if neighbor_idx is None:
-            if X.shape[0] > 200000 and X.shape[1] > 2:
-                from pynndescent import NNDescent
-
-                nbrs = NNDescent(
-                    X,
-                    metric="euclidean",
-                    n_neighbors=k,
-                    n_jobs=-1,
-                    random_state=19491001,
-                )
-                neighbor_idx, _ = nbrs.query(X, k=k)
-            else:
-                alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-                nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
-                _, neighbor_idx = nbrs.kneighbors(X)
+            neighbor_idx, _ = k_nearest_neighbors(
+                X,
+                k=k-1,
+                exclude_self=False,
+                pynn_rand_state=19491001,
+            )
 
         if n_recurse_neighbors is not None:
             self.Idx = append_iterative_neighbor_indices(neighbor_idx, n_recurse_neighbors)
diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py
index d62a00b2b..a59a69fef 100755
--- a/dynamo/tools/connectivity.py
+++ b/dynamo/tools/connectivity.py
@@ -515,7 +515,7 @@ def _gen_neighbor_keys(result_prefix: str = "") -> Tuple[str, str, str]:
     return conn_key, dist_key, neighbor_key
 
 
-def _correct_hnsw_neighbors(knn_hn: np.ndarray, distances_hn: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+def correct_hnsw_neighbors(knn_hn: np.ndarray, distances_hn: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     """Corrects the indices and corresponding distances obtained from a hnswlib by manually adding self neighbors.
 
     Args:
@@ -542,6 +542,7 @@ def roll(arr, value=0):
 def k_nearest_neighbors(
     X: np.ndarray,
     k: int,
+    query_X: Optional[np.ndarray] = None,
     method: Optional[str] = None,
     metric: Union[str, Callable] = "euclidean",
     metric_kwads: Dict[str, Any] = None,
@@ -578,7 +579,8 @@ def k_nearest_neighbors(
         hnswlib_num: the lowest threshold of features to use `hnswlib` nearest neighbors. Defaults to int(2e5).
         pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001.
         n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1.
-        return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False.
+        return_nbrs: whether to return the fitted nearest neighbor object. If True, will return nearest neighbor object
+            and the method. Defaults to False.
         logger: the Logger object to display the log information.
         kwargs: additional arguments that will be passed to each nearest neighbor search algorithm.
 
@@ -603,6 +605,9 @@ def k_nearest_neighbors(
                 method = "kd_tree"
         logger.info("method %s selected" % (method), indent_level=2)
 
+    if query_X is None:
+        query_X = X
+
     if method.lower() in ["pynn", "umap"]:
         from pynndescent import NNDescent
         nbrs = NNDescent(
@@ -613,7 +618,7 @@ def k_nearest_neighbors(
             random_state=pynn_rand_state,
             **kwargs,
         )
-        nbrs_idx, dists = nbrs.query(X, k=k + 1)
+        nbrs_idx, dists = nbrs.query(query_X, k=k + 1)
     elif method in ["ball_tree", "kd_tree"]:
         from sklearn.neighbors import NearestNeighbors
         # print("***debug X_data:", X_data)
@@ -625,7 +630,7 @@ def k_nearest_neighbors(
             n_jobs=n_jobs,
             **kwargs,
         ).fit(X)
-        dists, nbrs_idx = nbrs.kneighbors(X)
+        dists, nbrs_idx = nbrs.kneighbors(query_X)
     elif method == "hnswlib":
         import hnswlib
         space = "l2" if metric == "euclidean" else metric
@@ -635,10 +640,10 @@ def k_nearest_neighbors(
         nbrs.init_index(max_elements=X.shape[0], random_seed=pynn_rand_state, **kwargs)
         nbrs.set_num_threads(n_jobs)
         nbrs.add_items(X)
-        nbrs_idx, dists = nbrs.knn_query(X, k=k + 1)
+        nbrs_idx, dists = nbrs.knn_query(query_X, k=k + 1)
         if space == "l2":
             dists = np.sqrt(dists)
-        nbrs_idx, dists = _correct_hnsw_neighbors(nbrs_idx, dists)
+        nbrs_idx, dists = correct_hnsw_neighbors(nbrs_idx, dists)
     else:
         raise ImportError(f"nearest neighbor search method {method} is not supported")
 
@@ -647,7 +652,7 @@ def k_nearest_neighbors(
         nbrs_idx = nbrs_idx[:, 1:]
         dists = dists[:, 1:]
     if return_nbrs:
-        return nbrs_idx, dists, nbrs
+        return nbrs_idx, dists, nbrs, method
     return nbrs_idx, dists
 
 
diff --git a/dynamo/tools/graph_calculus.py b/dynamo/tools/graph_calculus.py
index fa2dfe9ec..bab5cecf6 100644
--- a/dynamo/tools/graph_calculus.py
+++ b/dynamo/tools/graph_calculus.py
@@ -78,7 +78,7 @@ def graphize_velocity(
 
     if nbrs_idx is None or return_nbrs:
         main_info("calculating neighbor indices...", indent_level=2)
-        nbrs_idx, dists, nbrs = k_nearest_neighbors(X, k, exclude_self=True, return_nbrs=True)
+        nbrs_idx, dists, nbrs, _ = k_nearest_neighbors(X, k, exclude_self=True, return_nbrs=True)
 
     if dists is None:
         dists = nbrs_to_dists(X, nbrs_idx)
diff --git a/dynamo/tools/growth.py b/dynamo/tools/growth.py
index 4f3de9ebf..efeff8ddd 100644
--- a/dynamo/tools/growth.py
+++ b/dynamo/tools/growth.py
@@ -13,6 +13,7 @@
 from scipy.sparse import issparse
 from sklearn.neighbors import NearestNeighbors
 
+from .connectivity import k_nearest_neighbors
 
 def score_cells(
     adata: AnnData,
@@ -74,11 +75,11 @@ def score_cells(
     elif basis is not None and "X_" + basis not in adata.obsm.keys():
         raise ValueError(f"Your adata doesn't have the {basis} you inputted in .obsm attribute of your adata.")
 
-    if genes is None and "use_for_pca" not in adata.obs.keys():
-        raise ValueError(f"Your adata doesn't have 'use_for_pca' column in .obs.")
+    if genes is None and "use_for_pca" not in adata.var.keys():
+        raise ValueError(f"Your adata doesn't have 'use_for_pca' column in .var.")
 
     if genes is None:
-        genes = adata.var_names[adata.use_for_pca]
+        genes = adata.var_names[adata.var.use_for_pca]
     else:
         genes = (
             list(adata.var_names.intersection(genes))
@@ -93,23 +94,16 @@ def score_cells(
 
     X_basis = adata.obsm["X_pca"] if basis is None else adata.obsm["X_" + basis]
 
-    if X_basis.shape[0] > 5000 and X_basis.shape[1] > 2:
-        from pynndescent import NNDescent
-
-        nbrs = NNDescent(
-            X_basis,
-            metric=metric,
-            metric_kwds=metric_kwds,
-            n_neighbors=30,
-            n_jobs=cores,
-            random_state=seed,
-            **kwargs,
-        )
-        knn, distances = nbrs.query(X_basis, k=n_neighbors)
-    else:
-        alg = "ball_tree" if X_basis.shape[1] > 10 else "kd_tree"
-        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=cores).fit(X_basis)
-        distances, knn = nbrs.kneighbors(X_basis)
+    knn, distances = k_nearest_neighbors(
+        X_basis,
+        k=n_neighbors - 1,
+        metric=metric,
+        metric_kwads=metric_kwds,
+        exclude_self=False,
+        pynn_rand_state=seed,
+        n_jobs=cores,
+        **kwargs,
+    )
 
     X_data = adata[:, genes].X if layer in [None, "X"] else adata[:, genes].layers[layer]
 
diff --git a/dynamo/tools/metric_velocity.py b/dynamo/tools/metric_velocity.py
index 0d01018b0..14b968621 100755
--- a/dynamo/tools/metric_velocity.py
+++ b/dynamo/tools/metric_velocity.py
@@ -16,6 +16,7 @@
 from .connectivity import (
     adj_to_knn,
     check_and_recompute_neighbors,
+    k_nearest_neighbors,
     mnn_from_list,
     umap_conn_indices_dist_embedding,
 )
@@ -89,22 +90,12 @@ def cell_wise_confidence(
         )
     else:
         n_neigh = 30
-
-        if X.shape[0] > 200000 and X.shape[1] > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=n_neigh + 1,
-                n_jobs=-1,
-                random_state=19491001,
-            )
-            nbrs_idx, dist = nbrs.query(X, k=n_neigh + 1)
-        else:
-            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=n_neigh + 1, algorithm=alg, n_jobs=-1).fit(X)
-            dist, nbrs_idx = nbrs.kneighbors(X)
+        nbrs_idx, dist = k_nearest_neighbors(
+            X,
+            k=n_neigh,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+        )
 
         row = np.repeat(nbrs_idx[:, 0], n_neigh)
         col = nbrs_idx[:, 1:].flatten()
diff --git a/dynamo/tools/sampling.py b/dynamo/tools/sampling.py
index 5ba8f4be7..c0a0619bd 100644
--- a/dynamo/tools/sampling.py
+++ b/dynamo/tools/sampling.py
@@ -9,6 +9,7 @@
 from scipy.cluster.vq import kmeans2
 from sklearn.neighbors import NearestNeighbors
 
+from .connectivity import k_nearest_neighbors
 from ..dynamo_logger import LoggerManager
 from .utils import nearest_neighbors, timeit
 
@@ -135,21 +136,13 @@ def trn(X: np.ndarray, n: int, return_index: bool = True, seed: int = 19491001,
     if not return_index:
         return trnet.W
     else:
-        if X.shape[0] > 200000 and X.shape[1] > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=1,
-                n_jobs=-1,
-                random_state=seed,
-            )
-            idx, _ = nbrs.query(trnet.W, k=1)
-        else:
-            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=1, algorithm=alg, n_jobs=-1).fit(X)
-            _, idx = nbrs.kneighbors(trnet.W)
+        idx, _ = k_nearest_neighbors(
+            X,
+            query_X=trnet.W,
+            k=0,
+            exclude_self=False,
+            pynn_rand_state=seed,
+        )
 
         return idx[:, 0]
 
diff --git a/dynamo/vectorfield/scVectorField.py b/dynamo/vectorfield/scVectorField.py
index 8b828888f..ad33712e5 100755
--- a/dynamo/vectorfield/scVectorField.py
+++ b/dynamo/vectorfield/scVectorField.py
@@ -14,6 +14,7 @@
 
 from ..dynamo_logger import LoggerManager, main_info, main_warning
 from ..simulation.ODE import jacobian_bifur2genes, ode_bifur2genes
+from ..tools.connectivity import k_nearest_neighbors
 from ..tools.sampling import lhsclassic, sample, sample_by_velocity
 from ..tools.utils import (
     index_condensed_matrix,
@@ -103,21 +104,13 @@ def bandwidth_selector(X):
     This function computes an empirical bandwidth for a Gaussian kernel.
     """
     n, m = X.shape
-    if n > 200000 and m > 2:
-        from pynndescent import NNDescent
 
-        nbrs = NNDescent(
-            X,
-            metric="euclidean",
-            n_neighbors=max(2, int(0.2 * n)),
-            n_jobs=-1,
-            random_state=19491001,
-        )
-        _, distances = nbrs.query(X, k=max(2, int(0.2 * n)))
-    else:
-        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-        nbrs = NearestNeighbors(n_neighbors=max(2, int(0.2 * n)), algorithm=alg, n_jobs=-1).fit(X)
-        distances, _ = nbrs.kneighbors(X)
+    _, distances = k_nearest_neighbors(
+        X,
+        k=max(2, int(0.2 * n)) - 1,
+        exclude_self=False,
+        pynn_rand_state=19491001,
+    )
 
     d = np.mean(distances[:, 1:]) / 1.5
     return np.sqrt(2) * d
@@ -241,21 +234,13 @@ def graphize_vecfld(
 
     nbrs = None
     if nbrs_idx is None:
-        if X.shape[0] > 200000 and X.shape[1] > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=k + 1,
-                n_jobs=-1,
-                random_state=19491001,
-            )
-            nbrs_idx, dist = nbrs.query(X, k=k + 1)
-        else:
-            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
-            dist, nbrs_idx = nbrs.kneighbors(X)
+        nbrs_idx, dist, nbrs, _ = k_nearest_neighbors(
+            X,
+            k=k,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+            return_nbrs=True,
+        )
 
     if dist is None and not distance_free:
         D = pdist(X)
diff --git a/dynamo/vectorfield/topography.py b/dynamo/vectorfield/topography.py
index 30c63de03..0a57b281d 100755
--- a/dynamo/vectorfield/topography.py
+++ b/dynamo/vectorfield/topography.py
@@ -13,6 +13,7 @@
 from sklearn.neighbors import NearestNeighbors
 
 from ..dynamo_logger import LoggerManager, main_info, main_warning
+from ..tools.connectivity import k_nearest_neighbors
 from ..tools.utils import gaussian_1d, inverse_norm, nearest_neighbors, update_dict
 from ..utils import copy_adata
 from .FixedPoints import FixedPoints
@@ -294,21 +295,13 @@ def get_Xss_confidence(self, k=50):
         Xref = np.median(X, 0)
         Xss = np.vstack((Xss, Xref))
 
-        if X.shape[0] > 200000 and X.shape[1] > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=min(k, X.shape[0] - 1),
-                n_jobs=-1,
-                random_state=19491001,
-            )
-            _, dist = nbrs.query(Xss, k=min(k, X.shape[0] - 1))
-        else:
-            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=min(k, X.shape[0] - 1), algorithm=alg, n_jobs=-1).fit(X)
-            dist, _ = nbrs.kneighbors(Xss)
+        _, dist = k_nearest_neighbors(
+            X,
+            query_X=Xss,
+            k=min(k, X.shape[0] - 1) - 1,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+        )
 
         dist_m = dist.mean(1)
         # confidence = 1 - dist_m / dist_m.max()

From 9f6f77ba368bdc114e73d6b92702d2ffcc8a68e2 Mon Sep 17 00:00:00 2001
From: sichao <sichao_y@outlook.com>
Date: Mon, 24 Apr 2023 15:52:37 -0400
Subject: [PATCH 4/5] update reference in clustering

---
 dynamo/vectorfield/clustering.py | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/dynamo/vectorfield/clustering.py b/dynamo/vectorfield/clustering.py
index d4ef2c038..a7e09dc7b 100644
--- a/dynamo/vectorfield/clustering.py
+++ b/dynamo/vectorfield/clustering.py
@@ -10,6 +10,7 @@
 from ..dynamo_logger import main_info
 from ..preprocessing.utils import pca
 from ..tools.clustering import hdbscan, infomap, leiden, louvain
+from ..tools.connectivity import k_nearest_neighbors
 from ..tools.Markov import (
     grid_velocity_filter,
     prepare_velocity_grid_data,
@@ -159,20 +160,15 @@ def cluster_field(
         adata.obs[key] = adata.obs.obs[key].astype("category")
 
     elif method in ["louvain", "leiden", "infomap"]:
-        if X.shape[0] > 200000 and X.shape[1] > 2:
-            from pynndescent import NNDescent
 
-            nbrs = NNDescent(
-                X,
-                metric="euclidean",
-                n_neighbors=31,
-                n_jobs=cores,
-                random_state=19491001,
-            )
-            nbrs_idx, dist = nbrs.query(X, k=31)
-        else:
-            nbrs = NearestNeighbors(n_neighbors=31, n_jobs=cores).fit(X)
-            dist, nbrs_idx = nbrs.kneighbors(X)
+        nbrs_idx, dist = k_nearest_neighbors(
+            X,
+            k=30,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+            n_jobs=cores,
+            logger=logger,
+        )
 
         row = np.repeat(nbrs_idx[:, 0], 30)
         col = nbrs_idx[:, 1:].flatten()

From 1a08bd1e3584131319ded4979662d16dd77a4ebb Mon Sep 17 00:00:00 2001
From: sichao <sichao_y@outlook.com>
Date: Wed, 26 Apr 2023 15:03:31 -0400
Subject: [PATCH 5/5] update reference in stochastic_process

---
 dynamo/vectorfield/stochastic_process.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/dynamo/vectorfield/stochastic_process.py b/dynamo/vectorfield/stochastic_process.py
index d89a627b0..4b1fbcbf1 100644
--- a/dynamo/vectorfield/stochastic_process.py
+++ b/dynamo/vectorfield/stochastic_process.py
@@ -2,7 +2,7 @@
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
 
-from ..tools.connectivity import _gen_neighbor_keys, check_and_recompute_neighbors
+from ..tools.connectivity import _gen_neighbor_keys, check_and_recompute_neighbors, k_nearest_neighbors
 from ..tools.utils import log1p_
 from .utils import vecfld_from_adata, vector_field_function
 
@@ -152,21 +152,12 @@ def diffusionMatrix(
     neighbor_result_prefix = "" if layer is None else layer
     conn_key, dist_key, neighbor_key = _gen_neighbor_keys(neighbor_result_prefix)
     if neighbor_key not in adata.uns_keys() or (X_data is not None and V_data is not None):
-        if X_data.shape[0] > 200000 and X_data.shape[1] > 2:
-            from pynndescent import NNDescent
-
-            nbrs = NNDescent(
-                X_data,
-                metric="euclidean",
-                n_neighbors=n,
-                n_jobs=-1,
-                random_state=19491001,
-            )
-            Idx, _ = nbrs.query(X_data, k=n)
-        else:
-            alg = "ball_tree" if X_data.shape[1] > 10 else "kd_tree"
-            nbrs = NearestNeighbors(n_neighbors=n, algorithm=alg, n_jobs=-1).fit(X_data)
-            _, Idx = nbrs.kneighbors(X_data)
+        Idx, _ = k_nearest_neighbors(
+            X_data,
+            k=n - 1,
+            exclude_self=False,
+            pynn_rand_state=19491001,
+        )
     else:
         check_and_recompute_neighbors(adata, result_prefix=layer)
         conn_key = "connectivities" if layer is None else layer + "_connectivities"