From 30db1f8a455a7bd17d2c85193e8d3393118fb4bc Mon Sep 17 00:00:00 2001 From: sichao Date: Thu, 20 Apr 2023 16:21:48 -0400 Subject: [PATCH 1/5] combine neighbors and k_nearest_neighbors --- dynamo/tools/connectivity.py | 53 +++++++++--------------------------- dynamo/tools/utils.py | 51 +++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 47 deletions(-) diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py index f0b939d9a..ca38ed43e 100755 --- a/dynamo/tools/connectivity.py +++ b/dynamo/tools/connectivity.py @@ -21,7 +21,7 @@ from ..configuration import DynamoAdataKeyManager from ..docrep import DocstringProcessor from ..dynamo_logger import LoggerManager, main_info, main_warning -from .utils import fetch_X_data, log1p_ +from .utils import fetch_X_data, k_nearest_neighbors, log1p_ docstrings = DocstringProcessor() @@ -587,45 +587,18 @@ def neighbors( logger.info("fetching X data from layer:%s, basis:%s" % (str(layer), str(basis))) genes, X_data = fetch_X_data(adata, genes, layer, basis) - if method is None: - logger.info("method arg is None, choosing methods automatically...") - if X_data.shape[0] > 200000 and X_data.shape[1] > 2: - - from pynndescent import NNDescent - - method = "pynn" - elif X_data.shape[1] > 10: - method = "ball_tree" - else: - method = "kd_tree" - logger.info("method %s selected" % (method), indent_level=2) - - # may distinguish between umap and pynndescent -- treat them equal for now - if method.lower() in ["pynn", "umap"]: - index = NNDescent( - X_data, - metric=metric, - n_neighbors=n_neighbors, - n_jobs=cores, - random_state=seed, - **kwargs, - ) - knn, distances = index.query(X_data, k=n_neighbors) - elif method in ["ball_tree", "kd_tree"]: - from sklearn.neighbors import NearestNeighbors - - # print("***debug X_data:", X_data) - nbrs = NearestNeighbors( - n_neighbors=n_neighbors, - metric=metric, - metric_params=metric_kwads, - algorithm=method, - n_jobs=cores, - **kwargs, - ).fit(X_data) - distances, knn = nbrs.kneighbors(X_data) - else: - raise ImportError(f"nearest neighbor search method {method} is not supported") + knn, distances = k_nearest_neighbors( + X_data, + k=n_neighbors - 1, + method=method, + metric=metric, + metric_kwads=metric_kwads, + exclude_self=False, + pynn_rand_state=seed, + n_jobs=cores, + logger=logger, + **kwargs, + ) conn_key, dist_key, neighbor_key = _gen_neighbor_keys(result_prefix) logger.info_insert_adata(conn_key, adata_attr="obsp") diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index c03d5161b..45ba439ab 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -25,6 +25,7 @@ from tqdm import tqdm from ..dynamo_logger import ( + Logger, LoggerManager, main_critical, main_debug, @@ -193,6 +194,9 @@ def nearest_neighbors(coord: np.ndarray, coords: Union[np.ndarray, sp.csr_matrix def k_nearest_neighbors( X: np.ndarray, k: int, + method: Optional[str] = None, + metric: Union[str, Callable] = "euclidean", + metric_kwads: Dict[str, Any] = None, exclude_self: bool = True, knn_dim: int = 10, pynn_num: int = int(2e5), @@ -200,12 +204,21 @@ def k_nearest_neighbors( pynn_rand_state: int = 19491001, n_jobs: int = -1, return_nbrs: bool = False, + logger: Logger = None, + **kwargs, ) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, NearestNeighbors]]: """Compute k nearest neighbors for a given space. Args: X: the space to find nearest neighbors on. k: the number of neighbors to be found, excluding the point itself. + method: the method used for nearest neighbor search. If it is None, will choose algorithm based on the size of + the input data. + metric: the distance metric to use for the tree. The default metric is euclidean, and with p=2 is equivalent to + the standard Euclidean metric. See the documentation of `DistanceMetric` for a list of available metrics. If + metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a + `sparse graph`, in which case only "nonzero" elements may be considered neighbors. Defaults to "euclidean". + metric_kwads: additional keyword arguments for the metric function. Defaults to None. exclude_self: whether to exclude the point itself from the result. Defaults to True. knn_dim: the lowest threshold of dimensions of data to use `ball_tree` algorithm. If dimensions of the data is smaller than this value, `kd_tree` algorithm would be used. Defaults to 10. @@ -216,6 +229,8 @@ def k_nearest_neighbors( pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001. n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1. return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False. + logger: the Logger object to display the log information. + kwargs: additional arguments that will be passed to each nearest neighbor search algorithm. Returns: A tuple (nbrs_idx, dists, [nbrs]), where nbrs_idx contains the indices of nearest neighbors found for each @@ -223,22 +238,44 @@ def k_nearest_neighbors( object and it would be returned only if `return_nbrs` is True. """ - n, d = np.atleast_2d(X).shape - if n > int(pynn_num) and d > pynn_dim: - from pynndescent import NNDescent + if method is None: + if logger is None: + logger = LoggerManager.gen_logger("neighbors") + logger.info("method arg is None, choosing methods automatically...") + if X.shape[0] > pynn_num and X.shape[1] > pynn_dim: + method = "pynn" + else: + if X.shape[1] > knn_dim: + method = "ball_tree" + else: + method = "kd_tree" + logger.info("method %s selected" % (method), indent_level=2) + if method.lower() in ["pynn", "umap"]: + from pynndescent import NNDescent nbrs = NNDescent( X, - metric="euclidean", + metric=metric, n_neighbors=k + 1, n_jobs=n_jobs, random_state=pynn_rand_state, + **kwargs, ) nbrs_idx, dists = nbrs.query(X, k=k + 1) - else: - alg = "ball_tree" if d > knn_dim else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=n_jobs).fit(X) + elif method in ["ball_tree", "kd_tree"]: + from sklearn.neighbors import NearestNeighbors + # print("***debug X_data:", X_data) + nbrs = NearestNeighbors( + n_neighbors=k + 1, + metric=metric, + metric_params=metric_kwads, + algorithm=method, + n_jobs=n_jobs, + **kwargs, + ).fit(X) dists, nbrs_idx = nbrs.kneighbors(X) + else: + raise ImportError(f"nearest neighbor search method {method} is not supported") nbrs_idx = np.array(nbrs_idx) if exclude_self: From baea41861b1b6203e7bf7fa7d499d2946ef05066 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 21 Apr 2023 11:36:24 -0400 Subject: [PATCH 2/5] add hnswlib option --- dynamo/tools/connectivity.py | 141 ++++++++++++++++++++++++++++++++- dynamo/tools/graph_calculus.py | 2 +- dynamo/tools/utils.py | 95 ---------------------- 3 files changed, 140 insertions(+), 98 deletions(-) diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py index ca38ed43e..d62a00b2b 100755 --- a/dynamo/tools/connectivity.py +++ b/dynamo/tools/connectivity.py @@ -15,13 +15,14 @@ from pynndescent.distances import true_angular from scipy.sparse import coo_matrix, csr_matrix, issparse from sklearn.decomposition import TruncatedSVD +from sklearn.neighbors import NearestNeighbors from sklearn.utils import sparsefuncs from umap import UMAP from ..configuration import DynamoAdataKeyManager from ..docrep import DocstringProcessor -from ..dynamo_logger import LoggerManager, main_info, main_warning -from .utils import fetch_X_data, k_nearest_neighbors, log1p_ +from ..dynamo_logger import Logger, LoggerManager, main_info, main_warning +from .utils import fetch_X_data, log1p_ docstrings = DocstringProcessor() @@ -514,6 +515,142 @@ def _gen_neighbor_keys(result_prefix: str = "") -> Tuple[str, str, str]: return conn_key, dist_key, neighbor_key +def _correct_hnsw_neighbors(knn_hn: np.ndarray, distances_hn: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Corrects the indices and corresponding distances obtained from a hnswlib by manually adding self neighbors. + + Args: + knn_hn: array containing the k-NN indices obtained from the hnswlib. + distances_hn: array containing the distances corresponding to the k-NN indices obtained from the HNSW index. + + Returns: + A tuple containing the corrected indices and distances. + """ + mask = knn_hn[:, 0] == np.arange(knn_hn.shape[0]) + target_indices = np.where(mask)[0] + def roll(arr, value=0): + arr = np.roll(arr, 1, axis=0) + arr[0] = value + return arr + + knn_corrected = [knn_hn[i] if i in target_indices else roll(knn_hn[i], i) for i in range(knn_hn.shape[0])] + distances_corrected = [ + distances_hn[i] if i in target_indices else roll(distances_hn[i]) for i in range(distances_hn.shape[0]) + ] + return np.vstack(knn_corrected), np.vstack(distances_corrected) + + +def k_nearest_neighbors( + X: np.ndarray, + k: int, + method: Optional[str] = None, + metric: Union[str, Callable] = "euclidean", + metric_kwads: Dict[str, Any] = None, + exclude_self: bool = True, + knn_dim: int = 10, + pynn_num: int = 5000, + pynn_dim: int = 2, + hnswlib_num: int = int(2e5), + pynn_rand_state: int = 19491001, + n_jobs: int = -1, + return_nbrs: bool = False, + logger: Logger = None, + **kwargs, +) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, NearestNeighbors]]: + """Compute k nearest neighbors for a given space. + + Args: + X: the space to find nearest neighbors on. + k: the number of neighbors to be found, excluding the point itself. + method: the method used for nearest neighbor search. If it is None, will choose algorithm based on the size of + the input data. + metric: the distance metric to use for the tree. The default metric is euclidean, and with p=2 is equivalent to + the standard Euclidean metric. See the documentation of `DistanceMetric` for a list of available metrics. If + metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a + `sparse graph`, in which case only "nonzero" elements may be considered neighbors. Defaults to "euclidean". + metric_kwads: additional keyword arguments for the metric function. Defaults to None. + exclude_self: whether to exclude the point itself from the result. Defaults to True. + knn_dim: the lowest threshold of dimensions of data to use `ball_tree` algorithm. If dimensions of the data is + smaller than this value, `kd_tree` algorithm would be used. Defaults to 10. + pynn_num: the lowest threshold of features to use NNDescent package. If number of features less than/equal to + this value, `sklearn` package would be used. Defaults to 5000. + pynn_dim: the lowest threshold of dimensions to use NNDescent package. If number of features less than/equal to + this value, `sklearn` package would be used. Defaults to 2. + hnswlib_num: the lowest threshold of features to use `hnswlib` nearest neighbors. Defaults to int(2e5). + pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001. + n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1. + return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False. + logger: the Logger object to display the log information. + kwargs: additional arguments that will be passed to each nearest neighbor search algorithm. + + Returns: + A tuple (nbrs_idx, dists, [nbrs]), where nbrs_idx contains the indices of nearest neighbors found for each + point and dists contains the distances between neighbors and the point. nbrs is the fitted nearest neighbor + object, and it would be returned only if `return_nbrs` is True. + """ + + if method is None: + if logger is None: + logger = LoggerManager.gen_logger("neighbors") + logger.info("method arg is None, choosing methods automatically...") + if X.shape[0] > hnswlib_num: + method = "hnswlib" + elif X.shape[0] > pynn_num and X.shape[1] > pynn_dim: + method = "pynn" + else: + if X.shape[1] > knn_dim: + method = "ball_tree" + else: + method = "kd_tree" + logger.info("method %s selected" % (method), indent_level=2) + + if method.lower() in ["pynn", "umap"]: + from pynndescent import NNDescent + nbrs = NNDescent( + X, + metric=metric, + n_neighbors=k + 1, + n_jobs=n_jobs, + random_state=pynn_rand_state, + **kwargs, + ) + nbrs_idx, dists = nbrs.query(X, k=k + 1) + elif method in ["ball_tree", "kd_tree"]: + from sklearn.neighbors import NearestNeighbors + # print("***debug X_data:", X_data) + nbrs = NearestNeighbors( + n_neighbors=k + 1, + metric=metric, + metric_params=metric_kwads, + algorithm=method, + n_jobs=n_jobs, + **kwargs, + ).fit(X) + dists, nbrs_idx = nbrs.kneighbors(X) + elif method == "hnswlib": + import hnswlib + space = "l2" if metric == "euclidean" else metric + if space not in ["l2", "cosine", "ip"]: + raise ImportError(f"hnswlib nearest neighbors with space {space} is not supported") + nbrs = hnswlib.Index(space=space, dim=X.shape[1]) + nbrs.init_index(max_elements=X.shape[0], random_seed=pynn_rand_state, **kwargs) + nbrs.set_num_threads(n_jobs) + nbrs.add_items(X) + nbrs_idx, dists = nbrs.knn_query(X, k=k + 1) + if space == "l2": + dists = np.sqrt(dists) + nbrs_idx, dists = _correct_hnsw_neighbors(nbrs_idx, dists) + else: + raise ImportError(f"nearest neighbor search method {method} is not supported") + + nbrs_idx = np.array(nbrs_idx) + if exclude_self: + nbrs_idx = nbrs_idx[:, 1:] + dists = dists[:, 1:] + if return_nbrs: + return nbrs_idx, dists, nbrs + return nbrs_idx, dists + + def neighbors( adata: AnnData, X_data: np.ndarray = None, diff --git a/dynamo/tools/graph_calculus.py b/dynamo/tools/graph_calculus.py index bf4f52182..fa2dfe9ec 100644 --- a/dynamo/tools/graph_calculus.py +++ b/dynamo/tools/graph_calculus.py @@ -11,13 +11,13 @@ from scipy.optimize import lsq_linear, minimize from sklearn.neighbors import NearestNeighbors +from .connectivity import k_nearest_neighbors from ..dynamo_logger import main_info, main_warning from ..tools.utils import projection_with_transition_matrix from .utils import ( elem_prod, flatten, index_condensed_matrix, - k_nearest_neighbors, nbrs_to_dists, symmetrize_symmetric_matrix, ) diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index 45ba439ab..5bdef59a6 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -191,101 +191,6 @@ def nearest_neighbors(coord: np.ndarray, coords: Union[np.ndarray, sp.csr_matrix return neighs -def k_nearest_neighbors( - X: np.ndarray, - k: int, - method: Optional[str] = None, - metric: Union[str, Callable] = "euclidean", - metric_kwads: Dict[str, Any] = None, - exclude_self: bool = True, - knn_dim: int = 10, - pynn_num: int = int(2e5), - pynn_dim: int = 2, - pynn_rand_state: int = 19491001, - n_jobs: int = -1, - return_nbrs: bool = False, - logger: Logger = None, - **kwargs, -) -> Union[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, NearestNeighbors]]: - """Compute k nearest neighbors for a given space. - - Args: - X: the space to find nearest neighbors on. - k: the number of neighbors to be found, excluding the point itself. - method: the method used for nearest neighbor search. If it is None, will choose algorithm based on the size of - the input data. - metric: the distance metric to use for the tree. The default metric is euclidean, and with p=2 is equivalent to - the standard Euclidean metric. See the documentation of `DistanceMetric` for a list of available metrics. If - metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a - `sparse graph`, in which case only "nonzero" elements may be considered neighbors. Defaults to "euclidean". - metric_kwads: additional keyword arguments for the metric function. Defaults to None. - exclude_self: whether to exclude the point itself from the result. Defaults to True. - knn_dim: the lowest threshold of dimensions of data to use `ball_tree` algorithm. If dimensions of the data is - smaller than this value, `kd_tree` algorithm would be used. Defaults to 10. - pynn_num: the lowest threshold of features to use NNDescent package. If number of features less than/equal to - this value, `sklearn` package would be used. Defaults to int(2e5). - pynn_dim: the lowest threshold of dimensions to use NNDescent package. If number of features less than/equal to - this value, `sklearn` package would be used. Defaults to 2. - pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001. - n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1. - return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False. - logger: the Logger object to display the log information. - kwargs: additional arguments that will be passed to each nearest neighbor search algorithm. - - Returns: - A tuple (nbrs_idx, dists, [nbrs]), where nbrs_idx contains the indices of nearest neighbors found for each - point and dists contains the distances between neighbors and the point. nbrs is the fitted nearest neighbor - object and it would be returned only if `return_nbrs` is True. - """ - - if method is None: - if logger is None: - logger = LoggerManager.gen_logger("neighbors") - logger.info("method arg is None, choosing methods automatically...") - if X.shape[0] > pynn_num and X.shape[1] > pynn_dim: - method = "pynn" - else: - if X.shape[1] > knn_dim: - method = "ball_tree" - else: - method = "kd_tree" - logger.info("method %s selected" % (method), indent_level=2) - - if method.lower() in ["pynn", "umap"]: - from pynndescent import NNDescent - nbrs = NNDescent( - X, - metric=metric, - n_neighbors=k + 1, - n_jobs=n_jobs, - random_state=pynn_rand_state, - **kwargs, - ) - nbrs_idx, dists = nbrs.query(X, k=k + 1) - elif method in ["ball_tree", "kd_tree"]: - from sklearn.neighbors import NearestNeighbors - # print("***debug X_data:", X_data) - nbrs = NearestNeighbors( - n_neighbors=k + 1, - metric=metric, - metric_params=metric_kwads, - algorithm=method, - n_jobs=n_jobs, - **kwargs, - ).fit(X) - dists, nbrs_idx = nbrs.kneighbors(X) - else: - raise ImportError(f"nearest neighbor search method {method} is not supported") - - nbrs_idx = np.array(nbrs_idx) - if exclude_self: - nbrs_idx = nbrs_idx[:, 1:] - dists = dists[:, 1:] - if return_nbrs: - return nbrs_idx, dists, nbrs - return nbrs_idx, dists - - def nbrs_to_dists(X: np.ndarray, nbrs_idx: np.ndarray) -> List[np.ndarray]: """Calculate the distances between neighbors of a given space. From 0003aa96297cd32c1ec80aa8f68b696889d1ec80 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 21 Apr 2023 19:43:48 -0400 Subject: [PATCH 3/5] update k nearest neighbors reference --- dynamo/prediction/fate.py | 80 +++++++++++++------------ dynamo/tools/Markov.py | 91 +++++++++-------------------- dynamo/tools/connectivity.py | 19 +++--- dynamo/tools/graph_calculus.py | 2 +- dynamo/tools/growth.py | 34 +++++------ dynamo/tools/metric_velocity.py | 23 +++----- dynamo/tools/sampling.py | 23 +++----- dynamo/vectorfield/scVectorField.py | 43 +++++--------- dynamo/vectorfield/topography.py | 23 +++----- 9 files changed, 136 insertions(+), 202 deletions(-) diff --git a/dynamo/prediction/fate.py b/dynamo/prediction/fate.py index 5f96bfb13..131944895 100755 --- a/dynamo/prediction/fate.py +++ b/dynamo/prediction/fate.py @@ -15,6 +15,7 @@ main_info_insert_adata, main_warning, ) +from ..tools.connectivity import correct_hnsw_neighbors, k_nearest_neighbors from ..tools.utils import fetch_states, getTseq from ..vectorfield import vector_field_function from ..vectorfield.utils import vecfld_from_adata, vector_transformation @@ -402,24 +403,17 @@ def fate_bias( X = adata.obsm[basis_key] if basis_key != "X" else adata.X - if X.shape[0] > 5000 and X.shape[1] > 2: - alg = "NNDescent" - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric=metric, - metric_kwds=metric_kwds, - n_neighbors=30, - n_jobs=cores, - random_state=seed, - **kwargs, - ) - knn, distances = nbrs.query(X, k=30) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg, n_jobs=cores).fit(X) - distances, knn = nbrs.kneighbors(X) + knn, distances, nbrs, alg = k_nearest_neighbors( + X, + k=29, + metric=metric, + metric_kwads=metric_kwds, + exclude_self=False, + pynn_rand_state=seed, + return_nbrs=True, + n_jobs=cores, + **kwargs, + ) median_dist = np.median(distances[:, 1]) @@ -451,8 +445,13 @@ def fate_bias( main_info("using all steps data") indices = np.arange(0, n_steps) - if alg == "NNDescent": + if alg == "pynn": knn, distances = nbrs.query(prediction[:, indices].T, k=30) + elif alg == "hnswlib": + knn, distances = nbrs.knn_query(prediction[:, indices].T, k=30) + if metric == "euclidean": + distances = np.sqrt(distances) + knn, distances = correct_hnsw_neighbors(knn, distances) else: distances, knn = nbrs.kneighbors(prediction[:, indices].T) @@ -466,6 +465,9 @@ def fate_bias( # cells with indices are all close to some random progenitor cells. if hasattr(nbrs, "query"): knn, _ = nbrs.query(X[knn.flatten(), :], k=30) + elif hasattr(nbrs, "knn_query"): + knn, distances_hn = nbrs.knn_query(X[knn.flatten(), :], k=30) + knn, _ = correct_hnsw_neighbors(knn, distances_hn) else: _, knn = nbrs.kneighbors(X[knn.flatten(), :]) @@ -492,6 +494,11 @@ def fate_bias( if hasattr(nbrs, "query"): knn, distances = nbrs.query(prediction[:, indices - 1].T, k=30) + elif hasattr(nbrs, "knn_query"): + knn, distances = nbrs.knn_query(prediction[:, indices - 1].T, k=30) + if metric == "euclidean": + distances = np.sqrt(distances) + knn, distances = correct_hnsw_neighbors(knn, distances) else: distances, knn = nbrs.kneighbors(prediction[:, indices - 1].T) @@ -590,22 +597,18 @@ def andecestor( X = adata.obsm[basis_key].copy() main_info("build a kNN graph structure so we can query the nearest cells of the predicted states.") - if X.shape[0] > 5000 and X.shape[1] > 2: - alg = "NNDescent" - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric=metric, - metric_kwds=metric_kwds, - n_neighbors=n_neighbors, - n_jobs=cores, - random_state=seed, - **kwargs, - ) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=cores).fit(X) + _, _, nbrs, alg = k_nearest_neighbors( + X, + k=n_neighbors - 1, + metric=metric, + metric_kwads=metric_kwds, + exclude_self=False, + pynn_rand_state=seed, + n_jobs=cores, + return_nbrs=True, + logger=logger, + **kwargs, + ) if init_states is None: init_states = adata[init_cells, :].obsm[basis_key] @@ -635,8 +638,13 @@ def andecestor( last_indices = [0, -1] if direction == "both" else [-1] queries = pred[j].T[last_indices] if last_point_only else pred[j].T - if alg == "NNDescent": + if alg == "pynn": knn, distances = nbrs.query(queries, k=n_neighbors) + elif alg == "hnswlib": + knn, distances = nbrs.knn_query(queries, k=n_neighbors) + if metric == "euclidean": + distances = np.sqrt(distances) + knn, distances = correct_hnsw_neighbors(knn, distances) else: distances, knn = nbrs.kneighbors(queries) diff --git a/dynamo/tools/Markov.py b/dynamo/tools/Markov.py index 20689fc39..8effac6f2 100755 --- a/dynamo/tools/Markov.py +++ b/dynamo/tools/Markov.py @@ -8,6 +8,7 @@ from sklearn.neighbors import NearestNeighbors from tqdm import tqdm +from .connectivity import k_nearest_neighbors from ..dynamo_logger import LoggerManager from ..simulation.utils import directMethod from .utils import append_iterative_neighbor_indices, flatten @@ -164,25 +165,16 @@ def makeTransitionMatrix(Qnn, I_vec, tol=0.0): # Qnn, I, tol=0.0 return M -@jit(nopython=True) + def compute_tau(X, V, k=100, nbr_idx=None): if nbr_idx is None: - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=k, - n_jobs=-1, - random_state=19491001, - ) - _, dist = nbrs.query(X, k=k) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X) - dists, _ = nbrs.kneighbors(X) - + _, dists = k_nearest_neighbors( + X, + k=k - 1, + exclude_self=False, + pynn_rand_state=19491001, + n_jobs=-1, + ) else: dists = np.zeros(nbr_idx.shape) for i in range(nbr_idx.shape[0]): @@ -225,22 +217,13 @@ def prepare_velocity_grid_data( if n_neighbors is None: n_neighbors = np.max([10, int(n_obs / 50)]) - if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2: - from pynndescent import NNDescent - - nn = NNDescent( - X_emb, - metric="euclidean", - n_neighbors=n_neighbors, - n_jobs=-1, - random_state=19491001, - ) - neighs, dists = nn.query(X_grid, k=n_neighbors) - else: - alg = "ball_tree" if X_emb.shape[1] > 10 else "kd_tree" - nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1, algorithm=alg) - nn.fit(X_emb) - dists, neighs = nn.kneighbors(X_grid) + neighs, dists = k_nearest_neighbors( + X_emb, + query_X=X_grid, + k=n_neighbors - 1, + exclude_self=False, + pynn_rand_state=19491001, + ) weight = norm.pdf(x=dists, scale=scale) p_mass = weight.sum(1) @@ -390,21 +373,12 @@ def graphize_velocity(V, X, nbrs_idx=None, k=30, normalize_v=False, E_func=None) nbrs = None if nbrs_idx is None: - if n > 200000 and d > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=k + 1, - n_jobs=-1, - random_state=19491001, - ) - nbrs_idx, _ = nbrs.query(X, k=k + 1) - else: - alg = "ball_tree" if d > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X) - _, nbrs_idx = nbrs.kneighbors(X) + nbrs_idx, _ = k_nearest_neighbors( + X, + k=k, + exclude_self=False, + pynn_rand_state=19491001, + ) if type(E_func) is str: if E_func == "sqrt": @@ -591,21 +565,12 @@ def fit( ): # compute connectivity if neighbor_idx is None: - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=k, - n_jobs=-1, - random_state=19491001, - ) - neighbor_idx, _ = nbrs.query(X, k=k) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X) - _, neighbor_idx = nbrs.kneighbors(X) + neighbor_idx, _ = k_nearest_neighbors( + X, + k=k-1, + exclude_self=False, + pynn_rand_state=19491001, + ) if n_recurse_neighbors is not None: self.Idx = append_iterative_neighbor_indices(neighbor_idx, n_recurse_neighbors) diff --git a/dynamo/tools/connectivity.py b/dynamo/tools/connectivity.py index d62a00b2b..a59a69fef 100755 --- a/dynamo/tools/connectivity.py +++ b/dynamo/tools/connectivity.py @@ -515,7 +515,7 @@ def _gen_neighbor_keys(result_prefix: str = "") -> Tuple[str, str, str]: return conn_key, dist_key, neighbor_key -def _correct_hnsw_neighbors(knn_hn: np.ndarray, distances_hn: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def correct_hnsw_neighbors(knn_hn: np.ndarray, distances_hn: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Corrects the indices and corresponding distances obtained from a hnswlib by manually adding self neighbors. Args: @@ -542,6 +542,7 @@ def roll(arr, value=0): def k_nearest_neighbors( X: np.ndarray, k: int, + query_X: Optional[np.ndarray] = None, method: Optional[str] = None, metric: Union[str, Callable] = "euclidean", metric_kwads: Dict[str, Any] = None, @@ -578,7 +579,8 @@ def k_nearest_neighbors( hnswlib_num: the lowest threshold of features to use `hnswlib` nearest neighbors. Defaults to int(2e5). pynn_rand_state: the random seed for NNDescent calculation. Defaults to 19491001. n_jobs: number of parallel jobs for NNDescent. -1 means all cores would be used. Defaults to -1. - return_nbrs: whether to return the fitted nearest neighbor object. Defaults to False. + return_nbrs: whether to return the fitted nearest neighbor object. If True, will return nearest neighbor object + and the method. Defaults to False. logger: the Logger object to display the log information. kwargs: additional arguments that will be passed to each nearest neighbor search algorithm. @@ -603,6 +605,9 @@ def k_nearest_neighbors( method = "kd_tree" logger.info("method %s selected" % (method), indent_level=2) + if query_X is None: + query_X = X + if method.lower() in ["pynn", "umap"]: from pynndescent import NNDescent nbrs = NNDescent( @@ -613,7 +618,7 @@ def k_nearest_neighbors( random_state=pynn_rand_state, **kwargs, ) - nbrs_idx, dists = nbrs.query(X, k=k + 1) + nbrs_idx, dists = nbrs.query(query_X, k=k + 1) elif method in ["ball_tree", "kd_tree"]: from sklearn.neighbors import NearestNeighbors # print("***debug X_data:", X_data) @@ -625,7 +630,7 @@ def k_nearest_neighbors( n_jobs=n_jobs, **kwargs, ).fit(X) - dists, nbrs_idx = nbrs.kneighbors(X) + dists, nbrs_idx = nbrs.kneighbors(query_X) elif method == "hnswlib": import hnswlib space = "l2" if metric == "euclidean" else metric @@ -635,10 +640,10 @@ def k_nearest_neighbors( nbrs.init_index(max_elements=X.shape[0], random_seed=pynn_rand_state, **kwargs) nbrs.set_num_threads(n_jobs) nbrs.add_items(X) - nbrs_idx, dists = nbrs.knn_query(X, k=k + 1) + nbrs_idx, dists = nbrs.knn_query(query_X, k=k + 1) if space == "l2": dists = np.sqrt(dists) - nbrs_idx, dists = _correct_hnsw_neighbors(nbrs_idx, dists) + nbrs_idx, dists = correct_hnsw_neighbors(nbrs_idx, dists) else: raise ImportError(f"nearest neighbor search method {method} is not supported") @@ -647,7 +652,7 @@ def k_nearest_neighbors( nbrs_idx = nbrs_idx[:, 1:] dists = dists[:, 1:] if return_nbrs: - return nbrs_idx, dists, nbrs + return nbrs_idx, dists, nbrs, method return nbrs_idx, dists diff --git a/dynamo/tools/graph_calculus.py b/dynamo/tools/graph_calculus.py index fa2dfe9ec..bab5cecf6 100644 --- a/dynamo/tools/graph_calculus.py +++ b/dynamo/tools/graph_calculus.py @@ -78,7 +78,7 @@ def graphize_velocity( if nbrs_idx is None or return_nbrs: main_info("calculating neighbor indices...", indent_level=2) - nbrs_idx, dists, nbrs = k_nearest_neighbors(X, k, exclude_self=True, return_nbrs=True) + nbrs_idx, dists, nbrs, _ = k_nearest_neighbors(X, k, exclude_self=True, return_nbrs=True) if dists is None: dists = nbrs_to_dists(X, nbrs_idx) diff --git a/dynamo/tools/growth.py b/dynamo/tools/growth.py index 4f3de9ebf..efeff8ddd 100644 --- a/dynamo/tools/growth.py +++ b/dynamo/tools/growth.py @@ -13,6 +13,7 @@ from scipy.sparse import issparse from sklearn.neighbors import NearestNeighbors +from .connectivity import k_nearest_neighbors def score_cells( adata: AnnData, @@ -74,11 +75,11 @@ def score_cells( elif basis is not None and "X_" + basis not in adata.obsm.keys(): raise ValueError(f"Your adata doesn't have the {basis} you inputted in .obsm attribute of your adata.") - if genes is None and "use_for_pca" not in adata.obs.keys(): - raise ValueError(f"Your adata doesn't have 'use_for_pca' column in .obs.") + if genes is None and "use_for_pca" not in adata.var.keys(): + raise ValueError(f"Your adata doesn't have 'use_for_pca' column in .var.") if genes is None: - genes = adata.var_names[adata.use_for_pca] + genes = adata.var_names[adata.var.use_for_pca] else: genes = ( list(adata.var_names.intersection(genes)) @@ -93,23 +94,16 @@ def score_cells( X_basis = adata.obsm["X_pca"] if basis is None else adata.obsm["X_" + basis] - if X_basis.shape[0] > 5000 and X_basis.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X_basis, - metric=metric, - metric_kwds=metric_kwds, - n_neighbors=30, - n_jobs=cores, - random_state=seed, - **kwargs, - ) - knn, distances = nbrs.query(X_basis, k=n_neighbors) - else: - alg = "ball_tree" if X_basis.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=cores).fit(X_basis) - distances, knn = nbrs.kneighbors(X_basis) + knn, distances = k_nearest_neighbors( + X_basis, + k=n_neighbors - 1, + metric=metric, + metric_kwads=metric_kwds, + exclude_self=False, + pynn_rand_state=seed, + n_jobs=cores, + **kwargs, + ) X_data = adata[:, genes].X if layer in [None, "X"] else adata[:, genes].layers[layer] diff --git a/dynamo/tools/metric_velocity.py b/dynamo/tools/metric_velocity.py index 0d01018b0..14b968621 100755 --- a/dynamo/tools/metric_velocity.py +++ b/dynamo/tools/metric_velocity.py @@ -16,6 +16,7 @@ from .connectivity import ( adj_to_knn, check_and_recompute_neighbors, + k_nearest_neighbors, mnn_from_list, umap_conn_indices_dist_embedding, ) @@ -89,22 +90,12 @@ def cell_wise_confidence( ) else: n_neigh = 30 - - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=n_neigh + 1, - n_jobs=-1, - random_state=19491001, - ) - nbrs_idx, dist = nbrs.query(X, k=n_neigh + 1) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=n_neigh + 1, algorithm=alg, n_jobs=-1).fit(X) - dist, nbrs_idx = nbrs.kneighbors(X) + nbrs_idx, dist = k_nearest_neighbors( + X, + k=n_neigh, + exclude_self=False, + pynn_rand_state=19491001, + ) row = np.repeat(nbrs_idx[:, 0], n_neigh) col = nbrs_idx[:, 1:].flatten() diff --git a/dynamo/tools/sampling.py b/dynamo/tools/sampling.py index 5ba8f4be7..c0a0619bd 100644 --- a/dynamo/tools/sampling.py +++ b/dynamo/tools/sampling.py @@ -9,6 +9,7 @@ from scipy.cluster.vq import kmeans2 from sklearn.neighbors import NearestNeighbors +from .connectivity import k_nearest_neighbors from ..dynamo_logger import LoggerManager from .utils import nearest_neighbors, timeit @@ -135,21 +136,13 @@ def trn(X: np.ndarray, n: int, return_index: bool = True, seed: int = 19491001, if not return_index: return trnet.W else: - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=1, - n_jobs=-1, - random_state=seed, - ) - idx, _ = nbrs.query(trnet.W, k=1) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=1, algorithm=alg, n_jobs=-1).fit(X) - _, idx = nbrs.kneighbors(trnet.W) + idx, _ = k_nearest_neighbors( + X, + query_X=trnet.W, + k=0, + exclude_self=False, + pynn_rand_state=seed, + ) return idx[:, 0] diff --git a/dynamo/vectorfield/scVectorField.py b/dynamo/vectorfield/scVectorField.py index 8b828888f..ad33712e5 100755 --- a/dynamo/vectorfield/scVectorField.py +++ b/dynamo/vectorfield/scVectorField.py @@ -14,6 +14,7 @@ from ..dynamo_logger import LoggerManager, main_info, main_warning from ..simulation.ODE import jacobian_bifur2genes, ode_bifur2genes +from ..tools.connectivity import k_nearest_neighbors from ..tools.sampling import lhsclassic, sample, sample_by_velocity from ..tools.utils import ( index_condensed_matrix, @@ -103,21 +104,13 @@ def bandwidth_selector(X): This function computes an empirical bandwidth for a Gaussian kernel. """ n, m = X.shape - if n > 200000 and m > 2: - from pynndescent import NNDescent - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=max(2, int(0.2 * n)), - n_jobs=-1, - random_state=19491001, - ) - _, distances = nbrs.query(X, k=max(2, int(0.2 * n))) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=max(2, int(0.2 * n)), algorithm=alg, n_jobs=-1).fit(X) - distances, _ = nbrs.kneighbors(X) + _, distances = k_nearest_neighbors( + X, + k=max(2, int(0.2 * n)) - 1, + exclude_self=False, + pynn_rand_state=19491001, + ) d = np.mean(distances[:, 1:]) / 1.5 return np.sqrt(2) * d @@ -241,21 +234,13 @@ def graphize_vecfld( nbrs = None if nbrs_idx is None: - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=k + 1, - n_jobs=-1, - random_state=19491001, - ) - nbrs_idx, dist = nbrs.query(X, k=k + 1) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X) - dist, nbrs_idx = nbrs.kneighbors(X) + nbrs_idx, dist, nbrs, _ = k_nearest_neighbors( + X, + k=k, + exclude_self=False, + pynn_rand_state=19491001, + return_nbrs=True, + ) if dist is None and not distance_free: D = pdist(X) diff --git a/dynamo/vectorfield/topography.py b/dynamo/vectorfield/topography.py index 30c63de03..0a57b281d 100755 --- a/dynamo/vectorfield/topography.py +++ b/dynamo/vectorfield/topography.py @@ -13,6 +13,7 @@ from sklearn.neighbors import NearestNeighbors from ..dynamo_logger import LoggerManager, main_info, main_warning +from ..tools.connectivity import k_nearest_neighbors from ..tools.utils import gaussian_1d, inverse_norm, nearest_neighbors, update_dict from ..utils import copy_adata from .FixedPoints import FixedPoints @@ -294,21 +295,13 @@ def get_Xss_confidence(self, k=50): Xref = np.median(X, 0) Xss = np.vstack((Xss, Xref)) - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=min(k, X.shape[0] - 1), - n_jobs=-1, - random_state=19491001, - ) - _, dist = nbrs.query(Xss, k=min(k, X.shape[0] - 1)) - else: - alg = "ball_tree" if X.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=min(k, X.shape[0] - 1), algorithm=alg, n_jobs=-1).fit(X) - dist, _ = nbrs.kneighbors(Xss) + _, dist = k_nearest_neighbors( + X, + query_X=Xss, + k=min(k, X.shape[0] - 1) - 1, + exclude_self=False, + pynn_rand_state=19491001, + ) dist_m = dist.mean(1) # confidence = 1 - dist_m / dist_m.max() From 9f6f77ba368bdc114e73d6b92702d2ffcc8a68e2 Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 24 Apr 2023 15:52:37 -0400 Subject: [PATCH 4/5] update reference in clustering --- dynamo/vectorfield/clustering.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/dynamo/vectorfield/clustering.py b/dynamo/vectorfield/clustering.py index d4ef2c038..a7e09dc7b 100644 --- a/dynamo/vectorfield/clustering.py +++ b/dynamo/vectorfield/clustering.py @@ -10,6 +10,7 @@ from ..dynamo_logger import main_info from ..preprocessing.utils import pca from ..tools.clustering import hdbscan, infomap, leiden, louvain +from ..tools.connectivity import k_nearest_neighbors from ..tools.Markov import ( grid_velocity_filter, prepare_velocity_grid_data, @@ -159,20 +160,15 @@ def cluster_field( adata.obs[key] = adata.obs.obs[key].astype("category") elif method in ["louvain", "leiden", "infomap"]: - if X.shape[0] > 200000 and X.shape[1] > 2: - from pynndescent import NNDescent - nbrs = NNDescent( - X, - metric="euclidean", - n_neighbors=31, - n_jobs=cores, - random_state=19491001, - ) - nbrs_idx, dist = nbrs.query(X, k=31) - else: - nbrs = NearestNeighbors(n_neighbors=31, n_jobs=cores).fit(X) - dist, nbrs_idx = nbrs.kneighbors(X) + nbrs_idx, dist = k_nearest_neighbors( + X, + k=30, + exclude_self=False, + pynn_rand_state=19491001, + n_jobs=cores, + logger=logger, + ) row = np.repeat(nbrs_idx[:, 0], 30) col = nbrs_idx[:, 1:].flatten() From 1a08bd1e3584131319ded4979662d16dd77a4ebb Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 26 Apr 2023 15:03:31 -0400 Subject: [PATCH 5/5] update reference in stochastic_process --- dynamo/vectorfield/stochastic_process.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/dynamo/vectorfield/stochastic_process.py b/dynamo/vectorfield/stochastic_process.py index d89a627b0..4b1fbcbf1 100644 --- a/dynamo/vectorfield/stochastic_process.py +++ b/dynamo/vectorfield/stochastic_process.py @@ -2,7 +2,7 @@ from sklearn.neighbors import NearestNeighbors from tqdm import tqdm -from ..tools.connectivity import _gen_neighbor_keys, check_and_recompute_neighbors +from ..tools.connectivity import _gen_neighbor_keys, check_and_recompute_neighbors, k_nearest_neighbors from ..tools.utils import log1p_ from .utils import vecfld_from_adata, vector_field_function @@ -152,21 +152,12 @@ def diffusionMatrix( neighbor_result_prefix = "" if layer is None else layer conn_key, dist_key, neighbor_key = _gen_neighbor_keys(neighbor_result_prefix) if neighbor_key not in adata.uns_keys() or (X_data is not None and V_data is not None): - if X_data.shape[0] > 200000 and X_data.shape[1] > 2: - from pynndescent import NNDescent - - nbrs = NNDescent( - X_data, - metric="euclidean", - n_neighbors=n, - n_jobs=-1, - random_state=19491001, - ) - Idx, _ = nbrs.query(X_data, k=n) - else: - alg = "ball_tree" if X_data.shape[1] > 10 else "kd_tree" - nbrs = NearestNeighbors(n_neighbors=n, algorithm=alg, n_jobs=-1).fit(X_data) - _, Idx = nbrs.kneighbors(X_data) + Idx, _ = k_nearest_neighbors( + X_data, + k=n - 1, + exclude_self=False, + pynn_rand_state=19491001, + ) else: check_and_recompute_neighbors(adata, result_prefix=layer) conn_key = "connectivities" if layer is None else layer + "_connectivities"