diff --git a/scarf/datastore.py b/scarf/datastore.py index 7807ae9..572b4bd 100644 --- a/scarf/datastore.py +++ b/scarf/datastore.py @@ -1617,9 +1617,7 @@ def run_umap( umap_dims: int = 2, spread: float = 2.0, min_dist: float = 1, - fit_n_epochs: int = 200, - tx_n_epochs: int = 100, - set_op_mix_ratio: float = 1.0, + n_epochs: int = 300, repulsion_strength: float = 1.0, initial_alpha: float = 1.0, negative_sample_rate: float = 5, @@ -1649,15 +1647,9 @@ def run_umap( manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the scale at which embedded points will be spread out. (Default value: 1) - fit_n_epochs: Same as n_epochs in UMAP package. The number of training epochs to be used in optimizing the - low dimensional embedding. Larger values result in more accurate embeddings. - (Default value: 200) - tx_n_epochs: NUmber of epochs during transform (Default value: 100) - set_op_mix_ratio: Same as set_op_mix_ratio in UMAP package. Interpolate between (fuzzy) union and - intersection as the set operation used to combine local fuzzy simplicial sets to obtain - a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. - The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a - pure fuzzy union, while 0.0 will use a pure fuzzy intersection. + n_epochs: Same as n_epochs in UMAP package. The number of epochs to be used in optimizing the + low dimensional embedding. Larger values may result in more accurate embeddings. + (Default value: 300) repulsion_strength: Same as repulsion_strength in UMAP package. Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples. (Default value: 1.0) @@ -1699,15 +1691,13 @@ def run_umap( verbose = False if get_log_level() <= 20: verbose = True - t = fit_transform( + t, a, b = fit_transform( graph=graph.tocoo(), ini_embed=ini_embed, spread=spread, min_dist=min_dist, - tx_n_epochs=tx_n_epochs, - fit_n_epochs=fit_n_epochs, + n_epochs=n_epochs, random_seed=random_seed, - set_op_mix_ratio=set_op_mix_ratio, repulsion_strength=repulsion_strength, initial_alpha=initial_alpha, negative_sample_rate=negative_sample_rate, @@ -2657,15 +2647,15 @@ def run_unified_umap( target_weight: float = 0.1, spread: float = 2.0, min_dist: float = 1, - fit_n_epochs: int = 200, - tx_n_epochs: int = 100, - set_op_mix_ratio: float = 1.0, + n_epochs: int = 200, repulsion_strength: float = 1.0, initial_alpha: float = 1.0, negative_sample_rate: float = 5, random_seed: int = 4444, ini_embed_with: str = "kmeans", label: str = "unified_UMAP", + parallel: bool = False, + nthreads: int = None, ) -> None: """ Calculates the UMAP embedding for graph obtained using ``load_unified_graph``. @@ -2689,15 +2679,9 @@ def run_unified_umap( manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the scale at which embedded points will be spread out. (Default value: 1) - fit_n_epochs: Same as n_epochs in UMAP package. The number of training epochs to be used in optimizing the - low dimensional embedding. Larger values result in more accurate embeddings. - (Default value: 200) - tx_n_epochs: NUmber of epochs during transform (Default value: 100) - set_op_mix_ratio: Same as set_op_mix_ratio in UMAP package. Interpolate between (fuzzy) union and - intersection as the set operation used to combine local fuzzy simplicial sets to obtain - a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. - The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a - pure fuzzy union, while 0.0 will use a pure fuzzy intersection. + n_epochs: Same as n_epochs in UMAP package. The number of training epochs to be used in optimizing the + low dimensional embedding. Larger values result in more accurate embeddings. + (Default value: 200) repulsion_strength: Same as repulsion_strength in UMAP package. Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples. (Default value: 1.0) @@ -2710,11 +2694,16 @@ def run_unified_umap( random_seed: (Default value: 4444) ini_embed_with: either 'kmeans' or a column from cell metadata to be used as initial embedding coordinates label: base label for UMAP dimensions in the cell metadata column (Default value: 'UMAP') + parallel: Whether to run UMAP in parallel mode. Setting value to True will use `nthreads` threads. + The results are not reproducible in parallel mode. (Default value: False) + nthreads: If parallel=True then this number of threads will be used to run UMAP. By default the `nthreads` + attribute of the class is used. (Default value: None) Returns: None """ from .umap import fit_transform + from .utils import get_log_level if from_assay is None: from_assay = self._defaultAssay @@ -2731,18 +2720,24 @@ def run_unified_umap( ini_embed = self._get_uni_ini_embed( from_assay, cell_key, feat_key, graph, ini_embed_with, n_cells[0] ) - t = fit_transform( + if nthreads is None: + nthreads = self.nthreads + verbose = False + if get_log_level() <= 20: + verbose = True + t, a, b = fit_transform( graph=graph.tocoo(), ini_embed=ini_embed, spread=spread, min_dist=min_dist, - tx_n_epochs=tx_n_epochs, - fit_n_epochs=fit_n_epochs, + n_epochs=n_epochs, random_seed=random_seed, - set_op_mix_ratio=set_op_mix_ratio, repulsion_strength=repulsion_strength, initial_alpha=initial_alpha, negative_sample_rate=negative_sample_rate, + parallel=parallel, + nthreads=nthreads, + verbose=verbose, ) self._save_embedding(from_assay, cell_key, label, t, n_cells, target_names) return None diff --git a/scarf/umap.py b/scarf/umap.py index e91cdff..0965ccf 100644 --- a/scarf/umap.py +++ b/scarf/umap.py @@ -4,11 +4,11 @@ # License: BSD 3 clause import locale +from .utils import logger locale.setlocale(locale.LC_NUMERIC, "C") - -__all__ = ["fit", "transform", "fit_transform"] +__all__ = ["fit_transform"] def simplicial_set_embedding( @@ -34,11 +34,10 @@ def simplicial_set_embedding( import warnings from .utils import tqdm_params - g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0 - g.eliminate_zeros() + # g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0 + # g.eliminate_zeros() epochs_per_sample = make_epochs_per_sample(g.data, n_epochs) - head = g.row - tail = g.col + logger.trace("calculated epochs_per_sample") rng_state = ( check_random_state(random_seed) .randint(np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3) @@ -59,8 +58,8 @@ def simplicial_set_embedding( embedding = optimize_layout_euclidean( embedding, embedding, - head, - tail, + g.row, + g.col, n_epochs, g.shape[1], epochs_per_sample, @@ -77,20 +76,20 @@ def simplicial_set_embedding( return embedding -def fuzzy_simplicial_set(g, set_op_mix_ratio): - tg = g.transpose() - prod = g.multiply(tg) - res = set_op_mix_ratio * (g + tg - prod) + (1.0 - set_op_mix_ratio) * prod - res.eliminate_zeros() - return res.tocoo() +# def fuzzy_simplicial_set(g, set_op_mix_ratio): +# tg = g.transpose() +# prod = g.multiply(tg) +# res = set_op_mix_ratio * (g + tg - prod) + (1.0 - set_op_mix_ratio) * prod +# res.eliminate_zeros() +# return res.tocoo() -def fit( +def fit_transform( graph, - embedding, + ini_embed, spread, min_dist, - set_op_mix_ratio, + # set_op_mix_ratio, n_epochs, random_seed, repulsion_strength, @@ -103,41 +102,11 @@ def fit( from umap.umap_ import find_ab_params a, b = find_ab_params(spread, min_dist) - sym_graph = fuzzy_simplicial_set(graph, set_op_mix_ratio) + logger.trace("Found ab params") + # sym_graph = fuzzy_simplicial_set(graph, set_op_mix_ratio) embedding = simplicial_set_embedding( - sym_graph, - embedding, - n_epochs, - a, - b, - random_seed, - repulsion_strength, - initial_alpha, - negative_sample_rate, - parallel, - nthreads, - verbose, - ) - return embedding, a, b - - -def transform( - graph, - embedding, - a, - b, - n_epochs, - random_seed, - repulsion_strength, - initial_alpha, - negative_sample_rate, - parallel, - nthreads, - verbose, -): - return simplicial_set_embedding( graph, - embedding, + ini_embed, n_epochs, a, b, @@ -149,51 +118,4 @@ def transform( nthreads, verbose, ) - - -def fit_transform( - graph, - ini_embed, - spread: float, - min_dist: float, - tx_n_epochs: int, - fit_n_epochs: int, - random_seed: int, - set_op_mix_ratio: float = 1.0, - repulsion_strength: float = 1.0, - initial_alpha: float = 1.0, - negative_sample_rate: float = 5, - parallel: bool = False, - nthreads: int = 1, - verbose: bool = True, -): - e, a, b = fit( - graph, - ini_embed, - spread=spread, - min_dist=min_dist, - set_op_mix_ratio=set_op_mix_ratio, - n_epochs=fit_n_epochs, - random_seed=random_seed, - repulsion_strength=repulsion_strength, - initial_alpha=initial_alpha, - negative_sample_rate=negative_sample_rate, - parallel=parallel, - nthreads=nthreads, - verbose=verbose, - ) - t = transform( - graph, - e, - a, - b, - n_epochs=tx_n_epochs, - random_seed=random_seed, - repulsion_strength=repulsion_strength, - initial_alpha=initial_alpha, - negative_sample_rate=negative_sample_rate, - parallel=parallel, - nthreads=nthreads, - verbose=verbose, - ) - return t + return embedding, a, b