Skip to content

Commit

Permalink
major changes to umap usage. not using fuzzy_simplicial_set anymore. n…
Browse files Browse the repository at this point in the history
…ot performing a separate transform step. removed set_op_mix_ratio parameter. fit_n_epochs parameter changed to n_epochs
  • Loading branch information
parashardhapola committed Jul 30, 2021
1 parent b454bdf commit a8fb23f
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 130 deletions.
59 changes: 27 additions & 32 deletions scarf/datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -1617,9 +1617,7 @@ def run_umap(
umap_dims: int = 2,
spread: float = 2.0,
min_dist: float = 1,
fit_n_epochs: int = 200,
tx_n_epochs: int = 100,
set_op_mix_ratio: float = 1.0,
n_epochs: int = 300,
repulsion_strength: float = 1.0,
initial_alpha: float = 1.0,
negative_sample_rate: float = 5,
Expand Down Expand Up @@ -1649,15 +1647,9 @@ def run_umap(
manifold are drawn closer together, while larger values will result in a more even dispersal of
points. The value should be set relative to the ``spread`` value, which determines the scale at
which embedded points will be spread out. (Default value: 1)
fit_n_epochs: Same as n_epochs in UMAP package. The number of training epochs to be used in optimizing the
low dimensional embedding. Larger values result in more accurate embeddings.
(Default value: 200)
tx_n_epochs: Number of epochs during transform (Default value: 100)
set_op_mix_ratio: Same as set_op_mix_ratio in UMAP package. Interpolate between (fuzzy) union and
intersection as the set operation used to combine local fuzzy simplicial sets to obtain
a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm.
The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a
pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
n_epochs: Same as n_epochs in UMAP package. The number of epochs to be used in optimizing the
low dimensional embedding. Larger values may result in more accurate embeddings.
(Default value: 300)
repulsion_strength: Same as repulsion_strength in UMAP package. Weighting applied to negative samples in
low dimensional embedding optimization. Values higher than one will result in greater
weight being given to negative samples. (Default value: 1.0)
Expand Down Expand Up @@ -1699,15 +1691,13 @@ def run_umap(
verbose = False
if get_log_level() <= 20:
verbose = True
t = fit_transform(
t, a, b = fit_transform(
graph=graph.tocoo(),
ini_embed=ini_embed,
spread=spread,
min_dist=min_dist,
tx_n_epochs=tx_n_epochs,
fit_n_epochs=fit_n_epochs,
n_epochs=n_epochs,
random_seed=random_seed,
set_op_mix_ratio=set_op_mix_ratio,
repulsion_strength=repulsion_strength,
initial_alpha=initial_alpha,
negative_sample_rate=negative_sample_rate,
Expand Down Expand Up @@ -2657,15 +2647,15 @@ def run_unified_umap(
target_weight: float = 0.1,
spread: float = 2.0,
min_dist: float = 1,
fit_n_epochs: int = 200,
tx_n_epochs: int = 100,
set_op_mix_ratio: float = 1.0,
n_epochs: int = 200,
repulsion_strength: float = 1.0,
initial_alpha: float = 1.0,
negative_sample_rate: float = 5,
random_seed: int = 4444,
ini_embed_with: str = "kmeans",
label: str = "unified_UMAP",
parallel: bool = False,
nthreads: int = None,
) -> None:
"""
Calculates the UMAP embedding for graph obtained using ``load_unified_graph``.
Expand All @@ -2689,15 +2679,9 @@ def run_unified_umap(
manifold are drawn closer together, while larger values will result in a more even dispersal of
points. The value should be set relative to the ``spread`` value, which determines the scale at
which embedded points will be spread out. (Default value: 1)
fit_n_epochs: Same as n_epochs in UMAP package. The number of training epochs to be used in optimizing the
low dimensional embedding. Larger values result in more accurate embeddings.
(Default value: 200)
tx_n_epochs: Number of epochs during transform (Default value: 100)
set_op_mix_ratio: Same as set_op_mix_ratio in UMAP package. Interpolate between (fuzzy) union and
intersection as the set operation used to combine local fuzzy simplicial sets to obtain
a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm.
The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a
pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
n_epochs: Same as n_epochs in UMAP package. The number of training epochs to be used in optimizing the
low dimensional embedding. Larger values result in more accurate embeddings.
(Default value: 200)
repulsion_strength: Same as repulsion_strength in UMAP package. Weighting applied to negative samples in
low dimensional embedding optimization. Values higher than one will result in greater
weight being given to negative samples. (Default value: 1.0)
Expand All @@ -2710,11 +2694,16 @@ def run_unified_umap(
random_seed: Seed used to initialise the random state for the UMAP layout optimisation. (Default value: 4444)
ini_embed_with: either 'kmeans' or a column from cell metadata to be used as initial embedding coordinates
label: base label for UMAP dimensions in the cell metadata column (Default value: 'unified_UMAP')
parallel: Whether to run UMAP in parallel mode. Setting value to True will use `nthreads` threads.
The results are not reproducible in parallel mode. (Default value: False)
nthreads: If parallel=True then this number of threads will be used to run UMAP. By default the `nthreads`
attribute of the class is used. (Default value: None)
Returns:
None
"""
from .umap import fit_transform
from .utils import get_log_level

if from_assay is None:
from_assay = self._defaultAssay
Expand All @@ -2731,18 +2720,24 @@ def run_unified_umap(
ini_embed = self._get_uni_ini_embed(
from_assay, cell_key, feat_key, graph, ini_embed_with, n_cells[0]
)
t = fit_transform(
if nthreads is None:
nthreads = self.nthreads
verbose = False
if get_log_level() <= 20:
verbose = True
t, a, b = fit_transform(
graph=graph.tocoo(),
ini_embed=ini_embed,
spread=spread,
min_dist=min_dist,
tx_n_epochs=tx_n_epochs,
fit_n_epochs=fit_n_epochs,
n_epochs=n_epochs,
random_seed=random_seed,
set_op_mix_ratio=set_op_mix_ratio,
repulsion_strength=repulsion_strength,
initial_alpha=initial_alpha,
negative_sample_rate=negative_sample_rate,
parallel=parallel,
nthreads=nthreads,
verbose=verbose,
)
self._save_embedding(from_assay, cell_key, label, t, n_cells, target_names)
return None
Expand Down
118 changes: 20 additions & 98 deletions scarf/umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
# License: BSD 3 clause

import locale
from .utils import logger

locale.setlocale(locale.LC_NUMERIC, "C")


__all__ = ["fit", "transform", "fit_transform"]
__all__ = ["fit_transform"]


def simplicial_set_embedding(
Expand All @@ -34,11 +34,10 @@ def simplicial_set_embedding(
import warnings
from .utils import tqdm_params

g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0
g.eliminate_zeros()
# g.data[g.data < (g.data.max() / float(n_epochs))] = 0.0
# g.eliminate_zeros()
epochs_per_sample = make_epochs_per_sample(g.data, n_epochs)
head = g.row
tail = g.col
logger.trace("calculated epochs_per_sample")
rng_state = (
check_random_state(random_seed)
.randint(np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3)
Expand All @@ -59,8 +58,8 @@ def simplicial_set_embedding(
embedding = optimize_layout_euclidean(
embedding,
embedding,
head,
tail,
g.row,
g.col,
n_epochs,
g.shape[1],
epochs_per_sample,
Expand All @@ -77,20 +76,20 @@ def simplicial_set_embedding(
return embedding


def fuzzy_simplicial_set(g, set_op_mix_ratio):
tg = g.transpose()
prod = g.multiply(tg)
res = set_op_mix_ratio * (g + tg - prod) + (1.0 - set_op_mix_ratio) * prod
res.eliminate_zeros()
return res.tocoo()
# def fuzzy_simplicial_set(g, set_op_mix_ratio):
# tg = g.transpose()
# prod = g.multiply(tg)
# res = set_op_mix_ratio * (g + tg - prod) + (1.0 - set_op_mix_ratio) * prod
# res.eliminate_zeros()
# return res.tocoo()


def fit(
def fit_transform(
graph,
embedding,
ini_embed,
spread,
min_dist,
set_op_mix_ratio,
# set_op_mix_ratio,
n_epochs,
random_seed,
repulsion_strength,
Expand All @@ -103,41 +102,11 @@ def fit(
from umap.umap_ import find_ab_params

a, b = find_ab_params(spread, min_dist)
sym_graph = fuzzy_simplicial_set(graph, set_op_mix_ratio)
logger.trace("Found ab params")
# sym_graph = fuzzy_simplicial_set(graph, set_op_mix_ratio)
embedding = simplicial_set_embedding(
sym_graph,
embedding,
n_epochs,
a,
b,
random_seed,
repulsion_strength,
initial_alpha,
negative_sample_rate,
parallel,
nthreads,
verbose,
)
return embedding, a, b


def transform(
graph,
embedding,
a,
b,
n_epochs,
random_seed,
repulsion_strength,
initial_alpha,
negative_sample_rate,
parallel,
nthreads,
verbose,
):
return simplicial_set_embedding(
graph,
embedding,
ini_embed,
n_epochs,
a,
b,
Expand All @@ -149,51 +118,4 @@ def transform(
nthreads,
verbose,
)


def fit_transform(
graph,
ini_embed,
spread: float,
min_dist: float,
tx_n_epochs: int,
fit_n_epochs: int,
random_seed: int,
set_op_mix_ratio: float = 1.0,
repulsion_strength: float = 1.0,
initial_alpha: float = 1.0,
negative_sample_rate: float = 5,
parallel: bool = False,
nthreads: int = 1,
verbose: bool = True,
):
e, a, b = fit(
graph,
ini_embed,
spread=spread,
min_dist=min_dist,
set_op_mix_ratio=set_op_mix_ratio,
n_epochs=fit_n_epochs,
random_seed=random_seed,
repulsion_strength=repulsion_strength,
initial_alpha=initial_alpha,
negative_sample_rate=negative_sample_rate,
parallel=parallel,
nthreads=nthreads,
verbose=verbose,
)
t = transform(
graph,
e,
a,
b,
n_epochs=tx_n_epochs,
random_seed=random_seed,
repulsion_strength=repulsion_strength,
initial_alpha=initial_alpha,
negative_sample_rate=negative_sample_rate,
parallel=parallel,
nthreads=nthreads,
verbose=verbose,
)
return t
return embedding, a, b

0 comments on commit a8fb23f

Please sign in to comment.