From e6a27139dd74b9f548b6cb921f474bb0a134cc92 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:51:37 +0200 Subject: [PATCH 01/54] fixed conflicts --- tensorflow_probability/python/experimental/vi/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py index 0cb4971fcc..e18c8d3455 100644 --- a/tensorflow_probability/python/experimental/vi/__init__.py +++ b/tensorflow_probability/python/experimental/vi/__init__.py @@ -29,6 +29,7 @@ 'build_affine_surrogate_posterior', 'build_affine_surrogate_posterior_from_base_distribution', 'build_asvi_surrogate_posterior', + 'builf_cf_surrogate_posterior' 'build_factored_surrogate_posterior', 'build_split_flow_surrogate_posterior', 'build_trainable_location_scale_distribution', From c501d2bb6a120fb1d41068bc0a94d7233ee02f4c Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:52:05 +0200 Subject: [PATCH 02/54] Revert "Revert "initial tests, updated init and build"" This reverts commit 5bb28b08 --- .../python/experimental/vi/BUILD | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD index e57f884ca5..863e0aeef2 100644 --- a/tensorflow_probability/python/experimental/vi/BUILD +++ b/tensorflow_probability/python/experimental/vi/BUILD @@ -31,6 +31,7 @@ py_library( srcs_version = "PY3", deps = [ ":automatic_structured_vi", + ":cascading_flows", ":surrogate_posteriors", "//tensorflow_probability/python/experimental/vi/util", "//tensorflow_probability/python/internal:all_util", @@ -67,6 +68,36 @@ py_library( ], ) +py_library( + name = "cascading_flows", + srcs = ["cascading_flows.py.py"], + srcs_version = "PY3", + deps = [ + # tensorflow dep, + "//tensorflow_probability/python/bijectors:build_highway_flow_layer", + "//tensorflow_probability/python/bijectors:chain", + "//tensorflow_probability/python/bijectors:reshape", + "//tensorflow_probability/python/bijectors:scale", + "//tensorflow_probability/python/bijectors:shift", + "//tensorflow_probability/python/bijectors:split", + "//tensorflow_probability/python/distributions:batch_broadcast", + "//tensorflow_probability/python/distributions:beta", + "//tensorflow_probability/python/distributions:blockwise", + "//tensorflow_probability/python/distributions:chi2", + "//tensorflow_probability/python/distributions:exponential", + "//tensorflow_probability/python/distributions:gamma", + "//tensorflow_probability/python/distributions:half_normal", + "//tensorflow_probability/python/distributions:joint_distribution_auto_batched", + "//tensorflow_probability/python/distributions:joint_distribution_coroutine", + "//tensorflow_probability/python/distributions:normal", + "//tensorflow_probability/python/distributions:sample", + "//tensorflow_probability/python/distributions:transformed_distribution", + "//tensorflow_probability/python/distributions:truncated_normal", + "//tensorflow_probability/python/distributions:uniform", + "//tensorflow_probability/python/internal:samplers", + ], +) + py_library( name = "surrogate_posteriors", srcs = ["surrogate_posteriors.py"], @@ -111,6 +142,22 @@ py_test( ], ) +py_test( + name = "cascading_flows_test", + size = "large", + srcs = ["cascading_flows_test.py"], + python_version = "PY3", + shard_count = 4, + srcs_version = "PY3", + deps = [ + # absl/testing:parameterized dep, + # numpy dep, + # tensorflow dep, + "//tensorflow_probability", + "//tensorflow_probability/python/internal:test_util", + ], +) + py_test( name = "surrogate_posteriors_test", size = "large", From b6be9d96c1bcc278e5f3278dbec76ef480090e9a Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:54:47 +0200 Subject: [PATCH 03/54] reverted commit --- .../python/experimental/vi/cascading_flows.py | 483 ++++++++++++++++++ 1 file changed, 483 insertions(+) create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows.py diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py new file mode 100644 index 0000000000..d8c9393d8e --- /dev/null +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -0,0 +1,483 @@ +# Copyright 2021 The TensorFlow Probability Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Utilities for constructing structured surrogate posteriors.""" + +from __future__ import absolute_import +from __future__ import division +# [internal] enable type annotations +from __future__ import print_function + +import copy +import functools +import inspect + +import tensorflow.compat.v2 as tf + +from tensorflow_probability.python.experimental.bijectors import \ + build_highway_flow_layer +from tensorflow_probability.python.bijectors import chain +from tensorflow_probability.python.bijectors import reshape +from tensorflow_probability.python.bijectors import scale as scale_lib +from tensorflow_probability.python.bijectors import shift +from tensorflow_probability.python.bijectors import split + +from tensorflow_probability.python.distributions import batch_broadcast +from tensorflow_probability.python.distributions import beta +from tensorflow_probability.python.distributions import blockwise +from tensorflow_probability.python.distributions import chi2 +from tensorflow_probability.python.distributions import exponential +from tensorflow_probability.python.distributions import gamma +from tensorflow_probability.python.distributions import half_normal +from tensorflow_probability.python.distributions import \ + joint_distribution_auto_batched +from tensorflow_probability.python.distributions import \ + joint_distribution_coroutine +from tensorflow_probability.python.distributions import normal +from tensorflow_probability.python.distributions import sample +from tensorflow_probability.python.distributions import transformed_distribution +from tensorflow_probability.python.distributions import truncated_normal +from tensorflow_probability.python.distributions import uniform +from tensorflow_probability.python.internal import samplers + +__all__ = [ + 'register_asvi_substitution_rule', + 'build_cf_surrogate_posterior' +] + +Root = joint_distribution_coroutine.JointDistributionCoroutine.Root + +_NON_STATISTICAL_PARAMS = [ + 'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum', + 'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support', + 'num_probit_terms_approx' +] +_NON_TRAINABLE_PARAMS = ['low', 'high'] + +# Registry of transformations that are applied to distributions in the prior +# before defining the surrogate family. + + +# Todo: inherited from asvi code, do we need this? +ASVI_SURROGATE_SUBSTITUTIONS = {} + + +# Todo: inherited from asvi code, do we need this? +def _as_substituted_distribution(distribution): + """Applies all substitution rules that match a distribution.""" + for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items(): + if condition(distribution): + distribution = substitution_fn(distribution) + return distribution + + +# Todo: inherited from asvi code, do we need this? +def register_asvi_substitution_rule(condition, substitution_fn): + """Registers a rule for substituting distributions in ASVI surrogates. + + Args: + condition: Python `callable` that takes a Distribution instance and + returns a Python `bool` indicating whether or not to substitute it. + May also be a class type such as `tfd.Normal`, in which case the + condition is interpreted as + `lambda distribution: isinstance(distribution, class)`. + substitution_fn: Python `callable` that takes a Distribution + instance and returns a new Distribution instance used to define + the ASVI surrogate posterior. Note that this substitution does not modify + the original model. + + #### Example + + To use a Normal surrogate for all location-scale family distributions, we + could register the substitution: + + ```python + tfp.experimental.vi.register_asvi_surrogate_substitution( + condition=lambda distribution: ( + hasattr(distribution, 'loc') and hasattr(distribution, 'scale')) + substitution_fn=lambda distribution: ( + # Invoking the event space bijector applies any relevant constraints, + # e.g., that HalfCauchy samples must be `>= loc`. + distribution.experimental_default_event_space_bijector()( + tfd.Normal(loc=distribution.loc, scale=distribution.scale))) + ``` + + This rule will fire when ASVI encounters a location-scale distribution, + and instructs ASVI to build a surrogate 'as if' the model had just used a + (possibly constrained) Normal in its place. Note that we could have used a + more precise condition, e.g., to limit the substitution to distributions with + a specific `name`, if we had reason to think that a Normal distribution would + be a good surrogate for some model variables but not others. + + """ + global ASVI_SURROGATE_SUBSTITUTIONS + if inspect.isclass(condition): + condition = lambda distribution, cls=condition: isinstance( + # pylint: disable=g-long-lambda + distribution, cls) + ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn + + +# Default substitutions attempt to express distributions using the most +# flexible available parameterization. +# pylint: disable=g-long-lambda +register_asvi_substitution_rule( + half_normal.HalfNormal, + lambda dist: truncated_normal.TruncatedNormal( + loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) +register_asvi_substitution_rule( + uniform.Uniform, + lambda dist: shift.Shift(dist.low)( + scale_lib.Scale(dist.high - dist.low)( + beta.Beta(concentration0=tf.ones_like(dist.mean()), + concentration1=1.)))) +register_asvi_substitution_rule( + exponential.Exponential, + lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) +register_asvi_substitution_rule( + chi2.Chi2, + lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) + + +# pylint: enable=g-long-lambda + +# a single JointDistribution. +def build_cf_surrogate_posterior( + prior, + num_auxiliary_variables=0, + initial_prior_weight=0.5, + seed=None, + name=None): + # todo: change docstrings + """Builds a structured surrogate posterior inspired by conjugate updating. + + ASVI, or Automatic Structured Variational Inference, was proposed by + Ambrogioni et al. (2020) [1] as a method of automatically constructing a + surrogate posterior with the same structure as the prior. It does this by + reparameterizing the variational family of the surrogate posterior by + structuring each parameter according to the equation + ```none + prior_weight * prior_parameter + (1 - prior_weight) * mean_field_parameter + ``` + In this equation, `prior_parameter` is a vector of prior parameters and + `mean_field_parameter` is a vector of trainable parameters with the same + domain as `prior_parameter`. `prior_weight` is a vector of learnable + parameters where `0. <= prior_weight <= 1.`. When `prior_weight = + 0`, the surrogate posterior will be a mean-field surrogate, and when + `prior_weight = 1.`, the surrogate posterior will be the prior. This convex + combination equation, inspired by conjugacy in exponential families, thus + allows the surrogate posterior to balance between the structure of the prior + and the structure of a mean-field approximation. + + Args: + prior: tfd.JointDistribution instance of the prior. + mean_field: Optional Python boolean. If `True`, creates a degenerate + surrogate distribution in which all variables are independent, + ignoring the prior dependence structure. Default value: `False`. + initial_prior_weight: Optional float value (either static or tensor value) + on the interval [0, 1]. A larger value creates an initial surrogate + distribution with more dependence on the prior structure. Default value: + `0.5`. + seed: Python `int` seed for random initialization. + name: Optional string. Default value: `build_cf_surrogate_posterior`. + + Returns: + surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance + whose samples have shape and structure matching that of `prior`. + + Raises: + TypeError: The `prior` argument cannot be a nested `JointDistribution`. + + ### Examples + + Consider a Brownian motion model expressed as a JointDistribution: + + ```python + prior_loc = 0. + innovation_noise = .1 + + def model_fn(): + new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) + for i in range(4): + new = yield tfd.Normal(loc=new, scale=innovation_noise) + + prior = tfd.JointDistributionCoroutineAutoBatched(model_fn) + ``` + + Let's use variational inference to approximate the posterior. We'll build a + surrogate posterior distribution by feeding in the prior distribution. + + ```python + surrogate_posterior = + tfp.experimental.vi.build_cf_surrogate_posterior(prior) + ``` + + This creates a trainable joint distribution, defined by variables in + `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior` + to fit this distribution by minimizing a divergence to the true posterior. + + ```python + losses = tfp.vi.fit_surrogate_posterior( + target_log_prob_fn, + surrogate_posterior=surrogate_posterior, + num_steps=100, + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) + + # After optimization, samples from the surrogate will approximate + # samples from the true posterior. + samples = surrogate_posterior.sample(100) + posterior_mean = [tf.reduce_mean(x) for x in samples] + posterior_std = [tf.math.reduce_std(x) for x in samples] + ``` + + #### References + [1]: Luca Ambrogioni, Max Hinne, Marcel van Gerven. Automatic structured + variational inference. _arXiv preprint arXiv:2002.00643_, 2020 + https://arxiv.org/abs/2002.00643 + + """ + with tf.name_scope(name or 'build_cf_surrogate_posterior'): + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist=prior, + base_distribution_surrogate_fn=functools.partial( + _cf_convex_update_for_base_distribution, + initial_prior_weight=initial_prior_weight, + num_auxiliary_variables=num_auxiliary_variables), + seed=seed) + surrogate_posterior.also_track = variables + return surrogate_posterior + + +def _cf_surrogate_for_distribution(dist, + base_distribution_surrogate_fn, + sample_shape=None, + variables=None, + seed=None): + # todo: change docstrings + """Recursively creates ASVI surrogates, and creates new variables if needed. + + Args: + dist: a `tfd.Distribution` instance. + base_distribution_surrogate_fn: Callable to build a surrogate posterior + for a 'base' (non-meta and non-joint) distribution, with signature + `surrogate_posterior, variables = base_distribution_fn( + dist, sample_shape=None, variables=None, seed=None)`. + sample_shape: Optional `Tensor` shape of samples drawn from `dist` by + `tfd.Sample` wrappers. If not `None`, the surrogate's event will include + independent sample dimensions, i.e., it will have event shape + `concat([sample_shape, dist.event_shape], axis=0)`. + Default value: `None`. + variables: Optional nested structure of `tf.Variable`s returned from a + previous call to `_cf_surrogate_for_distribution`. If `None`, + new variables will be created; otherwise, constructs a surrogate posterior + backed by the passed-in variables. + Default value: `None`. + seed: Python `int` seed for random initialization. + Returns: + surrogate_posterior: Instance of `tfd.Distribution` representing a trainable + surrogate posterior distribution, with the same structure and `name` as + `dist`. + variables: Nested structure of `tf.Variable` trainable parameters for the + surrogate posterior. If `dist` is a base distribution, this is + a `dict` of `ASVIParameters` instances. If `dist` is a joint + distribution, this is a `dist.dtype` structure of such `dict`s. + """ + + # Apply any substitutions, while attempting to preserve the original name. + dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) + + if hasattr(dist, '_model_coroutine'): + surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=variables, + seed=seed) + else: + surrogate_posterior, variables = base_distribution_surrogate_fn( + dist=dist, sample_shape=sample_shape, variables=variables, seed=seed) + return surrogate_posterior, variables + + +def _cf_surrogate_for_joint_distribution( + dist, base_distribution_surrogate_fn, variables=None, seed=None): + """Builds a structured joint surrogate posterior for a joint model.""" + + # Probabilistic program for CF surrogate posterior. + flat_variables = dist._model_flatten( + variables) if variables else None # pylint: disable=protected-access + prior_coroutine = dist._model_coroutine # pylint: disable=protected-access + + def posterior_generator(seed=seed): + prior_gen = prior_coroutine() + dist = next(prior_gen) + i = 0 + try: + while True: + was_root = isinstance(dist, Root) + if was_root: + dist = dist.distribution + + seed, init_seed = samplers.split_seed(seed) + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=flat_variables[i] if flat_variables else None, + seed=init_seed) + + if was_root: + surrogate_posterior = Root(surrogate_posterior) + # If variables were not given---i.e., we're creating new + # variables---then yield the new variables along with the surrogate + # posterior. This assumes an execution context such as + # `_extract_variables_from_coroutine_model` below that will capture and + # save the variables. + value_out = yield (surrogate_posterior if flat_variables + else (surrogate_posterior, variables)) + if type(value_out) == list: + if len(dist.event_shape) == 0: + dist = prior_gen.send(tf.squeeze(value_out[0], -1)) + else: + dist = prior_gen.send(value_out[0]) + + else: + dist = prior_gen.send(value_out) + i += 1 + except StopIteration: + pass + + if variables is None: + # Run the generator to create variables, then call ourselves again + # to construct the surrogate JD from these variables. Note that we can't + # just create a JDC from the current `posterior_generator`, because it will + # try to build new variables on every invocation; the recursive call will + # define a new `posterior_generator` that knows about the variables we're + # about to create. + return _cf_surrogate_for_joint_distribution( + dist=dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=dist._model_unflatten( # pylint: disable=protected-access + _extract_variables_from_coroutine_model( + posterior_generator, seed=seed))) + + # Temporary workaround for bijector caching issues with autobatched JDs. + surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched + if not hasattr(dist, 'use_vectorized_map'): + surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine + surrogate_posterior = surrogate_type(posterior_generator, + name=_get_name(dist)) + + # Ensure that the surrogate posterior structure matches that of the prior. + # todo: check me, do we need this? in case needs to be modified + # if we use auxiliary variables, then the structure won't match the one of the + # prior + '''try: + tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) + except TypeError: + tokenize = lambda jd: jd._model_unflatten( + # pylint: disable=protected-access, g-long-lambda + range(len(jd._model_flatten(jd.dtype))) + # pylint: disable=protected-access + ) + surrogate_posterior = restructure.Restructure( + output_structure=tokenize(dist), + input_structure=tokenize(surrogate_posterior))( + surrogate_posterior, name=_get_name(dist))''' + return surrogate_posterior, variables + + +# todo: sample_shape and seed are not used.. maybe they should? +def _cf_convex_update_for_base_distribution(dist, + initial_prior_weight, + num_auxiliary_variables=0, + sample_shape=None, + variables=None, + seed=None): + """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" + + if variables is None: + actual_event_shape = dist.event_shape_tensor() + int_event_shape = int(actual_event_shape) if \ + actual_event_shape.shape.as_list()[0] > 0 else 1 + layers = 3 + bijectors = [reshape.Reshape([-1], + event_shape_in=actual_event_shape + + num_auxiliary_variables)] + + for _ in range(0, layers - 1): + bijectors.append( + build_highway_flow_layer( + tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + residual_fraction_initial_value=initial_prior_weight, + activation_fn=True, gate_first_n=int_event_shape)) + bijectors.append( + build_highway_flow_layer( + tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + residual_fraction_initial_value=initial_prior_weight, + activation_fn=False, gate_first_n=int_event_shape)) + bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + + variables = chain.Chain(bijectors=list(reversed(bijectors))) + + if num_auxiliary_variables > 0: + cascading_flows = split.Split( + [-1, num_auxiliary_variables])( + transformed_distribution.TransformedDistribution( + distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast( + sample.Sample(normal.Normal(0., .1), num_auxiliary_variables), + to_shape=dist.batch_shape)]), + bijector=variables)) + + else: + cascading_flows = transformed_distribution.TransformedDistribution( + distribution=dist, + bijector=variables) + + return cascading_flows, variables + + +def _extract_variables_from_coroutine_model(model_fn, seed=None): + """Extracts variables from a generator that yields (dist, variables) pairs.""" + gen = model_fn() + try: + dist, dist_variables = next(gen) + flat_variables = [dist_variables] + while True: + seed, local_seed = samplers.split_seed(seed, n=2) + sampled_value = (dist.distribution.sample(seed=local_seed) + if isinstance(dist, Root) + else dist.sample(seed=local_seed)) + dist, dist_variables = gen.send( + sampled_value) # tf.concat(sampled_value, axis=0) + flat_variables.append(dist_variables) + except StopIteration: + pass + return flat_variables + + +def _set_name(dist, name): + """Copies a distribution-like object, replacing its name.""" + if hasattr(dist, 'copy'): + return dist.copy(name=name) + # Some distribution-like entities such as JointDistributionPinned don't + # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set + # the name directly. + dist = copy.copy(dist) + dist._name = name # pylint: disable=protected-access + return dist + + +def _get_name(dist): + """Attempts to get a distribution's short name, excluding the name scope.""" + return getattr(dist, 'parameters', {}).get('name', dist.name) From dbf371b97027fcdd7b85f96bfc0e4d00dccc1bd9 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:55:54 +0200 Subject: [PATCH 04/54] Revert "removed cascading_flows from pr" This reverts commit 1620ebd2 --- .../experimental/bijectors/highway_flow.py | 1 + .../experimental/vi/cascading_flows_test.py | 354 ++++++++++++++++++ 2 files changed, 355 insertions(+) create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows_test.py diff --git a/tensorflow_probability/python/experimental/bijectors/highway_flow.py b/tensorflow_probability/python/experimental/bijectors/highway_flow.py index bdfed9b2e8..6f26abe72f 100644 --- a/tensorflow_probability/python/experimental/bijectors/highway_flow.py +++ b/tensorflow_probability/python/experimental/bijectors/highway_flow.py @@ -26,6 +26,7 @@ from tensorflow_probability.python.internal import tensor_util + def build_highway_flow_layer(width, residual_fraction_initial_value=0.5, activation_fn=False, diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py new file mode 100644 index 0000000000..9c4393be24 --- /dev/null +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -0,0 +1,354 @@ +# Copyright 2021 The TensorFlow Probability Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Tests for structured surrogate posteriors.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import tensorflow.compat.v1 as tf1 +import tensorflow.compat.v2 as tf +import tensorflow_probability as tfp +from tensorflow_probability.python.experimental.vi import cascading_flows +from tensorflow_probability.python.internal import prefer_static as ps +from tensorflow_probability.python.internal import test_util + + +tfb = tfp.bijectors +tfd = tfp.distributions + + +@test_util.test_all_tf_execution_regimes +class _TrainableCFSurrogate(object): + + def _expected_num_trainable_variables(self, prior_dist): + """Infers the expected number of trainable variables for a non-nested JD.""" + prior_dists = prior_dist._get_single_sample_distributions() # pylint: disable=protected-access + expected_num_trainable_variables = 0 + for original_dist in prior_dists: + try: + original_dist = original_dist.distribution + except AttributeError: + pass + dist = cascading_flows._as_substituted_distribution(original_dist) + dist_params = dist.parameters + for param, value in dist_params.items(): + if (param not in cascading_flows._NON_STATISTICAL_PARAMS + and value is not None and param not in ('low', 'high')): + # One variable each for prior_weight, mean_field_parameter. + expected_num_trainable_variables += 2 + return expected_num_trainable_variables + + def test_dims_and_gradients(self): + + prior_dist = self.make_prior_dist() + + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist) + + # Test that the correct number of trainable variables are being tracked + self.assertLen(surrogate_posterior.trainable_variables, + self._expected_num_trainable_variables(prior_dist)) + + # Test that the sample shape is correct + three_posterior_samples = surrogate_posterior.sample( + 3, seed=test_util.test_seed(sampler_type='stateless')) + three_prior_samples = prior_dist.sample( + 3, seed=test_util.test_seed(sampler_type='stateless')) + self.assertAllEqualNested( + [s.shape for s in tf.nest.flatten(three_prior_samples)], + [s.shape for s in tf.nest.flatten(three_posterior_samples)]) + + # Test that gradients are available wrt the variational parameters. + posterior_sample = surrogate_posterior.sample( + seed=test_util.test_seed(sampler_type='stateless')) + with tf.GradientTape() as tape: + posterior_logprob = surrogate_posterior.log_prob(posterior_sample) + grad = tape.gradient(posterior_logprob, + surrogate_posterior.trainable_variables) + self.assertTrue(all(g is not None for g in grad)) + + def test_initialization_is_deterministic_following_seed(self): + prior_dist = self.make_prior_dist() + + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist, + seed=test_util.test_seed(sampler_type='stateless')) + self.evaluate( + [v.initializer for v in surrogate_posterior.trainable_variables]) + posterior_sample = surrogate_posterior.sample( + seed=test_util.test_seed(sampler_type='stateless')) + + surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist, + seed=test_util.test_seed(sampler_type='stateless')) + self.evaluate( + [v.initializer for v in surrogate_posterior2.trainable_variables]) + posterior_sample2 = surrogate_posterior2.sample( + seed=test_util.test_seed(sampler_type='stateless')) + + self.assertAllEqualNested(posterior_sample, posterior_sample2) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + + def _prior_model_fn(): + innovation_noise = 0.1 + prior_loc = 0. + new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) + for _ in range(4): + new = yield tfd.Normal(loc=new, scale=innovation_noise) + + return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) + + def make_likelihood_model(self, x, observation_noise): + + def _likelihood_model(): + for i in range(5): + yield tfd.Normal(loc=x[i], scale=observation_noise) + + return tfd.JointDistributionCoroutineAutoBatched(_likelihood_model) + + def get_observations(self, prior_dist): + observation_noise = 0.15 + ground_truth = prior_dist.sample() + likelihood = self.make_likelihood_model( + x=ground_truth, observation_noise=observation_noise) + return likelihood.sample(1) + + def get_target_log_prob(self, observations, prior_dist): + + def target_log_prob(*x): + observation_noise = 0.15 + likelihood_dist = self.make_likelihood_model( + x=x, observation_noise=observation_noise) + return likelihood_dist.log_prob(observations) + prior_dist.log_prob(x) + + return target_log_prob + + def test_fitting_surrogate_posterior(self): + + prior_dist = self.make_prior_dist() + observations = self.get_observations(prior_dist) + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist) + target_log_prob = self.get_target_log_prob(observations, prior_dist) + + # Test vi fit surrogate posterior works + losses = tfp.vi.fit_surrogate_posterior( + target_log_prob, + surrogate_posterior, + num_steps=5, # Don't optimize to completion. + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) + + # Compute posterior statistics. + with tf.control_dependencies([losses]): + posterior_samples = surrogate_posterior.sample(100) + posterior_mean = tf.nest.map_structure(tf.reduce_mean, posterior_samples) + posterior_stddev = tf.nest.map_structure(tf.math.reduce_std, + posterior_samples) + + self.evaluate(tf1.global_variables_initializer()) + _ = self.evaluate(losses) + _ = self.evaluate(posterior_mean) + _ = self.evaluate(posterior_stddev) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestEightSchools(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + treatment_effects = tf.constant([28, 8, -3, 7, -1, 1, 18, 12], + dtype=tf.float32) + num_schools = ps.shape(treatment_effects)[-1] + + return tfd.JointDistributionNamed({ + 'avg_effect': + tfd.Normal(loc=0., scale=10., name='avg_effect'), + 'log_stddev': + tfd.Normal(loc=5., scale=1., name='log_stddev'), + 'school_effects': + lambda log_stddev, avg_effect: ( # pylint: disable=g-long-lambda + tfd.Independent( + tfd.Normal( + loc=avg_effect[..., None] * tf.ones(num_schools), + scale=tf.exp(log_stddev[..., None]) * tf.ones( + num_schools), + name='school_effects'), + reinterpreted_batch_ndims=1)) + }) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestEightSchoolsSample(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + + return tfd.JointDistributionNamed({ + 'avg_effect': + tfd.Normal(loc=0., scale=10., name='avg_effect'), + 'log_stddev': + tfd.Normal(loc=5., scale=1., name='log_stddev'), + 'school_effects': + lambda log_stddev, avg_effect: ( # pylint: disable=g-long-lambda + tfd.Sample( + tfd.Normal( + loc=avg_effect[..., None], + scale=tf.exp(log_stddev[..., None]), + name='school_effects'), + sample_shape=[8])) + }) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestHalfNormal(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + + def _prior_model_fn(): + innovation_noise = 1. + yield tfd.HalfNormal( + scale=innovation_noise, validate_args=True, allow_nan_stats=False) + + return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestDiscreteLatent( + test_util.TestCase, _TrainableCFSurrogate): + + def make_prior_dist(self): + + def _prior_model_fn(): + a = yield tfd.Bernoulli(logits=0.5, name='a') + yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1., + scale=1., name='b') + + return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestNesting(test_util.TestCase, + _TrainableCFSurrogate): + + def _expected_num_trainable_variables(self, _): + # Nested distributions have total of 10 params after Exponential->Gamma + # substitution, multiplied by 2 variables per param. + return 20 + + def make_prior_dist(self): + + def nested_model(): + a = yield tfd.Sample( + tfd.Sample( + tfd.Normal(0., 1.), + sample_shape=4), + sample_shape=[2], + name='a') + b = yield tfb.Sigmoid()( + tfb.Square()( + tfd.Exponential(rate=tf.exp(a))), + name='b') + # pylint: disable=g-long-lambda + yield tfd.JointDistributionSequential( + [tfd.Laplace(loc=a, scale=b), + lambda c1: tfd.Independent( + tfd.Beta(concentration1=1., + concentration0=tf.nn.softplus(c1)), + reinterpreted_batch_ndims=1), + lambda c1, c2: tfd.JointDistributionNamed({ + 'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)}) + ], name='c') + # pylint: enable=g-long-lambda + + return tfd.JointDistributionCoroutineAutoBatched(nested_model) + + +@test_util.test_all_tf_execution_regimes +class TestCFDistributionSubstitution(test_util.TestCase): + + def test_default_substitutes_trainable_families(self): + + @tfd.JointDistributionCoroutineAutoBatched + def model(): + yield tfd.Sample( + tfd.Uniform(low=-2., high=7.), + sample_shape=[2], + name='a') + yield tfd.HalfNormal(1., name='b') + yield tfd.Exponential(rate=[1., 2.], name='c') + yield tfd.Chi2(df=3., name='d') + + surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( + model) + self.assertAllEqualNested(model.event_shape, surrogate.event_shape) + + surrogate_dists, _ = surrogate.sample_distributions() + self.assertIsInstance(surrogate_dists.a, tfd.Independent) + self.assertIsInstance(surrogate_dists.a.distribution, + tfd.TransformedDistribution) + self.assertIsInstance(surrogate_dists.a.distribution.distribution, + tfd.Beta) + self.assertIsInstance(surrogate_dists.b, tfd.TruncatedNormal) + self.assertIsInstance(surrogate_dists.c, tfd.Gamma) + self.assertIsInstance(surrogate_dists.d, tfd.Gamma) + + def test_can_specify_custom_substitution(self): + + @tfd.JointDistributionCoroutineAutoBatched + def centered_horseshoe(ndims=100): + global_scale = yield tfd.HalfCauchy( + loc=0., scale=1., name='global_scale') + local_scale = yield tfd.HalfCauchy( + loc=0., scale=tf.ones([ndims]), name='local_scale') + yield tfd.Normal( + loc=0., scale=tf.sqrt(global_scale * local_scale), name='weights') + + tfp.experimental.vi.register_asvi_substitution_rule( + condition=tfd.HalfCauchy, + substitution_fn=( + lambda d: tfb.Softplus(1e-6)(tfd.Normal(loc=d.loc, scale=d.scale)))) + surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( + centered_horseshoe) + self.assertAllEqualNested(centered_horseshoe.event_shape, + surrogate.event_shape) + + # If the surrogate was built with names or structure differing from the + # model, so that it had to be `tfb.Restructure`'d, then this + # sample_distributions call will fail because the surrogate isn't an + # instance of tfd.JointDistribution. + surrogate_dists, _ = surrogate.sample_distributions() + self.assertIsInstance(surrogate_dists.global_scale.distribution, + tfd.Normal) + self.assertIsInstance(surrogate_dists.local_scale.distribution, + tfd.Normal) + self.assertIsInstance(surrogate_dists.weights, tfd.Normal) + +# TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions. +# TODO(kateslin): Add an ASVI surrogate posterior test with for a model with +# missing observations. + +if __name__ == '__main__': + tf.test.main() From c6118b13c2b8b04358bab2e9e865830a540cff06 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:56:49 +0200 Subject: [PATCH 05/54] reverted to latest version --- .../python/experimental/vi/cascading_flows.py | 101 ++++++++++++++---- 1 file changed, 80 insertions(+), 21 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index d8c9393d8e..61dcce7236 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -25,21 +25,20 @@ import tensorflow.compat.v2 as tf -from tensorflow_probability.python.experimental.bijectors import \ - build_highway_flow_layer from tensorflow_probability.python.bijectors import chain from tensorflow_probability.python.bijectors import reshape from tensorflow_probability.python.bijectors import scale as scale_lib from tensorflow_probability.python.bijectors import shift from tensorflow_probability.python.bijectors import split - from tensorflow_probability.python.distributions import batch_broadcast from tensorflow_probability.python.distributions import beta from tensorflow_probability.python.distributions import blockwise from tensorflow_probability.python.distributions import chi2 +from tensorflow_probability.python.distributions import deterministic from tensorflow_probability.python.distributions import exponential from tensorflow_probability.python.distributions import gamma from tensorflow_probability.python.distributions import half_normal +from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import \ joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ @@ -49,10 +48,12 @@ from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.distributions import truncated_normal from tensorflow_probability.python.distributions import uniform +from tensorflow_probability.python.experimental.bijectors import \ + build_highway_flow_layer from tensorflow_probability.python.internal import samplers __all__ = [ - 'register_asvi_substitution_rule', + 'register_cf_substitution_rule', 'build_cf_surrogate_posterior' ] @@ -83,7 +84,7 @@ def _as_substituted_distribution(distribution): # Todo: inherited from asvi code, do we need this? -def register_asvi_substitution_rule(condition, substitution_fn): +def register_cf_substitution_rule(condition, substitution_fn): """Registers a rule for substituting distributions in ASVI surrogates. Args: @@ -132,20 +133,20 @@ def register_asvi_substitution_rule(condition, substitution_fn): # Default substitutions attempt to express distributions using the most # flexible available parameterization. # pylint: disable=g-long-lambda -register_asvi_substitution_rule( +register_cf_substitution_rule( half_normal.HalfNormal, lambda dist: truncated_normal.TruncatedNormal( loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) -register_asvi_substitution_rule( +register_cf_substitution_rule( uniform.Uniform, lambda dist: shift.Shift(dist.low)( scale_lib.Scale(dist.high - dist.low)( beta.Beta(concentration0=tf.ones_like(dist.mean()), concentration1=1.)))) -register_asvi_substitution_rule( +register_cf_substitution_rule( exponential.Exponential, lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) -register_asvi_substitution_rule( +register_cf_substitution_rule( chi2.Chi2, lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) @@ -255,6 +256,7 @@ def model_fn(): _cf_convex_update_for_base_distribution, initial_prior_weight=initial_prior_weight, num_auxiliary_variables=num_auxiliary_variables), + num_auxiliary_variables=num_auxiliary_variables, seed=seed) surrogate_posterior.also_track = variables return surrogate_posterior @@ -264,6 +266,8 @@ def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn, sample_shape=None, variables=None, + num_auxiliary_variables=0, + global_auxiliary_variables=None, seed=None): # todo: change docstrings """Recursively creates ASVI surrogates, and creates new variables if needed. @@ -303,15 +307,19 @@ def _cf_surrogate_for_distribution(dist, dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=variables, + num_auxiliary_variables=num_auxiliary_variables, + global_auxiliary_variables=global_auxiliary_variables, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, seed=seed) + dist=dist, sample_shape=sample_shape, variables=variables, + global_auxiliary_variables=global_auxiliary_variables, seed=seed) return surrogate_posterior, variables def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables=None, seed=None): + dist, base_distribution_surrogate_fn, variables=None, + num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): """Builds a structured joint surrogate posterior for a joint model.""" # Probabilistic program for CF surrogate posterior. @@ -322,7 +330,46 @@ def _cf_surrogate_for_joint_distribution( def posterior_generator(seed=seed): prior_gen = prior_coroutine() dist = next(prior_gen) - i = 0 + + if num_auxiliary_variables > 0: + i = 1 + + if flat_variables: + variables = flat_variables[0] + + else: + layers = 3 + bijectors = [] + + for _ in range(0, layers - 1): + bijectors.append( + build_highway_flow_layer(num_auxiliary_variables, + residual_fraction_initial_value=0.5, + activation_fn=True, gate_first_n=0, + seed=seed)) + bijectors.append( + build_highway_flow_layer(num_auxiliary_variables, + residual_fraction_initial_value=0.5, + activation_fn=False, gate_first_n=0, + seed=seed)) + + variables = chain.Chain(bijectors=list(reversed(bijectors))) + + eps = transformed_distribution.TransformedDistribution( + distribution=sample.Sample(normal.Normal(0., 0.1), + num_auxiliary_variables), + bijector=variables) + + eps = Root(eps) + + value_out = yield (eps if flat_variables + else (eps, variables)) + + global_auxiliary_variables = value_out + + else: + i = 0 + try: while True: was_root = isinstance(dist, Root) @@ -334,9 +381,10 @@ def posterior_generator(seed=seed): dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=flat_variables[i] if flat_variables else None, + global_auxiliary_variables=global_auxiliary_variables, seed=init_seed) - if was_root: + if was_root and num_auxiliary_variables == 0: surrogate_posterior = Root(surrogate_posterior) # If variables were not given---i.e., we're creating new # variables---then yield the new variables along with the surrogate @@ -367,6 +415,8 @@ def posterior_generator(seed=seed): return _cf_surrogate_for_joint_distribution( dist=dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + global_auxiliary_variables=global_auxiliary_variables, variables=dist._model_unflatten( # pylint: disable=protected-access _extract_variables_from_coroutine_model( posterior_generator, seed=seed))) @@ -401,6 +451,7 @@ def posterior_generator(seed=seed): def _cf_convex_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables=0, + global_auxiliary_variables=None, sample_shape=None, variables=None, seed=None): @@ -412,31 +463,39 @@ def _cf_convex_update_for_base_distribution(dist, actual_event_shape.shape.as_list()[0] > 0 else 1 layers = 3 bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + event_shape_in=actual_event_shape + + num_auxiliary_variables)] for _ in range(0, layers - 1): bijectors.append( build_highway_flow_layer( tf.reduce_prod(actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=True, gate_first_n=int_event_shape)) + activation_fn=True, gate_first_n=int_event_shape, seed=seed)) bijectors.append( build_highway_flow_layer( tf.reduce_prod(actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=False, gate_first_n=int_event_shape)) - bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + activation_fn=False, gate_first_n=int_event_shape, seed=seed)) + bijectors.append( + reshape.Reshape(actual_event_shape + num_auxiliary_variables)) variables = chain.Chain(bijectors=list(reversed(bijectors))) if num_auxiliary_variables > 0: + batch_shape = global_auxiliary_variables.shape[0] if len( + global_auxiliary_variables.shape) > 1 else [] + cascading_flows = split.Split( [-1, num_auxiliary_variables])( transformed_distribution.TransformedDistribution( - distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast( - sample.Sample(normal.Normal(0., .1), num_auxiliary_variables), - to_shape=dist.batch_shape)]), + distribution=blockwise.Blockwise([ + batch_broadcast.BatchBroadcast(dist, + to_shape=batch_shape), + independent.Independent( + deterministic.Deterministic( + global_auxiliary_variables), + reinterpreted_batch_ndims=1)]), bijector=variables)) else: From bcf95e154f02d45c704aee82d0e1fccf05f2f03c Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 11:22:31 +0200 Subject: [PATCH 06/54] fixed surrogate posterior type --- .../python/experimental/vi/cascading_flows.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 61dcce7236..95c7cf5faf 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -39,8 +39,7 @@ from tensorflow_probability.python.distributions import gamma from tensorflow_probability.python.distributions import half_normal from tensorflow_probability.python.distributions import independent -from tensorflow_probability.python.distributions import \ - joint_distribution_auto_batched +from tensorflow_probability.python.distributions import joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ joint_distribution_coroutine from tensorflow_probability.python.distributions import normal @@ -422,11 +421,10 @@ def posterior_generator(seed=seed): posterior_generator, seed=seed))) # Temporary workaround for bijector caching issues with autobatched JDs. - surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched - if not hasattr(dist, 'use_vectorized_map'): - surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine - surrogate_posterior = surrogate_type(posterior_generator, - name=_get_name(dist)) + surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified From 4d4b291c8e759a81ce171641ad1912cbb641b5f0 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 10:47:34 +0200 Subject: [PATCH 07/54] small fixes --- .../python/experimental/vi/cascading_flows.py | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 95c7cf5faf..a9735f3739 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -156,7 +156,7 @@ def register_cf_substitution_rule(condition, substitution_fn): def build_cf_surrogate_posterior( prior, num_auxiliary_variables=0, - initial_prior_weight=0.5, + initial_prior_weight=0.98, seed=None, name=None): # todo: change docstrings @@ -311,14 +311,12 @@ def _cf_surrogate_for_distribution(dist, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, - global_auxiliary_variables=global_auxiliary_variables, seed=seed) + dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed) return surrogate_posterior, variables def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables=None, - num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): + dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): """Builds a structured joint surrogate posterior for a joint model.""" # Probabilistic program for CF surrogate posterior. @@ -343,19 +341,17 @@ def posterior_generator(seed=seed): for _ in range(0, layers - 1): bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.5, - activation_fn=True, gate_first_n=0, - seed=seed)) + residual_fraction_initial_value=0.98, + activation_fn=True, gate_first_n=0, seed=seed)) bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.5, - activation_fn=False, gate_first_n=0, - seed=seed)) + residual_fraction_initial_value=0.98, + activation_fn=False, gate_first_n=0, seed=seed)) variables = chain.Chain(bijectors=list(reversed(bijectors))) eps = transformed_distribution.TransformedDistribution( - distribution=sample.Sample(normal.Normal(0., 0.1), + distribution=sample.Sample(normal.Normal(0., 1.), num_auxiliary_variables), bijector=variables) @@ -380,7 +376,7 @@ def posterior_generator(seed=seed): dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=flat_variables[i] if flat_variables else None, - global_auxiliary_variables=global_auxiliary_variables, + global_auxiliary_variables = global_auxiliary_variables, seed=init_seed) if was_root and num_auxiliary_variables == 0: @@ -422,9 +418,9 @@ def posterior_generator(seed=seed): # Temporary workaround for bijector caching issues with autobatched JDs. surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( - posterior_generator, - use_vectorized_map=dist.use_vectorized_map, - name=_get_name(dist)) + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified @@ -461,8 +457,8 @@ def _cf_convex_update_for_base_distribution(dist, actual_event_shape.shape.as_list()[0] > 0 else 1 layers = 3 bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + event_shape_in=actual_event_shape + + num_auxiliary_variables)] for _ in range(0, layers - 1): bijectors.append( @@ -475,8 +471,7 @@ def _cf_convex_update_for_base_distribution(dist, tf.reduce_prod(actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, activation_fn=False, gate_first_n=int_event_shape, seed=seed)) - bijectors.append( - reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) variables = chain.Chain(bijectors=list(reversed(bijectors))) @@ -489,7 +484,7 @@ def _cf_convex_update_for_base_distribution(dist, transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), + to_shape=batch_shape), independent.Independent( deterministic.Deterministic( global_auxiliary_variables), From 6cd887105e07df82b8d4bef8fafbb06f3e245680 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 11:09:26 +0200 Subject: [PATCH 08/54] fixed global variables if no auxiliary variabled --- tensorflow_probability/python/experimental/vi/cascading_flows.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index a9735f3739..ef9f6f78da 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -363,6 +363,7 @@ def posterior_generator(seed=seed): global_auxiliary_variables = value_out else: + global_auxiliary_variables = None i = 0 try: From 4690d0a75e9dc7e5e78914d9503f2ea1e0bc9d8a Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 11:51:56 +0200 Subject: [PATCH 09/54] added number of layers parameter --- .../python/experimental/vi/cascading_flows.py | 81 ++++++++++++------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index ef9f6f78da..9b430bf6a6 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -39,16 +39,17 @@ from tensorflow_probability.python.distributions import gamma from tensorflow_probability.python.distributions import half_normal from tensorflow_probability.python.distributions import independent -from tensorflow_probability.python.distributions import joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ - joint_distribution_coroutine + joint_distribution_auto_batched +from tensorflow_probability.python.distributions import \ + joint_distribution_coroutine from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.distributions import sample from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.distributions import truncated_normal from tensorflow_probability.python.distributions import uniform from tensorflow_probability.python.experimental.bijectors import \ - build_highway_flow_layer + build_highway_flow_layer from tensorflow_probability.python.internal import samplers __all__ = [ @@ -157,6 +158,7 @@ def build_cf_surrogate_posterior( prior, num_auxiliary_variables=0, initial_prior_weight=0.98, + num_layers=3, seed=None, name=None): # todo: change docstrings @@ -254,8 +256,10 @@ def model_fn(): base_distribution_surrogate_fn=functools.partial( _cf_convex_update_for_base_distribution, initial_prior_weight=initial_prior_weight, - num_auxiliary_variables=num_auxiliary_variables), + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers), num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, seed=seed) surrogate_posterior.also_track = variables return surrogate_posterior @@ -263,9 +267,10 @@ def model_fn(): def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn, + num_auxiliary_variables, + num_layers, sample_shape=None, variables=None, - num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): # todo: change docstrings @@ -307,16 +312,22 @@ def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=variables, num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, global_auxiliary_variables=global_auxiliary_variables, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed) + dist=dist, sample_shape=sample_shape, variables=variables, + global_auxiliary_variables=global_auxiliary_variables, + num_layers=num_layers, + seed=seed) return surrogate_posterior, variables def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): + dist, base_distribution_surrogate_fn, variables, + num_auxiliary_variables, num_layers, global_auxiliary_variables, + seed=None): """Builds a structured joint surrogate posterior for a joint model.""" # Probabilistic program for CF surrogate posterior. @@ -335,18 +346,17 @@ def posterior_generator(seed=seed): variables = flat_variables[0] else: - layers = 3 bijectors = [] - for _ in range(0, layers - 1): + for _ in range(0, num_layers - 1): bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.98, - activation_fn=True, gate_first_n=0, seed=seed)) + activation_fn=True, + gate_first_n=0, seed=seed)) bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.98, - activation_fn=False, gate_first_n=0, seed=seed)) + activation_fn=False, + gate_first_n=0, seed=seed)) variables = chain.Chain(bijectors=list(reversed(bijectors))) @@ -376,8 +386,10 @@ def posterior_generator(seed=seed): surrogate_posterior, variables = _cf_surrogate_for_distribution( dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, variables=flat_variables[i] if flat_variables else None, - global_auxiliary_variables = global_auxiliary_variables, + global_auxiliary_variables=global_auxiliary_variables, seed=init_seed) if was_root and num_auxiliary_variables == 0: @@ -412,16 +424,18 @@ def posterior_generator(seed=seed): dist=dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, global_auxiliary_variables=global_auxiliary_variables, - variables=dist._model_unflatten( # pylint: disable=protected-access + variables=dist._model_unflatten( + # pylint: disable=protected-access _extract_variables_from_coroutine_model( posterior_generator, seed=seed))) # Temporary workaround for bijector caching issues with autobatched JDs. surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( - posterior_generator, - use_vectorized_map=dist.use_vectorized_map, - name=_get_name(dist)) + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified @@ -445,10 +459,11 @@ def posterior_generator(seed=seed): # todo: sample_shape and seed are not used.. maybe they should? def _cf_convex_update_for_base_distribution(dist, initial_prior_weight, - num_auxiliary_variables=0, - global_auxiliary_variables=None, + num_auxiliary_variables, + num_layers, + global_auxiliary_variables, + variables, sample_shape=None, - variables=None, seed=None): """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" @@ -456,23 +471,27 @@ def _cf_convex_update_for_base_distribution(dist, actual_event_shape = dist.event_shape_tensor() int_event_shape = int(actual_event_shape) if \ actual_event_shape.shape.as_list()[0] > 0 else 1 - layers = 3 bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + event_shape_in=actual_event_shape + + num_auxiliary_variables)] - for _ in range(0, layers - 1): + for _ in range(0, num_layers - 1): bijectors.append( build_highway_flow_layer( - tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + tf.reduce_prod( + actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=True, gate_first_n=int_event_shape, seed=seed)) + activation_fn=True, gate_first_n=int_event_shape, + seed=seed)) bijectors.append( build_highway_flow_layer( - tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + tf.reduce_prod( + actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=False, gate_first_n=int_event_shape, seed=seed)) - bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + activation_fn=False, gate_first_n=int_event_shape, + seed=seed)) + bijectors.append( + reshape.Reshape(actual_event_shape + num_auxiliary_variables)) variables = chain.Chain(bijectors=list(reversed(bijectors))) @@ -485,7 +504,7 @@ def _cf_convex_update_for_base_distribution(dist, transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), + to_shape=batch_shape), independent.Independent( deterministic.Deterministic( global_auxiliary_variables), From 3b182aa6a30bfc7ae90715995dab33c22ec52525 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 11:59:21 +0200 Subject: [PATCH 10/54] readded highway flow --- .../python/experimental/bijectors/BUILD | 29 +++++++++++++++++++ .../python/experimental/bijectors/__init__.py | 4 +++ 2 files changed, 33 insertions(+) diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD index 3e42d43fb8..3fa54d43c1 100644 --- a/tensorflow_probability/python/experimental/bijectors/BUILD +++ b/tensorflow_probability/python/experimental/bijectors/BUILD @@ -117,6 +117,20 @@ multi_substrate_py_library( ], ) +multi_substrate_py_library( + name = "highway_flow", + srcs = ["highway_flow.py"], + srcs_version = "PY3", + deps = [ + ":scalar_function_with_inferred_inverse", + # numpy dep, + # tensorflow dep, + "//tensorflow_probability/python/bijectors", + "//tensorflow_probability/python/util", + "//tensorflow_probability/python/internal:samplers", + ], +) + multi_substrate_py_test( name = "sharded_test", size = "medium", @@ -133,3 +147,18 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:test_util", ], ) + +multi_substrate_py_test( + name = "highway_flow_test", + size = "medium", + srcs = ["highway_flow_test.py"], + jax_size = "medium", + python_version = "PY3", + srcs_version = "PY3", + deps = [ + # numpy dep + # tensorflow dep, + "//tensorflow_probability", + "//tensorflow_probability/python/internal:test_util", + ], +) \ No newline at end of file diff --git a/tensorflow_probability/python/experimental/bijectors/__init__.py b/tensorflow_probability/python/experimental/bijectors/__init__.py index a261af93f5..e7b4fb00da 100644 --- a/tensorflow_probability/python/experimental/bijectors/__init__.py +++ b/tensorflow_probability/python/experimental/bijectors/__init__.py @@ -18,9 +18,13 @@ from tensorflow_probability.python.bijectors.ldj_ratio import inverse_log_det_jacobian_ratio from tensorflow_probability.python.experimental.bijectors.distribution_bijectors import make_distribution_bijector from tensorflow_probability.python.experimental.bijectors.scalar_function_with_inferred_inverse import ScalarFunctionWithInferredInverse +from tensorflow_probability.python.experimental.bijectors.highway_flow import build_highway_flow_layer +from tensorflow_probability.python.experimental.bijectors.highway_flow import HighwayFlow from tensorflow_probability.python.experimental.bijectors.sharded import Sharded __all__ = [ + 'build_highway_flow_layer', + 'HighwayFlow', 'forward_log_det_jacobian_ratio', 'inverse_log_det_jacobian_ratio', 'make_distribution_bijector', From 3e11546deec4f55936d4dc2555b7c43c3ebad778 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 10:05:36 +0200 Subject: [PATCH 11/54] fixed init --- tensorflow_probability/python/experimental/vi/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py index e18c8d3455..cc5530300a 100644 --- a/tensorflow_probability/python/experimental/vi/__init__.py +++ b/tensorflow_probability/python/experimental/vi/__init__.py @@ -17,6 +17,7 @@ from tensorflow_probability.python.experimental.vi import util from tensorflow_probability.python.experimental.vi.automatic_structured_vi import build_asvi_surrogate_posterior from tensorflow_probability.python.experimental.vi.automatic_structured_vi import register_asvi_substitution_rule +from tensorflow_probability.python.experimental.vi.cascading_flows import build_cf_surrogate_posterior from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior_from_base_distribution from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_factored_surrogate_posterior @@ -29,7 +30,7 @@ 'build_affine_surrogate_posterior', 'build_affine_surrogate_posterior_from_base_distribution', 'build_asvi_surrogate_posterior', - 'builf_cf_surrogate_posterior' + 'build_cf_surrogate_posterior', 'build_factored_surrogate_posterior', 'build_split_flow_surrogate_posterior', 'build_trainable_location_scale_distribution', From 2ff7130e48c91989dd34ce2997c71696cfebf651 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 10:07:24 +0200 Subject: [PATCH 12/54] removed highway flow from this branch --- .../bijectors/highway_flow_test.py | 142 ------------------ 1 file changed, 142 deletions(-) delete mode 100644 tensorflow_probability/python/experimental/bijectors/highway_flow_test.py diff --git a/tensorflow_probability/python/experimental/bijectors/highway_flow_test.py b/tensorflow_probability/python/experimental/bijectors/highway_flow_test.py deleted file mode 100644 index 24e6b7fb4e..0000000000 --- a/tensorflow_probability/python/experimental/bijectors/highway_flow_test.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2021 The TensorFlow Probability Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""Tests for HighwayFlow.""" -import tensorflow.compat.v2 as tf - -import tensorflow_probability as tfp -from tensorflow_probability.python.internal import samplers -from tensorflow_probability.python.internal import test_util - -tfb = tfp.bijectors -tfd = tfp.distributions - -@test_util.test_all_tf_execution_regimes -class HighwayFlowTests(test_util.TestCase): - - def testBijector(self): - width = 1 - for dim in range(2): - if dim == 0: - # Test generic case with scalar input - x = samplers.uniform((width,), minval=-1., - maxval=1., - seed=test_util.test_seed(sampler_type='stateless')) - elif dim == 1: - # Test with 2D tensor + batch - x = samplers.uniform((5, width, width), - minval=-1., - maxval=1., - seed=test_util.test_seed(sampler_type='stateless')) - - bijector = tfp.experimental.bijectors.build_highway_flow_layer( - width, activation_fn=True) - self.evaluate( - [v.initializer for v in bijector.trainable_variables]) - self.assertStartsWith(bijector.name, 'highway_flow') - self.assertAllClose(x, bijector.inverse( - tf.identity(bijector.forward(x)))) - self.assertAllClose( - bijector.forward_log_det_jacobian(x, event_ndims=dim + 1), - -bijector.inverse_log_det_jacobian( - tf.identity(bijector.forward(x)), event_ndims=dim + 1)) - - def testBijectorWithoutActivation(self): - width = 4 - x = samplers.uniform((2, width, width), - minval=-1., - maxval=1., - seed=test_util.test_seed(sampler_type='stateless')) - - bijector = tfp.experimental.bijectors.build_highway_flow_layer( - width, activation_fn=False) - self.evaluate( - [v.initializer for v in bijector.trainable_variables]) - self.assertStartsWith(bijector.name, 'highway_flow') - self.assertAllClose(x, bijector.inverse( - tf.identity(bijector.forward(x)))) - self.assertAllClose( - bijector.forward_log_det_jacobian(x, event_ndims=2), - -bijector.inverse_log_det_jacobian( - tf.identity(bijector.forward(x)), event_ndims=2)) - - def testGating(self): - width = 4 - x = samplers.uniform((2, width, width), - minval=-1., - maxval=1., - seed=test_util.test_seed(sampler_type='stateless')) - - # Test with gating half of the inputs - bijector = tfp.experimental.bijectors.build_highway_flow_layer( - width, activation_fn=True, gate_first_n=2) - self.evaluate( - [v.initializer for v in bijector.trainable_variables]) - self.assertStartsWith(bijector.name, 'highway_flow') - self.assertAllClose(x, bijector.inverse( - tf.identity(bijector.forward(x)))) - self.assertAllClose( - bijector.forward_log_det_jacobian(x, event_ndims=2), - -bijector.inverse_log_det_jacobian( - tf.identity(bijector.forward(x)), event_ndims=2)) - - # Test with gating no inputs - bijector = tfp.experimental.bijectors.build_highway_flow_layer( - width, activation_fn=True, gate_first_n=0) - self.evaluate( - [v.initializer for v in bijector.trainable_variables]) - self.assertStartsWith(bijector.name, 'highway_flow') - self.assertAllClose(x, bijector.inverse( - tf.identity(bijector.forward(x)))) - self.assertAllClose( - bijector.forward_log_det_jacobian(x, event_ndims=2), - -bijector.inverse_log_det_jacobian( - tf.identity(bijector.forward(x)), event_ndims=2)) - - def testResidualFractionGradientsWithCenteredDifference(self): - width = 4 - batch_size = 3 - residual_fraction = tf.constant(0.5) - bijector = tfp.experimental.bijectors.HighwayFlow( - residual_fraction=residual_fraction, - activation_fn=tf.nn.softplus, - bias=tf.zeros(width), - upper_diagonal_weights_matrix=tf.eye(width), - lower_diagonal_weights_matrix=tf.eye(width), - gate_first_n=width - ) - target = tfd.MultivariateNormalDiag(loc=tf.zeros(width), - scale_diag=tf.ones(width)) - x = tf.ones((batch_size, width)) - with tf.GradientTape() as g: - g.watch(bijector.residual_fraction) - y = tf.reduce_mean(target.log_prob(bijector.forward(x))) - tf_grad = g.gradient(y, bijector.residual_fraction) - - h = 1e-3 - - # pylint: disable=protected-access - bijector._residual_fraction = residual_fraction + h - y1 = tf.reduce_mean(target.log_prob(bijector.forward(tf.identity(x)))) - bijector._residual_fraction = residual_fraction - h - y2 = tf.reduce_mean(target.log_prob(bijector.forward(tf.identity(x)))) - # pylint: enable=protected-access - - manual_grad = (y1 - y2) / (2 * h) - - self.assertAllClose(tf_grad, manual_grad, rtol=1e-4) - - -if __name__ == '__main__': - tf.test.main() From 4cc20e3b438c25428b8c8b4c5df9c3d54b8a82ff Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 10:07:59 +0200 Subject: [PATCH 13/54] removed highway flow from this branch --- .../experimental/bijectors/highway_flow.py | 396 ------------------ 1 file changed, 396 deletions(-) delete mode 100644 tensorflow_probability/python/experimental/bijectors/highway_flow.py diff --git a/tensorflow_probability/python/experimental/bijectors/highway_flow.py b/tensorflow_probability/python/experimental/bijectors/highway_flow.py deleted file mode 100644 index 6f26abe72f..0000000000 --- a/tensorflow_probability/python/experimental/bijectors/highway_flow.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright 2021 The TensorFlow Probability Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -"""Highway Flow bijector.""" - -import tensorflow.compat.v2 as tf - -from tensorflow_probability.python import bijectors as tfb -from tensorflow_probability.python import util -from tensorflow_probability.python.internal import cache_util -from tensorflow_probability.python.internal import dtype_util -from tensorflow_probability.python.internal import prefer_static as ps -from tensorflow_probability.python.internal import samplers -from tensorflow_probability.python.internal import tensor_util - - - -def build_highway_flow_layer(width, - residual_fraction_initial_value=0.5, - activation_fn=False, - gate_first_n=None, - seed=None): - """Builds HighwayFlow making sure that all the requirements are satisfied. - - Args: - width: Input dimension of the bijector. - residual_fraction_initial_value: Initial value for gating parameter, must be - between 0 and 1. - activation_fn: Whether or not use SoftPlus activation function. - gate_first_n: Decides which part of the input should be gated (useful for - example when using auxiliary variables). - seed: Seed for random initialization of the weights. - - Returns: - The initialized bijector with the following elements: - `residual_fraction` is bounded between 0 and 1. - `upper_diagonal_weights_matrix` is a randomly initialized (lower) diagonal - matrix with positive diagonal of size `width x width`. - `lower_diagonal_weights_matrix` is a randomly initialized lower diagonal - matrix with ones on the diagonal of size `width x width`; - `bias` is a randomly initialized vector of size `width` - """ - - # TODO: add control that residual_fraction_initial_value is between 0 and 1 - residual_fraction_initial_value = tf.convert_to_tensor( - residual_fraction_initial_value, - dtype_hint=tf.float32, - name='residual_fraction_initial_value') - dtype = residual_fraction_initial_value.dtype - - bias_seed, upper_seed, lower_seed = samplers.split_seed( - seed, n=3) - lower_bijector = tfb.Chain( - [tfb.TransformDiagonal(diag_bijector=tfb.Shift(1.)), - tfb.Pad(paddings=[(1, 0), (0, 1)]), - tfb.FillTriangular()]) - unconstrained_lower_initial_values = samplers.normal( - shape=lower_bijector.inverse_event_shape([width, width]), - mean=0., - stddev=.01, - seed=lower_seed) - upper_bijector = tfb.FillScaleTriL(diag_bijector=tfb.Softplus(), - diag_shift=None) - unconstrained_upper_initial_values = samplers.normal( - shape=upper_bijector.inverse_event_shape([width, width]), - mean=0., - stddev=.01, - seed=upper_seed) - - return HighwayFlow( - residual_fraction=util.TransformedVariable( - initial_value=residual_fraction_initial_value, - bijector=tfb.Sigmoid(), - dtype=dtype), - activation_fn=activation_fn, - bias=tf.Variable( - samplers.normal((width,), mean=0., stddev=0.01, seed=bias_seed), - dtype=dtype), - upper_diagonal_weights_matrix=util.TransformedVariable( - initial_value=upper_bijector.forward(unconstrained_upper_initial_values), - bijector=upper_bijector, - dtype=dtype), - lower_diagonal_weights_matrix=util.TransformedVariable( - initial_value=lower_bijector.forward(unconstrained_lower_initial_values), - bijector=lower_bijector, - dtype=dtype), - gate_first_n=gate_first_n - ) - - -class HighwayFlow(tfb.Bijector): - """Implements an Highway Flow bijector [1]. - - HighwayFlow interpolates the input `X` with the transformations at each step - of the bjiector. The Highway Flow can be used as building block for a - Cascading flow [1] or as a generic normalizing flow. - - The transformation consists of a convex update between the input `X` and a - linear transformation of `X` followed by activation with the form `g(A @ - X + b)`, where `g(.)` is a differentiable non-decreasing activation - function, and `A` and `b` are trainable weights. - - The convex update is regulated by a trainable residual fraction `l` - constrained between 0 and 1, and can be - formalized as: - `Y = l * X + (1 - l) * g(A @ X + b)`. - - To make this transformation invertible, the bijector is split in three - convex updates: - - `Y1 = l * X + (1 - l) * L @ X`, with `L` lower diagonal matrix with ones - on the diagonal; - - `Y2 = l * Y1 + (1 - l) * (U @ Y1 + b)`, with `U` upper diagonal matrix - with positive diagonal; - - `Y = l * Y2 + (1 - l) * g(Y2)` - - The function `build_highway_flow_layer` helps initializing the bijector - with the variables respecting the various constraints. - - For more details on Highway Flow and Cascading Flows see [1]. - - #### Usage example - ```python - tfd = tfp.distributions - tfb = tfp.bijectors - - dim = 4 # last input dimension - - bijector = build_highway_flow_layer(dim, activation_fn=True) - y = bijector.forward(x) # forward mapping - x = bijector.inverse(y) # inverse mapping - base = tfd.MultivariateNormalDiag(loc=tf.zeros(dim)) # Base distribution - transformed_distribution = tfd.TransformedDistribution(base, bijector) - ``` - - #### References - - [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. - "Automatic variational inference with cascading flows." arXiv preprint - arXiv:2102.04801 (2021). - """ - - # HighWay Flow simultaneously computes `forward` and `fldj` - # (and `inverse`/`ildj`), so we override the bijector cache to update the - # LDJ entries of attrs on forward/inverse inverse calls (instead of - # updating them only when the LDJ methods themselves are called). - - _cache = cache_util.BijectorCacheWithGreedyAttrs( - forward_name='_augmented_forward', - inverse_name='_augmented_inverse') - - def __init__(self, residual_fraction, activation_fn, bias, - upper_diagonal_weights_matrix, - lower_diagonal_weights_matrix, - gate_first_n=None, - validate_args=False, - name=None): - """Initializes the HighwayFlow. - Args: - residual_fraction: Scalar `Tensor` used for the convex update, must be - between 0 and 1. - activation_fn: Boolean to decide whether to use SoftPlus (True) activation - or no activation (False). - bias: Bias vector. - upper_diagonal_weights_matrix: Lower diagional matrix of size - (width, width) with positive diagonal (is transposed to Upper diagonal - within the bijector). - lower_diagonal_weights_matrix: Lower diagonal matrix with ones on the main - diagional. - gate_first_n: Integer that decides what part of the input is gated. - Default: `None`. When None, the whole input is gated. - """ - parameters = dict(locals()) - name = name or 'highway_flow' - dtype = dtype_util.common_dtype( - [residual_fraction, bias, upper_diagonal_weights_matrix, - lower_diagonal_weights_matrix], dtype_hint=tf.float32) - with tf.name_scope(name) as name: - self._width = ps.shape(bias)[-1] - self._bias = tensor_util.convert_nonref_to_tensor(bias, dtype=dtype, - name='bias') - self._residual_fraction = tensor_util.convert_nonref_to_tensor( - residual_fraction, dtype=dtype, name='residual_fraction') - # The upper matrix is still lower triangular, transpose is done in - # _inverse and _forwars metowds. - self._upper_diagonal_weights_matrix = tensor_util.convert_nonref_to_tensor( - upper_diagonal_weights_matrix, dtype=dtype, - name='upper_diagonal_weights_matrix') - self._lower_diagonal_weights_matrix = tensor_util.convert_nonref_to_tensor( - lower_diagonal_weights_matrix, dtype=dtype, - name='lower_diagonal_weights_matrix') - self._activation_fn = activation_fn - self._gate_first_n = gate_first_n if gate_first_n else self.width - - self._num_ungated = self.width - self.gate_first_n - - super(HighwayFlow, self).__init__( - validate_args=validate_args, - forward_min_event_ndims=1, - parameters=parameters, - dtype=dtype, - name=name) - - @property - def bias(self): - return self._bias - - @property - def width(self): - return self._width - - @property - def residual_fraction(self): - return self._residual_fraction - - @property - def upper_diagonal_weights_matrix(self): - return self._upper_diagonal_weights_matrix - - @property - def lower_diagonal_weights_matrix(self): - return self._lower_diagonal_weights_matrix - - @property - def activation_fn(self): - return self._activation_fn - - @property - def gate_first_n(self): - return self._gate_first_n - - @property - def num_ungated(self): - return self._num_ungated - - def _derivative_of_softplus(self, x): - return tf.concat([(self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.zeros(self.num_ungated, dtype=self.dtype)], - axis=0) + ( - tf.concat([(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0)) * tf.math.sigmoid(x) - - def _convex_update(self, weights_matrix): - return tf.concat( - [self.residual_fraction * tf.eye(num_rows=self.gate_first_n, - num_columns=self.width, - dtype=self.dtype), - tf.zeros([self.num_ungated, self.width], dtype=self.dtype)], - axis=0) + tf.concat([(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0) * weights_matrix - - def _inverse_of_softplus(self, y, n=20): - """Inverse of the activation layer with softplus using Newton iteration.""" - x = tf.ones_like(y, dtype=self.dtype) - for _ in range(n): - x = x - (tf.concat([(self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.zeros(self.num_ungated, dtype=self.dtype)], - axis=0) * x + tf.concat( - [(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0) * tf.math.softplus( - x) - y) / ( - self._derivative_of_softplus(x)) - return x - - def _augmented_forward(self, x): - """Computes forward and forward_log_det_jacobian transformations. - - Args: - x: Input of the bijector. - - Returns: - x after forward flow and a dict containing forward and inverse log - determinant of the jacobian. - """ - - # Log determinant term from the upper matrix. Note that the log determinant - # of the lower matrix is zero. - - fldj = tf.zeros(ps.shape(x)[:-1], dtype=self.dtype) + tf.reduce_sum( - tf.math.log(tf.concat([(self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.zeros(self.num_ungated, dtype=self.dtype)], - axis=0) + ( - tf.concat([(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0)) * tf.linalg.diag_part( - self.upper_diagonal_weights_matrix))) - x = x[tf.newaxis, ...] - x = tf.linalg.matvec( - self._convex_update(self.lower_diagonal_weights_matrix), x) - x = tf.linalg.matvec( - self._convex_update(self.upper_diagonal_weights_matrix), - x, transpose_a=True) - x += (tf.concat([(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0) * self.bias)[tf.newaxis, ...] - - if self.activation_fn: - fldj += tf.reduce_sum(tf.math.log(self._derivative_of_softplus(x[0])), - axis=-1) - x = tf.concat([(self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.zeros(self.num_ungated, dtype=self.dtype)], - axis=0) * x + tf.concat( - [(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0) * tf.nn.softplus(x) - - return tf.squeeze(x, 0), {'ildj': -fldj, 'fldj': fldj} - - def _augmented_inverse(self, y): - """Computes inverse and inverse_log_det_jacobian transformations. - - Args: - y: input of the (inverse) bijectorr. - - Returns: - y after inverse flow and a dict containing inverse and forward log - determinant of the jacobian. - """ - - ildj = tf.zeros(ps.shape(y)[:-1], dtype=self.dtype) - tf.reduce_sum( - tf.math.log(tf.concat([(self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.zeros(self.num_ungated, dtype=self.dtype)], - axis=0) + tf.concat( - [(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0) * tf.linalg.diag_part( - self.upper_diagonal_weights_matrix))) - - if self.activation_fn: - y = self._inverse_of_softplus(y) - ildj -= tf.reduce_sum(tf.math.log(self._derivative_of_softplus(y)), - axis=-1) - - y = y[..., tf.newaxis] - - y = y - (tf.concat([(1. - self.residual_fraction) * tf.ones( - self.gate_first_n, dtype=self.dtype), - tf.ones(self.num_ungated, dtype=self.dtype)], - axis=0) * self.bias)[..., tf.newaxis] - y = tf.linalg.triangular_solve( - self._convex_update(self.upper_diagonal_weights_matrix), y, - lower=True, adjoint=True) - y = tf.linalg.triangular_solve( - self._convex_update(self.lower_diagonal_weights_matrix), y) - - return tf.squeeze(y, axis=-1), {'ildj': ildj, 'fldj': -ildj} - - def _forward(self, x): - y, _ = self._augmented_forward(x) - return y - - def _inverse(self, y): - x, _ = self._augmented_inverse(y) - return x - - def _forward_log_det_jacobian(self, x): - cached = self._cache.forward_attributes(x) - # If LDJ isn't in the cache, call forward once. - if 'fldj' not in cached: - _, attrs = self._augmented_forward(x) - cached.update(attrs) - return cached['fldj'] - - def _inverse_log_det_jacobian(self, y): - cached = self._cache.inverse_attributes(y) - # If LDJ isn't in the cache, call inverse once. - if 'ildj' not in cached: - _, attrs = self._augmented_inverse(y) - cached.update(attrs) - return cached['ildj'] From 0b386c671506a6f8f67c12cd5a8492db1d8d3fdd Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 10:08:45 +0200 Subject: [PATCH 14/54] working on tests --- .../experimental/vi/cascading_flows_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 9c4393be24..598d3fd66e 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -30,7 +30,7 @@ tfb = tfp.bijectors tfd = tfp.distributions - + # test_util.test_seed(sampler_type='stateless')) @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): @@ -65,7 +65,7 @@ def test_dims_and_gradients(self): self._expected_num_trainable_variables(prior_dist)) # Test that the sample shape is correct - three_posterior_samples = surrogate_posterior.sample( + '''three_posterior_samples = surrogate_posterior.sample( 3, seed=test_util.test_seed(sampler_type='stateless')) three_prior_samples = prior_dist.sample( 3, seed=test_util.test_seed(sampler_type='stateless')) @@ -74,15 +74,15 @@ def test_dims_and_gradients(self): [s.shape for s in tf.nest.flatten(three_posterior_samples)]) # Test that gradients are available wrt the variational parameters. - posterior_sample = surrogate_posterior.sample( - seed=test_util.test_seed(sampler_type='stateless')) + posterior_sample = surrogate_posterior.sample( + seed=1) with tf.GradientTape() as tape: posterior_logprob = surrogate_posterior.log_prob(posterior_sample) grad = tape.gradient(posterior_logprob, surrogate_posterior.trainable_variables) - self.assertTrue(all(g is not None for g in grad)) + self.assertTrue(all(g is not None for g in grad))''' - def test_initialization_is_deterministic_following_seed(self): + '''def test_initialization_is_deterministic_following_seed(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( @@ -101,7 +101,7 @@ def test_initialization_is_deterministic_following_seed(self): posterior_sample2 = surrogate_posterior2.sample( seed=test_util.test_seed(sampler_type='stateless')) - self.assertAllEqualNested(posterior_sample, posterior_sample2) + self.assertAllEqualNested(posterior_sample, posterior_sample2)''' @test_util.test_all_tf_execution_regimes @@ -144,7 +144,7 @@ def target_log_prob(*x): return target_log_prob - def test_fitting_surrogate_posterior(self): + '''def test_fitting_surrogate_posterior(self): prior_dist = self.make_prior_dist() observations = self.get_observations(prior_dist) @@ -170,7 +170,7 @@ def test_fitting_surrogate_posterior(self): self.evaluate(tf1.global_variables_initializer()) _ = self.evaluate(losses) _ = self.evaluate(posterior_mean) - _ = self.evaluate(posterior_stddev) + _ = self.evaluate(posterior_stddev)''' @test_util.test_all_tf_execution_regimes From d8f47802d41a65304c103b6c8b0e2bb6fbdbbd48 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 11:39:55 +0200 Subject: [PATCH 15/54] more testing --- .../experimental/vi/cascading_flows_test.py | 57 +++++++------------ 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 598d3fd66e..b52e1e5f77 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -35,73 +35,59 @@ @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): - def _expected_num_trainable_variables(self, prior_dist): + def _expected_num_trainable_variables(self, prior_dist, num_layers): """Infers the expected number of trainable variables for a non-nested JD.""" prior_dists = prior_dist._get_single_sample_distributions() # pylint: disable=protected-access expected_num_trainable_variables = 0 + + # For each distribution in the prior, we will have one highway flow with + # `num_layers` blocks, and each block has 4 trainable variables: + # `residual_fraction`, `lower_diagonal_weights_matrix`, + # `upper_diagonal_weights_matrix` and `bias`. for original_dist in prior_dists: - try: - original_dist = original_dist.distribution - except AttributeError: - pass - dist = cascading_flows._as_substituted_distribution(original_dist) - dist_params = dist.parameters - for param, value in dist_params.items(): - if (param not in cascading_flows._NON_STATISTICAL_PARAMS - and value is not None and param not in ('low', 'high')): - # One variable each for prior_weight, mean_field_parameter. - expected_num_trainable_variables += 2 + expected_num_trainable_variables += (4 * num_layers) return expected_num_trainable_variables def test_dims_and_gradients(self): prior_dist = self.make_prior_dist() - + num_layers = 3 surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( - prior=prior_dist) + prior=prior_dist, num_layers=num_layers) # Test that the correct number of trainable variables are being tracked self.assertLen(surrogate_posterior.trainable_variables, - self._expected_num_trainable_variables(prior_dist)) + self._expected_num_trainable_variables(prior_dist, num_layers)) # Test that the sample shape is correct - '''three_posterior_samples = surrogate_posterior.sample( - 3, seed=test_util.test_seed(sampler_type='stateless')) + three_posterior_samples = surrogate_posterior.sample( + 3, seed=1) three_prior_samples = prior_dist.sample( - 3, seed=test_util.test_seed(sampler_type='stateless')) + 3, seed=1) self.assertAllEqualNested( [s.shape for s in tf.nest.flatten(three_prior_samples)], [s.shape for s in tf.nest.flatten(three_posterior_samples)]) - # Test that gradients are available wrt the variational parameters. - posterior_sample = surrogate_posterior.sample( - seed=1) - with tf.GradientTape() as tape: - posterior_logprob = surrogate_posterior.log_prob(posterior_sample) - grad = tape.gradient(posterior_logprob, - surrogate_posterior.trainable_variables) - self.assertTrue(all(g is not None for g in grad))''' - - '''def test_initialization_is_deterministic_following_seed(self): + def test_initialization_is_deterministic_following_seed(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) self.evaluate( [v.initializer for v in surrogate_posterior.trainable_variables]) posterior_sample = surrogate_posterior.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) self.evaluate( [v.initializer for v in surrogate_posterior2.trainable_variables]) posterior_sample2 = surrogate_posterior2.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) - self.assertAllEqualNested(posterior_sample, posterior_sample2)''' + self.assertAllEqualNested(posterior_sample, posterior_sample2) @test_util.test_all_tf_execution_regimes @@ -172,7 +158,6 @@ def target_log_prob(*x): _ = self.evaluate(posterior_mean) _ = self.evaluate(posterior_stddev)''' - @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestEightSchools(test_util.TestCase, _TrainableCFSurrogate): @@ -235,7 +220,7 @@ def _prior_model_fn(): return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) -@test_util.test_all_tf_execution_regimes +'''@test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestDiscreteLatent( test_util.TestCase, _TrainableCFSurrogate): @@ -344,7 +329,7 @@ def centered_horseshoe(ndims=100): tfd.Normal) self.assertIsInstance(surrogate_dists.local_scale.distribution, tfd.Normal) - self.assertIsInstance(surrogate_dists.weights, tfd.Normal) + self.assertIsInstance(surrogate_dists.weights, tfd.Normal)''' # TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions. # TODO(kateslin): Add an ASVI surrogate posterior test with for a model with From af9a5bae484ee95491f42415f88ff955c25564be Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:51:37 +0200 Subject: [PATCH 16/54] fixed conflicts --- tensorflow_probability/python/experimental/vi/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py index 0cb4971fcc..e18c8d3455 100644 --- a/tensorflow_probability/python/experimental/vi/__init__.py +++ b/tensorflow_probability/python/experimental/vi/__init__.py @@ -29,6 +29,7 @@ 'build_affine_surrogate_posterior', 'build_affine_surrogate_posterior_from_base_distribution', 'build_asvi_surrogate_posterior', + 'builf_cf_surrogate_posterior' 'build_factored_surrogate_posterior', 'build_split_flow_surrogate_posterior', 'build_trainable_location_scale_distribution', From 5cf8ce96854f3c27b529a023f282d525ccb0acfc Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:52:05 +0200 Subject: [PATCH 17/54] Revert "Revert "initial tests, updated init and build"" This reverts commit 5bb28b08 --- .../python/experimental/vi/BUILD | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD index e57f884ca5..863e0aeef2 100644 --- a/tensorflow_probability/python/experimental/vi/BUILD +++ b/tensorflow_probability/python/experimental/vi/BUILD @@ -31,6 +31,7 @@ py_library( srcs_version = "PY3", deps = [ ":automatic_structured_vi", + ":cascading_flows", ":surrogate_posteriors", "//tensorflow_probability/python/experimental/vi/util", "//tensorflow_probability/python/internal:all_util", @@ -67,6 +68,36 @@ py_library( ], ) +py_library( + name = "cascading_flows", + srcs = ["cascading_flows.py.py"], + srcs_version = "PY3", + deps = [ + # tensorflow dep, + "//tensorflow_probability/python/bijectors:build_highway_flow_layer", + "//tensorflow_probability/python/bijectors:chain", + "//tensorflow_probability/python/bijectors:reshape", + "//tensorflow_probability/python/bijectors:scale", + "//tensorflow_probability/python/bijectors:shift", + "//tensorflow_probability/python/bijectors:split", + "//tensorflow_probability/python/distributions:batch_broadcast", + "//tensorflow_probability/python/distributions:beta", + "//tensorflow_probability/python/distributions:blockwise", + "//tensorflow_probability/python/distributions:chi2", + "//tensorflow_probability/python/distributions:exponential", + "//tensorflow_probability/python/distributions:gamma", + "//tensorflow_probability/python/distributions:half_normal", + "//tensorflow_probability/python/distributions:joint_distribution_auto_batched", + "//tensorflow_probability/python/distributions:joint_distribution_coroutine", + "//tensorflow_probability/python/distributions:normal", + "//tensorflow_probability/python/distributions:sample", + "//tensorflow_probability/python/distributions:transformed_distribution", + "//tensorflow_probability/python/distributions:truncated_normal", + "//tensorflow_probability/python/distributions:uniform", + "//tensorflow_probability/python/internal:samplers", + ], +) + py_library( name = "surrogate_posteriors", srcs = ["surrogate_posteriors.py"], @@ -111,6 +142,22 @@ py_test( ], ) +py_test( + name = "cascading_flows_test", + size = "large", + srcs = ["cascading_flows_test.py"], + python_version = "PY3", + shard_count = 4, + srcs_version = "PY3", + deps = [ + # absl/testing:parameterized dep, + # numpy dep, + # tensorflow dep, + "//tensorflow_probability", + "//tensorflow_probability/python/internal:test_util", + ], +) + py_test( name = "surrogate_posteriors_test", size = "large", From 755bca92cd518eb9c91c9dd177c8a02fb1ed3381 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:54:47 +0200 Subject: [PATCH 18/54] reverted commit --- .../python/experimental/vi/cascading_flows.py | 483 ++++++++++++++++++ 1 file changed, 483 insertions(+) create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows.py diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py new file mode 100644 index 0000000000..d8c9393d8e --- /dev/null +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -0,0 +1,483 @@ +# Copyright 2021 The TensorFlow Probability Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Utilities for constructing structured surrogate posteriors.""" + +from __future__ import absolute_import +from __future__ import division +# [internal] enable type annotations +from __future__ import print_function + +import copy +import functools +import inspect + +import tensorflow.compat.v2 as tf + +from tensorflow_probability.python.experimental.bijectors import \ + build_highway_flow_layer +from tensorflow_probability.python.bijectors import chain +from tensorflow_probability.python.bijectors import reshape +from tensorflow_probability.python.bijectors import scale as scale_lib +from tensorflow_probability.python.bijectors import shift +from tensorflow_probability.python.bijectors import split + +from tensorflow_probability.python.distributions import batch_broadcast +from tensorflow_probability.python.distributions import beta +from tensorflow_probability.python.distributions import blockwise +from tensorflow_probability.python.distributions import chi2 +from tensorflow_probability.python.distributions import exponential +from tensorflow_probability.python.distributions import gamma +from tensorflow_probability.python.distributions import half_normal +from tensorflow_probability.python.distributions import \ + joint_distribution_auto_batched +from tensorflow_probability.python.distributions import \ + joint_distribution_coroutine +from tensorflow_probability.python.distributions import normal +from tensorflow_probability.python.distributions import sample +from tensorflow_probability.python.distributions import transformed_distribution +from tensorflow_probability.python.distributions import truncated_normal +from tensorflow_probability.python.distributions import uniform +from tensorflow_probability.python.internal import samplers + +__all__ = [ + 'register_asvi_substitution_rule', + 'build_cf_surrogate_posterior' +] + +Root = joint_distribution_coroutine.JointDistributionCoroutine.Root + +_NON_STATISTICAL_PARAMS = [ + 'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum', + 'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support', + 'num_probit_terms_approx' +] +_NON_TRAINABLE_PARAMS = ['low', 'high'] + +# Registry of transformations that are applied to distributions in the prior +# before defining the surrogate family. + + +# Todo: inherited from asvi code, do we need this? +ASVI_SURROGATE_SUBSTITUTIONS = {} + + +# Todo: inherited from asvi code, do we need this? +def _as_substituted_distribution(distribution): + """Applies all substitution rules that match a distribution.""" + for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items(): + if condition(distribution): + distribution = substitution_fn(distribution) + return distribution + + +# Todo: inherited from asvi code, do we need this? +def register_asvi_substitution_rule(condition, substitution_fn): + """Registers a rule for substituting distributions in ASVI surrogates. + + Args: + condition: Python `callable` that takes a Distribution instance and + returns a Python `bool` indicating whether or not to substitute it. + May also be a class type such as `tfd.Normal`, in which case the + condition is interpreted as + `lambda distribution: isinstance(distribution, class)`. + substitution_fn: Python `callable` that takes a Distribution + instance and returns a new Distribution instance used to define + the ASVI surrogate posterior. Note that this substitution does not modify + the original model. + + #### Example + + To use a Normal surrogate for all location-scale family distributions, we + could register the substitution: + + ```python + tfp.experimental.vi.register_asvi_surrogate_substitution( + condition=lambda distribution: ( + hasattr(distribution, 'loc') and hasattr(distribution, 'scale')) + substitution_fn=lambda distribution: ( + # Invoking the event space bijector applies any relevant constraints, + # e.g., that HalfCauchy samples must be `>= loc`. + distribution.experimental_default_event_space_bijector()( + tfd.Normal(loc=distribution.loc, scale=distribution.scale))) + ``` + + This rule will fire when ASVI encounters a location-scale distribution, + and instructs ASVI to build a surrogate 'as if' the model had just used a + (possibly constrained) Normal in its place. Note that we could have used a + more precise condition, e.g., to limit the substitution to distributions with + a specific `name`, if we had reason to think that a Normal distribution would + be a good surrogate for some model variables but not others. + + """ + global ASVI_SURROGATE_SUBSTITUTIONS + if inspect.isclass(condition): + condition = lambda distribution, cls=condition: isinstance( + # pylint: disable=g-long-lambda + distribution, cls) + ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn + + +# Default substitutions attempt to express distributions using the most +# flexible available parameterization. +# pylint: disable=g-long-lambda +register_asvi_substitution_rule( + half_normal.HalfNormal, + lambda dist: truncated_normal.TruncatedNormal( + loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) +register_asvi_substitution_rule( + uniform.Uniform, + lambda dist: shift.Shift(dist.low)( + scale_lib.Scale(dist.high - dist.low)( + beta.Beta(concentration0=tf.ones_like(dist.mean()), + concentration1=1.)))) +register_asvi_substitution_rule( + exponential.Exponential, + lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) +register_asvi_substitution_rule( + chi2.Chi2, + lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) + + +# pylint: enable=g-long-lambda + +# a single JointDistribution. +def build_cf_surrogate_posterior( + prior, + num_auxiliary_variables=0, + initial_prior_weight=0.5, + seed=None, + name=None): + # todo: change docstrings + """Builds a structured surrogate posterior inspired by conjugate updating. + + ASVI, or Automatic Structured Variational Inference, was proposed by + Ambrogioni et al. (2020) [1] as a method of automatically constructing a + surrogate posterior with the same structure as the prior. It does this by + reparameterizing the variational family of the surrogate posterior by + structuring each parameter according to the equation + ```none + prior_weight * prior_parameter + (1 - prior_weight) * mean_field_parameter + ``` + In this equation, `prior_parameter` is a vector of prior parameters and + `mean_field_parameter` is a vector of trainable parameters with the same + domain as `prior_parameter`. `prior_weight` is a vector of learnable + parameters where `0. <= prior_weight <= 1.`. When `prior_weight = + 0`, the surrogate posterior will be a mean-field surrogate, and when + `prior_weight = 1.`, the surrogate posterior will be the prior. This convex + combination equation, inspired by conjugacy in exponential families, thus + allows the surrogate posterior to balance between the structure of the prior + and the structure of a mean-field approximation. + + Args: + prior: tfd.JointDistribution instance of the prior. + mean_field: Optional Python boolean. If `True`, creates a degenerate + surrogate distribution in which all variables are independent, + ignoring the prior dependence structure. Default value: `False`. + initial_prior_weight: Optional float value (either static or tensor value) + on the interval [0, 1]. A larger value creates an initial surrogate + distribution with more dependence on the prior structure. Default value: + `0.5`. + seed: Python `int` seed for random initialization. + name: Optional string. Default value: `build_cf_surrogate_posterior`. + + Returns: + surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance + whose samples have shape and structure matching that of `prior`. + + Raises: + TypeError: The `prior` argument cannot be a nested `JointDistribution`. + + ### Examples + + Consider a Brownian motion model expressed as a JointDistribution: + + ```python + prior_loc = 0. + innovation_noise = .1 + + def model_fn(): + new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) + for i in range(4): + new = yield tfd.Normal(loc=new, scale=innovation_noise) + + prior = tfd.JointDistributionCoroutineAutoBatched(model_fn) + ``` + + Let's use variational inference to approximate the posterior. We'll build a + surrogate posterior distribution by feeding in the prior distribution. + + ```python + surrogate_posterior = + tfp.experimental.vi.build_cf_surrogate_posterior(prior) + ``` + + This creates a trainable joint distribution, defined by variables in + `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior` + to fit this distribution by minimizing a divergence to the true posterior. + + ```python + losses = tfp.vi.fit_surrogate_posterior( + target_log_prob_fn, + surrogate_posterior=surrogate_posterior, + num_steps=100, + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) + + # After optimization, samples from the surrogate will approximate + # samples from the true posterior. + samples = surrogate_posterior.sample(100) + posterior_mean = [tf.reduce_mean(x) for x in samples] + posterior_std = [tf.math.reduce_std(x) for x in samples] + ``` + + #### References + [1]: Luca Ambrogioni, Max Hinne, Marcel van Gerven. Automatic structured + variational inference. _arXiv preprint arXiv:2002.00643_, 2020 + https://arxiv.org/abs/2002.00643 + + """ + with tf.name_scope(name or 'build_cf_surrogate_posterior'): + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist=prior, + base_distribution_surrogate_fn=functools.partial( + _cf_convex_update_for_base_distribution, + initial_prior_weight=initial_prior_weight, + num_auxiliary_variables=num_auxiliary_variables), + seed=seed) + surrogate_posterior.also_track = variables + return surrogate_posterior + + +def _cf_surrogate_for_distribution(dist, + base_distribution_surrogate_fn, + sample_shape=None, + variables=None, + seed=None): + # todo: change docstrings + """Recursively creates ASVI surrogates, and creates new variables if needed. + + Args: + dist: a `tfd.Distribution` instance. + base_distribution_surrogate_fn: Callable to build a surrogate posterior + for a 'base' (non-meta and non-joint) distribution, with signature + `surrogate_posterior, variables = base_distribution_fn( + dist, sample_shape=None, variables=None, seed=None)`. + sample_shape: Optional `Tensor` shape of samples drawn from `dist` by + `tfd.Sample` wrappers. If not `None`, the surrogate's event will include + independent sample dimensions, i.e., it will have event shape + `concat([sample_shape, dist.event_shape], axis=0)`. + Default value: `None`. + variables: Optional nested structure of `tf.Variable`s returned from a + previous call to `_cf_surrogate_for_distribution`. If `None`, + new variables will be created; otherwise, constructs a surrogate posterior + backed by the passed-in variables. + Default value: `None`. + seed: Python `int` seed for random initialization. + Returns: + surrogate_posterior: Instance of `tfd.Distribution` representing a trainable + surrogate posterior distribution, with the same structure and `name` as + `dist`. + variables: Nested structure of `tf.Variable` trainable parameters for the + surrogate posterior. If `dist` is a base distribution, this is + a `dict` of `ASVIParameters` instances. If `dist` is a joint + distribution, this is a `dist.dtype` structure of such `dict`s. + """ + + # Apply any substitutions, while attempting to preserve the original name. + dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) + + if hasattr(dist, '_model_coroutine'): + surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=variables, + seed=seed) + else: + surrogate_posterior, variables = base_distribution_surrogate_fn( + dist=dist, sample_shape=sample_shape, variables=variables, seed=seed) + return surrogate_posterior, variables + + +def _cf_surrogate_for_joint_distribution( + dist, base_distribution_surrogate_fn, variables=None, seed=None): + """Builds a structured joint surrogate posterior for a joint model.""" + + # Probabilistic program for CF surrogate posterior. + flat_variables = dist._model_flatten( + variables) if variables else None # pylint: disable=protected-access + prior_coroutine = dist._model_coroutine # pylint: disable=protected-access + + def posterior_generator(seed=seed): + prior_gen = prior_coroutine() + dist = next(prior_gen) + i = 0 + try: + while True: + was_root = isinstance(dist, Root) + if was_root: + dist = dist.distribution + + seed, init_seed = samplers.split_seed(seed) + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=flat_variables[i] if flat_variables else None, + seed=init_seed) + + if was_root: + surrogate_posterior = Root(surrogate_posterior) + # If variables were not given---i.e., we're creating new + # variables---then yield the new variables along with the surrogate + # posterior. This assumes an execution context such as + # `_extract_variables_from_coroutine_model` below that will capture and + # save the variables. + value_out = yield (surrogate_posterior if flat_variables + else (surrogate_posterior, variables)) + if type(value_out) == list: + if len(dist.event_shape) == 0: + dist = prior_gen.send(tf.squeeze(value_out[0], -1)) + else: + dist = prior_gen.send(value_out[0]) + + else: + dist = prior_gen.send(value_out) + i += 1 + except StopIteration: + pass + + if variables is None: + # Run the generator to create variables, then call ourselves again + # to construct the surrogate JD from these variables. Note that we can't + # just create a JDC from the current `posterior_generator`, because it will + # try to build new variables on every invocation; the recursive call will + # define a new `posterior_generator` that knows about the variables we're + # about to create. + return _cf_surrogate_for_joint_distribution( + dist=dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=dist._model_unflatten( # pylint: disable=protected-access + _extract_variables_from_coroutine_model( + posterior_generator, seed=seed))) + + # Temporary workaround for bijector caching issues with autobatched JDs. + surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched + if not hasattr(dist, 'use_vectorized_map'): + surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine + surrogate_posterior = surrogate_type(posterior_generator, + name=_get_name(dist)) + + # Ensure that the surrogate posterior structure matches that of the prior. + # todo: check me, do we need this? in case needs to be modified + # if we use auxiliary variables, then the structure won't match the one of the + # prior + '''try: + tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) + except TypeError: + tokenize = lambda jd: jd._model_unflatten( + # pylint: disable=protected-access, g-long-lambda + range(len(jd._model_flatten(jd.dtype))) + # pylint: disable=protected-access + ) + surrogate_posterior = restructure.Restructure( + output_structure=tokenize(dist), + input_structure=tokenize(surrogate_posterior))( + surrogate_posterior, name=_get_name(dist))''' + return surrogate_posterior, variables + + +# todo: sample_shape and seed are not used.. maybe they should? +def _cf_convex_update_for_base_distribution(dist, + initial_prior_weight, + num_auxiliary_variables=0, + sample_shape=None, + variables=None, + seed=None): + """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" + + if variables is None: + actual_event_shape = dist.event_shape_tensor() + int_event_shape = int(actual_event_shape) if \ + actual_event_shape.shape.as_list()[0] > 0 else 1 + layers = 3 + bijectors = [reshape.Reshape([-1], + event_shape_in=actual_event_shape + + num_auxiliary_variables)] + + for _ in range(0, layers - 1): + bijectors.append( + build_highway_flow_layer( + tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + residual_fraction_initial_value=initial_prior_weight, + activation_fn=True, gate_first_n=int_event_shape)) + bijectors.append( + build_highway_flow_layer( + tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + residual_fraction_initial_value=initial_prior_weight, + activation_fn=False, gate_first_n=int_event_shape)) + bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + + variables = chain.Chain(bijectors=list(reversed(bijectors))) + + if num_auxiliary_variables > 0: + cascading_flows = split.Split( + [-1, num_auxiliary_variables])( + transformed_distribution.TransformedDistribution( + distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast( + sample.Sample(normal.Normal(0., .1), num_auxiliary_variables), + to_shape=dist.batch_shape)]), + bijector=variables)) + + else: + cascading_flows = transformed_distribution.TransformedDistribution( + distribution=dist, + bijector=variables) + + return cascading_flows, variables + + +def _extract_variables_from_coroutine_model(model_fn, seed=None): + """Extracts variables from a generator that yields (dist, variables) pairs.""" + gen = model_fn() + try: + dist, dist_variables = next(gen) + flat_variables = [dist_variables] + while True: + seed, local_seed = samplers.split_seed(seed, n=2) + sampled_value = (dist.distribution.sample(seed=local_seed) + if isinstance(dist, Root) + else dist.sample(seed=local_seed)) + dist, dist_variables = gen.send( + sampled_value) # tf.concat(sampled_value, axis=0) + flat_variables.append(dist_variables) + except StopIteration: + pass + return flat_variables + + +def _set_name(dist, name): + """Copies a distribution-like object, replacing its name.""" + if hasattr(dist, 'copy'): + return dist.copy(name=name) + # Some distribution-like entities such as JointDistributionPinned don't + # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set + # the name directly. + dist = copy.copy(dist) + dist._name = name # pylint: disable=protected-access + return dist + + +def _get_name(dist): + """Attempts to get a distribution's short name, excluding the name scope.""" + return getattr(dist, 'parameters', {}).get('name', dist.name) From bbc38a44d1ae1976f03a305e257dd18dd53939e6 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:55:54 +0200 Subject: [PATCH 19/54] Revert "removed cascading_flows from pr" This reverts commit 1620ebd2 --- .../experimental/vi/cascading_flows_test.py | 354 ++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows_test.py diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py new file mode 100644 index 0000000000..9c4393be24 --- /dev/null +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -0,0 +1,354 @@ +# Copyright 2021 The TensorFlow Probability Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""Tests for structured surrogate posteriors.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import tensorflow.compat.v1 as tf1 +import tensorflow.compat.v2 as tf +import tensorflow_probability as tfp +from tensorflow_probability.python.experimental.vi import cascading_flows +from tensorflow_probability.python.internal import prefer_static as ps +from tensorflow_probability.python.internal import test_util + + +tfb = tfp.bijectors +tfd = tfp.distributions + + +@test_util.test_all_tf_execution_regimes +class _TrainableCFSurrogate(object): + + def _expected_num_trainable_variables(self, prior_dist): + """Infers the expected number of trainable variables for a non-nested JD.""" + prior_dists = prior_dist._get_single_sample_distributions() # pylint: disable=protected-access + expected_num_trainable_variables = 0 + for original_dist in prior_dists: + try: + original_dist = original_dist.distribution + except AttributeError: + pass + dist = cascading_flows._as_substituted_distribution(original_dist) + dist_params = dist.parameters + for param, value in dist_params.items(): + if (param not in cascading_flows._NON_STATISTICAL_PARAMS + and value is not None and param not in ('low', 'high')): + # One variable each for prior_weight, mean_field_parameter. + expected_num_trainable_variables += 2 + return expected_num_trainable_variables + + def test_dims_and_gradients(self): + + prior_dist = self.make_prior_dist() + + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist) + + # Test that the correct number of trainable variables are being tracked + self.assertLen(surrogate_posterior.trainable_variables, + self._expected_num_trainable_variables(prior_dist)) + + # Test that the sample shape is correct + three_posterior_samples = surrogate_posterior.sample( + 3, seed=test_util.test_seed(sampler_type='stateless')) + three_prior_samples = prior_dist.sample( + 3, seed=test_util.test_seed(sampler_type='stateless')) + self.assertAllEqualNested( + [s.shape for s in tf.nest.flatten(three_prior_samples)], + [s.shape for s in tf.nest.flatten(three_posterior_samples)]) + + # Test that gradients are available wrt the variational parameters. + posterior_sample = surrogate_posterior.sample( + seed=test_util.test_seed(sampler_type='stateless')) + with tf.GradientTape() as tape: + posterior_logprob = surrogate_posterior.log_prob(posterior_sample) + grad = tape.gradient(posterior_logprob, + surrogate_posterior.trainable_variables) + self.assertTrue(all(g is not None for g in grad)) + + def test_initialization_is_deterministic_following_seed(self): + prior_dist = self.make_prior_dist() + + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist, + seed=test_util.test_seed(sampler_type='stateless')) + self.evaluate( + [v.initializer for v in surrogate_posterior.trainable_variables]) + posterior_sample = surrogate_posterior.sample( + seed=test_util.test_seed(sampler_type='stateless')) + + surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist, + seed=test_util.test_seed(sampler_type='stateless')) + self.evaluate( + [v.initializer for v in surrogate_posterior2.trainable_variables]) + posterior_sample2 = surrogate_posterior2.sample( + seed=test_util.test_seed(sampler_type='stateless')) + + self.assertAllEqualNested(posterior_sample, posterior_sample2) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + + def _prior_model_fn(): + innovation_noise = 0.1 + prior_loc = 0. + new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) + for _ in range(4): + new = yield tfd.Normal(loc=new, scale=innovation_noise) + + return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) + + def make_likelihood_model(self, x, observation_noise): + + def _likelihood_model(): + for i in range(5): + yield tfd.Normal(loc=x[i], scale=observation_noise) + + return tfd.JointDistributionCoroutineAutoBatched(_likelihood_model) + + def get_observations(self, prior_dist): + observation_noise = 0.15 + ground_truth = prior_dist.sample() + likelihood = self.make_likelihood_model( + x=ground_truth, observation_noise=observation_noise) + return likelihood.sample(1) + + def get_target_log_prob(self, observations, prior_dist): + + def target_log_prob(*x): + observation_noise = 0.15 + likelihood_dist = self.make_likelihood_model( + x=x, observation_noise=observation_noise) + return likelihood_dist.log_prob(observations) + prior_dist.log_prob(x) + + return target_log_prob + + def test_fitting_surrogate_posterior(self): + + prior_dist = self.make_prior_dist() + observations = self.get_observations(prior_dist) + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + prior=prior_dist) + target_log_prob = self.get_target_log_prob(observations, prior_dist) + + # Test vi fit surrogate posterior works + losses = tfp.vi.fit_surrogate_posterior( + target_log_prob, + surrogate_posterior, + num_steps=5, # Don't optimize to completion. + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) + + # Compute posterior statistics. + with tf.control_dependencies([losses]): + posterior_samples = surrogate_posterior.sample(100) + posterior_mean = tf.nest.map_structure(tf.reduce_mean, posterior_samples) + posterior_stddev = tf.nest.map_structure(tf.math.reduce_std, + posterior_samples) + + self.evaluate(tf1.global_variables_initializer()) + _ = self.evaluate(losses) + _ = self.evaluate(posterior_mean) + _ = self.evaluate(posterior_stddev) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestEightSchools(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + treatment_effects = tf.constant([28, 8, -3, 7, -1, 1, 18, 12], + dtype=tf.float32) + num_schools = ps.shape(treatment_effects)[-1] + + return tfd.JointDistributionNamed({ + 'avg_effect': + tfd.Normal(loc=0., scale=10., name='avg_effect'), + 'log_stddev': + tfd.Normal(loc=5., scale=1., name='log_stddev'), + 'school_effects': + lambda log_stddev, avg_effect: ( # pylint: disable=g-long-lambda + tfd.Independent( + tfd.Normal( + loc=avg_effect[..., None] * tf.ones(num_schools), + scale=tf.exp(log_stddev[..., None]) * tf.ones( + num_schools), + name='school_effects'), + reinterpreted_batch_ndims=1)) + }) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestEightSchoolsSample(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + + return tfd.JointDistributionNamed({ + 'avg_effect': + tfd.Normal(loc=0., scale=10., name='avg_effect'), + 'log_stddev': + tfd.Normal(loc=5., scale=1., name='log_stddev'), + 'school_effects': + lambda log_stddev, avg_effect: ( # pylint: disable=g-long-lambda + tfd.Sample( + tfd.Normal( + loc=avg_effect[..., None], + scale=tf.exp(log_stddev[..., None]), + name='school_effects'), + sample_shape=[8])) + }) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestHalfNormal(test_util.TestCase, + _TrainableCFSurrogate): + + def make_prior_dist(self): + + def _prior_model_fn(): + innovation_noise = 1. + yield tfd.HalfNormal( + scale=innovation_noise, validate_args=True, allow_nan_stats=False) + + return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestDiscreteLatent( + test_util.TestCase, _TrainableCFSurrogate): + + def make_prior_dist(self): + + def _prior_model_fn(): + a = yield tfd.Bernoulli(logits=0.5, name='a') + yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1., + scale=1., name='b') + + return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) + + +@test_util.test_all_tf_execution_regimes +class CFSurrogatePosteriorTestNesting(test_util.TestCase, + _TrainableCFSurrogate): + + def _expected_num_trainable_variables(self, _): + # Nested distributions have total of 10 params after Exponential->Gamma + # substitution, multiplied by 2 variables per param. + return 20 + + def make_prior_dist(self): + + def nested_model(): + a = yield tfd.Sample( + tfd.Sample( + tfd.Normal(0., 1.), + sample_shape=4), + sample_shape=[2], + name='a') + b = yield tfb.Sigmoid()( + tfb.Square()( + tfd.Exponential(rate=tf.exp(a))), + name='b') + # pylint: disable=g-long-lambda + yield tfd.JointDistributionSequential( + [tfd.Laplace(loc=a, scale=b), + lambda c1: tfd.Independent( + tfd.Beta(concentration1=1., + concentration0=tf.nn.softplus(c1)), + reinterpreted_batch_ndims=1), + lambda c1, c2: tfd.JointDistributionNamed({ + 'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)}) + ], name='c') + # pylint: enable=g-long-lambda + + return tfd.JointDistributionCoroutineAutoBatched(nested_model) + + +@test_util.test_all_tf_execution_regimes +class TestCFDistributionSubstitution(test_util.TestCase): + + def test_default_substitutes_trainable_families(self): + + @tfd.JointDistributionCoroutineAutoBatched + def model(): + yield tfd.Sample( + tfd.Uniform(low=-2., high=7.), + sample_shape=[2], + name='a') + yield tfd.HalfNormal(1., name='b') + yield tfd.Exponential(rate=[1., 2.], name='c') + yield tfd.Chi2(df=3., name='d') + + surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( + model) + self.assertAllEqualNested(model.event_shape, surrogate.event_shape) + + surrogate_dists, _ = surrogate.sample_distributions() + self.assertIsInstance(surrogate_dists.a, tfd.Independent) + self.assertIsInstance(surrogate_dists.a.distribution, + tfd.TransformedDistribution) + self.assertIsInstance(surrogate_dists.a.distribution.distribution, + tfd.Beta) + self.assertIsInstance(surrogate_dists.b, tfd.TruncatedNormal) + self.assertIsInstance(surrogate_dists.c, tfd.Gamma) + self.assertIsInstance(surrogate_dists.d, tfd.Gamma) + + def test_can_specify_custom_substitution(self): + + @tfd.JointDistributionCoroutineAutoBatched + def centered_horseshoe(ndims=100): + global_scale = yield tfd.HalfCauchy( + loc=0., scale=1., name='global_scale') + local_scale = yield tfd.HalfCauchy( + loc=0., scale=tf.ones([ndims]), name='local_scale') + yield tfd.Normal( + loc=0., scale=tf.sqrt(global_scale * local_scale), name='weights') + + tfp.experimental.vi.register_asvi_substitution_rule( + condition=tfd.HalfCauchy, + substitution_fn=( + lambda d: tfb.Softplus(1e-6)(tfd.Normal(loc=d.loc, scale=d.scale)))) + surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( + centered_horseshoe) + self.assertAllEqualNested(centered_horseshoe.event_shape, + surrogate.event_shape) + + # If the surrogate was built with names or structure differing from the + # model, so that it had to be `tfb.Restructure`'d, then this + # sample_distributions call will fail because the surrogate isn't an + # instance of tfd.JointDistribution. + surrogate_dists, _ = surrogate.sample_distributions() + self.assertIsInstance(surrogate_dists.global_scale.distribution, + tfd.Normal) + self.assertIsInstance(surrogate_dists.local_scale.distribution, + tfd.Normal) + self.assertIsInstance(surrogate_dists.weights, tfd.Normal) + +# TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions. +# TODO(kateslin): Add an ASVI surrogate posterior test with for a model with +# missing observations. + +if __name__ == '__main__': + tf.test.main() From a89d60a746ef75c4e0acb7c872c0c5b290703e9d Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 10:56:49 +0200 Subject: [PATCH 20/54] reverted to latest version --- .../python/experimental/vi/cascading_flows.py | 101 ++++++++++++++---- 1 file changed, 80 insertions(+), 21 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index d8c9393d8e..61dcce7236 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -25,21 +25,20 @@ import tensorflow.compat.v2 as tf -from tensorflow_probability.python.experimental.bijectors import \ - build_highway_flow_layer from tensorflow_probability.python.bijectors import chain from tensorflow_probability.python.bijectors import reshape from tensorflow_probability.python.bijectors import scale as scale_lib from tensorflow_probability.python.bijectors import shift from tensorflow_probability.python.bijectors import split - from tensorflow_probability.python.distributions import batch_broadcast from tensorflow_probability.python.distributions import beta from tensorflow_probability.python.distributions import blockwise from tensorflow_probability.python.distributions import chi2 +from tensorflow_probability.python.distributions import deterministic from tensorflow_probability.python.distributions import exponential from tensorflow_probability.python.distributions import gamma from tensorflow_probability.python.distributions import half_normal +from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import \ joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ @@ -49,10 +48,12 @@ from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.distributions import truncated_normal from tensorflow_probability.python.distributions import uniform +from tensorflow_probability.python.experimental.bijectors import \ + build_highway_flow_layer from tensorflow_probability.python.internal import samplers __all__ = [ - 'register_asvi_substitution_rule', + 'register_cf_substitution_rule', 'build_cf_surrogate_posterior' ] @@ -83,7 +84,7 @@ def _as_substituted_distribution(distribution): # Todo: inherited from asvi code, do we need this? -def register_asvi_substitution_rule(condition, substitution_fn): +def register_cf_substitution_rule(condition, substitution_fn): """Registers a rule for substituting distributions in ASVI surrogates. Args: @@ -132,20 +133,20 @@ def register_asvi_substitution_rule(condition, substitution_fn): # Default substitutions attempt to express distributions using the most # flexible available parameterization. # pylint: disable=g-long-lambda -register_asvi_substitution_rule( +register_cf_substitution_rule( half_normal.HalfNormal, lambda dist: truncated_normal.TruncatedNormal( loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) -register_asvi_substitution_rule( +register_cf_substitution_rule( uniform.Uniform, lambda dist: shift.Shift(dist.low)( scale_lib.Scale(dist.high - dist.low)( beta.Beta(concentration0=tf.ones_like(dist.mean()), concentration1=1.)))) -register_asvi_substitution_rule( +register_cf_substitution_rule( exponential.Exponential, lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) -register_asvi_substitution_rule( +register_cf_substitution_rule( chi2.Chi2, lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) @@ -255,6 +256,7 @@ def model_fn(): _cf_convex_update_for_base_distribution, initial_prior_weight=initial_prior_weight, num_auxiliary_variables=num_auxiliary_variables), + num_auxiliary_variables=num_auxiliary_variables, seed=seed) surrogate_posterior.also_track = variables return surrogate_posterior @@ -264,6 +266,8 @@ def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn, sample_shape=None, variables=None, + num_auxiliary_variables=0, + global_auxiliary_variables=None, seed=None): # todo: change docstrings """Recursively creates ASVI surrogates, and creates new variables if needed. @@ -303,15 +307,19 @@ def _cf_surrogate_for_distribution(dist, dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=variables, + num_auxiliary_variables=num_auxiliary_variables, + global_auxiliary_variables=global_auxiliary_variables, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, seed=seed) + dist=dist, sample_shape=sample_shape, variables=variables, + global_auxiliary_variables=global_auxiliary_variables, seed=seed) return surrogate_posterior, variables def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables=None, seed=None): + dist, base_distribution_surrogate_fn, variables=None, + num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): """Builds a structured joint surrogate posterior for a joint model.""" # Probabilistic program for CF surrogate posterior. @@ -322,7 +330,46 @@ def _cf_surrogate_for_joint_distribution( def posterior_generator(seed=seed): prior_gen = prior_coroutine() dist = next(prior_gen) - i = 0 + + if num_auxiliary_variables > 0: + i = 1 + + if flat_variables: + variables = flat_variables[0] + + else: + layers = 3 + bijectors = [] + + for _ in range(0, layers - 1): + bijectors.append( + build_highway_flow_layer(num_auxiliary_variables, + residual_fraction_initial_value=0.5, + activation_fn=True, gate_first_n=0, + seed=seed)) + bijectors.append( + build_highway_flow_layer(num_auxiliary_variables, + residual_fraction_initial_value=0.5, + activation_fn=False, gate_first_n=0, + seed=seed)) + + variables = chain.Chain(bijectors=list(reversed(bijectors))) + + eps = transformed_distribution.TransformedDistribution( + distribution=sample.Sample(normal.Normal(0., 0.1), + num_auxiliary_variables), + bijector=variables) + + eps = Root(eps) + + value_out = yield (eps if flat_variables + else (eps, variables)) + + global_auxiliary_variables = value_out + + else: + i = 0 + try: while True: was_root = isinstance(dist, Root) @@ -334,9 +381,10 @@ def posterior_generator(seed=seed): dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=flat_variables[i] if flat_variables else None, + global_auxiliary_variables=global_auxiliary_variables, seed=init_seed) - if was_root: + if was_root and num_auxiliary_variables == 0: surrogate_posterior = Root(surrogate_posterior) # If variables were not given---i.e., we're creating new # variables---then yield the new variables along with the surrogate @@ -367,6 +415,8 @@ def posterior_generator(seed=seed): return _cf_surrogate_for_joint_distribution( dist=dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + global_auxiliary_variables=global_auxiliary_variables, variables=dist._model_unflatten( # pylint: disable=protected-access _extract_variables_from_coroutine_model( posterior_generator, seed=seed))) @@ -401,6 +451,7 @@ def posterior_generator(seed=seed): def _cf_convex_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables=0, + global_auxiliary_variables=None, sample_shape=None, variables=None, seed=None): @@ -412,31 +463,39 @@ def _cf_convex_update_for_base_distribution(dist, actual_event_shape.shape.as_list()[0] > 0 else 1 layers = 3 bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + event_shape_in=actual_event_shape + + num_auxiliary_variables)] for _ in range(0, layers - 1): bijectors.append( build_highway_flow_layer( tf.reduce_prod(actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=True, gate_first_n=int_event_shape)) + activation_fn=True, gate_first_n=int_event_shape, seed=seed)) bijectors.append( build_highway_flow_layer( tf.reduce_prod(actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=False, gate_first_n=int_event_shape)) - bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + activation_fn=False, gate_first_n=int_event_shape, seed=seed)) + bijectors.append( + reshape.Reshape(actual_event_shape + num_auxiliary_variables)) variables = chain.Chain(bijectors=list(reversed(bijectors))) if num_auxiliary_variables > 0: + batch_shape = global_auxiliary_variables.shape[0] if len( + global_auxiliary_variables.shape) > 1 else [] + cascading_flows = split.Split( [-1, num_auxiliary_variables])( transformed_distribution.TransformedDistribution( - distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast( - sample.Sample(normal.Normal(0., .1), num_auxiliary_variables), - to_shape=dist.batch_shape)]), + distribution=blockwise.Blockwise([ + batch_broadcast.BatchBroadcast(dist, + to_shape=batch_shape), + independent.Independent( + deterministic.Deterministic( + global_auxiliary_variables), + reinterpreted_batch_ndims=1)]), bijector=variables)) else: From ea80d7bac076dfd3b971bcbfb0a31653d03b7ac0 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 14 May 2021 11:22:31 +0200 Subject: [PATCH 21/54] fixed surrogate posterior type --- .../python/experimental/vi/cascading_flows.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 61dcce7236..95c7cf5faf 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -39,8 +39,7 @@ from tensorflow_probability.python.distributions import gamma from tensorflow_probability.python.distributions import half_normal from tensorflow_probability.python.distributions import independent -from tensorflow_probability.python.distributions import \ - joint_distribution_auto_batched +from tensorflow_probability.python.distributions import joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ joint_distribution_coroutine from tensorflow_probability.python.distributions import normal @@ -422,11 +421,10 @@ def posterior_generator(seed=seed): posterior_generator, seed=seed))) # Temporary workaround for bijector caching issues with autobatched JDs. - surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched - if not hasattr(dist, 'use_vectorized_map'): - surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine - surrogate_posterior = surrogate_type(posterior_generator, - name=_get_name(dist)) + surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified From cf11c70ac38c55f49cec1132e6e38d7faa941d23 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 10:47:34 +0200 Subject: [PATCH 22/54] small fixes --- .../python/experimental/vi/cascading_flows.py | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 95c7cf5faf..a9735f3739 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -156,7 +156,7 @@ def register_cf_substitution_rule(condition, substitution_fn): def build_cf_surrogate_posterior( prior, num_auxiliary_variables=0, - initial_prior_weight=0.5, + initial_prior_weight=0.98, seed=None, name=None): # todo: change docstrings @@ -311,14 +311,12 @@ def _cf_surrogate_for_distribution(dist, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, - global_auxiliary_variables=global_auxiliary_variables, seed=seed) + dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed) return surrogate_posterior, variables def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables=None, - num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): + dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): """Builds a structured joint surrogate posterior for a joint model.""" # Probabilistic program for CF surrogate posterior. @@ -343,19 +341,17 @@ def posterior_generator(seed=seed): for _ in range(0, layers - 1): bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.5, - activation_fn=True, gate_first_n=0, - seed=seed)) + residual_fraction_initial_value=0.98, + activation_fn=True, gate_first_n=0, seed=seed)) bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.5, - activation_fn=False, gate_first_n=0, - seed=seed)) + residual_fraction_initial_value=0.98, + activation_fn=False, gate_first_n=0, seed=seed)) variables = chain.Chain(bijectors=list(reversed(bijectors))) eps = transformed_distribution.TransformedDistribution( - distribution=sample.Sample(normal.Normal(0., 0.1), + distribution=sample.Sample(normal.Normal(0., 1.), num_auxiliary_variables), bijector=variables) @@ -380,7 +376,7 @@ def posterior_generator(seed=seed): dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=flat_variables[i] if flat_variables else None, - global_auxiliary_variables=global_auxiliary_variables, + global_auxiliary_variables = global_auxiliary_variables, seed=init_seed) if was_root and num_auxiliary_variables == 0: @@ -422,9 +418,9 @@ def posterior_generator(seed=seed): # Temporary workaround for bijector caching issues with autobatched JDs. surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( - posterior_generator, - use_vectorized_map=dist.use_vectorized_map, - name=_get_name(dist)) + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified @@ -461,8 +457,8 @@ def _cf_convex_update_for_base_distribution(dist, actual_event_shape.shape.as_list()[0] > 0 else 1 layers = 3 bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + event_shape_in=actual_event_shape + + num_auxiliary_variables)] for _ in range(0, layers - 1): bijectors.append( @@ -475,8 +471,7 @@ def _cf_convex_update_for_base_distribution(dist, tf.reduce_prod(actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, activation_fn=False, gate_first_n=int_event_shape, seed=seed)) - bijectors.append( - reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) variables = chain.Chain(bijectors=list(reversed(bijectors))) @@ -489,7 +484,7 @@ def _cf_convex_update_for_base_distribution(dist, transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), + to_shape=batch_shape), independent.Independent( deterministic.Deterministic( global_auxiliary_variables), From d9e28288bb82fce607a57a0b796e1e627abecd9a Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 11:09:26 +0200 Subject: [PATCH 23/54] fixed global variables if no auxiliary variabled --- tensorflow_probability/python/experimental/vi/cascading_flows.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index a9735f3739..ef9f6f78da 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -363,6 +363,7 @@ def posterior_generator(seed=seed): global_auxiliary_variables = value_out else: + global_auxiliary_variables = None i = 0 try: From 80e8ee7be1de52270cccff87a99cbecc1e677841 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 11:51:56 +0200 Subject: [PATCH 24/54] added number of layers parameter --- .../python/experimental/vi/cascading_flows.py | 81 ++++++++++++------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index ef9f6f78da..9b430bf6a6 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -39,16 +39,17 @@ from tensorflow_probability.python.distributions import gamma from tensorflow_probability.python.distributions import half_normal from tensorflow_probability.python.distributions import independent -from tensorflow_probability.python.distributions import joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ - joint_distribution_coroutine + joint_distribution_auto_batched +from tensorflow_probability.python.distributions import \ + joint_distribution_coroutine from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.distributions import sample from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.distributions import truncated_normal from tensorflow_probability.python.distributions import uniform from tensorflow_probability.python.experimental.bijectors import \ - build_highway_flow_layer + build_highway_flow_layer from tensorflow_probability.python.internal import samplers __all__ = [ @@ -157,6 +158,7 @@ def build_cf_surrogate_posterior( prior, num_auxiliary_variables=0, initial_prior_weight=0.98, + num_layers=3, seed=None, name=None): # todo: change docstrings @@ -254,8 +256,10 @@ def model_fn(): base_distribution_surrogate_fn=functools.partial( _cf_convex_update_for_base_distribution, initial_prior_weight=initial_prior_weight, - num_auxiliary_variables=num_auxiliary_variables), + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers), num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, seed=seed) surrogate_posterior.also_track = variables return surrogate_posterior @@ -263,9 +267,10 @@ def model_fn(): def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn, + num_auxiliary_variables, + num_layers, sample_shape=None, variables=None, - num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): # todo: change docstrings @@ -307,16 +312,22 @@ def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=variables, num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, global_auxiliary_variables=global_auxiliary_variables, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed) + dist=dist, sample_shape=sample_shape, variables=variables, + global_auxiliary_variables=global_auxiliary_variables, + num_layers=num_layers, + seed=seed) return surrogate_posterior, variables def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None): + dist, base_distribution_surrogate_fn, variables, + num_auxiliary_variables, num_layers, global_auxiliary_variables, + seed=None): """Builds a structured joint surrogate posterior for a joint model.""" # Probabilistic program for CF surrogate posterior. @@ -335,18 +346,17 @@ def posterior_generator(seed=seed): variables = flat_variables[0] else: - layers = 3 bijectors = [] - for _ in range(0, layers - 1): + for _ in range(0, num_layers - 1): bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.98, - activation_fn=True, gate_first_n=0, seed=seed)) + activation_fn=True, + gate_first_n=0, seed=seed)) bijectors.append( build_highway_flow_layer(num_auxiliary_variables, - residual_fraction_initial_value=0.98, - activation_fn=False, gate_first_n=0, seed=seed)) + activation_fn=False, + gate_first_n=0, seed=seed)) variables = chain.Chain(bijectors=list(reversed(bijectors))) @@ -376,8 +386,10 @@ def posterior_generator(seed=seed): surrogate_posterior, variables = _cf_surrogate_for_distribution( dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, variables=flat_variables[i] if flat_variables else None, - global_auxiliary_variables = global_auxiliary_variables, + global_auxiliary_variables=global_auxiliary_variables, seed=init_seed) if was_root and num_auxiliary_variables == 0: @@ -412,16 +424,18 @@ def posterior_generator(seed=seed): dist=dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, global_auxiliary_variables=global_auxiliary_variables, - variables=dist._model_unflatten( # pylint: disable=protected-access + variables=dist._model_unflatten( + # pylint: disable=protected-access _extract_variables_from_coroutine_model( posterior_generator, seed=seed))) # Temporary workaround for bijector caching issues with autobatched JDs. surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( - posterior_generator, - use_vectorized_map=dist.use_vectorized_map, - name=_get_name(dist)) + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified @@ -445,10 +459,11 @@ def posterior_generator(seed=seed): # todo: sample_shape and seed are not used.. maybe they should? def _cf_convex_update_for_base_distribution(dist, initial_prior_weight, - num_auxiliary_variables=0, - global_auxiliary_variables=None, + num_auxiliary_variables, + num_layers, + global_auxiliary_variables, + variables, sample_shape=None, - variables=None, seed=None): """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" @@ -456,23 +471,27 @@ def _cf_convex_update_for_base_distribution(dist, actual_event_shape = dist.event_shape_tensor() int_event_shape = int(actual_event_shape) if \ actual_event_shape.shape.as_list()[0] > 0 else 1 - layers = 3 bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + event_shape_in=actual_event_shape + + num_auxiliary_variables)] - for _ in range(0, layers - 1): + for _ in range(0, num_layers - 1): bijectors.append( build_highway_flow_layer( - tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + tf.reduce_prod( + actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=True, gate_first_n=int_event_shape, seed=seed)) + activation_fn=True, gate_first_n=int_event_shape, + seed=seed)) bijectors.append( build_highway_flow_layer( - tf.reduce_prod(actual_event_shape + num_auxiliary_variables), + tf.reduce_prod( + actual_event_shape + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - activation_fn=False, gate_first_n=int_event_shape, seed=seed)) - bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + activation_fn=False, gate_first_n=int_event_shape, + seed=seed)) + bijectors.append( + reshape.Reshape(actual_event_shape + num_auxiliary_variables)) variables = chain.Chain(bijectors=list(reversed(bijectors))) @@ -485,7 +504,7 @@ def _cf_convex_update_for_base_distribution(dist, transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), + to_shape=batch_shape), independent.Independent( deterministic.Deterministic( global_auxiliary_variables), From 13602296347ccea0763cea8af8b729fc8c100226 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 18 May 2021 11:59:21 +0200 Subject: [PATCH 25/54] readded highway flow --- .../python/experimental/bijectors/BUILD | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD index 9f7afce0b8..6befb7bf81 100644 --- a/tensorflow_probability/python/experimental/bijectors/BUILD +++ b/tensorflow_probability/python/experimental/bijectors/BUILD @@ -148,6 +148,20 @@ multi_substrate_py_library( ], ) +multi_substrate_py_library( + name = "highway_flow", + srcs = ["highway_flow.py"], + srcs_version = "PY3", + deps = [ + ":scalar_function_with_inferred_inverse", + # numpy dep, + # tensorflow dep, + "//tensorflow_probability/python/bijectors", + "//tensorflow_probability/python/util", + "//tensorflow_probability/python/internal:samplers", + ], +) + multi_substrate_py_test( name = "sharded_test", size = "medium", @@ -164,3 +178,18 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:test_util", ], ) + +multi_substrate_py_test( + name = "highway_flow_test", + size = "medium", + srcs = ["highway_flow_test.py"], + jax_size = "medium", + python_version = "PY3", + srcs_version = "PY3", + deps = [ + # numpy dep + # tensorflow dep, + "//tensorflow_probability", + "//tensorflow_probability/python/internal:test_util", + ], +) \ No newline at end of file From e1a22184a9cb454fc472ffc47d46c82241f4f5e0 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 10:05:36 +0200 Subject: [PATCH 26/54] fixed init --- tensorflow_probability/python/experimental/vi/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py index e18c8d3455..cc5530300a 100644 --- a/tensorflow_probability/python/experimental/vi/__init__.py +++ b/tensorflow_probability/python/experimental/vi/__init__.py @@ -17,6 +17,7 @@ from tensorflow_probability.python.experimental.vi import util from tensorflow_probability.python.experimental.vi.automatic_structured_vi import build_asvi_surrogate_posterior from tensorflow_probability.python.experimental.vi.automatic_structured_vi import register_asvi_substitution_rule +from tensorflow_probability.python.experimental.vi.cascading_flows import build_cf_surrogate_posterior from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior_from_base_distribution from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_factored_surrogate_posterior @@ -29,7 +30,7 @@ 'build_affine_surrogate_posterior', 'build_affine_surrogate_posterior_from_base_distribution', 'build_asvi_surrogate_posterior', - 'builf_cf_surrogate_posterior' + 'build_cf_surrogate_posterior', 'build_factored_surrogate_posterior', 'build_split_flow_surrogate_posterior', 'build_trainable_location_scale_distribution', From 4f667ee1f89daa467ef3d7e25cb3f536d6990061 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 10:08:45 +0200 Subject: [PATCH 27/54] working on tests --- .../experimental/vi/cascading_flows_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 9c4393be24..598d3fd66e 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -30,7 +30,7 @@ tfb = tfp.bijectors tfd = tfp.distributions - + # test_util.test_seed(sampler_type='stateless')) @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): @@ -65,7 +65,7 @@ def test_dims_and_gradients(self): self._expected_num_trainable_variables(prior_dist)) # Test that the sample shape is correct - three_posterior_samples = surrogate_posterior.sample( + '''three_posterior_samples = surrogate_posterior.sample( 3, seed=test_util.test_seed(sampler_type='stateless')) three_prior_samples = prior_dist.sample( 3, seed=test_util.test_seed(sampler_type='stateless')) @@ -74,15 +74,15 @@ def test_dims_and_gradients(self): [s.shape for s in tf.nest.flatten(three_posterior_samples)]) # Test that gradients are available wrt the variational parameters. - posterior_sample = surrogate_posterior.sample( - seed=test_util.test_seed(sampler_type='stateless')) + posterior_sample = surrogate_posterior.sample( + seed=1) with tf.GradientTape() as tape: posterior_logprob = surrogate_posterior.log_prob(posterior_sample) grad = tape.gradient(posterior_logprob, surrogate_posterior.trainable_variables) - self.assertTrue(all(g is not None for g in grad)) + self.assertTrue(all(g is not None for g in grad))''' - def test_initialization_is_deterministic_following_seed(self): + '''def test_initialization_is_deterministic_following_seed(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( @@ -101,7 +101,7 @@ def test_initialization_is_deterministic_following_seed(self): posterior_sample2 = surrogate_posterior2.sample( seed=test_util.test_seed(sampler_type='stateless')) - self.assertAllEqualNested(posterior_sample, posterior_sample2) + self.assertAllEqualNested(posterior_sample, posterior_sample2)''' @test_util.test_all_tf_execution_regimes @@ -144,7 +144,7 @@ def target_log_prob(*x): return target_log_prob - def test_fitting_surrogate_posterior(self): + '''def test_fitting_surrogate_posterior(self): prior_dist = self.make_prior_dist() observations = self.get_observations(prior_dist) @@ -170,7 +170,7 @@ def test_fitting_surrogate_posterior(self): self.evaluate(tf1.global_variables_initializer()) _ = self.evaluate(losses) _ = self.evaluate(posterior_mean) - _ = self.evaluate(posterior_stddev) + _ = self.evaluate(posterior_stddev)''' @test_util.test_all_tf_execution_regimes From 75d8b53ebb000c50f262c7dce905c1989cc51ea5 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 20 May 2021 11:39:55 +0200 Subject: [PATCH 28/54] more testing --- .../experimental/vi/cascading_flows_test.py | 57 +++++++------------ 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 598d3fd66e..b52e1e5f77 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -35,73 +35,59 @@ @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): - def _expected_num_trainable_variables(self, prior_dist): + def _expected_num_trainable_variables(self, prior_dist, num_layers): """Infers the expected number of trainable variables for a non-nested JD.""" prior_dists = prior_dist._get_single_sample_distributions() # pylint: disable=protected-access expected_num_trainable_variables = 0 + + # For each distribution in the prior, we will have one highway flow with + # `num_layers` blocks, and each block has 4 trainable variables: + # `residual_fraction`, `lower_diagonal_weights_matrix`, + # `upper_diagonal_weights_matrix` and `bias`. for original_dist in prior_dists: - try: - original_dist = original_dist.distribution - except AttributeError: - pass - dist = cascading_flows._as_substituted_distribution(original_dist) - dist_params = dist.parameters - for param, value in dist_params.items(): - if (param not in cascading_flows._NON_STATISTICAL_PARAMS - and value is not None and param not in ('low', 'high')): - # One variable each for prior_weight, mean_field_parameter. - expected_num_trainable_variables += 2 + expected_num_trainable_variables += (4 * num_layers) return expected_num_trainable_variables def test_dims_and_gradients(self): prior_dist = self.make_prior_dist() - + num_layers = 3 surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( - prior=prior_dist) + prior=prior_dist, num_layers=num_layers) # Test that the correct number of trainable variables are being tracked self.assertLen(surrogate_posterior.trainable_variables, - self._expected_num_trainable_variables(prior_dist)) + self._expected_num_trainable_variables(prior_dist, num_layers)) # Test that the sample shape is correct - '''three_posterior_samples = surrogate_posterior.sample( - 3, seed=test_util.test_seed(sampler_type='stateless')) + three_posterior_samples = surrogate_posterior.sample( + 3, seed=1) three_prior_samples = prior_dist.sample( - 3, seed=test_util.test_seed(sampler_type='stateless')) + 3, seed=1) self.assertAllEqualNested( [s.shape for s in tf.nest.flatten(three_prior_samples)], [s.shape for s in tf.nest.flatten(three_posterior_samples)]) - # Test that gradients are available wrt the variational parameters. - posterior_sample = surrogate_posterior.sample( - seed=1) - with tf.GradientTape() as tape: - posterior_logprob = surrogate_posterior.log_prob(posterior_sample) - grad = tape.gradient(posterior_logprob, - surrogate_posterior.trainable_variables) - self.assertTrue(all(g is not None for g in grad))''' - - '''def test_initialization_is_deterministic_following_seed(self): + def test_initialization_is_deterministic_following_seed(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) self.evaluate( [v.initializer for v in surrogate_posterior.trainable_variables]) posterior_sample = surrogate_posterior.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) self.evaluate( [v.initializer for v in surrogate_posterior2.trainable_variables]) posterior_sample2 = surrogate_posterior2.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=1) - self.assertAllEqualNested(posterior_sample, posterior_sample2)''' + self.assertAllEqualNested(posterior_sample, posterior_sample2) @test_util.test_all_tf_execution_regimes @@ -172,7 +158,6 @@ def target_log_prob(*x): _ = self.evaluate(posterior_mean) _ = self.evaluate(posterior_stddev)''' - @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestEightSchools(test_util.TestCase, _TrainableCFSurrogate): @@ -235,7 +220,7 @@ def _prior_model_fn(): return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) -@test_util.test_all_tf_execution_regimes +'''@test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestDiscreteLatent( test_util.TestCase, _TrainableCFSurrogate): @@ -344,7 +329,7 @@ def centered_horseshoe(ndims=100): tfd.Normal) self.assertIsInstance(surrogate_dists.local_scale.distribution, tfd.Normal) - self.assertIsInstance(surrogate_dists.weights, tfd.Normal) + self.assertIsInstance(surrogate_dists.weights, tfd.Normal)''' # TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions. # TODO(kateslin): Add an ASVI surrogate posterior test with for a model with From 65f5adbafb7057d4712bb8adc1f0dc49c87f0431 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 27 May 2021 13:34:20 +0200 Subject: [PATCH 29/54] small refsctoring and changed docstriings --- .../python/experimental/vi/cascading_flows.py | 890 +++++++++--------- 1 file changed, 471 insertions(+), 419 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 9b430bf6a6..8c9f48222c 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -40,29 +40,33 @@ from tensorflow_probability.python.distributions import half_normal from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import \ - joint_distribution_auto_batched + joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ - joint_distribution_coroutine + joint_distribution_coroutine from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.distributions import sample from tensorflow_probability.python.distributions import transformed_distribution from tensorflow_probability.python.distributions import truncated_normal from tensorflow_probability.python.distributions import uniform from tensorflow_probability.python.experimental.bijectors import \ - build_highway_flow_layer + build_trainable_highway_flow from tensorflow_probability.python.internal import samplers __all__ = [ - 'register_cf_substitution_rule', - 'build_cf_surrogate_posterior' + 'register_cf_substitution_rule', + 'build_cf_surrogate_posterior' ] Root = joint_distribution_coroutine.JointDistributionCoroutine.Root +# TODO: the part until the function build_cf_surrogate_posterior is identical to +# the one in automatic_structured_vi. Should we remove it from this file and +# import them directly from automatic_structured_vi? + _NON_STATISTICAL_PARAMS = [ - 'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum', - 'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support', - 'num_probit_terms_approx' + 'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum', + 'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support', + 'num_probit_terms_approx' ] _NON_TRAINABLE_PARAMS = ['low', 'high'] @@ -70,393 +74,448 @@ # before defining the surrogate family. -# Todo: inherited from asvi code, do we need this? ASVI_SURROGATE_SUBSTITUTIONS = {} -# Todo: inherited from asvi code, do we need this? def _as_substituted_distribution(distribution): - """Applies all substitution rules that match a distribution.""" - for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items(): - if condition(distribution): - distribution = substitution_fn(distribution) - return distribution + """Applies all substitution rules that match a distribution.""" + for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items(): + if condition(distribution): + distribution = substitution_fn(distribution) + return distribution -# Todo: inherited from asvi code, do we need this? def register_cf_substitution_rule(condition, substitution_fn): - """Registers a rule for substituting distributions in ASVI surrogates. - - Args: - condition: Python `callable` that takes a Distribution instance and - returns a Python `bool` indicating whether or not to substitute it. - May also be a class type such as `tfd.Normal`, in which case the - condition is interpreted as - `lambda distribution: isinstance(distribution, class)`. - substitution_fn: Python `callable` that takes a Distribution - instance and returns a new Distribution instance used to define - the ASVI surrogate posterior. Note that this substitution does not modify - the original model. - - #### Example - - To use a Normal surrogate for all location-scale family distributions, we - could register the substitution: - - ```python - tfp.experimental.vi.register_asvi_surrogate_substitution( - condition=lambda distribution: ( - hasattr(distribution, 'loc') and hasattr(distribution, 'scale')) - substitution_fn=lambda distribution: ( - # Invoking the event space bijector applies any relevant constraints, - # e.g., that HalfCauchy samples must be `>= loc`. - distribution.experimental_default_event_space_bijector()( - tfd.Normal(loc=distribution.loc, scale=distribution.scale))) - ``` - - This rule will fire when ASVI encounters a location-scale distribution, - and instructs ASVI to build a surrogate 'as if' the model had just used a - (possibly constrained) Normal in its place. Note that we could have used a - more precise condition, e.g., to limit the substitution to distributions with - a specific `name`, if we had reason to think that a Normal distribution would - be a good surrogate for some model variables but not others. - - """ - global ASVI_SURROGATE_SUBSTITUTIONS - if inspect.isclass(condition): - condition = lambda distribution, cls=condition: isinstance( - # pylint: disable=g-long-lambda - distribution, cls) - ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn + """Registers a rule for substituting distributions in ASVI surrogates. + + Args: + condition: Python `callable` that takes a Distribution instance and + returns a Python `bool` indicating whether or not to substitute it. + May also be a class type such as `tfd.Normal`, in which case the + condition is interpreted as + `lambda distribution: isinstance(distribution, class)`. + substitution_fn: Python `callable` that takes a Distribution + instance and returns a new Distribution instance used to define + the ASVI surrogate posterior. Note that this substitution does not modify + the original model. + + #### Example + + To use a Normal surrogate for all location-scale family distributions, we + could register the substitution: + + ```python + tfp.experimental.vi.register_asvi_surrogate_substitution( + condition=lambda distribution: ( + hasattr(distribution, 'loc') and hasattr(distribution, 'scale')) + substitution_fn=lambda distribution: ( + # Invoking the event space bijector applies any relevant constraints, + # e.g., that HalfCauchy samples must be `>= loc`. + distribution.experimental_default_event_space_bijector()( + tfd.Normal(loc=distribution.loc, scale=distribution.scale))) + ``` + + This rule will fire when ASVI encounters a location-scale distribution, + and instructs ASVI to build a surrogate 'as if' the model had just used a + (possibly constrained) Normal in its place. Note that we could have used a + more precise condition, e.g., to limit the substitution to distributions with + a specific `name`, if we had reason to think that a Normal distribution would + be a good surrogate for some model variables but not others. + + """ + global ASVI_SURROGATE_SUBSTITUTIONS + if inspect.isclass(condition): + condition = lambda distribution, cls=condition: isinstance( + # pylint: disable=g-long-lambda + distribution, cls) + ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn # Default substitutions attempt to express distributions using the most # flexible available parameterization. # pylint: disable=g-long-lambda register_cf_substitution_rule( - half_normal.HalfNormal, - lambda dist: truncated_normal.TruncatedNormal( - loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) + half_normal.HalfNormal, + lambda dist: truncated_normal.TruncatedNormal( + loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) register_cf_substitution_rule( - uniform.Uniform, - lambda dist: shift.Shift(dist.low)( - scale_lib.Scale(dist.high - dist.low)( - beta.Beta(concentration0=tf.ones_like(dist.mean()), - concentration1=1.)))) + uniform.Uniform, + lambda dist: shift.Shift(dist.low)( + scale_lib.Scale(dist.high - dist.low)( + beta.Beta(concentration0=tf.ones_like(dist.mean()), + concentration1=1.)))) register_cf_substitution_rule( - exponential.Exponential, - lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) + exponential.Exponential, + lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) register_cf_substitution_rule( - chi2.Chi2, - lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) + chi2.Chi2, + lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) # pylint: enable=g-long-lambda # a single JointDistribution. def build_cf_surrogate_posterior( - prior, - num_auxiliary_variables=0, - initial_prior_weight=0.98, - num_layers=3, - seed=None, - name=None): - # todo: change docstrings - """Builds a structured surrogate posterior inspired by conjugate updating. - - ASVI, or Automatic Structured Variational Inference, was proposed by - Ambrogioni et al. (2020) [1] as a method of automatically constructing a - surrogate posterior with the same structure as the prior. It does this by - reparameterizing the variational family of the surrogate posterior by - structuring each parameter according to the equation - ```none - prior_weight * prior_parameter + (1 - prior_weight) * mean_field_parameter - ``` - In this equation, `prior_parameter` is a vector of prior parameters and - `mean_field_parameter` is a vector of trainable parameters with the same - domain as `prior_parameter`. `prior_weight` is a vector of learnable - parameters where `0. <= prior_weight <= 1.`. When `prior_weight = - 0`, the surrogate posterior will be a mean-field surrogate, and when - `prior_weight = 1.`, the surrogate posterior will be the prior. This convex - combination equation, inspired by conjugacy in exponential families, thus - allows the surrogate posterior to balance between the structure of the prior - and the structure of a mean-field approximation. - - Args: - prior: tfd.JointDistribution instance of the prior. - mean_field: Optional Python boolean. If `True`, creates a degenerate - surrogate distribution in which all variables are independent, - ignoring the prior dependence structure. Default value: `False`. - initial_prior_weight: Optional float value (either static or tensor value) - on the interval [0, 1]. A larger value creates an initial surrogate - distribution with more dependence on the prior structure. Default value: - `0.5`. - seed: Python `int` seed for random initialization. - name: Optional string. Default value: `build_cf_surrogate_posterior`. - - Returns: - surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance - whose samples have shape and structure matching that of `prior`. - - Raises: - TypeError: The `prior` argument cannot be a nested `JointDistribution`. - - ### Examples - - Consider a Brownian motion model expressed as a JointDistribution: - - ```python - prior_loc = 0. - innovation_noise = .1 - - def model_fn(): - new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) - for i in range(4): - new = yield tfd.Normal(loc=new, scale=innovation_noise) - - prior = tfd.JointDistributionCoroutineAutoBatched(model_fn) - ``` - - Let's use variational inference to approximate the posterior. We'll build a - surrogate posterior distribution by feeding in the prior distribution. - - ```python - surrogate_posterior = - tfp.experimental.vi.build_cf_surrogate_posterior(prior) - ``` - - This creates a trainable joint distribution, defined by variables in - `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior` - to fit this distribution by minimizing a divergence to the true posterior. - - ```python - losses = tfp.vi.fit_surrogate_posterior( - target_log_prob_fn, - surrogate_posterior=surrogate_posterior, - num_steps=100, - optimizer=tf.optimizers.Adam(0.1), - sample_size=10) - - # After optimization, samples from the surrogate will approximate - # samples from the true posterior. - samples = surrogate_posterior.sample(100) - posterior_mean = [tf.reduce_mean(x) for x in samples] - posterior_std = [tf.math.reduce_std(x) for x in samples] - ``` - - #### References - [1]: Luca Ambrogioni, Max Hinne, Marcel van Gerven. Automatic structured - variational inference. _arXiv preprint arXiv:2002.00643_, 2020 - https://arxiv.org/abs/2002.00643 - - """ - with tf.name_scope(name or 'build_cf_surrogate_posterior'): - surrogate_posterior, variables = _cf_surrogate_for_distribution( - dist=prior, - base_distribution_surrogate_fn=functools.partial( - _cf_convex_update_for_base_distribution, - initial_prior_weight=initial_prior_weight, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers), - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - seed=seed) - surrogate_posterior.also_track = variables - return surrogate_posterior + prior, + num_auxiliary_variables=0, + initial_prior_weight=0.98, + num_layers=3, + seed=None, + name=None): + """Builds a structured surrogate posterior with cascading flows. + + Cascading Flows (CF) [1] is a method that automatically construct a + variational approximation given an input probabilistic program. CF combines + ASVI [2] with the flexibility of normalizing flows, by transforming the + conditional distributions of the prior program with HighwayFlow architectures, + to steer the prior towards the observed data. More details on the HighwayFlow + architecture can be found in [1] and in the tfp bijector `HighwayFlow`. + It is possible to add auxiliary variables to the prior program to further + increase the flexibility of cascading flows, useful especially in the + cases where the input program has low dimensionality. The auxiliary variables + are sampled from a global linear flow, to account for statistical dependencies + among variables, and then transformed with local HighwayFlows together with + samples form the prior. Note that when using auxiliary variables it is + necessary to modify the variational lower bound [3]. + + Args: + prior: tfd.JointDistribution instance of the prior. + num_auxiliary_variables: The number of auxiliary variables to use for each + variable in the input program. Default value: `0`. + initial_prior_weight: Optional float value (either static or tensor value) + on the interval [0, 1]. A larger value creates an initial surrogate + distribution with more dependence on the prior structure. Default value: + `0.98`. + num_layers: Number of layers to use in each Highway Flow architecture. All + the layers will have `softplus` activation function, apart from the last one + which will have linear activation. Default value: `3`. + seed: Python `int` seed for random initialization. + name: Optional string. Default value: `build_cf_surrogate_posterior`. + + Returns: + surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance + whose samples have shape and structure matching that of `prior`. + + Raises: + TypeError: The `prior` argument cannot be a nested `JointDistribution`. + + ### Examples + + Consider a Brownian motion model expressed as a JointDistribution: + + ```python + prior_loc = 0. + innovation_noise = .1 + + def model_fn(): + new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) + for i in range(4): + new = yield tfd.Normal(loc=new, scale=innovation_noise) + + prior = tfd.JointDistributionCoroutineAutoBatched(model_fn) + ``` + + Let's use variational inference to approximate the posterior. We'll build a + surrogate posterior distribution by feeding in the prior distribution. + + ```python + surrogate_posterior = + tfp.experimental.vi.build_cf_surrogate_posterior(prior) + ``` + + This creates a trainable joint distribution, defined by variables in + `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior` + to fit this distribution by minimizing a divergence to the true posterior. + + ```python + losses = tfp.vi.fit_surrogate_posterior( + target_log_prob_fn, + surrogate_posterior=surrogate_posterior, + num_steps=100, + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) + + # After optimization, samples from the surrogate will approximate + # samples from the true posterior. + samples = surrogate_posterior.sample(100) + posterior_mean = [tf.reduce_mean(x) for x in samples] + posterior_std = [tf.math.reduce_std(x) for x in samples] + ``` + + When using auxiliary variables, we need some modifications for loss and + samples, as samples will return also the global variables and transformed + auxiliary variables + + ```python + num_aux_vars=10 + target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape( + tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1), + scale=tf.reshape(tfp.util.TransformedVariable( + [tf.random.uniform((1,), minval=0.01, maxval=1.) + for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1) + + def target_log_prob_aux_vars(z_and_eps): + z = [x[0] for x in z_and_eps[1:]] + eps = [x[1] for x in z_and_eps[1:]] + lp_z = target_log_prob_fn(z) + lp_eps = tf.reshape(tf.reduce_sum(target_dist.log_prob(eps), 0), lp_z.shape) + return lp_z + lp_eps + + target_log_prob = lambda *values: target_log_prob_aux_vars(values) + cf_surrogate_posterior = build_cf_surrogate_posterior(prior, + num_auxiliary_variables=num_aux_vars) + trainable_variables = list(cf_surrogate_posterior.trainable_variables) + trainable_variables.extend(list(target_dist.trainable_variables)) + cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob, + cf_surrogate_posterior, + optimizer=tf.optimizers.Adam(0.01), + num_steps=8000, + sample_size=50, + trainable_variables=trainable_variables) + + cf_posterior_samples = cf_surrogate_posterior.sample(num_samples) + cf_posterior_samples = tf.convert_to_tensor( + [s[0] for s in cf_posterior_samples[1:]]) + ``` + + #### References + [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. "Automatic + variational inference with cascading flows." arXiv preprint arXiv:2102.04801 + (2021). + + [2]: Ambrogioni, Luca, et al. "Automatic structured variational inference." + International Conference on Artificial Intelligence and Statistics. PMLR, + 2021. + + [3]: Ranganath, Rajesh, Dustin Tran, and David Blei. "Hierarchical variational + models." International Conference on Machine Learning. PMLR, 2016. + + """ + with tf.name_scope(name or 'build_cf_surrogate_posterior'): + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist=prior, + base_distribution_surrogate_fn=functools.partial( + _cf_convex_update_for_base_distribution, + initial_prior_weight=initial_prior_weight, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers), + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + seed=seed) + surrogate_posterior.also_track = variables + return surrogate_posterior def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn, num_auxiliary_variables, num_layers, + global_auxiliary_variables=None, sample_shape=None, variables=None, - global_auxiliary_variables=None, seed=None): - # todo: change docstrings - """Recursively creates ASVI surrogates, and creates new variables if needed. - - Args: - dist: a `tfd.Distribution` instance. - base_distribution_surrogate_fn: Callable to build a surrogate posterior - for a 'base' (non-meta and non-joint) distribution, with signature - `surrogate_posterior, variables = base_distribution_fn( - dist, sample_shape=None, variables=None, seed=None)`. - sample_shape: Optional `Tensor` shape of samples drawn from `dist` by - `tfd.Sample` wrappers. If not `None`, the surrogate's event will include - independent sample dimensions, i.e., it will have event shape - `concat([sample_shape, dist.event_shape], axis=0)`. - Default value: `None`. - variables: Optional nested structure of `tf.Variable`s returned from a - previous call to `_cf_surrogate_for_distribution`. If `None`, - new variables will be created; otherwise, constructs a surrogate posterior - backed by the passed-in variables. - Default value: `None`. - seed: Python `int` seed for random initialization. - Returns: - surrogate_posterior: Instance of `tfd.Distribution` representing a trainable - surrogate posterior distribution, with the same structure and `name` as - `dist`. - variables: Nested structure of `tf.Variable` trainable parameters for the - surrogate posterior. If `dist` is a base distribution, this is - a `dict` of `ASVIParameters` instances. If `dist` is a joint - distribution, this is a `dist.dtype` structure of such `dict`s. - """ - - # Apply any substitutions, while attempting to preserve the original name. - dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) - - if hasattr(dist, '_model_coroutine'): - surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( - dist, - base_distribution_surrogate_fn=base_distribution_surrogate_fn, - variables=variables, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - global_auxiliary_variables=global_auxiliary_variables, - seed=seed) - else: - surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, - global_auxiliary_variables=global_auxiliary_variables, - num_layers=num_layers, - seed=seed) - return surrogate_posterior, variables + """Recursively creates CF surrogates, and creates new variables if needed. + + Args: + dist: a `tfd.Distribution` instance. + base_distribution_surrogate_fn: Callable to build a surrogate posterior + for a 'base' (non-meta and non-joint) distribution, with signature + `surrogate_posterior, variables = base_distribution_fn( + dist, sample_shape=None, variables=None, seed=None)`. + num_auxiliary_variables: The number of auxiliary variables to use for each + variable in the input program. + num_layers: Number of layers to use in each Highway Flow architecture. + global_auxiliary_variables: The sampled global auxiliary variables + (available only if using auxiliary variables). Default value: None. + sample_shape: Optional `Tensor` shape of samples drawn from `dist` by + `tfd.Sample` wrappers. If not `None`, the surrogate's event will include + independent sample dimensions, i.e., it will have event shape + `concat([sample_shape, dist.event_shape], axis=0)`. + Default value: `None`. + variables: Optional nested structure of `tf.Variable`s returned from a + previous call to `_cf_surrogate_for_distribution`. If `None`, + new variables will be created; otherwise, constructs a surrogate posterior + backed by the passed-in variables. + Default value: `None`. + seed: Python `int` seed for random initialization. + Returns: + surrogate_posterior: Instance of `tfd.Distribution` representing a trainable + surrogate posterior distribution, with the same structure and `name` as + `dist`, and with addition of global and local auxiliary variables if + `num_auxiliary_variables > 0`. + variables: Nested structure of `tf.Variable` trainable parameters for the + surrogate posterior. If `dist` is a base distribution, this is + a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape` + bijectors. If `dist` is a joint distribution, this is a `dist.dtype` + structure of such `tfb.Chain`s. + """ + + # Apply any substitutions, while attempting to preserve the original name. + dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) + + if hasattr(dist, '_model_coroutine'): + surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=variables, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + global_auxiliary_variables=global_auxiliary_variables, + seed=seed) + else: + surrogate_posterior, variables = base_distribution_surrogate_fn( + dist=dist, sample_shape=sample_shape, variables=variables, + global_auxiliary_variables=global_auxiliary_variables, + num_layers=num_layers, + seed=seed) + return surrogate_posterior, variables -def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables, - num_auxiliary_variables, num_layers, global_auxiliary_variables, - seed=None): - """Builds a structured joint surrogate posterior for a joint model.""" +def _build_highway_flow_block(num_layers, width, + residual_fraction_initial_value, gate_first_n, + seed): + bijectors = [] - # Probabilistic program for CF surrogate posterior. - flat_variables = dist._model_flatten( - variables) if variables else None # pylint: disable=protected-access - prior_coroutine = dist._model_coroutine # pylint: disable=protected-access + for _ in range(0, num_layers - 1): + bijectors.append( + build_trainable_highway_flow(width, + activation_fn=tf.nn.softplus, + gate_first_n=gate_first_n, seed=seed)) + bijectors.append( + build_trainable_highway_flow(width, + activation_fn=None, + gate_first_n=gate_first_n, seed=seed)) - def posterior_generator(seed=seed): - prior_gen = prior_coroutine() - dist = next(prior_gen) + return bijectors - if num_auxiliary_variables > 0: - i = 1 - if flat_variables: - variables = flat_variables[0] +def _cf_surrogate_for_joint_distribution( + dist, base_distribution_surrogate_fn, variables, + num_auxiliary_variables, num_layers, global_auxiliary_variables, + seed=None): + """Builds a structured joint surrogate posterior for a joint model.""" - else: - bijectors = [] + # Probabilistic program for CF surrogate posterior. + flat_variables = dist._model_flatten( + variables) if variables else None # pylint: disable=protected-access + prior_coroutine = dist._model_coroutine # pylint: disable=protected-access - for _ in range(0, num_layers - 1): - bijectors.append( - build_highway_flow_layer(num_auxiliary_variables, - activation_fn=True, - gate_first_n=0, seed=seed)) - bijectors.append( - build_highway_flow_layer(num_auxiliary_variables, - activation_fn=False, - gate_first_n=0, seed=seed)) + def posterior_generator(seed=seed): + prior_gen = prior_coroutine() + dist = next(prior_gen) - variables = chain.Chain(bijectors=list(reversed(bijectors))) + if num_auxiliary_variables > 0: + i = 1 - eps = transformed_distribution.TransformedDistribution( - distribution=sample.Sample(normal.Normal(0., 1.), - num_auxiliary_variables), - bijector=variables) + if flat_variables: + variables = flat_variables[0] - eps = Root(eps) + else: - value_out = yield (eps if flat_variables - else (eps, variables)) + bijectors = _build_highway_flow_block( + num_layers, + width=num_auxiliary_variables, + residual_fraction_initial_value=None, + gate_first_n=0, seed=seed) + variables = chain.Chain(bijectors=list(reversed(bijectors))) - global_auxiliary_variables = value_out + eps = transformed_distribution.TransformedDistribution( + distribution=sample.Sample(normal.Normal(0., 1.), + num_auxiliary_variables), + bijector=variables) - else: - global_auxiliary_variables = None - i = 0 + eps = Root(eps) - try: - while True: - was_root = isinstance(dist, Root) - if was_root: - dist = dist.distribution + value_out = yield (eps if flat_variables + else (eps, variables)) - seed, init_seed = samplers.split_seed(seed) - surrogate_posterior, variables = _cf_surrogate_for_distribution( - dist, - base_distribution_surrogate_fn=base_distribution_surrogate_fn, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - variables=flat_variables[i] if flat_variables else None, - global_auxiliary_variables=global_auxiliary_variables, - seed=init_seed) - - if was_root and num_auxiliary_variables == 0: - surrogate_posterior = Root(surrogate_posterior) - # If variables were not given---i.e., we're creating new - # variables---then yield the new variables along with the surrogate - # posterior. This assumes an execution context such as - # `_extract_variables_from_coroutine_model` below that will capture and - # save the variables. - value_out = yield (surrogate_posterior if flat_variables - else (surrogate_posterior, variables)) - if type(value_out) == list: - if len(dist.event_shape) == 0: - dist = prior_gen.send(tf.squeeze(value_out[0], -1)) - else: - dist = prior_gen.send(value_out[0]) + global_auxiliary_variables = value_out else: - dist = prior_gen.send(value_out) - i += 1 - except StopIteration: - pass - - if variables is None: - # Run the generator to create variables, then call ourselves again - # to construct the surrogate JD from these variables. Note that we can't - # just create a JDC from the current `posterior_generator`, because it will - # try to build new variables on every invocation; the recursive call will - # define a new `posterior_generator` that knows about the variables we're - # about to create. - return _cf_surrogate_for_joint_distribution( - dist=dist, - base_distribution_surrogate_fn=base_distribution_surrogate_fn, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - global_auxiliary_variables=global_auxiliary_variables, - variables=dist._model_unflatten( + global_auxiliary_variables = None + i = 0 + + try: + while True: + was_root = isinstance(dist, Root) + if was_root: + dist = dist.distribution + + seed, init_seed = samplers.split_seed(seed) + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + variables=flat_variables[i] if flat_variables else None, + global_auxiliary_variables=global_auxiliary_variables, + seed=init_seed) + + if was_root and num_auxiliary_variables == 0: + surrogate_posterior = Root(surrogate_posterior) + # If variables were not given---i.e., we're creating new + # variables---then yield the new variables along with the surrogate + # posterior. This assumes an execution context such as + # `_extract_variables_from_coroutine_model` below that will capture and + # save the variables. + value_out = yield (surrogate_posterior if flat_variables + else (surrogate_posterior, variables)) + if type(value_out) == list: + if len(dist.event_shape) == 0: + dist = prior_gen.send(tf.squeeze(value_out[0], -1)) + else: + dist = prior_gen.send(value_out[0]) + + else: + dist = prior_gen.send(value_out) + i += 1 + except StopIteration: + pass + + if variables is None: + # Run the generator to create variables, then call ourselves again + # to construct the surrogate JD from these variables. Note that we can't + # just create a JDC from the current `posterior_generator`, because it will + # try to build new variables on every invocation; the recursive call will + # define a new `posterior_generator` that knows about the variables we're + # about to create. + return _cf_surrogate_for_joint_distribution( + dist=dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + global_auxiliary_variables=global_auxiliary_variables, + variables=dist._model_unflatten( + # pylint: disable=protected-access + _extract_variables_from_coroutine_model( + posterior_generator, seed=seed))) + + # Temporary workaround for bijector caching issues with autobatched JDs. + surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) + + # Ensure that the surrogate posterior structure matches that of the prior. + # todo: check me, do we need this? in case needs to be modified + # if we use auxiliary variables, then the structure won't match the one of the + # prior + '''try: + tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) + except TypeError: + tokenize = lambda jd: jd._model_unflatten( + # pylint: disable=protected-access, g-long-lambda + range(len(jd._model_flatten(jd.dtype))) # pylint: disable=protected-access - _extract_variables_from_coroutine_model( - posterior_generator, seed=seed))) - - # Temporary workaround for bijector caching issues with autobatched JDs. - surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( - posterior_generator, - use_vectorized_map=dist.use_vectorized_map, - name=_get_name(dist)) - - # Ensure that the surrogate posterior structure matches that of the prior. - # todo: check me, do we need this? in case needs to be modified - # if we use auxiliary variables, then the structure won't match the one of the - # prior - '''try: - tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) - except TypeError: - tokenize = lambda jd: jd._model_unflatten( - # pylint: disable=protected-access, g-long-lambda - range(len(jd._model_flatten(jd.dtype))) - # pylint: disable=protected-access - ) - surrogate_posterior = restructure.Restructure( - output_structure=tokenize(dist), - input_structure=tokenize(surrogate_posterior))( - surrogate_posterior, name=_get_name(dist))''' - return surrogate_posterior, variables - - -# todo: sample_shape and seed are not used.. maybe they should? + ) + surrogate_posterior = restructure.Restructure( + output_structure=tokenize(dist), + input_structure=tokenize(surrogate_posterior))( + surrogate_posterior, name=_get_name(dist))''' + return surrogate_posterior, variables + + +# todo: sample_shape is not used.. can remove? def _cf_convex_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables, @@ -465,91 +524,84 @@ def _cf_convex_update_for_base_distribution(dist, variables, sample_shape=None, seed=None): - """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" + """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" + + if variables is None: + actual_event_shape = dist.event_shape_tensor() + int_event_shape = int(actual_event_shape) if \ + actual_event_shape.shape.as_list()[0] > 0 else 1 + bijectors = [reshape.Reshape([-1], + event_shape_in=actual_event_shape + + num_auxiliary_variables)] + + bijectors.extend( + _build_highway_flow_block( + num_layers, + width=tf.reduce_prod( + actual_event_shape + num_auxiliary_variables), + residual_fraction_initial_value=initial_prior_weight, + gate_first_n=int_event_shape, seed=seed)) - if variables is None: - actual_event_shape = dist.event_shape_tensor() - int_event_shape = int(actual_event_shape) if \ - actual_event_shape.shape.as_list()[0] > 0 else 1 - bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] - - for _ in range(0, num_layers - 1): - bijectors.append( - build_highway_flow_layer( - tf.reduce_prod( - actual_event_shape + num_auxiliary_variables), - residual_fraction_initial_value=initial_prior_weight, - activation_fn=True, gate_first_n=int_event_shape, - seed=seed)) - bijectors.append( - build_highway_flow_layer( - tf.reduce_prod( - actual_event_shape + num_auxiliary_variables), - residual_fraction_initial_value=initial_prior_weight, - activation_fn=False, gate_first_n=int_event_shape, - seed=seed)) - bijectors.append( - reshape.Reshape(actual_event_shape + num_auxiliary_variables)) - - variables = chain.Chain(bijectors=list(reversed(bijectors))) + bijectors.append( + reshape.Reshape(actual_event_shape + num_auxiliary_variables)) - if num_auxiliary_variables > 0: - batch_shape = global_auxiliary_variables.shape[0] if len( - global_auxiliary_variables.shape) > 1 else [] + variables = chain.Chain(bijectors=list(reversed(bijectors))) - cascading_flows = split.Split( - [-1, num_auxiliary_variables])( - transformed_distribution.TransformedDistribution( - distribution=blockwise.Blockwise([ - batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), - independent.Independent( - deterministic.Deterministic( - global_auxiliary_variables), - reinterpreted_batch_ndims=1)]), - bijector=variables)) + if num_auxiliary_variables > 0: + batch_shape = global_auxiliary_variables.shape[0] if len( + global_auxiliary_variables.shape) > 1 else [] + + cascading_flows = split.Split( + [-1, num_auxiliary_variables])( + transformed_distribution.TransformedDistribution( + distribution=blockwise.Blockwise([ + batch_broadcast.BatchBroadcast(dist, + to_shape=batch_shape), + independent.Independent( + deterministic.Deterministic( + global_auxiliary_variables), + reinterpreted_batch_ndims=1)]), + bijector=variables)) - else: - cascading_flows = transformed_distribution.TransformedDistribution( - distribution=dist, - bijector=variables) + else: + cascading_flows = transformed_distribution.TransformedDistribution( + distribution=dist, + bijector=variables) - return cascading_flows, variables + return cascading_flows, variables def _extract_variables_from_coroutine_model(model_fn, seed=None): - """Extracts variables from a generator that yields (dist, variables) pairs.""" - gen = model_fn() - try: - dist, dist_variables = next(gen) - flat_variables = [dist_variables] - while True: - seed, local_seed = samplers.split_seed(seed, n=2) - sampled_value = (dist.distribution.sample(seed=local_seed) - if isinstance(dist, Root) - else dist.sample(seed=local_seed)) - dist, dist_variables = gen.send( - sampled_value) # tf.concat(sampled_value, axis=0) - flat_variables.append(dist_variables) - except StopIteration: - pass - return flat_variables + """Extracts variables from a generator that yields (dist, variables) pairs.""" + gen = model_fn() + try: + dist, dist_variables = next(gen) + flat_variables = [dist_variables] + while True: + seed, local_seed = samplers.split_seed(seed, n=2) + sampled_value = (dist.distribution.sample(seed=local_seed) + if isinstance(dist, Root) + else dist.sample(seed=local_seed)) + dist, dist_variables = gen.send( + sampled_value) # tf.concat(sampled_value, axis=0) + flat_variables.append(dist_variables) + except StopIteration: + pass + return flat_variables def _set_name(dist, name): - """Copies a distribution-like object, replacing its name.""" - if hasattr(dist, 'copy'): - return dist.copy(name=name) - # Some distribution-like entities such as JointDistributionPinned don't - # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set - # the name directly. - dist = copy.copy(dist) - dist._name = name # pylint: disable=protected-access - return dist + """Copies a distribution-like object, replacing its name.""" + if hasattr(dist, 'copy'): + return dist.copy(name=name) + # Some distribution-like entities such as JointDistributionPinned don't + # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set + # the name directly. + dist = copy.copy(dist) + dist._name = name # pylint: disable=protected-access + return dist def _get_name(dist): - """Attempts to get a distribution's short name, excluding the name scope.""" - return getattr(dist, 'parameters', {}).get('name', dist.name) + """Attempts to get a distribution's short name, excluding the name scope.""" + return getattr(dist, 'parameters', {}).get('name', dist.name) From a2b025c8be79fbf69d6aff37f87403523fd13da3 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 27 May 2021 13:35:03 +0200 Subject: [PATCH 30/54] added dependency to build_trainable_highway_flow --- tensorflow_probability/python/experimental/vi/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD index 863e0aeef2..6e08525bb3 100644 --- a/tensorflow_probability/python/experimental/vi/BUILD +++ b/tensorflow_probability/python/experimental/vi/BUILD @@ -94,6 +94,7 @@ py_library( "//tensorflow_probability/python/distributions:transformed_distribution", "//tensorflow_probability/python/distributions:truncated_normal", "//tensorflow_probability/python/distributions:uniform", + "//tensorflow_probability/python/experimental/bijectors:build_trainable_highway_flow", "//tensorflow_probability/python/internal:samplers", ], ) From 7bd84572e3f87d9b3c594b605c28ac6352227ffe Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 28 May 2021 10:37:54 +0200 Subject: [PATCH 31/54] some refactoring --- .../python/experimental/vi/cascading_flows.py | 4 +- .../experimental/vi/cascading_flows_test.py | 216 +++++++++--------- 2 files changed, 113 insertions(+), 107 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 8c9f48222c..fad0a70517 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -379,10 +379,12 @@ def _build_highway_flow_block(num_layers, width, for _ in range(0, num_layers - 1): bijectors.append( build_trainable_highway_flow(width, + residual_fraction_initial_value=residual_fraction_initial_value, activation_fn=tf.nn.softplus, gate_first_n=gate_first_n, seed=seed)) bijectors.append( build_trainable_highway_flow(width, + residual_fraction_initial_value=residual_fraction_initial_value, activation_fn=None, gate_first_n=gate_first_n, seed=seed)) @@ -415,7 +417,7 @@ def posterior_generator(seed=seed): bijectors = _build_highway_flow_block( num_layers, width=num_auxiliary_variables, - residual_fraction_initial_value=None, + residual_fraction_initial_value=0, # not used gate_first_n=0, seed=seed) variables = chain.Chain(bijectors=list(reversed(bijectors))) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index b52e1e5f77..2d9d35c808 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -18,19 +18,20 @@ from __future__ import division from __future__ import print_function -# Dependency imports - import tensorflow.compat.v1 as tf1 import tensorflow.compat.v2 as tf + import tensorflow_probability as tfp -from tensorflow_probability.python.experimental.vi import cascading_flows from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import test_util +# Dependency imports tfb = tfp.bijectors tfd = tfp.distributions - # test_util.test_seed(sampler_type='stateless')) + + +# test_util.test_seed(sampler_type='stateless')) @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): @@ -49,50 +50,59 @@ def _expected_num_trainable_variables(self, prior_dist, num_layers): return expected_num_trainable_variables def test_dims_and_gradients(self): - prior_dist = self.make_prior_dist() num_layers = 3 surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( - prior=prior_dist, num_layers=num_layers) + prior=prior_dist, num_layers=num_layers) # Test that the correct number of trainable variables are being tracked self.assertLen(surrogate_posterior.trainable_variables, - self._expected_num_trainable_variables(prior_dist, num_layers)) + self._expected_num_trainable_variables(prior_dist, + num_layers)) # Test that the sample shape is correct three_posterior_samples = surrogate_posterior.sample( - 3, seed=1) + 3, seed=(0, 0)) three_prior_samples = prior_dist.sample( - 3, seed=1) + 3, seed=(0, 0)) self.assertAllEqualNested( - [s.shape for s in tf.nest.flatten(three_prior_samples)], - [s.shape for s in tf.nest.flatten(three_posterior_samples)]) + [s.shape for s in tf.nest.flatten(three_prior_samples)], + [s.shape for s in tf.nest.flatten(three_posterior_samples)]) + + # Test that gradients are available wrt the variational parameters. + with tf.GradientTape() as tape: + posterior_sample = surrogate_posterior.sample( + seed=(0, 0)) + posterior_logprob = surrogate_posterior.log_prob(posterior_sample) + grad = tape.gradient(posterior_logprob, + surrogate_posterior.trainable_variables) + self.assertTrue(all(g is not None for g in grad)) def test_initialization_is_deterministic_following_seed(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( - prior=prior_dist, - seed=1) + prior=prior_dist, + seed=(0, 0)) self.evaluate( - [v.initializer for v in surrogate_posterior.trainable_variables]) + [v.initializer for v in surrogate_posterior.trainable_variables]) posterior_sample = surrogate_posterior.sample( - seed=1) + seed=(0, 0)) surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( - prior=prior_dist, - seed=1) + prior=prior_dist, + seed=(0, 0)) self.evaluate( - [v.initializer for v in surrogate_posterior2.trainable_variables]) + [v.initializer for v in surrogate_posterior2.trainable_variables]) posterior_sample2 = surrogate_posterior2.sample( - seed=1) + seed=(0, 0)) self.assertAllEqualNested(posterior_sample, posterior_sample2) @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase, - _TrainableCFSurrogate): + _TrainableCFSurrogate): def make_prior_dist(self): @@ -117,7 +127,7 @@ def get_observations(self, prior_dist): observation_noise = 0.15 ground_truth = prior_dist.sample() likelihood = self.make_likelihood_model( - x=ground_truth, observation_noise=observation_noise) + x=ground_truth, observation_noise=observation_noise) return likelihood.sample(1) def get_target_log_prob(self, observations, prior_dist): @@ -125,42 +135,45 @@ def get_target_log_prob(self, observations, prior_dist): def target_log_prob(*x): observation_noise = 0.15 likelihood_dist = self.make_likelihood_model( - x=x, observation_noise=observation_noise) - return likelihood_dist.log_prob(observations) + prior_dist.log_prob(x) + x=x, observation_noise=observation_noise) + return likelihood_dist.log_prob(observations) + prior_dist.log_prob( + x) return target_log_prob - '''def test_fitting_surrogate_posterior(self): + def test_fitting_surrogate_posterior(self): prior_dist = self.make_prior_dist() observations = self.get_observations(prior_dist) surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( - prior=prior_dist) + prior=prior_dist) target_log_prob = self.get_target_log_prob(observations, prior_dist) # Test vi fit surrogate posterior works losses = tfp.vi.fit_surrogate_posterior( - target_log_prob, - surrogate_posterior, - num_steps=5, # Don't optimize to completion. - optimizer=tf.optimizers.Adam(0.1), - sample_size=10) + target_log_prob, + surrogate_posterior, + num_steps=5, # Don't optimize to completion. + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) # Compute posterior statistics. with tf.control_dependencies([losses]): posterior_samples = surrogate_posterior.sample(100) - posterior_mean = tf.nest.map_structure(tf.reduce_mean, posterior_samples) + posterior_mean = tf.nest.map_structure(tf.reduce_mean, + posterior_samples) posterior_stddev = tf.nest.map_structure(tf.math.reduce_std, posterior_samples) self.evaluate(tf1.global_variables_initializer()) _ = self.evaluate(losses) _ = self.evaluate(posterior_mean) - _ = self.evaluate(posterior_stddev)''' + _ = self.evaluate(posterior_stddev) + @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestEightSchools(test_util.TestCase, - _TrainableCFSurrogate): + _TrainableCFSurrogate): def make_prior_dist(self): treatment_effects = tf.constant([28, 8, -3, 7, -1, 1, 18, 12], @@ -168,64 +181,64 @@ def make_prior_dist(self): num_schools = ps.shape(treatment_effects)[-1] return tfd.JointDistributionNamed({ - 'avg_effect': - tfd.Normal(loc=0., scale=10., name='avg_effect'), - 'log_stddev': - tfd.Normal(loc=5., scale=1., name='log_stddev'), - 'school_effects': - lambda log_stddev, avg_effect: ( # pylint: disable=g-long-lambda - tfd.Independent( - tfd.Normal( - loc=avg_effect[..., None] * tf.ones(num_schools), - scale=tf.exp(log_stddev[..., None]) * tf.ones( - num_schools), - name='school_effects'), - reinterpreted_batch_ndims=1)) + 'avg_effect': + tfd.Normal(loc=0., scale=10., name='avg_effect'), + 'log_stddev': + tfd.Normal(loc=5., scale=1., name='log_stddev'), + 'school_effects': + lambda log_stddev, avg_effect: ( + # pylint: disable=g-long-lambda + tfd.Independent( + tfd.Normal( + loc=avg_effect[..., None] * tf.ones(num_schools), + scale=tf.exp(log_stddev[..., None]) * tf.ones( + num_schools), + name='school_effects'), + reinterpreted_batch_ndims=1)) }) @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestEightSchoolsSample(test_util.TestCase, - _TrainableCFSurrogate): + _TrainableCFSurrogate): def make_prior_dist(self): - return tfd.JointDistributionNamed({ - 'avg_effect': - tfd.Normal(loc=0., scale=10., name='avg_effect'), - 'log_stddev': - tfd.Normal(loc=5., scale=1., name='log_stddev'), - 'school_effects': - lambda log_stddev, avg_effect: ( # pylint: disable=g-long-lambda - tfd.Sample( - tfd.Normal( - loc=avg_effect[..., None], - scale=tf.exp(log_stddev[..., None]), - name='school_effects'), - sample_shape=[8])) + 'avg_effect': + tfd.Normal(loc=0., scale=10., name='avg_effect'), + 'log_stddev': + tfd.Normal(loc=5., scale=1., name='log_stddev'), + 'school_effects': + lambda log_stddev, avg_effect: ( + # pylint: disable=g-long-lambda + tfd.Sample( + tfd.Normal( + loc=avg_effect[..., None], + scale=tf.exp(log_stddev[..., None]), + name='school_effects'), + sample_shape=[8])) }) @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestHalfNormal(test_util.TestCase, - _TrainableCFSurrogate): + _TrainableCFSurrogate): def make_prior_dist(self): - def _prior_model_fn(): innovation_noise = 1. yield tfd.HalfNormal( - scale=innovation_noise, validate_args=True, allow_nan_stats=False) + scale=innovation_noise, validate_args=True, + allow_nan_stats=False) return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) -'''@test_util.test_all_tf_execution_regimes +@test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestDiscreteLatent( - test_util.TestCase, _TrainableCFSurrogate): + test_util.TestCase, _TrainableCFSurrogate): def make_prior_dist(self): - def _prior_model_fn(): a = yield tfd.Bernoulli(logits=0.5, name='a') yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1., @@ -236,36 +249,30 @@ def _prior_model_fn(): @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestNesting(test_util.TestCase, - _TrainableCFSurrogate): - - def _expected_num_trainable_variables(self, _): - # Nested distributions have total of 10 params after Exponential->Gamma - # substitution, multiplied by 2 variables per param. - return 20 + _TrainableCFSurrogate): def make_prior_dist(self): - def nested_model(): a = yield tfd.Sample( - tfd.Sample( - tfd.Normal(0., 1.), - sample_shape=4), - sample_shape=[2], - name='a') + tfd.Sample( + tfd.Normal(0., 1.), + sample_shape=4), + sample_shape=[2], + name='a') b = yield tfb.Sigmoid()( - tfb.Square()( - tfd.Exponential(rate=tf.exp(a))), - name='b') + tfb.Square()( + tfd.Exponential(rate=tf.exp(a))), + name='b') # pylint: disable=g-long-lambda yield tfd.JointDistributionSequential( - [tfd.Laplace(loc=a, scale=b), - lambda c1: tfd.Independent( - tfd.Beta(concentration1=1., - concentration0=tf.nn.softplus(c1)), - reinterpreted_batch_ndims=1), - lambda c1, c2: tfd.JointDistributionNamed({ - 'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)}) - ], name='c') + [tfd.Laplace(loc=a, scale=b), + lambda c1: tfd.Independent( + tfd.Beta(concentration1=1., + concentration0=tf.nn.softplus(c1)), + reinterpreted_batch_ndims=1), + lambda c1, c2: tfd.JointDistributionNamed({ + 'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)}) + ], name='c') # pylint: enable=g-long-lambda return tfd.JointDistributionCoroutineAutoBatched(nested_model) @@ -275,19 +282,18 @@ def nested_model(): class TestCFDistributionSubstitution(test_util.TestCase): def test_default_substitutes_trainable_families(self): - @tfd.JointDistributionCoroutineAutoBatched def model(): yield tfd.Sample( - tfd.Uniform(low=-2., high=7.), - sample_shape=[2], - name='a') + tfd.Uniform(low=-2., high=7.), + sample_shape=[2], + name='a') yield tfd.HalfNormal(1., name='b') yield tfd.Exponential(rate=[1., 2.], name='c') yield tfd.Chi2(df=3., name='d') surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( - model) + model) self.assertAllEqualNested(model.event_shape, surrogate.event_shape) surrogate_dists, _ = surrogate.sample_distributions() @@ -301,22 +307,23 @@ def model(): self.assertIsInstance(surrogate_dists.d, tfd.Gamma) def test_can_specify_custom_substitution(self): - @tfd.JointDistributionCoroutineAutoBatched def centered_horseshoe(ndims=100): global_scale = yield tfd.HalfCauchy( - loc=0., scale=1., name='global_scale') + loc=0., scale=1., name='global_scale') local_scale = yield tfd.HalfCauchy( - loc=0., scale=tf.ones([ndims]), name='local_scale') + loc=0., scale=tf.ones([ndims]), name='local_scale') yield tfd.Normal( - loc=0., scale=tf.sqrt(global_scale * local_scale), name='weights') + loc=0., scale=tf.sqrt(global_scale * local_scale), + name='weights') tfp.experimental.vi.register_asvi_substitution_rule( - condition=tfd.HalfCauchy, - substitution_fn=( - lambda d: tfb.Softplus(1e-6)(tfd.Normal(loc=d.loc, scale=d.scale)))) + condition=tfd.HalfCauchy, + substitution_fn=( + lambda d: tfb.Softplus(1e-6)( + tfd.Normal(loc=d.loc, scale=d.scale)))) surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( - centered_horseshoe) + centered_horseshoe) self.assertAllEqualNested(centered_horseshoe.event_shape, surrogate.event_shape) @@ -329,11 +336,8 @@ def centered_horseshoe(ndims=100): tfd.Normal) self.assertIsInstance(surrogate_dists.local_scale.distribution, tfd.Normal) - self.assertIsInstance(surrogate_dists.weights, tfd.Normal)''' + self.assertIsInstance(surrogate_dists.weights, tfd.Normal) -# TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions. -# TODO(kateslin): Add an ASVI surrogate posterior test with for a model with -# missing observations. if __name__ == '__main__': tf.test.main() From ae34080e07e5c1777d3318a86326315ccaa59351 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 28 May 2021 10:55:58 +0200 Subject: [PATCH 32/54] changed seed --- .../experimental/vi/cascading_flows_test.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 35772332d8..0b45486e6c 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -31,8 +31,6 @@ tfd = tfp.distributions -# test_util.test_seed(sampler_type='stateless')) - @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): @@ -62,9 +60,9 @@ def test_dims_and_gradients(self): # Test that the sample shape is correct three_posterior_samples = surrogate_posterior.sample( - 3, seed=(0, 0)) + 3, seed=test_util.test_seed(sampler_type='stateless')) three_prior_samples = prior_dist.sample( - 3, seed=(0, 0)) + 3, seed=test_util.test_seed(sampler_type='stateless')) self.assertAllEqualNested( [s.shape for s in tf.nest.flatten(three_prior_samples)], [s.shape for s in tf.nest.flatten(three_posterior_samples)]) @@ -72,7 +70,7 @@ def test_dims_and_gradients(self): # Test that gradients are available wrt the variational parameters. with tf.GradientTape() as tape: posterior_sample = surrogate_posterior.sample( - seed=(0, 0)) + seed=test_util.test_seed(sampler_type='stateless')) posterior_logprob = surrogate_posterior.log_prob(posterior_sample) grad = tape.gradient(posterior_logprob, surrogate_posterior.trainable_variables) @@ -83,19 +81,19 @@ def test_initialization_is_deterministic_following_seed(self): surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( prior=prior_dist, - seed=(0, 0)) + seed=test_util.test_seed(sampler_type='stateless')) self.evaluate( [v.initializer for v in surrogate_posterior.trainable_variables]) posterior_sample = surrogate_posterior.sample( - seed=(0, 0)) + seed=test_util.test_seed(sampler_type='stateless')) surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( prior=prior_dist, - seed=(0, 0)) + seed=test_util.test_seed(sampler_type='stateless')) self.evaluate( [v.initializer for v in surrogate_posterior2.trainable_variables]) posterior_sample2 = surrogate_posterior2.sample( - seed=(0, 0)) + seed=test_util.test_seed(sampler_type='stateless')) self.assertAllEqualNested(posterior_sample, posterior_sample2) @@ -104,7 +102,6 @@ def test_initialization_is_deterministic_following_seed(self): class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase, _TrainableCFSurrogate): - def make_prior_dist(self): def _prior_model_fn(): @@ -339,5 +336,6 @@ def centered_horseshoe(ndims=100): tfd.Normal) self.assertIsInstance(surrogate_dists.weights, tfd.Normal) + if __name__ == '__main__': tf.test.main() From d0e287f79d5c2e36d5e8c9217081ad63e540c216 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 28 May 2021 10:56:13 +0200 Subject: [PATCH 33/54] reverted to master --- .../python/experimental/bijectors/BUILD | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD index 8596d61582..9f7afce0b8 100644 --- a/tensorflow_probability/python/experimental/bijectors/BUILD +++ b/tensorflow_probability/python/experimental/bijectors/BUILD @@ -119,7 +119,6 @@ multi_substrate_py_library( ], ) - multi_substrate_py_test( name = "highway_flow_test", size = "medium", @@ -149,20 +148,6 @@ multi_substrate_py_library( ], ) -multi_substrate_py_library( - name = "highway_flow", - srcs = ["highway_flow.py"], - srcs_version = "PY3", - deps = [ - ":scalar_function_with_inferred_inverse", - # numpy dep, - # tensorflow dep, - "//tensorflow_probability/python/bijectors", - "//tensorflow_probability/python/util", - "//tensorflow_probability/python/internal:samplers", - ], -) - multi_substrate_py_test( name = "sharded_test", size = "medium", @@ -179,18 +164,3 @@ multi_substrate_py_test( "//tensorflow_probability/python/internal:test_util", ], ) - -multi_substrate_py_test( - name = "highway_flow_test", - size = "medium", - srcs = ["highway_flow_test.py"], - jax_size = "medium", - python_version = "PY3", - srcs_version = "PY3", - deps = [ - # numpy dep - # tensorflow dep, - "//tensorflow_probability", - "//tensorflow_probability/python/internal:test_util", - ], -) \ No newline at end of file From 1e9a4867d47f9e9126d778dc2c03732ab4d2a89f Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 3 Jun 2021 15:03:30 +0200 Subject: [PATCH 34/54] removed substitution rule and updated dependencies --- .../python/experimental/vi/BUILD | 14 +- .../python/experimental/vi/cascading_flows.py | 900 ++++++++---------- 2 files changed, 401 insertions(+), 513 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD index 6e08525bb3..d693fa80ea 100644 --- a/tensorflow_probability/python/experimental/vi/BUILD +++ b/tensorflow_probability/python/experimental/vi/BUILD @@ -70,30 +70,22 @@ py_library( py_library( name = "cascading_flows", - srcs = ["cascading_flows.py.py"], + srcs = ["cascading_flows.py"], srcs_version = "PY3", deps = [ # tensorflow dep, - "//tensorflow_probability/python/bijectors:build_highway_flow_layer", "//tensorflow_probability/python/bijectors:chain", "//tensorflow_probability/python/bijectors:reshape", - "//tensorflow_probability/python/bijectors:scale", - "//tensorflow_probability/python/bijectors:shift", "//tensorflow_probability/python/bijectors:split", "//tensorflow_probability/python/distributions:batch_broadcast", - "//tensorflow_probability/python/distributions:beta", "//tensorflow_probability/python/distributions:blockwise", - "//tensorflow_probability/python/distributions:chi2", - "//tensorflow_probability/python/distributions:exponential", - "//tensorflow_probability/python/distributions:gamma", - "//tensorflow_probability/python/distributions:half_normal", + "//tensorflow_probability/python/distributions:deterministic", + "//tensorflow_probability/python/distributions:independent", "//tensorflow_probability/python/distributions:joint_distribution_auto_batched", "//tensorflow_probability/python/distributions:joint_distribution_coroutine", "//tensorflow_probability/python/distributions:normal", "//tensorflow_probability/python/distributions:sample", "//tensorflow_probability/python/distributions:transformed_distribution", - "//tensorflow_probability/python/distributions:truncated_normal", - "//tensorflow_probability/python/distributions:uniform", "//tensorflow_probability/python/experimental/bijectors:build_trainable_highway_flow", "//tensorflow_probability/python/internal:samplers", ], diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index f61f962012..0a87ae4399 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -21,287 +21,183 @@ import copy import functools -import inspect import tensorflow.compat.v2 as tf from tensorflow_probability.python.bijectors import chain from tensorflow_probability.python.bijectors import reshape -from tensorflow_probability.python.bijectors import scale as scale_lib -from tensorflow_probability.python.bijectors import shift from tensorflow_probability.python.bijectors import split from tensorflow_probability.python.distributions import batch_broadcast -from tensorflow_probability.python.distributions import beta from tensorflow_probability.python.distributions import blockwise -from tensorflow_probability.python.distributions import chi2 from tensorflow_probability.python.distributions import deterministic -from tensorflow_probability.python.distributions import exponential -from tensorflow_probability.python.distributions import gamma -from tensorflow_probability.python.distributions import half_normal from tensorflow_probability.python.distributions import independent from tensorflow_probability.python.distributions import \ - joint_distribution_auto_batched + joint_distribution_auto_batched from tensorflow_probability.python.distributions import \ - joint_distribution_coroutine + joint_distribution_coroutine from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.distributions import sample from tensorflow_probability.python.distributions import transformed_distribution -from tensorflow_probability.python.distributions import truncated_normal -from tensorflow_probability.python.distributions import uniform -from tensorflow_probability.python.experimental.bijectors import build_trainable_highway_flow +from tensorflow_probability.python.experimental.bijectors import \ + build_trainable_highway_flow from tensorflow_probability.python.internal import samplers __all__ = [ - 'register_cf_substitution_rule', 'build_cf_surrogate_posterior' ] Root = joint_distribution_coroutine.JointDistributionCoroutine.Root -# TODO: the part until the function build_cf_surrogate_posterior is identical to -# the one in automatic_structured_vi. Should we remove it from this file and -# import them directly from automatic_structured_vi? -_NON_STATISTICAL_PARAMS = [ - 'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum', - 'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support', - 'num_probit_terms_approx' -] -_NON_TRAINABLE_PARAMS = ['low', 'high'] - -# Registry of transformations that are applied to distributions in the prior -# before defining the surrogate family. - - -ASVI_SURROGATE_SUBSTITUTIONS = {} - - -def _as_substituted_distribution(distribution): - """Applies all substitution rules that match a distribution.""" - for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items(): - if condition(distribution): - distribution = substitution_fn(distribution) - return distribution - - -def register_cf_substitution_rule(condition, substitution_fn): - """Registers a rule for substituting distributions in ASVI surrogates. - - Args: - condition: Python `callable` that takes a Distribution instance and - returns a Python `bool` indicating whether or not to substitute it. - May also be a class type such as `tfd.Normal`, in which case the - condition is interpreted as - `lambda distribution: isinstance(distribution, class)`. - substitution_fn: Python `callable` that takes a Distribution - instance and returns a new Distribution instance used to define - the ASVI surrogate posterior. Note that this substitution does not modify - the original model. - - #### Example - - To use a Normal surrogate for all location-scale family distributions, we - could register the substitution: - - ```python - tfp.experimental.vi.register_asvi_surrogate_substitution( - condition=lambda distribution: ( - hasattr(distribution, 'loc') and hasattr(distribution, 'scale')) - substitution_fn=lambda distribution: ( - # Invoking the event space bijector applies any relevant constraints, - # e.g., that HalfCauchy samples must be `>= loc`. - distribution.experimental_default_event_space_bijector()( - tfd.Normal(loc=distribution.loc, scale=distribution.scale))) - ``` - - This rule will fire when ASVI encounters a location-scale distribution, - and instructs ASVI to build a surrogate 'as if' the model had just used a - (possibly constrained) Normal in its place. Note that we could have used a - more precise condition, e.g., to limit the substitution to distributions with - a specific `name`, if we had reason to think that a Normal distribution would - be a good surrogate for some model variables but not others. - - """ - global ASVI_SURROGATE_SUBSTITUTIONS - if inspect.isclass(condition): - condition = lambda distribution, cls=condition: isinstance( - # pylint: disable=g-long-lambda - distribution, cls) - ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn - -# Default substitutions attempt to express distributions using the most -# flexible available parameterization. -# pylint: disable=g-long-lambda -register_cf_substitution_rule( - half_normal.HalfNormal, - lambda dist: truncated_normal.TruncatedNormal( - loc=0., scale=dist.scale, low=0., high=dist.scale * 10.)) -register_cf_substitution_rule( - uniform.Uniform, - lambda dist: shift.Shift(dist.low)( - scale_lib.Scale(dist.high - dist.low)( - beta.Beta(concentration0=tf.ones_like(dist.mean()), - concentration1=1.)))) -register_cf_substitution_rule( - exponential.Exponential, - lambda dist: gamma.Gamma(concentration=1., rate=dist.rate)) -register_cf_substitution_rule( - chi2.Chi2, - lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5)) - - -# pylint: enable=g-long-lambda - -# a single JointDistribution. def build_cf_surrogate_posterior( - prior, - num_auxiliary_variables=0, - initial_prior_weight=0.98, - num_layers=3, - seed=None, - name=None): - """Builds a structured surrogate posterior with cascading flows. - - Cascading Flows (CF) [1] is a method that automatically construct a - variational approximation given an input probabilistic program. CF combines - ASVI [2] with the flexibility of normalizing flows, by transforming the - conditional distributions of the prior program with HighwayFlow architectures, - to steer the prior towards the observed data. More details on the HighwayFlow - architecture can be found in [1] and in the tfp bijector `HighwayFlow`. - It is possible to add auxiliary variables to the prior program to further - increase the flexibility of cascading flows, useful especially in the - cases where the input program has low dimensionality. The auxiliary variables - are sampled from a global linear flow, to account for statistical dependencies - among variables, and then transformed with local HighwayFlows together with - samples form the prior. Note that when using auxiliary variables it is - necessary to modify the variational lower bound [3]. - - Args: - prior: tfd.JointDistribution instance of the prior. - num_auxiliary_variables: The number of auxiliary variables to use for each - variable in the input program. Default value: `0`. - initial_prior_weight: Optional float value (either static or tensor value) - on the interval [0, 1]. A larger value creates an initial surrogate - distribution with more dependence on the prior structure. Default value: - `0.98`. - num_layers: Number of layers to use in each Highway Flow architecture. All - the layers will have `softplus` activation function, apart from the last one - which will have linear activation. Default value: `3`. - seed: Python `int` seed for random initialization. - name: Optional string. Default value: `build_cf_surrogate_posterior`. - - Returns: - surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance - whose samples have shape and structure matching that of `prior`. - - Raises: - TypeError: The `prior` argument cannot be a nested `JointDistribution`. - - ### Examples - - Consider a Brownian motion model expressed as a JointDistribution: - - ```python - prior_loc = 0. - innovation_noise = .1 - - def model_fn(): - new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) - for i in range(4): - new = yield tfd.Normal(loc=new, scale=innovation_noise) - - prior = tfd.JointDistributionCoroutineAutoBatched(model_fn) - ``` - - Let's use variational inference to approximate the posterior. We'll build a - surrogate posterior distribution by feeding in the prior distribution. - - ```python - surrogate_posterior = - tfp.experimental.vi.build_cf_surrogate_posterior(prior) - ``` - - This creates a trainable joint distribution, defined by variables in - `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior` - to fit this distribution by minimizing a divergence to the true posterior. - - ```python - losses = tfp.vi.fit_surrogate_posterior( - target_log_prob_fn, - surrogate_posterior=surrogate_posterior, - num_steps=100, - optimizer=tf.optimizers.Adam(0.1), - sample_size=10) - - # After optimization, samples from the surrogate will approximate - # samples from the true posterior. - samples = surrogate_posterior.sample(100) - posterior_mean = [tf.reduce_mean(x) for x in samples] - posterior_std = [tf.math.reduce_std(x) for x in samples] - ``` - - When using auxiliary variables, we need some modifications for loss and - samples, as samples will return also the global variables and transformed - auxiliary variables - - ```python - num_aux_vars=10 - target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape( - tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1), - scale=tf.reshape(tfp.util.TransformedVariable( - [tf.random.uniform((1,), minval=0.01, maxval=1.) - for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1) - - def target_log_prob_aux_vars(z_and_eps): - z = [x[0] for x in z_and_eps[1:]] - eps = [x[1] for x in z_and_eps[1:]] - lp_z = target_log_prob_fn(z) - lp_eps = tf.reshape(tf.reduce_sum(target_dist.log_prob(eps), 0), lp_z.shape) - return lp_z + lp_eps - - target_log_prob = lambda *values: target_log_prob_aux_vars(values) - cf_surrogate_posterior = build_cf_surrogate_posterior(prior, - num_auxiliary_variables=num_aux_vars) - trainable_variables = list(cf_surrogate_posterior.trainable_variables) - trainable_variables.extend(list(target_dist.trainable_variables)) - cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob, - cf_surrogate_posterior, - optimizer=tf.optimizers.Adam(0.01), - num_steps=8000, - sample_size=50, - trainable_variables=trainable_variables) - - cf_posterior_samples = cf_surrogate_posterior.sample(num_samples) - cf_posterior_samples = tf.convert_to_tensor( - [s[0] for s in cf_posterior_samples[1:]]) - ``` - - #### References - [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. "Automatic - variational inference with cascading flows." arXiv preprint arXiv:2102.04801 - (2021). - - [2]: Ambrogioni, Luca, et al. "Automatic structured variational inference." - International Conference on Artificial Intelligence and Statistics. PMLR, - 2021. - - [3]: Ranganath, Rajesh, Dustin Tran, and David Blei. "Hierarchical variational - models." International Conference on Machine Learning. PMLR, 2016. - - """ - with tf.name_scope(name or 'build_cf_surrogate_posterior'): - surrogate_posterior, variables = _cf_surrogate_for_distribution( - dist=prior, - base_distribution_surrogate_fn=functools.partial( - _cf_convex_update_for_base_distribution, - initial_prior_weight=initial_prior_weight, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers), - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - seed=seed) - surrogate_posterior.also_track = variables - return surrogate_posterior + prior, + num_auxiliary_variables=0, + initial_prior_weight=0.98, + num_layers=3, + seed=None, + name=None): + """Builds a structured surrogate posterior with cascading flows. + + Cascading Flows (CF) [1] is a method that automatically construct a + variational approximation given an input probabilistic program. CF combines + ASVI [2] with the flexibility of normalizing flows, by transforming the + conditional distributions of the prior program with HighwayFlow architectures, + to steer the prior towards the observed data. More details on the HighwayFlow + architecture can be found in [1] and in the tfp bijector `HighwayFlow`. + It is possible to add auxiliary variables to the prior program to further + increase the flexibility of cascading flows, useful especially in the + cases where the input program has low dimensionality. The auxiliary variables + are sampled from a global linear flow, to account for statistical dependencies + among variables, and then transformed with local HighwayFlows together with + samples form the prior. Note that when using auxiliary variables it is + necessary to modify the variational lower bound [3]. + + Args: + prior: tfd.JointDistribution instance of the prior. + num_auxiliary_variables: The number of auxiliary variables to use for each + variable in the input program. Default value: `0`. + initial_prior_weight: Optional float value (either static or tensor value) + on the interval [0, 1]. A larger value creates an initial surrogate + distribution with more dependence on the prior structure. Default value: + `0.98`. + num_layers: Number of layers to use in each Highway Flow architecture. All + the layers will have `softplus` activation function, apart from the last one + which will have linear activation. Default value: `3`. + seed: Python `int` seed for random initialization. + name: Optional string. Default value: `build_cf_surrogate_posterior`. + + Returns: + surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance + whose samples have shape and structure matching that of `prior`. + + Raises: + TypeError: The `prior` argument cannot be a nested `JointDistribution`. + + ### Examples + + Consider a Brownian motion model expressed as a JointDistribution: + + ```python + prior_loc = 0. + innovation_noise = .1 + + def model_fn(): + new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise) + for i in range(4): + new = yield tfd.Normal(loc=new, scale=innovation_noise) + + prior = tfd.JointDistributionCoroutineAutoBatched(model_fn) + ``` + + Let's use variational inference to approximate the posterior. We'll build a + surrogate posterior distribution by feeding in the prior distribution. + + ```python + surrogate_posterior = + tfp.experimental.vi.build_cf_surrogate_posterior(prior) + ``` + + This creates a trainable joint distribution, defined by variables in + `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior` + to fit this distribution by minimizing a divergence to the true posterior. + + ```python + losses = tfp.vi.fit_surrogate_posterior( + target_log_prob_fn, + surrogate_posterior=surrogate_posterior, + num_steps=100, + optimizer=tf.optimizers.Adam(0.1), + sample_size=10) + + # After optimization, samples from the surrogate will approximate + # samples from the true posterior. + samples = surrogate_posterior.sample(100) + posterior_mean = [tf.reduce_mean(x) for x in samples] + posterior_std = [tf.math.reduce_std(x) for x in samples] + ``` + + When using auxiliary variables, we need some modifications for loss and + samples, as samples will return also the global variables and transformed + auxiliary variables + + ```python + num_aux_vars=10 + target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape( + tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1), + scale=tf.reshape(tfp.util.TransformedVariable( + [tf.random.uniform((1,), minval=0.01, maxval=1.) + for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1) + + def target_log_prob_aux_vars(z_and_eps): + z = [x[0] for x in z_and_eps[1:]] + eps = [x[1] for x in z_and_eps[1:]] + lp_z = target_log_prob_fn(z) + lp_eps = tf.reshape(tf.reduce_sum(target_dist.log_prob(eps), 0), lp_z.shape) + return lp_z + lp_eps + + target_log_prob = lambda *values: target_log_prob_aux_vars(values) + cf_surrogate_posterior = build_cf_surrogate_posterior(prior, + num_auxiliary_variables=num_aux_vars) + trainable_variables = list(cf_surrogate_posterior.trainable_variables) + trainable_variables.extend(list(target_dist.trainable_variables)) + cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob, + cf_surrogate_posterior, + optimizer=tf.optimizers.Adam(0.01), + num_steps=8000, + sample_size=50, + trainable_variables=trainable_variables) + + cf_posterior_samples = cf_surrogate_posterior.sample(num_samples) + cf_posterior_samples = tf.convert_to_tensor( + [s[0] for s in cf_posterior_samples[1:]]) + ``` + + #### References + [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. "Automatic + variational inference with cascading flows." arXiv preprint arXiv:2102.04801 + (2021). + + [2]: Ambrogioni, Luca, et al. "Automatic structured variational inference." + International Conference on Artificial Intelligence and Statistics. PMLR, + 2021. + + [3]: Ranganath, Rajesh, Dustin Tran, and David Blei. "Hierarchical variational + models." International Conference on Machine Learning. PMLR, 2016. + + """ + with tf.name_scope(name or 'build_cf_surrogate_posterior'): + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist=prior, + base_distribution_surrogate_fn=functools.partial( + _cf_convex_update_for_base_distribution, + initial_prior_weight=initial_prior_weight, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers), + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + seed=seed) + surrogate_posterior.also_track = variables + return surrogate_posterior def _cf_surrogate_for_distribution(dist, @@ -312,207 +208,207 @@ def _cf_surrogate_for_distribution(dist, sample_shape=None, variables=None, seed=None): - """Recursively creates CF surrogates, and creates new variables if needed. - - Args: - dist: a `tfd.Distribution` instance. - base_distribution_surrogate_fn: Callable to build a surrogate posterior - for a 'base' (non-meta and non-joint) distribution, with signature - `surrogate_posterior, variables = base_distribution_fn( - dist, sample_shape=None, variables=None, seed=None)`. - num_auxiliary_variables: The number of auxiliary variables to use for each - variable in the input program. - num_layers: Number of layers to use in each Highway Flow architecture. - global_auxiliary_variables: The sampled global auxiliary variables - (available only if using auxiliary variables). Default value: None. - sample_shape: Optional `Tensor` shape of samples drawn from `dist` by - `tfd.Sample` wrappers. If not `None`, the surrogate's event will include - independent sample dimensions, i.e., it will have event shape - `concat([sample_shape, dist.event_shape], axis=0)`. - Default value: `None`. - variables: Optional nested structure of `tf.Variable`s returned from a - previous call to `_cf_surrogate_for_distribution`. If `None`, - new variables will be created; otherwise, constructs a surrogate posterior - backed by the passed-in variables. - Default value: `None`. - seed: Python `int` seed for random initialization. - Returns: - surrogate_posterior: Instance of `tfd.Distribution` representing a trainable - surrogate posterior distribution, with the same structure and `name` as - `dist`, and with addition of global and local auxiliary variables if - `num_auxiliary_variables > 0`. - variables: Nested structure of `tf.Variable` trainable parameters for the - surrogate posterior. If `dist` is a base distribution, this is - a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape` - bijectors. If `dist` is a joint distribution, this is a `dist.dtype` - structure of such `tfb.Chain`s. - """ - - # Apply any substitutions, while attempting to preserve the original name. - dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) - - if hasattr(dist, '_model_coroutine'): - surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( - dist, - base_distribution_surrogate_fn=base_distribution_surrogate_fn, - variables=variables, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - global_auxiliary_variables=global_auxiliary_variables, - seed=seed) - else: - surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, - global_auxiliary_variables=global_auxiliary_variables, - num_layers=num_layers, - seed=seed) - return surrogate_posterior, variables + """Recursively creates CF surrogates, and creates new variables if needed. + + Args: + dist: a `tfd.Distribution` instance. + base_distribution_surrogate_fn: Callable to build a surrogate posterior + for a 'base' (non-meta and non-joint) distribution, with signature + `surrogate_posterior, variables = base_distribution_fn( + dist, sample_shape=None, variables=None, seed=None)`. + num_auxiliary_variables: The number of auxiliary variables to use for each + variable in the input program. + num_layers: Number of layers to use in each Highway Flow architecture. + global_auxiliary_variables: The sampled global auxiliary variables + (available only if using auxiliary variables). Default value: None. + sample_shape: Optional `Tensor` shape of samples drawn from `dist` by + `tfd.Sample` wrappers. If not `None`, the surrogate's event will include + independent sample dimensions, i.e., it will have event shape + `concat([sample_shape, dist.event_shape], axis=0)`. + Default value: `None`. + variables: Optional nested structure of `tf.Variable`s returned from a + previous call to `_cf_surrogate_for_distribution`. If `None`, + new variables will be created; otherwise, constructs a surrogate posterior + backed by the passed-in variables. + Default value: `None`. + seed: Python `int` seed for random initialization. + Returns: + surrogate_posterior: Instance of `tfd.Distribution` representing a trainable + surrogate posterior distribution, with the same structure and `name` as + `dist`, and with addition of global and local auxiliary variables if + `num_auxiliary_variables > 0`. + variables: Nested structure of `tf.Variable` trainable parameters for the + surrogate posterior. If `dist` is a base distribution, this is + a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape` + bijectors. If `dist` is a joint distribution, this is a `dist.dtype` + structure of such `tfb.Chain`s. + """ + + # Apply any substitutions, while attempting to preserve the original name. + dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) + + if hasattr(dist, '_model_coroutine'): + surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + variables=variables, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + global_auxiliary_variables=global_auxiliary_variables, + seed=seed) + else: + surrogate_posterior, variables = base_distribution_surrogate_fn( + dist=dist, sample_shape=sample_shape, variables=variables, + global_auxiliary_variables=global_auxiliary_variables, + num_layers=num_layers, + seed=seed) + return surrogate_posterior, variables def _build_highway_flow_block(num_layers, width, residual_fraction_initial_value, gate_first_n, seed): - bijectors = [] - - for _ in range(0, num_layers - 1): - bijectors.append( - build_trainable_highway_flow(width, - residual_fraction_initial_value=residual_fraction_initial_value, - activation_fn=tf.nn.softplus, - gate_first_n=gate_first_n, seed=seed)) + bijectors = [] + + for _ in range(0, num_layers - 1): bijectors.append( - build_trainable_highway_flow(width, - residual_fraction_initial_value=residual_fraction_initial_value, - activation_fn=None, - gate_first_n=gate_first_n, seed=seed)) + build_trainable_highway_flow(width, + residual_fraction_initial_value=residual_fraction_initial_value, + activation_fn=tf.nn.softplus, + gate_first_n=gate_first_n, seed=seed)) + bijectors.append( + build_trainable_highway_flow(width, + residual_fraction_initial_value=residual_fraction_initial_value, + activation_fn=None, + gate_first_n=gate_first_n, seed=seed)) - return bijectors + return bijectors def _cf_surrogate_for_joint_distribution( - dist, base_distribution_surrogate_fn, variables, - num_auxiliary_variables, num_layers, global_auxiliary_variables, - seed=None): - """Builds a structured joint surrogate posterior for a joint model.""" + dist, base_distribution_surrogate_fn, variables, + num_auxiliary_variables, num_layers, global_auxiliary_variables, + seed=None): + """Builds a structured joint surrogate posterior for a joint model.""" - # Probabilistic program for CF surrogate posterior. - flat_variables = dist._model_flatten( - variables) if variables else None # pylint: disable=protected-access - prior_coroutine = dist._model_coroutine # pylint: disable=protected-access + # Probabilistic program for CF surrogate posterior. + flat_variables = dist._model_flatten( + variables) if variables else None # pylint: disable=protected-access + prior_coroutine = dist._model_coroutine # pylint: disable=protected-access - def posterior_generator(seed=seed): - prior_gen = prior_coroutine() - dist = next(prior_gen) + def posterior_generator(seed=seed): + prior_gen = prior_coroutine() + dist = next(prior_gen) - if num_auxiliary_variables > 0: - i = 1 + if num_auxiliary_variables > 0: + i = 1 - if flat_variables: - variables = flat_variables[0] + if flat_variables: + variables = flat_variables[0] - else: + else: - bijectors = _build_highway_flow_block( - num_layers, - width=num_auxiliary_variables, - residual_fraction_initial_value=0, # not used - gate_first_n=0, seed=seed) - variables = chain.Chain(bijectors=list(reversed(bijectors))) + bijectors = _build_highway_flow_block( + num_layers, + width=num_auxiliary_variables, + residual_fraction_initial_value=0, # not used + gate_first_n=0, seed=seed) + variables = chain.Chain(bijectors=list(reversed(bijectors))) - eps = transformed_distribution.TransformedDistribution( - distribution=sample.Sample(normal.Normal(0., 1.), - num_auxiliary_variables), - bijector=variables) + eps = transformed_distribution.TransformedDistribution( + distribution=sample.Sample(normal.Normal(0., 1.), + num_auxiliary_variables), + bijector=variables) - eps = Root(eps) + eps = Root(eps) - value_out = yield (eps if flat_variables - else (eps, variables)) + value_out = yield (eps if flat_variables + else (eps, variables)) - global_auxiliary_variables = value_out + global_auxiliary_variables = value_out + + else: + global_auxiliary_variables = None + i = 0 + + try: + while True: + was_root = isinstance(dist, Root) + if was_root: + dist = dist.distribution + + seed, init_seed = samplers.split_seed(seed) + surrogate_posterior, variables = _cf_surrogate_for_distribution( + dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + variables=flat_variables[i] if flat_variables else None, + global_auxiliary_variables=global_auxiliary_variables, + seed=init_seed) + + if was_root and num_auxiliary_variables == 0: + surrogate_posterior = Root(surrogate_posterior) + # If variables were not given---i.e., we're creating new + # variables---then yield the new variables along with the surrogate + # posterior. This assumes an execution context such as + # `_extract_variables_from_coroutine_model` below that will capture and + # save the variables. + value_out = yield (surrogate_posterior if flat_variables + else (surrogate_posterior, variables)) + if type(value_out) == list: + if len(dist.event_shape) == 0: + dist = prior_gen.send(tf.squeeze(value_out[0], -1)) + else: + dist = prior_gen.send(value_out[0]) else: - global_auxiliary_variables = None - i = 0 - - try: - while True: - was_root = isinstance(dist, Root) - if was_root: - dist = dist.distribution - - seed, init_seed = samplers.split_seed(seed) - surrogate_posterior, variables = _cf_surrogate_for_distribution( - dist, - base_distribution_surrogate_fn=base_distribution_surrogate_fn, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - variables=flat_variables[i] if flat_variables else None, - global_auxiliary_variables=global_auxiliary_variables, - seed=init_seed) - - if was_root and num_auxiliary_variables == 0: - surrogate_posterior = Root(surrogate_posterior) - # If variables were not given---i.e., we're creating new - # variables---then yield the new variables along with the surrogate - # posterior. This assumes an execution context such as - # `_extract_variables_from_coroutine_model` below that will capture and - # save the variables. - value_out = yield (surrogate_posterior if flat_variables - else (surrogate_posterior, variables)) - if type(value_out) == list: - if len(dist.event_shape) == 0: - dist = prior_gen.send(tf.squeeze(value_out[0], -1)) - else: - dist = prior_gen.send(value_out[0]) - - else: - dist = prior_gen.send(value_out) - i += 1 - except StopIteration: - pass - - if variables is None: - # Run the generator to create variables, then call ourselves again - # to construct the surrogate JD from these variables. Note that we can't - # just create a JDC from the current `posterior_generator`, because it will - # try to build new variables on every invocation; the recursive call will - # define a new `posterior_generator` that knows about the variables we're - # about to create. - return _cf_surrogate_for_joint_distribution( - dist=dist, - base_distribution_surrogate_fn=base_distribution_surrogate_fn, - num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers, - global_auxiliary_variables=global_auxiliary_variables, - variables=dist._model_unflatten( - # pylint: disable=protected-access - _extract_variables_from_coroutine_model( - posterior_generator, seed=seed))) - - # Temporary workaround for bijector caching issues with autobatched JDs. - surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( - posterior_generator, - use_vectorized_map=dist.use_vectorized_map, - name=_get_name(dist)) - - # Ensure that the surrogate posterior structure matches that of the prior. - # todo: check me, do we need this? in case needs to be modified - # if we use auxiliary variables, then the structure won't match the one of the - # prior - '''try: - tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) - except TypeError: - tokenize = lambda jd: jd._model_unflatten( - # pylint: disable=protected-access, g-long-lambda - range(len(jd._model_flatten(jd.dtype))) + dist = prior_gen.send(value_out) + i += 1 + except StopIteration: + pass + + if variables is None: + # Run the generator to create variables, then call ourselves again + # to construct the surrogate JD from these variables. Note that we can't + # just create a JDC from the current `posterior_generator`, because it will + # try to build new variables on every invocation; the recursive call will + # define a new `posterior_generator` that knows about the variables we're + # about to create. + return _cf_surrogate_for_joint_distribution( + dist=dist, + base_distribution_surrogate_fn=base_distribution_surrogate_fn, + num_auxiliary_variables=num_auxiliary_variables, + num_layers=num_layers, + global_auxiliary_variables=global_auxiliary_variables, + variables=dist._model_unflatten( # pylint: disable=protected-access - ) - surrogate_posterior = restructure.Restructure( - output_structure=tokenize(dist), - input_structure=tokenize(surrogate_posterior))( - surrogate_posterior, name=_get_name(dist))''' - return surrogate_posterior, variables + _extract_variables_from_coroutine_model( + posterior_generator, seed=seed))) + + # Temporary workaround for bijector caching issues with autobatched JDs. + surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( + posterior_generator, + use_vectorized_map=dist.use_vectorized_map, + name=_get_name(dist)) + + # Ensure that the surrogate posterior structure matches that of the prior. + # todo: check me, do we need this? in case needs to be modified + # if we use auxiliary variables, then the structure won't match the one of the + # prior + '''try: + tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) + except TypeError: + tokenize = lambda jd: jd._model_unflatten( + # pylint: disable=protected-access, g-long-lambda + range(len(jd._model_flatten(jd.dtype))) + # pylint: disable=protected-access + ) + surrogate_posterior = restructure.Restructure( + output_structure=tokenize(dist), + input_structure=tokenize(surrogate_posterior))( + surrogate_posterior, name=_get_name(dist))''' + return surrogate_posterior, variables # todo: sample_shape is not used.. can remove? @@ -524,84 +420,84 @@ def _cf_convex_update_for_base_distribution(dist, variables, sample_shape=None, seed=None): - """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" - - if variables is None: - actual_event_shape = dist.event_shape_tensor() - int_event_shape = int(actual_event_shape) if \ - actual_event_shape.shape.as_list()[0] > 0 else 1 - bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] - - bijectors.extend( - _build_highway_flow_block( - num_layers, - width=tf.reduce_prod( - actual_event_shape + num_auxiliary_variables), - residual_fraction_initial_value=initial_prior_weight, - gate_first_n=int_event_shape, seed=seed)) - - bijectors.append( - reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" + + if variables is None: + actual_event_shape = dist.event_shape_tensor() + int_event_shape = int(actual_event_shape) if \ + actual_event_shape.shape.as_list()[0] > 0 else 1 + bijectors = [reshape.Reshape([-1], + event_shape_in=actual_event_shape + + num_auxiliary_variables)] + + bijectors.extend( + _build_highway_flow_block( + num_layers, + width=tf.reduce_prod( + actual_event_shape + num_auxiliary_variables), + residual_fraction_initial_value=initial_prior_weight, + gate_first_n=int_event_shape, seed=seed)) - variables = chain.Chain(bijectors=list(reversed(bijectors))) + bijectors.append( + reshape.Reshape(actual_event_shape + num_auxiliary_variables)) - if num_auxiliary_variables > 0: - batch_shape = global_auxiliary_variables.shape[0] if len( - global_auxiliary_variables.shape) > 1 else [] - - cascading_flows = split.Split( - [-1, num_auxiliary_variables])( - transformed_distribution.TransformedDistribution( - distribution=blockwise.Blockwise([ - batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), - independent.Independent( - deterministic.Deterministic( - global_auxiliary_variables), - reinterpreted_batch_ndims=1)]), - bijector=variables)) + variables = chain.Chain(bijectors=list(reversed(bijectors))) - else: - cascading_flows = transformed_distribution.TransformedDistribution( - distribution=dist, - bijector=variables) + if num_auxiliary_variables > 0: + batch_shape = global_auxiliary_variables.shape[0] if len( + global_auxiliary_variables.shape) > 1 else [] + + cascading_flows = split.Split( + [-1, num_auxiliary_variables])( + transformed_distribution.TransformedDistribution( + distribution=blockwise.Blockwise([ + batch_broadcast.BatchBroadcast(dist, + to_shape=batch_shape), + independent.Independent( + deterministic.Deterministic( + global_auxiliary_variables), + reinterpreted_batch_ndims=1)]), + bijector=variables)) + + else: + cascading_flows = transformed_distribution.TransformedDistribution( + distribution=dist, + bijector=variables) - return cascading_flows, variables + return cascading_flows, variables def _extract_variables_from_coroutine_model(model_fn, seed=None): - """Extracts variables from a generator that yields (dist, variables) pairs.""" - gen = model_fn() - try: - dist, dist_variables = next(gen) - flat_variables = [dist_variables] - while True: - seed, local_seed = samplers.split_seed(seed, n=2) - sampled_value = (dist.distribution.sample(seed=local_seed) - if isinstance(dist, Root) - else dist.sample(seed=local_seed)) - dist, dist_variables = gen.send( - sampled_value) # tf.concat(sampled_value, axis=0) - flat_variables.append(dist_variables) - except StopIteration: - pass - return flat_variables + """Extracts variables from a generator that yields (dist, variables) pairs.""" + gen = model_fn() + try: + dist, dist_variables = next(gen) + flat_variables = [dist_variables] + while True: + seed, local_seed = samplers.split_seed(seed, n=2) + sampled_value = (dist.distribution.sample(seed=local_seed) + if isinstance(dist, Root) + else dist.sample(seed=local_seed)) + dist, dist_variables = gen.send( + sampled_value) # tf.concat(sampled_value, axis=0) + flat_variables.append(dist_variables) + except StopIteration: + pass + return flat_variables def _set_name(dist, name): - """Copies a distribution-like object, replacing its name.""" - if hasattr(dist, 'copy'): - return dist.copy(name=name) - # Some distribution-like entities such as JointDistributionPinned don't - # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set - # the name directly. - dist = copy.copy(dist) - dist._name = name # pylint: disable=protected-access - return dist + """Copies a distribution-like object, replacing its name.""" + if hasattr(dist, 'copy'): + return dist.copy(name=name) + # Some distribution-like entities such as JointDistributionPinned don't + # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set + # the name directly. + dist = copy.copy(dist) + dist._name = name # pylint: disable=protected-access + return dist def _get_name(dist): - """Attempts to get a distribution's short name, excluding the name scope.""" - return getattr(dist, 'parameters', {}).get('name', dist.name) + """Attempts to get a distribution's short name, excluding the name scope.""" + return getattr(dist, 'parameters', {}).get('name', dist.name) From 6305084e32d76563f4afbc3f9e426ddc33c2266b Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 3 Jun 2021 15:05:48 +0200 Subject: [PATCH 35/54] removed sample_shape --- .../python/experimental/vi/cascading_flows.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 0a87ae4399..5ca9b97bdf 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -205,7 +205,6 @@ def _cf_surrogate_for_distribution(dist, num_auxiliary_variables, num_layers, global_auxiliary_variables=None, - sample_shape=None, variables=None, seed=None): """Recursively creates CF surrogates, and creates new variables if needed. @@ -215,17 +214,12 @@ def _cf_surrogate_for_distribution(dist, base_distribution_surrogate_fn: Callable to build a surrogate posterior for a 'base' (non-meta and non-joint) distribution, with signature `surrogate_posterior, variables = base_distribution_fn( - dist, sample_shape=None, variables=None, seed=None)`. + dist, variables=None, seed=None)`. num_auxiliary_variables: The number of auxiliary variables to use for each variable in the input program. num_layers: Number of layers to use in each Highway Flow architecture. global_auxiliary_variables: The sampled global auxiliary variables (available only if using auxiliary variables). Default value: None. - sample_shape: Optional `Tensor` shape of samples drawn from `dist` by - `tfd.Sample` wrappers. If not `None`, the surrogate's event will include - independent sample dimensions, i.e., it will have event shape - `concat([sample_shape, dist.event_shape], axis=0)`. - Default value: `None`. variables: Optional nested structure of `tf.Variable`s returned from a previous call to `_cf_surrogate_for_distribution`. If `None`, new variables will be created; otherwise, constructs a surrogate posterior @@ -244,9 +238,6 @@ def _cf_surrogate_for_distribution(dist, structure of such `tfb.Chain`s. """ - # Apply any substitutions, while attempting to preserve the original name. - dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist)) - if hasattr(dist, '_model_coroutine'): surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( dist, @@ -258,7 +249,7 @@ def _cf_surrogate_for_distribution(dist, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( - dist=dist, sample_shape=sample_shape, variables=variables, + dist=dist, variables=variables, global_auxiliary_variables=global_auxiliary_variables, num_layers=num_layers, seed=seed) @@ -386,7 +377,6 @@ def posterior_generator(seed=seed): _extract_variables_from_coroutine_model( posterior_generator, seed=seed))) - # Temporary workaround for bijector caching issues with autobatched JDs. surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched( posterior_generator, use_vectorized_map=dist.use_vectorized_map, @@ -411,14 +401,12 @@ def posterior_generator(seed=seed): return surrogate_posterior, variables -# todo: sample_shape is not used.. can remove? def _cf_convex_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables, num_layers, global_auxiliary_variables, variables, - sample_shape=None, seed=None): """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" From 2f27b952669390fbd2d5ba4519aac866491e09e1 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Thu, 3 Jun 2021 15:13:30 +0200 Subject: [PATCH 36/54] changed if statement and array slicing for value_out --- .../python/experimental/vi/cascading_flows.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 5ca9b97bdf..2a6e8c3834 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -347,9 +347,12 @@ def posterior_generator(seed=seed): # save the variables. value_out = yield (surrogate_posterior if flat_variables else (surrogate_posterior, variables)) - if type(value_out) == list: + + # When using auxiliary variables, value out is a list containing + # [latent_variables, auxiliary_variables]. + if num_auxiliary_variables>0: if len(dist.event_shape) == 0: - dist = prior_gen.send(tf.squeeze(value_out[0], -1)) + dist = prior_gen.send(value_out[0][...,0]) else: dist = prior_gen.send(value_out[0]) From 326a7660e69efd678b2e051a5ffc7e91ee085602 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 14 Jun 2021 15:26:14 +0200 Subject: [PATCH 37/54] changed docstrings for target_dist --- .../python/experimental/vi/cascading_flows.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 2a6e8c3834..8c93bde2c6 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -142,11 +142,12 @@ def model_fn(): ```python num_aux_vars=10 - target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape( - tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1), - scale=tf.reshape(tfp.util.TransformedVariable( - [tf.random.uniform((1,), minval=0.01, maxval=1.) - for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1) + event_len = len(prior.event_shape_tensor()) + target_dist = tfd.Independent( + tfd.Normal(loc=tf.Variable(tf.random.normal((event_len,num_aux_vars))), + scale=tfp.util.TransformedVariable( + tf.random.uniform((event_len,num_aux_vars), minval=0.01, maxval=1.) + , bijector=tfb.Softplus())), 2) def target_log_prob_aux_vars(z_and_eps): z = [x[0] for x in z_and_eps[1:]] From 1f295dacadf42ca412702c0de22614fc69a8a552 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 14 Jun 2021 16:40:44 +0200 Subject: [PATCH 38/54] expanded cf to cascading flows and changed bijector --- .../python/experimental/vi/cascading_flows.py | 67 +++++++++++-------- .../experimental/vi/cascading_flows_test.py | 35 +++++++++- 2 files changed, 73 insertions(+), 29 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 8c93bde2c6..8599bf51a6 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -25,7 +25,11 @@ import tensorflow.compat.v2 as tf from tensorflow_probability.python.bijectors import chain +from tensorflow_probability.python.bijectors import identity +from tensorflow_probability.python.bijectors import invert +from tensorflow_probability.python.bijectors import joint_map from tensorflow_probability.python.bijectors import reshape +from tensorflow_probability.python.bijectors import restructure from tensorflow_probability.python.bijectors import split from tensorflow_probability.python.distributions import batch_broadcast from tensorflow_probability.python.distributions import blockwise @@ -43,13 +47,13 @@ from tensorflow_probability.python.internal import samplers __all__ = [ - 'build_cf_surrogate_posterior' + 'build_cascading_flow_surrogate_posterior' ] Root = joint_distribution_coroutine.JointDistributionCoroutine.Root -def build_cf_surrogate_posterior( +def build_cascading_flow_surrogate_posterior( prior, num_auxiliary_variables=0, initial_prior_weight=0.98, @@ -84,7 +88,7 @@ def build_cf_surrogate_posterior( the layers will have `softplus` activation function, apart from the last one which will have linear activation. Default value: `3`. seed: Python `int` seed for random initialization. - name: Optional string. Default value: `build_cf_surrogate_posterior`. + name: Optional string. Default value: `build_cascading_flow_surrogate_posterior`. Returns: surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance @@ -114,7 +118,7 @@ def model_fn(): ```python surrogate_posterior = - tfp.experimental.vi.build_cf_surrogate_posterior(prior) + tfp.experimental.vi.build_cascading_flow_surrogate_posterior(prior) ``` This creates a trainable joint distribution, defined by variables in @@ -157,20 +161,20 @@ def target_log_prob_aux_vars(z_and_eps): return lp_z + lp_eps target_log_prob = lambda *values: target_log_prob_aux_vars(values) - cf_surrogate_posterior = build_cf_surrogate_posterior(prior, + cascading_flow_surrogate_posterior = build_cascading_flow_surrogate_posterior(prior, num_auxiliary_variables=num_aux_vars) - trainable_variables = list(cf_surrogate_posterior.trainable_variables) + trainable_variables = list(cascading_flow_surrogate_posterior.trainable_variables) trainable_variables.extend(list(target_dist.trainable_variables)) - cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob, - cf_surrogate_posterior, + cascading_flow_losses = tfp.vi.fit_surrogate_posterior(target_log_prob, + cascading_flow_surrogate_posterior, optimizer=tf.optimizers.Adam(0.01), num_steps=8000, sample_size=50, trainable_variables=trainable_variables) - cf_posterior_samples = cf_surrogate_posterior.sample(num_samples) - cf_posterior_samples = tf.convert_to_tensor( - [s[0] for s in cf_posterior_samples[1:]]) + cascading_flow_posterior_samples = cascading_flow_surrogate_posterior.sample(num_samples) + cascading_flow_posterior_samples = tf.convert_to_tensor( + [s[0] for s in cascading_flow_posterior_samples[1:]]) ``` #### References @@ -186,11 +190,11 @@ def target_log_prob_aux_vars(z_and_eps): models." International Conference on Machine Learning. PMLR, 2016. """ - with tf.name_scope(name or 'build_cf_surrogate_posterior'): - surrogate_posterior, variables = _cf_surrogate_for_distribution( + with tf.name_scope(name or 'build_cascading_flow_surrogate_posterior'): + surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution( dist=prior, base_distribution_surrogate_fn=functools.partial( - _cf_convex_update_for_base_distribution, + _cascading_flow_convex_update_for_base_distribution, initial_prior_weight=initial_prior_weight, num_auxiliary_variables=num_auxiliary_variables, num_layers=num_layers), @@ -201,7 +205,7 @@ def target_log_prob_aux_vars(z_and_eps): return surrogate_posterior -def _cf_surrogate_for_distribution(dist, +def _cascading_flow_surrogate_for_distribution(dist, base_distribution_surrogate_fn, num_auxiliary_variables, num_layers, @@ -222,7 +226,7 @@ def _cf_surrogate_for_distribution(dist, global_auxiliary_variables: The sampled global auxiliary variables (available only if using auxiliary variables). Default value: None. variables: Optional nested structure of `tf.Variable`s returned from a - previous call to `_cf_surrogate_for_distribution`. If `None`, + previous call to `_cascading_flow_surrogate_for_distribution`. If `None`, new variables will be created; otherwise, constructs a surrogate posterior backed by the passed-in variables. Default value: `None`. @@ -240,7 +244,7 @@ def _cf_surrogate_for_distribution(dist, """ if hasattr(dist, '_model_coroutine'): - surrogate_posterior, variables = _cf_surrogate_for_joint_distribution( + surrogate_posterior, variables = _cascading_flow_surrogate_for_joint_distribution( dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, variables=variables, @@ -277,7 +281,7 @@ def _build_highway_flow_block(num_layers, width, return bijectors -def _cf_surrogate_for_joint_distribution( +def _cascading_flow_surrogate_for_joint_distribution( dist, base_distribution_surrogate_fn, variables, num_auxiliary_variables, num_layers, global_auxiliary_variables, seed=None): @@ -330,7 +334,7 @@ def posterior_generator(seed=seed): dist = dist.distribution seed, init_seed = samplers.split_seed(seed) - surrogate_posterior, variables = _cf_surrogate_for_distribution( + surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution( dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, num_auxiliary_variables=num_auxiliary_variables, @@ -370,7 +374,7 @@ def posterior_generator(seed=seed): # try to build new variables on every invocation; the recursive call will # define a new `posterior_generator` that knows about the variables we're # about to create. - return _cf_surrogate_for_joint_distribution( + return _cascading_flow_surrogate_for_joint_distribution( dist=dist, base_distribution_surrogate_fn=base_distribution_surrogate_fn, num_auxiliary_variables=num_auxiliary_variables, @@ -386,6 +390,12 @@ def posterior_generator(seed=seed): use_vectorized_map=dist.use_vectorized_map, name=_get_name(dist)) + '''tokenize = lambda jd: jd._model_unflatten( + # pylint: disable=protected-access, g-long-lambda + range(len(jd._model_flatten(jd.dtype))) + # pylint: disable=protected-access + )''' + # Ensure that the surrogate posterior structure matches that of the prior. # todo: check me, do we need this? in case needs to be modified # if we use auxiliary variables, then the structure won't match the one of the @@ -405,7 +415,7 @@ def posterior_generator(seed=seed): return surrogate_posterior, variables -def _cf_convex_update_for_base_distribution(dist, +def _cascading_flow_convex_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables, num_layers, @@ -436,21 +446,24 @@ def _cf_convex_update_for_base_distribution(dist, variables = chain.Chain(bijectors=list(reversed(bijectors))) if num_auxiliary_variables > 0: - batch_shape = global_auxiliary_variables.shape[0] if len( - global_auxiliary_variables.shape) > 1 else [] + flatten_event = reshape.Reshape( + event_shape_out=[-1], + event_shape_in=dist.event_shape_tensor()) cascading_flows = split.Split( [-1, num_auxiliary_variables])( transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ - batch_broadcast.BatchBroadcast(dist, - to_shape=batch_shape), + transformed_distribution.TransformedDistribution( + distribution=dist, bijector=flatten_event), independent.Independent( - deterministic.Deterministic( - global_auxiliary_variables), + deterministic.Deterministic(global_auxiliary_variables), reinterpreted_batch_ndims=1)]), bijector=variables)) + cascading_flows = joint_map.JointMap( + [invert.Invert(flatten_event), identity.Identity()])(cascading_flows) + else: cascading_flows = transformed_distribution.TransformedDistribution( distribution=dist, diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 0b45486e6c..bfa556d987 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -24,14 +24,45 @@ import tensorflow_probability as tfp from tensorflow_probability.python.internal import prefer_static as ps from tensorflow_probability.python.internal import test_util +from tensorflow.python.util import nest # Dependency imports tfb = tfp.bijectors tfd = tfp.distributions - +Root = tfd.JointDistributionCoroutine.Root @test_util.test_all_tf_execution_regimes +class CascadingFlowTests(test_util.TestCase): + + def test_shapes(self): + @tfd.JointDistributionCoroutine + def test_shapes_model(): + # Matrix-valued random variable with batch shape [3]. + A = yield Root( + tfd.WishartTriL(df=2, scale_tril=tf.eye(2, batch_shape=[3]), name='A')) + # Vector-valued random variable with batch shape [3] (inherited from `A`) + x = yield tfd.MultivariateNormalDiag(loc=tf.zeros([2]), + scale_tril=tf.linalg.cholesky(A), + name='x') + # Scalar-valued random variable, with batch shape `[4, 3]`. + y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([4, 3])) + + surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10) + + x1 = test_shapes_model.sample() + x2 = nest.map_structure_up_to( + x1, + # Strip auxiliary variables. + lambda *rv_and_aux: rv_and_aux[0], + surrogate_posterior.sample()) + + # Assert that samples from the surrogate have the same shape as the prior. + get_shapes = lambda x: tf.nest.map_structure(lambda xp: xp.shape, x) + self.assertAllEqualNested(get_shapes(x1), get_shapes(x2)) + + +'''@test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): def _expected_num_trainable_variables(self, prior_dist, num_layers): @@ -334,7 +365,7 @@ def centered_horseshoe(ndims=100): tfd.Normal) self.assertIsInstance(surrogate_dists.local_scale.distribution, tfd.Normal) - self.assertIsInstance(surrogate_dists.weights, tfd.Normal) + self.assertIsInstance(surrogate_dists.weights, tfd.Normal)''' if __name__ == '__main__': From 487f7dd800859d7c5ebd1e69075d4303c252b2c0 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 14 Jun 2021 17:10:34 +0200 Subject: [PATCH 39/54] removed testCFDistributionSubstitution --- .../experimental/vi/cascading_flows_test.py | 63 +------------------ 1 file changed, 1 insertion(+), 62 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index bfa556d987..ba03cf19ca 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -62,7 +62,7 @@ def test_shapes_model(): self.assertAllEqualNested(get_shapes(x1), get_shapes(x2)) -'''@test_util.test_all_tf_execution_regimes +@test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): def _expected_num_trainable_variables(self, prior_dist, num_layers): @@ -307,66 +307,5 @@ def nested_model(): return tfd.JointDistributionCoroutineAutoBatched(nested_model) -@test_util.test_all_tf_execution_regimes -class TestCFDistributionSubstitution(test_util.TestCase): - - def test_default_substitutes_trainable_families(self): - @tfd.JointDistributionCoroutineAutoBatched - def model(): - yield tfd.Sample( - tfd.Uniform(low=-2., high=7.), - sample_shape=[2], - name='a') - yield tfd.HalfNormal(1., name='b') - yield tfd.Exponential(rate=[1., 2.], name='c') - yield tfd.Chi2(df=3., name='d') - - surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( - model) - self.assertAllEqualNested(model.event_shape, surrogate.event_shape) - - surrogate_dists, _ = surrogate.sample_distributions() - self.assertIsInstance(surrogate_dists.a, tfd.Independent) - self.assertIsInstance(surrogate_dists.a.distribution, - tfd.TransformedDistribution) - self.assertIsInstance(surrogate_dists.a.distribution.distribution, - tfd.Beta) - self.assertIsInstance(surrogate_dists.b, tfd.TruncatedNormal) - self.assertIsInstance(surrogate_dists.c, tfd.Gamma) - self.assertIsInstance(surrogate_dists.d, tfd.Gamma) - - def test_can_specify_custom_substitution(self): - @tfd.JointDistributionCoroutineAutoBatched - def centered_horseshoe(ndims=100): - global_scale = yield tfd.HalfCauchy( - loc=0., scale=1., name='global_scale') - local_scale = yield tfd.HalfCauchy( - loc=0., scale=tf.ones([ndims]), name='local_scale') - yield tfd.Normal( - loc=0., scale=tf.sqrt(global_scale * local_scale), - name='weights') - - tfp.experimental.vi.register_asvi_substitution_rule( - condition=tfd.HalfCauchy, - substitution_fn=( - lambda d: tfb.Softplus(1e-6)( - tfd.Normal(loc=d.loc, scale=d.scale)))) - surrogate = tfp.experimental.vi.build_cf_surrogate_posterior( - centered_horseshoe) - self.assertAllEqualNested(centered_horseshoe.event_shape, - surrogate.event_shape) - - # If the surrogate was built with names or structure differing from the - # model, so that it had to be `tfb.Restructure`'d, then this - # sample_distributions call will fail because the surrogate isn't an - # instance of tfd.JointDistribution. - surrogate_dists, _ = surrogate.sample_distributions() - self.assertIsInstance(surrogate_dists.global_scale.distribution, - tfd.Normal) - self.assertIsInstance(surrogate_dists.local_scale.distribution, - tfd.Normal) - self.assertIsInstance(surrogate_dists.weights, tfd.Normal)''' - - if __name__ == '__main__': tf.test.main() From e361a46b31731440e5bd6a33e9d57ad01734f004 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 14 Jun 2021 17:21:17 +0200 Subject: [PATCH 40/54] removed convex from name --- .../python/experimental/vi/cascading_flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 8599bf51a6..e0f94a28c5 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -194,7 +194,7 @@ def target_log_prob_aux_vars(z_and_eps): surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution( dist=prior, base_distribution_surrogate_fn=functools.partial( - _cascading_flow_convex_update_for_base_distribution, + _cascading_flow_update_for_base_distribution, initial_prior_weight=initial_prior_weight, num_auxiliary_variables=num_auxiliary_variables, num_layers=num_layers), @@ -415,7 +415,7 @@ def posterior_generator(seed=seed): return surrogate_posterior, variables -def _cascading_flow_convex_update_for_base_distribution(dist, +def _cascading_flow_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables, num_layers, From 70ffe7b8de755d93232503ffe7fdc0a7f812a004 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 14 Jun 2021 17:29:55 +0200 Subject: [PATCH 41/54] fixed comment --- .../python/experimental/vi/cascading_flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index e0f94a28c5..368436933e 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -225,7 +225,7 @@ def _cascading_flow_surrogate_for_distribution(dist, num_layers: Number of layers to use in each Highway Flow architecture. global_auxiliary_variables: The sampled global auxiliary variables (available only if using auxiliary variables). Default value: None. - variables: Optional nested structure of `tf.Variable`s returned from a + variables: Optional nested structure containing `tf.Variable`s returned from a previous call to `_cascading_flow_surrogate_for_distribution`. If `None`, new variables will be created; otherwise, constructs a surrogate posterior backed by the passed-in variables. @@ -236,7 +236,7 @@ def _cascading_flow_surrogate_for_distribution(dist, surrogate posterior distribution, with the same structure and `name` as `dist`, and with addition of global and local auxiliary variables if `num_auxiliary_variables > 0`. - variables: Nested structure of `tf.Variable` trainable parameters for the + variables: Nested structure containing `tf.Variable` trainable parameters for the surrogate posterior. If `dist` is a base distribution, this is a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape` bijectors. If `dist` is a joint distribution, this is a `dist.dtype` From 398d4598547f15e7bb59ac8fa677f2172a4bc3a8 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 15 Jun 2021 14:27:50 +0200 Subject: [PATCH 42/54] adjusted names --- tensorflow_probability/python/experimental/vi/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py index cc5530300a..1f2fa2f900 100644 --- a/tensorflow_probability/python/experimental/vi/__init__.py +++ b/tensorflow_probability/python/experimental/vi/__init__.py @@ -17,7 +17,7 @@ from tensorflow_probability.python.experimental.vi import util from tensorflow_probability.python.experimental.vi.automatic_structured_vi import build_asvi_surrogate_posterior from tensorflow_probability.python.experimental.vi.automatic_structured_vi import register_asvi_substitution_rule -from tensorflow_probability.python.experimental.vi.cascading_flows import build_cf_surrogate_posterior +from tensorflow_probability.python.experimental.vi.cascading_flows import build_cascading_flow_surrogate_posterior from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior_from_base_distribution from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_factored_surrogate_posterior @@ -30,7 +30,7 @@ 'build_affine_surrogate_posterior', 'build_affine_surrogate_posterior_from_base_distribution', 'build_asvi_surrogate_posterior', - 'build_cf_surrogate_posterior', + 'build_cascading_flow_surrogate_posterior', 'build_factored_surrogate_posterior', 'build_split_flow_surrogate_posterior', 'build_trainable_location_scale_distribution', From 2c44c483c92122b1759beecd0eaeb5ba8ce808a6 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Tue, 15 Jun 2021 14:28:10 +0200 Subject: [PATCH 43/54] fixed dimensions of prior --- .../python/experimental/vi/cascading_flows.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 368436933e..7608c07945 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -356,11 +356,7 @@ def posterior_generator(seed=seed): # When using auxiliary variables, value out is a list containing # [latent_variables, auxiliary_variables]. if num_auxiliary_variables>0: - if len(dist.event_shape) == 0: - dist = prior_gen.send(value_out[0][...,0]) - else: - dist = prior_gen.send(value_out[0]) - + dist = prior_gen.send(value_out[0]) else: dist = prior_gen.send(value_out) i += 1 @@ -423,32 +419,35 @@ def _cascading_flow_update_for_base_distribution(dist, variables, seed=None): """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" + event_shape = dist.event_shape_tensor() + flat_event_shape = tf.nest.flatten(event_shape) + flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape) + ndims = int(tf.reduce_sum(flat_event_size)) + flatten_event = reshape.Reshape( + event_shape_out=[-1], + event_shape_in=dist.event_shape_tensor()) if variables is None: - actual_event_shape = dist.event_shape_tensor() - int_event_shape = int(actual_event_shape) if \ - actual_event_shape.shape.as_list()[0] > 0 else 1 - bijectors = [reshape.Reshape([-1], - event_shape_in=actual_event_shape + - num_auxiliary_variables)] + + '''bijectors = [reshape.Reshape([-1], + event_shape_in=ndims + + num_auxiliary_variables)]''' + bijectors = [] bijectors.extend( _build_highway_flow_block( num_layers, width=tf.reduce_prod( - actual_event_shape + num_auxiliary_variables), + ndims + num_auxiliary_variables), residual_fraction_initial_value=initial_prior_weight, - gate_first_n=int_event_shape, seed=seed)) + gate_first_n=ndims, seed=seed)) - bijectors.append( - reshape.Reshape(actual_event_shape + num_auxiliary_variables)) + '''bijectors.append( + reshape.Reshape(ndims + num_auxiliary_variables))''' variables = chain.Chain(bijectors=list(reversed(bijectors))) if num_auxiliary_variables > 0: - flatten_event = reshape.Reshape( - event_shape_out=[-1], - event_shape_in=dist.event_shape_tensor()) cascading_flows = split.Split( [-1, num_auxiliary_variables])( @@ -457,7 +456,7 @@ def _cascading_flow_update_for_base_distribution(dist, transformed_distribution.TransformedDistribution( distribution=dist, bijector=flatten_event), independent.Independent( - deterministic.Deterministic(global_auxiliary_variables), + deterministic.Deterministic(global_auxiliary_variables, ), reinterpreted_batch_ndims=1)]), bijector=variables)) @@ -469,6 +468,8 @@ def _cascading_flow_update_for_base_distribution(dist, distribution=dist, bijector=variables) + cascading_flows = invert.Invert(flatten_event)(cascading_flows) + return cascading_flows, variables From 5fb11ec6aa7e0f3952b50992045d0e8e6749440b Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Wed, 16 Jun 2021 11:27:04 +0200 Subject: [PATCH 44/54] readded batchbroadcast --- .../python/experimental/vi/cascading_flows.py | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 7608c07945..609c9d645f 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -386,28 +386,31 @@ def posterior_generator(seed=seed): use_vectorized_map=dist.use_vectorized_map, name=_get_name(dist)) - '''tokenize = lambda jd: jd._model_unflatten( + tokenize = lambda jd: jd._model_unflatten( # pylint: disable=protected-access, g-long-lambda range(len(jd._model_flatten(jd.dtype))) # pylint: disable=protected-access - )''' - - # Ensure that the surrogate posterior structure matches that of the prior. - # todo: check me, do we need this? in case needs to be modified - # if we use auxiliary variables, then the structure won't match the one of the - # prior - '''try: - tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) - except TypeError: - tokenize = lambda jd: jd._model_unflatten( - # pylint: disable=protected-access, g-long-lambda - range(len(jd._model_flatten(jd.dtype))) - # pylint: disable=protected-access - ) + ) + + dist_tokens = tokenize(dist) + + if num_auxiliary_variables == 0: + try: + tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) + except TypeError: + surrogate_posterior = restructure.Restructure( + output_structure=tokenize(dist), + input_structure=tokenize(surrogate_posterior))( + surrogate_posterior, name=_get_name(dist)) + + '''else: surrogate_posterior = restructure.Restructure( - output_structure=tokenize(dist), + output_structure=( + tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens), + [0] + [2 * k + 2 for k in tf.nest.flatten(dist_tokens)]), input_structure=tokenize(surrogate_posterior))( surrogate_posterior, name=_get_name(dist))''' + return surrogate_posterior, variables @@ -422,16 +425,16 @@ def _cascading_flow_update_for_base_distribution(dist, event_shape = dist.event_shape_tensor() flat_event_shape = tf.nest.flatten(event_shape) flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape) - ndims = int(tf.reduce_sum(flat_event_size)) + try: + ndims = int(tf.reduce_sum(flat_event_size)) + except: + a=0 flatten_event = reshape.Reshape( event_shape_out=[-1], event_shape_in=dist.event_shape_tensor()) if variables is None: - '''bijectors = [reshape.Reshape([-1], - event_shape_in=ndims + - num_auxiliary_variables)]''' bijectors = [] bijectors.extend( @@ -442,19 +445,17 @@ def _cascading_flow_update_for_base_distribution(dist, residual_fraction_initial_value=initial_prior_weight, gate_first_n=ndims, seed=seed)) - '''bijectors.append( - reshape.Reshape(ndims + num_auxiliary_variables))''' - variables = chain.Chain(bijectors=list(reversed(bijectors))) if num_auxiliary_variables > 0: - + batch_shape = global_auxiliary_variables.shape[0] if len( + global_auxiliary_variables.shape) > 1 else [] cascading_flows = split.Split( [-1, num_auxiliary_variables])( transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ - transformed_distribution.TransformedDistribution( - distribution=dist, bijector=flatten_event), + batch_broadcast.BatchBroadcast(transformed_distribution.TransformedDistribution( + distribution=dist, bijector=flatten_event), to_shape=batch_shape), independent.Independent( deterministic.Deterministic(global_auxiliary_variables, ), reinterpreted_batch_ndims=1)]), @@ -465,7 +466,8 @@ def _cascading_flow_update_for_base_distribution(dist, else: cascading_flows = transformed_distribution.TransformedDistribution( - distribution=dist, + distribution=transformed_distribution.TransformedDistribution( + distribution=dist, bijector=flatten_event), bijector=variables) cascading_flows = invert.Invert(flatten_event)(cascading_flows) From 5a20976786e5b42e84db5218b3d50ee6db652bb3 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Wed, 16 Jun 2021 11:27:44 +0200 Subject: [PATCH 45/54] small fixes --- .../python/experimental/vi/cascading_flows_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index ba03cf19ca..c6d0531846 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -42,13 +42,13 @@ def test_shapes_model(): A = yield Root( tfd.WishartTriL(df=2, scale_tril=tf.eye(2, batch_shape=[3]), name='A')) # Vector-valued random variable with batch shape [3] (inherited from `A`) - x = yield tfd.MultivariateNormalDiag(loc=tf.zeros([2]), + x = yield tfd.MultivariateNormalTriL(loc=tf.zeros([2]), scale_tril=tf.linalg.cholesky(A), name='x') # Scalar-valued random variable, with batch shape `[4, 3]`. y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([4, 3])) - surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10) + surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10) x1 = test_shapes_model.sample() x2 = nest.map_structure_up_to( @@ -81,7 +81,7 @@ def _expected_num_trainable_variables(self, prior_dist, num_layers): def test_dims_and_gradients(self): prior_dist = self.make_prior_dist() num_layers = 3 - surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist, num_layers=num_layers) # Test that the correct number of trainable variables are being tracked @@ -110,7 +110,7 @@ def test_dims_and_gradients(self): def test_initialization_is_deterministic_following_seed(self): prior_dist = self.make_prior_dist() - surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist, seed=test_util.test_seed(sampler_type='stateless')) self.evaluate( @@ -118,7 +118,7 @@ def test_initialization_is_deterministic_following_seed(self): posterior_sample = surrogate_posterior.sample( seed=test_util.test_seed(sampler_type='stateless')) - surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior( + surrogate_posterior2 = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist, seed=test_util.test_seed(sampler_type='stateless')) self.evaluate( @@ -175,7 +175,7 @@ def test_fitting_surrogate_posterior(self): prior_dist = self.make_prior_dist() observations = self.get_observations(prior_dist) - surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior( + surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist) target_log_prob = self.get_target_log_prob(observations, prior_dist) From 14e34ee7145089abdc74e2b960add1171dfd62a9 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Wed, 16 Jun 2021 11:28:42 +0200 Subject: [PATCH 46/54] removed try except --- .../python/experimental/vi/cascading_flows.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 609c9d645f..14dce602cd 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -425,10 +425,7 @@ def _cascading_flow_update_for_base_distribution(dist, event_shape = dist.event_shape_tensor() flat_event_shape = tf.nest.flatten(event_shape) flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape) - try: - ndims = int(tf.reduce_sum(flat_event_size)) - except: - a=0 + ndims = int(tf.reduce_sum(flat_event_size)) flatten_event = reshape.Reshape( event_shape_out=[-1], event_shape_in=dist.event_shape_tensor()) From fa69f67c9cb0a50082a2e92e13b828cabc14798b Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Wed, 16 Jun 2021 11:55:06 +0200 Subject: [PATCH 47/54] added support for distributions withc constrained support and test --- .../python/experimental/vi/cascading_flows.py | 18 ++++++++++-------- .../experimental/vi/cascading_flows_test.py | 11 +++++++++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 14dce602cd..62c1e61d8f 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -399,7 +399,7 @@ def posterior_generator(seed=seed): tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype) except TypeError: surrogate_posterior = restructure.Restructure( - output_structure=tokenize(dist), + output_structure=dist_tokens, input_structure=tokenize(surrogate_posterior))( surrogate_posterior, name=_get_name(dist)) @@ -426,10 +426,14 @@ def _cascading_flow_update_for_base_distribution(dist, flat_event_shape = tf.nest.flatten(event_shape) flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape) ndims = int(tf.reduce_sum(flat_event_size)) - flatten_event = reshape.Reshape( + constraining_bijector = dist.experimental_default_event_space_bijector() + flatten_bijector = reshape.Reshape( event_shape_out=[-1], event_shape_in=dist.event_shape_tensor()) + constraining_and_flattening_bijector = chain.Chain([flatten_bijector, constraining_bijector]) + processed_dist = transformed_distribution.TransformedDistribution(distribution=dist, + bijector=constraining_and_flattening_bijector) if variables is None: bijectors = [] @@ -451,23 +455,21 @@ def _cascading_flow_update_for_base_distribution(dist, [-1, num_auxiliary_variables])( transformed_distribution.TransformedDistribution( distribution=blockwise.Blockwise([ - batch_broadcast.BatchBroadcast(transformed_distribution.TransformedDistribution( - distribution=dist, bijector=flatten_event), to_shape=batch_shape), + batch_broadcast.BatchBroadcast(processed_dist, to_shape=batch_shape), independent.Independent( deterministic.Deterministic(global_auxiliary_variables, ), reinterpreted_batch_ndims=1)]), bijector=variables)) cascading_flows = joint_map.JointMap( - [invert.Invert(flatten_event), identity.Identity()])(cascading_flows) + [invert.Invert(constraining_and_flattening_bijector), identity.Identity()])(cascading_flows) else: cascading_flows = transformed_distribution.TransformedDistribution( - distribution=transformed_distribution.TransformedDistribution( - distribution=dist, bijector=flatten_event), + distribution=processed_dist, bijector=variables) - cascading_flows = invert.Invert(flatten_event)(cascading_flows) + cascading_flows = invert.Invert(constraining_and_flattening_bijector)(cascading_flows) return cascading_flows, variables diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index c6d0531846..8bcbc6a26c 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -65,7 +65,7 @@ def test_shapes_model(): @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): - def _expected_num_trainable_variables(self, prior_dist, num_layers): + '''def _expected_num_trainable_variables(self, prior_dist, num_layers): """Infers the expected number of trainable variables for a non-nested JD.""" prior_dists = prior_dist._get_single_sample_distributions() # pylint: disable=protected-access expected_num_trainable_variables = 0 @@ -127,7 +127,14 @@ def test_initialization_is_deterministic_following_seed(self): seed=test_util.test_seed(sampler_type='stateless')) self.assertAllEqualNested(posterior_sample, posterior_sample2) - +''' + def test_surrogate_and_prior_have_same_domain(self): + prior_dist = self.make_prior_dist() + surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( + prior=prior_dist, + seed=test_util.test_seed(sampler_type='stateless')) + self.assertAllFinite(prior_dist.log_prob( + surrogate_posterior.sample(10, seed=test_util.test_seed()))) @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase, From e296c6f3474f7e6b3ea350e2ce0b3aa39b98811d Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 18 Jun 2021 13:59:07 +0200 Subject: [PATCH 48/54] fixed output reshape --- .../python/experimental/vi/cascading_flows.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 62c1e61d8f..c9ea70c04e 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -386,11 +386,9 @@ def posterior_generator(seed=seed): use_vectorized_map=dist.use_vectorized_map, name=_get_name(dist)) - tokenize = lambda jd: jd._model_unflatten( - # pylint: disable=protected-access, g-long-lambda - range(len(jd._model_flatten(jd.dtype))) - # pylint: disable=protected-access - ) + tokenize = lambda jd: tf.nest.pack_sequence_as( + jd.dtype, + range(len(tf.nest.flatten(jd.dtype)))) dist_tokens = tokenize(dist) @@ -403,13 +401,13 @@ def posterior_generator(seed=seed): input_structure=tokenize(surrogate_posterior))( surrogate_posterior, name=_get_name(dist)) - '''else: + else: surrogate_posterior = restructure.Restructure( output_structure=( tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens), [0] + [2 * k + 2 for k in tf.nest.flatten(dist_tokens)]), input_structure=tokenize(surrogate_posterior))( - surrogate_posterior, name=_get_name(dist))''' + surrogate_posterior, name=_get_name(dist)) return surrogate_posterior, variables @@ -449,7 +447,7 @@ def _cascading_flow_update_for_base_distribution(dist, variables = chain.Chain(bijectors=list(reversed(bijectors))) if num_auxiliary_variables > 0: - batch_shape = global_auxiliary_variables.shape[0] if len( + batch_shape = global_auxiliary_variables.shape[:-1] if len( global_auxiliary_variables.shape) > 1 else [] cascading_flows = split.Split( [-1, num_auxiliary_variables])( From 55155e8f29b7b66b57728a1e28df8a325a448f49 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 21 Jun 2021 10:54:18 +0200 Subject: [PATCH 49/54] removed discrete test --- .../experimental/vi/cascading_flows_test.py | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 8bcbc6a26c..5acca06910 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -36,7 +36,7 @@ class CascadingFlowTests(test_util.TestCase): def test_shapes(self): - @tfd.JointDistributionCoroutine + def test_shapes_model(): # Matrix-valued random variable with batch shape [3]. A = yield Root( @@ -45,10 +45,11 @@ def test_shapes_model(): x = yield tfd.MultivariateNormalTriL(loc=tf.zeros([2]), scale_tril=tf.linalg.cholesky(A), name='x') - # Scalar-valued random variable, with batch shape `[4, 3]`. - y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([4, 3])) + # Scalar-valued random variable, with batch shape `[3]`. + y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([3])) - surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10) + prior = tfd.JointDistributionCoroutineAutoBatched(test_shapes_model, batch_ndims=1) + surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(prior) #num_auxiliary_variables=10) x1 = test_shapes_model.sample() x2 = nest.map_structure_up_to( @@ -65,7 +66,7 @@ def test_shapes_model(): @test_util.test_all_tf_execution_regimes class _TrainableCFSurrogate(object): - '''def _expected_num_trainable_variables(self, prior_dist, num_layers): + def _expected_num_trainable_variables(self, prior_dist, num_layers): """Infers the expected number of trainable variables for a non-nested JD.""" prior_dists = prior_dist._get_single_sample_distributions() # pylint: disable=protected-access expected_num_trainable_variables = 0 @@ -127,7 +128,7 @@ def test_initialization_is_deterministic_following_seed(self): seed=test_util.test_seed(sampler_type='stateless')) self.assertAllEqualNested(posterior_sample, posterior_sample2) -''' + def test_surrogate_and_prior_have_same_domain(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( @@ -269,20 +270,6 @@ def _prior_model_fn(): return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) - -@test_util.test_all_tf_execution_regimes -class CFSurrogatePosteriorTestDiscreteLatent( - test_util.TestCase, _TrainableCFSurrogate): - - def make_prior_dist(self): - def _prior_model_fn(): - a = yield tfd.Bernoulli(logits=0.5, name='a') - yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1., - scale=1., name='b') - - return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn) - - @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestNesting(test_util.TestCase, _TrainableCFSurrogate): From 9eb8b97f1b70b75c7921179a8162b9b4b669aefd Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 21 Jun 2021 10:55:55 +0200 Subject: [PATCH 50/54] working on batch shape --- .../python/experimental/vi/cascading_flows.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index c9ea70c04e..fec1d8c4a1 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -291,6 +291,10 @@ def _cascading_flow_surrogate_for_joint_distribution( flat_variables = dist._model_flatten( variables) if variables else None # pylint: disable=protected-access prior_coroutine = dist._model_coroutine # pylint: disable=protected-access + prior_batch_shape = dist.batch_shape_tensor() + if tf.nest.is_nested(prior_batch_shape): + prior_batch_shape = functools.reduce(tf.broadcast_static_shape, + dist._model_flatten(prior_batch_shape)) def posterior_generator(seed=seed): prior_gen = prior_coroutine() @@ -312,8 +316,8 @@ def posterior_generator(seed=seed): variables = chain.Chain(bijectors=list(reversed(bijectors))) eps = transformed_distribution.TransformedDistribution( - distribution=sample.Sample(normal.Normal(0., 1.), - num_auxiliary_variables), + distribution=batch_broadcast.BatchBroadcast(sample.Sample(normal.Normal(0., 1.), + num_auxiliary_variables), prior_batch_shape), bijector=variables) eps = Root(eps) From 0c0fb39af14232ce18c1391833d1346c40a608c2 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 21 Jun 2021 17:34:01 +0200 Subject: [PATCH 51/54] small bug fixed --- .../python/experimental/vi/cascading_flows_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index 5acca06910..fb0890b12c 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -51,7 +51,7 @@ def test_shapes_model(): prior = tfd.JointDistributionCoroutineAutoBatched(test_shapes_model, batch_ndims=1) surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(prior) #num_auxiliary_variables=10) - x1 = test_shapes_model.sample() + x1 = surrogate_posterior.sample() x2 = nest.map_structure_up_to( x1, # Strip auxiliary variables. From 8d8777d11e044d11cf2c87f132a0f5abed5335b5 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Fri, 23 Jul 2021 13:36:00 +0200 Subject: [PATCH 52/54] changed shapes to static and added auxiliary variables without global flow --- .../python/experimental/vi/cascading_flows.py | 59 +++++++++++++++---- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index fec1d8c4a1..3dcc12fe59 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -42,9 +42,10 @@ from tensorflow_probability.python.distributions import normal from tensorflow_probability.python.distributions import sample from tensorflow_probability.python.distributions import transformed_distribution -from tensorflow_probability.python.experimental.bijectors import \ - build_trainable_highway_flow + +from tensorflow_probability.python.experimental.bijectors import build_trainable_highway_flow from tensorflow_probability.python.internal import samplers +from tensorflow_probability.python.internal import prefer_static as ps __all__ = [ 'build_cascading_flow_surrogate_posterior' @@ -52,12 +53,13 @@ Root = joint_distribution_coroutine.JointDistributionCoroutine.Root - +# todo: add check that id use_global_auxiliary_variables is true then num_auxiliary variables must be >=1 def build_cascading_flow_surrogate_posterior( prior, num_auxiliary_variables=0, initial_prior_weight=0.98, num_layers=3, + use_global_auxiliary_variables=False, seed=None, name=None): """Builds a structured surrogate posterior with cascading flows. @@ -190,6 +192,8 @@ def target_log_prob_aux_vars(z_and_eps): models." International Conference on Machine Learning. PMLR, 2016. """ + if num_auxiliary_variables == 0 and use_global_auxiliary_variables == True: + raise ValueError('cannot use global auxiliary variables if auxiliary variables is 0') with tf.name_scope(name or 'build_cascading_flow_surrogate_posterior'): surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution( dist=prior, @@ -197,9 +201,11 @@ def target_log_prob_aux_vars(z_and_eps): _cascading_flow_update_for_base_distribution, initial_prior_weight=initial_prior_weight, num_auxiliary_variables=num_auxiliary_variables, - num_layers=num_layers), + num_layers=num_layers, + use_global_auxiliary_variables=use_global_auxiliary_variables,), num_auxiliary_variables=num_auxiliary_variables, num_layers=num_layers, + use_global_auxiliary_variables=use_global_auxiliary_variables, seed=seed) surrogate_posterior.also_track = variables return surrogate_posterior @@ -209,6 +215,7 @@ def _cascading_flow_surrogate_for_distribution(dist, base_distribution_surrogate_fn, num_auxiliary_variables, num_layers, + use_global_auxiliary_variables, global_auxiliary_variables=None, variables=None, seed=None): @@ -250,11 +257,13 @@ def _cascading_flow_surrogate_for_distribution(dist, variables=variables, num_auxiliary_variables=num_auxiliary_variables, num_layers=num_layers, + use_global_auxiliary_variables=use_global_auxiliary_variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed) else: surrogate_posterior, variables = base_distribution_surrogate_fn( dist=dist, variables=variables, + use_global_auxiliary_variables=use_global_auxiliary_variables, global_auxiliary_variables=global_auxiliary_variables, num_layers=num_layers, seed=seed) @@ -283,7 +292,7 @@ def _build_highway_flow_block(num_layers, width, def _cascading_flow_surrogate_for_joint_distribution( dist, base_distribution_surrogate_fn, variables, - num_auxiliary_variables, num_layers, global_auxiliary_variables, + num_auxiliary_variables, num_layers, use_global_auxiliary_variables, global_auxiliary_variables, seed=None): """Builds a structured joint surrogate posterior for a joint model.""" @@ -292,15 +301,16 @@ def _cascading_flow_surrogate_for_joint_distribution( variables) if variables else None # pylint: disable=protected-access prior_coroutine = dist._model_coroutine # pylint: disable=protected-access prior_batch_shape = dist.batch_shape_tensor() + #fixme if tf.nest.is_nested(prior_batch_shape): - prior_batch_shape = functools.reduce(tf.broadcast_static_shape, + prior_batch_shape = functools.reduce(ps.broadcast_shape, dist._model_flatten(prior_batch_shape)) def posterior_generator(seed=seed): prior_gen = prior_coroutine() dist = next(prior_gen) - if num_auxiliary_variables > 0: + if use_global_auxiliary_variables == True: i = 1 if flat_variables: @@ -344,10 +354,11 @@ def posterior_generator(seed=seed): num_auxiliary_variables=num_auxiliary_variables, num_layers=num_layers, variables=flat_variables[i] if flat_variables else None, + use_global_auxiliary_variables=use_global_auxiliary_variables, global_auxiliary_variables=global_auxiliary_variables, seed=init_seed) - if was_root and num_auxiliary_variables == 0: + if was_root and use_global_auxiliary_variables == False: surrogate_posterior = Root(surrogate_posterior) # If variables were not given---i.e., we're creating new # variables---then yield the new variables along with the surrogate @@ -379,6 +390,7 @@ def posterior_generator(seed=seed): base_distribution_surrogate_fn=base_distribution_surrogate_fn, num_auxiliary_variables=num_auxiliary_variables, num_layers=num_layers, + use_global_auxiliary_variables=use_global_auxiliary_variables, global_auxiliary_variables=global_auxiliary_variables, variables=dist._model_unflatten( # pylint: disable=protected-access @@ -405,7 +417,7 @@ def posterior_generator(seed=seed): input_structure=tokenize(surrogate_posterior))( surrogate_posterior, name=_get_name(dist)) - else: + elif use_global_auxiliary_variables: surrogate_posterior = restructure.Restructure( output_structure=( tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens), @@ -413,6 +425,14 @@ def posterior_generator(seed=seed): input_structure=tokenize(surrogate_posterior))( surrogate_posterior, name=_get_name(dist)) + else: + surrogate_posterior = restructure.Restructure( + output_structure=( + tf.nest.map_structure(lambda k: 2 * k, dist_tokens), + [2 * k + 1 for k in tf.nest.flatten(dist_tokens)]), + input_structure=tokenize(surrogate_posterior))( + surrogate_posterior, name=_get_name(dist)) + return surrogate_posterior, variables @@ -420,17 +440,18 @@ def _cascading_flow_update_for_base_distribution(dist, initial_prior_weight, num_auxiliary_variables, num_layers, + use_global_auxiliary_variables, global_auxiliary_variables, variables, seed=None): """Creates a trainable surrogate for a (non-meta, non-joint) distribution.""" event_shape = dist.event_shape_tensor() flat_event_shape = tf.nest.flatten(event_shape) - flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape) - ndims = int(tf.reduce_sum(flat_event_size)) + flat_event_size = tf.nest.map_structure(ps.reduce_prod, flat_event_shape) + ndims = ps.reduce_sum(flat_event_size) constraining_bijector = dist.experimental_default_event_space_bijector() flatten_bijector = reshape.Reshape( - event_shape_out=[-1], + event_shape_out=flat_event_size, event_shape_in=dist.event_shape_tensor()) constraining_and_flattening_bijector = chain.Chain([flatten_bijector, constraining_bijector]) @@ -450,7 +471,7 @@ def _cascading_flow_update_for_base_distribution(dist, variables = chain.Chain(bijectors=list(reversed(bijectors))) - if num_auxiliary_variables > 0: + if num_auxiliary_variables > 0 and use_global_auxiliary_variables == True: batch_shape = global_auxiliary_variables.shape[:-1] if len( global_auxiliary_variables.shape) > 1 else [] cascading_flows = split.Split( @@ -466,6 +487,18 @@ def _cascading_flow_update_for_base_distribution(dist, cascading_flows = joint_map.JointMap( [invert.Invert(constraining_and_flattening_bijector), identity.Identity()])(cascading_flows) + elif num_auxiliary_variables > 0 and use_global_auxiliary_variables == False: + cascading_flows = split.Split( + [-1, num_auxiliary_variables])( + transformed_distribution.TransformedDistribution( + distribution=blockwise.Blockwise([processed_dist, + batch_broadcast.BatchBroadcast( + sample.Sample(normal.Normal(0.,1.), num_auxiliary_variables), to_shape=processed_dist.batch_shape)]), + bijector=variables)) + + cascading_flows = joint_map.JointMap( + [invert.Invert(constraining_and_flattening_bijector), + identity.Identity()])(cascading_flows) else: cascading_flows = transformed_distribution.TransformedDistribution( distribution=processed_dist, From d514f7283e5451623021682aaecca4db492c63a0 Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 9 Aug 2021 12:18:07 +0200 Subject: [PATCH 53/54] fixed constraining_bijector --- .../python/experimental/vi/cascading_flows.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py index 3dcc12fe59..e8796b35f7 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py @@ -332,11 +332,9 @@ def posterior_generator(seed=seed): eps = Root(eps) - value_out = yield (eps if flat_variables + global_auxiliary_variables = yield (eps if flat_variables else (eps, variables)) - global_auxiliary_variables = value_out - else: global_auxiliary_variables = None i = 0 @@ -417,7 +415,8 @@ def posterior_generator(seed=seed): input_structure=tokenize(surrogate_posterior))( surrogate_posterior, name=_get_name(dist)) - elif use_global_auxiliary_variables: + #FIXME: this part is commented out as blows up RAM memory + '''elif use_global_auxiliary_variables: surrogate_posterior = restructure.Restructure( output_structure=( tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens), @@ -431,7 +430,7 @@ def posterior_generator(seed=seed): tf.nest.map_structure(lambda k: 2 * k, dist_tokens), [2 * k + 1 for k in tf.nest.flatten(dist_tokens)]), input_structure=tokenize(surrogate_posterior))( - surrogate_posterior, name=_get_name(dist)) + surrogate_posterior, name=_get_name(dist))''' return surrogate_posterior, variables @@ -454,7 +453,7 @@ def _cascading_flow_update_for_base_distribution(dist, event_shape_out=flat_event_size, event_shape_in=dist.event_shape_tensor()) - constraining_and_flattening_bijector = chain.Chain([flatten_bijector, constraining_bijector]) + constraining_and_flattening_bijector = chain.Chain([flatten_bijector, invert.Invert(constraining_bijector)]) processed_dist = transformed_distribution.TransformedDistribution(distribution=dist, bijector=constraining_and_flattening_bijector) if variables is None: From 5eabcf8a0fdf0125c2a1bed6071c91d441b9812b Mon Sep 17 00:00:00 2001 From: GianluigiSilvestri Date: Mon, 9 Aug 2021 14:39:28 +0200 Subject: [PATCH 54/54] working cf and cf with local aux vars --- .../experimental/vi/cascading_flows_test.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py index fb0890b12c..de10179486 100644 --- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py +++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py @@ -92,9 +92,9 @@ def test_dims_and_gradients(self): # Test that the sample shape is correct three_posterior_samples = surrogate_posterior.sample( - 3, seed=test_util.test_seed(sampler_type='stateless')) + 3, seed=(0,0)) three_prior_samples = prior_dist.sample( - 3, seed=test_util.test_seed(sampler_type='stateless')) + 3, seed=(0,0)) self.assertAllEqualNested( [s.shape for s in tf.nest.flatten(three_prior_samples)], [s.shape for s in tf.nest.flatten(three_posterior_samples)]) @@ -102,7 +102,7 @@ def test_dims_and_gradients(self): # Test that gradients are available wrt the variational parameters. with tf.GradientTape() as tape: posterior_sample = surrogate_posterior.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=(0,0)) posterior_logprob = surrogate_posterior.log_prob(posterior_sample) grad = tape.gradient(posterior_logprob, surrogate_posterior.trainable_variables) @@ -113,19 +113,19 @@ def test_initialization_is_deterministic_following_seed(self): surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=(0,0)) self.evaluate( [v.initializer for v in surrogate_posterior.trainable_variables]) posterior_sample = surrogate_posterior.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=(0,0)) surrogate_posterior2 = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=(0,0)) self.evaluate( [v.initializer for v in surrogate_posterior2.trainable_variables]) posterior_sample2 = surrogate_posterior2.sample( - seed=test_util.test_seed(sampler_type='stateless')) + seed=(0,0)) self.assertAllEqualNested(posterior_sample, posterior_sample2) @@ -133,9 +133,9 @@ def test_surrogate_and_prior_have_same_domain(self): prior_dist = self.make_prior_dist() surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior( prior=prior_dist, - seed=test_util.test_seed(sampler_type='stateless')) + seed=(0,0)) self.assertAllFinite(prior_dist.log_prob( - surrogate_posterior.sample(10, seed=test_util.test_seed()))) + surrogate_posterior.sample(10, seed=(0,0)))) @test_util.test_all_tf_execution_regimes class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase, @@ -192,7 +192,7 @@ def test_fitting_surrogate_posterior(self): target_log_prob, surrogate_posterior, num_steps=5, # Don't optimize to completion. - optimizer=tf.optimizers.Adam(0.1), + optimizer=tf.optimizers.Adam(1e-3), sample_size=10) # Compute posterior statistics.