From e6a27139dd74b9f548b6cb921f474bb0a134cc92 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:51:37 +0200
Subject: [PATCH 01/54] fixed conflicts

---
 tensorflow_probability/python/experimental/vi/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py
index 0cb4971fcc..e18c8d3455 100644
--- a/tensorflow_probability/python/experimental/vi/__init__.py
+++ b/tensorflow_probability/python/experimental/vi/__init__.py
@@ -29,6 +29,7 @@
     'build_affine_surrogate_posterior',
     'build_affine_surrogate_posterior_from_base_distribution',
     'build_asvi_surrogate_posterior',
+    'build_cf_surrogate_posterior',
     'build_factored_surrogate_posterior',
     'build_split_flow_surrogate_posterior',
     'build_trainable_location_scale_distribution',

From c501d2bb6a120fb1d41068bc0a94d7233ee02f4c Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:52:05 +0200
Subject: [PATCH 02/54] Revert "Revert "initial tests, updated init and build""

This reverts commit 5bb28b08
---
 .../python/experimental/vi/BUILD              | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD
index e57f884ca5..863e0aeef2 100644
--- a/tensorflow_probability/python/experimental/vi/BUILD
+++ b/tensorflow_probability/python/experimental/vi/BUILD
@@ -31,6 +31,7 @@ py_library(
     srcs_version = "PY3",
     deps = [
         ":automatic_structured_vi",
+        ":cascading_flows",
         ":surrogate_posteriors",
         "//tensorflow_probability/python/experimental/vi/util",
         "//tensorflow_probability/python/internal:all_util",
@@ -67,6 +68,36 @@ py_library(
     ],
 )
 
+py_library(
+    name = "cascading_flows",
+    srcs = ["cascading_flows.py"],
+    srcs_version = "PY3",
+    deps = [
+        # tensorflow dep,
+        "//tensorflow_probability/python/bijectors:chain",
+        "//tensorflow_probability/python/bijectors:reshape",
+        "//tensorflow_probability/python/bijectors:scale",
+        "//tensorflow_probability/python/bijectors:shift",
+        "//tensorflow_probability/python/bijectors:split",
+        "//tensorflow_probability/python/distributions:batch_broadcast",
+        "//tensorflow_probability/python/distributions:beta",
+        "//tensorflow_probability/python/distributions:blockwise",
+        "//tensorflow_probability/python/distributions:chi2",
+        "//tensorflow_probability/python/distributions:exponential",
+        "//tensorflow_probability/python/distributions:gamma",
+        "//tensorflow_probability/python/distributions:half_normal",
+        "//tensorflow_probability/python/distributions:joint_distribution_auto_batched",
+        "//tensorflow_probability/python/distributions:joint_distribution_coroutine",
+        "//tensorflow_probability/python/distributions:normal",
+        "//tensorflow_probability/python/distributions:sample",
+        "//tensorflow_probability/python/distributions:transformed_distribution",
+        "//tensorflow_probability/python/distributions:truncated_normal",
+        "//tensorflow_probability/python/distributions:uniform",
+        "//tensorflow_probability/python/experimental/bijectors:highway_flow",
+        "//tensorflow_probability/python/internal:samplers",
+    ],
+)
+
 py_library(
     name = "surrogate_posteriors",
     srcs = ["surrogate_posteriors.py"],
@@ -111,6 +142,22 @@ py_test(
     ],
 )
 
+py_test(
+    name = "cascading_flows_test",
+    size = "large",
+    srcs = ["cascading_flows_test.py"],
+    python_version = "PY3",
+    shard_count = 4,
+    srcs_version = "PY3",
+    deps = [
+        # absl/testing:parameterized dep,
+        # numpy dep,
+        # tensorflow dep,
+        "//tensorflow_probability",
+        "//tensorflow_probability/python/internal:test_util",
+    ],
+)
+
 py_test(
     name = "surrogate_posteriors_test",
     size = "large",

From b6be9d96c1bcc278e5f3278dbec76ef480090e9a Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:54:47 +0200
Subject: [PATCH 03/54] reverted commit

---
 .../python/experimental/vi/cascading_flows.py | 483 ++++++++++++++++++
 1 file changed, 483 insertions(+)
 create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows.py

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
new file mode 100644
index 0000000000..d8c9393d8e
--- /dev/null
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -0,0 +1,483 @@
+# Copyright 2021 The TensorFlow Probability Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Utilities for constructing structured surrogate posteriors."""
+
+from __future__ import absolute_import
+from __future__ import division
+# [internal] enable type annotations
+from __future__ import print_function
+
+import copy
+import functools
+import inspect
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow_probability.python.experimental.bijectors import \
+  build_highway_flow_layer
+from tensorflow_probability.python.bijectors import chain
+from tensorflow_probability.python.bijectors import reshape
+from tensorflow_probability.python.bijectors import scale as scale_lib
+from tensorflow_probability.python.bijectors import shift
+from tensorflow_probability.python.bijectors import split
+
+from tensorflow_probability.python.distributions import batch_broadcast
+from tensorflow_probability.python.distributions import beta
+from tensorflow_probability.python.distributions import blockwise
+from tensorflow_probability.python.distributions import chi2
+from tensorflow_probability.python.distributions import exponential
+from tensorflow_probability.python.distributions import gamma
+from tensorflow_probability.python.distributions import half_normal
+from tensorflow_probability.python.distributions import \
+  joint_distribution_auto_batched
+from tensorflow_probability.python.distributions import \
+  joint_distribution_coroutine
+from tensorflow_probability.python.distributions import normal
+from tensorflow_probability.python.distributions import sample
+from tensorflow_probability.python.distributions import transformed_distribution
+from tensorflow_probability.python.distributions import truncated_normal
+from tensorflow_probability.python.distributions import uniform
+from tensorflow_probability.python.internal import samplers
+
+__all__ = [
+  'register_asvi_substitution_rule',
+  'build_cf_surrogate_posterior'
+]
+
+Root = joint_distribution_coroutine.JointDistributionCoroutine.Root
+
+_NON_STATISTICAL_PARAMS = [
+  'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum',
+  'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support',
+  'num_probit_terms_approx'
+]
+_NON_TRAINABLE_PARAMS = ['low', 'high']
+
+# Registry of transformations that are applied to distributions in the prior
+# before defining the surrogate family.
+
+
+# Todo: inherited from asvi code, do we need this?
+ASVI_SURROGATE_SUBSTITUTIONS = {}
+
+
+# Todo: inherited from asvi code, do we need this?
+def _as_substituted_distribution(distribution):
+  """Applies all substitution rules that match a distribution."""
+  for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items():
+    if condition(distribution):
+      distribution = substitution_fn(distribution)
+  return distribution
+
+
+# Todo: inherited from asvi code, do we need this?
+def register_asvi_substitution_rule(condition, substitution_fn):
+  """Registers a rule for substituting distributions in ASVI surrogates.
+
+  Args:
+    condition: Python `callable` that takes a Distribution instance and
+      returns a Python `bool` indicating whether or not to substitute it.
+      May also be a class type such as `tfd.Normal`, in which case the
+      condition is interpreted as
+      `lambda distribution: isinstance(distribution, class)`.
+    substitution_fn: Python `callable` that takes a Distribution
+      instance and returns a new Distribution instance used to define
+      the ASVI surrogate posterior. Note that this substitution does not modify
+      the original model.
+
+  #### Example
+
+  To use a Normal surrogate for all location-scale family distributions, we
+  could register the substitution:
+
+  ```python
+  tfp.experimental.vi.register_asvi_substitution_rule(
+    condition=lambda distribution: (
+      hasattr(distribution, 'loc') and hasattr(distribution, 'scale')),
+    substitution_fn=lambda distribution: (
+      # Invoking the event space bijector applies any relevant constraints,
+      # e.g., that HalfCauchy samples must be `>= loc`.
+      distribution.experimental_default_event_space_bijector()(
+        tfd.Normal(loc=distribution.loc, scale=distribution.scale))))
+  ```
+
+  This rule will fire when ASVI encounters a location-scale distribution,
+  and instructs ASVI to build a surrogate 'as if' the model had just used a
+  (possibly constrained) Normal in its place. Note that we could have used a
+  more precise condition, e.g., to limit the substitution to distributions with
+  a specific `name`, if we had reason to think that a Normal distribution would
+  be a good surrogate for some model variables but not others.
+
+  """
+  global ASVI_SURROGATE_SUBSTITUTIONS
+  if inspect.isclass(condition):
+    condition = lambda distribution, cls=condition: isinstance(
+      # pylint: disable=g-long-lambda
+      distribution, cls)
+  ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn
+
+
+# Default substitutions attempt to express distributions using the most
+# flexible available parameterization.
+# pylint: disable=g-long-lambda
+register_asvi_substitution_rule(
+  half_normal.HalfNormal,
+  lambda dist: truncated_normal.TruncatedNormal(
+    loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
+register_asvi_substitution_rule(
+  uniform.Uniform,
+  lambda dist: shift.Shift(dist.low)(
+    scale_lib.Scale(dist.high - dist.low)(
+      beta.Beta(concentration0=tf.ones_like(dist.mean()),
+                concentration1=1.))))
+register_asvi_substitution_rule(
+  exponential.Exponential,
+  lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
+register_asvi_substitution_rule(
+  chi2.Chi2,
+  lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
+
+
+# pylint: enable=g-long-lambda
+
+# TODO: support prior+likelihood models written as one JointDistribution.
+def build_cf_surrogate_posterior(
+    prior,
+    num_auxiliary_variables=0,
+    initial_prior_weight=0.5,
+    seed=None,
+    name=None):
+  # todo: change docstrings
+  """Builds a structured surrogate posterior inspired by conjugate updating.
+
+  ASVI, or Automatic Structured Variational Inference, was proposed by
+  Ambrogioni et al. (2020) [1] as a method of automatically constructing a
+  surrogate posterior with the same structure as the prior. It does this by
+  reparameterizing the variational family of the surrogate posterior by
+  structuring each parameter according to the equation
+  ```none
+  prior_weight * prior_parameter + (1 - prior_weight) * mean_field_parameter
+  ```
+  In this equation, `prior_parameter` is a vector of prior parameters and
+  `mean_field_parameter` is a vector of trainable parameters with the same
+  domain as `prior_parameter`. `prior_weight` is a vector of learnable
+  parameters where `0. <= prior_weight <= 1.`. When `prior_weight =
+  0`, the surrogate posterior will be a mean-field surrogate, and when
+  `prior_weight = 1.`, the surrogate posterior will be the prior. This convex
+  combination equation, inspired by conjugacy in exponential families, thus
+  allows the surrogate posterior to balance between the structure of the prior
+  and the structure of a mean-field approximation.
+
+  Args:
+    prior: tfd.JointDistribution instance of the prior.
+    num_auxiliary_variables: Optional Python `int`. Number of auxiliary
+      `Normal(0., .1)` variables concatenated to each base distribution before
+      the highway flow layers are applied. Default value: `0`.
+    initial_prior_weight: Optional float value (either static or tensor value)
+      on the interval [0, 1]. A larger value creates an initial surrogate
+      distribution with more dependence on the prior structure. Default value:
+      `0.5`.
+    seed: Python `int` seed for random initialization.
+    name: Optional string. Default value: `build_cf_surrogate_posterior`.
+
+  Returns:
+    surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
+    whose samples have shape and structure matching that of `prior`.
+
+  Raises:
+    TypeError: The `prior` argument cannot be a nested `JointDistribution`.
+
+  ### Examples
+
+  Consider a Brownian motion model expressed as a JointDistribution:
+
+  ```python
+  prior_loc = 0.
+  innovation_noise = .1
+
+  def model_fn():
+    new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
+    for i in range(4):
+      new = yield tfd.Normal(loc=new, scale=innovation_noise)
+
+  prior = tfd.JointDistributionCoroutineAutoBatched(model_fn)
+  ```
+
+  Let's use variational inference to approximate the posterior. We'll build a
+  surrogate posterior distribution by feeding in the prior distribution.
+
+  ```python
+  surrogate_posterior = (
+    tfp.experimental.vi.build_cf_surrogate_posterior(prior))
+  ```
+
+  This creates a trainable joint distribution, defined by variables in
+  `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
+  to fit this distribution by minimizing a divergence to the true posterior.
+
+  ```python
+  losses = tfp.vi.fit_surrogate_posterior(
+    target_log_prob_fn,
+    surrogate_posterior=surrogate_posterior,
+    num_steps=100,
+    optimizer=tf.optimizers.Adam(0.1),
+    sample_size=10)
+
+  # After optimization, samples from the surrogate will approximate
+  # samples from the true posterior.
+  samples = surrogate_posterior.sample(100)
+  posterior_mean = [tf.reduce_mean(x) for x in samples]
+  posterior_std = [tf.math.reduce_std(x) for x in samples]
+  ```
+
+  #### References
+  [1]: Luca Ambrogioni, Max Hinne, Marcel van Gerven. Automatic structured
+        variational inference. _arXiv preprint arXiv:2002.00643_, 2020
+        https://arxiv.org/abs/2002.00643
+
+  """
+  with tf.name_scope(name or 'build_cf_surrogate_posterior'):
+    surrogate_posterior, variables = _cf_surrogate_for_distribution(
+      dist=prior,
+      base_distribution_surrogate_fn=functools.partial(
+        _cf_convex_update_for_base_distribution,
+        initial_prior_weight=initial_prior_weight,
+        num_auxiliary_variables=num_auxiliary_variables),
+      seed=seed)
+    surrogate_posterior.also_track = variables
+    return surrogate_posterior
+
+
+def _cf_surrogate_for_distribution(dist,
+                                   base_distribution_surrogate_fn,
+                                   sample_shape=None,
+                                   variables=None,
+                                   seed=None):
+  # todo: change docstrings
+  """Recursively creates ASVI surrogates, and creates new variables if needed.
+
+  Args:
+    dist: a `tfd.Distribution` instance.
+    base_distribution_surrogate_fn: Callable to build a surrogate posterior
+      for a 'base' (non-meta and non-joint) distribution, with signature
+      `surrogate_posterior, variables = base_distribution_fn(
+      dist, sample_shape=None, variables=None, seed=None)`.
+    sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
+      `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
+      independent sample dimensions, i.e., it will have event shape
+      `concat([sample_shape, dist.event_shape], axis=0)`.
+      Default value: `None`.
+    variables: Optional nested structure of `tf.Variable`s returned from a
+      previous call to `_cf_surrogate_for_distribution`. If `None`,
+      new variables will be created; otherwise, constructs a surrogate posterior
+      backed by the passed-in variables.
+      Default value: `None`.
+    seed: Python `int` seed for random initialization.
+  Returns:
+    surrogate_posterior: Instance of `tfd.Distribution` representing a trainable
+      surrogate posterior distribution, with the same structure and `name` as
+      `dist`.
+    variables: Nested structure of `tf.Variable` trainable parameters for the
+      surrogate posterior. If `dist` is a base distribution, this is
+      a `dict` of `ASVIParameters` instances. If `dist` is a joint
+      distribution, this is a `dist.dtype` structure of such `dict`s.
+  """
+
+  # Apply any substitutions, while attempting to preserve the original name.
+  dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
+
+  if hasattr(dist, '_model_coroutine'):
+    surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
+      dist,
+      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      variables=variables,
+      seed=seed)
+  else:
+    surrogate_posterior, variables = base_distribution_surrogate_fn(
+      dist=dist, sample_shape=sample_shape, variables=variables, seed=seed)
+  return surrogate_posterior, variables
+
+
+def _cf_surrogate_for_joint_distribution(
+    dist, base_distribution_surrogate_fn, variables=None, seed=None):
+  """Builds a structured joint surrogate posterior for a joint model."""
+
+  # Probabilistic program for CF surrogate posterior.
+  flat_variables = dist._model_flatten(
+    variables) if variables else None  # pylint: disable=protected-access
+  prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
+
+  def posterior_generator(seed=seed):
+    """Yields a trainable surrogate for each distribution the prior yields."""
+    prior_gen = prior_coroutine()
+    dist = next(prior_gen)
+    i = 0
+    try:
+      while True:
+        was_root = isinstance(dist, Root)
+        if was_root:
+          dist = dist.distribution
+
+        seed, init_seed = samplers.split_seed(seed)
+        surrogate_posterior, variables = _cf_surrogate_for_distribution(
+          dist,
+          base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+          variables=flat_variables[i] if flat_variables else None,
+          seed=init_seed)
+        # NOTE(review): a list `value_out` below is presumably a split
+        # `[value, auxiliary]` sample; only `value` is sent on to the prior.
+        if was_root:
+          surrogate_posterior = Root(surrogate_posterior)
+        # If variables were not given---i.e., we're creating new
+        # variables---then yield the new variables along with the surrogate
+        # posterior. This assumes an execution context such as
+        # `_extract_variables_from_coroutine_model` below that will capture and
+        # save the variables.
+        value_out = yield (surrogate_posterior if flat_variables
+                           else (surrogate_posterior, variables))
+        if isinstance(value_out, list):
+          if len(dist.event_shape) == 0:
+            dist = prior_gen.send(tf.squeeze(value_out[0], -1))
+          else:
+            dist = prior_gen.send(value_out[0])
+        else:
+          dist = prior_gen.send(value_out)
+        i += 1
+    except StopIteration:
+      pass
+
+  if variables is None:
+    # Run the generator to create variables, then call ourselves again
+    # to construct the surrogate JD from these variables. Note that we can't
+    # just create a JDC from the current `posterior_generator`, because it will
+    # try to build new variables on every invocation; the recursive call will
+    # define a new `posterior_generator` that knows about the variables we're
+    # about to create.
+    return _cf_surrogate_for_joint_distribution(
+      dist=dist,
+      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      variables=dist._model_unflatten(  # pylint: disable=protected-access
+        _extract_variables_from_coroutine_model(
+          posterior_generator, seed=seed)))
+
+  # Temporary workaround for bijector caching issues with autobatched JDs.
+  surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched
+  if not hasattr(dist, 'use_vectorized_map'):
+    surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine
+  surrogate_posterior = surrogate_type(posterior_generator,
+                                       name=_get_name(dist))
+
+  # Ensure that the surrogate posterior structure matches that of the prior.
+  # todo: check me, do we need this? in case needs to be modified
+  # if we use auxiliary variables, then the structure won't match the one of the
+  # prior
+  '''try:
+    tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
+  except TypeError:
+    tokenize = lambda jd: jd._model_unflatten(
+      # pylint: disable=protected-access, g-long-lambda
+      range(len(jd._model_flatten(jd.dtype)))
+      # pylint: disable=protected-access
+    )
+    surrogate_posterior = restructure.Restructure(
+      output_structure=tokenize(dist),
+      input_structure=tokenize(surrogate_posterior))(
+      surrogate_posterior, name=_get_name(dist))'''
+  return surrogate_posterior, variables
+
+
+# TODO: `sample_shape` and `seed` are accepted but unused; decide if needed.
+def _cf_convex_update_for_base_distribution(dist,
+                                            initial_prior_weight,
+                                            num_auxiliary_variables=0,
+                                            sample_shape=None,
+                                            variables=None,
+                                            seed=None):
+  """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
+
+  if variables is None:
+    actual_event_shape = dist.event_shape_tensor()
+    int_event_shape = int(actual_event_shape) if \
+      actual_event_shape.shape.as_list()[0] > 0 else 1
+    layers = 3
+    bijectors = [reshape.Reshape([-1],
+                             event_shape_in=actual_event_shape +
+                                            num_auxiliary_variables)]
+
+    for _ in range(0, layers - 1):
+      bijectors.append(
+        build_highway_flow_layer(
+          tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+          residual_fraction_initial_value=initial_prior_weight,
+          activation_fn=True, gate_first_n=int_event_shape))
+    bijectors.append(
+      build_highway_flow_layer(
+        tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+        residual_fraction_initial_value=initial_prior_weight,
+        activation_fn=False, gate_first_n=int_event_shape))
+    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+
+    variables = chain.Chain(bijectors=list(reversed(bijectors)))
+
+  if num_auxiliary_variables > 0:
+    cascading_flows = split.Split(
+      [-1, num_auxiliary_variables])(
+      transformed_distribution.TransformedDistribution(
+        distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast(
+          sample.Sample(normal.Normal(0., .1), num_auxiliary_variables),
+          to_shape=dist.batch_shape)]),
+        bijector=variables))
+
+  else:
+    cascading_flows = transformed_distribution.TransformedDistribution(
+      distribution=dist,
+      bijector=variables)
+
+  return cascading_flows, variables
+
+
+def _extract_variables_from_coroutine_model(model_fn, seed=None):
+  """Extracts variables from a generator that yields (dist, variables) pairs."""
+  gen = model_fn()
+  try:
+    dist, dist_variables = next(gen)
+    flat_variables = [dist_variables]
+    while True:
+      seed, local_seed = samplers.split_seed(seed, n=2)
+      sampled_value = (dist.distribution.sample(seed=local_seed)
+                       if isinstance(dist, Root)
+                       else dist.sample(seed=local_seed))
+      dist, dist_variables = gen.send(
+        sampled_value)  # tf.concat(sampled_value, axis=0)
+      flat_variables.append(dist_variables)
+  except StopIteration:
+    pass
+  return flat_variables
+
+
+def _set_name(dist, name):
+  """Copies a distribution-like object, replacing its name."""
+  if hasattr(dist, 'copy'):
+    return dist.copy(name=name)
+  # Some distribution-like entities such as JointDistributionPinned don't
+  # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set
+  # the name directly.
+  dist = copy.copy(dist)
+  dist._name = name  # pylint: disable=protected-access
+  return dist
+
+
+def _get_name(dist):
+  """Attempts to get a distribution's short name, excluding the name scope."""
+  return getattr(dist, 'parameters', {}).get('name', dist.name)

From dbf371b97027fcdd7b85f96bfc0e4d00dccc1bd9 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:55:54 +0200
Subject: [PATCH 04/54] Revert "removed cascading_flows from pr"

This reverts commit 1620ebd2
---
 .../experimental/bijectors/highway_flow.py    |   1 +
 .../experimental/vi/cascading_flows_test.py   | 354 ++++++++++++++++++
 2 files changed, 355 insertions(+)
 create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows_test.py

diff --git a/tensorflow_probability/python/experimental/bijectors/highway_flow.py b/tensorflow_probability/python/experimental/bijectors/highway_flow.py
index bdfed9b2e8..6f26abe72f 100644
--- a/tensorflow_probability/python/experimental/bijectors/highway_flow.py
+++ b/tensorflow_probability/python/experimental/bijectors/highway_flow.py
@@ -26,6 +26,7 @@
 from tensorflow_probability.python.internal import tensor_util
 
 
+
 def build_highway_flow_layer(width,
                              residual_fraction_initial_value=0.5,
                              activation_fn=False,
diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
new file mode 100644
index 0000000000..9c4393be24
--- /dev/null
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -0,0 +1,354 @@
+# Copyright 2021 The TensorFlow Probability Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Tests for structured surrogate posteriors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import tensorflow.compat.v1 as tf1
+import tensorflow.compat.v2 as tf
+import tensorflow_probability as tfp
+from tensorflow_probability.python.experimental.vi import cascading_flows
+from tensorflow_probability.python.internal import prefer_static as ps
+from tensorflow_probability.python.internal import test_util
+
+
+tfb = tfp.bijectors
+tfd = tfp.distributions
+
+
+@test_util.test_all_tf_execution_regimes
+class _TrainableCFSurrogate(object):
+
+  def _expected_num_trainable_variables(self, prior_dist):
+    """Infers the expected number of trainable variables for a non-nested JD."""
+    prior_dists = prior_dist._get_single_sample_distributions()  # pylint: disable=protected-access
+    expected_num_trainable_variables = 0
+    for original_dist in prior_dists:
+      try:
+        original_dist = original_dist.distribution
+      except AttributeError:
+        pass
+      dist = cascading_flows._as_substituted_distribution(original_dist)
+      dist_params = dist.parameters
+      for param, value in dist_params.items():
+        if (param not in cascading_flows._NON_STATISTICAL_PARAMS
+            and value is not None and param not in ('low', 'high')):
+          # One variable each for prior_weight, mean_field_parameter.
+          expected_num_trainable_variables += 2
+    return expected_num_trainable_variables
+
+  def test_dims_and_gradients(self):
+
+    prior_dist = self.make_prior_dist()
+
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist)
+
+    # Test that the correct number of trainable variables are being tracked
+    self.assertLen(surrogate_posterior.trainable_variables,
+                   self._expected_num_trainable_variables(prior_dist))
+
+    # Test that the sample shape is correct
+    three_posterior_samples = surrogate_posterior.sample(
+        3, seed=test_util.test_seed(sampler_type='stateless'))
+    three_prior_samples = prior_dist.sample(
+        3, seed=test_util.test_seed(sampler_type='stateless'))
+    self.assertAllEqualNested(
+        [s.shape for s in tf.nest.flatten(three_prior_samples)],
+        [s.shape for s in tf.nest.flatten(three_posterior_samples)])
+
+    # Test that gradients are available wrt the variational parameters.
+    posterior_sample = surrogate_posterior.sample(
+        seed=test_util.test_seed(sampler_type='stateless'))
+    with tf.GradientTape() as tape:
+      posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
+    grad = tape.gradient(posterior_logprob,
+                         surrogate_posterior.trainable_variables)
+    self.assertTrue(all(g is not None for g in grad))
+
+  def test_initialization_is_deterministic_following_seed(self):
+    prior_dist = self.make_prior_dist()
+
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist,
+        seed=test_util.test_seed(sampler_type='stateless'))
+    self.evaluate(
+        [v.initializer for v in surrogate_posterior.trainable_variables])
+    posterior_sample = surrogate_posterior.sample(
+        seed=test_util.test_seed(sampler_type='stateless'))
+
+    surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist,
+        seed=test_util.test_seed(sampler_type='stateless'))
+    self.evaluate(
+        [v.initializer for v in surrogate_posterior2.trainable_variables])
+    posterior_sample2 = surrogate_posterior2.sample(
+        seed=test_util.test_seed(sampler_type='stateless'))
+
+    self.assertAllEqualNested(posterior_sample, posterior_sample2)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase,
+                                               _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    def _prior_model_fn():
+      innovation_noise = 0.1
+      prior_loc = 0.
+      new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
+      for _ in range(4):
+        new = yield tfd.Normal(loc=new, scale=innovation_noise)
+
+    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
+
+  def make_likelihood_model(self, x, observation_noise):
+
+    def _likelihood_model():
+      for i in range(5):
+        yield tfd.Normal(loc=x[i], scale=observation_noise)
+
+    return tfd.JointDistributionCoroutineAutoBatched(_likelihood_model)
+
+  def get_observations(self, prior_dist):
+    observation_noise = 0.15
+    ground_truth = prior_dist.sample()
+    likelihood = self.make_likelihood_model(
+        x=ground_truth, observation_noise=observation_noise)
+    return likelihood.sample(1)
+
+  def get_target_log_prob(self, observations, prior_dist):
+
+    def target_log_prob(*x):
+      observation_noise = 0.15
+      likelihood_dist = self.make_likelihood_model(
+          x=x, observation_noise=observation_noise)
+      return likelihood_dist.log_prob(observations) + prior_dist.log_prob(x)
+
+    return target_log_prob
+
+  def test_fitting_surrogate_posterior(self):
+
+    prior_dist = self.make_prior_dist()
+    observations = self.get_observations(prior_dist)
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist)
+    target_log_prob = self.get_target_log_prob(observations, prior_dist)
+
+    # Test vi fit surrogate posterior works
+    losses = tfp.vi.fit_surrogate_posterior(
+        target_log_prob,
+        surrogate_posterior,
+        num_steps=5,  # Don't optimize to completion.
+        optimizer=tf.optimizers.Adam(0.1),
+        sample_size=10)
+
+    # Compute posterior statistics.
+    with tf.control_dependencies([losses]):
+      posterior_samples = surrogate_posterior.sample(100)
+      posterior_mean = tf.nest.map_structure(tf.reduce_mean, posterior_samples)
+      posterior_stddev = tf.nest.map_structure(tf.math.reduce_std,
+                                               posterior_samples)
+
+    self.evaluate(tf1.global_variables_initializer())
+    _ = self.evaluate(losses)
+    _ = self.evaluate(posterior_mean)
+    _ = self.evaluate(posterior_stddev)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestEightSchools(test_util.TestCase,
+                                             _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+    treatment_effects = tf.constant([28, 8, -3, 7, -1, 1, 18, 12],
+                                    dtype=tf.float32)
+    num_schools = ps.shape(treatment_effects)[-1]
+
+    return tfd.JointDistributionNamed({
+        'avg_effect':
+            tfd.Normal(loc=0., scale=10., name='avg_effect'),
+        'log_stddev':
+            tfd.Normal(loc=5., scale=1., name='log_stddev'),
+        'school_effects':
+            lambda log_stddev, avg_effect: (  # pylint: disable=g-long-lambda
+                tfd.Independent(
+                    tfd.Normal(
+                        loc=avg_effect[..., None] * tf.ones(num_schools),
+                        scale=tf.exp(log_stddev[..., None]) * tf.ones(
+                            num_schools),
+                        name='school_effects'),
+                    reinterpreted_batch_ndims=1))
+    })
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestEightSchoolsSample(test_util.TestCase,
+                                                   _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    return tfd.JointDistributionNamed({
+        'avg_effect':
+            tfd.Normal(loc=0., scale=10., name='avg_effect'),
+        'log_stddev':
+            tfd.Normal(loc=5., scale=1., name='log_stddev'),
+        'school_effects':
+            lambda log_stddev, avg_effect: (  # pylint: disable=g-long-lambda
+                tfd.Sample(
+                    tfd.Normal(
+                        loc=avg_effect[..., None],
+                        scale=tf.exp(log_stddev[..., None]),
+                        name='school_effects'),
+                    sample_shape=[8]))
+    })
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestHalfNormal(test_util.TestCase,
+                                           _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    def _prior_model_fn():
+      innovation_noise = 1.
+      yield tfd.HalfNormal(
+          scale=innovation_noise, validate_args=True, allow_nan_stats=False)
+
+    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestDiscreteLatent(
+    test_util.TestCase, _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    def _prior_model_fn():
+      a = yield tfd.Bernoulli(logits=0.5, name='a')
+      yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1.,
+                       scale=1., name='b')
+
+    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestNesting(test_util.TestCase,
+                                        _TrainableCFSurrogate):
+
+  def _expected_num_trainable_variables(self, _):
+    # Nested distributions have total of 10 params after Exponential->Gamma
+    # substitution, multiplied by 2 variables per param.
+    return 20
+
+  def make_prior_dist(self):
+
+    def nested_model():
+      a = yield tfd.Sample(
+          tfd.Sample(
+              tfd.Normal(0., 1.),
+              sample_shape=4),
+          sample_shape=[2],
+          name='a')
+      b = yield tfb.Sigmoid()(
+          tfb.Square()(
+              tfd.Exponential(rate=tf.exp(a))),
+          name='b')
+      # pylint: disable=g-long-lambda
+      yield tfd.JointDistributionSequential(
+          [tfd.Laplace(loc=a, scale=b),
+           lambda c1: tfd.Independent(
+               tfd.Beta(concentration1=1.,
+                        concentration0=tf.nn.softplus(c1)),
+               reinterpreted_batch_ndims=1),
+           lambda c1, c2: tfd.JointDistributionNamed({
+               'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)})
+           ], name='c')
+      # pylint: enable=g-long-lambda
+
+    return tfd.JointDistributionCoroutineAutoBatched(nested_model)
+
+
+@test_util.test_all_tf_execution_regimes
+class TestCFDistributionSubstitution(test_util.TestCase):
+
+  def test_default_substitutes_trainable_families(self):
+
+    @tfd.JointDistributionCoroutineAutoBatched
+    def model():
+      yield tfd.Sample(
+          tfd.Uniform(low=-2., high=7.),
+          sample_shape=[2],
+          name='a')
+      yield tfd.HalfNormal(1., name='b')
+      yield tfd.Exponential(rate=[1., 2.], name='c')
+      yield tfd.Chi2(df=3., name='d')
+
+    surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
+        model)
+    self.assertAllEqualNested(model.event_shape, surrogate.event_shape)
+
+    surrogate_dists, _ = surrogate.sample_distributions()
+    self.assertIsInstance(surrogate_dists.a, tfd.Independent)
+    self.assertIsInstance(surrogate_dists.a.distribution,
+                          tfd.TransformedDistribution)
+    self.assertIsInstance(surrogate_dists.a.distribution.distribution,
+                          tfd.Beta)
+    self.assertIsInstance(surrogate_dists.b, tfd.TruncatedNormal)
+    self.assertIsInstance(surrogate_dists.c, tfd.Gamma)
+    self.assertIsInstance(surrogate_dists.d, tfd.Gamma)
+
+  def test_can_specify_custom_substitution(self):
+
+    @tfd.JointDistributionCoroutineAutoBatched
+    def centered_horseshoe(ndims=100):
+      global_scale = yield tfd.HalfCauchy(
+          loc=0., scale=1., name='global_scale')
+      local_scale = yield tfd.HalfCauchy(
+          loc=0., scale=tf.ones([ndims]), name='local_scale')
+      yield tfd.Normal(
+          loc=0., scale=tf.sqrt(global_scale * local_scale), name='weights')
+
+    tfp.experimental.vi.register_asvi_substitution_rule(
+        condition=tfd.HalfCauchy,
+        substitution_fn=(
+            lambda d: tfb.Softplus(1e-6)(tfd.Normal(loc=d.loc, scale=d.scale))))
+    surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
+        centered_horseshoe)
+    self.assertAllEqualNested(centered_horseshoe.event_shape,
+                              surrogate.event_shape)
+
+    # If the surrogate was built with names or structure differing from the
+    # model, so that it had to be `tfb.Restructure`'d, then this
+    # sample_distributions call will fail because the surrogate isn't an
+    # instance of tfd.JointDistribution.
+    surrogate_dists, _ = surrogate.sample_distributions()
+    self.assertIsInstance(surrogate_dists.global_scale.distribution,
+                          tfd.Normal)
+    self.assertIsInstance(surrogate_dists.local_scale.distribution,
+                          tfd.Normal)
+    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
+
+# TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions.
+# TODO(kateslin): Add an ASVI surrogate posterior test for a model with
+#  missing observations.
+
+if __name__ == '__main__':
+  tf.test.main()

From c6118b13c2b8b04358bab2e9e865830a540cff06 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:56:49 +0200
Subject: [PATCH 05/54] reverted to latest version

---
 .../python/experimental/vi/cascading_flows.py | 101 ++++++++++++++----
 1 file changed, 80 insertions(+), 21 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index d8c9393d8e..61dcce7236 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -25,21 +25,20 @@
 
 import tensorflow.compat.v2 as tf
 
-from tensorflow_probability.python.experimental.bijectors import \
-  build_highway_flow_layer
 from tensorflow_probability.python.bijectors import chain
 from tensorflow_probability.python.bijectors import reshape
 from tensorflow_probability.python.bijectors import scale as scale_lib
 from tensorflow_probability.python.bijectors import shift
 from tensorflow_probability.python.bijectors import split
-
 from tensorflow_probability.python.distributions import batch_broadcast
 from tensorflow_probability.python.distributions import beta
 from tensorflow_probability.python.distributions import blockwise
 from tensorflow_probability.python.distributions import chi2
+from tensorflow_probability.python.distributions import deterministic
 from tensorflow_probability.python.distributions import exponential
 from tensorflow_probability.python.distributions import gamma
 from tensorflow_probability.python.distributions import half_normal
+from tensorflow_probability.python.distributions import independent
 from tensorflow_probability.python.distributions import \
   joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
@@ -49,10 +48,12 @@
 from tensorflow_probability.python.distributions import transformed_distribution
 from tensorflow_probability.python.distributions import truncated_normal
 from tensorflow_probability.python.distributions import uniform
+from tensorflow_probability.python.experimental.bijectors import \
+  build_highway_flow_layer
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
-  'register_asvi_substitution_rule',
+  'register_cf_substitution_rule',
   'build_cf_surrogate_posterior'
 ]
 
@@ -83,7 +84,7 @@ def _as_substituted_distribution(distribution):
 
 
 # Todo: inherited from asvi code, do we need this?
-def register_asvi_substitution_rule(condition, substitution_fn):
+def register_cf_substitution_rule(condition, substitution_fn):
   """Registers a rule for substituting distributions in ASVI surrogates.
 
   Args:
@@ -132,20 +133,20 @@ def register_asvi_substitution_rule(condition, substitution_fn):
 # Default substitutions attempt to express distributions using the most
 # flexible available parameterization.
 # pylint: disable=g-long-lambda
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   half_normal.HalfNormal,
   lambda dist: truncated_normal.TruncatedNormal(
     loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   uniform.Uniform,
   lambda dist: shift.Shift(dist.low)(
     scale_lib.Scale(dist.high - dist.low)(
       beta.Beta(concentration0=tf.ones_like(dist.mean()),
                 concentration1=1.))))
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   exponential.Exponential,
   lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   chi2.Chi2,
   lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
 
@@ -255,6 +256,7 @@ def model_fn():
         _cf_convex_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
         num_auxiliary_variables=num_auxiliary_variables),
+      num_auxiliary_variables=num_auxiliary_variables,
       seed=seed)
     surrogate_posterior.also_track = variables
     return surrogate_posterior
@@ -264,6 +266,8 @@ def _cf_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
                                    sample_shape=None,
                                    variables=None,
+                                   num_auxiliary_variables=0,
+                                   global_auxiliary_variables=None,
                                    seed=None):
   # todo: change docstrings
   """Recursively creates ASVI surrogates, and creates new variables if needed.
@@ -303,15 +307,19 @@ def _cf_surrogate_for_distribution(dist,
       dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       variables=variables,
+      num_auxiliary_variables=num_auxiliary_variables,
+      global_auxiliary_variables=global_auxiliary_variables,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables, seed=seed)
+      dist=dist, sample_shape=sample_shape, variables=variables,
+      global_auxiliary_variables=global_auxiliary_variables, seed=seed)
   return surrogate_posterior, variables
 
 
 def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables=None, seed=None):
+    dist, base_distribution_surrogate_fn, variables=None,
+    num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
   # Probabilistic program for CF surrogate posterior.
@@ -322,7 +330,46 @@ def _cf_surrogate_for_joint_distribution(
   def posterior_generator(seed=seed):
     prior_gen = prior_coroutine()
     dist = next(prior_gen)
-    i = 0
+
+    if num_auxiliary_variables > 0:
+      i = 1
+
+      if flat_variables:
+        variables = flat_variables[0]
+
+      else:
+        layers = 3
+        bijectors = []
+
+        for _ in range(0, layers - 1):
+          bijectors.append(
+            build_highway_flow_layer(num_auxiliary_variables,
+                                     residual_fraction_initial_value=0.5,
+                                     activation_fn=True, gate_first_n=0,
+                                     seed=seed))
+        bijectors.append(
+          build_highway_flow_layer(num_auxiliary_variables,
+                                   residual_fraction_initial_value=0.5,
+                                   activation_fn=False, gate_first_n=0,
+                                   seed=seed))
+
+        variables = chain.Chain(bijectors=list(reversed(bijectors)))
+
+      eps = transformed_distribution.TransformedDistribution(
+        distribution=sample.Sample(normal.Normal(0., 0.1),
+                                   num_auxiliary_variables),
+        bijector=variables)
+
+      eps = Root(eps)
+
+      value_out = yield (eps if flat_variables
+                         else (eps, variables))
+
+      global_auxiliary_variables = value_out
+
+    else:
+      i = 0
+
     try:
       while True:
         was_root = isinstance(dist, Root)
@@ -334,9 +381,10 @@ def posterior_generator(seed=seed):
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
           variables=flat_variables[i] if flat_variables else None,
+          global_auxiliary_variables=global_auxiliary_variables,
           seed=init_seed)
 
-        if was_root:
+        if was_root and num_auxiliary_variables == 0:
           surrogate_posterior = Root(surrogate_posterior)
         # If variables were not given---i.e., we're creating new
         # variables---then yield the new variables along with the surrogate
@@ -367,6 +415,8 @@ def posterior_generator(seed=seed):
     return _cf_surrogate_for_joint_distribution(
       dist=dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      num_auxiliary_variables=num_auxiliary_variables,
+      global_auxiliary_variables=global_auxiliary_variables,
       variables=dist._model_unflatten(  # pylint: disable=protected-access
         _extract_variables_from_coroutine_model(
           posterior_generator, seed=seed)))
@@ -401,6 +451,7 @@ def posterior_generator(seed=seed):
 def _cf_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables=0,
+                                            global_auxiliary_variables=None,
                                             sample_shape=None,
                                             variables=None,
                                             seed=None):
@@ -412,31 +463,39 @@ def _cf_convex_update_for_base_distribution(dist,
       actual_event_shape.shape.as_list()[0] > 0 else 1
     layers = 3
     bijectors = [reshape.Reshape([-1],
-                             event_shape_in=actual_event_shape +
-                                            num_auxiliary_variables)]
+                                 event_shape_in=actual_event_shape +
+                                                num_auxiliary_variables)]
 
     for _ in range(0, layers - 1):
       bijectors.append(
         build_highway_flow_layer(
           tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
           residual_fraction_initial_value=initial_prior_weight,
-          activation_fn=True, gate_first_n=int_event_shape))
+          activation_fn=True, gate_first_n=int_event_shape, seed=seed))
     bijectors.append(
       build_highway_flow_layer(
         tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
-        activation_fn=False, gate_first_n=int_event_shape))
-    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+        activation_fn=False, gate_first_n=int_event_shape, seed=seed))
+    bijectors.append(
+      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
   if num_auxiliary_variables > 0:
+    batch_shape = global_auxiliary_variables.shape[0] if len(
+      global_auxiliary_variables.shape) > 1 else []
+
     cascading_flows = split.Split(
       [-1, num_auxiliary_variables])(
       transformed_distribution.TransformedDistribution(
-        distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast(
-          sample.Sample(normal.Normal(0., .1), num_auxiliary_variables),
-          to_shape=dist.batch_shape)]),
+        distribution=blockwise.Blockwise([
+          batch_broadcast.BatchBroadcast(dist,
+                                         to_shape=batch_shape),
+          independent.Independent(
+            deterministic.Deterministic(
+              global_auxiliary_variables),
+            reinterpreted_batch_ndims=1)]),
         bijector=variables))
 
   else:

From bcf95e154f02d45c704aee82d0e1fccf05f2f03c Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 11:22:31 +0200
Subject: [PATCH 06/54] fixed surrogate posterior type

---
 .../python/experimental/vi/cascading_flows.py        | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 61dcce7236..95c7cf5faf 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -39,8 +39,7 @@
 from tensorflow_probability.python.distributions import gamma
 from tensorflow_probability.python.distributions import half_normal
 from tensorflow_probability.python.distributions import independent
-from tensorflow_probability.python.distributions import \
-  joint_distribution_auto_batched
+from tensorflow_probability.python.distributions import joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
   joint_distribution_coroutine
 from tensorflow_probability.python.distributions import normal
@@ -422,11 +421,10 @@ def posterior_generator(seed=seed):
           posterior_generator, seed=seed)))
 
   # Temporary workaround for bijector caching issues with autobatched JDs.
-  surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched
-  if not hasattr(dist, 'use_vectorized_map'):
-    surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine
-  surrogate_posterior = surrogate_type(posterior_generator,
-                                       name=_get_name(dist))
+  surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
+    posterior_generator,
+    use_vectorized_map=dist.use_vectorized_map,
+    name=_get_name(dist))
 
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified

From 4d4b291c8e759a81ce171641ad1912cbb641b5f0 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 10:47:34 +0200
Subject: [PATCH 07/54] small fixes

---
 .../python/experimental/vi/cascading_flows.py | 37 ++++++++-----------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 95c7cf5faf..a9735f3739 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -156,7 +156,7 @@ def register_cf_substitution_rule(condition, substitution_fn):
 def build_cf_surrogate_posterior(
     prior,
     num_auxiliary_variables=0,
-    initial_prior_weight=0.5,
+    initial_prior_weight=0.98,
     seed=None,
     name=None):
   # todo: change docstrings
@@ -311,14 +311,12 @@ def _cf_surrogate_for_distribution(dist,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables,
-      global_auxiliary_variables=global_auxiliary_variables, seed=seed)
+      dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed)
   return surrogate_posterior, variables
 
 
 def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables=None,
-    num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
+    dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
   # Probabilistic program for CF surrogate posterior.
@@ -343,19 +341,17 @@ def posterior_generator(seed=seed):
         for _ in range(0, layers - 1):
           bijectors.append(
             build_highway_flow_layer(num_auxiliary_variables,
-                                     residual_fraction_initial_value=0.5,
-                                     activation_fn=True, gate_first_n=0,
-                                     seed=seed))
+              residual_fraction_initial_value=0.98,
+              activation_fn=True, gate_first_n=0, seed=seed))
         bijectors.append(
           build_highway_flow_layer(num_auxiliary_variables,
-                                   residual_fraction_initial_value=0.5,
-                                   activation_fn=False, gate_first_n=0,
-                                   seed=seed))
+            residual_fraction_initial_value=0.98,
+            activation_fn=False, gate_first_n=0, seed=seed))
 
         variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
       eps = transformed_distribution.TransformedDistribution(
-        distribution=sample.Sample(normal.Normal(0., 0.1),
+        distribution=sample.Sample(normal.Normal(0., 1.),
                                    num_auxiliary_variables),
         bijector=variables)
 
@@ -380,7 +376,7 @@ def posterior_generator(seed=seed):
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
           variables=flat_variables[i] if flat_variables else None,
-          global_auxiliary_variables=global_auxiliary_variables,
+          global_auxiliary_variables = global_auxiliary_variables,
           seed=init_seed)
 
         if was_root and num_auxiliary_variables == 0:
@@ -422,9 +418,9 @@ def posterior_generator(seed=seed):
 
   # Temporary workaround for bijector caching issues with autobatched JDs.
   surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
-    posterior_generator,
-    use_vectorized_map=dist.use_vectorized_map,
-    name=_get_name(dist))
+      posterior_generator,
+      use_vectorized_map=dist.use_vectorized_map,
+      name=_get_name(dist))
 
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified
@@ -461,8 +457,8 @@ def _cf_convex_update_for_base_distribution(dist,
       actual_event_shape.shape.as_list()[0] > 0 else 1
     layers = 3
     bijectors = [reshape.Reshape([-1],
-                                 event_shape_in=actual_event_shape +
-                                                num_auxiliary_variables)]
+                             event_shape_in=actual_event_shape +
+                                            num_auxiliary_variables)]
 
     for _ in range(0, layers - 1):
       bijectors.append(
@@ -475,8 +471,7 @@ def _cf_convex_update_for_base_distribution(dist,
         tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
         activation_fn=False, gate_first_n=int_event_shape, seed=seed))
-    bijectors.append(
-      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
@@ -489,7 +484,7 @@ def _cf_convex_update_for_base_distribution(dist,
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
           batch_broadcast.BatchBroadcast(dist,
-                                         to_shape=batch_shape),
+                                        to_shape=batch_shape),
           independent.Independent(
             deterministic.Deterministic(
               global_auxiliary_variables),

From 6cd887105e07df82b8d4bef8fafbb06f3e245680 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 11:09:26 +0200
Subject: [PATCH 08/54] fixed global variables if no auxiliary variables

---
 tensorflow_probability/python/experimental/vi/cascading_flows.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index a9735f3739..ef9f6f78da 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -363,6 +363,7 @@ def posterior_generator(seed=seed):
       global_auxiliary_variables = value_out
 
     else:
+      global_auxiliary_variables = None
       i = 0
 
     try:

From 4690d0a75e9dc7e5e78914d9503f2ea1e0bc9d8a Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 11:51:56 +0200
Subject: [PATCH 09/54] added number of layers parameter

---
 .../python/experimental/vi/cascading_flows.py | 81 ++++++++++++-------
 1 file changed, 50 insertions(+), 31 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index ef9f6f78da..9b430bf6a6 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -39,16 +39,17 @@
 from tensorflow_probability.python.distributions import gamma
 from tensorflow_probability.python.distributions import half_normal
 from tensorflow_probability.python.distributions import independent
-from tensorflow_probability.python.distributions import joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
-  joint_distribution_coroutine
+    joint_distribution_auto_batched
+from tensorflow_probability.python.distributions import \
+    joint_distribution_coroutine
 from tensorflow_probability.python.distributions import normal
 from tensorflow_probability.python.distributions import sample
 from tensorflow_probability.python.distributions import transformed_distribution
 from tensorflow_probability.python.distributions import truncated_normal
 from tensorflow_probability.python.distributions import uniform
 from tensorflow_probability.python.experimental.bijectors import \
-  build_highway_flow_layer
+    build_highway_flow_layer
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
@@ -157,6 +158,7 @@ def build_cf_surrogate_posterior(
     prior,
     num_auxiliary_variables=0,
     initial_prior_weight=0.98,
+    num_layers=3,
     seed=None,
     name=None):
   # todo: change docstrings
@@ -254,8 +256,10 @@ def model_fn():
       base_distribution_surrogate_fn=functools.partial(
         _cf_convex_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
-        num_auxiliary_variables=num_auxiliary_variables),
+        num_auxiliary_variables=num_auxiliary_variables,
+        num_layers=num_layers),
       num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
       seed=seed)
     surrogate_posterior.also_track = variables
     return surrogate_posterior
@@ -263,9 +267,10 @@ def model_fn():
 
 def _cf_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
+                                   num_auxiliary_variables,
+                                   num_layers,
                                    sample_shape=None,
                                    variables=None,
-                                   num_auxiliary_variables=0,
                                    global_auxiliary_variables=None,
                                    seed=None):
   # todo: change docstrings
@@ -307,16 +312,22 @@ def _cf_surrogate_for_distribution(dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       variables=variables,
       num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
       global_auxiliary_variables=global_auxiliary_variables,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed)
+      dist=dist, sample_shape=sample_shape, variables=variables,
+      global_auxiliary_variables=global_auxiliary_variables,
+      num_layers=num_layers,
+      seed=seed)
   return surrogate_posterior, variables
 
 
 def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
+    dist, base_distribution_surrogate_fn, variables,
+    num_auxiliary_variables, num_layers, global_auxiliary_variables,
+    seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
   # Probabilistic program for CF surrogate posterior.
@@ -335,18 +346,17 @@ def posterior_generator(seed=seed):
         variables = flat_variables[0]
 
       else:
-        layers = 3
         bijectors = []
 
-        for _ in range(0, layers - 1):
+        for _ in range(0, num_layers - 1):
           bijectors.append(
             build_highway_flow_layer(num_auxiliary_variables,
-              residual_fraction_initial_value=0.98,
-              activation_fn=True, gate_first_n=0, seed=seed))
+                                     activation_fn=True,
+                                     gate_first_n=0, seed=seed))
         bijectors.append(
           build_highway_flow_layer(num_auxiliary_variables,
-            residual_fraction_initial_value=0.98,
-            activation_fn=False, gate_first_n=0, seed=seed))
+                                   activation_fn=False,
+                                   gate_first_n=0, seed=seed))
 
         variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
@@ -376,8 +386,10 @@ def posterior_generator(seed=seed):
         surrogate_posterior, variables = _cf_surrogate_for_distribution(
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+          num_auxiliary_variables=num_auxiliary_variables,
+          num_layers=num_layers,
           variables=flat_variables[i] if flat_variables else None,
-          global_auxiliary_variables = global_auxiliary_variables,
+          global_auxiliary_variables=global_auxiliary_variables,
           seed=init_seed)
 
         if was_root and num_auxiliary_variables == 0:
@@ -412,16 +424,18 @@ def posterior_generator(seed=seed):
       dist=dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
       global_auxiliary_variables=global_auxiliary_variables,
-      variables=dist._model_unflatten(  # pylint: disable=protected-access
+      variables=dist._model_unflatten(
+        # pylint: disable=protected-access
         _extract_variables_from_coroutine_model(
           posterior_generator, seed=seed)))
 
   # Temporary workaround for bijector caching issues with autobatched JDs.
   surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
-      posterior_generator,
-      use_vectorized_map=dist.use_vectorized_map,
-      name=_get_name(dist))
+    posterior_generator,
+    use_vectorized_map=dist.use_vectorized_map,
+    name=_get_name(dist))
 
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified
@@ -445,10 +459,11 @@ def posterior_generator(seed=seed):
 # todo: sample_shape and seed are not used.. maybe they should?
 def _cf_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
-                                            num_auxiliary_variables=0,
-                                            global_auxiliary_variables=None,
+                                            num_auxiliary_variables,
+                                            num_layers,
+                                            global_auxiliary_variables,
+                                            variables,
                                             sample_shape=None,
-                                            variables=None,
                                             seed=None):
   """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
 
@@ -456,23 +471,27 @@ def _cf_convex_update_for_base_distribution(dist,
     actual_event_shape = dist.event_shape_tensor()
     int_event_shape = int(actual_event_shape) if \
       actual_event_shape.shape.as_list()[0] > 0 else 1
-    layers = 3
     bijectors = [reshape.Reshape([-1],
-                             event_shape_in=actual_event_shape +
-                                            num_auxiliary_variables)]
+                                 event_shape_in=actual_event_shape +
+                                                num_auxiliary_variables)]
 
-    for _ in range(0, layers - 1):
+    for _ in range(0, num_layers - 1):
       bijectors.append(
         build_highway_flow_layer(
-          tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+          tf.reduce_prod(
+            actual_event_shape + num_auxiliary_variables),
           residual_fraction_initial_value=initial_prior_weight,
-          activation_fn=True, gate_first_n=int_event_shape, seed=seed))
+          activation_fn=True, gate_first_n=int_event_shape,
+          seed=seed))
     bijectors.append(
       build_highway_flow_layer(
-        tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+        tf.reduce_prod(
+          actual_event_shape + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
-        activation_fn=False, gate_first_n=int_event_shape, seed=seed))
-    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+        activation_fn=False, gate_first_n=int_event_shape,
+        seed=seed))
+    bijectors.append(
+      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
@@ -485,7 +504,7 @@ def _cf_convex_update_for_base_distribution(dist,
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
           batch_broadcast.BatchBroadcast(dist,
-                                        to_shape=batch_shape),
+                                         to_shape=batch_shape),
           independent.Independent(
             deterministic.Deterministic(
               global_auxiliary_variables),

From 3b182aa6a30bfc7ae90715995dab33c22ec52525 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 11:59:21 +0200
Subject: [PATCH 10/54] readded highway flow

---
 .../python/experimental/bijectors/BUILD       | 29 +++++++++++++++++++
 .../python/experimental/bijectors/__init__.py |  4 +++
 2 files changed, 33 insertions(+)

diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD
index 3e42d43fb8..3fa54d43c1 100644
--- a/tensorflow_probability/python/experimental/bijectors/BUILD
+++ b/tensorflow_probability/python/experimental/bijectors/BUILD
@@ -117,6 +117,20 @@ multi_substrate_py_library(
     ],
 )
 
+multi_substrate_py_library(
+    name = "highway_flow",
+    srcs = ["highway_flow.py"],
+    srcs_version = "PY3",
+    deps = [
+        ":scalar_function_with_inferred_inverse",
+        # numpy dep,
+        # tensorflow dep,
+        "//tensorflow_probability/python/bijectors",
+        "//tensorflow_probability/python/util",
+        "//tensorflow_probability/python/internal:samplers",
+    ],
+)
+
 multi_substrate_py_test(
     name = "sharded_test",
     size = "medium",
@@ -133,3 +147,18 @@ multi_substrate_py_test(
         "//tensorflow_probability/python/internal:test_util",
     ],
 )
+
+multi_substrate_py_test(
+    name = "highway_flow_test",
+    size = "medium",
+    srcs = ["highway_flow_test.py"],
+    jax_size = "medium",
+    python_version = "PY3",
+    srcs_version = "PY3",
+    deps = [
+        # numpy dep
+        # tensorflow dep,
+        "//tensorflow_probability",
+        "//tensorflow_probability/python/internal:test_util",
+    ],
+)
\ No newline at end of file
diff --git a/tensorflow_probability/python/experimental/bijectors/__init__.py b/tensorflow_probability/python/experimental/bijectors/__init__.py
index a261af93f5..e7b4fb00da 100644
--- a/tensorflow_probability/python/experimental/bijectors/__init__.py
+++ b/tensorflow_probability/python/experimental/bijectors/__init__.py
@@ -18,9 +18,13 @@
 from tensorflow_probability.python.bijectors.ldj_ratio import inverse_log_det_jacobian_ratio
 from tensorflow_probability.python.experimental.bijectors.distribution_bijectors import make_distribution_bijector
 from tensorflow_probability.python.experimental.bijectors.scalar_function_with_inferred_inverse import ScalarFunctionWithInferredInverse
+from tensorflow_probability.python.experimental.bijectors.highway_flow import build_highway_flow_layer
+from tensorflow_probability.python.experimental.bijectors.highway_flow import HighwayFlow
 from tensorflow_probability.python.experimental.bijectors.sharded import Sharded
 
 __all__ = [
+    'build_highway_flow_layer',
+    'HighwayFlow',
     'forward_log_det_jacobian_ratio',
     'inverse_log_det_jacobian_ratio',
     'make_distribution_bijector',

From 3e11546deec4f55936d4dc2555b7c43c3ebad778 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 10:05:36 +0200
Subject: [PATCH 11/54] fixed init

---
 tensorflow_probability/python/experimental/vi/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py
index e18c8d3455..cc5530300a 100644
--- a/tensorflow_probability/python/experimental/vi/__init__.py
+++ b/tensorflow_probability/python/experimental/vi/__init__.py
@@ -17,6 +17,7 @@
 from tensorflow_probability.python.experimental.vi import util
 from tensorflow_probability.python.experimental.vi.automatic_structured_vi import build_asvi_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.automatic_structured_vi import register_asvi_substitution_rule
+from tensorflow_probability.python.experimental.vi.cascading_flows import build_cf_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior_from_base_distribution
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_factored_surrogate_posterior
@@ -29,7 +30,7 @@
     'build_affine_surrogate_posterior',
     'build_affine_surrogate_posterior_from_base_distribution',
     'build_asvi_surrogate_posterior',
-    'builf_cf_surrogate_posterior'
+    'build_cf_surrogate_posterior',
     'build_factored_surrogate_posterior',
     'build_split_flow_surrogate_posterior',
     'build_trainable_location_scale_distribution',

From 2ff7130e48c91989dd34ce2997c71696cfebf651 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 10:07:24 +0200
Subject: [PATCH 12/54] removed highway flow from this branch

---
 .../bijectors/highway_flow_test.py            | 142 ------------------
 1 file changed, 142 deletions(-)
 delete mode 100644 tensorflow_probability/python/experimental/bijectors/highway_flow_test.py

diff --git a/tensorflow_probability/python/experimental/bijectors/highway_flow_test.py b/tensorflow_probability/python/experimental/bijectors/highway_flow_test.py
deleted file mode 100644
index 24e6b7fb4e..0000000000
--- a/tensorflow_probability/python/experimental/bijectors/highway_flow_test.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright 2021 The TensorFlow Probability Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Tests for HighwayFlow."""
-import tensorflow.compat.v2 as tf
-
-import tensorflow_probability as tfp
-from tensorflow_probability.python.internal import samplers
-from tensorflow_probability.python.internal import test_util
-
-tfb = tfp.bijectors
-tfd = tfp.distributions
-
-@test_util.test_all_tf_execution_regimes
-class HighwayFlowTests(test_util.TestCase):
-
-  def testBijector(self):
-    width = 1
-    for dim in range(2):
-      if dim == 0:
-        # Test generic case with scalar input
-        x = samplers.uniform((width,), minval=-1.,
-                             maxval=1.,
-                             seed=test_util.test_seed(sampler_type='stateless'))
-      elif dim == 1:
-        # Test with 2D tensor + batch
-        x = samplers.uniform((5, width, width),
-                             minval=-1.,
-                             maxval=1.,
-                             seed=test_util.test_seed(sampler_type='stateless'))
-
-      bijector = tfp.experimental.bijectors.build_highway_flow_layer(
-        width, activation_fn=True)
-      self.evaluate(
-        [v.initializer for v in bijector.trainable_variables])
-      self.assertStartsWith(bijector.name, 'highway_flow')
-      self.assertAllClose(x, bijector.inverse(
-        tf.identity(bijector.forward(x))))
-      self.assertAllClose(
-        bijector.forward_log_det_jacobian(x, event_ndims=dim + 1),
-        -bijector.inverse_log_det_jacobian(
-          tf.identity(bijector.forward(x)), event_ndims=dim + 1))
-
-  def testBijectorWithoutActivation(self):
-    width = 4
-    x = samplers.uniform((2, width, width),
-                         minval=-1.,
-                         maxval=1.,
-                         seed=test_util.test_seed(sampler_type='stateless'))
-
-    bijector = tfp.experimental.bijectors.build_highway_flow_layer(
-      width, activation_fn=False)
-    self.evaluate(
-      [v.initializer for v in bijector.trainable_variables])
-    self.assertStartsWith(bijector.name, 'highway_flow')
-    self.assertAllClose(x, bijector.inverse(
-      tf.identity(bijector.forward(x))))
-    self.assertAllClose(
-      bijector.forward_log_det_jacobian(x, event_ndims=2),
-      -bijector.inverse_log_det_jacobian(
-        tf.identity(bijector.forward(x)), event_ndims=2))
-
-  def testGating(self):
-    width = 4
-    x = samplers.uniform((2, width, width),
-                         minval=-1.,
-                         maxval=1.,
-                         seed=test_util.test_seed(sampler_type='stateless'))
-
-    # Test with gating half of the inputs
-    bijector = tfp.experimental.bijectors.build_highway_flow_layer(
-      width, activation_fn=True, gate_first_n=2)
-    self.evaluate(
-      [v.initializer for v in bijector.trainable_variables])
-    self.assertStartsWith(bijector.name, 'highway_flow')
-    self.assertAllClose(x, bijector.inverse(
-      tf.identity(bijector.forward(x))))
-    self.assertAllClose(
-      bijector.forward_log_det_jacobian(x, event_ndims=2),
-      -bijector.inverse_log_det_jacobian(
-        tf.identity(bijector.forward(x)), event_ndims=2))
-
-    # Test with gating no inputs
-    bijector = tfp.experimental.bijectors.build_highway_flow_layer(
-      width, activation_fn=True, gate_first_n=0)
-    self.evaluate(
-      [v.initializer for v in bijector.trainable_variables])
-    self.assertStartsWith(bijector.name, 'highway_flow')
-    self.assertAllClose(x, bijector.inverse(
-      tf.identity(bijector.forward(x))))
-    self.assertAllClose(
-      bijector.forward_log_det_jacobian(x, event_ndims=2),
-      -bijector.inverse_log_det_jacobian(
-        tf.identity(bijector.forward(x)), event_ndims=2))
-
-  def testResidualFractionGradientsWithCenteredDifference(self):
-    width = 4
-    batch_size = 3
-    residual_fraction = tf.constant(0.5)
-    bijector = tfp.experimental.bijectors.HighwayFlow(
-      residual_fraction=residual_fraction,
-      activation_fn=tf.nn.softplus,
-      bias=tf.zeros(width),
-      upper_diagonal_weights_matrix=tf.eye(width),
-      lower_diagonal_weights_matrix=tf.eye(width),
-      gate_first_n=width
-    )
-    target = tfd.MultivariateNormalDiag(loc=tf.zeros(width),
-                                        scale_diag=tf.ones(width))
-    x = tf.ones((batch_size, width))
-    with tf.GradientTape() as g:
-      g.watch(bijector.residual_fraction)
-      y = tf.reduce_mean(target.log_prob(bijector.forward(x)))
-    tf_grad = g.gradient(y, bijector.residual_fraction)
-
-    h = 1e-3
-
-    # pylint: disable=protected-access
-    bijector._residual_fraction = residual_fraction + h
-    y1 = tf.reduce_mean(target.log_prob(bijector.forward(tf.identity(x))))
-    bijector._residual_fraction = residual_fraction - h
-    y2 = tf.reduce_mean(target.log_prob(bijector.forward(tf.identity(x))))
-    # pylint: enable=protected-access
-
-    manual_grad = (y1 - y2) / (2 * h)
-
-    self.assertAllClose(tf_grad, manual_grad, rtol=1e-4)
-
-
-if __name__ == '__main__':
-  tf.test.main()

From 4cc20e3b438c25428b8c8b4c5df9c3d54b8a82ff Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 10:07:59 +0200
Subject: [PATCH 13/54] removed highway flow from this branch

---
 .../experimental/bijectors/highway_flow.py    | 396 ------------------
 1 file changed, 396 deletions(-)
 delete mode 100644 tensorflow_probability/python/experimental/bijectors/highway_flow.py

diff --git a/tensorflow_probability/python/experimental/bijectors/highway_flow.py b/tensorflow_probability/python/experimental/bijectors/highway_flow.py
deleted file mode 100644
index 6f26abe72f..0000000000
--- a/tensorflow_probability/python/experimental/bijectors/highway_flow.py
+++ /dev/null
@@ -1,396 +0,0 @@
-# Copyright 2021 The TensorFlow Probability Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-"""Highway Flow bijector."""
-
-import tensorflow.compat.v2 as tf
-
-from tensorflow_probability.python import bijectors as tfb
-from tensorflow_probability.python import util
-from tensorflow_probability.python.internal import cache_util
-from tensorflow_probability.python.internal import dtype_util
-from tensorflow_probability.python.internal import prefer_static as ps
-from tensorflow_probability.python.internal import samplers
-from tensorflow_probability.python.internal import tensor_util
-
-
-
-def build_highway_flow_layer(width,
-                             residual_fraction_initial_value=0.5,
-                             activation_fn=False,
-                             gate_first_n=None,
-                             seed=None):
-  """Builds HighwayFlow making sure that all the requirements are satisfied.
-
-  Args:
-    width: Input dimension of the bijector.
-    residual_fraction_initial_value: Initial value for gating parameter, must be
-      between 0 and 1.
-    activation_fn: Whether or not use SoftPlus activation function.
-    gate_first_n: Decides which part of the input should be gated (useful for
-    example when using auxiliary variables).
-    seed: Seed for random initialization of the weights.
-
-  Returns:
-    The initialized bijector with the following elements:
-      `residual_fraction` is bounded between 0 and 1.
-      `upper_diagonal_weights_matrix` is a randomly initialized (lower) diagonal
-      matrix with positive diagonal of size `width x width`.
-      `lower_diagonal_weights_matrix` is a randomly initialized lower diagonal
-      matrix with ones on the diagonal of size `width x width`;
-      `bias` is a randomly initialized vector of size `width`
-  """
-
-  # TODO: add control that residual_fraction_initial_value is between 0 and 1
-  residual_fraction_initial_value = tf.convert_to_tensor(
-    residual_fraction_initial_value,
-    dtype_hint=tf.float32,
-    name='residual_fraction_initial_value')
-  dtype = residual_fraction_initial_value.dtype
-
-  bias_seed, upper_seed, lower_seed = samplers.split_seed(
-    seed, n=3)
-  lower_bijector = tfb.Chain(
-    [tfb.TransformDiagonal(diag_bijector=tfb.Shift(1.)),
-     tfb.Pad(paddings=[(1, 0), (0, 1)]),
-     tfb.FillTriangular()])
-  unconstrained_lower_initial_values = samplers.normal(
-    shape=lower_bijector.inverse_event_shape([width, width]),
-    mean=0.,
-    stddev=.01,
-    seed=lower_seed)
-  upper_bijector = tfb.FillScaleTriL(diag_bijector=tfb.Softplus(),
-                                     diag_shift=None)
-  unconstrained_upper_initial_values = samplers.normal(
-    shape=upper_bijector.inverse_event_shape([width, width]),
-    mean=0.,
-    stddev=.01,
-    seed=upper_seed)
-
-  return HighwayFlow(
-    residual_fraction=util.TransformedVariable(
-      initial_value=residual_fraction_initial_value,
-      bijector=tfb.Sigmoid(),
-      dtype=dtype),
-    activation_fn=activation_fn,
-    bias=tf.Variable(
-      samplers.normal((width,), mean=0., stddev=0.01, seed=bias_seed),
-      dtype=dtype),
-    upper_diagonal_weights_matrix=util.TransformedVariable(
-      initial_value=upper_bijector.forward(unconstrained_upper_initial_values),
-      bijector=upper_bijector,
-      dtype=dtype),
-    lower_diagonal_weights_matrix=util.TransformedVariable(
-      initial_value=lower_bijector.forward(unconstrained_lower_initial_values),
-      bijector=lower_bijector,
-      dtype=dtype),
-    gate_first_n=gate_first_n
-  )
-
-
-class HighwayFlow(tfb.Bijector):
-  """Implements an Highway Flow bijector [1].
-
-  HighwayFlow interpolates the input `X` with the transformations at each step
-  of the bjiector. The Highway Flow can be used as building block for a
-  Cascading flow [1] or as a generic normalizing flow.
-
-  The transformation consists of a convex update between the input `X` and a
-  linear transformation of `X` followed by activation with the form `g(A @
-  X + b)`, where `g(.)` is a differentiable non-decreasing activation
-  function, and `A` and `b` are trainable weights.
-
-  The convex update is regulated by a trainable residual fraction `l`
-  constrained between 0 and 1, and can be
-  formalized as:
-  `Y = l * X + (1 - l) * g(A @ X + b)`.
-
-  To make this transformation invertible, the bijector is split in three
-  convex updates:
-   - `Y1 = l * X + (1 - l) * L @ X`, with `L` lower diagonal matrix with ones
-   on the diagonal;
-   - `Y2 = l * Y1 + (1 - l) * (U @ Y1 + b)`, with `U` upper diagonal matrix
-   with positive diagonal;
-   - `Y = l * Y2 + (1 - l) * g(Y2)`
-
-  The function `build_highway_flow_layer` helps initializing the bijector
-  with the variables respecting the various constraints.
-
-  For more details on Highway Flow and Cascading Flows see [1].
-
-  #### Usage example
-  ```python
-  tfd = tfp.distributions
-  tfb = tfp.bijectors
-
-  dim = 4 # last input dimension
-
-  bijector = build_highway_flow_layer(dim, activation_fn=True)
-  y = bijector.forward(x)  # forward mapping
-  x = bijector.inverse(y)  # inverse mapping
-  base = tfd.MultivariateNormalDiag(loc=tf.zeros(dim)) # Base distribution
-  transformed_distribution = tfd.TransformedDistribution(base, bijector)
-  ```
-
-  #### References
-
-  [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven.
-  "Automatic variational inference with cascading flows." arXiv preprint
-  arXiv:2102.04801 (2021).
-  """
-
-  # HighWay Flow simultaneously computes `forward` and `fldj`
-  # (and `inverse`/`ildj`), so we override the bijector cache to update the
-  # LDJ entries of attrs on forward/inverse inverse calls (instead of
-  # updating them only when the LDJ methods themselves are called).
-
-  _cache = cache_util.BijectorCacheWithGreedyAttrs(
-    forward_name='_augmented_forward',
-    inverse_name='_augmented_inverse')
-
-  def __init__(self, residual_fraction, activation_fn, bias,
-               upper_diagonal_weights_matrix,
-               lower_diagonal_weights_matrix,
-               gate_first_n=None,
-               validate_args=False,
-               name=None):
-    """Initializes the HighwayFlow.
-    Args:
-      residual_fraction: Scalar `Tensor` used for the convex update, must be
-        between 0 and 1.
-      activation_fn: Boolean to decide whether to use SoftPlus (True) activation
-        or no activation (False).
-      bias: Bias vector.
-      upper_diagonal_weights_matrix: Lower diagional matrix of size
-        (width, width) with positive diagonal (is transposed to Upper diagonal
-        within the bijector).
-      lower_diagonal_weights_matrix: Lower diagonal matrix with ones on the main
-        diagional.
-      gate_first_n: Integer that decides what part of the input is gated.
-        Default: `None`. When None, the whole input is gated.
-    """
-    parameters = dict(locals())
-    name = name or 'highway_flow'
-    dtype = dtype_util.common_dtype(
-      [residual_fraction, bias, upper_diagonal_weights_matrix,
-       lower_diagonal_weights_matrix], dtype_hint=tf.float32)
-    with tf.name_scope(name) as name:
-      self._width = ps.shape(bias)[-1]
-      self._bias = tensor_util.convert_nonref_to_tensor(bias, dtype=dtype,
-                                                        name='bias')
-      self._residual_fraction = tensor_util.convert_nonref_to_tensor(
-        residual_fraction, dtype=dtype, name='residual_fraction')
-      # The upper matrix is still lower triangular, transpose is done in
-      # _inverse and _forwars metowds.
-      self._upper_diagonal_weights_matrix = tensor_util.convert_nonref_to_tensor(
-        upper_diagonal_weights_matrix, dtype=dtype,
-        name='upper_diagonal_weights_matrix')
-      self._lower_diagonal_weights_matrix = tensor_util.convert_nonref_to_tensor(
-        lower_diagonal_weights_matrix, dtype=dtype,
-        name='lower_diagonal_weights_matrix')
-      self._activation_fn = activation_fn
-      self._gate_first_n = gate_first_n if gate_first_n else self.width
-
-      self._num_ungated = self.width - self.gate_first_n
-
-      super(HighwayFlow, self).__init__(
-        validate_args=validate_args,
-        forward_min_event_ndims=1,
-        parameters=parameters,
-        dtype=dtype,
-        name=name)
-
-  @property
-  def bias(self):
-    return self._bias
-
-  @property
-  def width(self):
-    return self._width
-
-  @property
-  def residual_fraction(self):
-    return self._residual_fraction
-
-  @property
-  def upper_diagonal_weights_matrix(self):
-    return self._upper_diagonal_weights_matrix
-
-  @property
-  def lower_diagonal_weights_matrix(self):
-    return self._lower_diagonal_weights_matrix
-
-  @property
-  def activation_fn(self):
-    return self._activation_fn
-
-  @property
-  def gate_first_n(self):
-    return self._gate_first_n
-
-  @property
-  def num_ungated(self):
-    return self._num_ungated
-
-  def _derivative_of_softplus(self, x):
-    return tf.concat([(self.residual_fraction) * tf.ones(
-      self.gate_first_n, dtype=self.dtype),
-                      tf.zeros(self.num_ungated, dtype=self.dtype)],
-                     axis=0) + (
-             tf.concat([(1. - self.residual_fraction) * tf.ones(
-               self.gate_first_n, dtype=self.dtype),
-                        tf.ones(self.num_ungated, dtype=self.dtype)],
-                       axis=0)) * tf.math.sigmoid(x)
-
-  def _convex_update(self, weights_matrix):
-    return tf.concat(
-      [self.residual_fraction * tf.eye(num_rows=self.gate_first_n,
-                                       num_columns=self.width,
-                                       dtype=self.dtype),
-       tf.zeros([self.num_ungated, self.width], dtype=self.dtype)],
-      axis=0) + tf.concat([(1. - self.residual_fraction) * tf.ones(
-      self.gate_first_n, dtype=self.dtype),
-                           tf.ones(self.num_ungated, dtype=self.dtype)],
-                          axis=0) * weights_matrix
-
-  def _inverse_of_softplus(self, y, n=20):
-    """Inverse of the activation layer with softplus using Newton iteration."""
-    x = tf.ones_like(y, dtype=self.dtype)
-    for _ in range(n):
-      x = x - (tf.concat([(self.residual_fraction) * tf.ones(
-        self.gate_first_n, dtype=self.dtype),
-                          tf.zeros(self.num_ungated, dtype=self.dtype)],
-                         axis=0) * x + tf.concat(
-        [(1. - self.residual_fraction) * tf.ones(
-          self.gate_first_n, dtype=self.dtype),
-         tf.ones(self.num_ungated, dtype=self.dtype)],
-        axis=0) * tf.math.softplus(
-        x) - y) / (
-            self._derivative_of_softplus(x))
-    return x
-
-  def _augmented_forward(self, x):
-    """Computes forward and forward_log_det_jacobian transformations.
-
-    Args:
-      x: Input of the bijector.
-
-    Returns:
-      x after forward flow and a dict containing forward and inverse log
-      determinant of the jacobian.
-    """
-
-    # Log determinant term from the upper matrix. Note that the log determinant
-    # of the lower matrix is zero.
-
-    fldj = tf.zeros(ps.shape(x)[:-1], dtype=self.dtype) + tf.reduce_sum(
-      tf.math.log(tf.concat([(self.residual_fraction) * tf.ones(
-        self.gate_first_n, dtype=self.dtype),
-                             tf.zeros(self.num_ungated, dtype=self.dtype)],
-                            axis=0) + (
-                    tf.concat([(1. - self.residual_fraction) * tf.ones(
-                      self.gate_first_n, dtype=self.dtype),
-                               tf.ones(self.num_ungated, dtype=self.dtype)],
-                              axis=0)) * tf.linalg.diag_part(
-        self.upper_diagonal_weights_matrix)))
-    x = x[tf.newaxis, ...]
-    x = tf.linalg.matvec(
-      self._convex_update(self.lower_diagonal_weights_matrix), x)
-    x = tf.linalg.matvec(
-      self._convex_update(self.upper_diagonal_weights_matrix),
-      x, transpose_a=True)
-    x += (tf.concat([(1. - self.residual_fraction) * tf.ones(
-      self.gate_first_n, dtype=self.dtype),
-                     tf.ones(self.num_ungated, dtype=self.dtype)],
-                    axis=0) * self.bias)[tf.newaxis, ...]
-
-    if self.activation_fn:
-      fldj += tf.reduce_sum(tf.math.log(self._derivative_of_softplus(x[0])),
-                            axis=-1)
-      x = tf.concat([(self.residual_fraction) * tf.ones(
-        self.gate_first_n, dtype=self.dtype),
-                     tf.zeros(self.num_ungated, dtype=self.dtype)],
-                    axis=0) * x + tf.concat(
-        [(1. - self.residual_fraction) * tf.ones(
-          self.gate_first_n, dtype=self.dtype),
-         tf.ones(self.num_ungated, dtype=self.dtype)],
-        axis=0) * tf.nn.softplus(x)
-
-    return tf.squeeze(x, 0), {'ildj': -fldj, 'fldj': fldj}
-
-  def _augmented_inverse(self, y):
-    """Computes inverse and inverse_log_det_jacobian transformations.
-
-    Args:
-      y: input of the (inverse) bijectorr.
-
-    Returns:
-      y after inverse flow and a dict containing inverse and forward log
-      determinant of the jacobian.
-    """
-
-    ildj = tf.zeros(ps.shape(y)[:-1], dtype=self.dtype) - tf.reduce_sum(
-      tf.math.log(tf.concat([(self.residual_fraction) * tf.ones(
-        self.gate_first_n, dtype=self.dtype),
-                             tf.zeros(self.num_ungated, dtype=self.dtype)],
-                            axis=0) + tf.concat(
-        [(1. - self.residual_fraction) * tf.ones(
-          self.gate_first_n, dtype=self.dtype),
-         tf.ones(self.num_ungated, dtype=self.dtype)],
-        axis=0) * tf.linalg.diag_part(
-        self.upper_diagonal_weights_matrix)))
-
-    if self.activation_fn:
-      y = self._inverse_of_softplus(y)
-      ildj -= tf.reduce_sum(tf.math.log(self._derivative_of_softplus(y)),
-                            axis=-1)
-
-    y = y[..., tf.newaxis]
-
-    y = y - (tf.concat([(1. - self.residual_fraction) * tf.ones(
-      self.gate_first_n, dtype=self.dtype),
-                        tf.ones(self.num_ungated, dtype=self.dtype)],
-                       axis=0) * self.bias)[..., tf.newaxis]
-    y = tf.linalg.triangular_solve(
-      self._convex_update(self.upper_diagonal_weights_matrix), y,
-      lower=True, adjoint=True)
-    y = tf.linalg.triangular_solve(
-      self._convex_update(self.lower_diagonal_weights_matrix), y)
-
-    return tf.squeeze(y, axis=-1), {'ildj': ildj, 'fldj': -ildj}
-
-  def _forward(self, x):
-    y, _ = self._augmented_forward(x)
-    return y
-
-  def _inverse(self, y):
-    x, _ = self._augmented_inverse(y)
-    return x
-
-  def _forward_log_det_jacobian(self, x):
-    cached = self._cache.forward_attributes(x)
-    # If LDJ isn't in the cache, call forward once.
-    if 'fldj' not in cached:
-      _, attrs = self._augmented_forward(x)
-      cached.update(attrs)
-    return cached['fldj']
-
-  def _inverse_log_det_jacobian(self, y):
-    cached = self._cache.inverse_attributes(y)
-    # If LDJ isn't in the cache, call inverse once.
-    if 'ildj' not in cached:
-      _, attrs = self._augmented_inverse(y)
-      cached.update(attrs)
-    return cached['ildj']

From 0b386c671506a6f8f67c12cd5a8492db1d8d3fdd Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 10:08:45 +0200
Subject: [PATCH 14/54] working on tests

---
 .../experimental/vi/cascading_flows_test.py    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 9c4393be24..598d3fd66e 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -30,7 +30,7 @@
 
 tfb = tfp.bijectors
 tfd = tfp.distributions
-
+ # test_util.test_seed(sampler_type='stateless'))
 
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
@@ -65,7 +65,7 @@ def test_dims_and_gradients(self):
                    self._expected_num_trainable_variables(prior_dist))
 
     # Test that the sample shape is correct
-    three_posterior_samples = surrogate_posterior.sample(
+    '''three_posterior_samples = surrogate_posterior.sample(
         3, seed=test_util.test_seed(sampler_type='stateless'))
     three_prior_samples = prior_dist.sample(
         3, seed=test_util.test_seed(sampler_type='stateless'))
@@ -74,15 +74,15 @@ def test_dims_and_gradients(self):
         [s.shape for s in tf.nest.flatten(three_posterior_samples)])
 
     # Test that gradients are available wrt the variational parameters.
-    posterior_sample = surrogate_posterior.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+   posterior_sample = surrogate_posterior.sample(
+        seed=1)
     with tf.GradientTape() as tape:
       posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
     grad = tape.gradient(posterior_logprob,
                          surrogate_posterior.trainable_variables)
-    self.assertTrue(all(g is not None for g in grad))
+    self.assertTrue(all(g is not None for g in grad))'''
 
-  def test_initialization_is_deterministic_following_seed(self):
+  '''def test_initialization_is_deterministic_following_seed(self):
     prior_dist = self.make_prior_dist()
 
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
@@ -101,7 +101,7 @@ def test_initialization_is_deterministic_following_seed(self):
     posterior_sample2 = surrogate_posterior2.sample(
         seed=test_util.test_seed(sampler_type='stateless'))
 
-    self.assertAllEqualNested(posterior_sample, posterior_sample2)
+    self.assertAllEqualNested(posterior_sample, posterior_sample2)'''
 
 
 @test_util.test_all_tf_execution_regimes
@@ -144,7 +144,7 @@ def target_log_prob(*x):
 
     return target_log_prob
 
-  def test_fitting_surrogate_posterior(self):
+  '''def test_fitting_surrogate_posterior(self):
 
     prior_dist = self.make_prior_dist()
     observations = self.get_observations(prior_dist)
@@ -170,7 +170,7 @@ def test_fitting_surrogate_posterior(self):
     self.evaluate(tf1.global_variables_initializer())
     _ = self.evaluate(losses)
     _ = self.evaluate(posterior_mean)
-    _ = self.evaluate(posterior_stddev)
+    _ = self.evaluate(posterior_stddev)'''
 
 
 @test_util.test_all_tf_execution_regimes

From d8f47802d41a65304c103b6c8b0e2bb6fbdbbd48 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 11:39:55 +0200
Subject: [PATCH 15/54] more testing

---
 .../experimental/vi/cascading_flows_test.py   | 57 +++++++------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 598d3fd66e..b52e1e5f77 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -35,73 +35,59 @@
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
-  def _expected_num_trainable_variables(self, prior_dist):
+  def _expected_num_trainable_variables(self, prior_dist, num_layers):
     """Infers the expected number of trainable variables for a non-nested JD."""
     prior_dists = prior_dist._get_single_sample_distributions()  # pylint: disable=protected-access
     expected_num_trainable_variables = 0
+
+    # For each distribution in the prior, we will have one highway flow with
+    # `num_layers` blocks, and each block has 4 trainable variables:
+    # `residual_fraction`, `lower_diagonal_weights_matrix`,
+    # `upper_diagonal_weights_matrix` and `bias`.
     for original_dist in prior_dists:
-      try:
-        original_dist = original_dist.distribution
-      except AttributeError:
-        pass
-      dist = cascading_flows._as_substituted_distribution(original_dist)
-      dist_params = dist.parameters
-      for param, value in dist_params.items():
-        if (param not in cascading_flows._NON_STATISTICAL_PARAMS
-            and value is not None and param not in ('low', 'high')):
-          # One variable each for prior_weight, mean_field_parameter.
-          expected_num_trainable_variables += 2
+      expected_num_trainable_variables += (4 * num_layers)
     return expected_num_trainable_variables
 
   def test_dims_and_gradients(self):
 
     prior_dist = self.make_prior_dist()
-
+    num_layers = 3
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
-        prior=prior_dist)
+        prior=prior_dist, num_layers=num_layers)
 
     # Test that the correct number of trainable variables are being tracked
     self.assertLen(surrogate_posterior.trainable_variables,
-                   self._expected_num_trainable_variables(prior_dist))
+                   self._expected_num_trainable_variables(prior_dist, num_layers))
 
     # Test that the sample shape is correct
-    '''three_posterior_samples = surrogate_posterior.sample(
-        3, seed=test_util.test_seed(sampler_type='stateless'))
+    three_posterior_samples = surrogate_posterior.sample(
+        3, seed=1)
     three_prior_samples = prior_dist.sample(
-        3, seed=test_util.test_seed(sampler_type='stateless'))
+        3, seed=1)
     self.assertAllEqualNested(
         [s.shape for s in tf.nest.flatten(three_prior_samples)],
         [s.shape for s in tf.nest.flatten(three_posterior_samples)])
 
-    # Test that gradients are available wrt the variational parameters.
-   posterior_sample = surrogate_posterior.sample(
-        seed=1)
-    with tf.GradientTape() as tape:
-      posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
-    grad = tape.gradient(posterior_logprob,
-                         surrogate_posterior.trainable_variables)
-    self.assertTrue(all(g is not None for g in grad))'''
-
-  '''def test_initialization_is_deterministic_following_seed(self):
+  def test_initialization_is_deterministic_following_seed(self):
     prior_dist = self.make_prior_dist()
 
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
         prior=prior_dist,
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
     self.evaluate(
         [v.initializer for v in surrogate_posterior.trainable_variables])
     posterior_sample = surrogate_posterior.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
 
     surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
         prior=prior_dist,
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
     self.evaluate(
         [v.initializer for v in surrogate_posterior2.trainable_variables])
     posterior_sample2 = surrogate_posterior2.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
 
-    self.assertAllEqualNested(posterior_sample, posterior_sample2)'''
+    self.assertAllEqualNested(posterior_sample, posterior_sample2)
 
 
 @test_util.test_all_tf_execution_regimes
@@ -172,7 +158,6 @@ def target_log_prob(*x):
     _ = self.evaluate(posterior_mean)
     _ = self.evaluate(posterior_stddev)'''
 
-
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestEightSchools(test_util.TestCase,
                                              _TrainableCFSurrogate):
@@ -235,7 +220,7 @@ def _prior_model_fn():
     return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
 
 
-@test_util.test_all_tf_execution_regimes
+'''@test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestDiscreteLatent(
     test_util.TestCase, _TrainableCFSurrogate):
 
@@ -344,7 +329,7 @@ def centered_horseshoe(ndims=100):
                           tfd.Normal)
     self.assertIsInstance(surrogate_dists.local_scale.distribution,
                           tfd.Normal)
-    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
+    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)'''
 
 # TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions.
 # TODO(kateslin): Add an ASVI surrogate posterior test with for a model with

From af9a5bae484ee95491f42415f88ff955c25564be Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:51:37 +0200
Subject: [PATCH 16/54] fixed conflicts

---
 tensorflow_probability/python/experimental/vi/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py
index 0cb4971fcc..e18c8d3455 100644
--- a/tensorflow_probability/python/experimental/vi/__init__.py
+++ b/tensorflow_probability/python/experimental/vi/__init__.py
@@ -29,6 +29,7 @@
     'build_affine_surrogate_posterior',
     'build_affine_surrogate_posterior_from_base_distribution',
     'build_asvi_surrogate_posterior',
+    'build_cf_surrogate_posterior',
     'build_factored_surrogate_posterior',
     'build_split_flow_surrogate_posterior',
     'build_trainable_location_scale_distribution',

From 5cf8ce96854f3c27b529a023f282d525ccb0acfc Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:52:05 +0200
Subject: [PATCH 17/54] Revert "Revert "initial tests, updated init and build""

This reverts commit 5bb28b08
---
 .../python/experimental/vi/BUILD              | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD
index e57f884ca5..863e0aeef2 100644
--- a/tensorflow_probability/python/experimental/vi/BUILD
+++ b/tensorflow_probability/python/experimental/vi/BUILD
@@ -31,6 +31,7 @@ py_library(
     srcs_version = "PY3",
     deps = [
         ":automatic_structured_vi",
+        ":cascading_flows",
         ":surrogate_posteriors",
         "//tensorflow_probability/python/experimental/vi/util",
         "//tensorflow_probability/python/internal:all_util",
@@ -67,6 +68,36 @@ py_library(
     ],
 )
 
+py_library(
+    name = "cascading_flows",
+    srcs = ["cascading_flows.py"],
+    srcs_version = "PY3",
+    deps = [
+        # tensorflow dep,
+        "//tensorflow_probability/python/bijectors:build_highway_flow_layer",
+        "//tensorflow_probability/python/bijectors:chain",
+        "//tensorflow_probability/python/bijectors:reshape",
+        "//tensorflow_probability/python/bijectors:scale",
+        "//tensorflow_probability/python/bijectors:shift",
+        "//tensorflow_probability/python/bijectors:split",
+        "//tensorflow_probability/python/distributions:batch_broadcast",
+        "//tensorflow_probability/python/distributions:beta",
+        "//tensorflow_probability/python/distributions:blockwise",
+        "//tensorflow_probability/python/distributions:chi2",
+        "//tensorflow_probability/python/distributions:exponential",
+        "//tensorflow_probability/python/distributions:gamma",
+        "//tensorflow_probability/python/distributions:half_normal",
+        "//tensorflow_probability/python/distributions:joint_distribution_auto_batched",
+        "//tensorflow_probability/python/distributions:joint_distribution_coroutine",
+        "//tensorflow_probability/python/distributions:normal",
+        "//tensorflow_probability/python/distributions:sample",
+        "//tensorflow_probability/python/distributions:transformed_distribution",
+        "//tensorflow_probability/python/distributions:truncated_normal",
+        "//tensorflow_probability/python/distributions:uniform",
+        "//tensorflow_probability/python/internal:samplers",
+    ],
+)
+
 py_library(
     name = "surrogate_posteriors",
     srcs = ["surrogate_posteriors.py"],
@@ -111,6 +142,22 @@ py_test(
     ],
 )
 
+py_test(
+    name = "cascading_flows_test",
+    size = "large",
+    srcs = ["cascading_flows_test.py"],
+    python_version = "PY3",
+    shard_count = 4,
+    srcs_version = "PY3",
+    deps = [
+        # absl/testing:parameterized dep,
+        # numpy dep,
+        # tensorflow dep,
+        "//tensorflow_probability",
+        "//tensorflow_probability/python/internal:test_util",
+    ],
+)
+
 py_test(
     name = "surrogate_posteriors_test",
     size = "large",

From 755bca92cd518eb9c91c9dd177c8a02fb1ed3381 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:54:47 +0200
Subject: [PATCH 18/54] reverted commit

---
 .../python/experimental/vi/cascading_flows.py | 483 ++++++++++++++++++
 1 file changed, 483 insertions(+)
 create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows.py

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
new file mode 100644
index 0000000000..d8c9393d8e
--- /dev/null
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -0,0 +1,483 @@
+# Copyright 2021 The TensorFlow Probability Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Utilities for constructing structured surrogate posteriors."""
+
+from __future__ import absolute_import
+from __future__ import division
+# [internal] enable type annotations
+from __future__ import print_function
+
+import copy
+import functools
+import inspect
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow_probability.python.experimental.bijectors import \
+  build_highway_flow_layer
+from tensorflow_probability.python.bijectors import chain
+from tensorflow_probability.python.bijectors import reshape
+from tensorflow_probability.python.bijectors import scale as scale_lib
+from tensorflow_probability.python.bijectors import shift
+from tensorflow_probability.python.bijectors import split
+
+from tensorflow_probability.python.distributions import batch_broadcast
+from tensorflow_probability.python.distributions import beta
+from tensorflow_probability.python.distributions import blockwise
+from tensorflow_probability.python.distributions import chi2
+from tensorflow_probability.python.distributions import exponential
+from tensorflow_probability.python.distributions import gamma
+from tensorflow_probability.python.distributions import half_normal
+from tensorflow_probability.python.distributions import \
+  joint_distribution_auto_batched
+from tensorflow_probability.python.distributions import \
+  joint_distribution_coroutine
+from tensorflow_probability.python.distributions import normal
+from tensorflow_probability.python.distributions import sample
+from tensorflow_probability.python.distributions import transformed_distribution
+from tensorflow_probability.python.distributions import truncated_normal
+from tensorflow_probability.python.distributions import uniform
+from tensorflow_probability.python.internal import samplers
+
+__all__ = [
+  'register_asvi_substitution_rule',
+  'build_cf_surrogate_posterior'
+]
+
+Root = joint_distribution_coroutine.JointDistributionCoroutine.Root
+
+_NON_STATISTICAL_PARAMS = [
+  'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum',
+  'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support',
+  'num_probit_terms_approx'
+]
+_NON_TRAINABLE_PARAMS = ['low', 'high']
+
+# Registry of transformations that are applied to distributions in the prior
+# before defining the surrogate family.
+
+
+# Todo: inherited from asvi code, do we need this?
+ASVI_SURROGATE_SUBSTITUTIONS = {}
+
+
+# Todo: inherited from asvi code, do we need this?
+def _as_substituted_distribution(distribution):
+  """Returns `distribution` with every matching substitution rule applied."""
+  for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items():
+    if condition(distribution):
+      distribution = substitution_fn(distribution)
+  return distribution
+
+
+# Todo: inherited from asvi code, do we need this?
+def register_asvi_substitution_rule(condition, substitution_fn):
+  """Registers a rule for substituting distributions in ASVI surrogates.
+
+  Args:
+    condition: Python `callable` that takes a Distribution instance and
+      returns a Python `bool` indicating whether or not to substitute it.
+      May also be a class type such as `tfd.Normal`, in which case the
+      condition is interpreted as
+      `lambda distribution: isinstance(distribution, class)`.
+    substitution_fn: Python `callable` that takes a Distribution
+      instance and returns a new Distribution instance used to define
+      the ASVI surrogate posterior. Note that this substitution does not modify
+      the original model.
+
+  #### Example
+
+  To use a Normal surrogate for all location-scale family distributions, we
+  could register the substitution:
+
+  ```python
+  tfp.experimental.vi.register_asvi_substitution_rule(
+    condition=lambda distribution: (
+      hasattr(distribution, 'loc') and hasattr(distribution, 'scale')),
+    substitution_fn=lambda distribution: (
+      # Invoking the event space bijector applies any relevant constraints,
+      # e.g., that HalfCauchy samples must be `>= loc`.
+      distribution.experimental_default_event_space_bijector()(
+        tfd.Normal(loc=distribution.loc, scale=distribution.scale))))
+  ```
+
+  This rule will fire when ASVI encounters a location-scale distribution,
+  and instructs ASVI to build a surrogate 'as if' the model had just used a
+  (possibly constrained) Normal in its place. Note that we could have used a
+  more precise condition, e.g., to limit the substitution to distributions with
+  a specific `name`, if we had reason to think that a Normal distribution would
+  be a good surrogate for some model variables but not others.
+
+  """
+  global ASVI_SURROGATE_SUBSTITUTIONS
+  if inspect.isclass(condition):
+    condition = lambda distribution, cls=condition: isinstance(
+      # pylint: disable=g-long-lambda
+      distribution, cls)
+  ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn
+
+
+# Default substitutions attempt to express distributions using the most
+# flexible available parameterization.
+# pylint: disable=g-long-lambda
+register_asvi_substitution_rule(
+  half_normal.HalfNormal,
+  lambda dist: truncated_normal.TruncatedNormal(
+    loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
+register_asvi_substitution_rule(
+  uniform.Uniform,
+  lambda dist: shift.Shift(dist.low)(
+    scale_lib.Scale(dist.high - dist.low)(
+      beta.Beta(concentration0=tf.ones_like(dist.mean()),
+                concentration1=1.))))
+register_asvi_substitution_rule(
+  exponential.Exponential,
+  lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
+register_asvi_substitution_rule(
+  chi2.Chi2,
+  lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
+
+
+# pylint: enable=g-long-lambda
+
+# a single JointDistribution.
+def build_cf_surrogate_posterior(
+    prior,
+    num_auxiliary_variables=0,
+    initial_prior_weight=0.5,
+    seed=None,
+    name=None):
+  # todo: change docstrings
+  """Builds a structured surrogate posterior inspired by conjugate updating.
+
+  ASVI, or Automatic Structured Variational Inference, was proposed by
+  Ambrogioni et al. (2020) [1] as a method of automatically constructing a
+  surrogate posterior with the same structure as the prior. It does this by
+  reparameterizing the variational family of the surrogate posterior by
+  structuring each parameter according to the equation
+  ```none
+  prior_weight * prior_parameter + (1 - prior_weight) * mean_field_parameter
+  ```
+  In this equation, `prior_parameter` is a vector of prior parameters and
+  `mean_field_parameter` is a vector of trainable parameters with the same
+  domain as `prior_parameter`. `prior_weight` is a vector of learnable
+  parameters where `0. <= prior_weight <= 1.`. When `prior_weight =
+  0`, the surrogate posterior will be a mean-field surrogate, and when
+  `prior_weight = 1.`, the surrogate posterior will be the prior. This convex
+  combination equation, inspired by conjugacy in exponential families, thus
+  allows the surrogate posterior to balance between the structure of the prior
+  and the structure of a mean-field approximation.
+
+  Args:
+    prior: tfd.JointDistribution instance of the prior.
+    num_auxiliary_variables: Python `int` forwarded unchanged to the builder
+      of each base-distribution surrogate; its effect is defined there and is
+      not visible here (TODO: document). Default value: `0`.
+    initial_prior_weight: Optional float value (either static or tensor value)
+      on the interval [0, 1]. A larger value creates an initial surrogate
+      distribution with more dependence on the prior structure. Default value:
+      `0.5`.
+    seed: Python `int` seed for random initialization.
+    name: Optional string. Default value: `build_cf_surrogate_posterior`.
+
+  Returns:
+    surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
+    whose samples have shape and structure matching that of `prior`.
+
+  Raises:
+    TypeError: The `prior` argument cannot be a nested `JointDistribution`.
+
+  ### Examples
+
+  Consider a Brownian motion model expressed as a JointDistribution:
+
+  ```python
+  prior_loc = 0.
+  innovation_noise = .1
+
+  def model_fn():
+    new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
+    for i in range(4):
+      new = yield tfd.Normal(loc=new, scale=innovation_noise)
+
+  prior = tfd.JointDistributionCoroutineAutoBatched(model_fn)
+  ```
+
+  Let's use variational inference to approximate the posterior. We'll build a
+  surrogate posterior distribution by feeding in the prior distribution.
+
+  ```python
+  surrogate_posterior = (
+    tfp.experimental.vi.build_cf_surrogate_posterior(prior))
+  ```
+
+  This creates a trainable joint distribution, defined by variables in
+  `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
+  to fit this distribution by minimizing a divergence to the true posterior.
+
+  ```python
+  losses = tfp.vi.fit_surrogate_posterior(
+    target_log_prob_fn,
+    surrogate_posterior=surrogate_posterior,
+    num_steps=100,
+    optimizer=tf.optimizers.Adam(0.1),
+    sample_size=10)
+
+  # After optimization, samples from the surrogate will approximate
+  # samples from the true posterior.
+  samples = surrogate_posterior.sample(100)
+  posterior_mean = [tf.reduce_mean(x) for x in samples]
+  posterior_std = [tf.math.reduce_std(x) for x in samples]
+  ```
+
+  #### References
+  [1]: Luca Ambrogioni, Max Hinne, Marcel van Gerven. Automatic structured
+        variational inference. _arXiv preprint arXiv:2002.00643_, 2020
+        https://arxiv.org/abs/2002.00643
+
+  """
+  with tf.name_scope(name or 'build_cf_surrogate_posterior'):
+    surrogate_posterior, variables = _cf_surrogate_for_distribution(
+      dist=prior,
+      base_distribution_surrogate_fn=functools.partial(
+        _cf_convex_update_for_base_distribution,
+        initial_prior_weight=initial_prior_weight,
+        num_auxiliary_variables=num_auxiliary_variables),
+      seed=seed)
+    surrogate_posterior.also_track = variables
+    return surrogate_posterior
+
+
+def _cf_surrogate_for_distribution(dist,
+                                   base_distribution_surrogate_fn,
+                                   sample_shape=None,
+                                   variables=None,
+                                   seed=None):
+  # todo: change docstrings
+  """Recursively creates ASVI surrogates, and creates new variables if needed.
+
+  Args:
+    dist: a `tfd.Distribution` instance.
+    base_distribution_surrogate_fn: Callable to build a surrogate posterior
+      for a 'base' (non-meta and non-joint) distribution, with signature
+      `surrogate_posterior, variables = base_distribution_fn(
+      dist, sample_shape=None, variables=None, seed=None)`.
+    sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
+      `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
+      independent sample dimensions, i.e., it will have event shape
+      `concat([sample_shape, dist.event_shape], axis=0)`.
+      Default value: `None`.
+    variables: Optional nested structure of `tf.Variable`s returned from a
+      previous call to `_cf_surrogate_for_distribution`. If `None`,
+      new variables will be created; otherwise, constructs a surrogate posterior
+      backed by the passed-in variables.
+      Default value: `None`.
+    seed: Python `int` seed for random initialization.
+  Returns:
+    surrogate_posterior: Instance of `tfd.Distribution` representing a trainable
+      surrogate posterior distribution, with the same structure and `name` as
+      `dist`.
+    variables: Trainable parameters backing the surrogate posterior. If `dist`
+      is a base distribution, this is whatever
+      `base_distribution_surrogate_fn` returned. If `dist` is a joint
+      distribution, this is a `dist.dtype` structure of such values.
+  """
+
+  # Apply any substitutions, while attempting to preserve the original name.
+  dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
+
+  if hasattr(dist, '_model_coroutine'):
+    surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
+      dist,
+      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      variables=variables,
+      seed=seed)
+  else:
+    surrogate_posterior, variables = base_distribution_surrogate_fn(
+      dist=dist, sample_shape=sample_shape, variables=variables, seed=seed)
+  return surrogate_posterior, variables
+
+
+def _cf_surrogate_for_joint_distribution(
+    dist, base_distribution_surrogate_fn, variables=None, seed=None):
+  """Builds a structured joint surrogate posterior for a joint model."""
+
+  # Probabilistic program for CF surrogate posterior.
+  flat_variables = dist._model_flatten(
+    variables) if variables else None  # pylint: disable=protected-access
+  prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
+
+  def posterior_generator(seed=seed):
+    prior_gen = prior_coroutine()
+    dist = next(prior_gen)
+    i = 0
+    try:
+      while True:
+        was_root = isinstance(dist, Root)
+        if was_root:
+          dist = dist.distribution
+
+        seed, init_seed = samplers.split_seed(seed)
+        surrogate_posterior, variables = _cf_surrogate_for_distribution(
+          dist,
+          base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+          variables=flat_variables[i] if flat_variables else None,
+          seed=init_seed)
+
+        if was_root:
+          surrogate_posterior = Root(surrogate_posterior)
+        # If variables were not given---i.e., we're creating new
+        # variables---then yield the new variables along with the surrogate
+        # posterior. This assumes an execution context such as
+        # `_extract_variables_from_coroutine_model` below that will capture and
+        # save the variables.
+        value_out = yield (surrogate_posterior if flat_variables
+                           else (surrogate_posterior, variables))
+        if type(value_out) == list:
+          if len(dist.event_shape) == 0:
+            dist = prior_gen.send(tf.squeeze(value_out[0], -1))
+          else:
+            dist = prior_gen.send(value_out[0])
+
+        else:
+          dist = prior_gen.send(value_out)
+        i += 1
+    except StopIteration:
+      pass
+
+  if variables is None:
+    # Run the generator to create variables, then call ourselves again
+    # to construct the surrogate JD from these variables. Note that we can't
+    # just create a JDC from the current `posterior_generator`, because it will
+    # try to build new variables on every invocation; the recursive call will
+    # define a new `posterior_generator` that knows about the variables we're
+    # about to create.
+    return _cf_surrogate_for_joint_distribution(
+      dist=dist,
+      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      variables=dist._model_unflatten(  # pylint: disable=protected-access
+        _extract_variables_from_coroutine_model(
+          posterior_generator, seed=seed)))
+
+  # Temporary workaround for bijector caching issues with autobatched JDs.
+  surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched
+  if not hasattr(dist, 'use_vectorized_map'):
+    surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine
+  surrogate_posterior = surrogate_type(posterior_generator,
+                                       name=_get_name(dist))
+
+  # Ensure that the surrogate posterior structure matches that of the prior.
+  # TODO: decide whether this structure check is still needed. With auxiliary
+  # variables the surrogate's structure no longer matches the prior's, so the
+  # check below is disabled (kept as a string literal) until it is adapted.
+  '''try:
+    tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
+  except TypeError:
+    tokenize = lambda jd: jd._model_unflatten(
+      # pylint: disable=protected-access, g-long-lambda
+      range(len(jd._model_flatten(jd.dtype)))
+      # pylint: disable=protected-access
+    )
+    surrogate_posterior = restructure.Restructure(
+      output_structure=tokenize(dist),
+      input_structure=tokenize(surrogate_posterior))(
+      surrogate_posterior, name=_get_name(dist))'''
+  return surrogate_posterior, variables
+
+
+# TODO: `sample_shape` and `seed` are accepted but unused here; should they be?
+def _cf_convex_update_for_base_distribution(dist,
+                                            initial_prior_weight,
+                                            num_auxiliary_variables=0,
+                                            sample_shape=None,
+                                            variables=None,
+                                            seed=None):
+  """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
+
+  if variables is None:
+    actual_event_shape = dist.event_shape_tensor()
+    int_event_shape = int(actual_event_shape) if \
+      actual_event_shape.shape.as_list()[0] > 0 else 1
+    layers = 3
+    bijectors = [reshape.Reshape([-1],
+                             event_shape_in=actual_event_shape +
+                                            num_auxiliary_variables)]
+
+    for _ in range(0, layers - 1):
+      bijectors.append(
+        build_highway_flow_layer(
+          tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+          residual_fraction_initial_value=initial_prior_weight,
+          activation_fn=True, gate_first_n=int_event_shape))
+    bijectors.append(
+      build_highway_flow_layer(
+        tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+        residual_fraction_initial_value=initial_prior_weight,
+        activation_fn=False, gate_first_n=int_event_shape))
+    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+
+    variables = chain.Chain(bijectors=list(reversed(bijectors)))
+
+  if num_auxiliary_variables > 0:
+    cascading_flows = split.Split(
+      [-1, num_auxiliary_variables])(
+      transformed_distribution.TransformedDistribution(
+        distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast(
+          sample.Sample(normal.Normal(0., .1), num_auxiliary_variables),
+          to_shape=dist.batch_shape)]),
+        bijector=variables))
+
+  else:
+    cascading_flows = transformed_distribution.TransformedDistribution(
+      distribution=dist,
+      bijector=variables)
+
+  return cascading_flows, variables
+
+
+def _extract_variables_from_coroutine_model(model_fn, seed=None):
+  """Extracts variables from a generator that yields (dist, variables) pairs."""
+  gen = model_fn()
+  try:
+    dist, dist_variables = next(gen)
+    flat_variables = [dist_variables]
+    while True:
+      seed, local_seed = samplers.split_seed(seed, n=2)
+      sampled_value = (dist.distribution.sample(seed=local_seed)
+                       if isinstance(dist, Root)
+                       else dist.sample(seed=local_seed))
+      dist, dist_variables = gen.send(
+        sampled_value)  # tf.concat(sampled_value, axis=0)
+      flat_variables.append(dist_variables)
+  except StopIteration:
+    pass
+  return flat_variables
+
+
+def _set_name(dist, name):
+  """Copies a distribution-like object, replacing its name."""
+  if hasattr(dist, 'copy'):
+    return dist.copy(name=name)
+  # Some distribution-like entities such as JointDistributionPinned don't
+  # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set
+  # the name directly.
+  dist = copy.copy(dist)
+  dist._name = name  # pylint: disable=protected-access
+  return dist
+
+
+def _get_name(dist):
+  """Attempts to get a distribution's short name, excluding the name scope."""
+  return getattr(dist, 'parameters', {}).get('name', dist.name)

From bbc38a44d1ae1976f03a305e257dd18dd53939e6 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:55:54 +0200
Subject: [PATCH 19/54] Revert "removed cascading_flows from pr"

This reverts commit 1620ebd2
---
 .../experimental/vi/cascading_flows_test.py   | 354 ++++++++++++++++++
 1 file changed, 354 insertions(+)
 create mode 100644 tensorflow_probability/python/experimental/vi/cascading_flows_test.py

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
new file mode 100644
index 0000000000..9c4393be24
--- /dev/null
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -0,0 +1,354 @@
+# Copyright 2021 The TensorFlow Probability Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Tests for structured surrogate posteriors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Dependency imports
+
+import tensorflow.compat.v1 as tf1
+import tensorflow.compat.v2 as tf
+import tensorflow_probability as tfp
+from tensorflow_probability.python.experimental.vi import cascading_flows
+from tensorflow_probability.python.internal import prefer_static as ps
+from tensorflow_probability.python.internal import test_util
+
+
+tfb = tfp.bijectors
+tfd = tfp.distributions
+
+
+@test_util.test_all_tf_execution_regimes
+class _TrainableCFSurrogate(object):
+
+  def _expected_num_trainable_variables(self, prior_dist):
+    """Infers the expected number of trainable variables for a non-nested JD."""
+    prior_dists = prior_dist._get_single_sample_distributions()  # pylint: disable=protected-access
+    expected_num_trainable_variables = 0
+    for original_dist in prior_dists:
+      try:
+        original_dist = original_dist.distribution
+      except AttributeError:
+        pass
+      dist = cascading_flows._as_substituted_distribution(original_dist)
+      dist_params = dist.parameters
+      for param, value in dist_params.items():
+        if (param not in cascading_flows._NON_STATISTICAL_PARAMS
+            and value is not None and param not in ('low', 'high')):
+          # One variable each for prior_weight, mean_field_parameter.
+          expected_num_trainable_variables += 2
+    return expected_num_trainable_variables
+
+  def test_dims_and_gradients(self):
+
+    prior_dist = self.make_prior_dist()
+
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist)
+
+    # Test that the correct number of trainable variables are being tracked
+    self.assertLen(surrogate_posterior.trainable_variables,
+                   self._expected_num_trainable_variables(prior_dist))
+
+    # Test that the sample shape is correct
+    three_posterior_samples = surrogate_posterior.sample(
+        3, seed=test_util.test_seed(sampler_type='stateless'))
+    three_prior_samples = prior_dist.sample(
+        3, seed=test_util.test_seed(sampler_type='stateless'))
+    self.assertAllEqualNested(
+        [s.shape for s in tf.nest.flatten(three_prior_samples)],
+        [s.shape for s in tf.nest.flatten(three_posterior_samples)])
+
+    # Test that gradients are available wrt the variational parameters.
+    posterior_sample = surrogate_posterior.sample(
+        seed=test_util.test_seed(sampler_type='stateless'))
+    with tf.GradientTape() as tape:
+      posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
+    grad = tape.gradient(posterior_logprob,
+                         surrogate_posterior.trainable_variables)
+    self.assertTrue(all(g is not None for g in grad))
+
+  def test_initialization_is_deterministic_following_seed(self):
+    prior_dist = self.make_prior_dist()
+
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist,
+        seed=test_util.test_seed(sampler_type='stateless'))
+    self.evaluate(
+        [v.initializer for v in surrogate_posterior.trainable_variables])
+    posterior_sample = surrogate_posterior.sample(
+        seed=test_util.test_seed(sampler_type='stateless'))
+
+    surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist,
+        seed=test_util.test_seed(sampler_type='stateless'))
+    self.evaluate(
+        [v.initializer for v in surrogate_posterior2.trainable_variables])
+    posterior_sample2 = surrogate_posterior2.sample(
+        seed=test_util.test_seed(sampler_type='stateless'))
+
+    self.assertAllEqualNested(posterior_sample, posterior_sample2)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase,
+                                               _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    def _prior_model_fn():
+      innovation_noise = 0.1
+      prior_loc = 0.
+      new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
+      for _ in range(4):
+        new = yield tfd.Normal(loc=new, scale=innovation_noise)
+
+    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
+
+  def make_likelihood_model(self, x, observation_noise):
+
+    def _likelihood_model():
+      for i in range(5):
+        yield tfd.Normal(loc=x[i], scale=observation_noise)
+
+    return tfd.JointDistributionCoroutineAutoBatched(_likelihood_model)
+
+  def get_observations(self, prior_dist):
+    observation_noise = 0.15
+    ground_truth = prior_dist.sample()
+    likelihood = self.make_likelihood_model(
+        x=ground_truth, observation_noise=observation_noise)
+    return likelihood.sample(1)
+
+  def get_target_log_prob(self, observations, prior_dist):
+
+    def target_log_prob(*x):
+      observation_noise = 0.15
+      likelihood_dist = self.make_likelihood_model(
+          x=x, observation_noise=observation_noise)
+      return likelihood_dist.log_prob(observations) + prior_dist.log_prob(x)
+
+    return target_log_prob
+
+  def test_fitting_surrogate_posterior(self):
+
+    prior_dist = self.make_prior_dist()
+    observations = self.get_observations(prior_dist)
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+        prior=prior_dist)
+    target_log_prob = self.get_target_log_prob(observations, prior_dist)
+
+    # Test vi fit surrogate posterior works
+    losses = tfp.vi.fit_surrogate_posterior(
+        target_log_prob,
+        surrogate_posterior,
+        num_steps=5,  # Don't optimize to completion.
+        optimizer=tf.optimizers.Adam(0.1),
+        sample_size=10)
+
+    # Compute posterior statistics.
+    with tf.control_dependencies([losses]):
+      posterior_samples = surrogate_posterior.sample(100)
+      posterior_mean = tf.nest.map_structure(tf.reduce_mean, posterior_samples)
+      posterior_stddev = tf.nest.map_structure(tf.math.reduce_std,
+                                               posterior_samples)
+
+    self.evaluate(tf1.global_variables_initializer())
+    _ = self.evaluate(losses)
+    _ = self.evaluate(posterior_mean)
+    _ = self.evaluate(posterior_stddev)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestEightSchools(test_util.TestCase,
+                                             _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+    treatment_effects = tf.constant([28, 8, -3, 7, -1, 1, 18, 12],
+                                    dtype=tf.float32)
+    num_schools = ps.shape(treatment_effects)[-1]
+
+    return tfd.JointDistributionNamed({
+        'avg_effect':
+            tfd.Normal(loc=0., scale=10., name='avg_effect'),
+        'log_stddev':
+            tfd.Normal(loc=5., scale=1., name='log_stddev'),
+        'school_effects':
+            lambda log_stddev, avg_effect: (  # pylint: disable=g-long-lambda
+                tfd.Independent(
+                    tfd.Normal(
+                        loc=avg_effect[..., None] * tf.ones(num_schools),
+                        scale=tf.exp(log_stddev[..., None]) * tf.ones(
+                            num_schools),
+                        name='school_effects'),
+                    reinterpreted_batch_ndims=1))
+    })
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestEightSchoolsSample(test_util.TestCase,
+                                                   _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    return tfd.JointDistributionNamed({
+        'avg_effect':
+            tfd.Normal(loc=0., scale=10., name='avg_effect'),
+        'log_stddev':
+            tfd.Normal(loc=5., scale=1., name='log_stddev'),
+        'school_effects':
+            lambda log_stddev, avg_effect: (  # pylint: disable=g-long-lambda
+                tfd.Sample(
+                    tfd.Normal(
+                        loc=avg_effect[..., None],
+                        scale=tf.exp(log_stddev[..., None]),
+                        name='school_effects'),
+                    sample_shape=[8]))
+    })
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestHalfNormal(test_util.TestCase,
+                                           _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    def _prior_model_fn():
+      innovation_noise = 1.
+      yield tfd.HalfNormal(
+          scale=innovation_noise, validate_args=True, allow_nan_stats=False)
+
+    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestDiscreteLatent(
+    test_util.TestCase, _TrainableCFSurrogate):
+
+  def make_prior_dist(self):
+
+    def _prior_model_fn():
+      a = yield tfd.Bernoulli(logits=0.5, name='a')
+      yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1.,
+                       scale=1., name='b')
+
+    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
+
+
+@test_util.test_all_tf_execution_regimes
+class CFSurrogatePosteriorTestNesting(test_util.TestCase,
+                                        _TrainableCFSurrogate):
+
+  def _expected_num_trainable_variables(self, _):
+    # Nested distributions have total of 10 params after Exponential->Gamma
+    # substitution, multiplied by 2 variables per param.
+    return 20
+
+  def make_prior_dist(self):
+
+    def nested_model():
+      a = yield tfd.Sample(
+          tfd.Sample(
+              tfd.Normal(0., 1.),
+              sample_shape=4),
+          sample_shape=[2],
+          name='a')
+      b = yield tfb.Sigmoid()(
+          tfb.Square()(
+              tfd.Exponential(rate=tf.exp(a))),
+          name='b')
+      # pylint: disable=g-long-lambda
+      yield tfd.JointDistributionSequential(
+          [tfd.Laplace(loc=a, scale=b),
+           lambda c1: tfd.Independent(
+               tfd.Beta(concentration1=1.,
+                        concentration0=tf.nn.softplus(c1)),
+               reinterpreted_batch_ndims=1),
+           lambda c1, c2: tfd.JointDistributionNamed({
+               'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)})
+           ], name='c')
+      # pylint: enable=g-long-lambda
+
+    return tfd.JointDistributionCoroutineAutoBatched(nested_model)
+
+
+@test_util.test_all_tf_execution_regimes
+class TestCFDistributionSubstitution(test_util.TestCase):
+
+  def test_default_substitutes_trainable_families(self):
+
+    @tfd.JointDistributionCoroutineAutoBatched
+    def model():
+      yield tfd.Sample(
+          tfd.Uniform(low=-2., high=7.),
+          sample_shape=[2],
+          name='a')
+      yield tfd.HalfNormal(1., name='b')
+      yield tfd.Exponential(rate=[1., 2.], name='c')
+      yield tfd.Chi2(df=3., name='d')
+
+    surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
+        model)
+    self.assertAllEqualNested(model.event_shape, surrogate.event_shape)
+
+    surrogate_dists, _ = surrogate.sample_distributions()
+    self.assertIsInstance(surrogate_dists.a, tfd.Independent)
+    self.assertIsInstance(surrogate_dists.a.distribution,
+                          tfd.TransformedDistribution)
+    self.assertIsInstance(surrogate_dists.a.distribution.distribution,
+                          tfd.Beta)
+    self.assertIsInstance(surrogate_dists.b, tfd.TruncatedNormal)
+    self.assertIsInstance(surrogate_dists.c, tfd.Gamma)
+    self.assertIsInstance(surrogate_dists.d, tfd.Gamma)
+
+  def test_can_specify_custom_substitution(self):
+
+    @tfd.JointDistributionCoroutineAutoBatched
+    def centered_horseshoe(ndims=100):
+      global_scale = yield tfd.HalfCauchy(
+          loc=0., scale=1., name='global_scale')
+      local_scale = yield tfd.HalfCauchy(
+          loc=0., scale=tf.ones([ndims]), name='local_scale')
+      yield tfd.Normal(
+          loc=0., scale=tf.sqrt(global_scale * local_scale), name='weights')
+
+    tfp.experimental.vi.register_asvi_substitution_rule(
+        condition=tfd.HalfCauchy,
+        substitution_fn=(
+            lambda d: tfb.Softplus(1e-6)(tfd.Normal(loc=d.loc, scale=d.scale))))
+    surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
+        centered_horseshoe)
+    self.assertAllEqualNested(centered_horseshoe.event_shape,
+                              surrogate.event_shape)
+
+    # If the surrogate was built with names or structure differing from the
+    # model, so that it had to be `tfb.Restructure`'d, then this
+    # sample_distributions call will fail because the surrogate isn't an
+    # instance of tfd.JointDistribution.
+    surrogate_dists, _ = surrogate.sample_distributions()
+    self.assertIsInstance(surrogate_dists.global_scale.distribution,
+                          tfd.Normal)
+    self.assertIsInstance(surrogate_dists.local_scale.distribution,
+                          tfd.Normal)
+    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
+
+# TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions.
+# TODO(kateslin): Add an ASVI surrogate posterior test for a model with
+#  missing observations.
+
+if __name__ == '__main__':
+  tf.test.main()

From a89d60a746ef75c4e0acb7c872c0c5b290703e9d Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 10:56:49 +0200
Subject: [PATCH 20/54] reverted to latest version

---
 .../python/experimental/vi/cascading_flows.py | 101 ++++++++++++++----
 1 file changed, 80 insertions(+), 21 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index d8c9393d8e..61dcce7236 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -25,21 +25,20 @@
 
 import tensorflow.compat.v2 as tf
 
-from tensorflow_probability.python.experimental.bijectors import \
-  build_highway_flow_layer
 from tensorflow_probability.python.bijectors import chain
 from tensorflow_probability.python.bijectors import reshape
 from tensorflow_probability.python.bijectors import scale as scale_lib
 from tensorflow_probability.python.bijectors import shift
 from tensorflow_probability.python.bijectors import split
-
 from tensorflow_probability.python.distributions import batch_broadcast
 from tensorflow_probability.python.distributions import beta
 from tensorflow_probability.python.distributions import blockwise
 from tensorflow_probability.python.distributions import chi2
+from tensorflow_probability.python.distributions import deterministic
 from tensorflow_probability.python.distributions import exponential
 from tensorflow_probability.python.distributions import gamma
 from tensorflow_probability.python.distributions import half_normal
+from tensorflow_probability.python.distributions import independent
 from tensorflow_probability.python.distributions import \
   joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
@@ -49,10 +48,12 @@
 from tensorflow_probability.python.distributions import transformed_distribution
 from tensorflow_probability.python.distributions import truncated_normal
 from tensorflow_probability.python.distributions import uniform
+from tensorflow_probability.python.experimental.bijectors import \
+  build_highway_flow_layer
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
-  'register_asvi_substitution_rule',
+  'register_cf_substitution_rule',
   'build_cf_surrogate_posterior'
 ]
 
@@ -83,7 +84,7 @@ def _as_substituted_distribution(distribution):
 
 
 # Todo: inherited from asvi code, do we need this?
-def register_asvi_substitution_rule(condition, substitution_fn):
+def register_cf_substitution_rule(condition, substitution_fn):
   """Registers a rule for substituting distributions in ASVI surrogates.
 
   Args:
@@ -132,20 +133,20 @@ def register_asvi_substitution_rule(condition, substitution_fn):
 # Default substitutions attempt to express distributions using the most
 # flexible available parameterization.
 # pylint: disable=g-long-lambda
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   half_normal.HalfNormal,
   lambda dist: truncated_normal.TruncatedNormal(
     loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   uniform.Uniform,
   lambda dist: shift.Shift(dist.low)(
     scale_lib.Scale(dist.high - dist.low)(
       beta.Beta(concentration0=tf.ones_like(dist.mean()),
                 concentration1=1.))))
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   exponential.Exponential,
   lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
-register_asvi_substitution_rule(
+register_cf_substitution_rule(
   chi2.Chi2,
   lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
 
@@ -255,6 +256,7 @@ def model_fn():
         _cf_convex_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
         num_auxiliary_variables=num_auxiliary_variables),
+      num_auxiliary_variables=num_auxiliary_variables,
       seed=seed)
     surrogate_posterior.also_track = variables
     return surrogate_posterior
@@ -264,6 +266,8 @@ def _cf_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
                                    sample_shape=None,
                                    variables=None,
+                                   num_auxiliary_variables=0,
+                                   global_auxiliary_variables=None,
                                    seed=None):
   # todo: change docstrings
   """Recursively creates ASVI surrogates, and creates new variables if needed.
@@ -303,15 +307,19 @@ def _cf_surrogate_for_distribution(dist,
       dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       variables=variables,
+      num_auxiliary_variables=num_auxiliary_variables,
+      global_auxiliary_variables=global_auxiliary_variables,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables, seed=seed)
+      dist=dist, sample_shape=sample_shape, variables=variables,
+      global_auxiliary_variables=global_auxiliary_variables, seed=seed)
   return surrogate_posterior, variables
 
 
 def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables=None, seed=None):
+    dist, base_distribution_surrogate_fn, variables=None,
+    num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
   # Probabilistic program for CF surrogate posterior.
@@ -322,7 +330,46 @@ def _cf_surrogate_for_joint_distribution(
   def posterior_generator(seed=seed):
     prior_gen = prior_coroutine()
     dist = next(prior_gen)
-    i = 0
+
+    if num_auxiliary_variables > 0:
+      i = 1
+
+      if flat_variables:
+        variables = flat_variables[0]
+
+      else:
+        layers = 3
+        bijectors = []
+
+        for _ in range(0, layers - 1):
+          bijectors.append(
+            build_highway_flow_layer(num_auxiliary_variables,
+                                     residual_fraction_initial_value=0.5,
+                                     activation_fn=True, gate_first_n=0,
+                                     seed=seed))
+        bijectors.append(
+          build_highway_flow_layer(num_auxiliary_variables,
+                                   residual_fraction_initial_value=0.5,
+                                   activation_fn=False, gate_first_n=0,
+                                   seed=seed))
+
+        variables = chain.Chain(bijectors=list(reversed(bijectors)))
+
+      eps = transformed_distribution.TransformedDistribution(
+        distribution=sample.Sample(normal.Normal(0., 0.1),
+                                   num_auxiliary_variables),
+        bijector=variables)
+
+      eps = Root(eps)
+
+      value_out = yield (eps if flat_variables
+                         else (eps, variables))
+
+      global_auxiliary_variables = value_out
+
+    else:
+      i = 0
+
     try:
       while True:
         was_root = isinstance(dist, Root)
@@ -334,9 +381,10 @@ def posterior_generator(seed=seed):
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
           variables=flat_variables[i] if flat_variables else None,
+          global_auxiliary_variables=global_auxiliary_variables,
           seed=init_seed)
 
-        if was_root:
+        if was_root and num_auxiliary_variables == 0:
           surrogate_posterior = Root(surrogate_posterior)
         # If variables were not given---i.e., we're creating new
         # variables---then yield the new variables along with the surrogate
@@ -367,6 +415,8 @@ def posterior_generator(seed=seed):
     return _cf_surrogate_for_joint_distribution(
       dist=dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      num_auxiliary_variables=num_auxiliary_variables,
+      global_auxiliary_variables=global_auxiliary_variables,
       variables=dist._model_unflatten(  # pylint: disable=protected-access
         _extract_variables_from_coroutine_model(
           posterior_generator, seed=seed)))
@@ -401,6 +451,7 @@ def posterior_generator(seed=seed):
 def _cf_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables=0,
+                                            global_auxiliary_variables=None,
                                             sample_shape=None,
                                             variables=None,
                                             seed=None):
@@ -412,31 +463,39 @@ def _cf_convex_update_for_base_distribution(dist,
       actual_event_shape.shape.as_list()[0] > 0 else 1
     layers = 3
     bijectors = [reshape.Reshape([-1],
-                             event_shape_in=actual_event_shape +
-                                            num_auxiliary_variables)]
+                                 event_shape_in=actual_event_shape +
+                                                num_auxiliary_variables)]
 
     for _ in range(0, layers - 1):
       bijectors.append(
         build_highway_flow_layer(
           tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
           residual_fraction_initial_value=initial_prior_weight,
-          activation_fn=True, gate_first_n=int_event_shape))
+          activation_fn=True, gate_first_n=int_event_shape, seed=seed))
     bijectors.append(
       build_highway_flow_layer(
         tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
-        activation_fn=False, gate_first_n=int_event_shape))
-    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+        activation_fn=False, gate_first_n=int_event_shape, seed=seed))
+    bijectors.append(
+      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
   if num_auxiliary_variables > 0:
+    batch_shape = global_auxiliary_variables.shape[0] if len(
+      global_auxiliary_variables.shape) > 1 else []
+
     cascading_flows = split.Split(
       [-1, num_auxiliary_variables])(
       transformed_distribution.TransformedDistribution(
-        distribution=blockwise.Blockwise([dist, batch_broadcast.BatchBroadcast(
-          sample.Sample(normal.Normal(0., .1), num_auxiliary_variables),
-          to_shape=dist.batch_shape)]),
+        distribution=blockwise.Blockwise([
+          batch_broadcast.BatchBroadcast(dist,
+                                         to_shape=batch_shape),
+          independent.Independent(
+            deterministic.Deterministic(
+              global_auxiliary_variables),
+            reinterpreted_batch_ndims=1)]),
         bijector=variables))
 
   else:

From ea80d7bac076dfd3b971bcbfb0a31653d03b7ac0 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 14 May 2021 11:22:31 +0200
Subject: [PATCH 21/54] fixed surrogate posterior type

---
 .../python/experimental/vi/cascading_flows.py        | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 61dcce7236..95c7cf5faf 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -39,8 +39,7 @@
 from tensorflow_probability.python.distributions import gamma
 from tensorflow_probability.python.distributions import half_normal
 from tensorflow_probability.python.distributions import independent
-from tensorflow_probability.python.distributions import \
-  joint_distribution_auto_batched
+from tensorflow_probability.python.distributions import joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
   joint_distribution_coroutine
 from tensorflow_probability.python.distributions import normal
@@ -422,11 +421,10 @@ def posterior_generator(seed=seed):
           posterior_generator, seed=seed)))
 
   # Temporary workaround for bijector caching issues with autobatched JDs.
-  surrogate_type = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched
-  if not hasattr(dist, 'use_vectorized_map'):
-    surrogate_type = joint_distribution_coroutine.JointDistributionCoroutine
-  surrogate_posterior = surrogate_type(posterior_generator,
-                                       name=_get_name(dist))
+  surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
+    posterior_generator,
+    use_vectorized_map=dist.use_vectorized_map,
+    name=_get_name(dist))
 
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified

From cf11c70ac38c55f49cec1132e6e38d7faa941d23 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 10:47:34 +0200
Subject: [PATCH 22/54] small fixes

---
 .../python/experimental/vi/cascading_flows.py | 37 ++++++++-----------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 95c7cf5faf..a9735f3739 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -156,7 +156,7 @@ def register_cf_substitution_rule(condition, substitution_fn):
 def build_cf_surrogate_posterior(
     prior,
     num_auxiliary_variables=0,
-    initial_prior_weight=0.5,
+    initial_prior_weight=0.98,
     seed=None,
     name=None):
   # todo: change docstrings
@@ -311,14 +311,12 @@ def _cf_surrogate_for_distribution(dist,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables,
-      global_auxiliary_variables=global_auxiliary_variables, seed=seed)
+      dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed)
   return surrogate_posterior, variables
 
 
 def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables=None,
-    num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
+    dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
   # Probabilistic program for CF surrogate posterior.
@@ -343,19 +341,17 @@ def posterior_generator(seed=seed):
         for _ in range(0, layers - 1):
           bijectors.append(
             build_highway_flow_layer(num_auxiliary_variables,
-                                     residual_fraction_initial_value=0.5,
-                                     activation_fn=True, gate_first_n=0,
-                                     seed=seed))
+              residual_fraction_initial_value=0.98,
+              activation_fn=True, gate_first_n=0, seed=seed))
         bijectors.append(
           build_highway_flow_layer(num_auxiliary_variables,
-                                   residual_fraction_initial_value=0.5,
-                                   activation_fn=False, gate_first_n=0,
-                                   seed=seed))
+            residual_fraction_initial_value=0.98,
+            activation_fn=False, gate_first_n=0, seed=seed))
 
         variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
       eps = transformed_distribution.TransformedDistribution(
-        distribution=sample.Sample(normal.Normal(0., 0.1),
+        distribution=sample.Sample(normal.Normal(0., 1.),
                                    num_auxiliary_variables),
         bijector=variables)
 
@@ -380,7 +376,7 @@ def posterior_generator(seed=seed):
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
           variables=flat_variables[i] if flat_variables else None,
-          global_auxiliary_variables=global_auxiliary_variables,
+          global_auxiliary_variables = global_auxiliary_variables,
           seed=init_seed)
 
         if was_root and num_auxiliary_variables == 0:
@@ -422,9 +418,9 @@ def posterior_generator(seed=seed):
 
   # Temporary workaround for bijector caching issues with autobatched JDs.
   surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
-    posterior_generator,
-    use_vectorized_map=dist.use_vectorized_map,
-    name=_get_name(dist))
+      posterior_generator,
+      use_vectorized_map=dist.use_vectorized_map,
+      name=_get_name(dist))
 
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified
@@ -461,8 +457,8 @@ def _cf_convex_update_for_base_distribution(dist,
       actual_event_shape.shape.as_list()[0] > 0 else 1
     layers = 3
     bijectors = [reshape.Reshape([-1],
-                                 event_shape_in=actual_event_shape +
-                                                num_auxiliary_variables)]
+                             event_shape_in=actual_event_shape +
+                                            num_auxiliary_variables)]
 
     for _ in range(0, layers - 1):
       bijectors.append(
@@ -475,8 +471,7 @@ def _cf_convex_update_for_base_distribution(dist,
         tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
         activation_fn=False, gate_first_n=int_event_shape, seed=seed))
-    bijectors.append(
-      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
@@ -489,7 +484,7 @@ def _cf_convex_update_for_base_distribution(dist,
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
           batch_broadcast.BatchBroadcast(dist,
-                                         to_shape=batch_shape),
+                                        to_shape=batch_shape),
           independent.Independent(
             deterministic.Deterministic(
               global_auxiliary_variables),

From d9e28288bb82fce607a57a0b796e1e627abecd9a Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 11:09:26 +0200
Subject: [PATCH 23/54] fixed global variables if no auxiliary variables

---
 tensorflow_probability/python/experimental/vi/cascading_flows.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index a9735f3739..ef9f6f78da 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -363,6 +363,7 @@ def posterior_generator(seed=seed):
       global_auxiliary_variables = value_out
 
     else:
+      global_auxiliary_variables = None
       i = 0
 
     try:

From 80e8ee7be1de52270cccff87a99cbecc1e677841 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 11:51:56 +0200
Subject: [PATCH 24/54] added number of layers parameter

---
 .../python/experimental/vi/cascading_flows.py | 81 ++++++++++++-------
 1 file changed, 50 insertions(+), 31 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index ef9f6f78da..9b430bf6a6 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -39,16 +39,17 @@
 from tensorflow_probability.python.distributions import gamma
 from tensorflow_probability.python.distributions import half_normal
 from tensorflow_probability.python.distributions import independent
-from tensorflow_probability.python.distributions import joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
-  joint_distribution_coroutine
+    joint_distribution_auto_batched
+from tensorflow_probability.python.distributions import \
+    joint_distribution_coroutine
 from tensorflow_probability.python.distributions import normal
 from tensorflow_probability.python.distributions import sample
 from tensorflow_probability.python.distributions import transformed_distribution
 from tensorflow_probability.python.distributions import truncated_normal
 from tensorflow_probability.python.distributions import uniform
 from tensorflow_probability.python.experimental.bijectors import \
-  build_highway_flow_layer
+    build_highway_flow_layer
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
@@ -157,6 +158,7 @@ def build_cf_surrogate_posterior(
     prior,
     num_auxiliary_variables=0,
     initial_prior_weight=0.98,
+    num_layers=3,
     seed=None,
     name=None):
   # todo: change docstrings
@@ -254,8 +256,10 @@ def model_fn():
       base_distribution_surrogate_fn=functools.partial(
         _cf_convex_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
-        num_auxiliary_variables=num_auxiliary_variables),
+        num_auxiliary_variables=num_auxiliary_variables,
+        num_layers=num_layers),
       num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
       seed=seed)
     surrogate_posterior.also_track = variables
     return surrogate_posterior
@@ -263,9 +267,10 @@ def model_fn():
 
 def _cf_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
+                                   num_auxiliary_variables,
+                                   num_layers,
                                    sample_shape=None,
                                    variables=None,
-                                   num_auxiliary_variables=0,
                                    global_auxiliary_variables=None,
                                    seed=None):
   # todo: change docstrings
@@ -307,16 +312,22 @@ def _cf_surrogate_for_distribution(dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       variables=variables,
       num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
       global_auxiliary_variables=global_auxiliary_variables,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables, global_auxiliary_variables=global_auxiliary_variables, seed=seed)
+      dist=dist, sample_shape=sample_shape, variables=variables,
+      global_auxiliary_variables=global_auxiliary_variables,
+      num_layers=num_layers,
+      seed=seed)
   return surrogate_posterior, variables
 
 
 def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables=None, num_auxiliary_variables=0, global_auxiliary_variables=None, seed=None):
+    dist, base_distribution_surrogate_fn, variables,
+    num_auxiliary_variables, num_layers, global_auxiliary_variables,
+    seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
   # Probabilistic program for CF surrogate posterior.
@@ -335,18 +346,17 @@ def posterior_generator(seed=seed):
         variables = flat_variables[0]
 
       else:
-        layers = 3
         bijectors = []
 
-        for _ in range(0, layers - 1):
+        for _ in range(0, num_layers - 1):
           bijectors.append(
             build_highway_flow_layer(num_auxiliary_variables,
-              residual_fraction_initial_value=0.98,
-              activation_fn=True, gate_first_n=0, seed=seed))
+                                     activation_fn=True,
+                                     gate_first_n=0, seed=seed))
         bijectors.append(
           build_highway_flow_layer(num_auxiliary_variables,
-            residual_fraction_initial_value=0.98,
-            activation_fn=False, gate_first_n=0, seed=seed))
+                                   activation_fn=False,
+                                   gate_first_n=0, seed=seed))
 
         variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
@@ -376,8 +386,10 @@ def posterior_generator(seed=seed):
         surrogate_posterior, variables = _cf_surrogate_for_distribution(
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+          num_auxiliary_variables=num_auxiliary_variables,
+          num_layers=num_layers,
           variables=flat_variables[i] if flat_variables else None,
-          global_auxiliary_variables = global_auxiliary_variables,
+          global_auxiliary_variables=global_auxiliary_variables,
           seed=init_seed)
 
         if was_root and num_auxiliary_variables == 0:
@@ -412,16 +424,18 @@ def posterior_generator(seed=seed):
       dist=dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
       global_auxiliary_variables=global_auxiliary_variables,
-      variables=dist._model_unflatten(  # pylint: disable=protected-access
+      variables=dist._model_unflatten(
+        # pylint: disable=protected-access
         _extract_variables_from_coroutine_model(
           posterior_generator, seed=seed)))
 
   # Temporary workaround for bijector caching issues with autobatched JDs.
   surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
-      posterior_generator,
-      use_vectorized_map=dist.use_vectorized_map,
-      name=_get_name(dist))
+    posterior_generator,
+    use_vectorized_map=dist.use_vectorized_map,
+    name=_get_name(dist))
 
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified
@@ -445,10 +459,11 @@ def posterior_generator(seed=seed):
 # todo: sample_shape and seed are not used.. maybe they should?
 def _cf_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
-                                            num_auxiliary_variables=0,
-                                            global_auxiliary_variables=None,
+                                            num_auxiliary_variables,
+                                            num_layers,
+                                            global_auxiliary_variables,
+                                            variables,
                                             sample_shape=None,
-                                            variables=None,
                                             seed=None):
   """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
 
@@ -456,23 +471,27 @@ def _cf_convex_update_for_base_distribution(dist,
     actual_event_shape = dist.event_shape_tensor()
     int_event_shape = int(actual_event_shape) if \
       actual_event_shape.shape.as_list()[0] > 0 else 1
-    layers = 3
     bijectors = [reshape.Reshape([-1],
-                             event_shape_in=actual_event_shape +
-                                            num_auxiliary_variables)]
+                                 event_shape_in=actual_event_shape +
+                                                num_auxiliary_variables)]
 
-    for _ in range(0, layers - 1):
+    for _ in range(0, num_layers - 1):
       bijectors.append(
         build_highway_flow_layer(
-          tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+          tf.reduce_prod(
+            actual_event_shape + num_auxiliary_variables),
           residual_fraction_initial_value=initial_prior_weight,
-          activation_fn=True, gate_first_n=int_event_shape, seed=seed))
+          activation_fn=True, gate_first_n=int_event_shape,
+          seed=seed))
     bijectors.append(
       build_highway_flow_layer(
-        tf.reduce_prod(actual_event_shape + num_auxiliary_variables),
+        tf.reduce_prod(
+          actual_event_shape + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
-        activation_fn=False, gate_first_n=int_event_shape, seed=seed))
-    bijectors.append(reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+        activation_fn=False, gate_first_n=int_event_shape,
+        seed=seed))
+    bijectors.append(
+      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
@@ -485,7 +504,7 @@ def _cf_convex_update_for_base_distribution(dist,
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
           batch_broadcast.BatchBroadcast(dist,
-                                        to_shape=batch_shape),
+                                         to_shape=batch_shape),
           independent.Independent(
             deterministic.Deterministic(
               global_auxiliary_variables),

From 13602296347ccea0763cea8af8b729fc8c100226 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 18 May 2021 11:59:21 +0200
Subject: [PATCH 25/54] readded highway flow

---
 .../python/experimental/bijectors/BUILD       | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD
index 9f7afce0b8..6befb7bf81 100644
--- a/tensorflow_probability/python/experimental/bijectors/BUILD
+++ b/tensorflow_probability/python/experimental/bijectors/BUILD
@@ -148,6 +148,20 @@ multi_substrate_py_library(
     ],
 )
 
+multi_substrate_py_library(
+    name = "highway_flow",
+    srcs = ["highway_flow.py"],
+    srcs_version = "PY3",
+    deps = [
+        ":scalar_function_with_inferred_inverse",
+        # numpy dep,
+        # tensorflow dep,
+        "//tensorflow_probability/python/bijectors",
+        "//tensorflow_probability/python/util",
+        "//tensorflow_probability/python/internal:samplers",
+    ],
+)
+
 multi_substrate_py_test(
     name = "sharded_test",
     size = "medium",
@@ -164,3 +178,18 @@ multi_substrate_py_test(
         "//tensorflow_probability/python/internal:test_util",
     ],
 )
+
+multi_substrate_py_test(
+    name = "highway_flow_test",
+    size = "medium",
+    srcs = ["highway_flow_test.py"],
+    jax_size = "medium",
+    python_version = "PY3",
+    srcs_version = "PY3",
+    deps = [
+        # numpy dep
+        # tensorflow dep,
+        "//tensorflow_probability",
+        "//tensorflow_probability/python/internal:test_util",
+    ],
+)
\ No newline at end of file

From e1a22184a9cb454fc472ffc47d46c82241f4f5e0 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 10:05:36 +0200
Subject: [PATCH 26/54] fixed init

---
 tensorflow_probability/python/experimental/vi/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py
index e18c8d3455..cc5530300a 100644
--- a/tensorflow_probability/python/experimental/vi/__init__.py
+++ b/tensorflow_probability/python/experimental/vi/__init__.py
@@ -17,6 +17,7 @@
 from tensorflow_probability.python.experimental.vi import util
 from tensorflow_probability.python.experimental.vi.automatic_structured_vi import build_asvi_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.automatic_structured_vi import register_asvi_substitution_rule
+from tensorflow_probability.python.experimental.vi.cascading_flows import build_cf_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior_from_base_distribution
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_factored_surrogate_posterior
@@ -29,7 +30,7 @@
     'build_affine_surrogate_posterior',
     'build_affine_surrogate_posterior_from_base_distribution',
     'build_asvi_surrogate_posterior',
-    'builf_cf_surrogate_posterior'
+    'build_cf_surrogate_posterior',
     'build_factored_surrogate_posterior',
     'build_split_flow_surrogate_posterior',
     'build_trainable_location_scale_distribution',

From 4f667ee1f89daa467ef3d7e25cb3f536d6990061 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 10:08:45 +0200
Subject: [PATCH 27/54] working on tests

---
 .../experimental/vi/cascading_flows_test.py    | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 9c4393be24..598d3fd66e 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -30,7 +30,7 @@
 
 tfb = tfp.bijectors
 tfd = tfp.distributions
-
+ # test_util.test_seed(sampler_type='stateless'))
 
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
@@ -65,7 +65,7 @@ def test_dims_and_gradients(self):
                    self._expected_num_trainable_variables(prior_dist))
 
     # Test that the sample shape is correct
-    three_posterior_samples = surrogate_posterior.sample(
+    '''three_posterior_samples = surrogate_posterior.sample(
         3, seed=test_util.test_seed(sampler_type='stateless'))
     three_prior_samples = prior_dist.sample(
         3, seed=test_util.test_seed(sampler_type='stateless'))
@@ -74,15 +74,15 @@ def test_dims_and_gradients(self):
         [s.shape for s in tf.nest.flatten(three_posterior_samples)])
 
     # Test that gradients are available wrt the variational parameters.
-    posterior_sample = surrogate_posterior.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+   posterior_sample = surrogate_posterior.sample(
+        seed=1)
     with tf.GradientTape() as tape:
       posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
     grad = tape.gradient(posterior_logprob,
                          surrogate_posterior.trainable_variables)
-    self.assertTrue(all(g is not None for g in grad))
+    self.assertTrue(all(g is not None for g in grad))'''
 
-  def test_initialization_is_deterministic_following_seed(self):
+  '''def test_initialization_is_deterministic_following_seed(self):
     prior_dist = self.make_prior_dist()
 
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
@@ -101,7 +101,7 @@ def test_initialization_is_deterministic_following_seed(self):
     posterior_sample2 = surrogate_posterior2.sample(
         seed=test_util.test_seed(sampler_type='stateless'))
 
-    self.assertAllEqualNested(posterior_sample, posterior_sample2)
+    self.assertAllEqualNested(posterior_sample, posterior_sample2)'''
 
 
 @test_util.test_all_tf_execution_regimes
@@ -144,7 +144,7 @@ def target_log_prob(*x):
 
     return target_log_prob
 
-  def test_fitting_surrogate_posterior(self):
+  '''def test_fitting_surrogate_posterior(self):
 
     prior_dist = self.make_prior_dist()
     observations = self.get_observations(prior_dist)
@@ -170,7 +170,7 @@ def test_fitting_surrogate_posterior(self):
     self.evaluate(tf1.global_variables_initializer())
     _ = self.evaluate(losses)
     _ = self.evaluate(posterior_mean)
-    _ = self.evaluate(posterior_stddev)
+    _ = self.evaluate(posterior_stddev)'''
 
 
 @test_util.test_all_tf_execution_regimes

From 75d8b53ebb000c50f262c7dce905c1989cc51ea5 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 20 May 2021 11:39:55 +0200
Subject: [PATCH 28/54] more testing

---
 .../experimental/vi/cascading_flows_test.py   | 57 +++++++------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 598d3fd66e..b52e1e5f77 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -35,73 +35,59 @@
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
-  def _expected_num_trainable_variables(self, prior_dist):
+  def _expected_num_trainable_variables(self, prior_dist, num_layers):
     """Infers the expected number of trainable variables for a non-nested JD."""
     prior_dists = prior_dist._get_single_sample_distributions()  # pylint: disable=protected-access
     expected_num_trainable_variables = 0
+
+    # For each distribution in the prior, we will have one highway flow with
+    # `num_layers` blocks, and each block has 4 trainable variables:
+    # `residual_fraction`, `lower_diagonal_weights_matrix`,
+    # `upper_diagonal_weights_matrix` and `bias`.
     for original_dist in prior_dists:
-      try:
-        original_dist = original_dist.distribution
-      except AttributeError:
-        pass
-      dist = cascading_flows._as_substituted_distribution(original_dist)
-      dist_params = dist.parameters
-      for param, value in dist_params.items():
-        if (param not in cascading_flows._NON_STATISTICAL_PARAMS
-            and value is not None and param not in ('low', 'high')):
-          # One variable each for prior_weight, mean_field_parameter.
-          expected_num_trainable_variables += 2
+      expected_num_trainable_variables += (4 * num_layers)
     return expected_num_trainable_variables
 
   def test_dims_and_gradients(self):
 
     prior_dist = self.make_prior_dist()
-
+    num_layers = 3
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
-        prior=prior_dist)
+        prior=prior_dist, num_layers=num_layers)
 
     # Test that the correct number of trainable variables are being tracked
     self.assertLen(surrogate_posterior.trainable_variables,
-                   self._expected_num_trainable_variables(prior_dist))
+                   self._expected_num_trainable_variables(prior_dist, num_layers))
 
     # Test that the sample shape is correct
-    '''three_posterior_samples = surrogate_posterior.sample(
-        3, seed=test_util.test_seed(sampler_type='stateless'))
+    three_posterior_samples = surrogate_posterior.sample(
+        3, seed=1)
     three_prior_samples = prior_dist.sample(
-        3, seed=test_util.test_seed(sampler_type='stateless'))
+        3, seed=1)
     self.assertAllEqualNested(
         [s.shape for s in tf.nest.flatten(three_prior_samples)],
         [s.shape for s in tf.nest.flatten(three_posterior_samples)])
 
-    # Test that gradients are available wrt the variational parameters.
-   posterior_sample = surrogate_posterior.sample(
-        seed=1)
-    with tf.GradientTape() as tape:
-      posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
-    grad = tape.gradient(posterior_logprob,
-                         surrogate_posterior.trainable_variables)
-    self.assertTrue(all(g is not None for g in grad))'''
-
-  '''def test_initialization_is_deterministic_following_seed(self):
+  def test_initialization_is_deterministic_following_seed(self):
     prior_dist = self.make_prior_dist()
 
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
         prior=prior_dist,
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
     self.evaluate(
         [v.initializer for v in surrogate_posterior.trainable_variables])
     posterior_sample = surrogate_posterior.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
 
     surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
         prior=prior_dist,
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
     self.evaluate(
         [v.initializer for v in surrogate_posterior2.trainable_variables])
     posterior_sample2 = surrogate_posterior2.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=1)
 
-    self.assertAllEqualNested(posterior_sample, posterior_sample2)'''
+    self.assertAllEqualNested(posterior_sample, posterior_sample2)
 
 
 @test_util.test_all_tf_execution_regimes
@@ -172,7 +158,6 @@ def target_log_prob(*x):
     _ = self.evaluate(posterior_mean)
     _ = self.evaluate(posterior_stddev)'''
 
-
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestEightSchools(test_util.TestCase,
                                              _TrainableCFSurrogate):
@@ -235,7 +220,7 @@ def _prior_model_fn():
     return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
 
 
-@test_util.test_all_tf_execution_regimes
+'''@test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestDiscreteLatent(
     test_util.TestCase, _TrainableCFSurrogate):
 
@@ -344,7 +329,7 @@ def centered_horseshoe(ndims=100):
                           tfd.Normal)
     self.assertIsInstance(surrogate_dists.local_scale.distribution,
                           tfd.Normal)
-    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
+    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)'''
 
 # TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions.
 # TODO(kateslin): Add an ASVI surrogate posterior test with for a model with

From 65f5adbafb7057d4712bb8adc1f0dc49c87f0431 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 27 May 2021 13:34:20 +0200
Subject: [PATCH 29/54] small refactoring and changed docstrings

---
 .../python/experimental/vi/cascading_flows.py | 890 +++++++++---------
 1 file changed, 471 insertions(+), 419 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 9b430bf6a6..8c9f48222c 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -40,29 +40,33 @@
 from tensorflow_probability.python.distributions import half_normal
 from tensorflow_probability.python.distributions import independent
 from tensorflow_probability.python.distributions import \
-    joint_distribution_auto_batched
+  joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
-    joint_distribution_coroutine
+  joint_distribution_coroutine
 from tensorflow_probability.python.distributions import normal
 from tensorflow_probability.python.distributions import sample
 from tensorflow_probability.python.distributions import transformed_distribution
 from tensorflow_probability.python.distributions import truncated_normal
 from tensorflow_probability.python.distributions import uniform
 from tensorflow_probability.python.experimental.bijectors import \
-    build_highway_flow_layer
+  build_trainable_highway_flow
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
-  'register_cf_substitution_rule',
-  'build_cf_surrogate_posterior'
+    'register_cf_substitution_rule',
+    'build_cf_surrogate_posterior'
 ]
 
 Root = joint_distribution_coroutine.JointDistributionCoroutine.Root
 
+# TODO: the part until the function build_cf_surrogate_posterior is identical to
+#  the one in automatic_structured_vi. Should we remove it from this file and
+#  import them directly from automatic_structured_vi?
+
 _NON_STATISTICAL_PARAMS = [
-  'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum',
-  'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support',
-  'num_probit_terms_approx'
+    'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum',
+    'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support',
+    'num_probit_terms_approx'
 ]
 _NON_TRAINABLE_PARAMS = ['low', 'high']
 
@@ -70,393 +74,448 @@
 # before defining the surrogate family.
 
 
-# Todo: inherited from asvi code, do we need this?
 ASVI_SURROGATE_SUBSTITUTIONS = {}
 
 
-# Todo: inherited from asvi code, do we need this?
 def _as_substituted_distribution(distribution):
-  """Applies all substitution rules that match a distribution."""
-  for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items():
-    if condition(distribution):
-      distribution = substitution_fn(distribution)
-  return distribution
+    """Applies all substitution rules that match a distribution."""
+    for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items():
+        if condition(distribution):
+            distribution = substitution_fn(distribution)
+    return distribution
 
 
-# Todo: inherited from asvi code, do we need this?
 def register_cf_substitution_rule(condition, substitution_fn):
-  """Registers a rule for substituting distributions in ASVI surrogates.
-
-  Args:
-    condition: Python `callable` that takes a Distribution instance and
-      returns a Python `bool` indicating whether or not to substitute it.
-      May also be a class type such as `tfd.Normal`, in which case the
-      condition is interpreted as
-      `lambda distribution: isinstance(distribution, class)`.
-    substitution_fn: Python `callable` that takes a Distribution
-      instance and returns a new Distribution instance used to define
-      the ASVI surrogate posterior. Note that this substitution does not modify
-      the original model.
-
-  #### Example
-
-  To use a Normal surrogate for all location-scale family distributions, we
-  could register the substitution:
-
-  ```python
-  tfp.experimental.vi.register_asvi_surrogate_substitution(
-    condition=lambda distribution: (
-      hasattr(distribution, 'loc') and hasattr(distribution, 'scale'))
-    substitution_fn=lambda distribution: (
-      # Invoking the event space bijector applies any relevant constraints,
-      # e.g., that HalfCauchy samples must be `>= loc`.
-      distribution.experimental_default_event_space_bijector()(
-        tfd.Normal(loc=distribution.loc, scale=distribution.scale)))
-  ```
-
-  This rule will fire when ASVI encounters a location-scale distribution,
-  and instructs ASVI to build a surrogate 'as if' the model had just used a
-  (possibly constrained) Normal in its place. Note that we could have used a
-  more precise condition, e.g., to limit the substitution to distributions with
-  a specific `name`, if we had reason to think that a Normal distribution would
-  be a good surrogate for some model variables but not others.
-
-  """
-  global ASVI_SURROGATE_SUBSTITUTIONS
-  if inspect.isclass(condition):
-    condition = lambda distribution, cls=condition: isinstance(
-      # pylint: disable=g-long-lambda
-      distribution, cls)
-  ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn
+    """Registers a rule for substituting distributions in ASVI surrogates.
+
+    Args:
+      condition: Python `callable` that takes a Distribution instance and
+        returns a Python `bool` indicating whether or not to substitute it.
+        May also be a class type such as `tfd.Normal`, in which case the
+        condition is interpreted as
+        `lambda distribution: isinstance(distribution, class)`.
+      substitution_fn: Python `callable` that takes a Distribution
+        instance and returns a new Distribution instance used to define
+        the ASVI surrogate posterior. Note that this substitution does not modify
+        the original model.
+
+    #### Example
+
+    To use a Normal surrogate for all location-scale family distributions, we
+    could register the substitution:
+
+    ```python
+    tfp.experimental.vi.register_asvi_surrogate_substitution(
+      condition=lambda distribution: (
+        hasattr(distribution, 'loc') and hasattr(distribution, 'scale')),
+      substitution_fn=lambda distribution: (
+        # Invoking the event space bijector applies any relevant constraints,
+        # e.g., that HalfCauchy samples must be `>= loc`.
+        distribution.experimental_default_event_space_bijector()(
+          tfd.Normal(loc=distribution.loc, scale=distribution.scale)))
+    ```
+
+    This rule will fire when ASVI encounters a location-scale distribution,
+    and instructs ASVI to build a surrogate 'as if' the model had just used a
+    (possibly constrained) Normal in its place. Note that we could have used a
+    more precise condition, e.g., to limit the substitution to distributions with
+    a specific `name`, if we had reason to think that a Normal distribution would
+    be a good surrogate for some model variables but not others.
+
+    """
+    global ASVI_SURROGATE_SUBSTITUTIONS
+    if inspect.isclass(condition):
+        condition = lambda distribution, cls=condition: isinstance(
+            # pylint: disable=g-long-lambda
+            distribution, cls)
+    ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn
 
 
 # Default substitutions attempt to express distributions using the most
 # flexible available parameterization.
 # pylint: disable=g-long-lambda
 register_cf_substitution_rule(
-  half_normal.HalfNormal,
-  lambda dist: truncated_normal.TruncatedNormal(
-    loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
+    half_normal.HalfNormal,
+    lambda dist: truncated_normal.TruncatedNormal(
+        loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
 register_cf_substitution_rule(
-  uniform.Uniform,
-  lambda dist: shift.Shift(dist.low)(
-    scale_lib.Scale(dist.high - dist.low)(
-      beta.Beta(concentration0=tf.ones_like(dist.mean()),
-                concentration1=1.))))
+    uniform.Uniform,
+    lambda dist: shift.Shift(dist.low)(
+        scale_lib.Scale(dist.high - dist.low)(
+            beta.Beta(concentration0=tf.ones_like(dist.mean()),
+                      concentration1=1.))))
 register_cf_substitution_rule(
-  exponential.Exponential,
-  lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
+    exponential.Exponential,
+    lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
 register_cf_substitution_rule(
-  chi2.Chi2,
-  lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
+    chi2.Chi2,
+    lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
 
 
 # pylint: enable=g-long-lambda
 
 # a single JointDistribution.
 def build_cf_surrogate_posterior(
-    prior,
-    num_auxiliary_variables=0,
-    initial_prior_weight=0.98,
-    num_layers=3,
-    seed=None,
-    name=None):
-  # todo: change docstrings
-  """Builds a structured surrogate posterior inspired by conjugate updating.
-
-  ASVI, or Automatic Structured Variational Inference, was proposed by
-  Ambrogioni et al. (2020) [1] as a method of automatically constructing a
-  surrogate posterior with the same structure as the prior. It does this by
-  reparameterizing the variational family of the surrogate posterior by
-  structuring each parameter according to the equation
-  ```none
-  prior_weight * prior_parameter + (1 - prior_weight) * mean_field_parameter
-  ```
-  In this equation, `prior_parameter` is a vector of prior parameters and
-  `mean_field_parameter` is a vector of trainable parameters with the same
-  domain as `prior_parameter`. `prior_weight` is a vector of learnable
-  parameters where `0. <= prior_weight <= 1.`. When `prior_weight =
-  0`, the surrogate posterior will be a mean-field surrogate, and when
-  `prior_weight = 1.`, the surrogate posterior will be the prior. This convex
-  combination equation, inspired by conjugacy in exponential families, thus
-  allows the surrogate posterior to balance between the structure of the prior
-  and the structure of a mean-field approximation.
-
-  Args:
-    prior: tfd.JointDistribution instance of the prior.
-    mean_field: Optional Python boolean. If `True`, creates a degenerate
-      surrogate distribution in which all variables are independent,
-      ignoring the prior dependence structure. Default value: `False`.
-    initial_prior_weight: Optional float value (either static or tensor value)
-      on the interval [0, 1]. A larger value creates an initial surrogate
-      distribution with more dependence on the prior structure. Default value:
-      `0.5`.
-    seed: Python `int` seed for random initialization.
-    name: Optional string. Default value: `build_cf_surrogate_posterior`.
-
-  Returns:
-    surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
-    whose samples have shape and structure matching that of `prior`.
-
-  Raises:
-    TypeError: The `prior` argument cannot be a nested `JointDistribution`.
-
-  ### Examples
-
-  Consider a Brownian motion model expressed as a JointDistribution:
-
-  ```python
-  prior_loc = 0.
-  innovation_noise = .1
-
-  def model_fn():
-    new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
-    for i in range(4):
-      new = yield tfd.Normal(loc=new, scale=innovation_noise)
-
-  prior = tfd.JointDistributionCoroutineAutoBatched(model_fn)
-  ```
-
-  Let's use variational inference to approximate the posterior. We'll build a
-  surrogate posterior distribution by feeding in the prior distribution.
-
-  ```python
-  surrogate_posterior =
-    tfp.experimental.vi.build_cf_surrogate_posterior(prior)
-  ```
-
-  This creates a trainable joint distribution, defined by variables in
-  `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
-  to fit this distribution by minimizing a divergence to the true posterior.
-
-  ```python
-  losses = tfp.vi.fit_surrogate_posterior(
-    target_log_prob_fn,
-    surrogate_posterior=surrogate_posterior,
-    num_steps=100,
-    optimizer=tf.optimizers.Adam(0.1),
-    sample_size=10)
-
-  # After optimization, samples from the surrogate will approximate
-  # samples from the true posterior.
-  samples = surrogate_posterior.sample(100)
-  posterior_mean = [tf.reduce_mean(x) for x in samples]
-  posterior_std = [tf.math.reduce_std(x) for x in samples]
-  ```
-
-  #### References
-  [1]: Luca Ambrogioni, Max Hinne, Marcel van Gerven. Automatic structured
-        variational inference. _arXiv preprint arXiv:2002.00643_, 2020
-        https://arxiv.org/abs/2002.00643
-
-  """
-  with tf.name_scope(name or 'build_cf_surrogate_posterior'):
-    surrogate_posterior, variables = _cf_surrogate_for_distribution(
-      dist=prior,
-      base_distribution_surrogate_fn=functools.partial(
-        _cf_convex_update_for_base_distribution,
-        initial_prior_weight=initial_prior_weight,
-        num_auxiliary_variables=num_auxiliary_variables,
-        num_layers=num_layers),
-      num_auxiliary_variables=num_auxiliary_variables,
-      num_layers=num_layers,
-      seed=seed)
-    surrogate_posterior.also_track = variables
-    return surrogate_posterior
+        prior,
+        num_auxiliary_variables=0,
+        initial_prior_weight=0.98,
+        num_layers=3,
+        seed=None,
+        name=None):
+    """Builds a structured surrogate posterior with cascading flows.
+
+    Cascading Flows (CF) [1] is a method that automatically constructs a
+    variational approximation given an input probabilistic program. CF combines
+    ASVI [2] with the flexibility of normalizing flows, by transforming the
+    conditional distributions of the prior program with HighwayFlow architectures,
+    to steer the prior towards the observed data. More details on the HighwayFlow
+    architecture can be found in [1] and in the tfp bijector `HighwayFlow`.
+    It is possible to add auxiliary variables to the prior program to further
+    increase the flexibility of cascading flows, useful especially in the
+    cases where the input program has low dimensionality. The auxiliary variables
+    are sampled from a global linear flow, to account for statistical dependencies
+    among variables, and then transformed with local HighwayFlows together with
+    samples from the prior. Note that when using auxiliary variables it is
+    necessary to modify the variational lower bound [3].
+
+    Args:
+      prior: tfd.JointDistribution instance of the prior.
+      num_auxiliary_variables: The number of auxiliary variables to use for each
+        variable in the input program. Default value: `0`.
+      initial_prior_weight: Optional float value (either static or tensor value)
+        on the interval [0, 1]. A larger value creates an initial surrogate
+        distribution with more dependence on the prior structure. Default value:
+        `0.98`.
+      num_layers: Number of layers to use in each Highway Flow architecture. All
+        the layers will have `softplus` activation function, apart from the last
+        one which will have linear activation. Default value: `3`.
+      seed: Python `int` seed for random initialization.
+      name: Optional string. Default value: `build_cf_surrogate_posterior`.
+
+    Returns:
+      surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
+      whose samples have shape and structure matching that of `prior`.
+
+    Raises:
+      TypeError: The `prior` argument cannot be a nested `JointDistribution`.
+
+    ### Examples
+
+    Consider a Brownian motion model expressed as a JointDistribution:
+
+    ```python
+    prior_loc = 0.
+    innovation_noise = .1
+
+    def model_fn():
+      new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
+      for i in range(4):
+        new = yield tfd.Normal(loc=new, scale=innovation_noise)
+
+    prior = tfd.JointDistributionCoroutineAutoBatched(model_fn)
+    ```
+
+    Let's use variational inference to approximate the posterior. We'll build a
+    surrogate posterior distribution by feeding in the prior distribution.
+
+    ```python
+    surrogate_posterior =
+      tfp.experimental.vi.build_cf_surrogate_posterior(prior)
+    ```
+
+    This creates a trainable joint distribution, defined by variables in
+    `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
+    to fit this distribution by minimizing a divergence to the true posterior.
+
+    ```python
+    losses = tfp.vi.fit_surrogate_posterior(
+      target_log_prob_fn,
+      surrogate_posterior=surrogate_posterior,
+      num_steps=100,
+      optimizer=tf.optimizers.Adam(0.1),
+      sample_size=10)
+
+    # After optimization, samples from the surrogate will approximate
+    # samples from the true posterior.
+    samples = surrogate_posterior.sample(100)
+    posterior_mean = [tf.reduce_mean(x) for x in samples]
+    posterior_std = [tf.math.reduce_std(x) for x in samples]
+    ```
+
+    When using auxiliary variables, the loss and the samples need some
+    modifications, because sampling also returns the global variables and the
+    transformed auxiliary variables:
+
+    ```python
+    num_aux_vars=10
+    target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape(
+      tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1),
+        scale=tf.reshape(tfp.util.TransformedVariable(
+          [tf.random.uniform((1,), minval=0.01, maxval=1.)
+        for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1)
+
+    def target_log_prob_aux_vars(z_and_eps):
+      z = [x[0] for x in z_and_eps[1:]]
+      eps = [x[1] for x in z_and_eps[1:]]
+      lp_z = target_log_prob_fn(z)
+      lp_eps = tf.reshape(tf.reduce_sum(target_dist.log_prob(eps), 0), lp_z.shape)
+      return lp_z + lp_eps
+
+    target_log_prob = lambda *values: target_log_prob_aux_vars(values)
+    cf_surrogate_posterior = build_cf_surrogate_posterior(prior,
+                                            num_auxiliary_variables=num_aux_vars)
+    trainable_variables = list(cf_surrogate_posterior.trainable_variables)
+    trainable_variables.extend(list(target_dist.trainable_variables))
+    cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,
+                                          cf_surrogate_posterior,
+                                          optimizer=tf.optimizers.Adam(0.01),
+                                          num_steps=8000,
+                                          sample_size=50,
+                                          trainable_variables=trainable_variables)
+
+    cf_posterior_samples = cf_surrogate_posterior.sample(num_samples)
+    cf_posterior_samples = tf.convert_to_tensor(
+                                         [s[0] for s in cf_posterior_samples[1:]])
+    ```
+
+    #### References
+    [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. "Automatic
+    variational inference with cascading flows." arXiv preprint arXiv:2102.04801
+    (2021).
+
+    [2]: Ambrogioni, Luca, et al. "Automatic structured variational inference."
+    International Conference on Artificial Intelligence and Statistics. PMLR,
+    2021.
+
+    [3]: Ranganath, Rajesh, Dustin Tran, and David Blei. "Hierarchical variational
+    models." International Conference on Machine Learning. PMLR, 2016.
+
+    """
+    with tf.name_scope(name or 'build_cf_surrogate_posterior'):
+        surrogate_posterior, variables = _cf_surrogate_for_distribution(
+            dist=prior,
+            base_distribution_surrogate_fn=functools.partial(
+                _cf_convex_update_for_base_distribution,
+                initial_prior_weight=initial_prior_weight,
+                num_auxiliary_variables=num_auxiliary_variables,
+                num_layers=num_layers),
+            num_auxiliary_variables=num_auxiliary_variables,
+            num_layers=num_layers,
+            seed=seed)
+        surrogate_posterior.also_track = variables
+        return surrogate_posterior
 
 
 def _cf_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
                                    num_auxiliary_variables,
                                    num_layers,
+                                   global_auxiliary_variables=None,
                                    sample_shape=None,
                                    variables=None,
-                                   global_auxiliary_variables=None,
                                    seed=None):
-  # todo: change docstrings
-  """Recursively creates ASVI surrogates, and creates new variables if needed.
-
-  Args:
-    dist: a `tfd.Distribution` instance.
-    base_distribution_surrogate_fn: Callable to build a surrogate posterior
-      for a 'base' (non-meta and non-joint) distribution, with signature
-      `surrogate_posterior, variables = base_distribution_fn(
-      dist, sample_shape=None, variables=None, seed=None)`.
-    sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
-      `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
-      independent sample dimensions, i.e., it will have event shape
-      `concat([sample_shape, dist.event_shape], axis=0)`.
-      Default value: `None`.
-    variables: Optional nested structure of `tf.Variable`s returned from a
-      previous call to `_cf_surrogate_for_distribution`. If `None`,
-      new variables will be created; otherwise, constructs a surrogate posterior
-      backed by the passed-in variables.
-      Default value: `None`.
-    seed: Python `int` seed for random initialization.
-  Returns:
-    surrogate_posterior: Instance of `tfd.Distribution` representing a trainable
-      surrogate posterior distribution, with the same structure and `name` as
-      `dist`.
-    variables: Nested structure of `tf.Variable` trainable parameters for the
-      surrogate posterior. If `dist` is a base distribution, this is
-      a `dict` of `ASVIParameters` instances. If `dist` is a joint
-      distribution, this is a `dist.dtype` structure of such `dict`s.
-  """
-
-  # Apply any substitutions, while attempting to preserve the original name.
-  dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
-
-  if hasattr(dist, '_model_coroutine'):
-    surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
-      dist,
-      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
-      variables=variables,
-      num_auxiliary_variables=num_auxiliary_variables,
-      num_layers=num_layers,
-      global_auxiliary_variables=global_auxiliary_variables,
-      seed=seed)
-  else:
-    surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables,
-      global_auxiliary_variables=global_auxiliary_variables,
-      num_layers=num_layers,
-      seed=seed)
-  return surrogate_posterior, variables
+    """Recursively creates CF surrogates, and creates new variables if needed.
+
+    Args:
+      dist: a `tfd.Distribution` instance.
+      base_distribution_surrogate_fn: Callable to build a surrogate posterior
+        for a 'base' (non-meta and non-joint) distribution, with signature
+        `surrogate_posterior, variables = base_distribution_fn(
+        dist, sample_shape=None, variables=None, seed=None)`.
+      num_auxiliary_variables: The number of auxiliary variables to use for each
+        variable in the input program.
+      num_layers: Number of layers to use in each Highway Flow architecture.
+      global_auxiliary_variables: The sampled global auxiliary variables
+        (available only if using auxiliary variables). Default value: None.
+      sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
+        `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
+        independent sample dimensions, i.e., it will have event shape
+        `concat([sample_shape, dist.event_shape], axis=0)`.
+        Default value: `None`.
+      variables: Optional nested structure of `tf.Variable`s returned from a
+        previous call to `_cf_surrogate_for_distribution`. If `None`,
+        new variables will be created; otherwise, constructs a surrogate posterior
+        backed by the passed-in variables.
+        Default value: `None`.
+      seed: Python `int` seed for random initialization.
+    Returns:
+      surrogate_posterior: Instance of `tfd.Distribution` representing a trainable
+        surrogate posterior distribution, with the same structure and `name` as
+        `dist`, and with addition of global and local auxiliary variables if
+        `num_auxiliary_variables > 0`.
+      variables: Nested structure of `tf.Variable` trainable parameters for the
+        surrogate posterior. If `dist` is a base distribution, this is
+        a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape`
+        bijectors. If `dist` is a joint distribution, this is a `dist.dtype`
+        structure of such `tfb.Chain`s.
+    """
+
+    # Apply any substitutions, while attempting to preserve the original name.
+    dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
+
+    if hasattr(dist, '_model_coroutine'):
+        surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
+            dist,
+            base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+            variables=variables,
+            num_auxiliary_variables=num_auxiliary_variables,
+            num_layers=num_layers,
+            global_auxiliary_variables=global_auxiliary_variables,
+            seed=seed)
+    else:
+        surrogate_posterior, variables = base_distribution_surrogate_fn(
+            dist=dist, sample_shape=sample_shape, variables=variables,
+            global_auxiliary_variables=global_auxiliary_variables,
+            num_layers=num_layers,
+            seed=seed)
+    return surrogate_posterior, variables
 
 
-def _cf_surrogate_for_joint_distribution(
-    dist, base_distribution_surrogate_fn, variables,
-    num_auxiliary_variables, num_layers, global_auxiliary_variables,
-    seed=None):
-  """Builds a structured joint surrogate posterior for a joint model."""
+def _build_highway_flow_block(num_layers, width,
+                              residual_fraction_initial_value, gate_first_n,
+                              seed):
+    bijectors = []
 
-  # Probabilistic program for CF surrogate posterior.
-  flat_variables = dist._model_flatten(
-    variables) if variables else None  # pylint: disable=protected-access
-  prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
+    for _ in range(0, num_layers - 1):
+        bijectors.append(
+            build_trainable_highway_flow(width,
+                                         activation_fn=tf.nn.softplus,
+                                         gate_first_n=gate_first_n, seed=seed))
+    bijectors.append(
+        build_trainable_highway_flow(width,
+                                     activation_fn=None,
+                                     gate_first_n=gate_first_n, seed=seed))
 
-  def posterior_generator(seed=seed):
-    prior_gen = prior_coroutine()
-    dist = next(prior_gen)
+    return bijectors
 
-    if num_auxiliary_variables > 0:
-      i = 1
 
-      if flat_variables:
-        variables = flat_variables[0]
+def _cf_surrogate_for_joint_distribution(
+        dist, base_distribution_surrogate_fn, variables,
+        num_auxiliary_variables, num_layers, global_auxiliary_variables,
+        seed=None):
+    """Builds a structured joint surrogate posterior for a joint model."""
 
-      else:
-        bijectors = []
+    # Probabilistic program for CF surrogate posterior.
+    flat_variables = dist._model_flatten(
+        variables) if variables else None  # pylint: disable=protected-access
+    prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
 
-        for _ in range(0, num_layers - 1):
-          bijectors.append(
-            build_highway_flow_layer(num_auxiliary_variables,
-                                     activation_fn=True,
-                                     gate_first_n=0, seed=seed))
-        bijectors.append(
-          build_highway_flow_layer(num_auxiliary_variables,
-                                   activation_fn=False,
-                                   gate_first_n=0, seed=seed))
+    def posterior_generator(seed=seed):
+        prior_gen = prior_coroutine()
+        dist = next(prior_gen)
 
-        variables = chain.Chain(bijectors=list(reversed(bijectors)))
+        if num_auxiliary_variables > 0:
+            i = 1
 
-      eps = transformed_distribution.TransformedDistribution(
-        distribution=sample.Sample(normal.Normal(0., 1.),
-                                   num_auxiliary_variables),
-        bijector=variables)
+            if flat_variables:
+                variables = flat_variables[0]
 
-      eps = Root(eps)
+            else:
 
-      value_out = yield (eps if flat_variables
-                         else (eps, variables))
+                bijectors = _build_highway_flow_block(
+                    num_layers,
+                    width=num_auxiliary_variables,
+                    residual_fraction_initial_value=None,
+                    gate_first_n=0, seed=seed)
+                variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
-      global_auxiliary_variables = value_out
+            eps = transformed_distribution.TransformedDistribution(
+                distribution=sample.Sample(normal.Normal(0., 1.),
+                                           num_auxiliary_variables),
+                bijector=variables)
 
-    else:
-      global_auxiliary_variables = None
-      i = 0
+            eps = Root(eps)
 
-    try:
-      while True:
-        was_root = isinstance(dist, Root)
-        if was_root:
-          dist = dist.distribution
+            value_out = yield (eps if flat_variables
+                               else (eps, variables))
 
-        seed, init_seed = samplers.split_seed(seed)
-        surrogate_posterior, variables = _cf_surrogate_for_distribution(
-          dist,
-          base_distribution_surrogate_fn=base_distribution_surrogate_fn,
-          num_auxiliary_variables=num_auxiliary_variables,
-          num_layers=num_layers,
-          variables=flat_variables[i] if flat_variables else None,
-          global_auxiliary_variables=global_auxiliary_variables,
-          seed=init_seed)
-
-        if was_root and num_auxiliary_variables == 0:
-          surrogate_posterior = Root(surrogate_posterior)
-        # If variables were not given---i.e., we're creating new
-        # variables---then yield the new variables along with the surrogate
-        # posterior. This assumes an execution context such as
-        # `_extract_variables_from_coroutine_model` below that will capture and
-        # save the variables.
-        value_out = yield (surrogate_posterior if flat_variables
-                           else (surrogate_posterior, variables))
-        if type(value_out) == list:
-          if len(dist.event_shape) == 0:
-            dist = prior_gen.send(tf.squeeze(value_out[0], -1))
-          else:
-            dist = prior_gen.send(value_out[0])
+            global_auxiliary_variables = value_out
 
         else:
-          dist = prior_gen.send(value_out)
-        i += 1
-    except StopIteration:
-      pass
-
-  if variables is None:
-    # Run the generator to create variables, then call ourselves again
-    # to construct the surrogate JD from these variables. Note that we can't
-    # just create a JDC from the current `posterior_generator`, because it will
-    # try to build new variables on every invocation; the recursive call will
-    # define a new `posterior_generator` that knows about the variables we're
-    # about to create.
-    return _cf_surrogate_for_joint_distribution(
-      dist=dist,
-      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
-      num_auxiliary_variables=num_auxiliary_variables,
-      num_layers=num_layers,
-      global_auxiliary_variables=global_auxiliary_variables,
-      variables=dist._model_unflatten(
+            global_auxiliary_variables = None
+            i = 0
+
+        try:
+            while True:
+                was_root = isinstance(dist, Root)
+                if was_root:
+                    dist = dist.distribution
+
+                seed, init_seed = samplers.split_seed(seed)
+                surrogate_posterior, variables = _cf_surrogate_for_distribution(
+                    dist,
+                    base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+                    num_auxiliary_variables=num_auxiliary_variables,
+                    num_layers=num_layers,
+                    variables=flat_variables[i] if flat_variables else None,
+                    global_auxiliary_variables=global_auxiliary_variables,
+                    seed=init_seed)
+
+                if was_root and num_auxiliary_variables == 0:
+                    surrogate_posterior = Root(surrogate_posterior)
+                # If variables were not given---i.e., we're creating new
+                # variables---then yield the new variables along with the surrogate
+                # posterior. This assumes an execution context such as
+                # `_extract_variables_from_coroutine_model` below that will capture and
+                # save the variables.
+                value_out = yield (surrogate_posterior if flat_variables
+                                   else (surrogate_posterior, variables))
+                if type(value_out) == list:
+                    if len(dist.event_shape) == 0:
+                        dist = prior_gen.send(tf.squeeze(value_out[0], -1))
+                    else:
+                        dist = prior_gen.send(value_out[0])
+
+                else:
+                    dist = prior_gen.send(value_out)
+                i += 1
+        except StopIteration:
+            pass
+
+    if variables is None:
+        # Run the generator to create variables, then call ourselves again
+        # to construct the surrogate JD from these variables. Note that we can't
+        # just create a JDC from the current `posterior_generator`, because it will
+        # try to build new variables on every invocation; the recursive call will
+        # define a new `posterior_generator` that knows about the variables we're
+        # about to create.
+        return _cf_surrogate_for_joint_distribution(
+            dist=dist,
+            base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+            num_auxiliary_variables=num_auxiliary_variables,
+            num_layers=num_layers,
+            global_auxiliary_variables=global_auxiliary_variables,
+            variables=dist._model_unflatten(
+                # pylint: disable=protected-access
+                _extract_variables_from_coroutine_model(
+                    posterior_generator, seed=seed)))
+
+    # Temporary workaround for bijector caching issues with autobatched JDs.
+    surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
+        posterior_generator,
+        use_vectorized_map=dist.use_vectorized_map,
+        name=_get_name(dist))
+
+    # Ensure that the surrogate posterior structure matches that of the prior.
+    # TODO: check whether this is still needed, and modify it if so: when
+    #  auxiliary variables are used, the structure won't match the one of the
+    #  prior.
+    '''try:
+      tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
+    except TypeError:
+      tokenize = lambda jd: jd._model_unflatten(
+        # pylint: disable=protected-access, g-long-lambda
+        range(len(jd._model_flatten(jd.dtype)))
         # pylint: disable=protected-access
-        _extract_variables_from_coroutine_model(
-          posterior_generator, seed=seed)))
-
-  # Temporary workaround for bijector caching issues with autobatched JDs.
-  surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
-    posterior_generator,
-    use_vectorized_map=dist.use_vectorized_map,
-    name=_get_name(dist))
-
-  # Ensure that the surrogate posterior structure matches that of the prior.
-  # todo: check me, do we need this? in case needs to be modified
-  # if we use auxiliary variables, then the structure won't match the one of the
-  # prior
-  '''try:
-    tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
-  except TypeError:
-    tokenize = lambda jd: jd._model_unflatten(
-      # pylint: disable=protected-access, g-long-lambda
-      range(len(jd._model_flatten(jd.dtype)))
-      # pylint: disable=protected-access
-    )
-    surrogate_posterior = restructure.Restructure(
-      output_structure=tokenize(dist),
-      input_structure=tokenize(surrogate_posterior))(
-      surrogate_posterior, name=_get_name(dist))'''
-  return surrogate_posterior, variables
-
-
-# todo: sample_shape and seed are not used.. maybe they should?
+      )
+      surrogate_posterior = restructure.Restructure(
+        output_structure=tokenize(dist),
+        input_structure=tokenize(surrogate_posterior))(
+        surrogate_posterior, name=_get_name(dist))'''
+    return surrogate_posterior, variables
+
+
+# TODO: sample_shape is not used; can it be removed?
 def _cf_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables,
@@ -465,91 +524,84 @@ def _cf_convex_update_for_base_distribution(dist,
                                             variables,
                                             sample_shape=None,
                                             seed=None):
-  """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
+    """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
+
+    if variables is None:
+        actual_event_shape = dist.event_shape_tensor()
+        int_event_shape = int(actual_event_shape) if \
+            actual_event_shape.shape.as_list()[0] > 0 else 1
+        bijectors = [reshape.Reshape([-1],
+                                     event_shape_in=actual_event_shape +
+                                                    num_auxiliary_variables)]
+
+        bijectors.extend(
+            _build_highway_flow_block(
+                num_layers,
+                width=tf.reduce_prod(
+                    actual_event_shape + num_auxiliary_variables),
+                residual_fraction_initial_value=initial_prior_weight,
+                gate_first_n=int_event_shape, seed=seed))
 
-  if variables is None:
-    actual_event_shape = dist.event_shape_tensor()
-    int_event_shape = int(actual_event_shape) if \
-      actual_event_shape.shape.as_list()[0] > 0 else 1
-    bijectors = [reshape.Reshape([-1],
-                                 event_shape_in=actual_event_shape +
-                                                num_auxiliary_variables)]
-
-    for _ in range(0, num_layers - 1):
-      bijectors.append(
-        build_highway_flow_layer(
-          tf.reduce_prod(
-            actual_event_shape + num_auxiliary_variables),
-          residual_fraction_initial_value=initial_prior_weight,
-          activation_fn=True, gate_first_n=int_event_shape,
-          seed=seed))
-    bijectors.append(
-      build_highway_flow_layer(
-        tf.reduce_prod(
-          actual_event_shape + num_auxiliary_variables),
-        residual_fraction_initial_value=initial_prior_weight,
-        activation_fn=False, gate_first_n=int_event_shape,
-        seed=seed))
-    bijectors.append(
-      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
-
-    variables = chain.Chain(bijectors=list(reversed(bijectors)))
+        bijectors.append(
+            reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
-  if num_auxiliary_variables > 0:
-    batch_shape = global_auxiliary_variables.shape[0] if len(
-      global_auxiliary_variables.shape) > 1 else []
+        variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
-    cascading_flows = split.Split(
-      [-1, num_auxiliary_variables])(
-      transformed_distribution.TransformedDistribution(
-        distribution=blockwise.Blockwise([
-          batch_broadcast.BatchBroadcast(dist,
-                                         to_shape=batch_shape),
-          independent.Independent(
-            deterministic.Deterministic(
-              global_auxiliary_variables),
-            reinterpreted_batch_ndims=1)]),
-        bijector=variables))
+    if num_auxiliary_variables > 0:
+        batch_shape = global_auxiliary_variables.shape[0] if len(
+            global_auxiliary_variables.shape) > 1 else []
+
+        cascading_flows = split.Split(
+            [-1, num_auxiliary_variables])(
+            transformed_distribution.TransformedDistribution(
+                distribution=blockwise.Blockwise([
+                    batch_broadcast.BatchBroadcast(dist,
+                                                   to_shape=batch_shape),
+                    independent.Independent(
+                        deterministic.Deterministic(
+                            global_auxiliary_variables),
+                        reinterpreted_batch_ndims=1)]),
+                bijector=variables))
 
-  else:
-    cascading_flows = transformed_distribution.TransformedDistribution(
-      distribution=dist,
-      bijector=variables)
+    else:
+        cascading_flows = transformed_distribution.TransformedDistribution(
+            distribution=dist,
+            bijector=variables)
 
-  return cascading_flows, variables
+    return cascading_flows, variables
 
 
 def _extract_variables_from_coroutine_model(model_fn, seed=None):
-  """Extracts variables from a generator that yields (dist, variables) pairs."""
-  gen = model_fn()
-  try:
-    dist, dist_variables = next(gen)
-    flat_variables = [dist_variables]
-    while True:
-      seed, local_seed = samplers.split_seed(seed, n=2)
-      sampled_value = (dist.distribution.sample(seed=local_seed)
-                       if isinstance(dist, Root)
-                       else dist.sample(seed=local_seed))
-      dist, dist_variables = gen.send(
-        sampled_value)  # tf.concat(sampled_value, axis=0)
-      flat_variables.append(dist_variables)
-  except StopIteration:
-    pass
-  return flat_variables
+    """Extracts variables from a generator that yields (dist, variables) pairs."""
+    gen = model_fn()
+    try:
+        dist, dist_variables = next(gen)
+        flat_variables = [dist_variables]
+        while True:
+            seed, local_seed = samplers.split_seed(seed, n=2)
+            sampled_value = (dist.distribution.sample(seed=local_seed)
+                             if isinstance(dist, Root)
+                             else dist.sample(seed=local_seed))
+            dist, dist_variables = gen.send(
+                sampled_value)  # tf.concat(sampled_value, axis=0)
+            flat_variables.append(dist_variables)
+    except StopIteration:
+        pass
+    return flat_variables
 
 
 def _set_name(dist, name):
-  """Copies a distribution-like object, replacing its name."""
-  if hasattr(dist, 'copy'):
-    return dist.copy(name=name)
-  # Some distribution-like entities such as JointDistributionPinned don't
-  # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set
-  # the name directly.
-  dist = copy.copy(dist)
-  dist._name = name  # pylint: disable=protected-access
-  return dist
+    """Copies a distribution-like object, replacing its name."""
+    if hasattr(dist, 'copy'):
+        return dist.copy(name=name)
+    # Some distribution-like entities such as JointDistributionPinned don't
+    # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set
+    # the name directly.
+    dist = copy.copy(dist)
+    dist._name = name  # pylint: disable=protected-access
+    return dist
 
 
 def _get_name(dist):
-  """Attempts to get a distribution's short name, excluding the name scope."""
-  return getattr(dist, 'parameters', {}).get('name', dist.name)
+    """Attempts to get a distribution's short name, excluding the name scope."""
+    return getattr(dist, 'parameters', {}).get('name', dist.name)

From a2b025c8be79fbf69d6aff37f87403523fd13da3 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 27 May 2021 13:35:03 +0200
Subject: [PATCH 30/54] added dependency to build_trainable_highway_flow

---
 tensorflow_probability/python/experimental/vi/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD
index 863e0aeef2..6e08525bb3 100644
--- a/tensorflow_probability/python/experimental/vi/BUILD
+++ b/tensorflow_probability/python/experimental/vi/BUILD
@@ -94,6 +94,7 @@ py_library(
         "//tensorflow_probability/python/distributions:transformed_distribution",
         "//tensorflow_probability/python/distributions:truncated_normal",
         "//tensorflow_probability/python/distributions:uniform",
+        "//tensorflow_probability/python/experimental/bijectors:build_trainable_highway_flow",
         "//tensorflow_probability/python/internal:samplers",
     ],
 )

From 7bd84572e3f87d9b3c594b605c28ac6352227ffe Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 28 May 2021 10:37:54 +0200
Subject: [PATCH 31/54] some refactoring

---
 .../python/experimental/vi/cascading_flows.py |   4 +-
 .../experimental/vi/cascading_flows_test.py   | 216 +++++++++---------
 2 files changed, 113 insertions(+), 107 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 8c9f48222c..fad0a70517 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -379,10 +379,12 @@ def _build_highway_flow_block(num_layers, width,
     for _ in range(0, num_layers - 1):
         bijectors.append(
             build_trainable_highway_flow(width,
+                                         residual_fraction_initial_value=residual_fraction_initial_value,
                                          activation_fn=tf.nn.softplus,
                                          gate_first_n=gate_first_n, seed=seed))
     bijectors.append(
         build_trainable_highway_flow(width,
+                                     residual_fraction_initial_value=residual_fraction_initial_value,
                                      activation_fn=None,
                                      gate_first_n=gate_first_n, seed=seed))
 
@@ -415,7 +417,7 @@ def posterior_generator(seed=seed):
                 bijectors = _build_highway_flow_block(
                     num_layers,
                     width=num_auxiliary_variables,
-                    residual_fraction_initial_value=None,
+                    residual_fraction_initial_value=0,  # not used
                     gate_first_n=0, seed=seed)
                 variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index b52e1e5f77..2d9d35c808 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -18,19 +18,20 @@
 from __future__ import division
 from __future__ import print_function
 
-# Dependency imports
-
 import tensorflow.compat.v1 as tf1
 import tensorflow.compat.v2 as tf
+
 import tensorflow_probability as tfp
-from tensorflow_probability.python.experimental.vi import cascading_flows
 from tensorflow_probability.python.internal import prefer_static as ps
 from tensorflow_probability.python.internal import test_util
 
+# Dependency imports
 
 tfb = tfp.bijectors
 tfd = tfp.distributions
- # test_util.test_seed(sampler_type='stateless'))
+
+
+# test_util.test_seed(sampler_type='stateless'))
 
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
@@ -49,50 +50,59 @@ def _expected_num_trainable_variables(self, prior_dist, num_layers):
     return expected_num_trainable_variables
 
   def test_dims_and_gradients(self):
-
     prior_dist = self.make_prior_dist()
     num_layers = 3
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
-        prior=prior_dist, num_layers=num_layers)
+      prior=prior_dist, num_layers=num_layers)
 
     # Test that the correct number of trainable variables are being tracked
     self.assertLen(surrogate_posterior.trainable_variables,
-                   self._expected_num_trainable_variables(prior_dist, num_layers))
+                   self._expected_num_trainable_variables(prior_dist,
+                                                          num_layers))
 
     # Test that the sample shape is correct
     three_posterior_samples = surrogate_posterior.sample(
-        3, seed=1)
+      3, seed=(0, 0))
     three_prior_samples = prior_dist.sample(
-        3, seed=1)
+      3, seed=(0, 0))
     self.assertAllEqualNested(
-        [s.shape for s in tf.nest.flatten(three_prior_samples)],
-        [s.shape for s in tf.nest.flatten(three_posterior_samples)])
+      [s.shape for s in tf.nest.flatten(three_prior_samples)],
+      [s.shape for s in tf.nest.flatten(three_posterior_samples)])
+
+    # Test that gradients are available wrt the variational parameters.
+    with tf.GradientTape() as tape:
+      posterior_sample = surrogate_posterior.sample(
+        seed=(0, 0))
+      posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
+    grad = tape.gradient(posterior_logprob,
+                         surrogate_posterior.trainable_variables)
+    self.assertTrue(all(g is not None for g in grad))
 
   def test_initialization_is_deterministic_following_seed(self):
     prior_dist = self.make_prior_dist()
 
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
-        prior=prior_dist,
-        seed=1)
+      prior=prior_dist,
+      seed=(0, 0))
     self.evaluate(
-        [v.initializer for v in surrogate_posterior.trainable_variables])
+      [v.initializer for v in surrogate_posterior.trainable_variables])
     posterior_sample = surrogate_posterior.sample(
-        seed=1)
+      seed=(0, 0))
 
     surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
-        prior=prior_dist,
-        seed=1)
+      prior=prior_dist,
+      seed=(0, 0))
     self.evaluate(
-        [v.initializer for v in surrogate_posterior2.trainable_variables])
+      [v.initializer for v in surrogate_posterior2.trainable_variables])
     posterior_sample2 = surrogate_posterior2.sample(
-        seed=1)
+      seed=(0, 0))
 
     self.assertAllEqualNested(posterior_sample, posterior_sample2)
 
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase,
-                                               _TrainableCFSurrogate):
+                                             _TrainableCFSurrogate):
 
   def make_prior_dist(self):
 
@@ -117,7 +127,7 @@ def get_observations(self, prior_dist):
     observation_noise = 0.15
     ground_truth = prior_dist.sample()
     likelihood = self.make_likelihood_model(
-        x=ground_truth, observation_noise=observation_noise)
+      x=ground_truth, observation_noise=observation_noise)
     return likelihood.sample(1)
 
   def get_target_log_prob(self, observations, prior_dist):
@@ -125,42 +135,45 @@ def get_target_log_prob(self, observations, prior_dist):
     def target_log_prob(*x):
       observation_noise = 0.15
       likelihood_dist = self.make_likelihood_model(
-          x=x, observation_noise=observation_noise)
-      return likelihood_dist.log_prob(observations) + prior_dist.log_prob(x)
+        x=x, observation_noise=observation_noise)
+      return likelihood_dist.log_prob(observations) + prior_dist.log_prob(
+        x)
 
     return target_log_prob
 
-  '''def test_fitting_surrogate_posterior(self):
+  def test_fitting_surrogate_posterior(self):
 
     prior_dist = self.make_prior_dist()
     observations = self.get_observations(prior_dist)
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
-        prior=prior_dist)
+      prior=prior_dist)
     target_log_prob = self.get_target_log_prob(observations, prior_dist)
 
     # Test vi fit surrogate posterior works
     losses = tfp.vi.fit_surrogate_posterior(
-        target_log_prob,
-        surrogate_posterior,
-        num_steps=5,  # Don't optimize to completion.
-        optimizer=tf.optimizers.Adam(0.1),
-        sample_size=10)
+      target_log_prob,
+      surrogate_posterior,
+      num_steps=5,  # Don't optimize to completion.
+      optimizer=tf.optimizers.Adam(0.1),
+      sample_size=10)
 
     # Compute posterior statistics.
     with tf.control_dependencies([losses]):
       posterior_samples = surrogate_posterior.sample(100)
-      posterior_mean = tf.nest.map_structure(tf.reduce_mean, posterior_samples)
+      posterior_mean = tf.nest.map_structure(tf.reduce_mean,
+                                             posterior_samples)
       posterior_stddev = tf.nest.map_structure(tf.math.reduce_std,
                                                posterior_samples)
 
     self.evaluate(tf1.global_variables_initializer())
     _ = self.evaluate(losses)
     _ = self.evaluate(posterior_mean)
-    _ = self.evaluate(posterior_stddev)'''
+    _ = self.evaluate(posterior_stddev)
+
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestEightSchools(test_util.TestCase,
-                                             _TrainableCFSurrogate):
+                                           _TrainableCFSurrogate):
 
   def make_prior_dist(self):
     treatment_effects = tf.constant([28, 8, -3, 7, -1, 1, 18, 12],
@@ -168,64 +181,64 @@ def make_prior_dist(self):
     num_schools = ps.shape(treatment_effects)[-1]
 
     return tfd.JointDistributionNamed({
-        'avg_effect':
-            tfd.Normal(loc=0., scale=10., name='avg_effect'),
-        'log_stddev':
-            tfd.Normal(loc=5., scale=1., name='log_stddev'),
-        'school_effects':
-            lambda log_stddev, avg_effect: (  # pylint: disable=g-long-lambda
-                tfd.Independent(
-                    tfd.Normal(
-                        loc=avg_effect[..., None] * tf.ones(num_schools),
-                        scale=tf.exp(log_stddev[..., None]) * tf.ones(
-                            num_schools),
-                        name='school_effects'),
-                    reinterpreted_batch_ndims=1))
+      'avg_effect':
+        tfd.Normal(loc=0., scale=10., name='avg_effect'),
+      'log_stddev':
+        tfd.Normal(loc=5., scale=1., name='log_stddev'),
+      'school_effects':
+        lambda log_stddev, avg_effect: (
+          # pylint: disable=g-long-lambda
+          tfd.Independent(
+            tfd.Normal(
+              loc=avg_effect[..., None] * tf.ones(num_schools),
+              scale=tf.exp(log_stddev[..., None]) * tf.ones(
+                num_schools),
+              name='school_effects'),
+            reinterpreted_batch_ndims=1))
     })
 
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestEightSchoolsSample(test_util.TestCase,
-                                                   _TrainableCFSurrogate):
+                                                 _TrainableCFSurrogate):
 
   def make_prior_dist(self):
-
     return tfd.JointDistributionNamed({
-        'avg_effect':
-            tfd.Normal(loc=0., scale=10., name='avg_effect'),
-        'log_stddev':
-            tfd.Normal(loc=5., scale=1., name='log_stddev'),
-        'school_effects':
-            lambda log_stddev, avg_effect: (  # pylint: disable=g-long-lambda
-                tfd.Sample(
-                    tfd.Normal(
-                        loc=avg_effect[..., None],
-                        scale=tf.exp(log_stddev[..., None]),
-                        name='school_effects'),
-                    sample_shape=[8]))
+      'avg_effect':
+        tfd.Normal(loc=0., scale=10., name='avg_effect'),
+      'log_stddev':
+        tfd.Normal(loc=5., scale=1., name='log_stddev'),
+      'school_effects':
+        lambda log_stddev, avg_effect: (
+          # pylint: disable=g-long-lambda
+          tfd.Sample(
+            tfd.Normal(
+              loc=avg_effect[..., None],
+              scale=tf.exp(log_stddev[..., None]),
+              name='school_effects'),
+            sample_shape=[8]))
     })
 
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestHalfNormal(test_util.TestCase,
-                                           _TrainableCFSurrogate):
+                                         _TrainableCFSurrogate):
 
   def make_prior_dist(self):
-
     def _prior_model_fn():
       innovation_noise = 1.
       yield tfd.HalfNormal(
-          scale=innovation_noise, validate_args=True, allow_nan_stats=False)
+        scale=innovation_noise, validate_args=True,
+        allow_nan_stats=False)
 
     return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
 
 
-'''@test_util.test_all_tf_execution_regimes
+@test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestDiscreteLatent(
-    test_util.TestCase, _TrainableCFSurrogate):
+  test_util.TestCase, _TrainableCFSurrogate):
 
   def make_prior_dist(self):
-
     def _prior_model_fn():
       a = yield tfd.Bernoulli(logits=0.5, name='a')
       yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1.,
@@ -236,36 +249,30 @@ def _prior_model_fn():
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestNesting(test_util.TestCase,
-                                        _TrainableCFSurrogate):
-
-  def _expected_num_trainable_variables(self, _):
-    # Nested distributions have total of 10 params after Exponential->Gamma
-    # substitution, multiplied by 2 variables per param.
-    return 20
+                                      _TrainableCFSurrogate):
 
   def make_prior_dist(self):
-
     def nested_model():
       a = yield tfd.Sample(
-          tfd.Sample(
-              tfd.Normal(0., 1.),
-              sample_shape=4),
-          sample_shape=[2],
-          name='a')
+        tfd.Sample(
+          tfd.Normal(0., 1.),
+          sample_shape=4),
+        sample_shape=[2],
+        name='a')
       b = yield tfb.Sigmoid()(
-          tfb.Square()(
-              tfd.Exponential(rate=tf.exp(a))),
-          name='b')
+        tfb.Square()(
+          tfd.Exponential(rate=tf.exp(a))),
+        name='b')
       # pylint: disable=g-long-lambda
       yield tfd.JointDistributionSequential(
-          [tfd.Laplace(loc=a, scale=b),
-           lambda c1: tfd.Independent(
-               tfd.Beta(concentration1=1.,
-                        concentration0=tf.nn.softplus(c1)),
-               reinterpreted_batch_ndims=1),
-           lambda c1, c2: tfd.JointDistributionNamed({
-               'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)})
-           ], name='c')
+        [tfd.Laplace(loc=a, scale=b),
+         lambda c1: tfd.Independent(
+           tfd.Beta(concentration1=1.,
+                    concentration0=tf.nn.softplus(c1)),
+           reinterpreted_batch_ndims=1),
+         lambda c1, c2: tfd.JointDistributionNamed({
+           'x': tfd.Gamma(concentration=tf.nn.softplus(c1), rate=c2)})
+         ], name='c')
       # pylint: enable=g-long-lambda
 
     return tfd.JointDistributionCoroutineAutoBatched(nested_model)
@@ -275,19 +282,18 @@ def nested_model():
 class TestCFDistributionSubstitution(test_util.TestCase):
 
   def test_default_substitutes_trainable_families(self):
-
     @tfd.JointDistributionCoroutineAutoBatched
     def model():
       yield tfd.Sample(
-          tfd.Uniform(low=-2., high=7.),
-          sample_shape=[2],
-          name='a')
+        tfd.Uniform(low=-2., high=7.),
+        sample_shape=[2],
+        name='a')
       yield tfd.HalfNormal(1., name='b')
       yield tfd.Exponential(rate=[1., 2.], name='c')
       yield tfd.Chi2(df=3., name='d')
 
     surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
-        model)
+      model)
     self.assertAllEqualNested(model.event_shape, surrogate.event_shape)
 
     surrogate_dists, _ = surrogate.sample_distributions()
@@ -301,22 +307,23 @@ def model():
     self.assertIsInstance(surrogate_dists.d, tfd.Gamma)
 
   def test_can_specify_custom_substitution(self):
-
     @tfd.JointDistributionCoroutineAutoBatched
     def centered_horseshoe(ndims=100):
       global_scale = yield tfd.HalfCauchy(
-          loc=0., scale=1., name='global_scale')
+        loc=0., scale=1., name='global_scale')
       local_scale = yield tfd.HalfCauchy(
-          loc=0., scale=tf.ones([ndims]), name='local_scale')
+        loc=0., scale=tf.ones([ndims]), name='local_scale')
       yield tfd.Normal(
-          loc=0., scale=tf.sqrt(global_scale * local_scale), name='weights')
+        loc=0., scale=tf.sqrt(global_scale * local_scale),
+        name='weights')
 
     tfp.experimental.vi.register_asvi_substitution_rule(
-        condition=tfd.HalfCauchy,
-        substitution_fn=(
-            lambda d: tfb.Softplus(1e-6)(tfd.Normal(loc=d.loc, scale=d.scale))))
+      condition=tfd.HalfCauchy,
+      substitution_fn=(
+        lambda d: tfb.Softplus(1e-6)(
+          tfd.Normal(loc=d.loc, scale=d.scale))))
     surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
-        centered_horseshoe)
+      centered_horseshoe)
     self.assertAllEqualNested(centered_horseshoe.event_shape,
                               surrogate.event_shape)
 
@@ -329,11 +336,8 @@ def centered_horseshoe(ndims=100):
                           tfd.Normal)
     self.assertIsInstance(surrogate_dists.local_scale.distribution,
                           tfd.Normal)
-    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)'''
+    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
 
-# TODO(kateslin): Add an ASVI surrogate posterior test for gamma distributions.
-# TODO(kateslin): Add an ASVI surrogate posterior test with for a model with
-#  missing observations.
 
 if __name__ == '__main__':
   tf.test.main()

From ae34080e07e5c1777d3318a86326315ccaa59351 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 28 May 2021 10:55:58 +0200
Subject: [PATCH 32/54] changed seed

---
 .../experimental/vi/cascading_flows_test.py    | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 35772332d8..0b45486e6c 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -31,8 +31,6 @@
 tfd = tfp.distributions
 
 
-# test_util.test_seed(sampler_type='stateless'))
-
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
@@ -62,9 +60,9 @@ def test_dims_and_gradients(self):
 
     # Test that the sample shape is correct
     three_posterior_samples = surrogate_posterior.sample(
-      3, seed=(0, 0))
+      3, seed=test_util.test_seed(sampler_type='stateless'))
     three_prior_samples = prior_dist.sample(
-      3, seed=(0, 0))
+      3, seed=test_util.test_seed(sampler_type='stateless'))
     self.assertAllEqualNested(
       [s.shape for s in tf.nest.flatten(three_prior_samples)],
       [s.shape for s in tf.nest.flatten(three_posterior_samples)])
@@ -72,7 +70,7 @@ def test_dims_and_gradients(self):
     # Test that gradients are available wrt the variational parameters.
     with tf.GradientTape() as tape:
       posterior_sample = surrogate_posterior.sample(
-        seed=(0, 0))
+        seed=test_util.test_seed(sampler_type='stateless'))
       posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
     grad = tape.gradient(posterior_logprob,
                          surrogate_posterior.trainable_variables)
@@ -83,19 +81,19 @@ def test_initialization_is_deterministic_following_seed(self):
 
     surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
       prior=prior_dist,
-      seed=(0, 0))
+      seed=test_util.test_seed(sampler_type='stateless'))
     self.evaluate(
       [v.initializer for v in surrogate_posterior.trainable_variables])
     posterior_sample = surrogate_posterior.sample(
-      seed=(0, 0))
+      seed=test_util.test_seed(sampler_type='stateless'))
 
     surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
       prior=prior_dist,
-      seed=(0, 0))
+      seed=test_util.test_seed(sampler_type='stateless'))
     self.evaluate(
       [v.initializer for v in surrogate_posterior2.trainable_variables])
     posterior_sample2 = surrogate_posterior2.sample(
-      seed=(0, 0))
+      seed=test_util.test_seed(sampler_type='stateless'))
 
     self.assertAllEqualNested(posterior_sample, posterior_sample2)
 
@@ -104,7 +102,6 @@ def test_initialization_is_deterministic_following_seed(self):
 class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase,
                                              _TrainableCFSurrogate):
 
-
   def make_prior_dist(self):
 
     def _prior_model_fn():
@@ -339,5 +336,6 @@ def centered_horseshoe(ndims=100):
                           tfd.Normal)
     self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
 
+
 if __name__ == '__main__':
   tf.test.main()

From d0e287f79d5c2e36d5e8c9217081ad63e540c216 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 28 May 2021 10:56:13 +0200
Subject: [PATCH 33/54] reverted to master

---
 .../python/experimental/bijectors/BUILD       | 30 -------------------
 1 file changed, 30 deletions(-)

diff --git a/tensorflow_probability/python/experimental/bijectors/BUILD b/tensorflow_probability/python/experimental/bijectors/BUILD
index 8596d61582..9f7afce0b8 100644
--- a/tensorflow_probability/python/experimental/bijectors/BUILD
+++ b/tensorflow_probability/python/experimental/bijectors/BUILD
@@ -119,7 +119,6 @@ multi_substrate_py_library(
     ],
 )
 
-
 multi_substrate_py_test(
     name = "highway_flow_test",
     size = "medium",
@@ -149,20 +148,6 @@ multi_substrate_py_library(
     ],
 )
 
-multi_substrate_py_library(
-    name = "highway_flow",
-    srcs = ["highway_flow.py"],
-    srcs_version = "PY3",
-    deps = [
-        ":scalar_function_with_inferred_inverse",
-        # numpy dep,
-        # tensorflow dep,
-        "//tensorflow_probability/python/bijectors",
-        "//tensorflow_probability/python/util",
-        "//tensorflow_probability/python/internal:samplers",
-    ],
-)
-
 multi_substrate_py_test(
     name = "sharded_test",
     size = "medium",
@@ -179,18 +164,3 @@ multi_substrate_py_test(
         "//tensorflow_probability/python/internal:test_util",
     ],
 )
-
-multi_substrate_py_test(
-    name = "highway_flow_test",
-    size = "medium",
-    srcs = ["highway_flow_test.py"],
-    jax_size = "medium",
-    python_version = "PY3",
-    srcs_version = "PY3",
-    deps = [
-        # numpy dep
-        # tensorflow dep,
-        "//tensorflow_probability",
-        "//tensorflow_probability/python/internal:test_util",
-    ],
-)
\ No newline at end of file

From 1e9a4867d47f9e9126d778dc2c03732ab4d2a89f Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 3 Jun 2021 15:03:30 +0200
Subject: [PATCH 34/54] removed substitution rule and updated dependencies

---
 .../python/experimental/vi/BUILD              |  14 +-
 .../python/experimental/vi/cascading_flows.py | 900 ++++++++----------
 2 files changed, 401 insertions(+), 513 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/BUILD b/tensorflow_probability/python/experimental/vi/BUILD
index 6e08525bb3..d693fa80ea 100644
--- a/tensorflow_probability/python/experimental/vi/BUILD
+++ b/tensorflow_probability/python/experimental/vi/BUILD
@@ -70,30 +70,22 @@ py_library(
 
 py_library(
     name = "cascading_flows",
-    srcs = ["cascading_flows.py.py"],
+    srcs = ["cascading_flows.py"],
     srcs_version = "PY3",
     deps = [
         # tensorflow dep,
-        "//tensorflow_probability/python/bijectors:build_highway_flow_layer",
         "//tensorflow_probability/python/bijectors:chain",
         "//tensorflow_probability/python/bijectors:reshape",
-        "//tensorflow_probability/python/bijectors:scale",
-        "//tensorflow_probability/python/bijectors:shift",
         "//tensorflow_probability/python/bijectors:split",
         "//tensorflow_probability/python/distributions:batch_broadcast",
-        "//tensorflow_probability/python/distributions:beta",
         "//tensorflow_probability/python/distributions:blockwise",
-        "//tensorflow_probability/python/distributions:chi2",
-        "//tensorflow_probability/python/distributions:exponential",
-        "//tensorflow_probability/python/distributions:gamma",
-        "//tensorflow_probability/python/distributions:half_normal",
+        "//tensorflow_probability/python/distributions:deterministic",
+        "//tensorflow_probability/python/distributions:independent",
         "//tensorflow_probability/python/distributions:joint_distribution_auto_batched",
         "//tensorflow_probability/python/distributions:joint_distribution_coroutine",
         "//tensorflow_probability/python/distributions:normal",
         "//tensorflow_probability/python/distributions:sample",
         "//tensorflow_probability/python/distributions:transformed_distribution",
-        "//tensorflow_probability/python/distributions:truncated_normal",
-        "//tensorflow_probability/python/distributions:uniform",
         "//tensorflow_probability/python/experimental/bijectors:build_trainable_highway_flow",
         "//tensorflow_probability/python/internal:samplers",
     ],
diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index f61f962012..0a87ae4399 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -21,287 +21,183 @@
 
 import copy
 import functools
-import inspect
 
 import tensorflow.compat.v2 as tf
 
 from tensorflow_probability.python.bijectors import chain
 from tensorflow_probability.python.bijectors import reshape
-from tensorflow_probability.python.bijectors import scale as scale_lib
-from tensorflow_probability.python.bijectors import shift
 from tensorflow_probability.python.bijectors import split
 from tensorflow_probability.python.distributions import batch_broadcast
-from tensorflow_probability.python.distributions import beta
 from tensorflow_probability.python.distributions import blockwise
-from tensorflow_probability.python.distributions import chi2
 from tensorflow_probability.python.distributions import deterministic
-from tensorflow_probability.python.distributions import exponential
-from tensorflow_probability.python.distributions import gamma
-from tensorflow_probability.python.distributions import half_normal
 from tensorflow_probability.python.distributions import independent
 from tensorflow_probability.python.distributions import \
-  joint_distribution_auto_batched
+    joint_distribution_auto_batched
 from tensorflow_probability.python.distributions import \
-  joint_distribution_coroutine
+    joint_distribution_coroutine
 from tensorflow_probability.python.distributions import normal
 from tensorflow_probability.python.distributions import sample
 from tensorflow_probability.python.distributions import transformed_distribution
-from tensorflow_probability.python.distributions import truncated_normal
-from tensorflow_probability.python.distributions import uniform
-from tensorflow_probability.python.experimental.bijectors import build_trainable_highway_flow
+from tensorflow_probability.python.experimental.bijectors import \
+    build_trainable_highway_flow
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
-  'register_cf_substitution_rule',
   'build_cf_surrogate_posterior'
 ]
 
 Root = joint_distribution_coroutine.JointDistributionCoroutine.Root
 
-# TODO: the part until the function build_cf_surrogate_posterior is identical to
-#  the one in automatic_structured_vi. Should we remove it from this file and
-#  import them directly from automatic_structured_vi?
 
-_NON_STATISTICAL_PARAMS = [
-    'name', 'validate_args', 'allow_nan_stats', 'experimental_use_kahan_sum',
-    'reinterpreted_batch_ndims', 'dtype', 'force_probs_to_zero_outside_support',
-    'num_probit_terms_approx'
-]
-_NON_TRAINABLE_PARAMS = ['low', 'high']
-
-# Registry of transformations that are applied to distributions in the prior
-# before defining the surrogate family.
-
-
-ASVI_SURROGATE_SUBSTITUTIONS = {}
-
-
-def _as_substituted_distribution(distribution):
-    """Applies all substitution rules that match a distribution."""
-    for condition, substitution_fn in ASVI_SURROGATE_SUBSTITUTIONS.items():
-        if condition(distribution):
-            distribution = substitution_fn(distribution)
-    return distribution
-
-
-def register_cf_substitution_rule(condition, substitution_fn):
-    """Registers a rule for substituting distributions in ASVI surrogates.
-
-    Args:
-      condition: Python `callable` that takes a Distribution instance and
-        returns a Python `bool` indicating whether or not to substitute it.
-        May also be a class type such as `tfd.Normal`, in which case the
-        condition is interpreted as
-        `lambda distribution: isinstance(distribution, class)`.
-      substitution_fn: Python `callable` that takes a Distribution
-        instance and returns a new Distribution instance used to define
-        the ASVI surrogate posterior. Note that this substitution does not modify
-        the original model.
-
-    #### Example
-
-    To use a Normal surrogate for all location-scale family distributions, we
-    could register the substitution:
-
-    ```python
-    tfp.experimental.vi.register_asvi_surrogate_substitution(
-      condition=lambda distribution: (
-        hasattr(distribution, 'loc') and hasattr(distribution, 'scale'))
-      substitution_fn=lambda distribution: (
-        # Invoking the event space bijector applies any relevant constraints,
-        # e.g., that HalfCauchy samples must be `>= loc`.
-        distribution.experimental_default_event_space_bijector()(
-          tfd.Normal(loc=distribution.loc, scale=distribution.scale)))
-    ```
-
-    This rule will fire when ASVI encounters a location-scale distribution,
-    and instructs ASVI to build a surrogate 'as if' the model had just used a
-    (possibly constrained) Normal in its place. Note that we could have used a
-    more precise condition, e.g., to limit the substitution to distributions with
-    a specific `name`, if we had reason to think that a Normal distribution would
-    be a good surrogate for some model variables but not others.
-
-    """
-    global ASVI_SURROGATE_SUBSTITUTIONS
-    if inspect.isclass(condition):
-        condition = lambda distribution, cls=condition: isinstance(
-            # pylint: disable=g-long-lambda
-            distribution, cls)
-    ASVI_SURROGATE_SUBSTITUTIONS[condition] = substitution_fn
-
-# Default substitutions attempt to express distributions using the most
-# flexible available parameterization.
-# pylint: disable=g-long-lambda
-register_cf_substitution_rule(
-    half_normal.HalfNormal,
-    lambda dist: truncated_normal.TruncatedNormal(
-        loc=0., scale=dist.scale, low=0., high=dist.scale * 10.))
-register_cf_substitution_rule(
-    uniform.Uniform,
-    lambda dist: shift.Shift(dist.low)(
-        scale_lib.Scale(dist.high - dist.low)(
-            beta.Beta(concentration0=tf.ones_like(dist.mean()),
-                      concentration1=1.))))
-register_cf_substitution_rule(
-    exponential.Exponential,
-    lambda dist: gamma.Gamma(concentration=1., rate=dist.rate))
-register_cf_substitution_rule(
-    chi2.Chi2,
-    lambda dist: gamma.Gamma(concentration=0.5 * dist.df, rate=0.5))
-
-
-# pylint: enable=g-long-lambda
-
-# a single JointDistribution.
 def build_cf_surrogate_posterior(
-        prior,
-        num_auxiliary_variables=0,
-        initial_prior_weight=0.98,
-        num_layers=3,
-        seed=None,
-        name=None):
-    """Builds a structured surrogate posterior with cascading flows.
-
-    Cascading Flows (CF) [1] is a method that automatically construct a
-    variational approximation given an input probabilistic program. CF combines
-    ASVI [2] with the flexibility of normalizing flows, by transforming the
-    conditional distributions of the prior program with HighwayFlow architectures,
-    to steer the prior towards the observed data. More details on the HighwayFlow
-    architecture can be found in [1] and in the tfp bijector `HighwayFlow`.
-    It is possible to add auxiliary variables to the prior program to further
-    increase the flexibility of cascading flows, useful especially in the
-    cases where the input program has low dimensionality. The auxiliary variables
-    are sampled from a global linear flow, to account for statistical dependencies
-    among variables, and then transformed with local HighwayFlows together with
-    samples form the prior. Note that when using auxiliary variables it is
-    necessary to modify the variational lower bound [3].
-
-    Args:
-      prior: tfd.JointDistribution instance of the prior.
-      num_auxiliary_variables: The number of auxiliary variables to use for each
-        variable in the input program. Default value: `0`.
-      initial_prior_weight: Optional float value (either static or tensor value)
-        on the interval [0, 1]. A larger value creates an initial surrogate
-        distribution with more dependence on the prior structure. Default value:
-        `0.98`.
-      num_layers: Number of layers to use in each Highway Flow architecture. All
-      the layers will have `softplus` activation function, apart from the last one
-      which will have linear activation. Default value: `3`.
-      seed: Python `int` seed for random initialization.
-      name: Optional string. Default value: `build_cf_surrogate_posterior`.
-
-    Returns:
-      surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
-      whose samples have shape and structure matching that of `prior`.
-
-    Raises:
-      TypeError: The `prior` argument cannot be a nested `JointDistribution`.
-
-    ### Examples
-
-    Consider a Brownian motion model expressed as a JointDistribution:
-
-    ```python
-    prior_loc = 0.
-    innovation_noise = .1
-
-    def model_fn():
-      new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
-      for i in range(4):
-        new = yield tfd.Normal(loc=new, scale=innovation_noise)
-
-    prior = tfd.JointDistributionCoroutineAutoBatched(model_fn)
-    ```
-
-    Let's use variational inference to approximate the posterior. We'll build a
-    surrogate posterior distribution by feeding in the prior distribution.
-
-    ```python
-    surrogate_posterior =
-      tfp.experimental.vi.build_cf_surrogate_posterior(prior)
-    ```
-
-    This creates a trainable joint distribution, defined by variables in
-    `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
-    to fit this distribution by minimizing a divergence to the true posterior.
-
-    ```python
-    losses = tfp.vi.fit_surrogate_posterior(
-      target_log_prob_fn,
-      surrogate_posterior=surrogate_posterior,
-      num_steps=100,
-      optimizer=tf.optimizers.Adam(0.1),
-      sample_size=10)
-
-    # After optimization, samples from the surrogate will approximate
-    # samples from the true posterior.
-    samples = surrogate_posterior.sample(100)
-    posterior_mean = [tf.reduce_mean(x) for x in samples]
-    posterior_std = [tf.math.reduce_std(x) for x in samples]
-    ```
-
-    When using auxiliary variables, we need some modifications for loss and
-    samples, as samples will return also the global variables and transformed
-    auxiliary variables
-
-    ```python
-    num_aux_vars=10
-    target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape(
-      tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1),
-        scale=tf.reshape(tfp.util.TransformedVariable(
-          [tf.random.uniform((1,), minval=0.01, maxval=1.)
-        for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1)
-
-    def target_log_prob_aux_vars(z_and_eps):
-      z = [x[0] for x in z_and_eps[1:]]
-      eps = [x[1] for x in z_and_eps[1:]]
-      lp_z = target_log_prob_fn(z)
-      lp_eps = tf.reshape(tf.reduce_sum(target_dist.log_prob(eps), 0), lp_z.shape)
-      return lp_z + lp_eps
-
-    target_log_prob = lambda *values: target_log_prob_aux_vars(values)
-    cf_surrogate_posterior = build_cf_surrogate_posterior(prior,
-                                            num_auxiliary_variables=num_aux_vars)
-    trainable_variables = list(cf_surrogate_posterior.trainable_variables)
-    trainable_variables.extend(list(target_dist.trainable_variables))
-    cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,
-                                          cf_surrogate_posterior,
-                                          optimizer=tf.optimizers.Adam(0.01),
-                                          num_steps=8000,
-                                          sample_size=50,
-                                          trainable_variables=trainable_variables)
-
-    cf_posterior_samples = cf_surrogate_posterior.sample(num_samples)
-    cf_posterior_samples = tf.convert_to_tensor(
-                                         [s[0] for s in cf_posterior_samples[1:]])
-    ```
-
-    #### References
-    [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. "Automatic
-    variational inference with cascading flows." arXiv preprint arXiv:2102.04801
-    (2021).
-
-    [2]: Ambrogioni, Luca, et al. "Automatic structured variational inference."
-    International Conference on Artificial Intelligence and Statistics. PMLR,
-    2021.
-
-    [3]: Ranganath, Rajesh, Dustin Tran, and David Blei. "Hierarchical variational
-    models." International Conference on Machine Learning. PMLR, 2016.
-
-    """
-    with tf.name_scope(name or 'build_cf_surrogate_posterior'):
-        surrogate_posterior, variables = _cf_surrogate_for_distribution(
-            dist=prior,
-            base_distribution_surrogate_fn=functools.partial(
-                _cf_convex_update_for_base_distribution,
-                initial_prior_weight=initial_prior_weight,
-                num_auxiliary_variables=num_auxiliary_variables,
-                num_layers=num_layers),
-            num_auxiliary_variables=num_auxiliary_variables,
-            num_layers=num_layers,
-            seed=seed)
-        surrogate_posterior.also_track = variables
-        return surrogate_posterior
+    prior,
+    num_auxiliary_variables=0,
+    initial_prior_weight=0.98,
+    num_layers=3,
+    seed=None,
+    name=None):
+  """Builds a structured surrogate posterior with cascading flows.
+
+  Cascading Flows (CF) [1] is a method that automatically constructs a
+  variational approximation given an input probabilistic program. CF combines
+  ASVI [2] with the flexibility of normalizing flows, by transforming the
+  conditional distributions of the prior program with HighwayFlow architectures,
+  to steer the prior towards the observed data. More details on the HighwayFlow
+  architecture can be found in [1] and in the tfp bijector `HighwayFlow`.
+  It is possible to add auxiliary variables to the prior program to further
+  increase the flexibility of cascading flows, useful especially in the
+  cases where the input program has low dimensionality. The auxiliary variables
+  are sampled from a global linear flow, to account for statistical dependencies
+  among variables, and then transformed with local HighwayFlows together with
+  samples from the prior. Note that when using auxiliary variables it is
+  necessary to modify the variational lower bound [3].
+
+  Args:
+    prior: tfd.JointDistribution instance of the prior.
+    num_auxiliary_variables: The number of auxiliary variables to use for each
+      variable in the input program. Default value: `0`.
+    initial_prior_weight: Optional float value (either static or tensor value)
+      on the interval [0, 1]. A larger value creates an initial surrogate
+      distribution with more dependence on the prior structure. Default value:
+      `0.98`.
+    num_layers: Number of layers to use in each Highway Flow architecture. All
+      layers use the `softplus` activation function, apart from the last one,
+      which uses a linear activation. Default value: `3`.
+    seed: Python `int` seed for random initialization.
+    name: Optional string. Default value: `build_cf_surrogate_posterior`.
+
+  Returns:
+    surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
+      whose samples have shape and structure matching that of `prior`.
+
+  Raises:
+    TypeError: The `prior` argument cannot be a nested `JointDistribution`.
+
+  ### Examples
+
+  Consider a Brownian motion model expressed as a JointDistribution:
+
+  ```python
+  prior_loc = 0.
+  innovation_noise = .1
+
+  def model_fn():
+    new = yield tfd.Normal(loc=prior_loc, scale=innovation_noise)
+    for i in range(4):
+      new = yield tfd.Normal(loc=new, scale=innovation_noise)
+
+  prior = tfd.JointDistributionCoroutineAutoBatched(model_fn)
+  ```
+
+  Let's use variational inference to approximate the posterior. We'll build a
+  surrogate posterior distribution by feeding in the prior distribution.
+
+  ```python
+  surrogate_posterior = (
+    tfp.experimental.vi.build_cf_surrogate_posterior(prior))
+  ```
+
+  This creates a trainable joint distribution, defined by variables in
+  `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
+  to fit this distribution by minimizing a divergence to the true posterior.
+
+  ```python
+  losses = tfp.vi.fit_surrogate_posterior(
+    target_log_prob_fn,
+    surrogate_posterior=surrogate_posterior,
+    num_steps=100,
+    optimizer=tf.optimizers.Adam(0.1),
+    sample_size=10)
+
+  # After optimization, samples from the surrogate will approximate
+  # samples from the true posterior.
+  samples = surrogate_posterior.sample(100)
+  posterior_mean = [tf.reduce_mean(x) for x in samples]
+  posterior_std = [tf.math.reduce_std(x) for x in samples]
+  ```
+
+  When using auxiliary variables, the loss and the samples need some
+  modifications, as samples will also return the global variables and the
+  transformed auxiliary variables:
+
+  ```python
+  num_aux_vars=10
+  target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape(
+    tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1),
+      scale=tf.reshape(tfp.util.TransformedVariable(
+        [tf.random.uniform((1,), minval=0.01, maxval=1.)
+      for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1)
+
+  def target_log_prob_aux_vars(z_and_eps):
+    z = [x[0] for x in z_and_eps[1:]]
+    eps = [x[1] for x in z_and_eps[1:]]
+    lp_z = target_log_prob_fn(z)
+    lp_eps = tf.reshape(tf.reduce_sum(target_dist.log_prob(eps), 0), lp_z.shape)
+    return lp_z + lp_eps
+
+  target_log_prob = lambda *values: target_log_prob_aux_vars(values)
+  cf_surrogate_posterior = build_cf_surrogate_posterior(prior,
+                                          num_auxiliary_variables=num_aux_vars)
+  trainable_variables = list(cf_surrogate_posterior.trainable_variables)
+  trainable_variables.extend(list(target_dist.trainable_variables))
+  cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,
+                                        cf_surrogate_posterior,
+                                        optimizer=tf.optimizers.Adam(0.01),
+                                        num_steps=8000,
+                                        sample_size=50,
+                                        trainable_variables=trainable_variables)
+
+  cf_posterior_samples = cf_surrogate_posterior.sample(num_samples)
+  cf_posterior_samples = tf.convert_to_tensor(
+                                       [s[0] for s in cf_posterior_samples[1:]])
+  ```
+
+  #### References
+  [1]: Ambrogioni, Luca, Gianluigi Silvestri, and Marcel van Gerven. "Automatic
+  variational inference with cascading flows." arXiv preprint arXiv:2102.04801
+  (2021).
+
+  [2]: Ambrogioni, Luca, et al. "Automatic structured variational inference."
+  International Conference on Artificial Intelligence and Statistics. PMLR,
+  2021.
+
+  [3]: Ranganath, Rajesh, Dustin Tran, and David Blei. "Hierarchical variational
+  models." International Conference on Machine Learning. PMLR, 2016.
+
+  """
+  with tf.name_scope(name or 'build_cf_surrogate_posterior'):
+    surrogate_posterior, variables = _cf_surrogate_for_distribution(
+      dist=prior,
+      base_distribution_surrogate_fn=functools.partial(
+        _cf_convex_update_for_base_distribution,
+        initial_prior_weight=initial_prior_weight,
+        num_auxiliary_variables=num_auxiliary_variables,
+        num_layers=num_layers),
+      num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
+      seed=seed)
+    surrogate_posterior.also_track = variables
+    return surrogate_posterior
 
 
 def _cf_surrogate_for_distribution(dist,
@@ -312,207 +208,207 @@ def _cf_surrogate_for_distribution(dist,
                                    sample_shape=None,
                                    variables=None,
                                    seed=None):
-    """Recursively creates CF surrogates, and creates new variables if needed.
-
-    Args:
-      dist: a `tfd.Distribution` instance.
-      base_distribution_surrogate_fn: Callable to build a surrogate posterior
-        for a 'base' (non-meta and non-joint) distribution, with signature
-        `surrogate_posterior, variables = base_distribution_fn(
-        dist, sample_shape=None, variables=None, seed=None)`.
-      num_auxiliary_variables: The number of auxiliary variables to use for each
-        variable in the input program.
-      num_layers: Number of layers to use in each Highway Flow architecture.
-      global_auxiliary_variables: The sampled global auxiliary variables
-        (available only if using auxiliary variables). Default value: None.
-      sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
-        `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
-        independent sample dimensions, i.e., it will have event shape
-        `concat([sample_shape, dist.event_shape], axis=0)`.
-        Default value: `None`.
-      variables: Optional nested structure of `tf.Variable`s returned from a
-        previous call to `_cf_surrogate_for_distribution`. If `None`,
-        new variables will be created; otherwise, constructs a surrogate posterior
-        backed by the passed-in variables.
-        Default value: `None`.
-      seed: Python `int` seed for random initialization.
-    Returns:
-      surrogate_posterior: Instance of `tfd.Distribution` representing a trainable
-        surrogate posterior distribution, with the same structure and `name` as
-        `dist`, and with addition of global and local auxiliary variables if
-        `num_auxiliary_variables > 0`.
-      variables: Nested structure of `tf.Variable` trainable parameters for the
-        surrogate posterior. If `dist` is a base distribution, this is
-        a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape`
-        bijectors. If `dist` is a joint distribution, this is a `dist.dtype`
-        structure of such `tfb.Chain`s.
-    """
-
-    # Apply any substitutions, while attempting to preserve the original name.
-    dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
-
-    if hasattr(dist, '_model_coroutine'):
-        surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
-            dist,
-            base_distribution_surrogate_fn=base_distribution_surrogate_fn,
-            variables=variables,
-            num_auxiliary_variables=num_auxiliary_variables,
-            num_layers=num_layers,
-            global_auxiliary_variables=global_auxiliary_variables,
-            seed=seed)
-    else:
-        surrogate_posterior, variables = base_distribution_surrogate_fn(
-            dist=dist, sample_shape=sample_shape, variables=variables,
-            global_auxiliary_variables=global_auxiliary_variables,
-            num_layers=num_layers,
-            seed=seed)
-    return surrogate_posterior, variables
+  """Recursively creates CF surrogates, and creates new variables if needed.
+
+  Args:
+    dist: a `tfd.Distribution` instance.
+    base_distribution_surrogate_fn: Callable to build a surrogate posterior
+      for a 'base' (non-meta and non-joint) distribution, with signature
+      `surrogate_posterior, variables = base_distribution_fn(
+      dist, sample_shape=None, variables=None, seed=None)`.
+    num_auxiliary_variables: The number of auxiliary variables to use for each
+      variable in the input program.
+    num_layers: Number of layers to use in each Highway Flow architecture.
+    global_auxiliary_variables: The sampled global auxiliary variables
+      (available only if using auxiliary variables). Default value: None.
+    sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
+      `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
+      independent sample dimensions, i.e., it will have event shape
+      `concat([sample_shape, dist.event_shape], axis=0)`.
+      Default value: `None`.
+    variables: Optional nested structure of `tf.Variable`s returned from a
+      previous call to `_cf_surrogate_for_distribution`. If `None`,
+      new variables will be created; otherwise, constructs a surrogate posterior
+      backed by the passed-in variables.
+      Default value: `None`.
+    seed: Python `int` seed for random initialization.
+  Returns:
+    surrogate_posterior: Instance of `tfd.Distribution` representing a trainable
+      surrogate posterior distribution, with the same structure and `name` as
+      `dist`, and with addition of global and local auxiliary variables if
+      `num_auxiliary_variables > 0`.
+    variables: Nested structure of `tf.Variable` trainable parameters for the
+      surrogate posterior. If `dist` is a base distribution, this is
+      a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape`
+      bijectors. If `dist` is a joint distribution, this is a `dist.dtype`
+      structure of such `tfb.Chain`s.
+  """
+
+  # Apply any substitutions, while attempting to preserve the original name.
+  dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
+
+  if hasattr(dist, '_model_coroutine'):
+    surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
+      dist,
+      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      variables=variables,
+      num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
+      global_auxiliary_variables=global_auxiliary_variables,
+      seed=seed)
+  else:
+    surrogate_posterior, variables = base_distribution_surrogate_fn(
+      dist=dist, sample_shape=sample_shape, variables=variables,
+      global_auxiliary_variables=global_auxiliary_variables,
+      num_layers=num_layers,
+      seed=seed)
+  return surrogate_posterior, variables
 
 
 def _build_highway_flow_block(num_layers, width,
                               residual_fraction_initial_value, gate_first_n,
                               seed):
-    bijectors = []
-
-    for _ in range(0, num_layers - 1):
-        bijectors.append(
-            build_trainable_highway_flow(width,
-                                         residual_fraction_initial_value=residual_fraction_initial_value,
-                                         activation_fn=tf.nn.softplus,
-                                         gate_first_n=gate_first_n, seed=seed))
+  bijectors = []
+
+  for _ in range(0, num_layers - 1):
     bijectors.append(
-        build_trainable_highway_flow(width,
-                                     residual_fraction_initial_value=residual_fraction_initial_value,
-                                     activation_fn=None,
-                                     gate_first_n=gate_first_n, seed=seed))
+      build_trainable_highway_flow(width,
+                                   residual_fraction_initial_value=residual_fraction_initial_value,
+                                   activation_fn=tf.nn.softplus,
+                                   gate_first_n=gate_first_n, seed=seed))
+  bijectors.append(
+    build_trainable_highway_flow(width,
+                                 residual_fraction_initial_value=residual_fraction_initial_value,
+                                 activation_fn=None,
+                                 gate_first_n=gate_first_n, seed=seed))
 
-    return bijectors
+  return bijectors
 
 
 def _cf_surrogate_for_joint_distribution(
-        dist, base_distribution_surrogate_fn, variables,
-        num_auxiliary_variables, num_layers, global_auxiliary_variables,
-        seed=None):
-    """Builds a structured joint surrogate posterior for a joint model."""
+    dist, base_distribution_surrogate_fn, variables,
+    num_auxiliary_variables, num_layers, global_auxiliary_variables,
+    seed=None):
+  """Builds a structured joint surrogate posterior for a joint model."""
 
-    # Probabilistic program for CF surrogate posterior.
-    flat_variables = dist._model_flatten(
-        variables) if variables else None  # pylint: disable=protected-access
-    prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
+  # Probabilistic program for CF surrogate posterior.
+  flat_variables = dist._model_flatten(
+    variables) if variables else None  # pylint: disable=protected-access
+  prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
 
-    def posterior_generator(seed=seed):
-        prior_gen = prior_coroutine()
-        dist = next(prior_gen)
+  def posterior_generator(seed=seed):
+    prior_gen = prior_coroutine()
+    dist = next(prior_gen)
 
-        if num_auxiliary_variables > 0:
-            i = 1
+    if num_auxiliary_variables > 0:
+      i = 1
 
-            if flat_variables:
-                variables = flat_variables[0]
+      if flat_variables:
+        variables = flat_variables[0]
 
-            else:
+      else:
 
-                bijectors = _build_highway_flow_block(
-                    num_layers,
-                    width=num_auxiliary_variables,
-                    residual_fraction_initial_value=0,  # not used
-                    gate_first_n=0, seed=seed)
-                variables = chain.Chain(bijectors=list(reversed(bijectors)))
+        bijectors = _build_highway_flow_block(
+          num_layers,
+          width=num_auxiliary_variables,
+          residual_fraction_initial_value=0,  # not used
+          gate_first_n=0, seed=seed)
+        variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
-            eps = transformed_distribution.TransformedDistribution(
-                distribution=sample.Sample(normal.Normal(0., 1.),
-                                           num_auxiliary_variables),
-                bijector=variables)
+      eps = transformed_distribution.TransformedDistribution(
+        distribution=sample.Sample(normal.Normal(0., 1.),
+                                   num_auxiliary_variables),
+        bijector=variables)
 
-            eps = Root(eps)
+      eps = Root(eps)
 
-            value_out = yield (eps if flat_variables
-                               else (eps, variables))
+      value_out = yield (eps if flat_variables
+                         else (eps, variables))
 
-            global_auxiliary_variables = value_out
+      global_auxiliary_variables = value_out
+
+    else:
+      global_auxiliary_variables = None
+      i = 0
+
+    try:
+      while True:
+        was_root = isinstance(dist, Root)
+        if was_root:
+          dist = dist.distribution
+
+        seed, init_seed = samplers.split_seed(seed)
+        surrogate_posterior, variables = _cf_surrogate_for_distribution(
+          dist,
+          base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+          num_auxiliary_variables=num_auxiliary_variables,
+          num_layers=num_layers,
+          variables=flat_variables[i] if flat_variables else None,
+          global_auxiliary_variables=global_auxiliary_variables,
+          seed=init_seed)
+
+        if was_root and num_auxiliary_variables == 0:
+          surrogate_posterior = Root(surrogate_posterior)
+        # If variables were not given---i.e., we're creating new
+        # variables---then yield the new variables along with the surrogate
+        # posterior. This assumes an execution context such as
+        # `_extract_variables_from_coroutine_model` below that will capture and
+        # save the variables.
+        value_out = yield (surrogate_posterior if flat_variables
+                           else (surrogate_posterior, variables))
+        if type(value_out) == list:
+          if len(dist.event_shape) == 0:
+            dist = prior_gen.send(tf.squeeze(value_out[0], -1))
+          else:
+            dist = prior_gen.send(value_out[0])
 
         else:
-            global_auxiliary_variables = None
-            i = 0
-
-        try:
-            while True:
-                was_root = isinstance(dist, Root)
-                if was_root:
-                    dist = dist.distribution
-
-                seed, init_seed = samplers.split_seed(seed)
-                surrogate_posterior, variables = _cf_surrogate_for_distribution(
-                    dist,
-                    base_distribution_surrogate_fn=base_distribution_surrogate_fn,
-                    num_auxiliary_variables=num_auxiliary_variables,
-                    num_layers=num_layers,
-                    variables=flat_variables[i] if flat_variables else None,
-                    global_auxiliary_variables=global_auxiliary_variables,
-                    seed=init_seed)
-
-                if was_root and num_auxiliary_variables == 0:
-                    surrogate_posterior = Root(surrogate_posterior)
-                # If variables were not given---i.e., we're creating new
-                # variables---then yield the new variables along with the surrogate
-                # posterior. This assumes an execution context such as
-                # `_extract_variables_from_coroutine_model` below that will capture and
-                # save the variables.
-                value_out = yield (surrogate_posterior if flat_variables
-                                   else (surrogate_posterior, variables))
-                if type(value_out) == list:
-                    if len(dist.event_shape) == 0:
-                        dist = prior_gen.send(tf.squeeze(value_out[0], -1))
-                    else:
-                        dist = prior_gen.send(value_out[0])
-
-                else:
-                    dist = prior_gen.send(value_out)
-                i += 1
-        except StopIteration:
-            pass
-
-    if variables is None:
-        # Run the generator to create variables, then call ourselves again
-        # to construct the surrogate JD from these variables. Note that we can't
-        # just create a JDC from the current `posterior_generator`, because it will
-        # try to build new variables on every invocation; the recursive call will
-        # define a new `posterior_generator` that knows about the variables we're
-        # about to create.
-        return _cf_surrogate_for_joint_distribution(
-            dist=dist,
-            base_distribution_surrogate_fn=base_distribution_surrogate_fn,
-            num_auxiliary_variables=num_auxiliary_variables,
-            num_layers=num_layers,
-            global_auxiliary_variables=global_auxiliary_variables,
-            variables=dist._model_unflatten(
-                # pylint: disable=protected-access
-                _extract_variables_from_coroutine_model(
-                    posterior_generator, seed=seed)))
-
-    # Temporary workaround for bijector caching issues with autobatched JDs.
-    surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
-        posterior_generator,
-        use_vectorized_map=dist.use_vectorized_map,
-        name=_get_name(dist))
-
-    # Ensure that the surrogate posterior structure matches that of the prior.
-    # todo: check me, do we need this? in case needs to be modified
-    #  if we use auxiliary variables, then the structure won't match the one of the
-    #  prior
-    '''try:
-      tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
-    except TypeError:
-      tokenize = lambda jd: jd._model_unflatten(
-        # pylint: disable=protected-access, g-long-lambda
-        range(len(jd._model_flatten(jd.dtype)))
+          dist = prior_gen.send(value_out)
+        i += 1
+    except StopIteration:
+      pass
+
+  if variables is None:
+    # Run the generator to create variables, then call ourselves again
+    # to construct the surrogate JD from these variables. Note that we can't
+    # just create a JDC from the current `posterior_generator`, because it will
+    # try to build new variables on every invocation; the recursive call will
+    # define a new `posterior_generator` that knows about the variables we're
+    # about to create.
+    return _cf_surrogate_for_joint_distribution(
+      dist=dist,
+      base_distribution_surrogate_fn=base_distribution_surrogate_fn,
+      num_auxiliary_variables=num_auxiliary_variables,
+      num_layers=num_layers,
+      global_auxiliary_variables=global_auxiliary_variables,
+      variables=dist._model_unflatten(
         # pylint: disable=protected-access
-      )
-      surrogate_posterior = restructure.Restructure(
-        output_structure=tokenize(dist),
-        input_structure=tokenize(surrogate_posterior))(
-        surrogate_posterior, name=_get_name(dist))'''
-    return surrogate_posterior, variables
+        _extract_variables_from_coroutine_model(
+          posterior_generator, seed=seed)))
+
+  # Temporary workaround for bijector caching issues with autobatched JDs.
+  surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
+    posterior_generator,
+    use_vectorized_map=dist.use_vectorized_map,
+    name=_get_name(dist))
+
+  # Ensure that the surrogate posterior structure matches that of the prior.
+  # todo: check me, do we need this? in case needs to be modified
+  #  if we use auxiliary variables, then the structure won't match the one of the
+  #  prior
+  '''try:
+    tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
+  except TypeError:
+    tokenize = lambda jd: jd._model_unflatten(
+      # pylint: disable=protected-access, g-long-lambda
+      range(len(jd._model_flatten(jd.dtype)))
+      # pylint: disable=protected-access
+    )
+    surrogate_posterior = restructure.Restructure(
+      output_structure=tokenize(dist),
+      input_structure=tokenize(surrogate_posterior))(
+      surrogate_posterior, name=_get_name(dist))'''
+  return surrogate_posterior, variables
 
 
 # todo: sample_shape is not used.. can remove?
@@ -524,84 +420,84 @@ def _cf_convex_update_for_base_distribution(dist,
                                             variables,
                                             sample_shape=None,
                                             seed=None):
-    """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
-
-    if variables is None:
-        actual_event_shape = dist.event_shape_tensor()
-        int_event_shape = int(actual_event_shape) if \
-            actual_event_shape.shape.as_list()[0] > 0 else 1
-        bijectors = [reshape.Reshape([-1],
-                                     event_shape_in=actual_event_shape +
-                                                    num_auxiliary_variables)]
-
-        bijectors.extend(
-            _build_highway_flow_block(
-                num_layers,
-                width=tf.reduce_prod(
-                    actual_event_shape + num_auxiliary_variables),
-                residual_fraction_initial_value=initial_prior_weight,
-                gate_first_n=int_event_shape, seed=seed))
-
-        bijectors.append(
-            reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+  """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
+
+  if variables is None:
+    actual_event_shape = dist.event_shape_tensor()
+    int_event_shape = int(actual_event_shape) if \
+      actual_event_shape.shape.as_list()[0] > 0 else 1
+    bijectors = [reshape.Reshape([-1],
+                                 event_shape_in=actual_event_shape +
+                                                num_auxiliary_variables)]
+
+    bijectors.extend(
+      _build_highway_flow_block(
+        num_layers,
+        width=tf.reduce_prod(
+          actual_event_shape + num_auxiliary_variables),
+        residual_fraction_initial_value=initial_prior_weight,
+        gate_first_n=int_event_shape, seed=seed))
 
-        variables = chain.Chain(bijectors=list(reversed(bijectors)))
+    bijectors.append(
+      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
 
-    if num_auxiliary_variables > 0:
-        batch_shape = global_auxiliary_variables.shape[0] if len(
-            global_auxiliary_variables.shape) > 1 else []
-
-        cascading_flows = split.Split(
-            [-1, num_auxiliary_variables])(
-            transformed_distribution.TransformedDistribution(
-                distribution=blockwise.Blockwise([
-                    batch_broadcast.BatchBroadcast(dist,
-                                                   to_shape=batch_shape),
-                    independent.Independent(
-                        deterministic.Deterministic(
-                            global_auxiliary_variables),
-                        reinterpreted_batch_ndims=1)]),
-                bijector=variables))
+    variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
-    else:
-        cascading_flows = transformed_distribution.TransformedDistribution(
-            distribution=dist,
-            bijector=variables)
+  if num_auxiliary_variables > 0:
+    batch_shape = global_auxiliary_variables.shape[0] if len(
+      global_auxiliary_variables.shape) > 1 else []
+
+    cascading_flows = split.Split(
+      [-1, num_auxiliary_variables])(
+      transformed_distribution.TransformedDistribution(
+        distribution=blockwise.Blockwise([
+          batch_broadcast.BatchBroadcast(dist,
+                                         to_shape=batch_shape),
+          independent.Independent(
+            deterministic.Deterministic(
+              global_auxiliary_variables),
+            reinterpreted_batch_ndims=1)]),
+        bijector=variables))
+
+  else:
+    cascading_flows = transformed_distribution.TransformedDistribution(
+      distribution=dist,
+      bijector=variables)
 
-    return cascading_flows, variables
+  return cascading_flows, variables
 
 
 def _extract_variables_from_coroutine_model(model_fn, seed=None):
-    """Extracts variables from a generator that yields (dist, variables) pairs."""
-    gen = model_fn()
-    try:
-        dist, dist_variables = next(gen)
-        flat_variables = [dist_variables]
-        while True:
-            seed, local_seed = samplers.split_seed(seed, n=2)
-            sampled_value = (dist.distribution.sample(seed=local_seed)
-                             if isinstance(dist, Root)
-                             else dist.sample(seed=local_seed))
-            dist, dist_variables = gen.send(
-                sampled_value)  # tf.concat(sampled_value, axis=0)
-            flat_variables.append(dist_variables)
-    except StopIteration:
-        pass
-    return flat_variables
+  """Extracts variables from a generator that yields (dist, variables) pairs."""
+  gen = model_fn()
+  try:
+    dist, dist_variables = next(gen)
+    flat_variables = [dist_variables]
+    while True:
+      seed, local_seed = samplers.split_seed(seed, n=2)
+      sampled_value = (dist.distribution.sample(seed=local_seed)
+                       if isinstance(dist, Root)
+                       else dist.sample(seed=local_seed))
+      dist, dist_variables = gen.send(
+        sampled_value)  # tf.concat(sampled_value, axis=0)
+      flat_variables.append(dist_variables)
+  except StopIteration:
+    pass
+  return flat_variables
 
 
 def _set_name(dist, name):
-    """Copies a distribution-like object, replacing its name."""
-    if hasattr(dist, 'copy'):
-        return dist.copy(name=name)
-    # Some distribution-like entities such as JointDistributionPinned don't
-    # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set
-    # the name directly.
-    dist = copy.copy(dist)
-    dist._name = name  # pylint: disable=protected-access
-    return dist
+  """Copies a distribution-like object, replacing its name."""
+  if hasattr(dist, 'copy'):
+    return dist.copy(name=name)
+  # Some distribution-like entities such as JointDistributionPinned don't
+  # inherit from tfd.Distribution and don't define `self.copy`. We'll try to set
+  # the name directly.
+  dist = copy.copy(dist)
+  dist._name = name  # pylint: disable=protected-access
+  return dist
 
 
 def _get_name(dist):
-    """Attempts to get a distribution's short name, excluding the name scope."""
-    return getattr(dist, 'parameters', {}).get('name', dist.name)
+  """Attempts to get a distribution's short name, excluding the name scope."""
+  return getattr(dist, 'parameters', {}).get('name', dist.name)

From 6305084e32d76563f4afbc3f9e426ddc33c2266b Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 3 Jun 2021 15:05:48 +0200
Subject: [PATCH 35/54] removed sample_shape

---
 .../python/experimental/vi/cascading_flows.py    | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 0a87ae4399..5ca9b97bdf 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -205,7 +205,6 @@ def _cf_surrogate_for_distribution(dist,
                                    num_auxiliary_variables,
                                    num_layers,
                                    global_auxiliary_variables=None,
-                                   sample_shape=None,
                                    variables=None,
                                    seed=None):
   """Recursively creates CF surrogates, and creates new variables if needed.
@@ -215,17 +214,12 @@ def _cf_surrogate_for_distribution(dist,
     base_distribution_surrogate_fn: Callable to build a surrogate posterior
       for a 'base' (non-meta and non-joint) distribution, with signature
       `surrogate_posterior, variables = base_distribution_fn(
-      dist, sample_shape=None, variables=None, seed=None)`.
+      dist, variables, global_auxiliary_variables, num_layers, seed=None)`.
     num_auxiliary_variables: The number of auxiliary variables to use for each
       variable in the input program.
     num_layers: Number of layers to use in each Highway Flow architecture.
     global_auxiliary_variables: The sampled global auxiliary variables
       (available only if using auxiliary variables). Default value: None.
-    sample_shape: Optional `Tensor` shape of samples drawn from `dist` by
-      `tfd.Sample` wrappers. If not `None`, the surrogate's event will include
-      independent sample dimensions, i.e., it will have event shape
-      `concat([sample_shape, dist.event_shape], axis=0)`.
-      Default value: `None`.
     variables: Optional nested structure of `tf.Variable`s returned from a
       previous call to `_cf_surrogate_for_distribution`. If `None`,
       new variables will be created; otherwise, constructs a surrogate posterior
@@ -244,9 +238,6 @@ def _cf_surrogate_for_distribution(dist,
       structure of such `tfb.Chain`s.
   """
 
-  # Apply any substitutions, while attempting to preserve the original name.
-  dist = _set_name(_as_substituted_distribution(dist), name=_get_name(dist))
-
   if hasattr(dist, '_model_coroutine'):
     surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
       dist,
@@ -258,7 +249,7 @@ def _cf_surrogate_for_distribution(dist,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
-      dist=dist, sample_shape=sample_shape, variables=variables,
+      dist=dist, variables=variables,
       global_auxiliary_variables=global_auxiliary_variables,
       num_layers=num_layers,
       seed=seed)
@@ -386,7 +377,6 @@ def posterior_generator(seed=seed):
         _extract_variables_from_coroutine_model(
           posterior_generator, seed=seed)))
 
-  # Temporary workaround for bijector caching issues with autobatched JDs.
   surrogate_posterior = joint_distribution_auto_batched.JointDistributionCoroutineAutoBatched(
     posterior_generator,
     use_vectorized_map=dist.use_vectorized_map,
@@ -411,14 +401,12 @@ def posterior_generator(seed=seed):
   return surrogate_posterior, variables
 
 
-# todo: sample_shape is not used.. can remove?
 def _cf_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables,
                                             num_layers,
                                             global_auxiliary_variables,
                                             variables,
-                                            sample_shape=None,
                                             seed=None):
   """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
 

From 2f27b952669390fbd2d5ba4519aac866491e09e1 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Thu, 3 Jun 2021 15:13:30 +0200
Subject: [PATCH 36/54] changed if statement and array slicing for value_out

---
 .../python/experimental/vi/cascading_flows.py              | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 5ca9b97bdf..2a6e8c3834 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -347,9 +347,12 @@ def posterior_generator(seed=seed):
         # save the variables.
         value_out = yield (surrogate_posterior if flat_variables
                            else (surrogate_posterior, variables))
-        if type(value_out) == list:
+
+        # When using auxiliary variables, `value_out` is the list
+        # `[latent_variables, auxiliary_variables]`; forward only the latents.
+        if num_auxiliary_variables>0:
           if len(dist.event_shape) == 0:
-            dist = prior_gen.send(tf.squeeze(value_out[0], -1))
+            dist = prior_gen.send(value_out[0][...,0])
           else:
             dist = prior_gen.send(value_out[0])
 

From 326a7660e69efd678b2e051a5ffc7e91ee085602 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 14 Jun 2021 15:26:14 +0200
Subject: [PATCH 37/54] changed docstrings for target_dist

---
 .../python/experimental/vi/cascading_flows.py         | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 2a6e8c3834..8c93bde2c6 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -142,11 +142,12 @@ def model_fn():
 
   ```python
   num_aux_vars=10
-  target_dist = tfd.Independent(tfd.Normal(loc=tf.reshape(
-    tf.Variable([tf.random.normal((1,)) for _ in range(num_aux_vars)]), -1),
-      scale=tf.reshape(tfp.util.TransformedVariable(
-        [tf.random.uniform((1,), minval=0.01, maxval=1.)
-      for _ in range(num_aux_vars)], bijector=tfb.Softplus()), -1)), 1)
+  event_len = len(prior.event_shape_tensor())
+  target_dist = tfd.Independent(
+  tfd.Normal(loc=tf.Variable(tf.random.normal((event_len,num_aux_vars))),
+             scale=tfp.util.TransformedVariable(
+             tf.random.uniform((event_len,num_aux_vars), minval=0.01, maxval=1.)
+             , bijector=tfb.Softplus())), 2)
 
   def target_log_prob_aux_vars(z_and_eps):
     z = [x[0] for x in z_and_eps[1:]]

From 1f295dacadf42ca412702c0de22614fc69a8a552 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 14 Jun 2021 16:40:44 +0200
Subject: [PATCH 38/54] expanded cf to cascading flows and changed bijector

---
 .../python/experimental/vi/cascading_flows.py | 67 +++++++++++--------
 .../experimental/vi/cascading_flows_test.py   | 35 +++++++++-
 2 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 8c93bde2c6..8599bf51a6 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -25,7 +25,11 @@
 import tensorflow.compat.v2 as tf
 
 from tensorflow_probability.python.bijectors import chain
+from tensorflow_probability.python.bijectors import identity
+from tensorflow_probability.python.bijectors import invert
+from tensorflow_probability.python.bijectors import joint_map
 from tensorflow_probability.python.bijectors import reshape
+from tensorflow_probability.python.bijectors import restructure
 from tensorflow_probability.python.bijectors import split
 from tensorflow_probability.python.distributions import batch_broadcast
 from tensorflow_probability.python.distributions import blockwise
@@ -43,13 +47,13 @@
 from tensorflow_probability.python.internal import samplers
 
 __all__ = [
-  'build_cf_surrogate_posterior'
+  'build_cascading_flow_surrogate_posterior'
 ]
 
 Root = joint_distribution_coroutine.JointDistributionCoroutine.Root
 
 
-def build_cf_surrogate_posterior(
+def build_cascading_flow_surrogate_posterior(
     prior,
     num_auxiliary_variables=0,
     initial_prior_weight=0.98,
@@ -84,7 +88,7 @@ def build_cf_surrogate_posterior(
     the layers will have `softplus` activation function, apart from the last one
     which will have linear activation. Default value: `3`.
     seed: Python `int` seed for random initialization.
-    name: Optional string. Default value: `build_cf_surrogate_posterior`.
+    name: Optional string. Default value: `build_cascading_flow_surrogate_posterior`.
 
   Returns:
     surrogate_posterior: A `tfd.JointDistributionCoroutineAutoBatched` instance
@@ -114,7 +118,7 @@ def model_fn():
 
   ```python
   surrogate_posterior =
-    tfp.experimental.vi.build_cf_surrogate_posterior(prior)
+    tfp.experimental.vi.build_cascading_flow_surrogate_posterior(prior)
   ```
 
   This creates a trainable joint distribution, defined by variables in
@@ -157,20 +161,20 @@ def target_log_prob_aux_vars(z_and_eps):
     return lp_z + lp_eps
 
   target_log_prob = lambda *values: target_log_prob_aux_vars(values)
-  cf_surrogate_posterior = build_cf_surrogate_posterior(prior,
+  cascading_flow_surrogate_posterior = build_cascading_flow_surrogate_posterior(prior,
                                           num_auxiliary_variables=num_aux_vars)
-  trainable_variables = list(cf_surrogate_posterior.trainable_variables)
+  trainable_variables = list(cascading_flow_surrogate_posterior.trainable_variables)
   trainable_variables.extend(list(target_dist.trainable_variables))
-  cf_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,
-                                        cf_surrogate_posterior,
+  cascading_flow_losses = tfp.vi.fit_surrogate_posterior(target_log_prob,
+                                        cascading_flow_surrogate_posterior,
                                         optimizer=tf.optimizers.Adam(0.01),
                                         num_steps=8000,
                                         sample_size=50,
                                         trainable_variables=trainable_variables)
 
-  cf_posterior_samples = cf_surrogate_posterior.sample(num_samples)
-  cf_posterior_samples = tf.convert_to_tensor(
-                                       [s[0] for s in cf_posterior_samples[1:]])
+  cascading_flow_posterior_samples = cascading_flow_surrogate_posterior.sample(num_samples)
+  cascading_flow_posterior_samples = tf.convert_to_tensor(
+                                       [s[0] for s in cascading_flow_posterior_samples[1:]])
   ```
 
   #### References
@@ -186,11 +190,11 @@ def target_log_prob_aux_vars(z_and_eps):
   models." International Conference on Machine Learning. PMLR, 2016.
 
   """
-  with tf.name_scope(name or 'build_cf_surrogate_posterior'):
-    surrogate_posterior, variables = _cf_surrogate_for_distribution(
+  with tf.name_scope(name or 'build_cascading_flow_surrogate_posterior'):
+    surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution(
       dist=prior,
       base_distribution_surrogate_fn=functools.partial(
-        _cf_convex_update_for_base_distribution,
+        _cascading_flow_convex_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
         num_auxiliary_variables=num_auxiliary_variables,
         num_layers=num_layers),
@@ -201,7 +205,7 @@ def target_log_prob_aux_vars(z_and_eps):
     return surrogate_posterior
 
 
-def _cf_surrogate_for_distribution(dist,
+def _cascading_flow_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
                                    num_auxiliary_variables,
                                    num_layers,
@@ -222,7 +226,7 @@ def _cf_surrogate_for_distribution(dist,
     global_auxiliary_variables: The sampled global auxiliary variables
       (available only if using auxiliary variables). Default value: None.
     variables: Optional nested structure of `tf.Variable`s returned from a
-      previous call to `_cf_surrogate_for_distribution`. If `None`,
+      previous call to `_cascading_flow_surrogate_for_distribution`. If `None`,
       new variables will be created; otherwise, constructs a surrogate posterior
       backed by the passed-in variables.
       Default value: `None`.
@@ -240,7 +244,7 @@ def _cf_surrogate_for_distribution(dist,
   """
 
   if hasattr(dist, '_model_coroutine'):
-    surrogate_posterior, variables = _cf_surrogate_for_joint_distribution(
+    surrogate_posterior, variables = _cascading_flow_surrogate_for_joint_distribution(
       dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       variables=variables,
@@ -277,7 +281,7 @@ def _build_highway_flow_block(num_layers, width,
   return bijectors
 
 
-def _cf_surrogate_for_joint_distribution(
+def _cascading_flow_surrogate_for_joint_distribution(
     dist, base_distribution_surrogate_fn, variables,
     num_auxiliary_variables, num_layers, global_auxiliary_variables,
     seed=None):
@@ -330,7 +334,7 @@ def posterior_generator(seed=seed):
           dist = dist.distribution
 
         seed, init_seed = samplers.split_seed(seed)
-        surrogate_posterior, variables = _cf_surrogate_for_distribution(
+        surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution(
           dist,
           base_distribution_surrogate_fn=base_distribution_surrogate_fn,
           num_auxiliary_variables=num_auxiliary_variables,
@@ -370,7 +374,7 @@ def posterior_generator(seed=seed):
     # try to build new variables on every invocation; the recursive call will
     # define a new `posterior_generator` that knows about the variables we're
     # about to create.
-    return _cf_surrogate_for_joint_distribution(
+    return _cascading_flow_surrogate_for_joint_distribution(
       dist=dist,
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       num_auxiliary_variables=num_auxiliary_variables,
@@ -386,6 +390,12 @@ def posterior_generator(seed=seed):
     use_vectorized_map=dist.use_vectorized_map,
     name=_get_name(dist))
 
+  '''tokenize = lambda jd: jd._model_unflatten(
+    # pylint: disable=protected-access, g-long-lambda
+    range(len(jd._model_flatten(jd.dtype)))
+    # pylint: disable=protected-access
+  )'''
+
   # Ensure that the surrogate posterior structure matches that of the prior.
   # todo: check me, do we need this? in case needs to be modified
   #  if we use auxiliary variables, then the structure won't match the one of the
@@ -405,7 +415,7 @@ def posterior_generator(seed=seed):
   return surrogate_posterior, variables
 
 
-def _cf_convex_update_for_base_distribution(dist,
+def _cascading_flow_convex_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables,
                                             num_layers,
@@ -436,21 +446,24 @@ def _cf_convex_update_for_base_distribution(dist,
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
   if num_auxiliary_variables > 0:
-    batch_shape = global_auxiliary_variables.shape[0] if len(
-      global_auxiliary_variables.shape) > 1 else []
+    flatten_event = reshape.Reshape(
+      event_shape_out=[-1],
+      event_shape_in=dist.event_shape_tensor())
 
     cascading_flows = split.Split(
       [-1, num_auxiliary_variables])(
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
-          batch_broadcast.BatchBroadcast(dist,
-                                         to_shape=batch_shape),
+          transformed_distribution.TransformedDistribution(
+            distribution=dist, bijector=flatten_event),
           independent.Independent(
-            deterministic.Deterministic(
-              global_auxiliary_variables),
+            deterministic.Deterministic(global_auxiliary_variables),
             reinterpreted_batch_ndims=1)]),
         bijector=variables))
 
+    cascading_flows = joint_map.JointMap(
+      [invert.Invert(flatten_event), identity.Identity()])(cascading_flows)
+
   else:
     cascading_flows = transformed_distribution.TransformedDistribution(
       distribution=dist,
diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 0b45486e6c..bfa556d987 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -24,14 +24,45 @@
 import tensorflow_probability as tfp
 from tensorflow_probability.python.internal import prefer_static as ps
 from tensorflow_probability.python.internal import test_util
+from tensorflow.python.util import nest
 
 # Dependency imports
 
 tfb = tfp.bijectors
 tfd = tfp.distributions
-
+Root = tfd.JointDistributionCoroutine.Root
 
 @test_util.test_all_tf_execution_regimes
+class CascadingFlowTests(test_util.TestCase):
+
+  def test_shapes(self):
+    @tfd.JointDistributionCoroutine
+    def test_shapes_model():
+      # Matrix-valued random variable with batch shape [3].
+      A = yield Root(
+        tfd.WishartTriL(df=2, scale_tril=tf.eye(2, batch_shape=[3]), name='A'))
+      # Vector-valued random variable with batch shape [3] (inherited from `A`)
+      x = yield tfd.MultivariateNormalDiag(loc=tf.zeros([2]),
+                                           scale_tril=tf.linalg.cholesky(A),
+                                           name='x')
+      # Scalar-valued random variable, with batch shape `[4, 3]`.
+      y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([4, 3]))
+
+    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10)
+
+    x1 = test_shapes_model.sample()
+    x2 = nest.map_structure_up_to(
+      x1,
+      # Strip auxiliary variables.
+      lambda *rv_and_aux: rv_and_aux[0],
+      surrogate_posterior.sample())
+
+    # Assert that samples from the surrogate have the same shape as the prior.
+    get_shapes = lambda x: tf.nest.map_structure(lambda xp: xp.shape, x)
+    self.assertAllEqualNested(get_shapes(x1), get_shapes(x2))
+
+
+'''@test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
   def _expected_num_trainable_variables(self, prior_dist, num_layers):
@@ -334,7 +365,7 @@ def centered_horseshoe(ndims=100):
                           tfd.Normal)
     self.assertIsInstance(surrogate_dists.local_scale.distribution,
                           tfd.Normal)
-    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)
+    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)'''
 
 
 if __name__ == '__main__':

From 487f7dd800859d7c5ebd1e69075d4303c252b2c0 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 14 Jun 2021 17:10:34 +0200
Subject: [PATCH 39/54] removed TestCFDistributionSubstitution

---
 .../experimental/vi/cascading_flows_test.py   | 63 +------------------
 1 file changed, 1 insertion(+), 62 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index bfa556d987..ba03cf19ca 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -62,7 +62,7 @@ def test_shapes_model():
     self.assertAllEqualNested(get_shapes(x1), get_shapes(x2))
 
 
-'''@test_util.test_all_tf_execution_regimes
+@test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
   def _expected_num_trainable_variables(self, prior_dist, num_layers):
@@ -307,66 +307,5 @@ def nested_model():
     return tfd.JointDistributionCoroutineAutoBatched(nested_model)
 
 
-@test_util.test_all_tf_execution_regimes
-class TestCFDistributionSubstitution(test_util.TestCase):
-
-  def test_default_substitutes_trainable_families(self):
-    @tfd.JointDistributionCoroutineAutoBatched
-    def model():
-      yield tfd.Sample(
-        tfd.Uniform(low=-2., high=7.),
-        sample_shape=[2],
-        name='a')
-      yield tfd.HalfNormal(1., name='b')
-      yield tfd.Exponential(rate=[1., 2.], name='c')
-      yield tfd.Chi2(df=3., name='d')
-
-    surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
-      model)
-    self.assertAllEqualNested(model.event_shape, surrogate.event_shape)
-
-    surrogate_dists, _ = surrogate.sample_distributions()
-    self.assertIsInstance(surrogate_dists.a, tfd.Independent)
-    self.assertIsInstance(surrogate_dists.a.distribution,
-                          tfd.TransformedDistribution)
-    self.assertIsInstance(surrogate_dists.a.distribution.distribution,
-                          tfd.Beta)
-    self.assertIsInstance(surrogate_dists.b, tfd.TruncatedNormal)
-    self.assertIsInstance(surrogate_dists.c, tfd.Gamma)
-    self.assertIsInstance(surrogate_dists.d, tfd.Gamma)
-
-  def test_can_specify_custom_substitution(self):
-    @tfd.JointDistributionCoroutineAutoBatched
-    def centered_horseshoe(ndims=100):
-      global_scale = yield tfd.HalfCauchy(
-        loc=0., scale=1., name='global_scale')
-      local_scale = yield tfd.HalfCauchy(
-        loc=0., scale=tf.ones([ndims]), name='local_scale')
-      yield tfd.Normal(
-        loc=0., scale=tf.sqrt(global_scale * local_scale),
-        name='weights')
-
-    tfp.experimental.vi.register_asvi_substitution_rule(
-      condition=tfd.HalfCauchy,
-      substitution_fn=(
-        lambda d: tfb.Softplus(1e-6)(
-          tfd.Normal(loc=d.loc, scale=d.scale))))
-    surrogate = tfp.experimental.vi.build_cf_surrogate_posterior(
-      centered_horseshoe)
-    self.assertAllEqualNested(centered_horseshoe.event_shape,
-                              surrogate.event_shape)
-
-    # If the surrogate was built with names or structure differing from the
-    # model, so that it had to be `tfb.Restructure`'d, then this
-    # sample_distributions call will fail because the surrogate isn't an
-    # instance of tfd.JointDistribution.
-    surrogate_dists, _ = surrogate.sample_distributions()
-    self.assertIsInstance(surrogate_dists.global_scale.distribution,
-                          tfd.Normal)
-    self.assertIsInstance(surrogate_dists.local_scale.distribution,
-                          tfd.Normal)
-    self.assertIsInstance(surrogate_dists.weights, tfd.Normal)'''
-
-
 if __name__ == '__main__':
   tf.test.main()

From e361a46b31731440e5bd6a33e9d57ad01734f004 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 14 Jun 2021 17:21:17 +0200
Subject: [PATCH 40/54] removed convex from name

---
 .../python/experimental/vi/cascading_flows.py                 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 8599bf51a6..e0f94a28c5 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -194,7 +194,7 @@ def target_log_prob_aux_vars(z_and_eps):
     surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution(
       dist=prior,
       base_distribution_surrogate_fn=functools.partial(
-        _cascading_flow_convex_update_for_base_distribution,
+        _cascading_flow_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
         num_auxiliary_variables=num_auxiliary_variables,
         num_layers=num_layers),
@@ -415,7 +415,7 @@ def posterior_generator(seed=seed):
   return surrogate_posterior, variables
 
 
-def _cascading_flow_convex_update_for_base_distribution(dist,
+def _cascading_flow_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables,
                                             num_layers,

From 70ffe7b8de755d93232503ffe7fdc0a7f812a004 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 14 Jun 2021 17:29:55 +0200
Subject: [PATCH 41/54] fixed comment

---
 .../python/experimental/vi/cascading_flows.py                 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index e0f94a28c5..368436933e 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -225,7 +225,7 @@ def _cascading_flow_surrogate_for_distribution(dist,
     num_layers: Number of layers to use in each Highway Flow architecture.
     global_auxiliary_variables: The sampled global auxiliary variables
       (available only if using auxiliary variables). Default value: None.
-    variables: Optional nested structure of `tf.Variable`s returned from a
+    variables: Optional nested structure containing `tf.Variable`s returned from a
       previous call to `_cascading_flow_surrogate_for_distribution`. If `None`,
       new variables will be created; otherwise, constructs a surrogate posterior
       backed by the passed-in variables.
@@ -236,7 +236,7 @@ def _cascading_flow_surrogate_for_distribution(dist,
       surrogate posterior distribution, with the same structure and `name` as
       `dist`, and with addition of global and local auxiliary variables if
       `num_auxiliary_variables > 0`.
-    variables: Nested structure of `tf.Variable` trainable parameters for the
+    variables: Nested structure containing `tf.Variable` trainable parameters for the
       surrogate posterior. If `dist` is a base distribution, this is
       a `tfb.Chain` of bijectors containing HighwayFlow blocks and `Reshape`
       bijectors. If `dist` is a joint distribution, this is a `dist.dtype`

From 398d4598547f15e7bb59ac8fa677f2172a4bc3a8 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 15 Jun 2021 14:27:50 +0200
Subject: [PATCH 42/54] adjusted names

---
 tensorflow_probability/python/experimental/vi/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/__init__.py b/tensorflow_probability/python/experimental/vi/__init__.py
index cc5530300a..1f2fa2f900 100644
--- a/tensorflow_probability/python/experimental/vi/__init__.py
+++ b/tensorflow_probability/python/experimental/vi/__init__.py
@@ -17,7 +17,7 @@
 from tensorflow_probability.python.experimental.vi import util
 from tensorflow_probability.python.experimental.vi.automatic_structured_vi import build_asvi_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.automatic_structured_vi import register_asvi_substitution_rule
-from tensorflow_probability.python.experimental.vi.cascading_flows import build_cf_surrogate_posterior
+from tensorflow_probability.python.experimental.vi.cascading_flows import build_cascading_flow_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_affine_surrogate_posterior_from_base_distribution
 from tensorflow_probability.python.experimental.vi.surrogate_posteriors import build_factored_surrogate_posterior
@@ -30,7 +30,7 @@
     'build_affine_surrogate_posterior',
     'build_affine_surrogate_posterior_from_base_distribution',
     'build_asvi_surrogate_posterior',
-    'build_cf_surrogate_posterior',
+    'build_cascading_flow_surrogate_posterior',
     'build_factored_surrogate_posterior',
     'build_split_flow_surrogate_posterior',
     'build_trainable_location_scale_distribution',

From 2c44c483c92122b1759beecd0eaeb5ba8ce808a6 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Tue, 15 Jun 2021 14:28:10 +0200
Subject: [PATCH 43/54] fixed dimensions of prior

---
 .../python/experimental/vi/cascading_flows.py | 39 ++++++++++---------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 368436933e..7608c07945 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -356,11 +356,7 @@ def posterior_generator(seed=seed):
         # When using auxiliary variables, value out is a list containing
         # [latent_variables, auxiliary_variables].
         if num_auxiliary_variables>0:
-          if len(dist.event_shape) == 0:
-            dist = prior_gen.send(value_out[0][...,0])
-          else:
-            dist = prior_gen.send(value_out[0])
-
+          dist = prior_gen.send(value_out[0])
         else:
           dist = prior_gen.send(value_out)
         i += 1
@@ -423,32 +419,35 @@ def _cascading_flow_update_for_base_distribution(dist,
                                             variables,
                                             seed=None):
   """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
+  event_shape = dist.event_shape_tensor()
+  flat_event_shape = tf.nest.flatten(event_shape)
+  flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape)
+  ndims = int(tf.reduce_sum(flat_event_size))
+  flatten_event = reshape.Reshape(
+    event_shape_out=[-1],
+    event_shape_in=dist.event_shape_tensor())
 
   if variables is None:
-    actual_event_shape = dist.event_shape_tensor()
-    int_event_shape = int(actual_event_shape) if \
-      actual_event_shape.shape.as_list()[0] > 0 else 1
-    bijectors = [reshape.Reshape([-1],
-                                 event_shape_in=actual_event_shape +
-                                                num_auxiliary_variables)]
+
+    '''bijectors = [reshape.Reshape([-1],
+                                 event_shape_in=ndims +
+                                                num_auxiliary_variables)]'''
+    bijectors = []
 
     bijectors.extend(
       _build_highway_flow_block(
         num_layers,
         width=tf.reduce_prod(
-          actual_event_shape + num_auxiliary_variables),
+          ndims + num_auxiliary_variables),
         residual_fraction_initial_value=initial_prior_weight,
-        gate_first_n=int_event_shape, seed=seed))
+        gate_first_n=ndims, seed=seed))
 
-    bijectors.append(
-      reshape.Reshape(actual_event_shape + num_auxiliary_variables))
+    '''bijectors.append(
+      reshape.Reshape(ndims + num_auxiliary_variables))'''
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
   if num_auxiliary_variables > 0:
-    flatten_event = reshape.Reshape(
-      event_shape_out=[-1],
-      event_shape_in=dist.event_shape_tensor())
 
     cascading_flows = split.Split(
       [-1, num_auxiliary_variables])(
@@ -457,7 +456,7 @@ def _cascading_flow_update_for_base_distribution(dist,
           transformed_distribution.TransformedDistribution(
             distribution=dist, bijector=flatten_event),
           independent.Independent(
-            deterministic.Deterministic(global_auxiliary_variables),
+            deterministic.Deterministic(global_auxiliary_variables, ),
             reinterpreted_batch_ndims=1)]),
         bijector=variables))
 
@@ -469,6 +468,8 @@ def _cascading_flow_update_for_base_distribution(dist,
       distribution=dist,
       bijector=variables)
 
+    cascading_flows = invert.Invert(flatten_event)(cascading_flows)
+
   return cascading_flows, variables
 
 

From 5fb11ec6aa7e0f3952b50992045d0e8e6749440b Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Wed, 16 Jun 2021 11:27:04 +0200
Subject: [PATCH 44/54] re-added BatchBroadcast

---
 .../python/experimental/vi/cascading_flows.py | 56 ++++++++++---------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 7608c07945..609c9d645f 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -386,28 +386,31 @@ def posterior_generator(seed=seed):
     use_vectorized_map=dist.use_vectorized_map,
     name=_get_name(dist))
 
-  '''tokenize = lambda jd: jd._model_unflatten(
+  tokenize = lambda jd: jd._model_unflatten(
     # pylint: disable=protected-access, g-long-lambda
     range(len(jd._model_flatten(jd.dtype)))
     # pylint: disable=protected-access
-  )'''
-
-  # Ensure that the surrogate posterior structure matches that of the prior.
-  # todo: check me, do we need this? in case needs to be modified
-  #  if we use auxiliary variables, then the structure won't match the one of the
-  #  prior
-  '''try:
-    tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
-  except TypeError:
-    tokenize = lambda jd: jd._model_unflatten(
-      # pylint: disable=protected-access, g-long-lambda
-      range(len(jd._model_flatten(jd.dtype)))
-      # pylint: disable=protected-access
-    )
+  )
+
+  dist_tokens = tokenize(dist)
+
+  if num_auxiliary_variables == 0:
+    try:
+      tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
+    except TypeError:
+      surrogate_posterior = restructure.Restructure(
+        output_structure=tokenize(dist),
+        input_structure=tokenize(surrogate_posterior))(
+        surrogate_posterior, name=_get_name(dist))
+
+  '''else:
     surrogate_posterior = restructure.Restructure(
-      output_structure=tokenize(dist),
+      output_structure=(
+        tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens),
+        [0] + [2 * k + 2 for k in tf.nest.flatten(dist_tokens)]),
       input_structure=tokenize(surrogate_posterior))(
       surrogate_posterior, name=_get_name(dist))'''
+
   return surrogate_posterior, variables
 
 
@@ -422,16 +425,16 @@ def _cascading_flow_update_for_base_distribution(dist,
   event_shape = dist.event_shape_tensor()
   flat_event_shape = tf.nest.flatten(event_shape)
   flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape)
-  ndims = int(tf.reduce_sum(flat_event_size))
+  try:
+    ndims = int(tf.reduce_sum(flat_event_size))
+  except:
+    a=0
   flatten_event = reshape.Reshape(
     event_shape_out=[-1],
     event_shape_in=dist.event_shape_tensor())
 
   if variables is None:
 
-    '''bijectors = [reshape.Reshape([-1],
-                                 event_shape_in=ndims +
-                                                num_auxiliary_variables)]'''
     bijectors = []
 
     bijectors.extend(
@@ -442,19 +445,17 @@ def _cascading_flow_update_for_base_distribution(dist,
         residual_fraction_initial_value=initial_prior_weight,
         gate_first_n=ndims, seed=seed))
 
-    '''bijectors.append(
-      reshape.Reshape(ndims + num_auxiliary_variables))'''
-
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
   if num_auxiliary_variables > 0:
-
+    batch_shape = global_auxiliary_variables.shape[0] if len(
+      global_auxiliary_variables.shape) > 1 else []
     cascading_flows = split.Split(
       [-1, num_auxiliary_variables])(
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
-          transformed_distribution.TransformedDistribution(
-            distribution=dist, bijector=flatten_event),
+          batch_broadcast.BatchBroadcast(transformed_distribution.TransformedDistribution(
+            distribution=dist, bijector=flatten_event), to_shape=batch_shape),
           independent.Independent(
             deterministic.Deterministic(global_auxiliary_variables, ),
             reinterpreted_batch_ndims=1)]),
@@ -465,7 +466,8 @@ def _cascading_flow_update_for_base_distribution(dist,
 
   else:
     cascading_flows = transformed_distribution.TransformedDistribution(
-      distribution=dist,
+      distribution=transformed_distribution.TransformedDistribution(
+            distribution=dist, bijector=flatten_event),
       bijector=variables)
 
     cascading_flows = invert.Invert(flatten_event)(cascading_flows)

From 5a20976786e5b42e84db5218b3d50ee6db652bb3 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Wed, 16 Jun 2021 11:27:44 +0200
Subject: [PATCH 45/54] small fixes

---
 .../python/experimental/vi/cascading_flows_test.py   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index ba03cf19ca..c6d0531846 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -42,13 +42,13 @@ def test_shapes_model():
       A = yield Root(
         tfd.WishartTriL(df=2, scale_tril=tf.eye(2, batch_shape=[3]), name='A'))
       # Vector-valued random variable with batch shape [3] (inherited from `A`)
-      x = yield tfd.MultivariateNormalDiag(loc=tf.zeros([2]),
+      x = yield tfd.MultivariateNormalTriL(loc=tf.zeros([2]),
                                            scale_tril=tf.linalg.cholesky(A),
                                            name='x')
       # Scalar-valued random variable, with batch shape `[4, 3]`.
       y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([4, 3]))
 
-    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10)
+    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10)
 
     x1 = test_shapes_model.sample()
     x2 = nest.map_structure_up_to(
@@ -81,7 +81,7 @@ def _expected_num_trainable_variables(self, prior_dist, num_layers):
   def test_dims_and_gradients(self):
     prior_dist = self.make_prior_dist()
     num_layers = 3
-    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist, num_layers=num_layers)
 
     # Test that the correct number of trainable variables are being tracked
@@ -110,7 +110,7 @@ def test_dims_and_gradients(self):
   def test_initialization_is_deterministic_following_seed(self):
     prior_dist = self.make_prior_dist()
 
-    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist,
       seed=test_util.test_seed(sampler_type='stateless'))
     self.evaluate(
@@ -118,7 +118,7 @@ def test_initialization_is_deterministic_following_seed(self):
     posterior_sample = surrogate_posterior.sample(
       seed=test_util.test_seed(sampler_type='stateless'))
 
-    surrogate_posterior2 = tfp.experimental.vi.build_cf_surrogate_posterior(
+    surrogate_posterior2 = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist,
       seed=test_util.test_seed(sampler_type='stateless'))
     self.evaluate(
@@ -175,7 +175,7 @@ def test_fitting_surrogate_posterior(self):
 
     prior_dist = self.make_prior_dist()
     observations = self.get_observations(prior_dist)
-    surrogate_posterior = tfp.experimental.vi.build_cf_surrogate_posterior(
+    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist)
     target_log_prob = self.get_target_log_prob(observations, prior_dist)
 

From 14e34ee7145089abdc74e2b960add1171dfd62a9 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Wed, 16 Jun 2021 11:28:42 +0200
Subject: [PATCH 46/54] removed try except

---
 .../python/experimental/vi/cascading_flows.py                | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 609c9d645f..14dce602cd 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -425,10 +425,7 @@ def _cascading_flow_update_for_base_distribution(dist,
   event_shape = dist.event_shape_tensor()
   flat_event_shape = tf.nest.flatten(event_shape)
   flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape)
-  try:
-    ndims = int(tf.reduce_sum(flat_event_size))
-  except:
-    a=0
+  ndims = int(tf.reduce_sum(flat_event_size))
   flatten_event = reshape.Reshape(
     event_shape_out=[-1],
     event_shape_in=dist.event_shape_tensor())

From fa69f67c9cb0a50082a2e92e13b828cabc14798b Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Wed, 16 Jun 2021 11:55:06 +0200
Subject: [PATCH 47/54] added support for distributions with constrained
 support and test

---
 .../python/experimental/vi/cascading_flows.py  | 18 ++++++++++--------
 .../experimental/vi/cascading_flows_test.py    | 11 +++++++++--
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 14dce602cd..62c1e61d8f 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -399,7 +399,7 @@ def posterior_generator(seed=seed):
       tf.nest.assert_same_structure(dist.dtype, surrogate_posterior.dtype)
     except TypeError:
       surrogate_posterior = restructure.Restructure(
-        output_structure=tokenize(dist),
+        output_structure=dist_tokens,
         input_structure=tokenize(surrogate_posterior))(
         surrogate_posterior, name=_get_name(dist))
 
@@ -426,10 +426,14 @@ def _cascading_flow_update_for_base_distribution(dist,
   flat_event_shape = tf.nest.flatten(event_shape)
   flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape)
   ndims = int(tf.reduce_sum(flat_event_size))
-  flatten_event = reshape.Reshape(
+  constraining_bijector = dist.experimental_default_event_space_bijector()
+  flatten_bijector = reshape.Reshape(
     event_shape_out=[-1],
     event_shape_in=dist.event_shape_tensor())
 
+  constraining_and_flattening_bijector = chain.Chain([flatten_bijector, constraining_bijector])
+  processed_dist = transformed_distribution.TransformedDistribution(distribution=dist,
+                                                                    bijector=constraining_and_flattening_bijector)
   if variables is None:
 
     bijectors = []
@@ -451,23 +455,21 @@ def _cascading_flow_update_for_base_distribution(dist,
       [-1, num_auxiliary_variables])(
       transformed_distribution.TransformedDistribution(
         distribution=blockwise.Blockwise([
-          batch_broadcast.BatchBroadcast(transformed_distribution.TransformedDistribution(
-            distribution=dist, bijector=flatten_event), to_shape=batch_shape),
+          batch_broadcast.BatchBroadcast(processed_dist, to_shape=batch_shape),
           independent.Independent(
             deterministic.Deterministic(global_auxiliary_variables, ),
             reinterpreted_batch_ndims=1)]),
         bijector=variables))
 
     cascading_flows = joint_map.JointMap(
-      [invert.Invert(flatten_event), identity.Identity()])(cascading_flows)
+      [invert.Invert(constraining_and_flattening_bijector), identity.Identity()])(cascading_flows)
 
   else:
     cascading_flows = transformed_distribution.TransformedDistribution(
-      distribution=transformed_distribution.TransformedDistribution(
-            distribution=dist, bijector=flatten_event),
+      distribution=processed_dist,
       bijector=variables)
 
-    cascading_flows = invert.Invert(flatten_event)(cascading_flows)
+    cascading_flows = invert.Invert(constraining_and_flattening_bijector)(cascading_flows)
 
   return cascading_flows, variables
 
diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index c6d0531846..8bcbc6a26c 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -65,7 +65,7 @@ def test_shapes_model():
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
-  def _expected_num_trainable_variables(self, prior_dist, num_layers):
+  '''def _expected_num_trainable_variables(self, prior_dist, num_layers):
     """Infers the expected number of trainable variables for a non-nested JD."""
     prior_dists = prior_dist._get_single_sample_distributions()  # pylint: disable=protected-access
     expected_num_trainable_variables = 0
@@ -127,7 +127,14 @@ def test_initialization_is_deterministic_following_seed(self):
       seed=test_util.test_seed(sampler_type='stateless'))
 
     self.assertAllEqualNested(posterior_sample, posterior_sample2)
-
+'''
+  def test_surrogate_and_prior_have_same_domain(self):
+    prior_dist = self.make_prior_dist()
+    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
+      prior=prior_dist,
+      seed=test_util.test_seed(sampler_type='stateless'))
+    self.assertAllFinite(prior_dist.log_prob(
+      surrogate_posterior.sample(10, seed=test_util.test_seed())))
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase,

From e296c6f3474f7e6b3ea350e2ce0b3aa39b98811d Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 18 Jun 2021 13:59:07 +0200
Subject: [PATCH 48/54] fixed output reshape

---
 .../python/experimental/vi/cascading_flows.py      | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 62c1e61d8f..c9ea70c04e 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -386,11 +386,9 @@ def posterior_generator(seed=seed):
     use_vectorized_map=dist.use_vectorized_map,
     name=_get_name(dist))
 
-  tokenize = lambda jd: jd._model_unflatten(
-    # pylint: disable=protected-access, g-long-lambda
-    range(len(jd._model_flatten(jd.dtype)))
-    # pylint: disable=protected-access
-  )
+  tokenize = lambda jd: tf.nest.pack_sequence_as(
+    jd.dtype,
+    range(len(tf.nest.flatten(jd.dtype))))
 
   dist_tokens = tokenize(dist)
 
@@ -403,13 +401,13 @@ def posterior_generator(seed=seed):
         input_structure=tokenize(surrogate_posterior))(
         surrogate_posterior, name=_get_name(dist))
 
-  '''else:
+  else:
     surrogate_posterior = restructure.Restructure(
       output_structure=(
         tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens),
         [0] + [2 * k + 2 for k in tf.nest.flatten(dist_tokens)]),
       input_structure=tokenize(surrogate_posterior))(
-      surrogate_posterior, name=_get_name(dist))'''
+      surrogate_posterior, name=_get_name(dist))
 
   return surrogate_posterior, variables
 
@@ -449,7 +447,7 @@ def _cascading_flow_update_for_base_distribution(dist,
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
   if num_auxiliary_variables > 0:
-    batch_shape = global_auxiliary_variables.shape[0] if len(
+    batch_shape = global_auxiliary_variables.shape[:-1] if len(
       global_auxiliary_variables.shape) > 1 else []
     cascading_flows = split.Split(
       [-1, num_auxiliary_variables])(

From 55155e8f29b7b66b57728a1e28df8a325a448f49 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 21 Jun 2021 10:54:18 +0200
Subject: [PATCH 49/54] removed discrete test

---
 .../experimental/vi/cascading_flows_test.py   | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 8bcbc6a26c..5acca06910 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -36,7 +36,7 @@
 class CascadingFlowTests(test_util.TestCase):
 
   def test_shapes(self):
-    @tfd.JointDistributionCoroutine
+
     def test_shapes_model():
       # Matrix-valued random variable with batch shape [3].
       A = yield Root(
@@ -45,10 +45,11 @@ def test_shapes_model():
       x = yield tfd.MultivariateNormalTriL(loc=tf.zeros([2]),
                                            scale_tril=tf.linalg.cholesky(A),
                                            name='x')
-      # Scalar-valued random variable, with batch shape `[4, 3]`.
-      y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([4, 3]))
+      # Scalar-valued random variable, with batch shape `[3]`.
+      y = yield tfd.Normal(loc=tf.reduce_sum(x, axis=-1), scale=tf.ones([3]))
 
-    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(test_shapes_model, num_auxiliary_variables=10)
+    prior = tfd.JointDistributionCoroutineAutoBatched(test_shapes_model, batch_ndims=1)
+    surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(prior) #num_auxiliary_variables=10)
 
     x1 = test_shapes_model.sample()
     x2 = nest.map_structure_up_to(
@@ -65,7 +66,7 @@ def test_shapes_model():
 @test_util.test_all_tf_execution_regimes
 class _TrainableCFSurrogate(object):
 
-  '''def _expected_num_trainable_variables(self, prior_dist, num_layers):
+  def _expected_num_trainable_variables(self, prior_dist, num_layers):
     """Infers the expected number of trainable variables for a non-nested JD."""
     prior_dists = prior_dist._get_single_sample_distributions()  # pylint: disable=protected-access
     expected_num_trainable_variables = 0
@@ -127,7 +128,7 @@ def test_initialization_is_deterministic_following_seed(self):
       seed=test_util.test_seed(sampler_type='stateless'))
 
     self.assertAllEqualNested(posterior_sample, posterior_sample2)
-'''
+
   def test_surrogate_and_prior_have_same_domain(self):
     prior_dist = self.make_prior_dist()
     surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
@@ -269,20 +270,6 @@ def _prior_model_fn():
 
     return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
 
-
-@test_util.test_all_tf_execution_regimes
-class CFSurrogatePosteriorTestDiscreteLatent(
-  test_util.TestCase, _TrainableCFSurrogate):
-
-  def make_prior_dist(self):
-    def _prior_model_fn():
-      a = yield tfd.Bernoulli(logits=0.5, name='a')
-      yield tfd.Normal(loc=2. * tf.cast(a, tf.float32) - 1.,
-                       scale=1., name='b')
-
-    return tfd.JointDistributionCoroutineAutoBatched(_prior_model_fn)
-
-
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestNesting(test_util.TestCase,
                                       _TrainableCFSurrogate):

From 9eb8b97f1b70b75c7921179a8162b9b4b669aefd Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 21 Jun 2021 10:55:55 +0200
Subject: [PATCH 50/54] working on batch shape

---
 .../python/experimental/vi/cascading_flows.py             | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index c9ea70c04e..fec1d8c4a1 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -291,6 +291,10 @@ def _cascading_flow_surrogate_for_joint_distribution(
   flat_variables = dist._model_flatten(
     variables) if variables else None  # pylint: disable=protected-access
   prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
+  prior_batch_shape = dist.batch_shape_tensor()
+  if tf.nest.is_nested(prior_batch_shape):
+    prior_batch_shape = functools.reduce(tf.broadcast_static_shape,
+                                         dist._model_flatten(prior_batch_shape))
 
   def posterior_generator(seed=seed):
     prior_gen = prior_coroutine()
@@ -312,8 +316,8 @@ def posterior_generator(seed=seed):
         variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
       eps = transformed_distribution.TransformedDistribution(
-        distribution=sample.Sample(normal.Normal(0., 1.),
-                                   num_auxiliary_variables),
+        distribution=batch_broadcast.BatchBroadcast(sample.Sample(normal.Normal(0., 1.),
+                                   num_auxiliary_variables), prior_batch_shape),
         bijector=variables)
 
       eps = Root(eps)

From 0c0fb39af14232ce18c1391833d1346c40a608c2 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 21 Jun 2021 17:34:01 +0200
Subject: [PATCH 51/54] small bug fixed

---
 .../python/experimental/vi/cascading_flows_test.py              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index 5acca06910..fb0890b12c 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -51,7 +51,7 @@ def test_shapes_model():
     prior = tfd.JointDistributionCoroutineAutoBatched(test_shapes_model, batch_ndims=1)
     surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(prior) #num_auxiliary_variables=10)
 
-    x1 = test_shapes_model.sample()
+    x1 = surrogate_posterior.sample()
     x2 = nest.map_structure_up_to(
       x1,
       # Strip auxiliary variables.

From 8d8777d11e044d11cf2c87f132a0f5abed5335b5 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Fri, 23 Jul 2021 13:36:00 +0200
Subject: [PATCH 52/54] changed shapes to static and added auxiliary variables
 without global flow

---
 .../python/experimental/vi/cascading_flows.py | 59 +++++++++++++++----
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index fec1d8c4a1..3dcc12fe59 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -42,9 +42,10 @@
 from tensorflow_probability.python.distributions import normal
 from tensorflow_probability.python.distributions import sample
 from tensorflow_probability.python.distributions import transformed_distribution
-from tensorflow_probability.python.experimental.bijectors import \
-    build_trainable_highway_flow
+
+from tensorflow_probability.python.experimental.bijectors import build_trainable_highway_flow
 from tensorflow_probability.python.internal import samplers
+from tensorflow_probability.python.internal import prefer_static as ps
 
 __all__ = [
   'build_cascading_flow_surrogate_posterior'
@@ -52,12 +53,13 @@
 
 Root = joint_distribution_coroutine.JointDistributionCoroutine.Root
 
-
+# TODO: add a check that if use_global_auxiliary_variables is True then num_auxiliary_variables must be >= 1
 def build_cascading_flow_surrogate_posterior(
     prior,
     num_auxiliary_variables=0,
     initial_prior_weight=0.98,
     num_layers=3,
+    use_global_auxiliary_variables=False,
     seed=None,
     name=None):
   """Builds a structured surrogate posterior with cascading flows.
@@ -190,6 +192,8 @@ def target_log_prob_aux_vars(z_and_eps):
   models." International Conference on Machine Learning. PMLR, 2016.
 
   """
+  if num_auxiliary_variables == 0 and use_global_auxiliary_variables == True:
+    raise ValueError('cannot use global auxiliary variables if auxiliary variables is 0')
   with tf.name_scope(name or 'build_cascading_flow_surrogate_posterior'):
     surrogate_posterior, variables = _cascading_flow_surrogate_for_distribution(
       dist=prior,
@@ -197,9 +201,11 @@ def target_log_prob_aux_vars(z_and_eps):
         _cascading_flow_update_for_base_distribution,
         initial_prior_weight=initial_prior_weight,
         num_auxiliary_variables=num_auxiliary_variables,
-        num_layers=num_layers),
+        num_layers=num_layers,
+        use_global_auxiliary_variables=use_global_auxiliary_variables,),
       num_auxiliary_variables=num_auxiliary_variables,
       num_layers=num_layers,
+      use_global_auxiliary_variables=use_global_auxiliary_variables,
       seed=seed)
     surrogate_posterior.also_track = variables
     return surrogate_posterior
@@ -209,6 +215,7 @@ def _cascading_flow_surrogate_for_distribution(dist,
                                    base_distribution_surrogate_fn,
                                    num_auxiliary_variables,
                                    num_layers,
+                                   use_global_auxiliary_variables,
                                    global_auxiliary_variables=None,
                                    variables=None,
                                    seed=None):
@@ -250,11 +257,13 @@ def _cascading_flow_surrogate_for_distribution(dist,
       variables=variables,
       num_auxiliary_variables=num_auxiliary_variables,
       num_layers=num_layers,
+      use_global_auxiliary_variables=use_global_auxiliary_variables,
       global_auxiliary_variables=global_auxiliary_variables,
       seed=seed)
   else:
     surrogate_posterior, variables = base_distribution_surrogate_fn(
       dist=dist, variables=variables,
+      use_global_auxiliary_variables=use_global_auxiliary_variables,
       global_auxiliary_variables=global_auxiliary_variables,
       num_layers=num_layers,
       seed=seed)
@@ -283,7 +292,7 @@ def _build_highway_flow_block(num_layers, width,
 
 def _cascading_flow_surrogate_for_joint_distribution(
     dist, base_distribution_surrogate_fn, variables,
-    num_auxiliary_variables, num_layers, global_auxiliary_variables,
+    num_auxiliary_variables, num_layers, use_global_auxiliary_variables, global_auxiliary_variables,
     seed=None):
   """Builds a structured joint surrogate posterior for a joint model."""
 
@@ -292,15 +301,16 @@ def _cascading_flow_surrogate_for_joint_distribution(
     variables) if variables else None  # pylint: disable=protected-access
   prior_coroutine = dist._model_coroutine  # pylint: disable=protected-access
   prior_batch_shape = dist.batch_shape_tensor()
+  #fixme
   if tf.nest.is_nested(prior_batch_shape):
-    prior_batch_shape = functools.reduce(tf.broadcast_static_shape,
+    prior_batch_shape = functools.reduce(ps.broadcast_shape,
                                          dist._model_flatten(prior_batch_shape))
 
   def posterior_generator(seed=seed):
     prior_gen = prior_coroutine()
     dist = next(prior_gen)
 
-    if num_auxiliary_variables > 0:
+    if use_global_auxiliary_variables == True:
       i = 1
 
       if flat_variables:
@@ -344,10 +354,11 @@ def posterior_generator(seed=seed):
           num_auxiliary_variables=num_auxiliary_variables,
           num_layers=num_layers,
           variables=flat_variables[i] if flat_variables else None,
+          use_global_auxiliary_variables=use_global_auxiliary_variables,
           global_auxiliary_variables=global_auxiliary_variables,
           seed=init_seed)
 
-        if was_root and num_auxiliary_variables == 0:
+        if was_root and use_global_auxiliary_variables == False:
           surrogate_posterior = Root(surrogate_posterior)
         # If variables were not given---i.e., we're creating new
         # variables---then yield the new variables along with the surrogate
@@ -379,6 +390,7 @@ def posterior_generator(seed=seed):
       base_distribution_surrogate_fn=base_distribution_surrogate_fn,
       num_auxiliary_variables=num_auxiliary_variables,
       num_layers=num_layers,
+      use_global_auxiliary_variables=use_global_auxiliary_variables,
       global_auxiliary_variables=global_auxiliary_variables,
       variables=dist._model_unflatten(
         # pylint: disable=protected-access
@@ -405,7 +417,7 @@ def posterior_generator(seed=seed):
         input_structure=tokenize(surrogate_posterior))(
         surrogate_posterior, name=_get_name(dist))
 
-  else:
+  elif use_global_auxiliary_variables:
     surrogate_posterior = restructure.Restructure(
       output_structure=(
         tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens),
@@ -413,6 +425,14 @@ def posterior_generator(seed=seed):
       input_structure=tokenize(surrogate_posterior))(
       surrogate_posterior, name=_get_name(dist))
 
+  else:
+    surrogate_posterior = restructure.Restructure(
+      output_structure=(
+        tf.nest.map_structure(lambda k: 2 * k, dist_tokens),
+        [2 * k + 1 for k in tf.nest.flatten(dist_tokens)]),
+      input_structure=tokenize(surrogate_posterior))(
+      surrogate_posterior, name=_get_name(dist))
+
   return surrogate_posterior, variables
 
 
@@ -420,17 +440,18 @@ def _cascading_flow_update_for_base_distribution(dist,
                                             initial_prior_weight,
                                             num_auxiliary_variables,
                                             num_layers,
+                                            use_global_auxiliary_variables,
                                             global_auxiliary_variables,
                                             variables,
                                             seed=None):
   """Creates a trainable surrogate for a (non-meta, non-joint) distribution."""
   event_shape = dist.event_shape_tensor()
   flat_event_shape = tf.nest.flatten(event_shape)
-  flat_event_size = tf.nest.map_structure(tf.reduce_prod, flat_event_shape)
-  ndims = int(tf.reduce_sum(flat_event_size))
+  flat_event_size = tf.nest.map_structure(ps.reduce_prod, flat_event_shape)
+  ndims = ps.reduce_sum(flat_event_size)
   constraining_bijector = dist.experimental_default_event_space_bijector()
   flatten_bijector = reshape.Reshape(
-    event_shape_out=[-1],
+    event_shape_out=flat_event_size,
     event_shape_in=dist.event_shape_tensor())
 
   constraining_and_flattening_bijector = chain.Chain([flatten_bijector, constraining_bijector])
@@ -450,7 +471,7 @@ def _cascading_flow_update_for_base_distribution(dist,
 
     variables = chain.Chain(bijectors=list(reversed(bijectors)))
 
-  if num_auxiliary_variables > 0:
+  if num_auxiliary_variables > 0 and use_global_auxiliary_variables == True:
     batch_shape = global_auxiliary_variables.shape[:-1] if len(
       global_auxiliary_variables.shape) > 1 else []
     cascading_flows = split.Split(
@@ -466,6 +487,18 @@ def _cascading_flow_update_for_base_distribution(dist,
     cascading_flows = joint_map.JointMap(
       [invert.Invert(constraining_and_flattening_bijector), identity.Identity()])(cascading_flows)
 
+  elif num_auxiliary_variables > 0 and use_global_auxiliary_variables == False:
+    cascading_flows = split.Split(
+      [-1, num_auxiliary_variables])(
+      transformed_distribution.TransformedDistribution(
+        distribution=blockwise.Blockwise([processed_dist,
+          batch_broadcast.BatchBroadcast(
+            sample.Sample(normal.Normal(0.,1.), num_auxiliary_variables), to_shape=processed_dist.batch_shape)]),
+        bijector=variables))
+
+    cascading_flows = joint_map.JointMap(
+      [invert.Invert(constraining_and_flattening_bijector),
+       identity.Identity()])(cascading_flows)
   else:
     cascading_flows = transformed_distribution.TransformedDistribution(
       distribution=processed_dist,

From d514f7283e5451623021682aaecca4db492c63a0 Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 9 Aug 2021 12:18:07 +0200
Subject: [PATCH 53/54] fixed constraining_bijector

---
 .../python/experimental/vi/cascading_flows.py         | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows.py b/tensorflow_probability/python/experimental/vi/cascading_flows.py
index 3dcc12fe59..e8796b35f7 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows.py
@@ -332,11 +332,9 @@ def posterior_generator(seed=seed):
 
       eps = Root(eps)
 
-      value_out = yield (eps if flat_variables
+      global_auxiliary_variables = yield (eps if flat_variables
                          else (eps, variables))
 
-      global_auxiliary_variables = value_out
-
     else:
       global_auxiliary_variables = None
       i = 0
@@ -417,7 +415,8 @@ def posterior_generator(seed=seed):
         input_structure=tokenize(surrogate_posterior))(
         surrogate_posterior, name=_get_name(dist))
 
-  elif use_global_auxiliary_variables:
+  # FIXME: this part is commented out because it blows up RAM usage
+  '''elif use_global_auxiliary_variables:
     surrogate_posterior = restructure.Restructure(
       output_structure=(
         tf.nest.map_structure(lambda k: 2 * k + 1, dist_tokens),
@@ -431,7 +430,7 @@ def posterior_generator(seed=seed):
         tf.nest.map_structure(lambda k: 2 * k, dist_tokens),
         [2 * k + 1 for k in tf.nest.flatten(dist_tokens)]),
       input_structure=tokenize(surrogate_posterior))(
-      surrogate_posterior, name=_get_name(dist))
+      surrogate_posterior, name=_get_name(dist))'''
 
   return surrogate_posterior, variables
 
@@ -454,7 +453,7 @@ def _cascading_flow_update_for_base_distribution(dist,
     event_shape_out=flat_event_size,
     event_shape_in=dist.event_shape_tensor())
 
-  constraining_and_flattening_bijector = chain.Chain([flatten_bijector, constraining_bijector])
+  constraining_and_flattening_bijector = chain.Chain([flatten_bijector, invert.Invert(constraining_bijector)])
   processed_dist = transformed_distribution.TransformedDistribution(distribution=dist,
                                                                     bijector=constraining_and_flattening_bijector)
   if variables is None:

From 5eabcf8a0fdf0125c2a1bed6071c91d441b9812b Mon Sep 17 00:00:00 2001
From: GianluigiSilvestri <gianlu.silvestri@gmail.com>
Date: Mon, 9 Aug 2021 14:39:28 +0200
Subject: [PATCH 54/54] working cf and cf with local aux vars

---
 .../experimental/vi/cascading_flows_test.py   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
index fb0890b12c..de10179486 100644
--- a/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
+++ b/tensorflow_probability/python/experimental/vi/cascading_flows_test.py
@@ -92,9 +92,9 @@ def test_dims_and_gradients(self):
 
     # Test that the sample shape is correct
     three_posterior_samples = surrogate_posterior.sample(
-      3, seed=test_util.test_seed(sampler_type='stateless'))
+      3, seed=(0,0))
     three_prior_samples = prior_dist.sample(
-      3, seed=test_util.test_seed(sampler_type='stateless'))
+      3, seed=(0,0))
     self.assertAllEqualNested(
       [s.shape for s in tf.nest.flatten(three_prior_samples)],
       [s.shape for s in tf.nest.flatten(three_posterior_samples)])
@@ -102,7 +102,7 @@ def test_dims_and_gradients(self):
     # Test that gradients are available wrt the variational parameters.
     with tf.GradientTape() as tape:
       posterior_sample = surrogate_posterior.sample(
-        seed=test_util.test_seed(sampler_type='stateless'))
+        seed=(0,0))
       posterior_logprob = surrogate_posterior.log_prob(posterior_sample)
     grad = tape.gradient(posterior_logprob,
                          surrogate_posterior.trainable_variables)
@@ -113,19 +113,19 @@ def test_initialization_is_deterministic_following_seed(self):
 
     surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist,
-      seed=test_util.test_seed(sampler_type='stateless'))
+      seed=(0,0))
     self.evaluate(
       [v.initializer for v in surrogate_posterior.trainable_variables])
     posterior_sample = surrogate_posterior.sample(
-      seed=test_util.test_seed(sampler_type='stateless'))
+      seed=(0,0))
 
     surrogate_posterior2 = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist,
-      seed=test_util.test_seed(sampler_type='stateless'))
+      seed=(0,0))
     self.evaluate(
       [v.initializer for v in surrogate_posterior2.trainable_variables])
     posterior_sample2 = surrogate_posterior2.sample(
-      seed=test_util.test_seed(sampler_type='stateless'))
+      seed=(0,0))
 
     self.assertAllEqualNested(posterior_sample, posterior_sample2)
 
@@ -133,9 +133,9 @@ def test_surrogate_and_prior_have_same_domain(self):
     prior_dist = self.make_prior_dist()
     surrogate_posterior = tfp.experimental.vi.build_cascading_flow_surrogate_posterior(
       prior=prior_dist,
-      seed=test_util.test_seed(sampler_type='stateless'))
+      seed=(0,0))
     self.assertAllFinite(prior_dist.log_prob(
-      surrogate_posterior.sample(10, seed=test_util.test_seed())))
+      surrogate_posterior.sample(10, seed=(0,0))))
 
 @test_util.test_all_tf_execution_regimes
 class CFSurrogatePosteriorTestBrownianMotion(test_util.TestCase,
@@ -192,7 +192,7 @@ def test_fitting_surrogate_posterior(self):
       target_log_prob,
       surrogate_posterior,
       num_steps=5,  # Don't optimize to completion.
-      optimizer=tf.optimizers.Adam(0.1),
+      optimizer=tf.optimizers.Adam(1e-3),
       sample_size=10)
 
     # Compute posterior statistics.