From 59cf12c22061a96a1a95e5327dffa19abb78e1c6 Mon Sep 17 00:00:00 2001
From: Szymon
Date: Mon, 11 Nov 2024 09:37:10 +0100
Subject: [PATCH] Partial Least Square SVD (#307)

---
 lib/scholar/cross_decomposition/pls_svd.ex    | 347 ++++++++++++++++++
 .../cross_decomposition/pls_svd_test.exs      | 221 +++++++++++
 2 files changed, 568 insertions(+)
 create mode 100644 lib/scholar/cross_decomposition/pls_svd.ex
 create mode 100644 test/scholar/cross_decomposition/pls_svd_test.exs

diff --git a/lib/scholar/cross_decomposition/pls_svd.ex b/lib/scholar/cross_decomposition/pls_svd.ex
new file mode 100644
index 00000000..656e384f
--- /dev/null
+++ b/lib/scholar/cross_decomposition/pls_svd.ex
@@ -0,0 +1,347 @@
+defmodule Scholar.CrossDecomposition.PLSSVD do
+  @moduledoc """
+  Partial Least Square SVD.
+
+  This transformer simply performs a SVD on the cross-covariance matrix.
+  It is able to project both the training data `x` and the targets
+  `y`. The training data `x` is projected on the left singular vectors, while
+  the targets are projected on the right singular vectors.
+  """
+  import Nx.Defn
+
+  @derive {Nx.Container,
+           containers: [
+             :x_mean,
+             :y_mean,
+             :x_std,
+             :y_std,
+             :x_weights,
+             :y_weights
+           ]}
+  defstruct [
+    :x_mean,
+    :y_mean,
+    :x_std,
+    :y_std,
+    :x_weights,
+    :y_weights
+  ]
+
+  opts_schema = [
+    num_components: [
+      default: 2,
+      type: :pos_integer,
+      doc: "The number of components to keep. Should be in `[1,
+      min(n_samples, n_features, n_targets)]`."
+    ],
+    scale: [
+      default: true,
+      type: :boolean,
+      doc: "Whether to scale `x` and `y`."
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Fit model to data.
+
+  Takes as arguments:
+
+  * `x` - training samples, `{num_samples, num_features}` shaped tensor
+
+  * `y` - targets, `{num_samples, num_targets}` shaped `y` tensor
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  The function returns fitted estimator represented by struct with the following parameters:
+
+  * `:x_mean` - tensor of shape `{num_features}` which represents `x` tensor mean values calculated along axis 0.
+
+  * `:y_mean` - tensor of shape `{num_targets}` which represents `y` tensor mean values calculated along axis 0.
+
+  * `:x_std` - tensor of shape `{num_features}` which represents `x` tensor standard deviation values calculated along axis 0.
+
+  * `:y_std` - tensor of shape `{num_targets}` which represents `y` tensor standard deviation values calculated along axis 0.
+
+  * `:x_weights` - tensor of shape `{num_features, num_components}` the left singular vectors of the SVD of the cross-covariance matrix.
+
+  * `:y_weights` - tensor of shape `{num_components, num_targets}` the transposed right singular vectors of the SVD of the cross-covariance matrix.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[0.0, 0.0, 1.0],
+      ...>       [1.0, 0.0, 0.0],
+      ...>       [2.0, 2.0, 2.0],
+      ...>       [2.0, 5.0, 4.0]])
+      iex> y = Nx.tensor([[0.1, -0.2],
+      ...>       [0.9, 1.1],
+      ...>       [6.2, 5.9],
+      ...>       [11.9, 12.3]])
+      iex> model = Scholar.CrossDecomposition.PLSSVD.fit(x, y)
+      iex> model.x_mean
+      #Nx.Tensor<
+        f32[3]
+        [1.25, 1.75, 1.75]
+      >
+      iex> model.y_std
+      #Nx.Tensor<
+        f32[2]
+        [5.467098712921143, 5.661198616027832]
+      >
+      iex> model.x_weights
+      #Nx.Tensor<
+        f32[3][2]
+        [
+          [0.521888256072998, -0.11256571859121323],
+          [0.6170258522033691, 0.7342619299888611],
+          [0.5889922380447388, -0.6694686412811279]
+        ]
+      >
+  """
+
+  deftransform fit(x, y, opts \\ []) do
+    fit_n(x, y, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_n(x, y, opts) do
+    {x, y} = check_x_y(x, y, opts)
+    num_components = opts[:num_components]
+    {x, x_mean, x_std} = center_scale(x, opts)
+    {y, y_mean, y_std} = center_scale(y, opts)
+
+    c = Nx.dot(x, [0], y, [0])
+
+    {u, _s, vt} = Nx.LinAlg.svd(c, full_matrices?: false)
+    u = Nx.slice_along_axis(u, 0, num_components, axis: 1)
+    vt = Nx.slice_along_axis(vt, 0, num_components, axis: 0)
+    {u, vt} = Scholar.Decomposition.Utils.flip_svd(u, vt)
+
+    x_weights = u
+    y_weights = vt
+
+    %__MODULE__{
+      x_mean: x_mean,
+      y_mean: y_mean,
+      x_std: x_std,
+      y_std: y_std,
+      x_weights: x_weights,
+      y_weights: y_weights
+    }
+  end
+
+  @doc """
+  Apply the dimensionality reduction.
+  Takes as arguments:
+
+  * fitted estimator struct which is return value of `fit/3` function from this module
+
+  * `x` - training samples, `{num_samples, num_features}` shaped tensor
+
+  * `y` - targets, `{num_samples, num_targets}` shaped `y` tensor
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns tuple with transformed data `{x_transformed, y_transformed}` where:
+
+  * `x_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  * `y_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[0.0, 0.0, 1.0],
+      ...>       [1.0, 0.0, 0.0],
+      ...>       [2.0, 2.0, 2.0],
+      ...>       [2.0, 5.0, 4.0]])
+      iex> y = Nx.tensor([[0.1, -0.2],
+      ...>       [0.9, 1.1],
+      ...>       [6.2, 5.9],
+      ...>       [11.9, 12.3]])
+      iex> model = Scholar.CrossDecomposition.PLSSVD.fit(x, y)
+      iex> {x, y} = Scholar.CrossDecomposition.PLSSVD.transform(model, x, y)
+      iex> x
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.397004246711731, -0.10283949971199036],
+          [-1.1967883110046387, 0.17159013450145721],
+          [0.5603229403495789, -0.10849219560623169],
+          [2.0334696769714355, 0.039741579443216324]
+        ]
+      >
+      iex> y
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.2260178327560425, -0.019306711852550507],
+          [-0.9602956175804138, 0.04015407711267471],
+          [0.3249155580997467, -0.04311027377843857],
+          [1.8613981008529663, 0.022262824699282646]
+        ]
+      >
+
+  """
+  deftransform transform(model, x, y, opts \\ []) do
+    transform_n(model, x, y, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp transform_n(
+          %__MODULE__{
+            x_mean: x_mean,
+            y_mean: y_mean,
+            x_std: x_std,
+            y_std: y_std,
+            x_weights: x_weights,
+            y_weights: y_weights
+          } = _model,
+          x,
+          y,
+          opts
+        ) do
+    {x, y} = check_x_y(x, y, opts)
+
+    xr = (x - x_mean) / x_std
+    x_scores = Nx.dot(xr, x_weights)
+
+    yr = (y - y_mean) / y_std
+    y_scores = Nx.dot(yr, [1], y_weights, [1])
+    {x_scores, y_scores}
+  end
+
+  @doc """
+  Learn and apply the dimensionality reduction.
+
+  The arguments are:
+
+  * `x` - training samples, `{num_samples, num_features}` shaped tensor
+
+  * `y` - targets, `{num_samples, num_targets}` shaped `y` tensor
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns tuple with transformed data `{x_transformed, y_transformed}` where:
+
+  * `x_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  * `y_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[0.0, 0.0, 1.0],
+      ...>       [1.0, 0.0, 0.0],
+      ...>       [2.0, 2.0, 2.0],
+      ...>       [2.0, 5.0, 4.0]])
+      iex> y = Nx.tensor([[0.1, -0.2],
+      ...>       [0.9, 1.1],
+      ...>       [6.2, 5.9],
+      ...>       [11.9, 12.3]])
+      iex> {x, y} = Scholar.CrossDecomposition.PLSSVD.fit_transform(x, y)
+      iex> x
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.397004246711731, -0.10283949971199036],
+          [-1.1967883110046387, 0.17159013450145721],
+          [0.5603229403495789, -0.10849219560623169],
+          [2.0334696769714355, 0.039741579443216324]
+        ]
+      >
+      iex> y
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.2260178327560425, -0.019306711852550507],
+          [-0.9602956175804138, 0.04015407711267471],
+          [0.3249155580997467, -0.04311027377843857],
+          [1.8613981008529663, 0.022262824699282646]
+        ]
+      >
+
+  """
+
+  deftransform fit_transform(x, y, opts \\ []) do
+    fit_transform_n(x, y, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_transform_n(x, y, opts) do
+    fit(x, y, opts)
+    |> transform(x, y, opts)
+  end
+
+  defnp check_x_y(x, y, opts) do
+    y =
+      case Nx.shape(y) do
+        {n} -> Nx.reshape(y, {n, 1})
+        _ -> y
+      end
+
+    num_components = opts[:num_components]
+    {num_samples, num_features} = Nx.shape(x)
+    {num_samples_y, num_targets} = Nx.shape(y)
+
+    cond do
+      num_samples != num_samples_y ->
+        raise ArgumentError,
+              """
+              num_samples must be the same for x and y \
+              x num_samples = #{num_samples}, y num_samples = #{num_samples_y}
+              """
+
+      num_components > num_features ->
+        raise ArgumentError,
+              """
+              num_components must be less than or equal to \
+              num_features = #{num_features}, got #{num_components}
+              """
+
+      num_components > num_samples ->
+        raise ArgumentError,
+              """
+              num_components must be less than or equal to \
+              num_samples = #{num_samples}, got #{num_components}
+              """
+
+      num_components > num_targets ->
+        raise ArgumentError,
+              """
+              num_components must be less than or equal to \
+              num_targets = #{num_targets}, got #{num_components}
+              """
+
+      true ->
+        nil
+    end
+
+    {x, y}
+  end
+
+  defnp center_scale(x, opts) do
+    scale = opts[:scale]
+    x_mean = Nx.mean(x, axes: [0])
+    x = x - x_mean
+
+    if scale do
+      x_std = Nx.standard_deviation(x, axes: [0], ddof: 1)
+      x_std = Nx.select(x_std == 0.0, 1.0, x_std)
+      x = x / Nx.broadcast(x_std, Nx.shape(x))
+
+      {x, x_mean, x_std}
+    else
+      x_std = Nx.broadcast(1, {Nx.axis_size(x, 1)})
+
+      {x, x_mean, x_std}
+    end
+  end
+end
diff --git a/test/scholar/cross_decomposition/pls_svd_test.exs b/test/scholar/cross_decomposition/pls_svd_test.exs
new file mode 100644
index 00000000..5662bfc4
--- /dev/null
+++ b/test/scholar/cross_decomposition/pls_svd_test.exs
@@ -0,0 +1,221 @@
+defmodule Scholar.CrossDecomposition.PLSSVDTest do
+  use Scholar.Case, async: true
+  alias Scholar.CrossDecomposition.PLSSVD
+  doctest PLSSVD
+
+  defp x do
+    Nx.tensor([
+      [0.0, 0.0, 1.0, 16.0],
+      [1.0, 0.0, 0.0, 25.2],
+      [2.0, 2.0, 2.0, -2.3],
+      [2.0, 5.0, 4.0, 4.5],
+      [5.0, -2.0, 3.3, 4.5]
+    ])
+  end
+
+  defp y do
+    Nx.tensor([
+      [0.1, -0.2, 3.0],
+      [0.9, 1.1, 5.1],
+      [6.2, 5.9, 2.5],
+      [11.9, 12.3, -6.0],
+      [7.6, 1.8, 4.9]
+    ])
+  end
+
+  defp y_1d do
+    Nx.tensor([0.1, -0.2, 3.0, 6.9, 3])
+  end
+
+  test "fit test" do
+    model = PLSSVD.fit(x(), y())
+
+    assert_all_close(
+      model.x_mean,
+      Nx.tensor([2.0, 1.0, 2.059999942779541, 9.579999923706055]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.y_mean,
+      Nx.tensor([5.339999675750732, 4.179999828338623, 1.899999976158142]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.x_std,
+      Nx.tensor([1.8708287477493286, 2.6457512378692627, 1.6334013938903809, 10.931011199951172]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.y_std,
+      Nx.tensor([4.90030574798584, 5.08005952835083, 4.561249732971191]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.x_weights,
+      Nx.tensor([
+        [0.17879533767700195, 0.7447080016136169],
+        [0.6228733062744141, -0.5843358635902405],
+        [0.6137028336524963, 0.1790202558040619],
+        [-0.4510321617126465, -0.26816627383232117]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.y_weights,
+      Nx.tensor([
+        [0.6292941570281982, 0.5848351716995239, -0.5118170976638794],
+        [0.7398861646652222, -0.2493150532245636, 0.6248283386230469]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "transform test" do
+    model = PLSSVD.fit(x(), y())
+    {x_transformed, y_transformed} = PLSSVD.transform(model, x(), y())
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.0897283554077148, -0.8489431142807007],
+        [-1.7494868040084839, -0.7861797213554382],
+        [0.703069806098938, 0.06401326507329941],
+        [1.8802037239074707, -0.5461838245391846],
+        [0.25594159960746765, 2.117293357849121]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-1.3005900382995605, -0.42553290724754333],
+        [-1.2838343381881714, -0.08087197691202164],
+        [0.24112752079963684, 0.12762844562530518],
+        [2.6636931896209717, -0.49021831154823303],
+        [-0.3203960657119751, 0.8689947128295898]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - all options are default" do
+    {x_transformed, y_transformed} = PLSSVD.fit_transform(x(), y())
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.0897283554077148, -0.8489431142807007],
+        [-1.7494868040084839, -0.7861797213554382],
+        [0.703069806098938, 0.06401326507329941],
+        [1.8802037239074707, -0.5461838245391846],
+        [0.25594159960746765, 2.117293357849121]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-1.3005900382995605, -0.42553290724754333],
+        [-1.2838343381881714, -0.08087197691202164],
+        [0.24112752079963684, 0.12762844562530518],
+        [2.6636931896209717, -0.49021831154823303],
+        [-0.3203960657119751, 0.8689947128295898]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - :num_components set to 1" do
+    {x_transformed, y_transformed} =
+      PLSSVD.fit_transform(x(), y(), num_components: 1)
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.0897283554077148],
+        [-1.7494868040084839],
+        [0.703069806098938],
+        [1.8802037239074707],
+        [0.25594159960746765]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-1.3005900382995605],
+        [-1.2838343381881714],
+        [0.24112752079963684],
+        [2.6636931896209717],
+        [-0.3203960657119751]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - y has only one dimension" do
+    {x_transformed, y_transformed} =
+      PLSSVD.fit_transform(x(), y_1d(), num_components: 1)
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.2138643264770508],
+        [-1.868216872215271],
+        [0.703800618648529],
+        [1.7553009986877441],
+        [0.6229796409606934]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-0.8578669428825378],
+        [-0.9624848365783691],
+        [0.15343964099884033],
+        [1.5134726762771606],
+        [0.15343964099884033]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - :scale is set to false" do
+    {x_transformed, y_transformed} =
+      PLSSVD.fit_transform(x(), y(), scale: false)
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [6.641565322875977, 1.5491820573806763],
+        [15.36169719696045, 3.2503585815429688],
+        [-11.394588470458984, -2.017521619796753],
+        [-6.2775702476501465, 2.303945779800415],
+        [-4.3311028480529785, -5.085964679718018]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [6.744043827056885, 1.1535897254943848],
+        [6.1893134117126465, -0.3978065252304077],
+        [-1.4090275764465332, -0.40731552243232727],
+        [-12.453459739685059, 3.961534023284912],
+        [0.9291285872459412, -4.310001850128174]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+end