From 59cf12c22061a96a1a95e5327dffa19abb78e1c6 Mon Sep 17 00:00:00 2001
From: Szymon
Date: Mon, 11 Nov 2024 09:37:10 +0100
Subject: [PATCH] Partial Least Square SVD (#307)

---
 lib/scholar/cross_decomposition/pls_svd.ex    | 347 ++++++++++++++++++
 .../cross_decomposition/pls_svd_test.exs      | 221 +++++++++++
 2 files changed, 568 insertions(+)
 create mode 100644 lib/scholar/cross_decomposition/pls_svd.ex
 create mode 100644 test/scholar/cross_decomposition/pls_svd_test.exs

diff --git a/lib/scholar/cross_decomposition/pls_svd.ex b/lib/scholar/cross_decomposition/pls_svd.ex
new file mode 100644
index 00000000..656e384f
--- /dev/null
+++ b/lib/scholar/cross_decomposition/pls_svd.ex
@@ -0,0 +1,347 @@
+defmodule Scholar.CrossDecomposition.PLSSVD do
+  @moduledoc """
+  Partial Least Square SVD.
+
+  This transformer simply performs a SVD on the cross-covariance matrix.
+  It is able to project both the training data `x` and the targets
+  `y`. The training data `x` is projected on the left singular vectors, while
+  the targets are projected on the right singular vectors.
+  """
+  import Nx.Defn
+
+  @derive {Nx.Container,
+           containers: [
+             :x_mean,
+             :y_mean,
+             :x_std,
+             :y_std,
+             :x_weights,
+             :y_weights
+           ]}
+  defstruct [
+    :x_mean,
+    :y_mean,
+    :x_std,
+    :y_std,
+    :x_weights,
+    :y_weights
+  ]
+
+  opts_schema = [
+    num_components: [
+      default: 2,
+      type: :pos_integer,
+      doc: "The number of components to keep. Should be in `[1,
+      min(n_samples, n_features, n_targets)]`."
+    ],
+    scale: [
+      default: true,
+      type: :boolean,
+      doc: "Whether to scale `x` and `y`."
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Fit model to data.
+
+  Takes as arguments:
+
+  * `x` - training samples, `{num_samples, num_features}` shaped tensor
+
+  * `y` - targets, `{num_samples, num_targets}` shaped `y` tensor
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  The function returns fitted estimator represented by struct with the following parameters:
+
+  * `:x_mean` - tensor of shape `{num_features}` which represents `x` tensor mean values calculated along axis 0.
+
+  * `:y_mean` - tensor of shape `{num_targets}` which represents `y` tensor mean values calculated along axis 0.
+
+  * `:x_std` - tensor of shape `{num_features}` which represents `x` tensor standard deviation values calculated along axis 0.
+
+  * `:y_std` - tensor of shape `{num_targets}` which represents `y` tensor standard deviation values calculated along axis 0.
+
+  * `:x_weights` - tensor of shape `{num_features, num_components}` the left singular vectors of the SVD of the cross-covariance matrix.
+
+  * `:y_weights` - tensor of shape `{num_components, num_targets}` the transposed right singular vectors of the SVD of the cross-covariance matrix.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[0.0, 0.0, 1.0],
+      ...>       [1.0, 0.0, 0.0],
+      ...>       [2.0, 2.0, 2.0],
+      ...>       [2.0, 5.0, 4.0]])
+      iex> y = Nx.tensor([[0.1, -0.2],
+      ...>       [0.9, 1.1],
+      ...>       [6.2, 5.9],
+      ...>       [11.9, 12.3]])
+      iex> model = Scholar.CrossDecomposition.PLSSVD.fit(x, y)
+      iex> model.x_mean
+      #Nx.Tensor<
+        f32[3]
+        [1.25, 1.75, 1.75]
+      >
+      iex> model.y_std
+      #Nx.Tensor<
+        f32[2]
+        [5.467098712921143, 5.661198616027832]
+      >
+      iex> model.x_weights
+      #Nx.Tensor<
+        f32[3][2]
+        [
+          [0.521888256072998, -0.11256571859121323],
+          [0.6170258522033691, 0.7342619299888611],
+          [0.5889922380447388, -0.6694686412811279]
+        ]
+      >
+  """
+
+  deftransform fit(x, y, opts \\ []) do
+    fit_n(x, y, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_n(x, y, opts) do
+    {x, y} = check_x_y(x, y, opts)
+    num_components = opts[:num_components]
+    {x, x_mean, x_std} = center_scale(x, opts)
+    {y, y_mean, y_std} = center_scale(y, opts)
+
+    c = Nx.dot(x, [0], y, [0])
+
+    {u, _s, vt} = Nx.LinAlg.svd(c, full_matrices?: false)
+    u = Nx.slice_along_axis(u, 0, num_components, axis: 1)
+    vt = Nx.slice_along_axis(vt, 0, num_components, axis: 0)
+    {u, vt} = Scholar.Decomposition.Utils.flip_svd(u, vt)
+
+    x_weights = u
+    y_weights = vt
+
+    %__MODULE__{
+      x_mean: x_mean,
+      y_mean: y_mean,
+      x_std: x_std,
+      y_std: y_std,
+      x_weights: x_weights,
+      y_weights: y_weights
+    }
+  end
+
+  @doc """
+  Apply the dimensionality reduction.
+  Takes as arguments:
+
+  * fitted estimator struct which is return value of `fit/3` function from this module
+
+  * `x` - training samples, `{num_samples, num_features}` shaped tensor
+
+  * `y` - targets, `{num_samples, num_targets}` shaped `y` tensor
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns tuple with transformed data `{x_transformed, y_transformed}` where:
+
+  * `x_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  * `y_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[0.0, 0.0, 1.0],
+      ...>       [1.0, 0.0, 0.0],
+      ...>       [2.0, 2.0, 2.0],
+      ...>       [2.0, 5.0, 4.0]])
+      iex> y = Nx.tensor([[0.1, -0.2],
+      ...>       [0.9, 1.1],
+      ...>       [6.2, 5.9],
+      ...>       [11.9, 12.3]])
+      iex> model = Scholar.CrossDecomposition.PLSSVD.fit(x, y)
+      iex> {x, y} = Scholar.CrossDecomposition.PLSSVD.transform(model, x, y)
+      iex> x
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.397004246711731, -0.10283949971199036],
+          [-1.1967883110046387, 0.17159013450145721],
+          [0.5603229403495789, -0.10849219560623169],
+          [2.0334696769714355, 0.039741579443216324]
+        ]
+      >
+      iex> y
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.2260178327560425, -0.019306711852550507],
+          [-0.9602956175804138, 0.04015407711267471],
+          [0.3249155580997467, -0.04311027377843857],
+          [1.8613981008529663, 0.022262824699282646]
+        ]
+      >
+
+  """
+  deftransform transform(model, x, y, opts \\ []) do
+    transform_n(model, x, y, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp transform_n(
+          %__MODULE__{
+            x_mean: x_mean,
+            y_mean: y_mean,
+            x_std: x_std,
+            y_std: y_std,
+            x_weights: x_weights,
+            y_weights: y_weights
+          } = _model,
+          x,
+          y,
+          opts
+        ) do
+    {x, y} = check_x_y(x, y, opts)
+
+    xr = (x - x_mean) / x_std
+    x_scores = Nx.dot(xr, x_weights)
+
+    yr = (y - y_mean) / y_std
+    y_scores = Nx.dot(yr, [1], y_weights, [1])
+    {x_scores, y_scores}
+  end
+
+  @doc """
+  Learn and apply the dimensionality reduction.
+
+  The arguments are:
+
+  * `x` - training samples, `{num_samples, num_features}` shaped tensor
+
+  * `y` - targets, `{num_samples, num_targets}` shaped `y` tensor
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  Returns tuple with transformed data `{x_transformed, y_transformed}` where:
+
+  * `x_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  * `y_transformed` is `{num_samples, num_components}` shaped tensor.
+
+  ## Examples
+
+      iex> x = Nx.tensor([[0.0, 0.0, 1.0],
+      ...>       [1.0, 0.0, 0.0],
+      ...>       [2.0, 2.0, 2.0],
+      ...>       [2.0, 5.0, 4.0]])
+      iex> y = Nx.tensor([[0.1, -0.2],
+      ...>       [0.9, 1.1],
+      ...>       [6.2, 5.9],
+      ...>       [11.9, 12.3]])
+      iex> {x, y} = Scholar.CrossDecomposition.PLSSVD.fit_transform(x, y)
+      iex> x
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.397004246711731, -0.10283949971199036],
+          [-1.1967883110046387, 0.17159013450145721],
+          [0.5603229403495789, -0.10849219560623169],
+          [2.0334696769714355, 0.039741579443216324]
+        ]
+      >
+      iex> y
+      #Nx.Tensor<
+        f32[4][2]
+        [
+          [-1.2260178327560425, -0.019306711852550507],
+          [-0.9602956175804138, 0.04015407711267471],
+          [0.3249155580997467, -0.04311027377843857],
+          [1.8613981008529663, 0.022262824699282646]
+        ]
+      >
+
+  """
+
+  deftransform fit_transform(x, y, opts \\ []) do
+    fit_transform_n(x, y, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  defnp fit_transform_n(x, y, opts) do
+    fit(x, y, opts)
+    |> transform(x, y, opts)
+  end
+
+  defnp check_x_y(x, y, opts) do
+    y =
+      case Nx.shape(y) do
+        {n} -> Nx.reshape(y, {n, 1})
+        _ -> y
+      end
+
+    num_components = opts[:num_components]
+    {num_samples, num_features} = Nx.shape(x)
+    {num_samples_y, num_targets} = Nx.shape(y)
+
+    cond do
+      num_samples != num_samples_y ->
+        raise ArgumentError,
+              """
+              num_samples must be the same for x and y \
+              x num_samples = #{num_samples}, y num_samples = #{num_samples_y}
+              """
+
+      num_components > num_features ->
+        raise ArgumentError,
+              """
+              num_components must be less than or equal to \
+              num_features = #{num_features}, got #{num_components}
+              """
+
+      num_components > num_samples ->
+        raise ArgumentError,
+              """
+              num_components must be less than or equal to \
+              num_samples = #{num_samples}, got #{num_components}
+              """
+
+      num_components > num_targets ->
+        raise ArgumentError,
+              """
+              num_components must be less than or equal to \
+              num_targets = #{num_targets}, got #{num_components}
+              """
+
+      true ->
+        nil
+    end
+
+    {x, y}
+  end
+
+  defnp center_scale(x, opts) do
+    scale = opts[:scale]
+    x_mean = Nx.mean(x, axes: [0])
+    x = x - x_mean
+
+    if scale do
+      x_std = Nx.standard_deviation(x, axes: [0], ddof: 1)
+      x_std = Nx.select(x_std == 0.0, 1.0, x_std)
+      x = x / Nx.broadcast(x_std, Nx.shape(x))
+
+      {x, x_mean, x_std}
+    else
+      x_std = Nx.broadcast(1, {Nx.axis_size(x, 1)})
+
+      {x, x_mean, x_std}
+    end
+  end
+end
diff --git a/test/scholar/cross_decomposition/pls_svd_test.exs b/test/scholar/cross_decomposition/pls_svd_test.exs
new file mode 100644
index 00000000..5662bfc4
--- /dev/null
+++ b/test/scholar/cross_decomposition/pls_svd_test.exs
@@ -0,0 +1,221 @@
+defmodule Scholar.CrossDecomposition.PLSSVDTest do
+  use Scholar.Case, async: true
+  alias Scholar.CrossDecomposition.PLSSVD
+  doctest PLSSVD
+
+  defp x do
+    Nx.tensor([
+      [0.0, 0.0, 1.0, 16.0],
+      [1.0, 0.0, 0.0, 25.2],
+      [2.0, 2.0, 2.0, -2.3],
+      [2.0, 5.0, 4.0, 4.5],
+      [5.0, -2.0, 3.3, 4.5]
+    ])
+  end
+
+  defp y do
+    Nx.tensor([
+      [0.1, -0.2, 3.0],
+      [0.9, 1.1, 5.1],
+      [6.2, 5.9, 2.5],
+      [11.9, 12.3, -6.0],
+      [7.6, 1.8, 4.9]
+    ])
+  end
+
+  defp y_1d do
+    Nx.tensor([0.1, -0.2, 3.0, 6.9, 3])
+  end
+
+  test "fit test" do
+    model = PLSSVD.fit(x(), y())
+
+    assert_all_close(
+      model.x_mean,
+      Nx.tensor([2.0, 1.0, 2.059999942779541, 9.579999923706055]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.y_mean,
+      Nx.tensor([5.339999675750732, 4.179999828338623, 1.899999976158142]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.x_std,
+      Nx.tensor([1.8708287477493286, 2.6457512378692627, 1.6334013938903809, 10.931011199951172]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.y_std,
+      Nx.tensor([4.90030574798584, 5.08005952835083, 4.561249732971191]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.x_weights,
+      Nx.tensor([
+        [0.17879533767700195, 0.7447080016136169],
+        [0.6228733062744141, -0.5843358635902405],
+        [0.6137028336524963, 0.1790202558040619],
+        [-0.4510321617126465, -0.26816627383232117]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      model.y_weights,
+      Nx.tensor([
+        [0.6292941570281982, 0.5848351716995239, -0.5118170976638794],
+        [0.7398861646652222, -0.2493150532245636, 0.6248283386230469]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "transform test" do
+    model = PLSSVD.fit(x(), y())
+    {x_transformed, y_transformed} = PLSSVD.transform(model, x(), y())
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.0897283554077148, -0.8489431142807007],
+        [-1.7494868040084839, -0.7861797213554382],
+        [0.703069806098938, 0.06401326507329941],
+        [1.8802037239074707, -0.5461838245391846],
+        [0.25594159960746765, 2.117293357849121]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-1.3005900382995605, -0.42553290724754333],
+        [-1.2838343381881714, -0.08087197691202164],
+        [0.24112752079963684, 0.12762844562530518],
+        [2.6636931896209717, -0.49021831154823303],
+        [-0.3203960657119751, 0.8689947128295898]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - all options are default" do
+    {x_transformed, y_transformed} = PLSSVD.fit_transform(x(), y())
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.0897283554077148, -0.8489431142807007],
+        [-1.7494868040084839, -0.7861797213554382],
+        [0.703069806098938, 0.06401326507329941],
+        [1.8802037239074707, -0.5461838245391846],
+        [0.25594159960746765, 2.117293357849121]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-1.3005900382995605, -0.42553290724754333],
+        [-1.2838343381881714, -0.08087197691202164],
+        [0.24112752079963684, 0.12762844562530518],
+        [2.6636931896209717, -0.49021831154823303],
+        [-0.3203960657119751, 0.8689947128295898]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - :num_components set to 1" do
+    {x_transformed, y_transformed} =
+      PLSSVD.fit_transform(x(), y(), num_components: 1)
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.0897283554077148],
+        [-1.7494868040084839],
+        [0.703069806098938],
+        [1.8802037239074707],
+        [0.25594159960746765]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-1.3005900382995605],
+        [-1.2838343381881714],
+        [0.24112752079963684],
+        [2.6636931896209717],
+        [-0.3203960657119751]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - y has only one dimension" do
+    {x_transformed, y_transformed} =
+      PLSSVD.fit_transform(x(), y_1d(), num_components: 1)
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [-1.2138643264770508],
+        [-1.868216872215271],
+        [0.703800618648529],
+        [1.7553009986877441],
+        [0.6229796409606934]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [-0.8578669428825378],
+        [-0.9624848365783691],
+        [0.15343964099884033],
+        [1.5134726762771606],
+        [0.15343964099884033]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+
+  test "fit_transform test - :scale is set to false" do
+    {x_transformed, y_transformed} =
+      PLSSVD.fit_transform(x(), y(), scale: false)
+
+    assert_all_close(
+      x_transformed,
+      Nx.tensor([
+        [6.641565322875977, 1.5491820573806763],
+        [15.36169719696045, 3.2503585815429688],
+        [-11.394588470458984, -2.017521619796753],
+        [-6.2775702476501465, 2.303945779800415],
+        [-4.3311028480529785, -5.085964679718018]
+      ]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      y_transformed,
+      Nx.tensor([
+        [6.744043827056885, 1.1535897254943848],
+        [6.1893134117126465, -0.3978065252304077],
+        [-1.4090275764465332, -0.40731552243232727],
+        [-12.453459739685059, 3.961534023284912],
+        [0.9291285872459412, -4.310001850128174]
+      ]),
+      atol: 1.0e-3
+    )
+  end
+end