Merge pull request #42 from invenia/gm/refactor_scaling

Refactor scaling to only compute one mean and std
invenia · Mar 9, 2021 · f6bb5e4 · f6bb5e4 · glennmoy · Mar 9, 2021
2 parents 8dd66b2 + 3edd9d8
commit f6bb5e4
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 175 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FeatureTransforms"
 uuid = "8fd68953-04b8-4117-ac19-158bf6de9782"
 authors = ["Invenia Technical Computing Corporation"]
-version = "0.1.0"
+version = "0.2.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -80,12 +80,16 @@ julia> output_cols = [:temperature, :humidity];
 
 For many models it is helpful to normalize the training data.
 We can use `MeanStdScaling` for that purpose.
-Note that we are mutating the data frame in-place using `apply!`, and the order of columns specified does not matter.
+Note that we are mutating the data frame in-place using `apply!` one column at a time.
 
 ```jldoctest example
-julia> scaling = MeanStdScaling(train_df; cols=output_cols);
+julia> temp_scaling = MeanStdScaling(train_df; cols=:temperature);
 
-julia> FeatureTransforms.apply!(train_df, scaling; cols=output_cols)
+julia> hum_scaling = MeanStdScaling(train_df; cols=:humidity);
+
+julia> FeatureTransforms.apply!(train_df, temp_scaling; cols=:temperature);
+
+julia> FeatureTransforms.apply!(train_df, hum_scaling; cols=:humidity)
 22×4 DataFrame
  Row │ time                 temperature  humidity     hour_of_day_sin
      │ DateTime             Float64      Float64      Float64
@@ -112,7 +116,9 @@ julia> FeatureTransforms.apply!(train_df, scaling; cols=output_cols)
 We can use the same `temp_scaling` and `hum_scaling` transforms to normalize the test data:
 
 ```jldoctest example
-julia> FeatureTransforms.apply!(test_df, scaling; cols=output_cols)
+julia> FeatureTransforms.apply!(test_df, temp_scaling; cols=:temperature);
+
+julia> FeatureTransforms.apply!(test_df, hum_scaling; cols=:humidity)
 2×4 DataFrame
  Row │ time                 temperature  humidity  hour_of_day_sin
      │ DateTime             Float64      Float64   Float64
@@ -127,7 +133,9 @@ We can scale this back to the original units of temperature and humidity by conv
 ```jldoctest example
 julia> predictions = DataFrame([-0.36 0.61; -0.45 0.68], output_cols);
 
-julia> FeatureTransforms.apply!(predictions, scaling; cols=output_cols, inverse=true)
+julia> FeatureTransforms.apply!(predictions, temp_scaling; cols=:temperature, inverse=true);
+
+julia> FeatureTransforms.apply!(predictions, hum_scaling; cols=:humidity, inverse=true)
 2×2 DataFrame
  Row │ temperature  humidity 
      │ Float64      Float64  

diff --git a/docs/src/transforms.md b/docs/src/transforms.md
@@ -78,8 +78,8 @@ A single `Transform` instance can be applied to different data types, with suppo
 !!! note
 
     Some `Transform` subtypes have restrictions on how they can be applied once constructed.
-    For instance, `MeanStdScaling` stores the mean and standard deviation of some data for specified dimensions or column names.
-    So `MeanStdScaling` should only be applied to the same data type and for the same dimensions or subset of column names specified in construction.
+    For instance, `MeanStdScaling` stores a single mean and standard deviation computed from some data, optionally restricted via `dims` and `inds` (for arrays) or `cols` (for tables).
+    So `MeanStdScaling` should only be applied to data of the same kind, using the same `dims`, `inds`, or `cols` as those used in construction.
 
 ## Applying to `AbstractArray`
 
@@ -116,9 +116,10 @@ julia> FeatureTransforms.apply(M, p; inds=[4, 5, 6])
 ### Applying along dimensions using `dims`
 
 Transforms can be applied to `AbstractArray` data with a `dims` keyword argument.
-This will apply the `Transform` to slices of the array along dimensions determined by `dims`.
-For example, given a `Matrix`, `dims=1` applies to each column, and `dims=2` applies
-to each row.
+This will apply the `Transform` to slices of the array along this dimension, and the `inds` keyword selects which elements of each slice are used.
+So when `dims` and `inds` are used together, the `inds` change from being the global indices of the array to the relative indices of each slice.
+
+For example, given a `Matrix`, `dims=1` slices the data column-wise and `inds=[2, 3]` selects the 2nd and 3rd rows.
 
 !!! note
 
@@ -132,49 +133,37 @@ julia> M
  1.0  5.0
  3.0  6.0
 
-julia> normalize_cols = MeanStdScaling(M; dims=1);
-
-julia> normalize_cols(M; dims=1)
-3×2 Array{Float64,2}:
-  0.0  -1.0
- -1.0   0.0
-  1.0   1.0
-
-julia> normalize_rows = MeanStdScaling(M; dims=2);
+julia> normalize_row = MeanStdScaling(M; dims=1, inds=[2])
+MeanStdScaling(3.0, 2.8284271247461903)
 
-julia> normalize_rows(M; dims=2)
-3×2 Array{Float64,2}:
- -0.707107  0.707107
+julia> normalize_row(M; dims=1, inds=[2])
+1×2 Array{Float64,2}:
  -0.707107  0.707107
- -0.707107  0.707107
-```
 
-### Using `dims` and `inds` together
+julia> normalize_col = MeanStdScaling(M; dims=2, inds=[2])
+MeanStdScaling(5.0, 1.0)
 
-When using `dims` with `inds`, the `inds` change from being the global indices of the array to the relative indices of each slice.
-For example, the following is another way to square the second column of an array, applying to  index 2 of each row:
-
-```jldoctest transforms
-julia> FeatureTransforms.apply(M, p; dims=2, inds=[2])
+julia> normalize_col(M; dims=2, inds=[2])
 3×1 Array{Float64,2}:
- 16.0
- 25.0
- 36.0
+ -1.0
+  0.0
+  1.0
+
 ```
 
 ## Applying to `Table`
 
 ### Default
 
-Without specifying optional arguments, a `Transform` is applied to every column of a `Table` independently:
+Without specifying optional arguments, a `Transform` is applied to all the data in a `Table`:
 
 ```jldoctest transforms
 julia> nt = (a = [2.0, 1.0, 3.0], b = [4.0, 5.0, 6.0]);
 
-julia> scaling = MeanStdScaling(nt);
+julia> scaling = MeanStdScaling(nt);  # compute statistics using all data
 
 julia> FeatureTransforms.apply!(nt, scaling)
-(a = [0.0, -1.0, 1.0], b = [-1.0, 0.0, 1.0])
+(a = [-0.8017837257372732, -1.3363062095621219, -0.2672612419124244], b = [0.2672612419124244, 0.8017837257372732, 1.3363062095621219])
 ```
 
 !!! note
@@ -185,8 +174,8 @@ julia> FeatureTransforms.apply!(nt, scaling)
     ```julia-repl
     julia> FeatureTransforms.apply(nt, scaling)
     2-element Array{Array{Float64,1},1}:
-    [-2.0, -3.0, -1.0]
-    [-6.0, -5.0, -4.0]
+    [-2.2994001219583993, -2.585114407672685, -2.0136858362441137]
+    [-1.7279715505298279, -1.442257264815542, -1.1565429791012565]
     ```
 
 ### Applying to specific columns with `cols`
@@ -233,7 +222,7 @@ julia> scaling = MeanStdScaling(nt);
 julia> FeatureTransforms.apply!(nt, scaling);
 
 julia> nt
-(a = [0.0, -1.0, 1.0], b = [-1.0, 0.0, 1.0])
+(a = [-0.8017837257372732, -1.3363062095621219, -0.2672612419124244], b = [0.2672612419124244, 0.8017837257372732, 1.3363062095621219])
 
 julia> FeatureTransforms.apply!(nt, scaling; inverse=true);
 

diff --git a/src/scaling.jl b/src/scaling.jl
@@ -1,7 +1,7 @@
 """
     AbstractScaling <: Transform
 
-Linearly scale the data as `ax + b`, according to some statistics `a` and `b`.
+Linearly scale the data according to some statistics.
 """
 abstract type AbstractScaling <: Transform end
 
@@ -12,93 +12,65 @@ Represents the no-op scaling which simply returns the `data` it is applied on.
 """
 struct IdentityScaling <: AbstractScaling end
 
-_apply(x, scaling::IdentityScaling; kwargs...) = x
-_apply!(x, scaling::IdentityScaling; kwargs...) = _apply(x, scaling; kwargs...)
-
+@inline _apply(x, ::IdentityScaling; kwargs...) = x
 
 """
-    MeanStdScaling(mean, std) <: AbstractScaling
+    MeanStdScaling(μ, σ) <: AbstractScaling
 
-Linearly scale the data by a statistical `mean` and standard deviation `std`.
+Linearly scale the data by the statistical mean `μ` and standard deviation `σ`.
 This is also known as standardization, or the Z score transform.
-Once computed, the statistics of a `MeanStdScaling` are immutable.
-
-Can take a precomputed `mean` and `std` as arguments, or compute them from data.
-
-# Arguments
-* `mean::NamedTuple`: tuple of mean values, named by the scope of values it applies to.
-  `(all=μ, )` will apply to all data; `(1=μ1, 2=μ2)` for `AbstractArray` data will apply μ1
-  to the first slice and μ2 to the second slice; `(a=μ1, b=μ2)` for `Table` data will apply
-  μ1 to column `a` and μ2 to column `b`.
-* `std::NamedTuple`: similar to `mean` but for standard deviation values.
 
 # Keyword arguments to `apply`
-* `inverse=true`: inverts the scaling (e.g. to reconstruct the unscaled data)
-* `eps=1e-3`: replaces all 0 values in `std` before scaling (if `inverse=false`)
+* `inverse=true`: inverts the scaling (e.g. to reconstruct the unscaled data).
+* `eps=1e-3`: used in place of `σ` whenever `σ` is smaller than `eps`, to avoid dividing by 0 (if `inverse=false`).
 """
 struct MeanStdScaling <: AbstractScaling
-    mean::NamedTuple
-    std::NamedTuple
-end
-
-"""
-    MeanStdScaling(data; kwargs...) <: Scaling
-
-Construct a [`MeanStdScaling`](@ref) using the mean and standard deviation of the data.
-
-!!! note
-    `dims` and `cols` keyword arguments must be specified the same way when constructing
-    and applying the transform.
-    Otherwise, the results will be inconsistent, or an error may occur.
-
-# Keyword arguments
-* `dims=:`: for `AbstractArray` data, the dimension(s) to compute statistics along.
-* `cols=nothing`: for `Table` data, the column names to compute statistics for.
-"""
-function MeanStdScaling(data; kwargs...)
-    μ, σ = compute_stats(data; kwargs...)
-    return MeanStdScaling(μ, σ)
-end
+    μ::Real
+    σ::Real
+
+    """
+        MeanStdScaling(A::AbstractArray; dims=:, inds=:) -> MeanStdScaling
+        MeanStdScaling(table; cols=nothing) -> MeanStdScaling
+
+    Construct a [`MeanStdScaling`](@ref) transform from the statistics of the given data.
+    By default _all the data_ is considered when computing the mean and standard deviation.
+    This can be restricted to certain slices via the keyword arguments (see below).
+
+    # `AbstractArray` keyword arguments
+    * `dims=:`: the dimension along which to take the `inds` slices. Default uses all dims.
+    * `inds=:`: the indices to use in computing the statistics. Default uses all indices.
+
+    # `Table` keyword arguments
+    * `cols=nothing`: the columns to use in computing the statistics. Default uses all columns.
+
+    !!! note
+        If you want the `MeanStdScaling` to transform your data consistently you should use
+        the same `inds`, `dims`, or `cols` keywords when calling `apply`. Otherwise, `apply`
+        might rescale the wrong data or throw an error.
+    """
+    function MeanStdScaling(A::AbstractArray; dims=:, inds=:)
+        dims == Colon() && return new(compute_stats(A)...)
+        return new(compute_stats(selectdim(A, dims, inds))...)
+    end
 
-function compute_stats(A::AbstractArray; dims=:)
-    if dims == Colon()
-        μ = (all = mean(A), )
-        σ = (all = std(A), )
-    else
-        μ_pairs = [(Symbol(i), x) for (i, x) in enumerate(mean(A; dims=dims))]
-        σ_pairs = [(Symbol(i), x) for (i, x) in enumerate(std(A; dims=dims))]
+    function MeanStdScaling(table; cols=nothing)
+        Tables.istable(table) || throw(MethodError(MeanStdScaling, table))
+        columntable = Tables.columns(table)
 
-        μ = (; μ_pairs...)
-        σ = (; σ_pairs...)
+        cols = _to_vec(cols)  # handle single column name
+        cnames = cols === nothing ? propertynames(columntable) : cols
+        data = reduce(vcat, [getproperty(columntable, c) for c in cnames])
+        return new(compute_stats(data)...)
     end
-
-    return μ, σ
 end
 
-function compute_stats(table; cols=nothing)
-    columntable = Tables.columns(table)
-    cnames = cols === nothing ? propertynames(columntable) : cols
+compute_stats(x) = (mean(x), std(x))
 
-    μ_pairs = [(cname, mean(getproperty(columntable, cname))) for cname in cnames]
-    σ_pairs = [(cname, std(getproperty(columntable, cname))) for cname in cnames]
-
-    return (; μ_pairs...), (; σ_pairs...)
-end
-
-function _apply(
-    A::AbstractArray, scaling::MeanStdScaling;
-    name=nothing, inverse=false, eps=1e-3, kwargs...
-)
-    name = name === nothing ? :all : name
-    μ = scaling.mean[name]
-    σ = scaling.std[name]
-    if inverse
-        return μ .+ σ .* A
-    else
-        # Avoid division by 0
-        # If std is 0 then data was uniform, so the scaled value would end up ≈ 0
-        # Therefore the particular `eps` value should not matter much.
-        σ_safe = σ == 0 ? eps : σ
-        return (A .- μ) ./ σ_safe
-    end
+function _apply(A::AbstractArray, scaling::MeanStdScaling; inverse=false, eps=1e-3)
+    inverse && return scaling.μ .+ scaling.σ .* A
+    # Avoid division by 0
+    # If std is 0 then data was uniform, so the scaled value would end up ≈ 0
+    # Therefore the particular `eps` value should not matter much.
+    σ_safe = maximum([scaling.σ, eps])
+    return (A .- scaling.μ) ./ σ_safe
 end
diff --git a/src/transformers.jl b/src/transformers.jl
@@ -76,10 +76,8 @@ function apply(A::AbstractArray, t::Transform; dims=:, inds=:, kwargs...)
         end
     end
 
-    slice_index = 0
     return @views mapslices(A, dims=dims) do x
-        slice_index += 1
-        _apply(x[inds], t; name=Symbol(slice_index), kwargs...)
+        _apply(x[inds], t; kwargs...)
     end
 end
 
@@ -115,7 +113,7 @@ end
 
 # 3-arg forms are simply to dispatch on whether cols is a Symbol or a collection
 function _apply(table, t::Transform, col; kwargs...)
-    return _apply(getproperty(table, col), t; name=col, kwargs...)
+    return _apply(getproperty(table, col), t; kwargs...)
 end
 
 function _apply(table, t::Transform, cols::Union{Tuple, AbstractArray}; kwargs...)
@@ -140,7 +138,7 @@ function apply!(table::T, t::Transform; cols=nothing, kwargs...)::T where T
 
     cnames = cols === nothing ? propertynames(columntable) : cols
     for cname in cnames
-        apply!(getproperty(columntable, cname), t; name=cname, kwargs...)
+        apply!(getproperty(columntable, cname), t; kwargs...)
     end
 
     return table