Skip to content

Commit

Permalink
Clarify standard deviation / variance documentation with Bessel's cor…
Browse files Browse the repository at this point in the history
…rection (#4786)

* First attempt at fixing std/var docs

* Fix Java file formatting

* Chip review suggestions

* Spotless apply

* Spotless apply (again)
  • Loading branch information
alexpeters1208 authored Nov 14, 2023
1 parent df45e1c commit 318c9d2
Show file tree
Hide file tree
Showing 21 changed files with 340 additions and 142 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,11 @@ class Aggregate {
}

/**
* Returns an aggregator that computes the standard deviation of values, within an aggregation
* group, for each input column.
* Returns an aggregator that computes the sample standard deviation of values, within an
* aggregation group, for each input column.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*/
[[nodiscard]]
static Aggregate Std(std::vector<std::string> column_specs);
Expand Down Expand Up @@ -608,8 +611,11 @@ class Aggregate {
}

/**
* Returns an aggregator that computes the variance of values, within an aggregation group,
* Returns an aggregator that computes the sample variance of values, within an aggregation group,
* for each input column.
*
* Sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*/
[[nodiscard]]
static Aggregate Var(std::vector<std::string> column_specs);
Expand Down Expand Up @@ -801,8 +807,11 @@ Aggregate AggPct(double percentile, Args &&... args) {
}

/**
* Returns an aggregator that computes the standard deviation of values, within an aggregation
* group, for each input column.
* Returns an aggregator that computes the sample standard deviation of values, within an aggregation group,
* for each input column.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*/
template<typename ...Args>
[[nodiscard]]
Expand All @@ -821,8 +830,11 @@ Aggregate aggSum(Args &&... args) {
}

/**
* Returns an aggregator that computes the variance of values, within an aggregation group,
* Returns an aggregator that computes the sample variance of values, within an aggregation group,
* for each input column.
*
* Sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*/
template<typename ...Args>
[[nodiscard]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -528,11 +528,13 @@ UpdateByOperation rollingCountTime(std::string timestamp_col, std::vector<std::s
deephaven::client::utility::DurationSpecifier rev_time,
deephaven::client::utility::DurationSpecifier fwd_time = 0);
/**
* Creates a rolling standard deviation UpdateByOperation for the supplied column names, using ticks as the
* windowing unit. Ticks are row counts, and you may specify the reverse and forward window in
* number of rows to include. The current row is considered to belong to the reverse window but
* not the forward window. Also, negative values are allowed and can be used to generate completely
* forward or completely reverse windows.
* Creates a rolling sample standard deviation UpdateByOperation for the supplied column names, using ticks as the
* windowing unit. Ticks are row counts, and you may specify the reverse and forward window in number of rows to include.
* The current row is considered to belong to the reverse window but not the forward window. Also, negative values are
* allowed and can be used to generate completely forward or completely reverse windows.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* See the documentation of rollingSumTick() for examples of window values.
*
Expand All @@ -543,11 +545,14 @@ UpdateByOperation rollingCountTime(std::string timestamp_col, std::vector<std::s
*/
UpdateByOperation rollingStdTick(std::vector<std::string> cols, int rev_ticks, int fwd_ticks = 0);
/**
* Creates a rolling standard deviation UpdateByOperation for the supplied column names, using time as the
* windowing unit. This function accepts nanoseconds or time strings as the reverse and forward
* window parameters. Negative values are allowed and can be used to generate completely forward or
* completely reverse windows. A row containing a null in the timestamp column belongs to no window
* and will not be considered in the windows of other rows; its output will be null.
* Creates a rolling sample standard deviation UpdateByOperation for the supplied column names, using time as the
* windowing unit. This function accepts nanoseconds or time strings as the reverse and forward window parameters.
* Negative values are allowed and can be used to generate completely forward or completely reverse windows.
* A row containing a null in the timestamp column belongs to no window and will not be considered in the windows
* of other rows; its output will be null.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* See the documentation of rollingSumTime() for examples of window values.
*
Expand Down
104 changes: 73 additions & 31 deletions engine/function/src/templates/Numeric.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -427,20 +427,26 @@ public class Numeric {
}

/**
* Returns the variance. Null values are excluded.
* Returns the sample variance. Null values are excluded.
*
* Sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* @param values values.
* @return variance of non-null values.
* @return sample variance of non-null values.
*/
public static double var(${pt.boxed}[] values) {
// Unbox the boxed array, then delegate to the var overload that accepts the unboxed result.
return var(unbox(values));
}

/**
* Returns the variance. Null values are excluded.
* Returns the sample variance. Null values are excluded.
*
* Sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* @param values values.
* @return variance of non-null values.
* @return sample variance of non-null values.
*/
public static double var(${pt.primitive}... values) {
if (values == null) {
Expand All @@ -451,10 +457,13 @@ public class Numeric {
}

/**
* Returns the variance. Null values are excluded.
* Returns the sample variance. Null values are excluded.
*
* Sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* @param values values.
* @return variance of non-null values.
* @return sample variance of non-null values.
*/
public static double var(${pt.vector} values) {
if (values == null) {
Expand All @@ -476,7 +485,7 @@ public class Numeric {
}
}

// Return NaN if poisoned or too few values to compute variance.
// Return NaN if poisoned or too few values to compute sample variance.
if (count <= 1 || Double.isNaN(sum) || Double.isNaN(sum2)) {
return Double.NaN;
}
Expand All @@ -487,19 +496,22 @@ public class Numeric {
final double delta = sum2 - vs2bar;
final double rel_eps = delta / eps;

// Return zero when the variance is leq the floating point error.
// Return zero when the sample variance is less than or equal to the floating point error.
return Math.abs(rel_eps) > 1.0 ? delta / (count - 1) : 0.0;
}

<#list primitiveTypes as pt2>
<#if pt2.valueType.isNumber >

/**
* Returns the weighted variance. Null values are excluded.
* Returns the weighted sample variance. Null values are excluded.
*
* Weighted sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted variance of non-null values.
* @return weighted sample variance of non-null values.
*/
public static double wvar(${pt.primitive}[] values, ${pt2.primitive}[] weights) {
if (values == null || weights == null) {
Expand All @@ -510,11 +522,14 @@ public class Numeric {
}

/**
* Returns the weighted variance. Null values are excluded.
* Returns the weighted sample variance. Null values are excluded.
*
* Weighted sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted variance of non-null values.
* @return weighted sample variance of non-null values.
*/
public static double wvar(${pt.primitive}[] values, ${pt2.vector} weights) {
if (values == null || weights == null) {
Expand All @@ -525,11 +540,14 @@ public class Numeric {
}

/**
* Returns the weighted variance. Null values are excluded.
* Returns the weighted sample variance. Null values are excluded.
*
* Weighted sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted variance of non-null values.
* @return weighted sample variance of non-null values.
*/
public static double wvar(${pt.vector} values, ${pt2.primitive}[] weights) {
if (values == null || weights == null) {
Expand All @@ -540,11 +558,14 @@ public class Numeric {
}

/**
* Returns the weighted variance. Null values are excluded.
* Returns the weighted sample variance. Null values are excluded.
*
* Weighted sample variance is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted variance of non-null values.
* @return weighted sample variance of non-null values.
*/
public static double wvar(${pt.vector} values, ${pt2.vector} weights) {
if (values == null || weights == null) {
Expand Down Expand Up @@ -579,7 +600,7 @@ public class Numeric {
}
}

// Return NaN if poisoned or too few values to compute variance.
// Return NaN if poisoned or too few values to compute sample variance.
if (count <= 1 || Double.isNaN(sum) || Double.isNaN(sum2) || Double.isNaN(count) || Double.isNaN(count2)) {
return Double.NaN;
}
Expand All @@ -597,20 +618,26 @@ public class Numeric {


/**
* Returns the standard deviation. Null values are excluded.
* Returns the sample standard deviation. Null values are excluded.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* @param values values.
* @return standard deviation of non-null values.
* @return sample standard deviation of non-null values.
*/
public static double std(${pt.boxed}[] values) {
// Unbox the boxed array, then delegate to the std overload that accepts the unboxed result.
return std(unbox(values));
}

/**
* Returns the standard deviation. Null values are excluded.
* Returns the sample standard deviation. Null values are excluded.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* @param values values.
* @return standard deviation of non-null values.
* @return sample standard deviation of non-null values.
*/
public static double std(${pt.primitive}... values) {
if (values == null) {
Expand All @@ -621,10 +648,13 @@ public class Numeric {
}

/**
* Returns the standard deviation. Null values are excluded.
* Returns the sample standard deviation. Null values are excluded.
*
* Sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the sample variance will be an unbiased estimator of population variance.
*
* @param values values.
* @return standard deviation of non-null values.
* @return sample standard deviation of non-null values.
*/
public static double std(${pt.vector} values) {
if (values == null) {
Expand All @@ -639,11 +669,14 @@ public class Numeric {
<#if pt2.valueType.isNumber >

/**
* Returns the weighted standard deviation. Null values are excluded.
* Returns the weighted sample standard deviation. Null values are excluded.
*
* Weighted sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted standard deviation of non-null values.
* @return weighted sample standard deviation of non-null values.
*/
public static double wstd(${pt.primitive}[] values, ${pt2.primitive}[] weights) {
if (values == null || weights == null) {
Expand All @@ -654,11 +687,14 @@ public class Numeric {
}

/**
* Returns the weighted standard deviation. Null values are excluded.
* Returns the weighted sample standard deviation. Null values are excluded.
*
* Weighted sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted standard deviation of non-null values.
* @return weighted sample standard deviation of non-null values.
*/
public static double wstd(${pt.primitive}[] values, ${pt2.vector} weights) {
if (values == null || weights == null) {
Expand All @@ -669,11 +705,14 @@ public class Numeric {
}

/**
* Returns the weighted standard deviation. Null values are excluded.
* Returns the weighted sample standard deviation. Null values are excluded.
*
* Weighted sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted standard deviation of non-null values.
* @return weighted sample standard deviation of non-null values.
*/
public static double wstd(${pt.vector} values, ${pt2.primitive}[] weights) {
if (values == null || weights == null) {
Expand All @@ -684,11 +723,14 @@ public class Numeric {
}

/**
* Returns the weighted standard deviation. Null values are excluded.
* Returns the weighted sample standard deviation. Null values are excluded.
*
* Weighted sample standard deviation is computed using Bessel's correction (https://en.wikipedia.org/wiki/Bessel%27s_correction),
* which ensures that the weighted sample variance will be an unbiased estimator of weighted population variance.
*
* @param values values.
* @param weights weights
* @return weighted standard deviation of non-null values.
* @return weighted sample standard deviation of non-null values.
*/
public static double wstd(${pt.vector} values, ${pt2.vector} weights) {
if (values == null || weights == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ public enum AggType {
Sum,
/** Return the sum of absolute values in each group. */
AbsSum,
/** Return the variance of values in each group. */
/** Return the sample variance of values in each group. */
Var,
/** Return the average of values in each group. */
Avg,
/** Return the standard deviation of each group. */
/** Return the sample standard deviation of values in each group. */
Std,
/** Return the first value of each group. */
First,
Expand Down
Loading

0 comments on commit 318c9d2

Please sign in to comment.