JuliaStats · gragusa · Jun 10, 2022 · Jun 15, 2022 · Jun 15, 2022 · Jun 17, 2022
diff --git a/src/GLM.jl b/src/GLM.jl
@@ -12,14 +12,14 @@ module GLM
     import Statistics: cor
     import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
                       loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
-                      fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue
+                      fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue, weights
     import StatsFuns: xlogy
     import SpecialFunctions: erfc, erfcinv, digamma, trigamma
     import StatsModels: hasintercept
     export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
            loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
            fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
-           cooksdistance, hasintercept
+           cooksdistance, hasintercept, weights, AnalyticWeights, ProbabilityWeights, FrequencyWeights
 
     export
         # types
@@ -52,17 +52,17 @@ module GLM
         LinearModel,
 
         # functions
-        canonicallink,  # canonical link function for a distribution
-        deviance,       # deviance of fitted and observed responses
-        devresid,       # vector of squared deviance residuals
-        formula,        # extract the formula from a model
-        glm,            # general interface
-        linpred,        # linear predictor
-        lm,             # linear model
-        negbin,         # interface to fitting negative binomial regression
-        nobs,           # total number of observations
-        predict,        # make predictions
-        ftest           # compare models with an F test
+        canonicallink,     # canonical link function for a distribution
+        deviance,          # deviance of fitted and observed responses
+        devresid,          # vector of squared deviance residuals
+        formula,           # extract the formula from a model
+        glm,               # general interface
+        linpred,           # linear predictor
+        lm,                # linear model
+        negbin,            # interface to fitting negative binomial regression
+        nobs,              # total number of observations
+        predict,           # make predictions
+        ftest              # compare models with an F test
 
     const FP = AbstractFloat
     const FPVector{T<:FP} = AbstractArray{T,1}

diff --git a/src/glmfit.jl b/src/glmfit.jl
@@ -3,7 +3,7 @@
 
 The response vector and various derived vectors in a generalized linear model.
 """
-struct GlmResp{V<:FPVector,D<:UnivariateDistribution,L<:Link} <: ModResp
+struct GlmResp{V<:FPVector, D<:UnivariateDistribution,L<:Link,W<:AbstractWeights{<:Real}} <: ModResp
     "`y`: response vector"
     y::V
     d::D
@@ -17,15 +17,15 @@ struct GlmResp{V<:FPVector,D<:UnivariateDistribution,L<:Link} <: ModResp
     mu::V
     "`offset:` offset added to `Xβ` to form `eta`.  Can be of length 0"
     offset::V
-    "`wts:` prior case weights.  Can be of length 0."
-    wts::V
+    "`wts`: prior case weights.  Can be of length 0."
+    wts::W
     "`wrkwt`: working case weights for the Iteratively Reweighted Least Squares (IRLS) algorithm"
     wrkwt::V
     "`wrkresid`: working residuals for IRLS"
     wrkresid::V
 end
 
-function GlmResp(y::V, d::D, l::L, η::V, μ::V, off::V, wts::V) where {V<:FPVector, D, L}
+function GlmResp(y::V, d::D, l::L, η::V, μ::V, off::V, wts::W) where {V<:FPVector, D, L, W}
     n  = length(y)
     nη = length(η)
     nμ = length(μ)
@@ -41,36 +41,34 @@ function GlmResp(y::V, d::D, l::L, η::V, μ::V, off::V, wts::V) where {V<:FPVec
     end
 
     # Lengths of wts and off can be either n or 0
-    if lw != 0 && lw != n
-        throw(DimensionMismatch("wts must have length $n or length 0 but was $lw"))
+    if lw != n
+        throw(DimensionMismatch("wts must have length $n but was $lw"))
     end
     if lo != 0 && lo != n
         throw(DimensionMismatch("offset must have length $n or length 0 but was $lo"))
     end
 
-    return GlmResp{V,D,L}(y, d, l, similar(y), η, μ, off, wts, similar(y), similar(y))
+    return GlmResp{V,D,L,W}(y, d, l, similar(y), η, μ, off, wts, similar(y), similar(y))
 end
 
-function GlmResp(y::FPVector, d::Distribution, l::Link, off::FPVector, wts::FPVector)
+function GlmResp(y::FPVector, d::Distribution, l::Link, off::FPVector, wts::AbstractWeights{<:Real})
     # Instead of convert(Vector{Float64}, y) to be more ForwardDiff friendly
     _y   = convert(Vector{float(eltype(y))}, y)
     _off = convert(Vector{float(eltype(off))}, off)
-    _wts = convert(Vector{float(eltype(wts))}, wts)
     η    = similar(_y)
     μ    = similar(_y)
-    r    = GlmResp(_y, d, l, η, μ, _off, _wts)
-    initialeta!(r.eta, d, l, _y, _wts, _off)
+    r    = GlmResp(_y, d, l, η, μ, _off, wts)
+    initialeta!(r.eta, d, l, _y, wts, _off)
     updateμ!(r, r.eta)
     return r
 end
 
-function GlmResp(y::AbstractVector{<:Real}, d::D, l::L, off::AbstractVector{<:Real},
-                 wts::AbstractVector{<:Real}) where {D, L}
-        GlmResp(float(y), d, l, float(off), float(wts))
+function GlmResp(y::AbstractVector{<:Real}, d::D, l::L, off::AbstractVector{<:Real}, wts::AbstractWeights{<:Real}) where {D, L}
+    GlmResp(float(y), d, l, float(off), wts)
 end
 
 deviance(r::GlmResp) = sum(r.devresid)
-
+weights(r::GlmResp) = r.wts
 """
     cancancel(r::GlmResp{V,D,L})
 
@@ -288,7 +286,7 @@ function _fit!(m::AbstractGLM, verbose::Bool, maxiter::Integer, minstepfac::Real
     lp = r.mu
 
     # Initialize β, μ, and compute deviance
-    if start == nothing || isempty(start)
+    if start === nothing || isempty(start)
         # Compute beta update based on default response value
         # if no starting values have been passed
         delbeta!(p, wrkresp(r), r.wrkwt)
@@ -440,12 +438,9 @@ const FIT_GLM_DOC = """
 
     # Keyword Arguments
     - `dofit::Bool=true`: Determines whether model will be fit
-    - `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations.
-      Such weights are equivalent to repeating each observation a number of times equal
-      to its weight. Do note that this interpretation gives equal point estimates but
-      different standard errors from analytical (a.k.a. inverse variance) weights and
-      from probability (a.k.a. sampling) weights which are the default in some other
-      software.
+    - `wts::AbstractWeights=aweights(similar(y,0))`: Weights of observations.
+      Allowed weights are `AnalyticalWeights`, `FrequencyWeights`, or `ProbabilityWeights`.
+      If a vector is passed (deprecated) it is coerced to FrequencyWeights.
       Can be length 0 to indicate no weighting (default).
     - `offset::Vector=similar(y,0)`: offset added to `Xβ` to form `eta`.  Can be of
       length 0
@@ -476,17 +471,26 @@ function fit(::Type{M},
     d::UnivariateDistribution,
     l::Link = canonicallink(d);
     dofit::Bool = true,
-    wts::AbstractVector{<:Real}      = similar(y, 0),
-    offset::AbstractVector{<:Real}   = similar(y, 0),
+    wts::AbstractVector{<:Real} = uweights(length(y)),
+    offset::AbstractVector{<:Real} = similar(y, 0),
     fitargs...) where {M<:AbstractGLM}
-
     # Check that X and y have the same number of observations
     if size(X, 1) != size(y, 1)
         throw(DimensionMismatch("number of rows in X and y must match"))
     end
-
-    rr = GlmResp(y, d, l, offset, wts)
-    res = M(rr, cholpred(X), false)
+    # For backward compatibility accept wts as AbstractArray and coerce them to FrequencyWeights
+    _wts = if isa(wts, AbstractWeights)
+        wts
+    elseif isa(wts, AbstractVector)
+        Base.depwarn("Passing weights as vector is deprecated in favor of explicitely using " *
+                     "AnalyticalWeights, ProbabilityWeights, or FrequencyWeights. Proceeding " *
+                     "by coercing wts to `FrequencyWeights`", :fit)
+        fweights(wts)
+    else
+        throw(ArgumentError("`wts` should be an AbstractVector coercible to AbstractWeights"))
+    end
+    rr = GlmResp(y, d, l, offset, _wts)
+    res = M(rr, cholpred(X, false, _wts), false)
     return dofit ? fit!(res; fitargs...) : res
 end
 
@@ -537,6 +541,7 @@ function dispersion(m::AbstractGLM, sqr::Bool=false)
     end
 end
 
+
 """
     predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[],
             interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95,
@@ -649,3 +654,18 @@ function checky(y, d::Binomial)
     end
     return nothing
 end
+
+"""
+    nobs(obj::LinearModel)
+    nobs(obj::GLM)
+
+For linear and generalized linear models, returns the number of rows, or,
+when prior weights of type FrequencyWeights are specified, the sum of weights.
+"""
+nobs(obj::LinPredModel) = nobs(obj.rr)
+
+nobs(r::LmResp{V,W}) where {V,W} = oftype(sum(one(eltype(r.wts))), length(r.y))
+nobs(r::LmResp{V,W}) where {V,W<:FrequencyWeights} = isempty(r.wts) ? oftype(sum(one(eltype(r.wts))), length(r.y)) : r.wts.sum
+
+nobs(r::GlmResp{V,D,L,W}) where {V,D,L,W<:FrequencyWeights} = isempty(r.wts) ? oftype(sum(one(eltype(r.wts))), length(r.y)) : r.wts.sum
+nobs(r::GlmResp{V,D,L,W}) where {V,D,L,W} = oftype(sum(one(eltype(r.wts))), length(r.y))