Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Taking Weights Seriously #485

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions src/GLM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ module GLM
import Statistics: cor
import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue
fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue, weights
import StatsFuns: xlogy
import SpecialFunctions: erfc, erfcinv, digamma, trigamma
import StatsModels: hasintercept
export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
cooksdistance, hasintercept
cooksdistance, hasintercept, weights, AnalyticWeights, ProbabilityWeights, FrequencyWeights

export
# types
Expand Down Expand Up @@ -52,17 +52,17 @@ module GLM
LinearModel,

# functions
canonicallink, # canonical link function for a distribution
deviance, # deviance of fitted and observed responses
devresid, # vector of squared deviance residuals
formula, # extract the formula from a model
glm, # general interface
linpred, # linear predictor
lm, # linear model
negbin, # interface to fitting negative binomial regression
nobs, # total number of observations
predict, # make predictions
ftest # compare models with an F test
canonicallink, # canonical link function for a distribution
deviance, # deviance of fitted and observed responses
devresid, # vector of squared deviance residuals
formula, # extract the formula from a model
glm, # general interface
linpred, # linear predictor
lm, # linear model
negbin, # interface to fitting negative binomial regression
nobs, # total number of observations
predict, # make predictions
ftest # compare models with an F test

const FP = AbstractFloat
const FPVector{T<:FP} = AbstractArray{T,1}
Expand Down
76 changes: 48 additions & 28 deletions src/glmfit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

The response vector and various derived vectors in a generalized linear model.
"""
struct GlmResp{V<:FPVector,D<:UnivariateDistribution,L<:Link} <: ModResp
struct GlmResp{V<:FPVector, D<:UnivariateDistribution,L<:Link,W<:AbstractWeights{<:Real}} <: ModResp
"`y`: response vector"
y::V
d::D
Expand All @@ -17,15 +17,15 @@ struct GlmResp{V<:FPVector,D<:UnivariateDistribution,L<:Link} <: ModResp
mu::V
"`offset:` offset added to `Xβ` to form `eta`. Can be of length 0"
offset::V
"`wts:` prior case weights. Can be of length 0."
wts::V
"`wts`: prior case weights. Can be of length 0."
wts::W
"`wrkwt`: working case weights for the Iteratively Reweighted Least Squares (IRLS) algorithm"
wrkwt::V
"`wrkresid`: working residuals for IRLS"
wrkresid::V
end

function GlmResp(y::V, d::D, l::L, η::V, μ::V, off::V, wts::V) where {V<:FPVector, D, L}
function GlmResp(y::V, d::D, l::L, η::V, μ::V, off::V, wts::W) where {V<:FPVector, D, L, W}
n = length(y)
nη = length(η)
nμ = length(μ)
Expand All @@ -41,36 +41,34 @@ function GlmResp(y::V, d::D, l::L, η::V, μ::V, off::V, wts::V) where {V<:FPVec
end

# Lengths of wts and off can be either n or 0
if lw != 0 && lw != n
throw(DimensionMismatch("wts must have length $n or length 0 but was $lw"))
if lw != n
throw(DimensionMismatch("wts must have length $n but was $lw"))
end
if lo != 0 && lo != n
throw(DimensionMismatch("offset must have length $n or length 0 but was $lo"))
end

return GlmResp{V,D,L}(y, d, l, similar(y), η, μ, off, wts, similar(y), similar(y))
return GlmResp{V,D,L,W}(y, d, l, similar(y), η, μ, off, wts, similar(y), similar(y))
end

function GlmResp(y::FPVector, d::Distribution, l::Link, off::FPVector, wts::FPVector)
function GlmResp(y::FPVector, d::Distribution, l::Link, off::FPVector, wts::AbstractWeights{<:Real})
# Instead of convert(Vector{Float64}, y) to be more ForwardDiff friendly
_y = convert(Vector{float(eltype(y))}, y)
_off = convert(Vector{float(eltype(off))}, off)
_wts = convert(Vector{float(eltype(wts))}, wts)
η = similar(_y)
μ = similar(_y)
r = GlmResp(_y, d, l, η, μ, _off, _wts)
initialeta!(r.eta, d, l, _y, _wts, _off)
r = GlmResp(_y, d, l, η, μ, _off, wts)
initialeta!(r.eta, d, l, _y, wts, _off)
updateμ!(r, r.eta)
return r
end

function GlmResp(y::AbstractVector{<:Real}, d::D, l::L, off::AbstractVector{<:Real},
wts::AbstractVector{<:Real}) where {D, L}
GlmResp(float(y), d, l, float(off), float(wts))
function GlmResp(y::AbstractVector{<:Real}, d::D, l::L, off::AbstractVector{<:Real}, wts::AbstractWeights{<:Real}) where {D, L}
GlmResp(float(y), d, l, float(off), wts)
end

deviance(r::GlmResp) = sum(r.devresid)

weights(r::GlmResp) = r.wts
"""
cancancel(r::GlmResp{V,D,L})

Expand Down Expand Up @@ -288,7 +286,7 @@ function _fit!(m::AbstractGLM, verbose::Bool, maxiter::Integer, minstepfac::Real
lp = r.mu

# Initialize β, μ, and compute deviance
if start == nothing || isempty(start)
if start === nothing || isempty(start)
# Compute beta update based on default response value
# if no starting values have been passed
delbeta!(p, wrkresp(r), r.wrkwt)
Expand Down Expand Up @@ -440,12 +438,9 @@ const FIT_GLM_DOC = """

# Keyword Arguments
- `dofit::Bool=true`: Determines whether model will be fit
- `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations.
Such weights are equivalent to repeating each observation a number of times equal
to its weight. Do note that this interpretation gives equal point estimates but
different standard errors from analytical (a.k.a. inverse variance) weights and
from probability (a.k.a. sampling) weights which are the default in some other
software.
- `wts::AbstractWeights=aweights(similar(y,0))`: Weights of observations.
Allowed weights are `AnalyticalWeights`, `FrequencyWeights`, or `ProbabilityWeights`.
If a vector is passed (deprecated) it is coerced to FrequencyWeights.
Can be length 0 to indicate no weighting (default).
- `offset::Vector=similar(y,0)`: offset added to `Xβ` to form `eta`. Can be of
length 0
Expand Down Expand Up @@ -476,17 +471,26 @@ function fit(::Type{M},
d::UnivariateDistribution,
l::Link = canonicallink(d);
dofit::Bool = true,
wts::AbstractVector{<:Real} = similar(y, 0),
offset::AbstractVector{<:Real} = similar(y, 0),
wts::AbstractVector{<:Real} = uweights(length(y)),
offset::AbstractVector{<:Real} = similar(y, 0),
fitargs...) where {M<:AbstractGLM}

# Check that X and y have the same number of observations
if size(X, 1) != size(y, 1)
throw(DimensionMismatch("number of rows in X and y must match"))
end

rr = GlmResp(y, d, l, offset, wts)
res = M(rr, cholpred(X), false)
# For backward compatibility accept wts as AbstractArray and coerce them to FrequencyWeights
_wts = if isa(wts, AbstractWeights)
wts
elseif isa(wts, AbstractVector)
Base.depwarn("Passing weights as vector is deprecated in favor of explicitely using " *
"AnalyticalWeights, ProbabilityWeights, or FrequencyWeights. Proceeding " *
"by coercing wts to `FrequencyWeights`", :fit)
fweights(wts)
else
throw(ArgumentError("`wts` should be an AbstractVector coercible to AbstractWeights"))
end
rr = GlmResp(y, d, l, offset, _wts)
res = M(rr, cholpred(X, false, _wts), false)
return dofit ? fit!(res; fitargs...) : res
end

Expand Down Expand Up @@ -537,6 +541,7 @@ function dispersion(m::AbstractGLM, sqr::Bool=false)
end
end


"""
predict(mm::AbstractGLM, newX::AbstractMatrix; offset::FPVector=eltype(newX)[],
interval::Union{Symbol,Nothing}=nothing, level::Real = 0.95,
Expand Down Expand Up @@ -649,3 +654,18 @@ function checky(y, d::Binomial)
end
return nothing
end

"""
nobs(obj::LinearModel)
nobs(obj::GLM)

For linear and generalized linear models, returns the number of rows, or,
when prior weights of type FrequencyWeights are specified, the sum of weights.
"""
nobs(obj::LinPredModel) = nobs(obj.rr)

nobs(r::LmResp{V,W}) where {V,W} = oftype(sum(one(eltype(r.wts))), length(r.y))
nobs(r::LmResp{V,W}) where {V,W<:FrequencyWeights} = isempty(r.wts) ? oftype(sum(one(eltype(r.wts))), length(r.y)) : r.wts.sum

nobs(r::GlmResp{V,D,L,W}) where {V,D,L,W<:FrequencyWeights} = isempty(r.wts) ? oftype(sum(one(eltype(r.wts))), length(r.y)) : r.wts.sum
nobs(r::GlmResp{V,D,L,W}) where {V,D,L,W} = oftype(sum(one(eltype(r.wts))), length(r.y))
Loading