diff --git a/README.md b/README.md index cb799f4..aad1708 100755 --- a/README.md +++ b/README.md @@ -139,12 +139,14 @@ df = dataset("plm", "Cigar") reg(df, @formula(Sales ~ NDI + fe(State) + fe(Year)), method = :Metal) ``` +GPU methods default to single precision (`Float32`) for the demeaning step. On CUDA you can pass `double_precision = true` to use `Float64`; Metal supports only `Float32`. + ## Solution method Denote the model `y = X β + D θ + e` where X is a matrix with few columns and D is the design matrix from categorical variables. Estimates for `β`, along with their standard errors, are obtained in two steps: 1. `y, X` are regressed on `D` using the package [FixedEffects.jl](https://github.com/FixedEffects/FixedEffects.jl) 2. Estimates for `β`, along with their standard errors, are obtained by regressing the projected `y` on the projected `X` (an application of the Frisch Waugh-Lovell Theorem) -3. With the option `save = true`, estimates for the high dimensional fixed effects are obtained after regressing the residuals of the full model minus the residuals of the partialed out models on `D` using the package [FixedEffects.jl](https://github.com/FixedEffects/FixedEffects.jl) +3. With the option `save = :fe` (or `save = :all`), estimates for the high dimensional fixed effects are obtained after regressing the residuals of the full model minus the residuals of the partialed out models on `D` using the package [FixedEffects.jl](https://github.com/FixedEffects/FixedEffects.jl) Here are some references for the solution method: diff --git a/src/FixedEffectModel.jl b/src/FixedEffectModel.jl index 3403787..ee125f1 100644 --- a/src/FixedEffectModel.jl +++ b/src/FixedEffectModel.jl @@ -5,6 +5,16 @@ ## ############################################################################## +""" + FixedEffectModel <: RegressionModel + +The fitted model returned by [`reg`](@ref). It is lightweight: it stores the coefficients, +variance-covariance matrix, fit statistics, and (optionally) saved residuals and fixed +effects, but not the original data. Inspect it with the `StatsAPI` accessors +(`coef`, `vcov`, `stderror`, `confint`, `coeftable`, `nobs`, `dof_residual`, `r2`, …), +and with [`fe`](@ref) for saved fixed effects. `predict`/`residuals` take the data table as +a second argument since it is not retained. +""" struct FixedEffectModel <: RegressionModel coef::Vector{Float64} # Vector of coefficients vcov::Matrix{Float64} # Covariance matrix @@ -43,7 +53,18 @@ struct FixedEffectModel <: RegressionModel p_kp::Float64 # First Stage p value KP end +""" + has_iv(m::FixedEffectModel) + +Return `true` if the model was estimated with instrumental variables (2SLS). +""" has_iv(m::FixedEffectModel) = has_iv(m.formula) + +""" + has_fe(m::FixedEffectModel) + +Return `true` if the model includes high-dimensional fixed effects (`fe(...)` terms). +""" has_fe(m::FixedEffectModel) = has_fe(m.formula) @@ -62,6 +83,12 @@ StatsAPI.nulldeviance(m::FixedEffectModel) = m.tss StatsAPI.rss(m::FixedEffectModel) = m.rss StatsAPI.mss(m::FixedEffectModel) = nulldeviance(m) - rss(m) StatsModels.formula(m::FixedEffectModel) = m.formula_schema +""" + dof_fes(m::FixedEffectModel) + +Return the number of degrees of freedom absorbed by the fixed effects (the total number of +fixed-effect levels across all `fe(...)` dimensions). +""" dof_fes(m::FixedEffectModel) = m.dof_fes function StatsAPI.loglikelihood(m::FixedEffectModel) @@ -256,9 +283,12 @@ function StatsAPI.coeftable(m::FixedEffectModel; level = 0.95) coefnms = coefnms[newindex] end tt = cc ./ se + # Label the confidence-interval columns with the requested level, not a hard-coded 95%. + pct = 100 * level + pctstr = isinteger(pct) ? string(Int(pct)) : string(pct) CoefTable( hcat(cc, se, tt, fdistccdf.(Ref(1), Ref(StatsAPI.dof_residual(m)), abs2.(tt)), conf_int[:, 1:2]), - ["Estimate","Std. Error","t-stat", "Pr(>|t|)", "Lower 95%", "Upper 95%" ], + ["Estimate","Std. Error","t-stat", "Pr(>|t|)", "Lower $(pctstr)%", "Upper $(pctstr)%" ], ["$(coefnms[i])" for i = 1:length(cc)], 4) end diff --git a/src/fit.jl b/src/fit.jl index d7790a3..22c476b 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -23,6 +23,8 @@ Estimate a linear model with high dimensional categorical variables / instrument ### Details Models with instruments variables are estimated using 2SLS. `reg` tests for weak instruments by computing the Kleibergen-Paap rk Wald F statistic, a generalization of the Cragg-Donald Wald F statistic for non i.i.d. errors. The statistic is similar to the one returned by the Stata command `ivreg2`. +Regressors that are collinear with other regressors (or with the fixed effects) are dropped from the estimation. A dropped coefficient is reported as `0` with a `NaN` standard error (and `NaN` t-statistic, p-value, and confidence interval). + ### Examples ```julia using RDatasets, FixedEffectModels @@ -116,6 +118,13 @@ function StatsAPI.fit(::Type{FixedEffectModel}, has_iv = formula_iv != FormulaTerm(ConstantTerm(0), ConstantTerm(0)) formula, formula_fes = parse_fe(formula) has_fes = formula_fes != FormulaTerm(ConstantTerm(0), ConstantTerm(0)) + # HC2/HC3 compute leverage from the partialled-out (demeaned) regressors only, which + # omits the absorbed fixed-effect contribution to leverage. The resulting standard + # errors are silently anti-conservative, so guard against the combination (mirroring + # the IV guard, where HC2/HC3 are likewise rejected). + if has_fes && vcov isa Vcov.RobustCovariance && vcov.correction ∈ (:hc2, :hc3) + throw(ArgumentError("$(uppercase(string(vcov.correction))) standard errors are not supported with fixed effects, because the leverage correction does not account for the absorbed fixed effects. Use Vcov.robust() (HC1) or Vcov.cluster(...) instead.")) + end # when save = :fe but there are no fixed effects in the formula, don't save fixed effects save_fes = save ∈ (:fe, :all) && has_fes has_weights = weights !== nothing @@ -325,9 +334,14 @@ function StatsAPI.fit(::Type{FixedEffectModel}, # Compute Fstat F = Fstat(coef, matrix_vcov, has_intercept) - # dof_ is the number of estimated coefficients beyond the intercept. + # dof_ is the number of estimated coefficients beyond the intercept (F-stat numerator). dof_ = size(X, 2) - has_intercept - dof_tstat_ = max(1, Vcov.dof_residual(vcov_data, vcov_method) - (has_intercept || has_fe_intercept)) + # Residual dof for t-tests/p-values/CIs. Vcov.dof_residual already returns the final + # value: nobs - size(X,2) - dof_fes for simple/robust (size(X,2) already counts the + # intercept column, or the constant is inside dof_fes when absorbed by a fixed effect), + # and minimum(nclusters)-1 = G-1 for clustering. It must NOT be decremented again for + # the intercept (doing so reported N-K-1 instead of N-K, and G-2 instead of G-1). + dof_tstat_ = max(1, Vcov.dof_residual(vcov_data, vcov_method)) p = fdistccdf(dof_, dof_tstat_, F) # Compute Fstat of First Stage diff --git a/src/partial_out.jl b/src/partial_out.jl index 91559cc..58ffa8b 100644 --- a/src/partial_out.jl +++ b/src/partial_out.jl @@ -5,17 +5,19 @@ Partial out variables in a Dataframe * `df`: A table * `formula::FormulaTerm`: A formula created using `@formula` * `add_mean::Bool`: Should the initial mean added to the returned variable? -* `method::Symbol`: A symbol for the method. Default is :cpu. Alternatively, :gpu requires `CuArrays`. In this case, use the option `double_precision = false` to use `Float32`. +* `method::Symbol`: A symbol for the method. Default is `:cpu`. Alternatively, use `:CUDA` or `:Metal` (load the respective package — CUDA.jl or Metal.jl — before `using FixedEffectModels`). * `maxiter::Integer`: Maximum number of iterations * `double_precision::Bool`: Should the demeaning operation use Float64 rather than Float32? Default to true. -* `tol::Real`: Tolerance +* `tol::Real`: Tolerance for the iterative demeaning. Default `1e-6`, matching `reg` (so that `partial_out` and `reg(...; save = :residuals)` return the same residuals). * `align::Bool`: Should the returned DataFrame align with the original DataFrame in case of missing values? Default to true. * `drop_singletons::Bool=false`: Should singletons be dropped? ### Returns -* `::DataFrame`: a dataframe with as many columns as there are dependent variables and as many rows as the original dataframe. -* `::Vector{Int}`: a vector of iterations for each column -* `::Vector{Bool}`: a vector of success for each column +* `::DataFrame`: residuals, one column per dependent variable, aligned with the original dataframe. +* `::BitVector`: the estimation-sample mask (the rows actually used). +* `::Vector{Int}`: number of demeaning iterations for each column. +* `::Vector{Bool}`: convergence flag for each column. +* `::Int`: degrees of freedom absorbed by the fixed effects. ### Details `partial_out` returns the residuals of a set of variables after regressing them on a set of regressors. The syntax is similar to `reg` - but it accepts multiple dependent variables. It returns a dataframe with as many columns as there are dependent variables and as many rows as the original dataframe. @@ -40,7 +42,7 @@ function partial_out( @nospecialize(contrasts::Dict = Dict{Symbol, Any}()), @nospecialize(method::Symbol = :cpu), @nospecialize(double_precision::Bool = true), - @nospecialize(tol::Real = double_precision ? 1e-8 : 1e-6), + @nospecialize(tol::Real = 1e-6), @nospecialize(align = true), @nospecialize(drop_singletons = false)) @@ -48,6 +50,11 @@ function partial_out( # DataFrame indexing/views, and copycols = false avoids copies when possible. df = DataFrame(df; copycols = false) + if method == :gpu + @info "method = :gpu is deprecated. Use method = :CUDA or method = :Metal" + method = :CUDA + end + if (ConstantTerm(0) ∉ eachterm(f.rhs)) && (ConstantTerm(1) ∉ eachterm(f.rhs)) f = FormulaTerm(f.lhs, tuple(ConstantTerm(1), eachterm(f.rhs)...)) end diff --git a/src/utils/basecol.jl b/src/utils/basecol.jl index 2e7c3b9..326f9e3 100644 --- a/src/utils/basecol.jl +++ b/src/utils/basecol.jl @@ -5,7 +5,8 @@ Generalized inverse via sweep operator. Returns minus the symmetric inverse (negated so callers negate back). """ function invsym!(X::Symmetric; has_intercept = false, setzeros = false, diagonal = 1:size(X, 2)) - tols = max.(diag(X), 1) + # Per-column reference scale for the numerically-zero pivot test below. + tols = sweep_tolerances(X) buffer = zeros(size(X, 1)) for j in diagonal d = X[j,j] @@ -17,17 +18,31 @@ function invsym!(X::Symmetric; has_intercept = false, setzeros = false, diagonal copy!(buffer, view(X, :, j)) Symmetric(BLAS.syrk!('U', 'N', -1/d, buffer, one(eltype(X)), X.data)) rmul!(buffer, 1 / d) - @views copy!(X.data[1:j-1,j], buffer[1:j-1]) - @views copy!(X.data[j, j+1:end], buffer[j+1:end]) + @views copy!(X.data[1:j-1,j], buffer[1:j-1]) + @views copy!(X.data[j, j+1:end], buffer[j+1:end]) X[j,j] = - 1 / d end if setzeros && has_intercept && j == 1 - tols = max.(diag(X), 1) + tols = sweep_tolerances(X) end end return X end +# Reference scale per column used by invsym! to flag a (post-sweep) pivot as numerically +# zero: a column is dropped when |pivot| < scale_j * sqrt(eps). The scale is the column's +# own diagonal, floored by a tiny multiple of the largest diagonal so the test is +# scale-invariant. The previous absolute floor of 1 spuriously dropped genuinely +# independent regressors whose total sum of squares was below ~sqrt(eps), while the +# relative floor still drops all-zero columns (diag 0 -> floored to ref*eps > 0). On +# well-scaled designs (all diagonals >= 1) this reproduces the old tolerances exactly. +function sweep_tolerances(X::Symmetric) + d = diag(X) + ref = maximum(d; init = zero(eltype(d))) + floor_ = ref > 0 ? ref * eps(eltype(d)) : eps(eltype(d)) + return max.(d, floor_) +end + """ basis!(XX::Symmetric; has_intercept=false) @@ -189,6 +204,11 @@ end Expand coefficient vector and vcov matrix to account for omitted (collinear) variables and IV-reclassified variable permutations. + +Coefficients dropped for collinearity are reinserted with a value of `0`, and the +corresponding rows and columns of the variance-covariance matrix are filled with `NaN`. +This is the convention used to flag an omitted regressor: its standard error, t-statistic, +p-value, and confidence interval are therefore reported as `NaN`. """ function reinsert_omitted!( coef::Vector{Float64}, # estimated coefficients (excluding omitted) @@ -197,28 +217,19 @@ function reinsert_omitted!( perm::Union{Nothing, Vector{Int}} # permutation from IV reclassification, or nothing ) if !all(basis_coef) + kept = findall(basis_coef) newcoef = zeros(length(basis_coef)) + newcoef[kept] = coef + newmatrix_vcov = fill(NaN, (length(basis_coef), length(basis_coef))) - newindex = [searchsortedfirst(cumsum(basis_coef), i) for i in 1:length(coef)] - for i in eachindex(newindex) - newcoef[newindex[i]] = coef[i] - for j in eachindex(newindex) - newmatrix_vcov[newindex[i], newindex[j]] = matrix_vcov[i, j] - end - end + newmatrix_vcov[kept, kept] = Matrix(matrix_vcov) coef = newcoef matrix_vcov = Symmetric(newmatrix_vcov) end if perm !== nothing - _invperm = invperm(perm) - coef = coef[_invperm] - newmatrix_vcov = zeros(size(matrix_vcov)) - for i in 1:size(newmatrix_vcov, 1) - for j in 1:size(newmatrix_vcov, 1) - newmatrix_vcov[i, j] = matrix_vcov[_invperm[i], _invperm[j]] - end - end - matrix_vcov = Symmetric(newmatrix_vcov) + invp = invperm(perm) + coef = coef[invp] + matrix_vcov = Symmetric(Matrix(matrix_vcov)[invp, invp]) end return coef, matrix_vcov end diff --git a/src/utils/fixedeffects.jl b/src/utils/fixedeffects.jl index 2155bbb..e653128 100644 --- a/src/utils/fixedeffects.jl +++ b/src/utils/fixedeffects.jl @@ -52,8 +52,20 @@ end function nunique(fe::FixedEffect) seen = falses(fe.n) - @inbounds for ref in fe.refs - seen[ref] = true + if fe.interaction isa UnitWeights + @inbounds for ref in fe.refs + seen[ref] = true + end + else + # Continuous-slope fixed effect (e.g. fe(id)&x): a group whose interaction is + # identically zero contributes no column to the design and absorbs no degree of + # freedom (the demeaning sets its scale to 0, SolverCPU.scale!). Count only groups + # with at least one nonzero interaction value, so dof_fes is not overstated. + @inbounds for i in eachindex(fe.refs, fe.interaction) + if !iszero(fe.interaction[i]) + seen[fe.refs[i]] = true + end + end end count(seen) end diff --git a/src/utils/formula.jl b/src/utils/formula.jl index f56cc05..d58eba0 100644 --- a/src/utils/formula.jl +++ b/src/utils/formula.jl @@ -14,6 +14,7 @@ eachterm(@nospecialize(x::NTuple{N, AbstractTerm})) where {N} = x ############################################################################## has_iv(@nospecialize(f::FormulaTerm)) = any(x -> x isa FormulaTerm, eachterm(f.rhs)) function parse_iv(@nospecialize(f::FormulaTerm)) + f.lhs isa FormulaTerm && throw(ArgumentError("Malformed formula: the left-hand side cannot contain `~`. The instrumental-variable syntax is `y ~ exogenous + (endogenous ~ instruments)`.")) if has_iv(f) i = findfirst(x -> x isa FormulaTerm, eachterm(f.rhs)) term = eachterm(f.rhs)[i] @@ -41,6 +42,18 @@ struct FixedEffectTerm <: AbstractTerm x::Symbol end StatsModels.termvars(t::FixedEffectTerm) = [t.x] + +""" + fe(x) + +Mark a variable as a high-dimensional fixed effect (a categorical variable to be absorbed) +inside a `@formula` passed to [`reg`](@ref) or [`partial_out`](@ref), e.g. +`@formula(y ~ x + fe(id))`. Several fixed effects are added with `+`, and they can be +interacted with `&`/`*`: `fe(id)&fe(year)` for interacted fixed effects, `fe(id)&x` for +group-specific slopes on a continuous variable `x`. + +When building a formula programmatically, `fe` also accepts a `Symbol`: `fe(:id)`. +""" fe(x::Term) = fe(Symbol(x)) fe(s::Symbol) = FixedEffectTerm(s) diff --git a/test/collinearity.jl b/test/collinearity.jl index c0a9003..e5c889d 100644 --- a/test/collinearity.jl +++ b/test/collinearity.jl @@ -1,4 +1,5 @@ -using DataFrames, CSV, FixedEffectModels, Random, Statistics, Test +using DataFrames, CSV, FixedEffectModels, Random, Statistics, Test, LinearAlgebra +using FixedEffectModels: reinsert_omitted! @testset "collinearity_with_fixedeffects" begin @@ -44,3 +45,47 @@ using DataFrames, CSV, FixedEffectModels, Random, Statistics, Test rr = reg(df, @formula(x1 ~ x2)) @test all(!isnan, stderror(rr)) end + +@testset "reinsert omitted" begin + coef = [10.0, 20.0] + vcov = Symmetric([1.0 0.2; 0.2 4.0]) + basis_coef = BitVector([true, false, true]) + perm = [2, 1, 3] + + newcoef, newvcov = reinsert_omitted!(coef, vcov, basis_coef, perm) + newvcov_matrix = Matrix(newvcov) + + @test newcoef == [0.0, 10.0, 20.0] + @test isnan(newvcov_matrix[1, 1]) + @test isnan(newvcov_matrix[1, 2]) + @test isnan(newvcov_matrix[1, 3]) + @test isnan(newvcov_matrix[2, 1]) + @test newvcov_matrix[2, 2] == 1.0 + @test newvcov_matrix[2, 3] == 0.2 + @test isnan(newvcov_matrix[3, 1]) + @test newvcov_matrix[3, 2] == 0.2 + @test newvcov_matrix[3, 3] == 4.0 +end + +@testset "small-scale independent regressor is kept" begin + # The rank test is scale-invariant: a genuinely independent regressor whose total + # sum of squares is below sqrt(eps) used to be dropped as collinear (NaN stderror). + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.tiny = df.Price .* 3e-8 + m = reg(df, @formula(Sales ~ tiny)) + mP = reg(df, @formula(Sales ~ Price)) + @test !isnan(stderror(m)[2]) + # scaling a regressor by c scales its coefficient by 1/c and leaves the t-stat invariant + @test coef(m)[2] * 3e-8 ≈ coef(mP)[2] rtol = 1e-4 + @test coef(m)[2] / stderror(m)[2] ≈ coef(mP)[2] / stderror(mP)[2] rtol = 1e-4 +end + +@testset "collinear regressor reported as zero coef and NaN stderror" begin + df = DataFrame(y = [1.0, 3.0, 2.0, 5.0, 4.0, 6.0], x = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + df.x2 = 2 .* df.x # perfectly collinear with x + m = reg(df, @formula(y ~ x + x2)) + # coefnames are [(Intercept), x, x2]; the later collinear column x2 is dropped + @test coef(m)[3] == 0 + @test isnan(stderror(m)[3]) + @test !isnan(stderror(m)[2]) +end diff --git a/test/fit.jl b/test/fit.jl index d58f7a7..fe88a30 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -777,4 +777,37 @@ end df_ivinf = DataFrame(y = [1.0, 2.0, 3.0], x = [0.0, 1.0, 2.0], z = [1.0, Inf, 3.0]) @test_throws ArgumentError reg(df_ivinf, @formula(y ~ (x ~ z))) + + # malformed formula: a `~` on the left-hand side gives a clear error, not a MethodError + @test_throws ArgumentError reg(df, @formula((y ~ z) ~ x)) +end + +@testset "residual degrees of freedom and CI labels" begin + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + + # The intercept is counted exactly once: residual dof is N - K, not N - K - 1. + @test dof_residual(reg(df, @formula(Sales ~ Price))) == 1378 + @test dof_residual(reg(df, @formula(Sales ~ 0 + Price))) == 1379 + @test dof_residual(reg(df, @formula(Sales ~ CPI + (Price ~ Pimin)))) == 1377 + # A fixed effect absorbs the constant once (dof = N - slopes - #levels). + @test dof_residual(reg(df, @formula(Sales ~ Price + fe(State)))) == 1380 - 1 - length(unique(df.State)) + # Cluster-robust dof is G - 1 (clusters minus one), not G - 2. + @test dof_residual(reg(df, @formula(Sales ~ Price), Vcov.cluster(:State))) == length(unique(df.State)) - 1 + + # coeftable labels the confidence-interval columns with the requested level. + m = reg(df, @formula(Sales ~ Price)) + @test coeftable(m).colnms[end-1:end] == ["Lower 95%", "Upper 95%"] + @test coeftable(m; level = 0.90).colnms[end-1:end] == ["Lower 90%", "Upper 90%"] + @test coeftable(m; level = 0.99).colnms[end-1:end] == ["Lower 99%", "Upper 99%"] +end + +@testset "continuous-slope FE degrees of freedom" begin + # A group whose continuous interaction is identically zero contributes no column + # to the design and must not be counted in dof_fes. + df = DataFrame(id = repeat(1:3, inner = 4), + z = Float64[3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8], + y = Float64[5, 2, 7, 3, 1, 8, 4, 9, 6, 2, 5, 3]) + df.x = Float64.(df.id .!= 1) # interaction identically zero for id == 1 + m = reg(df, @formula(y ~ z + fe(id)&x)) + @test FixedEffectModels.dof_fes(m) == 2 end diff --git a/test/partial_out.jl b/test/partial_out.jl index c1903e2..77b85cb 100644 --- a/test/partial_out.jl +++ b/test/partial_out.jl @@ -38,5 +38,8 @@ end @test_throws ArgumentError partial_out(df, @formula(y ~ (x ~ z))) @test_throws ArgumentError partial_out(df, @formula(y ~ x), weights = :w) @test_throws ArgumentError partial_out(DataFrame(y = [missing, missing], x = [1.0, 2.0]), @formula(y ~ x)) + + # method = :gpu is deprecated in favor of :CUDA / :Metal + @test_logs (:info, r"method = :gpu is deprecated") partial_out(df, @formula(y ~ x), method = :gpu) end