```r
# clear workspace
rm(list = ls())
gc()

# libraries
library(tidyverse) # general data manipulation
library(broom)     # model results to data.frame for compact viewing
library(optimx)    # optim + extras
library(sandwich)  # adjust standard errors
library(lmtest)    # recalculate model SEs with sandwich vcovs via coeftest()
# library(knitr)   # used to display the table

# set random seed for reproducibility
set.seed(101)
```
This is a sequel to my last post. Turns out I had a ‘think-o’: “if not independent, then we could use the fully dense [variance-covariance matrix].” This was in the context of robust standard errors, which use `diag(resid^2)` as the error variance-covariance matrix, \(\Omega\), within the ‘meat’ of the ‘sandwich.’ Let’s say I was …uhh… thinking of how clustered standard errors allow \(\Omega\) to have a block-diagonal structure, where each block, a conditional variance-covariance sub-matrix \(\Omega_g\), is fully dense … like me!

In this post, I will demonstrate how to do clustering and how to use weights when manually calculating SEs. For this post, I consulted Luke Sonnet’s slides, Rios-Avila’s slides, Cameron’s (third set of) slides, MacKinnon’s 2023 paper, as well as Wooldridge’s 2023 paper. I also used Claude.ai to debug some errors.
## Set up

### Create Data

I am going to create data where clustering should matter.
```r
# obs - high so that coefficients are well estimated
n <- 10000

# clusters
num.clusters <- 50
grps <- sort(floor(runif(n) * num.clusters) + 1)

# cluster effects
cluster.y.effect     <- rnorm(num.clusters, 0, 2)
cluster.u.effect     <- rnorm(num.clusters, 0, 2)
cluster.x1.effect    <- rnorm(num.clusters, 0, 0.2)
cluster.x2.effect    <- rnorm(num.clusters, 0, 0.2)
cluster.x1.x2.effect <- rnorm(num.clusters, 0, 0.2)

# two x vars (will add a constant)
x1 <- rnorm(n, 1, 1 + cluster.x1.effect/3)
x2 <- rnorm(n, 1, 2)

# clustered errors
sd <- runif(n, min = 0.5, max = 4)
s.true <- sd * (x1/5 + 1) # need to ensure that all s's are positive
error <- rnorm(n, 0, s.true) +
  cluster.u.effect[grps] +
  x1 * cluster.x1.effect[grps] + x2 * cluster.x2.effect[grps] +
  x1 * x2 * cluster.x1.x2.effect[grps]
error <- error - mean(error)

# true coefficients
coef.true <- c(1, -4, 2)

# generate the dependent variable
y1 <- coef.true[1] + (coef.true[2] * x1) + (coef.true[3] * x2) + error
```
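Since the errors get a common cluster-level shock, a quick sanity check (a sketch I am adding here, not part of the original recipe) is the share of error variance that sits at the cluster level, an ICC-style number. Anything visibly above zero means the independence assumption behind plain robust SEs fails, and clustering should matter.

```r
# rough ICC-style diagnostic: variance of cluster-mean errors over total
# error variance; a visibly non-zero share means clustering matters
cluster.means <- tapply(error, grps, mean)
var(cluster.means) / var(error)
```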
## Canned SEs

Here, I’ll quickly show how to get non-robust, robust, and clustered SEs. Note, this approach uses `lmtest` and `sandwich`. If I had a lot of FEs, then I would likely use `feols()` from the `fixest` package instead.
```r
# model, non-robust
m1.nonrobust <- lm(y1 ~ x1 + x2)

# robust standard errors
m1.robust <- coeftest(m1.nonrobust, vcov = vcovHC)

# clustered standard errors
m1.clu <- coeftest(m1.nonrobust, vcov = vcovCL, cluster = ~grps)

# function to extract standard errors from lm or coeftest objects
get_se <- function(model) {
  if (class(model)[1] == "lm") {
    return(sqrt(diag(vcov(model))))
  } else {
    return(model[, "Std. Error"])
  }
}

# create a data frame with all standard errors
se_comparison <- data.frame(
  Coef       = m1.nonrobust$coefficients,
  Non_Robust = get_se(m1.nonrobust),
  Robust     = get_se(m1.robust),
  Clustered  = get_se(m1.clu)
)

# round all numeric columns to 4 decimal places
se_comparison <- se_comparison %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# display the table
se_comparison
```
```
               Coef Non_Robust Robust Clustered
(Intercept)  0.9445     0.0541 0.0482    0.2640
x1          -3.9659     0.0361 0.0371    0.0524
x2           2.0216     0.0181 0.0189    0.0456
```
## Estimating via Optimization

As before, my goal is to write the functions as generally as possible. I do this by first writing a residual function, then an SSR function that calls the residual function.
```r
# we are going to code up our sum of squared residuals (SSR) function
# first, write the residual function (y1, x1, x2 come from the global env)
f.resid <- function(coef.arg){
  b0 <- coef.arg[1]
  b1 <- coef.arg[2]
  b2 <- coef.arg[3]
  u.i <- (y1 - (b0 + b1*x1 + b2*x2))
  return(u.i)
}

# second, write the SSR function
# note that we divide by 2! if we do not, the coefs are still right but the
# SEs are wrong: halving the SSR makes the hessian X'X rather than 2*X'X
f.ssr <- function(coef.arg){
  u2  <- (f.resid(coef.arg))^2
  ssr <- sum(u2)/2
  if (is.na(ssr) | is.nan(ssr)) return(Inf)
  return(ssr)
}

# initial coef values
coef.init <- c(1, 1, 1)

# run optimx
# note: (1) use BFGS since no constraints, (2) request the hessian
out.reg <- optimx(coef.init, f.ssr,
                  method  = c("BFGS"),
                  control = list(maxit = 500000, trace = 0),
                  hessian = TRUE)

# final coefs
coef.fin <- as.numeric(out.reg[1, 1:3])
coef.fin
```
```
[1]  0.9444571 -3.9659487  2.0215917
```
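Up to optimizer tolerance, these should reproduce the canned OLS fit:

```r
# compare with the canned lm coefficients from above
coef(m1.nonrobust)
```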
## Manual Non-robust SEs

As before, the non-robust SEs are the square root of the diagonal of the inverse hessian, scaled by the degrees-of-freedom-adjusted residual SD.
```r
# calculate residuals at the final estimate
resid <- f.resid(coef.fin)

# get the DoF-adjusted SD of resid(b.star)
s2.hat <- sd(resid) * sqrt((n - 1)/(n - 3))

# get the hessian from details
hessian <- attributes(out.reg)$details["BFGS", "nhatend"][[1]]

# calculate the non-robust SEs
se.non.robust <- sqrt(diag(base::solve(hessian))) * s2.hat
```
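Since both routes implement \(s \cdot \sqrt{\mathsf{diag}((X^\mathsf{T}X)^{-1})}\), a quick check (one I am adding here) against the canned values should agree to several decimals:

```r
# manual vs canned non-robust SEs
round(cbind(manual = se.non.robust, canned = get_se(m1.nonrobust)), 4)
```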
## Manual Robust SEs

Note, for the robust SEs we need a more involved estimator, called the sandwich estimator. First, we calculate the jacobian, \(\nabla g\) (the vector-valued gradient), of the residual function, \(u\), which is \(n \times k\). The ‘meat’ of the sandwich is built from the outer product of the score with itself, under the assumption of heteroskedastic but independent draws. So, second, we calculate \(m = \nabla g^\mathsf{T} \,\mathsf{diag}(u^2)\, \nabla g\). The ‘bread’ is the inverse of the covariance of the jacobian. So, third, we calculate \(b = (\nabla g^\mathsf{T} \nabla g)^{-1}\). Finally, we make a sandwich: \(\mathsf{vcov}_{\mathsf{robust}}(\hat{\beta}) = b\, m\, b\). And that’s it.
```r
# manual robust SEs need the jacobian (vector-valued gradient) wrt the parameters
# we will do this numerically (although it is analytic for OLS)
gradient.of.residual <- numDeriv::jacobian(func = f.resid, x = coef.fin)

# the robust meat assumes independent but not identically distributed draws
# it is the cross product of the score, g'u, which is the FOC
meat.robust <- t(gradient.of.residual) %*% diag(resid^2) %*% gradient.of.residual

# E[g'g] is the covariance of g
bread.robust <- solve(t(gradient.of.residual) %*% gradient.of.residual)

# calculate the sandwich
sandwich.robust <- bread.robust %*% meat.robust %*% bread.robust

# calculate robust standard errors
se.robust <- sqrt(diag(sandwich.robust))
# print(round(se.robust, 4))
```
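One thing worth flagging: the manual meat uses \(\mathsf{diag}(u^2)\) with no finite-sample correction, which corresponds to the ‘HC0’ flavor in `sandwich`, while `vcovHC` defaults to ‘HC3’. So a like-for-like check (added here) should request HC0 explicitly:

```r
# manual vs canned HC0 robust SEs: these should agree closely
round(cbind(manual = se.robust,
            canned = sqrt(diag(vcovHC(m1.nonrobust, type = "HC0")))), 4)
```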
## Manual Cluster Robust SEs

The cluster-robust SEs use the same sandwich principle as the robust ones; however, we allow the variance-covariance matrix of the errors, \(\Omega\), to have a block-diagonal structure such that each cluster-group has a dense conditional variance-covariance matrix \(\Omega_{c}\). Calculating the block-diagonal matrix could be a little complicated, but it turns out that, assuming \(\Omega\) has the block-diagonal structure, \(\nabla g^{\mathsf{T}}\Omega \nabla g\) is \(k\times k\) and can be calculated as \(\sum_{c\in C} \nabla g_{c}^{\mathsf{T}}\Omega_{c} \nabla g_{c}\) (each summand of which is also \(k\times k\)).
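Before the implementation, here is a small sketch (mine, not from the cited slides) that verifies the identity: build the dense block-diagonal \(\Omega\) for just the first three clusters, since the full \(n \times n\) version would be enormous, and compare the dense product against the cluster-by-cluster sum.

```r
# verify: t(g) %*% Omega %*% g equals the sum over clusters of gc' Omega_c gc,
# where Omega_c = u_c u_c' (checked on the first 3 clusters to keep Omega small)
sub   <- grps <= 3
g.sub <- gradient.of.residual[sub, ]
u.sub <- resid[sub]

# dense block-diagonal route
Omega.sub <- matrix(0, sum(sub), sum(sub))
for (cl in 1:3) {
  idx <- which(grps[sub] == cl)
  Omega.sub[idx, idx] <- u.sub[idx] %o% u.sub[idx]
}
dense.meat <- t(g.sub) %*% Omega.sub %*% g.sub

# cluster-by-cluster sum of k-by-k pieces
sum.meat <- matrix(0, ncol(g.sub), ncol(g.sub))
for (cl in 1:3) {
  idx <- grps[sub] == cl
  sum.meat <- sum.meat +
    (t(g.sub[idx, ]) %*% u.sub[idx]) %*% (t(u.sub[idx]) %*% g.sub[idx, ])
}
all.equal(dense.meat, sum.meat) # TRUE: both routes give the same k-by-k meat
```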
```r
# cluster-robust estimate of V[\hat{\beta}]
# for robust, we just used diag; for cluster, theory says use block diagonal
# however, that is a big matrix (though we never need to invert it)
# rather, we can sum the cluster-group-specific score matrices
# the bread is the same:
# bread.robust <- solve(t(gradient.of.residual) %*% gradient.of.residual)
# there is also an adjustment factor since we are summing over clusters
meat.cl <- matrix(0, nrow = ncol(gradient.of.residual), ncol = ncol(gradient.of.residual))
for (g in 1:num.clusters) {
  meat.cl <- meat.cl +
    (t(gradient.of.residual[grps == g, ]) %*% resid[grps == g]) %*%
    (t(resid[grps == g]) %*% gradient.of.residual[grps == g, ])
}

adj.factor   <- (num.clusters/(num.clusters - 1)) * ((n - 1)/(n - 3))
sandwich.clu <- adj.factor * (bread.robust %*% (meat.cl %*% bread.robust))
se.clustered <- sqrt(diag(sandwich.clu))
# print(round(se.clustered, 4))
```
```r
# add the manual SEs to the comparison data frame
se_comparison$Non_Robust.2 <- se.non.robust
se_comparison$Robust.2     <- se.robust
se_comparison$Clustered.2  <- se.clustered

# round all numeric columns to 4 decimal places
se_comparison <- se_comparison %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# display the table
se_comparison
```
```
               Coef Non_Robust Robust Clustered Non_Robust.2 Robust.2 Clustered.2
(Intercept)  0.9445     0.0541 0.0482    0.2640       0.0541   0.0482      0.2640
x1          -3.9659     0.0361 0.0371    0.0524       0.0361   0.0371      0.0524
x2           2.0216     0.0181 0.0189    0.0456       0.0181   0.0189      0.0456
```
## Weighted OLS

Here is how to add weights to the estimation. First, let’s look at our target.
```r
# arbitrary weights for now to highlight the difference
w <- ((1:n/n) - 0.5)^2 + 0.001

# model
m1.nonrobust.w <- lm(y1 ~ x1 + x2, weights = w)

# robust and clustered standard errors
m1.robust.w <- coeftest(m1.nonrobust.w, vcov = vcovHC)
m1.clu.w    <- coeftest(m1.nonrobust.w, vcov = vcovCL, cluster = ~grps, type = "HC1")

# create a data frame with all standard errors
se.comparison.w <- data.frame(
  Coef       = m1.nonrobust.w$coefficients,
  Non_Robust = get_se(m1.nonrobust.w),
  Robust     = get_se(m1.robust.w),
  Clustered  = get_se(m1.clu.w)
)

# round all numeric columns to 4 decimal places
se.comparison.w <- se.comparison.w %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# display the table
se.comparison.w
```
```
               Coef Non_Robust Robust Clustered
(Intercept)  1.2316     0.0547 0.0653    0.3740
x1          -3.8870     0.0362 0.0507    0.0642
x2           2.0300     0.0184 0.0251    0.0586
```
## Manual WLS estimates

Adding weights to our estimation set-up is trivial for the coefficients: just multiply the squared residuals by the weights inside the SSR function.
```r
# the weighted SSR function: weight each squared residual
f.ssr.w <- function(coef.arg){
  u2  <- w * ((f.resid(coef.arg))^2)
  ssr <- sum(u2)/2
  if (is.na(ssr) | is.nan(ssr)) return(Inf)
  return(ssr)
}

# initial coef values
coef.init <- c(1, 1, 1)

# run optimx
# note: (1) use BFGS since no constraints, (2) request the hessian
out.reg.w <- optimx(coef.init, f.ssr.w,
                    method  = c("BFGS"),
                    control = list(maxit = 500000, trace = 0),
                    hessian = TRUE)

# final coefs
coef.fin.w <- as.numeric(out.reg.w[1, 1:3])
coef.fin.w
```
```
[1]  1.231613 -3.887036  2.029959
```
## Non-Robust SEs of WLS

Non-robust weighted SEs only require swapping in the weighted hessian and adjusting the residual scaling factor for the weights.
```r
# the residual function does not change but the estimates do!
resid.w <- f.resid(coef.fin.w)

# DoF-adjusted residual SD, now using the weighted residual variance
s2.hat.w <- sqrt(sum(w * resid.w^2)/(n - 3))

# get the hessian of the weighted SSR from details (this is X'WX)
hessian.w <- attributes(out.reg.w)$details["BFGS", "nhatend"][[1]]

# calculate the non-robust SEs
se.non.robust.w <- sqrt(diag(base::solve(hessian.w))) * s2.hat.w
# print(round(se.non.robust.w, 4))
```
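With the weighted hessian, \(X^\mathsf{T}WX\), and the weighted residual variance, this should line up with the canned weighted `lm` SEs; comparing against them (a check added here) is a good way to catch scaling mistakes:

```r
# manual vs canned non-robust WLS SEs
round(cbind(manual = se.non.robust.w, canned = get_se(m1.nonrobust.w)), 4)
```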
## Robust SEs of WLS

Robust weighted SEs require a weighting matrix to be added to the sandwich formula.
```r
# we can use the same gradient function, but with the new (WLS) coefs!
gradient.of.residual.w <- numDeriv::jacobian(func = f.resid, x = coef.fin.w)

# weight matrix
W <- diag(w)

# bread and meat, now with the weight matrix inside
bread.w <- solve(t(gradient.of.residual.w) %*% W %*% gradient.of.residual.w)
meat.w  <- t(gradient.of.residual.w) %*% W %*% diag(resid.w^2) %*% t(W) %*% gradient.of.residual.w
vcov.w  <- bread.w %*% meat.w %*% t(bread.w)

se.robust.w <- sqrt(diag(vcov.w))
# se.robust.w
```
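As with the unweighted case, the manual version carries no finite-sample correction, so the like-for-like comparison (added here) is against `sandwich`’s HC0 flavor of the weighted model:

```r
# manual vs canned HC0 robust SEs for WLS: these should agree closely
round(cbind(manual = se.robust.w,
            canned = sqrt(diag(vcovHC(m1.nonrobust.w, type = "HC0")))), 4)
```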
## Robust Clustered SEs of WLS

Cluster-robust weighted SEs likewise require the weighting matrix inside the sandwich formula.
```r
# we can reuse the same gradient, weight matrix, and bread:
# gradient.of.residual.w <- numDeriv::jacobian(func = f.resid, x = coef.fin.w)
# W <- diag(w)
# bread.w <- solve(t(gradient.of.residual.w) %*% W %*% gradient.of.residual.w)

# cluster-by-cluster meat, with the weights inside each score
meat.cl.w <- matrix(0, nrow = ncol(gradient.of.residual.w), ncol = ncol(gradient.of.residual.w))
for (g in 1:num.clusters) {
  meat.cl.w <- meat.cl.w +
    (t(gradient.of.residual.w[grps == g, ]) %*% diag(w[grps == g]) %*% resid.w[grps == g]) %*%
    (t(resid.w[grps == g]) %*% t(diag(w[grps == g])) %*% gradient.of.residual.w[grps == g, ])
}

adj.factor     <- (num.clusters/(num.clusters - 1)) * ((n - 1)/(n - 3))
sandwich.clu.w <- adj.factor * (bread.w %*% (meat.cl.w %*% bread.w))
se.clustered.w <- sqrt(diag(sandwich.clu.w))
# print(round(se.clustered.w, 4))
```
```r
# add the manual SEs to the comparison data frame
se.comparison.w$Non_Robust.2 <- se.non.robust.w
se.comparison.w$Robust.2     <- se.robust.w
se.comparison.w$Clustered.2  <- se.clustered.w

# round all numeric columns to 4 decimal places
se.comparison.w <- se.comparison.w %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# display the table
se.comparison.w
```
```
               Coef Non_Robust Robust Clustered Non_Robust.2 Robust.2 Clustered.2
(Intercept)  1.2316     0.0547 0.0653    0.3740       0.0547   0.0653      0.3740
x1          -3.8870     0.0362 0.0507    0.0642       0.0362   0.0506      0.0642
x2           2.0300     0.0184 0.0251    0.0586       0.0184   0.0251      0.0586
```