# clear workspace
rm(list=ls())
gc()

# libraries
library(tidyverse) # general data manipulation
library(broom)     # model results to data.frame for compact viewing
library(optimx)    # optim + extras
library(sandwich)  # adjust standard errors
library(lmtest)    # recalculate model SEs with sandwich functions via coeftest()
library(rlang)     # helps with functions

# Set random seed for reproducibility
set.seed(101)

# obs - high so that coefficients are well estimated
n <- 10000

# clusters
num.clusters <- 50
grps <- sort(floor(runif(n)*num.clusters)+1)

# cluster effects
cluster.y.effect <- rnorm(num.clusters,0,2)
cluster.u.effect <- rnorm(num.clusters,0,2)
cluster.x1.effect <- rnorm(num.clusters,0,0.2)
cluster.x2.effect <- rnorm(num.clusters,0,0.2)
cluster.x1.x2.effect <- rnorm(num.clusters,0,0.2)

# two x vars (will add a constant)
x1 <- rnorm(n,1,1 + cluster.x1.effect/3)
x2 <- rnorm(n,1,2)

# clustered errors
sd <- runif(n, min=0.5, max=4)
s.true <- sd*(x1/5 + 1) # need to ensure that all s's are positive
error <- rnorm(n,0,s.true) +
  cluster.u.effect[grps] +
  x1*cluster.x1.effect[grps] + x2*cluster.x2.effect[grps] +
  x1*x2*cluster.x1.x2.effect[grps]
error <- error - mean(error)

# true coefficients
coef.true <- c(1,-4,2)

# Generate the dependent variable
y1 <- coef.true[1] + (coef.true[2]*x1) + (coef.true[3]*x2) + error
This is a side-adventure to the last two posts. I wanted to code up a bootstrap using NLS, but honestly I needed to practice doing it with just OLS first. We all learn about the bootstrap, but the Bayesian bootstrap from Rubin (1981) was probably unknown to most prior to Peter Hull’s 2022 Twitter thread.
I primarily learned this material in classes or saw it on Twitter, so I have no specific citations. However, I did end up using Claude.ai more than in the last two posts to debug errors and find more efficient solutions.
Traditional vs Bayesian Bootstrap Intuition
I don’t know as much about bootstrapping as I would like, so this will be heavy on intuition.
The traditional bootstrap uses the idea that we can account for sampling variation by replicating the sampling process with our own data: we resample it according to an assumed sampling process. To implement, (1) sample the data \(B\) times with replacement, (2) for each sample \(b\), calculate the main statistic of interest (e.g., \(\hat{\beta}_{b}\)), and (3) calculate the bootstrap SE via the SD of the bootstrapped statistics (e.g., \(\mathsf{sd}_{b}(\hat{\beta}_b)\)) or calculate CIs. For example, for robust SEs we just resample the observations with replacement, while for cluster-robust SEs we sample the clusters with replacement. This works out well enough with most models; however, it can get tricky when the model is non-linear or has fixed effects. Sometimes the estimation can fail, which is very annoying… especially if one did not code defensively.
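As a minimal sketch of steps (1)–(3) for the independent case (my illustration, not the full function I build below; it assumes the df data.frame created in the Traditional Bootstrap section):

# sketch: the three bootstrap steps for the independent case
B <- 200
betas <- replicate(B, {
  idx <- sample(nrow(df), replace = TRUE)    # (1) resample rows with replacement
  coef(lm(y1 ~ x1 + x2, data = df[idx, ]))   # (2) re-estimate the statistic
})
apply(betas, 1, sd)                          # (3) SD across draws = bootstrap SEs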
The Bayesian bootstrap essentially replicates the sampling process, but with random sample weights that mimic the resampling. To implement, (1) create \(B\) sets of random weights \(\omega_{b}\) drawn from the Dirichlet distribution (more on this later), (2) for each set of weights, estimate a weighted version of the main statistic of interest, and (3) calculate the bootstrap SE via the SD of the bootstrapped statistics or calculate CIs. This makes intuitive sense, but it is not actually as straightforward as one may initially think; I won’t get into any of that here. The main benefit is that weighting the data is typically much simpler and much less likely to cause estimation failure.
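A matching minimal sketch of the weighted version (again my illustration, not the function below; rexp() draws are Gamma(shape = 1, rate = 1), so normalizing them gives Dirichlet(1,…,1) weights):

# sketch: Bayesian bootstrap via Dirichlet(1,...,1) weights
B <- 200
betas <- replicate(B, {
  w <- rexp(nrow(df))                              # Gamma(1,1) draws
  w <- w / sum(w)                                  # normalize -> Dirichlet weights
  coef(lm(y1 ~ x1 + x2, data = df, weights = w))   # weighted re-estimate
})
apply(betas, 1, sd)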
Set up (same as Pt2)
I will simulate the data the same as in Pt2; the simulation code is the setup block at the top of this post.
Canned SEs
Here, I’ll quickly show how to get non-robust, robust, and clustered SEs. Note, this approach uses lmtest and sandwich. If I had a lot of FEs, then I would likely use feols from the fixest package instead (a sketch follows the results table below).
# Model, non robust
m1.nonrobust <- lm(y1 ~ x1 + x2)

# Robust standard errors
m1.robust <- coeftest(m1.nonrobust, vcov = vcovHC)

# clustered
m1.clu <- coeftest(m1.nonrobust, vcov = vcovCL, cluster = ~grps)

# Assuming m1.nonrobust, m1.robust, and m1.clu are already estimated
# Function to extract standard errors
get_se <- function(model) {
  if (class(model)[1] == "lm") {
    return(sqrt(diag(vcov(model))))
  } else {
    return(model[, "Std. Error"])
  }
}

# Create a data frame with all standard errors
se_comparison <- data.frame(
  Coef = m1.nonrobust$coefficients,
  Non_Robust = get_se(m1.nonrobust),
  Robust = get_se(m1.robust),
  Clustered = get_se(m1.clu)
)

# Round all numeric columns to 4 decimal places
se_comparison <- se_comparison %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# Display the table
se_comparison
Coef Non_Robust Robust Clustered
(Intercept) 0.9445 0.0541 0.0482 0.2640
x1 -3.9659 0.0361 0.0371 0.0524
x2 2.0216 0.0181 0.0189 0.0456
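And, for reference, a hedged sketch of the feols version mentioned above (not run here; it assumes library(fixest) and the df data.frame built in the next section):

# sketch: same SEs via fixest
# feols(y1 ~ x1 + x2, data = df, vcov = "hetero")   # heteroskedasticity-robust
# feols(y1 ~ x1 + x2, data = df, cluster = ~grps)   # cluster-robust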
Traditional Bootstrap
First, I am going to show a traditional bootstrap. I am going to create a bootstrap function that can do either full-sample sampling or cluster-sampling. Cluster-sampling is best if we believe the errors are dependent within clusters; if observations are independent, then full-sample sampling is best.
df <- data.frame(y1=y1,
                 x1=x1,
                 x2=x2,
                 grps = grps) %>%
  mutate(id = row_number())
rm(y1,x1,x2,grps)

##### Traditional Bootstrap function
f.traditional.bootstrap <- function(arg.data, arg.model.formula,
                                    arg.boots = 10, arg.cluster = NULL) {
  ############################################################
  # get cluster variable title
  arg.cluster_quo <- enquo(arg.cluster)

  # do this only once but only if clustered
  if (!(is.null(substitute(arg.cluster))==T)) {
    unique_clusters <- arg.data %>% distinct(!!arg.cluster_quo)
    n_clusters <- nrow(unique_clusters)
  }

  # Initialize results matrix
  results <- matrix(NA,
                    nrow = length(coef(lm(arg.model.formula, data = arg.data))),
                    ncol = arg.boots)

  # Perform bootstrap
  for (i in 1:arg.boots) {
    if (is.null(substitute(arg.cluster))==T) {
      # Full-Sample (Independent) Sampling
      # --> sample obs with replacement
      df.boot <- arg.data %>%
        ungroup() %>%
        slice_sample(n=nrow(.), replace = T)
    } else {
      # Cluster-sampling (Dependent)
      # --> Sample clusters with replacement
      sampled_clusters <- unique_clusters %>%
        slice_sample(n = n_clusters, replace = TRUE)
      # Join the sampled clusters back to the original data
      df.boot <- arg.data %>%
        right_join(sampled_clusters,
                   by = as.character(quo_name(arg.cluster_quo)),
                   relationship = "many-to-many")
    }
    # store OLS coefs for each iteration
    results[,i] <- as.numeric(lm(formula = arg.model.formula, data=df.boot)$coefficients)
  }

  # Calculate bootstrap standard errors via sd
  boot_se <- apply(results, 1, sd)
  #
  return(boot_se)
  ############################################################
}
#
se_comparison$trad.ind <- f.traditional.bootstrap(df, as.formula("y1 ~ x1 + x2"), arg.boots = 500)
se_comparison$trad.clu <- f.traditional.bootstrap(df, as.formula("y1 ~ x1 + x2"), arg.boots = 500, arg.cluster=grps)

# Round all numeric columns to 4 decimal places
se_comparison <- se_comparison %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# Display the table
se_comparison
Coef Non_Robust Robust Clustered trad.ind trad.clu
(Intercept) 0.9445 0.0541 0.0482 0.2640 0.0497 0.2598
x1 -3.9659 0.0361 0.0371 0.0524 0.0364 0.0509
x2 2.0216 0.0181 0.0189 0.0456 0.0190 0.0447
Bayesian Bootstrap
Random Weights via Dirichlet distribution
Next, I want to show how to do the Bayesian bootstrap. As discussed, the weights need to be drawn from a Dirichlet distribution. We can draw \(\vec{x}\sim \text{Dirichlet}(\vec{\alpha})\) by drawing \(y_{i} \sim \text{Gamma}(\text{shape}=\alpha_{i}, \text{rate}=1)\) for each \(i\) and then calculating \(x_{i} = y_{i} / \sum_{j\in N} y_{j}\). Wikipedia says that, for \(\vec{\alpha}=\vec{1}\), it can also be done using sorted uniform draws, which it calls ‘Stick Breaking.’ Both are below.
# Function to sample from Dirichlet distribution using base R
rdirichlet_base <- function(num.vars, alpha) {
  K <- length(alpha)
  x <- matrix(rgamma(num.vars * K, shape = alpha, rate = 1),
              nrow = num.vars, byrow = TRUE)
  x_sum <- rowSums(x)
  return(as.numeric((x / x_sum)*K))
}

# Function to sample from Dirichlet(1,...,1) using the stick-breaking method
rdirichlet_stick_breaking <- function(num.vars, num.draws) {
  #
  samples <- matrix(nrow = num.vars, ncol = num.draws)
  #
  for (i in 1:num.vars) {
    # Generate K-1 uniform random numbers
    u <- runif(num.draws-1)

    # Add 0 and 1, then sort
    breaks <- sort(c(0, u, 1))

    # Calculate differences
    samples[i,] <- diff(breaks)
  }
  #
  return(as.numeric(samples*num.draws))
}

# Draw the samples
sample.1 <- as.numeric(rdirichlet_base(num.vars = 1, alpha = rep(1,100)))
sample.2 <- as.numeric(rdirichlet_stick_breaking(num.vars = 1, num.draws=100))
sample.3 <- as.numeric(rdirichlet_base(num.vars = 1, alpha = sample(1:10,100,replace=T)))
plot(sort(sample.1))
lines(sort(sample.2), col="red")
lines(sort(sample.3), col="blue")
hist(sample.1)
hist(sample.3)
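A quick sanity check (my addition): because both functions rescale by the number of observations, each 100-entry draw should be strictly positive and sum to 100, i.e., have a mean weight of 1.

# check: weights are positive with mean 1
c(sum(sample.1), sum(sample.2), sum(sample.3))   # each should be 100
min(c(sample.1, sample.2, sample.3)) > 0         # should be TRUE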
Bayesian Bootstrap
##### Bayesian Bootstrap
f.bayesian.bootstrap <- function(arg.data, arg.model.formula, arg.boots = 10, arg.cluster = NULL) {
  ############################################################
  #
  arg.cluster_quo <- enquo(arg.cluster)

  # Initialize results matrix
  results <- matrix(NA,
                    nrow = length(coef(lm(arg.model.formula, data = arg.data))),
                    ncol = arg.boots)

  # Perform bootstrap
  for (i in 1:arg.boots) {
    if (is.null(substitute(arg.cluster))==T) {
      # For robust standard errors, each observation gets its own weight
      arg.data <- arg.data %>%
        ungroup() %>%
        mutate(wts = rgamma(n(), shape = 1, rate = 1),
               wts = wts / sum(wts))
    } else {
      # For cluster-robust, each cluster gets a weight
      arg.data <- arg.data %>%
        ungroup() %>%
        distinct(!!arg.cluster_quo) %>%
        mutate(uu = rgamma(n(), shape = 1, rate = 1),
               wts = uu / sum(uu)) %>%
        select(!!arg.cluster_quo, wts) %>%
        full_join(arg.data, by = as.character(quo_name(arg.cluster_quo)))
    }
    results[,i] <- as.numeric(lm(formula = arg.model.formula, data=arg.data, weights = wts)$coefficients)
    # drop the wts (only matters for cluster given approach)
    arg.data <- arg.data %>%
      select(-wts)
  }

  # Calculate bootstrap standard errors
  boot_se <- apply(results, 1, sd)
  return(boot_se)
}
#
se_comparison$bayes.ind <- f.bayesian.bootstrap(df, as.formula("y1 ~ x1 + x2"), arg.boots = 500)
se_comparison$bayes.clu <- f.bayesian.bootstrap(df, as.formula("y1 ~ x1 + x2"), arg.boots = 500, arg.cluster=grps)

# Round all numeric columns to 4 decimal places
se_comparison <- se_comparison %>%
  mutate(across(where(is.numeric), ~round(., 4)))

# Display the table
se_comparison
              Coef Non_Robust Robust Clustered trad.ind trad.clu bayes.ind bayes.clu
(Intercept)  0.9445     0.0541 0.0482    0.2640   0.0497   0.2598    0.0485    0.2627
x1          -3.9659     0.0361 0.0371    0.0524   0.0364   0.0509    0.0375    0.0498
x2           2.0216     0.0181 0.0189    0.0456   0.0190   0.0447    0.0195    0.0401