Faster Two-Way Fixed Effects Estimator in R

In a current project, we need to run linear models on a large data frame (~70 million rows) with fixed effects for both place (e.g., grid-cell) and time (e.g., year) – a common specification for difference-in-differences models with multiple periods and groups. The base lm function in R choked on the task. The excellent lfe package by Gaure can estimate the models but requires long run times.

In an attempt to speed up our estimation routines, I built a new function that uses the data.table and RcppEigen packages to partial out the fixed effects and then run OLS on the de-meaned data. I include the code below in the hope that it might be useful to others facing a similar task. Note: this assumes that your panel is balanced.
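
The heavy lifting is just the standard two-way within transformation: on a balanced panel, demeaning each variable by unit and by time (and adding back the grand mean) gives the same slope coefficients as regressing on the full sets of dummies. Here is a minimal sketch of that equivalence on a toy panel (the toy object and its variable names are purely illustrative and are not used in the code below):

library(data.table)

# Tiny balanced panel: 3 units observed in 2 periods.
set.seed(1)
toy <- data.table(id = rep(1:3, each = 2), t = rep(1:2, times = 3))
toy[, x := rnorm(.N)]
toy[, y := 0.5 * x + rnorm(.N)]

# Two-way within transformation: v - mean(v | id) - mean(v | t) + mean(v)
toy[, x_til := x - ave(x, id) - ave(x, t) + mean(x)]
toy[, y_til := y - ave(y, id) - ave(y, t) + mean(y)]

# Both regressions return the same slope on x (exactly, because the panel is balanced):
coef(lm(y ~ x + factor(id) + factor(t), data = toy))["x"]
coef(lm(y_til ~ x_til - 1, data = toy))["x_til"]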

To start, we need to create some fake data for analysis:

library(data.table)

T = 6; N = 10^4
fake_dt <- data.table(
	A = sample(c(0,1), N * T, replace = TRUE),
	B1 = sample(c(0,1), N * T, replace = TRUE),
	B2 = rnorm(N * T),
	year = rep(1980:(1979 + T), N),
	cell = rep(1:100, each = T))  # recycled to fill the N * T rows
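
The demeaning below only removes both fixed effects exactly when the panel is balanced, so on real data I would first confirm that every cell-year combination appears the same number of times. A quick check, using the column names of the fake data above:

# Should return TRUE for a balanced panel:
fake_dt[, .N, by = .(cell, year)][, uniqueN(N) == 1]

With the fake panel in place, here is the estimator:
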
TwoWayFE <- function(y, Xs, fe1, fe2, dt, cluster = FALSE) {
	# y, Xs, fe1, fe2: column names (as character strings) in the data.table dt.
	# cluster = TRUE clusters the standard errors on the first fixed effect.
	pkgs <- c("data.table", "RcppEigen")
	sapply(pkgs, require, character.only = TRUE)

	keep_cols <- c(y, Xs, fe1, fe2)
	model_dt <- dt[, keep_cols, with = FALSE]; rm(keep_cols)
	model_dt <- na.omit(model_dt)

	num_Xs <- length(Xs)
	new_names <- c("y", paste0("x", 1:num_Xs), "fe1", "fe2")
	setnames(model_dt, 1:ncol(model_dt), new_names)

	# Sample Means:
	cols <- new_names[!grepl("fe", new_names)]
	model_dt[, paste0("mean_", cols) :=
		lapply(.SD, mean, na.rm = TRUE), .SDcols = cols]

	# Means by FE1:
	setkey(model_dt, fe1)
	model_dt[,
		paste0("mean_", cols, "_fe1") :=
			lapply(.SD, mean, na.rm = TRUE), .SDcols = cols, by = fe1]
	M <- length(unique(model_dt$fe1))

	# Means by FE2:
	setkey(model_dt, fe2)
	model_dt[,
		paste0("mean_", cols, "_fe2") :=
		lapply(.SD, mean, na.rm = TRUE), .SDcols = cols, by = fe2]
	Y <- length(unique(model_dt$fe2))

	# Demeaning:
	model_dt[, "y_tilde" := y - mean_y_fe2 - mean_y_fe1 + mean_y]

	# Demean each covariate: build the x1_tilde, x2_tilde, ... columns by
	# constructing and evaluating list(x1 - mean_x1_fe2 - mean_x1_fe1 + mean_x1, ...)
	g <- function(i) {paste0("x", i, "_tilde")}
	LHS <- sapply(1:num_Xs, g)

	f <- function(i) {
		paste0("x", i, " - mean_x", i, "_fe2 - mean_x", i, "_fe1 + mean_x", i)
	}
	RHS <- paste0("list(", paste(sapply(1:num_Xs, f), collapse = ", "), ")")

	model_dt[, (LHS) := eval(parse(text = RHS))]

	x_cols <- grep("x\\d+_tilde", names(model_dt), value = TRUE)
	model_dt <- model_dt[, c("y_tilde", x_cols, "fe1", "fe2"),
		with = FALSE]

	y <- model_dt$y_tilde
	X <- model_dt[, x_cols, with = FALSE]
	cluster_vec <- model_dt$fe1
	rm(model_dt)

	m <- RcppEigen::fastLm(X, y)
	names(m$coefficients) <- Xs

	##############################################
	# Residual degrees of freedom after absorbing both sets of fixed effects.
	# (No intercept in the model; fastLm's df.residual already nets out the Xs.)
	DoF <- m$df.residual - (M - 1) - (Y - 1) - 1

	# SEs:
	if (cluster) {
		# Cluster-robust (CR1-style sandwich) SEs, clustered on the first FE:
		N <- length(cluster_vec)
		K <- m$rank + (M - 1) + (Y - 1) + 1

		dfc <- (M / (M - 1)) * ((N - 1) / (N - K))  # small-sample correction
		est_fun <- residuals(m) * X                 # per-observation score contributions

		dt <- data.table(est_fun, fe1 = cluster_vec)

		setkey(dt, fe1)
		# Sum the score contributions within each cluster, then drop the fe1 column:
		dt <- dt[, lapply(.SD, sum), by = fe1][, 2:ncol(dt), with = FALSE]

		bread <- solve(crossprod(as.matrix(X))) * N
		meat <- crossprod(as.matrix(dt)) / N

		m$se <- as.numeric(sqrt(dfc * 1/N * diag(bread %*% meat %*% bread)))
		message("SEs Clustered on First FE")
	} else {
		# Rescale fastLm's SEs to reflect the corrected degrees of freedom:
		m$se <- m$se * sqrt(m$df.residual / DoF)
		message("SEs Not Clustered")
	}

	# Correcting degrees of freedom:
	m$df.residual <- DoF

	return(m)
}
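
For standard errors clustered on the first fixed effect, set cluster = TRUE. The point estimates are unchanged; the clustered SEs use the CR1-style correction coded above, so they may differ slightly from felm's default clustered SEs. A sketch of the comparison (the felm call clusters on cell via the fourth part of its formula):

library(lfe)

m_cl <- TwoWayFE(y = "A", Xs = c("B1", "B2"),
	fe1 = "cell", fe2 = "year",
	dt = fake_dt, cluster = TRUE)

m_felm_cl <- felm(A ~ B1 + B2 | factor(cell) + factor(year) | 0 | cell,
	data = fake_dt)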

I then check that the estimates are equivalent and benchmark this function against both felm (from the lfe package) and base lm.

# CHECKING EQUIVALENCE:
options(digits = 10)

pkgs <- c("RcppEigen", "data.table", "lfe", "testthat", "microbenchmark")
sapply(pkgs, require, character.only = TRUE)

t_dc <- system.time(
m <- TwoWayFE(y = "A", Xs = c("B1", "B2"),
		fe1 = "cell", fe2 = "year",
		dt = fake_dt,
		cluster = FALSE))

library(lfe)
t_felm <- system.time(
m_felm <- felm(A ~ B1 + B2 | factor(cell) + factor(year),
	data = fake_dt))

t_base <- system.time(
m_base <- lm(A ~ B1 + B2 + factor(cell) + factor(year),
	data = fake_dt))

# Comparing Results:
summary(m)$coefficients[1:2, 1:2]
#           Estimate     Std. Error
# B1 -0.002497459836 0.004086039606
# B2  0.001355440208 0.002022403917

summary(m_felm)$coefficients[1:2, 1:2]
#           Estimate     Std. Error
# B1 -0.002497459836 0.004086039606
# B2  0.001355440208 0.002022403917

summary(m_base)$coefficients[2:3, 1:2]
#           Estimate     Std. Error
# B1 -0.002497459836 0.004086039606
# B2  0.001355440208 0.002022403917

# Checking equivalence:
b1_dc <- summary(m)$coefficients[1,1]
b1_felm <- summary(m_felm)$coefficients[1,1]
expect_equivalent(b1_dc, b1_felm)

b1_lm <- summary(m_base)$coefficients[1,1]
expect_equivalent(b1_dc, b1_lm)
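
Given the output above, the standard errors line up as well, which can be checked the same way (row 1, column 2 of each coefficient table):

se_dc <- summary(m)$coefficients[1, 2]
se_felm <- summary(m_felm)$coefficients[1, 2]
expect_equivalent(se_dc, se_felm)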

Here’s the punchline:

[Benchmark figure: boxplot of log run times for each estimator, produced by the ggplot2 code at the end.]

# BENCHMARKING:

# Note: running 100 iterations of each estimator takes a while:
op <- microbenchmark(
	DC = TwoWayFE(y = "A", Xs = c("B1", "B2"), fe1 = "cell", fe2 = "year",
		dt = fake_dt, cluster = FALSE),
	FELM = felm(A ~ B1 + B2 | factor(cell) + factor(year), data = fake_dt),
	BASE = lm(A ~ B1 + B2 + factor(cell) + factor(year), data = fake_dt),
	times = 100L)

print(op)
# Unit: milliseconds
#  expr        min         mean      median        max neval
#    DC  44.697352  63.51898258  54.3942015 162.108703   100
#  FELM 502.789730 585.82423354 576.5496225 771.906130   100
#  BASE 696.929222 758.06220078 737.3307755 939.178530   100

ggplot(data = op, aes(x = expr, y = log(time))) +
	geom_boxplot() +
	xlab("Function") +
	ylab("Log(time)") +
	theme_bw()