Faster Two-Way Fixed Effects Estimator in R
21 Nov 2014
In a current project, we need to run linear models on a large data frame (~70 million rows) with fixed effects for both place (e.g., grid-cell) and time (e.g., year) – a common specification for difference-in-differences models with multiple periods and groups. The base `lm`

package in `R`

choked on the task. The excellent `lfe`

package by Gaure can estimate the models but requires long run times.

In attempt to speed up our estimation routines, I built a new function that uses the `data.table`

and `RcppEigen`

packages to partial out the fixed effects and then run OLS on the de-meaned data. I include the code below in hopes that it might be useful to others facing a similar task. Note: this assumes that your panel is balanced.

To start, we need to create some fake data for analysis:

T = 6 ; N = 10 ^ 4
fake_dt <- data.table (
A = sample ( c ( 0 , 1 ), N * T , replace = TRUE ),
B 1 = sample ( c ( 0 , 1 ), N * T , replace = TRUE ),
B 2 = rnorm ( N ),
year = rep ( 1980 : ( 1979 + T ), N ),
cell = rep ( 1 : 100 , each = T ))

TwoWayFE <- function ( y , Xs , fe1 , fe2 , dt , cluster = FALSE ) {
pkgs <- c ( "data.table" , "RcppEigen" )
sapply ( pkgs , require , character.only = TRUE )
keep_cols <- c ( y , Xs , fe1 , fe2 )
model_dt <- dt [, keep_cols , with = FALSE ]; rm ( keep_cols )
model_dt <- na.omit ( model_dt )
num_Xs <- length ( Xs )
new_names <- c ( "y" , paste0 ( "x" , 1 : num_Xs ), "fe1" , "fe2" )
setnames ( model_dt , 1 : ncol ( model_dt ), new_names )
# Sample Means:
cols <- new_names [ ! grepl ( "fe" , new_names )]
model_dt [, paste0 ( "mean_" , cols ) :=
lapply ( .SD , mean , na.rm = TRUE ), .SDcols = cols ]
# Means by FE1:
setkey ( model_dt , fe1 )
model_dt [,
paste0 ( "mean_" , cols , "_fe1" ) :=
lapply ( .SD , mean , na.rm = TRUE ), .SDcols = cols , by = fe1 ]
M <- length ( unique ( model_dt $ fe1 ))
# Means by FE2:
setkey ( model_dt , fe2 )
model_dt [,
paste0 ( "mean_" , cols , "_fe2" ) :=
lapply ( .SD , mean , na.rm = TRUE ), .SDcols = cols , by = fe2 ]
Y <- length ( unique ( model_dt $ fe2 ))
# Demeaning:
model_dt [, "y_tilde" := y - mean_y_fe2 - mean_y_fe1 + mean_y ]
g <- function ( i ) { paste0 ( "x" , i , "_tilde" )}
LHS <- sapply ( 1 : num_Xs , g )
f <- function ( i ) {
paste0 ( "x" , i , " - mean_x" , i , "_fe2 - mean_x" , i , "_fe1 + mean_x" , i )
}
RHS <- paste0 ( "list(" , paste ( sapply ( 1 : num_Xs , f ), collapse = ", " ), ")" )
model_dt [, eval ( LHS ) := eval ( parse ( text = RHS ))]
x_cols <- grep ( "x\\d{1}_tilde" , names ( model_dt ), value = TRUE )
model_dt <- model_dt [, c ( "y_tilde" , eval ( x_cols ), "fe1" , "fe2" ),
with = FALSE ]
y <- model_dt $ y_tilde
X <- model_dt [, x_cols , with = FALSE ]
cluster_vec <- model_dt $ fe1
rm ( model_dt )
m <- RcppEigen :: fastLm ( X , y )
names ( m $ coefficients ) <- Xs
##############################################
DoF <- m $ df.residual - ( M - 1 ) - ( Y - 1 ) - num_Xs + 1
# No intercept in model.
# SEs:
if ( cluster ){
N <- length ( cluster_vec )
K <- m $ rank + ( M - 1 ) + ( Y - 1 ) + 1
dfc <- ( M / ( M - 1 )) * (( N - 1 ) / ( N - K ))
est_fun <- residuals ( m ) * X
dt <- data.table ( est_fun , fe1 = cluster_vec )
setkey ( dt , fe1 )
dt <- dt [, lapply ( .SD , sum ), by = fe1 ][, 2 : ncol ( dt ), with = FALSE ]
bread <- solve ( crossprod ( as.matrix ( X ))) * N
meat <- crossprod ( as.matrix ( dt )) / N
m $ se <- as.numeric ( sqrt ( dfc * 1 / N * diag ( bread %*% meat %*% bread )))
message ( "SEs Clustered on First FE" )
} else {
m $ se <- m $ se * sqrt ( m $ df.residual / DoF )
message ( "SEs Not Clustered" )
}
# Correcting degrees of freedom:
m $ df.residual <- DoF
return ( m )
}

I then check equivalence of the estimates and benchmark this function against both the `felm`

and `lm`

functions.

# CHECKING EQUIVALENCE:
options ( digits = 10 )
pkgs <- c ( "RcppEigen" , "data.table" , "lfe" , "testthat" , "microbenchmark" )
sapply ( pkgs , require , character.only = TRUE )
t_dc <- system.time (
m <- TwoWayFE ( y = "A" , Xs = c ( "B1" , "B2" ),
fe1 = "cell" , fe2 = "year" ,
dt = fake_dt ,
cluster = FALSE ))
library ( lfe )
t_felm <- system.time (
m_felm <- felm ( A ~ B 1 + B 2 | factor ( cell ) + factor ( year ),
data = fake_dt ))
t_base <- system.time (
m_base <- lm ( A ~ B 1 + B 2 + factor ( cell ) + factor ( year ),
data = fake_dt ))
# Comparing Results:
summary ( m ) $ coefficients [ 1 : 2 , 1 : 2 ]
# Estimate Std. Error
# B1 -0.002497459836 0.004086039606
# B2 0.001355440208 0.002022403917
summary ( m_felm ) $ coefficients [ 1 : 2 , 1 : 2 ]
# Estimate Std. Error
# B1 -0.002497459836 0.004086039606
# B2 0.001355440208 0.002022403917
summary ( m_base ) $ coefficients [ 2 : 3 , 1 : 2 ]
# Estimate Std. Error
# B1 -0.002497459836 0.004086039606
# B2 0.001355440208 0.002022403917
# Checking equivalence:
b 1 _ dc <- summary ( m ) $ coefficients [ 1 , 1 ]
b 1 _ felm <- summary ( m_felm ) $ coefficients [ 1 , 1 ]
expect_equivalent ( b 1 _ dc , b 1 _ felm )
b 1 _ lm <- summary ( m_base ) $ coefficients [ 1 , 1 ]
expect_equivalent ( b 1 _ dc , b 1 _ lm )

Here’s the punchline:

# BENCHMARKING:
# More time-consuming:
op <- microbenchmark (
DC = TwoWayFE ( y = "A" , Xs = c ( "B1" , "B2" ), fe1 = "cell" , fe2 = "year" ,
dt = fake_dt , cluster = FALSE ),
FELM = felm ( A ~ B 1 + B 2 | factor ( cell ) + factor ( year ), data = fake_dt ),
BASE = lm ( A ~ B 1 + B 2 + factor ( cell ) + factor ( year ), data = fake_dt ),
times = 100L )
print ( op )
# Unit: milliseconds
# expr min mean median max neval
# DC 44.697352 63.51898258 54.3942015 162.108703 100
# FELM 502.789730 585.82423354 576.5496225 771.906130 100
# BASE 696.929222 758.06220078 737.3307755 939.178530 100
ggplot ( data = op , aes ( x = expr , y = log ( time ))) +
geom_boxplot () +
xlab ( "Function" ) +
ylab ( "Log(time)" ) +
theme_bw ()