Data preparation
Load datasets
The data can be downloaded from here.
# data of treatment assignment by school
treatment_school <- read_dta("../data/Duflo_Hanna_Ryan/data/Incentives_FINAL/Raw-Data/Tests/treatschool.dta") %>%
mutate(treatment = 1)
# result of random check if a school is open
random_check <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/RandomCheck.dta")
# pre-treatment test results
pre_test <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/Pretest.dta")
# post-treatment test results
post_test <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/Posttest.dta")
Data processing
# Make school-level post treatment average test results
# - indicator for taking a written test
# - score for written test (NA for those who did not take tests)
post_test_school <- post_test %>%
group_by(schid) %>%
summarise_at(vars(post_writ, post_total_w), mean, na.rm = TRUE)
# For later use, get the mean of the average test results across schools
post_total_w_mean <- mean(post_test_school$post_total_w)
# School-level post treatment average test results, where
# scores of students who didn't take tests are replaced to 0
post_test_school_all <- post_test %>%
mutate(post_total_w = replace_na(post_total_w, 0)) %>%
group_by(schid) %>%
summarise(post_total_w_all = mean(post_total_w,na.rm = TRUE))
# Make school-level pre treatment average test results
pre_test_school <- pre_test %>%
group_by(schid) %>%
summarise_at(vars(pre_writ), mean, na.rm = TRUE)
# School-level share of opens
# (In Imbens-Rubin textbook, it appears that all records, including pre-treatment, are counted for this variable.
# For information of post-treatment opens, use filter by month and year)
school_open <- random_check %>%
mutate(open = ifelse(a1_1 == 1, 1, 0)) %>%
# mutate(
# post_med = ifelse(year >= 2005 | (month >= 4 & year >= 2004), 1, 0),
# post_end = ifelse(year >= 2005 | (month >= 11 & year >= 2004), 1, 0),
# ) %>%
# filter(post_med == 1) %>%
group_by(schid) %>%
summarise_at(vars(open), mean, na.rm = TRUE)
df <- post_test_school %>%
# merge treatment status information
left_join(treatment_school, by = c("schid" = "schid")) %>%
mutate(treatment = ifelse(is.na(treatment), 0, 1)) %>%
# merge pre-treatment test results data
left_join(pre_test_school, by = c("schid" = "schid")) %>%
# merge post-treatment test results data with NA replaced to 0
left_join(post_test_school_all, by = c("schid" = "schid")) %>%
# merge school open information
left_join(school_open, by = c("schid" = "schid")) %>%
# It appears that, the post-treatment test scores are divided by the average (ignoring students who didn't take exam)
# Not sure exactly why
mutate(
post_total_w = post_total_w / post_total_w_mean,
post_total_w_all = post_total_w_all / post_total_w_mean,
)
# Summary statistics for control and treatment group variables
control_treat_sum <- df %>%
group_by(treatment) %>%
summarise_at(
vars(pre_writ, open, post_writ, post_total_w, post_total_w_all),
list(mean, sd)
) %>%
pivot_longer(cols = -treatment, names_to = c(".value", "fn"), names_patter = "(.*)(_.*$)") %>%
mutate(
across(-treatment, ~ formatC(., digits = 2, format = "f")),
across(-treatment, ~ ifelse(fn == "_fn2", paste0("(", ., ")"), .)),
) %>%
select(-treatment, -fn) %>%
as.matrix() %>%
t()
# Summary statistics for all schools
all_school_sum <- df %>%
summarise_at(
vars(pre_writ, open, post_writ, post_total_w, post_total_w_all),
list(min, max)
) %>%
pivot_longer(cols = everything(), names_to = c(".value", "fn"), names_patter = "(.*)(_.*$)") %>%
mutate(across(-fn, ~ formatC(., digits = 2, format = "f"))) %>%
select(-fn) %>%
as.matrix() %>%
t()
Summary statistics (Replication of Table 6.1)
cbind(
c("pctprewritten", "open", "pctpostwritten", "written", "written_all"),
control_treat_sum, all_school_sum
) %>%
set_rownames(NULL) %>%
kable("html", booktabs = TRUE) %>%
add_header_above(
c(" ", rep(c("Average", "(S.D.)"), 2), "Min", "Max")
) %>%
add_header_above(
c(
"Variable",
"Control ($N_c$ = 54)" = 2,
"Treatment ($N_c$ = 53)" = 2,
" ", " "
)
) %>%
pack_rows(index = c(
"Pre-treatment" = 1,
"Post-treatment" = 4
)) %>%
kable_styling(position = "center")
|
Variable
|
Control (\(N_c\) = 54)
|
Treatment (\(N_c\) = 53)
|
|
|
|
|
Average
|
(S.D.)
|
Average
|
(S.D.)
|
Min
|
Max
|
|
Pre-treatment
|
|
pctprewritten
|
0.19
|
(0.19)
|
0.16
|
(0.17)
|
0.00
|
0.67
|
|
Post-treatment
|
|
open
|
0.58
|
(0.19)
|
0.80
|
(0.13)
|
0.00
|
1.00
|
|
pctpostwritten
|
0.47
|
(0.19)
|
0.52
|
(0.23)
|
0.05
|
0.92
|
|
written
|
0.92
|
(0.45)
|
1.09
|
(0.42)
|
0.07
|
2.20
|
|
written_all
|
0.45
|
(0.32)
|
0.59
|
(0.39)
|
0.04
|
1.43
|
Estimates of components of variance of estimator for the effect on open variable (Replication of Table 6.2)
df_control <- df %>% filter(treatment == 0)
df_treatment <- df %>% filter(treatment == 1)
# mean of outcome for control
Yobs_c <- mean(df_control$open)
# mean of outcome for treatment
Yobs_t <- mean(df_treatment$open)
# estimate for average treatment effect
tau_hat <- Yobs_t - Yobs_c
# estimated variance for control (obtain as SD)
s_c <- sqrt(sum((df_control$open - Yobs_c)^2) / (nrow(df_control) - 1))
# estimated variance for treatment (obtain as SD)
s_t <- sqrt(sum((df_treatment$open - Yobs_t)^2) / (nrow(df_treatment) - 1))
# estimated variance for all (obtain as SD)
s <- sqrt(
(s_c^2 * (nrow(df_control) - 1) + s_t^2 * (nrow(df_treatment) - 1)) /
(nrow(df) - 2)
)
# Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
# Sampling variance estimates under constant treatment effects (obtain as SD)
sd_const <- sqrt(s^2 * (1 / nrow(df_control) + 1 / nrow(df_treatment)))
# Sampling variance estimates when potential outcomes are perfectly correlated (obtain as SD)
sd_rho1 <- sqrt(
s_c^2 * (nrow(df_treatment) / (nrow(df) * nrow(df_control))) +
s_t^2 * (nrow(df_control) / (nrow(df) * nrow(df_treatment))) +
s_c * s_t * 2 / nrow(df)
)
bind_cols(
name = c(
"$\\bar{Y}_c^{obs}$", "$\\bar{Y}_t^{obs}$", "$\\hat{\\tau}$",
"$s_c^2$", "$s_t^2$", "$s^2$",
"$\\hat{V}^{\\text{neyman}} = s^2 / N_c + s^2 / N_t$",
"$\\hat{V}^{\\text{const}} = s^2 \\cdot \\left( 1 / N_c + 1 / N_t \\right)$",
"$\\hat{V}^{\\rho_{tc} = 1} = s_c^2 \\cdot \\left( N_t / (N \\cdot N_c ) \\right) + s_t^2 \\cdot \\left( N_c / (N \\cdot N_t) \\right) + s_c \\cdot s_t \\cdot \\left( 2 / N \\right)$"
),
value = c(
Yobs_c, Yobs_t, tau_hat,
s_c, s_t, s,
sd_neyman, sd_const, sd_rho1
)
) %>%
mutate(value = formatC(value, digits = 2, format = "f")) %>%
mutate(
value = ifelse(row_number() >= 4, paste0("$", value, "^2$"), paste0("$", value, "$"))
) %>%
set_colnames(NULL) %>%
kable("html", booktabs = TRUE) %>%
pack_rows(index = c(
"Estimated means" = 3,
"Estimated variance compnents" = 3,
"Sampling variance estimates" = 3
)) %>%
kable_styling(position = "center")
|
Estimated means
|
|
\(\bar{Y}_c^{obs}\)
|
\(0.58\)
|
|
\(\bar{Y}_t^{obs}\)
|
\(0.80\)
|
|
\(\hat{\tau}\)
|
\(0.22\)
|
|
Estimated variance compnents
|
|
\(s_c^2\)
|
\(0.19^2\)
|
|
\(s_t^2\)
|
\(0.13^2\)
|
|
\(s^2\)
|
\(0.16^2\)
|
|
Sampling variance estimates
|
|
\(\hat{V}^{\text{neyman}} = s^2 / N_c + s^2 / N_t\)
|
\(0.03^2\)
|
|
\(\hat{V}^{\text{const}} = s^2 \cdot \left( 1 / N_c + 1 / N_t \right)\)
|
\(0.03^2\)
|
|
\(\hat{V}^{\rho_{tc} = 1} = s_c^2 \cdot \left( N_t / (N \cdot N_c ) \right) + s_t^2 \cdot \left( N_c / (N \cdot N_t) \right) + s_c \cdot s_t \cdot \left( 2 / N \right)\)
|
\(0.03^2\)
|
Estimates of, and confidence intervals for average treatment effects (Replication of Table 6.3)
make_table_6_3 <- function(df, outcome) {
df_control <- df %>% filter(treatment == 0)
df_treatment <- df %>% filter(treatment == 1)
# mean of outcome for control
Yobs_c <- mean(df_control %>% pull(outcome))
# mean of outcome for treatment
Yobs_t <- mean(df_treatment %>% pull(outcome))
# estimate for average treatment effect
tau_hat <- Yobs_t - Yobs_c
# estimated variance for control (obtain as SD)
s_c <- sqrt(sum((df_control %>% pull(outcome) - Yobs_c)^2) / (nrow(df_control) - 1))
# estimated variance for treatment (obtain as SD)
s_t <- sqrt(sum((df_treatment %>% pull(outcome) - Yobs_t)^2) / (nrow(df_treatment) - 1))
# Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
ci <- paste0(
"(",
(tau_hat - 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"), ", ",
(tau_hat + 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"),
")"
)
return(c(
tau_hat %>% formatC(digits = 2, format = "f"),
paste0("(", sd_neyman %>% formatC(digits = 2, format = "f"), ")"),
ci
))
}
cbind(
c("open", "pctpostwritten", "written", "written_all"),
map(
c("open", "post_writ", "post_total_w", "post_total_w_all"),
~ make_table_6_3(df, .)
) %>%
bind_cols() %>%
t()
) %>%
set_rownames(NULL) %>%
kable("html", booktabs = TRUE, align = c("l", "c", "c", "c")) %>%
add_header_above(
c("Variable", "$\\widehat{ATE}$", "$(\\widehat{\\text{s.e.}})$", "95\\% C.I.")
) %>%
kable_styling(position = "center")
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
|
Variable
|
\(\widehat{ATE}\)
|
\((\widehat{\text{s.e.}})\)
|
95% C.I.
|
|
open
|
0.22
|
(0.03)
|
(0.15, 0.28)
|
|
pctpostwritten
|
0.05
|
(0.04)
|
(-0.03, 0.13)
|
|
written
|
0.17
|
(0.08)
|
(0.00, 0.33)
|
|
written_all
|
0.14
|
(0.07)
|
(0.00, 0.27)
|
estimates of, and confidence intervals for average treatment effects by covariate (Replication of Table 6.4)
make_table_6_4 <- function(df, outcome, covar0) {
if (covar0 == TRUE) {
df_control <- df %>% filter(treatment == 0) %>% filter(pre_writ == 0)
df_treatment <- df %>% filter(treatment == 1) %>% filter(pre_writ == 0)
} else {
df_control <- df %>% filter(treatment == 0) %>% filter(pre_writ > 0)
df_treatment <- df %>% filter(treatment == 1) %>% filter(pre_writ > 0)
}
# Average treatment effects for samples with pre-writ == 0
# mean of outcome for control
Yobs_c <- mean(df_control %>% pull(outcome))
# mean of outcome for treatment
Yobs_t <- mean(df_treatment %>% pull(outcome))
# estimate for average treatment effect
tau_hat <- Yobs_t - Yobs_c
# estimated variance for control (obtain as SD)
s_c <- sqrt(sum((df_control %>% pull(outcome) - Yobs_c)^2) / (nrow(df_control) - 1))
# estimated variance for treatment (obtain as SD)
s_t <- sqrt(sum((df_treatment %>% pull(outcome) - Yobs_t)^2) / (nrow(df_treatment) - 1))
# Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
ci <- paste0(
"(",
(tau_hat - 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"), ", ",
(tau_hat + 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"),
")"
)
return(c(
tau_hat %>% formatC(digits = 2, format = "f"),
paste0("(", sd_neyman %>% formatC(digits = 2, format = "f"), ")"),
ci
))
}
make_table_6_4_diff <- function(df, outcome) {
reg_res <- lm(as.formula(paste(outcome, "treatment*(pre_writ == 0)", sep = " ~ ")), df) %>%
coeftest(., vcov = vcovHC(., type = "HC1"))
est <- reg_res["treatment:pre_writ == 0TRUE", "Estimate"]
se <- reg_res["treatment:pre_writ == 0TRUE", "Std. Error"]
ci <- paste0(
"(",
(est - 1.96 * se) %>% formatC(digits = 2, format = "f"), ", ",
(est + 1.96 * se) %>% formatC(digits = 2, format = "f"),
")"
)
return(c(
est %>% formatC(digits = 2, format = "f"),
paste0("(", se %>% formatC(digits = 2, format = "f"), ")"),
ci
))
}
cbind(
c("open", "pctpostwritten", "written", "written_all"),
map(
c("open", "post_writ", "post_total_w", "post_total_w_all"),
~ make_table_6_4(df, ., covar0 = TRUE)
) %>%
bind_cols() %>%
t(),
map(
c("open", "post_writ", "post_total_w", "post_total_w_all"),
~ make_table_6_4(df, ., covar0 = FALSE)
) %>%
bind_cols() %>%
t(),
map(
c("open", "post_writ", "post_total_w", "post_total_w_all"),
~ make_table_6_4_diff(df, .)
) %>%
bind_cols() %>%
t()
) %>%
set_rownames(NULL) %>%
kable("html", booktabs = TRUE, align = c("l", rep("c", 9))) %>%
add_header_above(
c(" ", rep(c("$\\tau$", "($\\widehat{\\text{s.e.}}$)", "95\\% C.I."), 2), "EST", "($\\widehat{\\text{s.e.}}$)", "95\\% C.I.")
) %>%
add_header_above(
c("", "($N$ = 40)" = 3, "($N$ = 67)" = 3, " " = 3)
) %>%
add_header_above(
c("Variable", "pctprewritten = 0" = 3, "pctprewritten $\\gt$ 0" = 3, "Difference" = 3)
) %>%
kable_styling(position = "center")
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
|
Variable
|
pctprewritten = 0
|
pctprewritten \(\gt\) 0
|
Difference
|
|
|
(\(N\) = 40)
|
(\(N\) = 67)
|
|
|
|
\(\tau\)
|
(\(\widehat{\text{s.e.}}\))
|
95% C.I.
|
\(\tau\)
|
(\(\widehat{\text{s.e.}}\))
|
95% C.I.
|
EST
|
(\(\widehat{\text{s.e.}}\))
|
95% C.I.
|
|
open
|
0.23
|
(0.05)
|
(0.14, 0.32)
|
0.21
|
(0.04)
|
(0.13, 0.29)
|
0.02
|
(0.06)
|
(-0.10, 0.14)
|
|
pctpostwritten
|
-0.04
|
(0.06)
|
(-0.16, 0.08)
|
0.11
|
(0.05)
|
(0.01, 0.21)
|
-0.15
|
(0.08)
|
(-0.31, 0.00)
|
|
written
|
0.20
|
(0.10)
|
(-0.00, 0.41)
|
0.18
|
(0.10)
|
(-0.02, 0.38)
|
0.03
|
(0.15)
|
(-0.26, 0.31)
|
|
written_all
|
0.04
|
(0.08)
|
(-0.10, 0.19)
|
0.22
|
(0.09)
|
(0.05, 0.39)
|
-0.18
|
(0.12)
|
(-0.40, 0.05)
|