Data preparation

Load datasets

The data can be downloaded from here.

# data of treatment assignment by school
treatment_school <- read_dta("../data/Duflo_Hanna_Ryan/data/Incentives_FINAL/Raw-Data/Tests/treatschool.dta") %>% 
  mutate(treatment = 1)
# result of random check if a school is open
random_check <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/RandomCheck.dta")
# pre-treatment test results
pre_test <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/Pretest.dta")
# post-treatment test results
post_test <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/Posttest.dta")

Data processing

# Make school-level post treatment average test results
# - indicator for taking a written test
# - score for written test (NA for those who did not take tests)
post_test_school <- post_test %>% 
  group_by(schid) %>% 
  summarise_at(vars(post_writ, post_total_w), mean, na.rm = TRUE)
# For later use, get the mean of the average test results across schools
post_total_w_mean <- mean(post_test_school$post_total_w)

# School-level post treatment average test results, where
# scores of students who didn't take tests are replaced to 0
post_test_school_all <- post_test %>% 
  mutate(post_total_w = replace_na(post_total_w, 0)) %>% 
  group_by(schid) %>% 
  summarise(post_total_w_all = mean(post_total_w,na.rm = TRUE))

# Make school-level pre treatment average test results 
pre_test_school <- pre_test %>% 
  group_by(schid) %>% 
  summarise_at(vars(pre_writ), mean, na.rm = TRUE)

# School-level share of opens
# (In Imbens-Rubin textbook, it appears that all records, including pre-treatment, are counted for this variable.
# For information of post-treatment opens, use filter by month and year)
school_open <- random_check %>% 
  mutate(open = ifelse(a1_1 == 1, 1, 0)) %>% 
  # mutate(
  #   post_med = ifelse(year >= 2005 | (month >= 4 & year >= 2004), 1, 0),
  #   post_end = ifelse(year >= 2005 | (month >= 11 & year >= 2004), 1, 0),
  # ) %>% 
  # filter(post_med == 1) %>% 
  group_by(schid) %>% 
  summarise_at(vars(open), mean, na.rm = TRUE)

df <- post_test_school %>% 
  # merge treatment status information
  left_join(treatment_school, by = c("schid" = "schid")) %>% 
  mutate(treatment = ifelse(is.na(treatment), 0, 1)) %>% 
  # merge pre-treatment test results data
  left_join(pre_test_school, by = c("schid" = "schid")) %>% 
  # merge post-treatment test results data with NA replaced to 0
  left_join(post_test_school_all, by = c("schid" = "schid")) %>% 
  # merge school open information
  left_join(school_open, by = c("schid" = "schid")) %>% 
  # It appears that, the post-treatment test scores are divided by the average (ignoring students who didn't take exam)
  # Not sure exactly why
  mutate(
    post_total_w = post_total_w / post_total_w_mean,
    post_total_w_all = post_total_w_all / post_total_w_mean,
  )

# Summary statistics for control and treatment group variables
control_treat_sum <- df %>% 
  group_by(treatment) %>% 
  summarise_at(
    vars(pre_writ, open, post_writ, post_total_w, post_total_w_all),
    list(mean, sd)
  ) %>% 
  pivot_longer(cols = -treatment, names_to = c(".value", "fn"), names_patter = "(.*)(_.*$)") %>% 
  mutate(
    across(-treatment, ~ formatC(., digits = 2, format = "f")),
    across(-treatment, ~ ifelse(fn == "_fn2", paste0("(", ., ")"), .)),
    ) %>% 
  select(-treatment, -fn) %>% 
  as.matrix() %>% 
  t()
  
# Summary statistics for all schools
all_school_sum <- df %>% 
  summarise_at(
    vars(pre_writ, open, post_writ, post_total_w, post_total_w_all),
    list(min, max)
  ) %>% 
  pivot_longer(cols = everything(), names_to = c(".value", "fn"), names_patter = "(.*)(_.*$)") %>% 
  mutate(across(-fn, ~ formatC(., digits = 2, format = "f"))) %>% 
  select(-fn) %>% 
  as.matrix() %>% 
  t()

Summary statistics (Replication of Table 6.1)

cbind(
  c("pctprewritten", "open", "pctpostwritten", "written", "written_all"),
  control_treat_sum, all_school_sum
  ) %>% 
  set_rownames(NULL) %>% 
  kable("html", booktabs = TRUE) %>% 
  add_header_above(
    c(" ", rep(c("Average", "(S.D.)"), 2), "Min", "Max")
  ) %>% 
  add_header_above(
    c(
      "Variable",
      "Control ($N_c$ = 54)" = 2,
      "Treatment ($N_c$ = 53)" = 2,
      " ", " "
      )
  ) %>% 
  pack_rows(index = c(
    "Pre-treatment" = 1,
    "Post-treatment" = 4
  )) %>% 
  kable_styling(position = "center")

Variable	Control (\(N_c\) = 54)		Treatment (\(N_c\) = 53)
	Average	(S.D.)	Average	(S.D.)	Min	Max
Pre-treatment
pctprewritten	0.19	(0.19)	0.16	(0.17)	0.00	0.67
Post-treatment
open	0.58	(0.19)	0.80	(0.13)	0.00	1.00
pctpostwritten	0.47	(0.19)	0.52	(0.23)	0.05	0.92
written	0.92	(0.45)	1.09	(0.42)	0.07	2.20
written_all	0.45	(0.32)	0.59	(0.39)	0.04	1.43

Estimates of components of variance of estimator for the effect on `open` variable (Replication of Table 6.2)

df_control <- df %>% filter(treatment == 0)
df_treatment <- df %>% filter(treatment == 1)

# mean of outcome for control
Yobs_c <- mean(df_control$open)
# mean of outcome for treatment
Yobs_t <- mean(df_treatment$open)
# estimate for average treatment effect
tau_hat <- Yobs_t - Yobs_c

# estimated variance for control (obtain as SD)
s_c <- sqrt(sum((df_control$open - Yobs_c)^2) / (nrow(df_control) - 1))
# estimated variance for treatment (obtain as SD)
s_t <- sqrt(sum((df_treatment$open - Yobs_t)^2) / (nrow(df_treatment) - 1))
# estimated variance for all (obtain as SD)
s <- sqrt(
  (s_c^2 * (nrow(df_control) - 1) + s_t^2 * (nrow(df_treatment) - 1)) / 
    (nrow(df) - 2) 
  )

# Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
# Sampling variance estimates under constant treatment effects (obtain as SD)
sd_const <- sqrt(s^2 * (1 / nrow(df_control) + 1 / nrow(df_treatment)))
# Sampling variance estimates when potential outcomes are perfectly correlated (obtain as SD)
sd_rho1 <- sqrt(
  s_c^2 * (nrow(df_treatment) / (nrow(df) * nrow(df_control))) +
    s_t^2 * (nrow(df_control) / (nrow(df) * nrow(df_treatment))) +
    s_c * s_t * 2 / nrow(df)
  )

bind_cols(
  name = c(
    "$\\bar{Y}_c^{obs}$", "$\\bar{Y}_t^{obs}$", "$\\hat{\\tau}$",
    "$s_c^2$", "$s_t^2$", "$s^2$",
    "$\\hat{V}^{\\text{neyman}} = s^2 / N_c + s^2 / N_t$",
    "$\\hat{V}^{\\text{const}} = s^2 \\cdot \\left( 1 / N_c +  1 / N_t \\right)$",
    "$\\hat{V}^{\\rho_{tc} = 1} = s_c^2 \\cdot \\left( N_t / (N \\cdot N_c ) \\right) + s_t^2 \\cdot \\left( N_c / (N \\cdot N_t) \\right) + s_c \\cdot s_t \\cdot \\left( 2 / N \\right)$"
  ),
  value = c(
    Yobs_c, Yobs_t, tau_hat,
    s_c, s_t, s,
    sd_neyman, sd_const, sd_rho1
  )
) %>% 
  mutate(value = formatC(value, digits = 2, format = "f")) %>% 
  mutate(
    value = ifelse(row_number() >= 4, paste0("$", value, "^2$"), paste0("$", value, "$"))
  ) %>% 
  set_colnames(NULL) %>% 
  kable("html", booktabs = TRUE) %>% 
  pack_rows(index = c(
    "Estimated means" = 3,
    "Estimated variance compnents" = 3,
    "Sampling variance estimates" = 3
  )) %>% 
  kable_styling(position = "center")

Estimated means
\(\bar{Y}_c^{obs}\)	\(0.58\)
\(\bar{Y}_t^{obs}\)	\(0.80\)
\(\hat{\tau}\)	\(0.22\)
Estimated variance compnents
\(s_c^2\)	\(0.19^2\)
\(s_t^2\)	\(0.13^2\)
\(s^2\)	\(0.16^2\)
Sampling variance estimates
\(\hat{V}^{\text{neyman}} = s^2 / N_c + s^2 / N_t\)	\(0.03^2\)
\(\hat{V}^{\text{const}} = s^2 \cdot \left( 1 / N_c + 1 / N_t \right)\)	\(0.03^2\)
\(\hat{V}^{\rho_{tc} = 1} = s_c^2 \cdot \left( N_t / (N \cdot N_c ) \right) + s_t^2 \cdot \left( N_c / (N \cdot N_t) \right) + s_c \cdot s_t \cdot \left( 2 / N \right)\)	\(0.03^2\)

Estimates of, and confidence intervals for average treatment effects (Replication of Table 6.3)

make_table_6_3 <- function(df, outcome) {
  
  df_control <- df %>% filter(treatment == 0)
  df_treatment <- df %>% filter(treatment == 1)
  
  # mean of outcome for control
  Yobs_c <- mean(df_control %>% pull(outcome))
  # mean of outcome for treatment
  Yobs_t <- mean(df_treatment %>% pull(outcome))
  # estimate for average treatment effect
  tau_hat <- Yobs_t - Yobs_c
  
  # estimated variance for control (obtain as SD)
  s_c <- sqrt(sum((df_control %>% pull(outcome) - Yobs_c)^2) / (nrow(df_control) - 1))
  # estimated variance for treatment (obtain as SD)
  s_t <- sqrt(sum((df_treatment %>% pull(outcome) - Yobs_t)^2) / (nrow(df_treatment) - 1))
  # Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
  sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
  
  ci <- paste0(
    "(",
    (tau_hat - 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"), ", ",
    (tau_hat + 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"),
    ")"
  )
    
  return(c(
    tau_hat %>% formatC(digits = 2, format = "f"), 
    paste0("(", sd_neyman %>% formatC(digits = 2, format = "f"), ")"),
    ci
    ))
  
}

cbind(
  c("open", "pctpostwritten", "written", "written_all"),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_3(df, .)
  ) %>% 
    bind_cols() %>% 
    t()
  ) %>% 
  set_rownames(NULL) %>% 
  kable("html", booktabs = TRUE, align = c("l", "c", "c", "c")) %>% 
  add_header_above(
    c("Variable", "$\\widehat{ATE}$", "$(\\widehat{\\text{s.e.}})$", "95\\% C.I.")
  ) %>% 
  kable_styling(position = "center")

## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4

Variable	\(\widehat{ATE}\)	\((\widehat{\text{s.e.}})\)	95% C.I.
open	0.22	(0.03)	(0.15, 0.28)
pctpostwritten	0.05	(0.04)	(-0.03, 0.13)
written	0.17	(0.08)	(0.00, 0.33)
written_all	0.14	(0.07)	(0.00, 0.27)

estimates of, and confidence intervals for average treatment effects by covariate (Replication of Table 6.4)

make_table_6_4 <- function(df, outcome, covar0) {
  
  if (covar0 == TRUE) {
    df_control <- df %>% filter(treatment == 0) %>% filter(pre_writ == 0)
    df_treatment <- df %>% filter(treatment == 1) %>% filter(pre_writ == 0)
  } else {
    df_control <- df %>% filter(treatment == 0) %>% filter(pre_writ > 0)
    df_treatment <- df %>% filter(treatment == 1) %>% filter(pre_writ > 0)
  }
  
  # Average treatment effects for samples with pre-writ == 0
  # mean of outcome for control
  Yobs_c <- mean(df_control %>% pull(outcome))
  # mean of outcome for treatment
  Yobs_t <- mean(df_treatment %>% pull(outcome))
  # estimate for average treatment effect
  tau_hat <- Yobs_t - Yobs_c
  
  # estimated variance for control (obtain as SD)
  s_c <- sqrt(sum((df_control %>% pull(outcome) - Yobs_c)^2) / (nrow(df_control) - 1))
  # estimated variance for treatment (obtain as SD)
  s_t <- sqrt(sum((df_treatment %>% pull(outcome) - Yobs_t)^2) / (nrow(df_treatment) - 1))
  # Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
  sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
  
  ci <- paste0(
    "(",
    (tau_hat - 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"), ", ",
    (tau_hat + 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"),
    ")"
  )
    
  return(c(
    tau_hat %>% formatC(digits = 2, format = "f"), 
    paste0("(", sd_neyman %>% formatC(digits = 2, format = "f"), ")"),
    ci
    ))
  
}

make_table_6_4_diff <- function(df, outcome) {
  
  reg_res <- lm(as.formula(paste(outcome, "treatment*(pre_writ == 0)", sep = " ~ ")), df) %>% 
    coeftest(., vcov = vcovHC(., type = "HC1"))
  est <- reg_res["treatment:pre_writ == 0TRUE", "Estimate"]
  se <- reg_res["treatment:pre_writ == 0TRUE", "Std. Error"]
  
  ci <- paste0(
    "(",
    (est - 1.96 * se) %>% formatC(digits = 2, format = "f"), ", ",
    (est + 1.96 * se) %>% formatC(digits = 2, format = "f"),
    ")"
  )
    
  return(c(
    est %>% formatC(digits = 2, format = "f"), 
    paste0("(", se %>% formatC(digits = 2, format = "f"), ")"),
    ci
    ))
  
}

cbind(
  c("open", "pctpostwritten", "written", "written_all"),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_4(df, ., covar0 = TRUE)
    ) %>% 
    bind_cols() %>% 
    t(),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_4(df, ., covar0 = FALSE)
    ) %>% 
    bind_cols() %>% 
    t(),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_4_diff(df, .)
    ) %>% 
    bind_cols() %>% 
    t()
  ) %>% 
  set_rownames(NULL) %>% 
  kable("html", booktabs = TRUE, align = c("l", rep("c", 9))) %>% 
  add_header_above(
    c(" ", rep(c("$\\tau$", "($\\widehat{\\text{s.e.}}$)", "95\\% C.I."), 2), "EST", "($\\widehat{\\text{s.e.}}$)", "95\\% C.I.")
  ) %>% 
  add_header_above(
    c("", "($N$ = 40)" = 3, "($N$ = 67)" = 3, " " = 3)
  ) %>% 
  add_header_above(
    c("Variable", "pctprewritten = 0" = 3, "pctprewritten $\\gt$ 0" = 3, "Difference" = 3)
  ) %>% 
  kable_styling(position = "center")

## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4

Variable	pctprewritten = 0			pctprewritten \(\gt\) 0			Difference
	(\(N\) = 40)			(\(N\) = 67)
	\(\tau\)	(\(\widehat{\text{s.e.}}\))	95% C.I.	\(\tau\)	(\(\widehat{\text{s.e.}}\))	95% C.I.	EST	(\(\widehat{\text{s.e.}}\))	95% C.I.
open	0.23	(0.05)	(0.14, 0.32)	0.21	(0.04)	(0.13, 0.29)	0.02	(0.06)	(-0.10, 0.14)
pctpostwritten	-0.04	(0.06)	(-0.16, 0.08)	0.11	(0.05)	(0.01, 0.21)	-0.15	(0.08)	(-0.31, 0.00)
written	0.20	(0.10)	(-0.00, 0.41)	0.18	(0.10)	(-0.02, 0.38)	0.03	(0.15)	(-0.26, 0.31)
written_all	0.04	(0.08)	(-0.10, 0.19)	0.22	(0.09)	(0.05, 0.39)	-0.18	(0.12)	(-0.40, 0.05)

Chapter 6

Data preparation

Load datasets

Data processing

Summary statistics (Replication of Table 6.1)

Estimates of components of variance of estimator for the effect on open variable (Replication of Table 6.2)

Estimates of, and confidence intervals for average treatment effects (Replication of Table 6.3)

estimates of, and confidence intervals for average treatment effects by covariate (Replication of Table 6.4)

Estimates of components of variance of estimator for the effect on `open` variable (Replication of Table 6.2)