# Install and load packages ---------------
packages <- c(
  "tidyverse",
  "ggrepel",
  "WDI",
  "forcats",
  "grid",
  "gridExtra",
  "countrycode",
  "lmtest",
  "sandwich",
  "scales",
  "haven",
  "kableExtra",
  "magrittr"
)

# Change to install = TRUE to install the required packages
pacman::p_load(packages, character.only = TRUE)

Data preparation

Load datasets

The data can be downloaded from here.

# data of treatment assignment by school
treatment_school <- read_dta("../data/Duflo_Hanna_Ryan/data/Incentives_FINAL/Raw-Data/Tests/treatschool.dta") %>% 
  mutate(treatment = 1)
# result of random check if a school is open
random_check <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/RandomCheck.dta")
# pre-treatment test results
pre_test <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/Pretest.dta")
# post-treatment test results
post_test <- read_dta("../data/Duflo_Hanna_Ryan/dataverse_files/Posttest.dta")

Data processing

# Make school-level post treatment average test results
# - indicator for taking a written test
# - score for written test (NA for those who did not take tests)
post_test_school <- post_test %>% 
  group_by(schid) %>% 
  summarise_at(vars(post_writ, post_total_w), mean, na.rm = TRUE)
# For later use, get the mean of the average test results across schools
post_total_w_mean <- mean(post_test_school$post_total_w)

# School-level post treatment average test results, where
# scores of students who didn't take tests are replaced to 0
post_test_school_all <- post_test %>% 
  mutate(post_total_w = replace_na(post_total_w, 0)) %>% 
  group_by(schid) %>% 
  summarise(post_total_w_all = mean(post_total_w,na.rm = TRUE))

# Make school-level pre treatment average test results 
pre_test_school <- pre_test %>% 
  group_by(schid) %>% 
  summarise_at(vars(pre_writ), mean, na.rm = TRUE)

# School-level share of opens
# (In Imbens-Rubin textbook, it appears that all records, including pre-treatment, are counted for this variable.
# For information of post-treatment opens, use filter by month and year)
school_open <- random_check %>% 
  mutate(open = ifelse(a1_1 == 1, 1, 0)) %>% 
  # mutate(
  #   post_med = ifelse(year >= 2005 | (month >= 4 & year >= 2004), 1, 0),
  #   post_end = ifelse(year >= 2005 | (month >= 11 & year >= 2004), 1, 0),
  # ) %>% 
  # filter(post_med == 1) %>% 
  group_by(schid) %>% 
  summarise_at(vars(open), mean, na.rm = TRUE)

df <- post_test_school %>% 
  # merge treatment status information
  left_join(treatment_school, by = c("schid" = "schid")) %>% 
  mutate(treatment = ifelse(is.na(treatment), 0, 1)) %>% 
  # merge pre-treatment test results data
  left_join(pre_test_school, by = c("schid" = "schid")) %>% 
  # merge post-treatment test results data with NA replaced to 0
  left_join(post_test_school_all, by = c("schid" = "schid")) %>% 
  # merge school open information
  left_join(school_open, by = c("schid" = "schid")) %>% 
  # It appears that, the post-treatment test scores are divided by the average (ignoring students who didn't take exam)
  # Not sure exactly why
  mutate(
    post_total_w = post_total_w / post_total_w_mean,
    post_total_w_all = post_total_w_all / post_total_w_mean,
  )
# Summary statistics for control and treatment group variables
control_treat_sum <- df %>% 
  group_by(treatment) %>% 
  summarise_at(
    vars(pre_writ, open, post_writ, post_total_w, post_total_w_all),
    list(mean, sd)
  ) %>% 
  pivot_longer(cols = -treatment, names_to = c(".value", "fn"), names_patter = "(.*)(_.*$)") %>% 
  mutate(
    across(-treatment, ~ formatC(., digits = 2, format = "f")),
    across(-treatment, ~ ifelse(fn == "_fn2", paste0("(", ., ")"), .)),
    ) %>% 
  select(-treatment, -fn) %>% 
  as.matrix() %>% 
  t()
  
# Summary statistics for all schools
all_school_sum <- df %>% 
  summarise_at(
    vars(pre_writ, open, post_writ, post_total_w, post_total_w_all),
    list(min, max)
  ) %>% 
  pivot_longer(cols = everything(), names_to = c(".value", "fn"), names_patter = "(.*)(_.*$)") %>% 
  mutate(across(-fn, ~ formatC(., digits = 2, format = "f"))) %>% 
  select(-fn) %>% 
  as.matrix() %>% 
  t()

Summary statistics (Replication of Table 6.1)

cbind(
  c("pctprewritten", "open", "pctpostwritten", "written", "written_all"),
  control_treat_sum, all_school_sum
  ) %>% 
  set_rownames(NULL) %>% 
  kable("html", booktabs = TRUE) %>% 
  add_header_above(
    c(" ", rep(c("Average", "(S.D.)"), 2), "Min", "Max")
  ) %>% 
  add_header_above(
    c(
      "Variable",
      "Control ($N_c$ = 54)" = 2,
      "Treatment ($N_c$ = 53)" = 2,
      " ", " "
      )
  ) %>% 
  pack_rows(index = c(
    "Pre-treatment" = 1,
    "Post-treatment" = 4
  )) %>% 
  kable_styling(position = "center")
Variable
Control (\(N_c\) = 54)
Treatment (\(N_c\) = 53)
Average
(S.D.)
Average
(S.D.)
Min
Max
Pre-treatment
pctprewritten 0.19 (0.19) 0.16 (0.17) 0.00 0.67
Post-treatment
open 0.58 (0.19) 0.80 (0.13) 0.00 1.00
pctpostwritten 0.47 (0.19) 0.52 (0.23) 0.05 0.92
written 0.92 (0.45) 1.09 (0.42) 0.07 2.20
written_all 0.45 (0.32) 0.59 (0.39) 0.04 1.43

Estimates of components of variance of estimator for the effect on open variable (Replication of Table 6.2)

df_control <- df %>% filter(treatment == 0)
df_treatment <- df %>% filter(treatment == 1)

# mean of outcome for control
Yobs_c <- mean(df_control$open)
# mean of outcome for treatment
Yobs_t <- mean(df_treatment$open)
# estimate for average treatment effect
tau_hat <- Yobs_t - Yobs_c

# estimated variance for control (obtain as SD)
s_c <- sqrt(sum((df_control$open - Yobs_c)^2) / (nrow(df_control) - 1))
# estimated variance for treatment (obtain as SD)
s_t <- sqrt(sum((df_treatment$open - Yobs_t)^2) / (nrow(df_treatment) - 1))
# estimated variance for all (obtain as SD)
s <- sqrt(
  (s_c^2 * (nrow(df_control) - 1) + s_t^2 * (nrow(df_treatment) - 1)) / 
    (nrow(df) - 2) 
  )

# Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
# Sampling variance estimates under constant treatment effects (obtain as SD)
sd_const <- sqrt(s^2 * (1 / nrow(df_control) + 1 / nrow(df_treatment)))
# Sampling variance estimates when potential outcomes are perfectly correlated (obtain as SD)
sd_rho1 <- sqrt(
  s_c^2 * (nrow(df_treatment) / (nrow(df) * nrow(df_control))) +
    s_t^2 * (nrow(df_control) / (nrow(df) * nrow(df_treatment))) +
    s_c * s_t * 2 / nrow(df)
  )

bind_cols(
  name = c(
    "$\\bar{Y}_c^{obs}$", "$\\bar{Y}_t^{obs}$", "$\\hat{\\tau}$",
    "$s_c^2$", "$s_t^2$", "$s^2$",
    "$\\hat{V}^{\\text{neyman}} = s^2 / N_c + s^2 / N_t$",
    "$\\hat{V}^{\\text{const}} = s^2 \\cdot \\left( 1 / N_c +  1 / N_t \\right)$",
    "$\\hat{V}^{\\rho_{tc} = 1} = s_c^2 \\cdot \\left( N_t / (N \\cdot N_c ) \\right) + s_t^2 \\cdot \\left( N_c / (N \\cdot N_t) \\right) + s_c \\cdot s_t \\cdot \\left( 2 / N \\right)$"
  ),
  value = c(
    Yobs_c, Yobs_t, tau_hat,
    s_c, s_t, s,
    sd_neyman, sd_const, sd_rho1
  )
) %>% 
  mutate(value = formatC(value, digits = 2, format = "f")) %>% 
  mutate(
    value = ifelse(row_number() >= 4, paste0("$", value, "^2$"), paste0("$", value, "$"))
  ) %>% 
  set_colnames(NULL) %>% 
  kable("html", booktabs = TRUE) %>% 
  pack_rows(index = c(
    "Estimated means" = 3,
    "Estimated variance compnents" = 3,
    "Sampling variance estimates" = 3
  )) %>% 
  kable_styling(position = "center")
Estimated means
\(\bar{Y}_c^{obs}\) \(0.58\)
\(\bar{Y}_t^{obs}\) \(0.80\)
\(\hat{\tau}\) \(0.22\)
Estimated variance compnents
\(s_c^2\) \(0.19^2\)
\(s_t^2\) \(0.13^2\)
\(s^2\) \(0.16^2\)
Sampling variance estimates
\(\hat{V}^{\text{neyman}} = s^2 / N_c + s^2 / N_t\) \(0.03^2\)
\(\hat{V}^{\text{const}} = s^2 \cdot \left( 1 / N_c + 1 / N_t \right)\) \(0.03^2\)
\(\hat{V}^{\rho_{tc} = 1} = s_c^2 \cdot \left( N_t / (N \cdot N_c ) \right) + s_t^2 \cdot \left( N_c / (N \cdot N_t) \right) + s_c \cdot s_t \cdot \left( 2 / N \right)\) \(0.03^2\)

Estimates of, and confidence intervals for average treatment effects (Replication of Table 6.3)

make_table_6_3 <- function(df, outcome) {
  
  df_control <- df %>% filter(treatment == 0)
  df_treatment <- df %>% filter(treatment == 1)
  
  # mean of outcome for control
  Yobs_c <- mean(df_control %>% pull(outcome))
  # mean of outcome for treatment
  Yobs_t <- mean(df_treatment %>% pull(outcome))
  # estimate for average treatment effect
  tau_hat <- Yobs_t - Yobs_c
  
  # estimated variance for control (obtain as SD)
  s_c <- sqrt(sum((df_control %>% pull(outcome) - Yobs_c)^2) / (nrow(df_control) - 1))
  # estimated variance for treatment (obtain as SD)
  s_t <- sqrt(sum((df_treatment %>% pull(outcome) - Yobs_t)^2) / (nrow(df_treatment) - 1))
  # Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
  sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
  
  ci <- paste0(
    "(",
    (tau_hat - 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"), ", ",
    (tau_hat + 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"),
    ")"
  )
    
  return(c(
    tau_hat %>% formatC(digits = 2, format = "f"), 
    paste0("(", sd_neyman %>% formatC(digits = 2, format = "f"), ")"),
    ci
    ))
  
}

cbind(
  c("open", "pctpostwritten", "written", "written_all"),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_3(df, .)
  ) %>% 
    bind_cols() %>% 
    t()
  ) %>% 
  set_rownames(NULL) %>% 
  kable("html", booktabs = TRUE, align = c("l", "c", "c", "c")) %>% 
  add_header_above(
    c("Variable", "$\\widehat{ATE}$", "$(\\widehat{\\text{s.e.}})$", "95\\% C.I.")
  ) %>% 
  kable_styling(position = "center")
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
Variable
\(\widehat{ATE}\)
\((\widehat{\text{s.e.}})\)
95% C.I.
open 0.22 (0.03) (0.15, 0.28)
pctpostwritten 0.05 (0.04) (-0.03, 0.13)
written 0.17 (0.08) (0.00, 0.33)
written_all 0.14 (0.07) (0.00, 0.27)

estimates of, and confidence intervals for average treatment effects by covariate (Replication of Table 6.4)

make_table_6_4 <- function(df, outcome, covar0) {
  
  if (covar0 == TRUE) {
    df_control <- df %>% filter(treatment == 0) %>% filter(pre_writ == 0)
    df_treatment <- df %>% filter(treatment == 1) %>% filter(pre_writ == 0)
  } else {
    df_control <- df %>% filter(treatment == 0) %>% filter(pre_writ > 0)
    df_treatment <- df %>% filter(treatment == 1) %>% filter(pre_writ > 0)
  }
  
  # Average treatment effects for samples with pre-writ == 0
  # mean of outcome for control
  Yobs_c <- mean(df_control %>% pull(outcome))
  # mean of outcome for treatment
  Yobs_t <- mean(df_treatment %>% pull(outcome))
  # estimate for average treatment effect
  tau_hat <- Yobs_t - Yobs_c
  
  # estimated variance for control (obtain as SD)
  s_c <- sqrt(sum((df_control %>% pull(outcome) - Yobs_c)^2) / (nrow(df_control) - 1))
  # estimated variance for treatment (obtain as SD)
  s_t <- sqrt(sum((df_treatment %>% pull(outcome) - Yobs_t)^2) / (nrow(df_treatment) - 1))
  # Sampling variance estimates, ignoring heterogeneous treatment effects (obtain as SD)
  sd_neyman <- sqrt(s_c^2 / nrow(df_control) + s_t^2 / nrow(df_control))
  
  ci <- paste0(
    "(",
    (tau_hat - 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"), ", ",
    (tau_hat + 1.96 * sd_neyman) %>% formatC(digits = 2, format = "f"),
    ")"
  )
    
  return(c(
    tau_hat %>% formatC(digits = 2, format = "f"), 
    paste0("(", sd_neyman %>% formatC(digits = 2, format = "f"), ")"),
    ci
    ))
  
}

make_table_6_4_diff <- function(df, outcome) {
  
  reg_res <- lm(as.formula(paste(outcome, "treatment*(pre_writ == 0)", sep = " ~ ")), df) %>% 
    coeftest(., vcov = vcovHC(., type = "HC1"))
  est <- reg_res["treatment:pre_writ == 0TRUE", "Estimate"]
  se <- reg_res["treatment:pre_writ == 0TRUE", "Std. Error"]
  
  ci <- paste0(
    "(",
    (est - 1.96 * se) %>% formatC(digits = 2, format = "f"), ", ",
    (est + 1.96 * se) %>% formatC(digits = 2, format = "f"),
    ")"
  )
    
  return(c(
    est %>% formatC(digits = 2, format = "f"), 
    paste0("(", se %>% formatC(digits = 2, format = "f"), ")"),
    ci
    ))
  
}
cbind(
  c("open", "pctpostwritten", "written", "written_all"),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_4(df, ., covar0 = TRUE)
    ) %>% 
    bind_cols() %>% 
    t(),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_4(df, ., covar0 = FALSE)
    ) %>% 
    bind_cols() %>% 
    t(),
  map(
    c("open", "post_writ", "post_total_w", "post_total_w_all"),
    ~ make_table_6_4_diff(df, .)
    ) %>% 
    bind_cols() %>% 
    t()
  ) %>% 
  set_rownames(NULL) %>% 
  kable("html", booktabs = TRUE, align = c("l", rep("c", 9))) %>% 
  add_header_above(
    c(" ", rep(c("$\\tau$", "($\\widehat{\\text{s.e.}}$)", "95\\% C.I."), 2), "EST", "($\\widehat{\\text{s.e.}}$)", "95\\% C.I.")
  ) %>% 
  add_header_above(
    c("", "($N$ = 40)" = 3, "($N$ = 67)" = 3, " " = 3)
  ) %>% 
  add_header_above(
    c("Variable", "pctprewritten = 0" = 3, "pctprewritten $\\gt$ 0" = 3, "Difference" = 3)
  ) %>% 
  kable_styling(position = "center")
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
## New names:
## * NA -> ...1
## * NA -> ...2
## * NA -> ...3
## * NA -> ...4
Variable
pctprewritten = 0
pctprewritten \(\gt\) 0
Difference
(\(N\) = 40)
(\(N\) = 67)
\(\tau\)
(\(\widehat{\text{s.e.}}\))
95% C.I.
\(\tau\)
(\(\widehat{\text{s.e.}}\))
95% C.I.
EST
(\(\widehat{\text{s.e.}}\))
95% C.I.
open 0.23 (0.05) (0.14, 0.32) 0.21 (0.04) (0.13, 0.29) 0.02 (0.06) (-0.10, 0.14)
pctpostwritten -0.04 (0.06) (-0.16, 0.08) 0.11 (0.05) (0.01, 0.21) -0.15 (0.08) (-0.31, 0.00)
written 0.20 (0.10) (-0.00, 0.41) 0.18 (0.10) (-0.02, 0.38) 0.03 (0.15) (-0.26, 0.31)
written_all 0.04 (0.08) (-0.10, 0.19) 0.22 (0.09) (0.05, 0.39) -0.18 (0.12) (-0.40, 0.05)