PS7 Problem 2

We will use the following variables in the dataset contribupdate.csv:

age: Age (years)
female: 1=female, 0=male
faminc: Household income (in $1,000 U.S. dollars);
given: Campaign contributions over the past year (in U.S. dollars).

contrib <- read.csv("data/contribupdate.csv")
contrib$ln_faminc <- log(contrib$faminc) 
contrib$ln_given <- log(contrib$given)

\[ \ln(\text{given}_i) = \beta_0 + \beta_1 \text{age}_i + \beta_2 \text{female}_i + \beta_3 \ln(\text{faminc}_i) + u_i \]

m2 <- lm(ln_given ~ age + female + ln_faminc, contrib)

Calculate the following quantities by hand and use simulation to get 95% confidence bounds.

library(arm)

## Loading required package: MASS

## Loading required package: Matrix

## Loading required package: lme4

## 
## arm (Version 1.13-1, built: 2022-8-25)

## Working directory is /Users/stephaniewright/630repo/labs

sims <- arm::sim(m2, n.sims = 1000)
beta <- as.matrix(sims@coef)

The predicted percent change in `given` for a 10-year increase in `age`.

Log-Level Model: Unit $\Delta x_1$ is associated with a $100(e^{\beta_1\Delta x_1} - 1)$ percent change in $E(y)$.

# by hand 
(exp(10*coef(m2)["age"]) - 1)*100

##      age 
## 12.19846

# simulations
pc_sims <- (exp(10 * beta[, "age"]) - 1) * 100

# confidence interval 
quantile(pc_sims, c(0.025, 0.975))

##     2.5%    97.5% 
## 10.62496 13.88938

The predicted change in campaign contributions (given) for a 10-year increase in age (age) is about 12%, holding all else constant. The 95% confidence interval is between 10.5% and 13.8%.

The difference in the expected values of `given` for females versus males, holding `age` and `faminc` at their respective means.

# BY HAND

# ln(given) for females 
m2f <- coef(m2)["(Intercept)"] + coef(m2)["age"] * mean(contrib$age) + coef(m2)["female"] * 1 + coef(m2)["ln_faminc"] * mean(contrib$ln_faminc)

# ln(given) for males 
m2m <- coef(m2)["(Intercept)"] + coef(m2)["age"] * mean(contrib$age) + coef(m2)["female"] * 0 + coef(m2)["ln_faminc"] * mean(contrib$ln_faminc)

# smearing factor 
norm <- exp(summary(m2)$sigma^2 / 2) # assuming normal 
duan <- mean(exp(residuals(m2))) # not assuming normal 

# assuming normal 
(exp(m2m) * norm) - (exp(m2f) * norm)

## (Intercept) 
##    61.48932

# not assuming normal 
(exp(m2m) * duan) - (exp(m2f) * duan)

## (Intercept) 
##    61.73631

# SIMULATIONS

# ln(given) for females 
m2f_sims <- beta[,"(Intercept)"] + beta[,"age"] * mean(contrib$age) + beta[,"female"] * 1 + beta[,"ln_faminc"] * mean(contrib$ln_faminc)

# ln(given) for males 
m2m_sims <- beta[,"(Intercept)"] + beta[,"age"] * mean(contrib$age) + beta[,"female"] * 0 + beta[,"ln_faminc"] * mean(contrib$ln_faminc)

# difference in given bt males and females
diff_sims <- (exp(m2m_sims) * norm) - (exp(m2f) * norm)
diff_sims <- (exp(m2m_sims) * duan) - (exp(m2f) * duan)

# confidence interval 
quantile(diff_sims, c(0.025, 0.975))

##     2.5%    97.5% 
## 52.98236 70.71429

The difference in the expected values of given for females versus males, holding age and faminc at their respective means is about $61. In other words, men with average age and income contribute $61 more to campaigns than women with average age and income. The 95% confidence interval is between $53 and $70.

The average marginal effect of `faminc` on `given`.

Express in levels:

\[given = e^{\beta_0 + \beta_1 \text{age}_i + \beta_2 \text{female}_i + \beta_3 \ln(\text{faminc}_i) + u_i}\]

Differentiate given with respect to faminc:

\[\frac{\partial(given)}{\partial(faminc)} = \hat{given} \cdot \frac{\beta_3}{faminc}\]

# BY HAND (predict given, transform,  smear)

# assuming normal 
mean(coef(m2)["ln_faminc"] * (exp(predict(m2)) * norm / exp(contrib$ln_faminc))) # length?

## [1] 2.066237

# not assuming normal (Duan)
mean(coef(m2)["ln_faminc"] * (exp(predict(m2)) * duan / exp(contrib$ln_faminc)))

## [1] 2.074536

# SIMULATIONS 

me_sims <- array(NA, 1000)

for (i in 1:1000) {
  
  given_sims <- exp(beta[i,"(Intercept)"] + beta[i,"age"] * contrib$age + beta[i,"female"] * contrib$female + beta[i,"ln_faminc"] * contrib$ln_faminc) * duan # length?
  
  me_sims[i] <- mean(beta[i, "ln_faminc"] * (given_sims / exp(contrib$ln_faminc))) # length? 
  
}

# confidence interval
quantile(me_sims, c(0.025, 0.975))

##     2.5%    97.5% 
## 1.951749 2.186102

The average marginal effect of faminc on given is about 2, which suggests that an $1000 increase in household income is associated with $2 increase in campaign contributions. The 95% confidence interval is between $1.95 and $2.19.

Heteroskedasticity

We’re going to return to the Chile data set. The variables of interest are as follows:

vote_yes: 1=will vote yes for Pinochet, 0=will vote no against Pinochet
statusquo: higher values = more support for the status quo, standardized to have mean zero and standard deviation of one
income_1000: monthly income, in thousands of pesos
education: factor variable with 3 levels: primary only (baseline), secondary (S), post-secondary (PS)
sex: factor variable with two levels, female (baseline) and male (M)

# data
chile <- read.csv("data/chile.csv")
chile <- na.omit(chile)

# re-code income to 1000s
chile$income_1000 <- chile$income / 1000

# re-code education as dummies 
chile$educS <- ifelse(chile$education == "S", 1, 0)
chile$educPS <- ifelse(chile$education == "PS", 1, 0)

# re-code sex as dummy 
chile$female <- ifelse(chile$sex == "F", 1, 0)

# re-code interaction as product term 
chile$inc_sq <- chile$income_1000 * chile$statusquo

\[ voteyes_i = \beta_0 + \beta_1female_i + \beta_2educS_i + \beta_3educPS_i + \beta_4income1000_i + \beta_5statusquo_i + \beta_6(income1000_i)(statusquo_i) + u_i \]

m1 <- lm(vote_yes ~ female + educS + educPS + income_1000 + statusquo + inc_sq, data=chile)
summary(m1)

## 
## Call:
## lm(formula = vote_yes ~ female + educS + educPS + income_1000 + 
##     statusquo + inc_sq, data = chile)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.08370 -0.10507 -0.02720  0.06529  1.02970 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.5092674  0.0123093  41.373  < 2e-16 ***
## female       0.0455888  0.0126208   3.612 0.000312 ***
## educS       -0.0468570  0.0145527  -3.220 0.001307 ** 
## educPS      -0.0781653  0.0196575  -3.976 7.29e-05 ***
## income_1000 -0.0001114  0.0001664  -0.669 0.503390    
## statusquo    0.3766162  0.0077454  48.625  < 2e-16 ***
## inc_sq       0.0002796  0.0001350   2.071 0.038539 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2576 on 1696 degrees of freedom
## Multiple R-squared:  0.7355, Adjusted R-squared:  0.7346 
## F-statistic:   786 on 6 and 1696 DF,  p-value: < 2.2e-16

Heteroskedasticity: the variance of the error term is correlated with the independent variable(s).

Error Term: the difference between the true value of $y$ and the expected value of $y$.

In a linear probability model, the true value of $y$ takes on values of only 0 or 1, while the expected value of $y$ can be anything between 0 and 1.

The variance of the error term will vary depending on the level of predicted y, leading to heteroskedasticity.

Robust (Huber-White) SE

By Hand: substitute a diagonal matrix where the entries are each observation’s squared residual

\[Var(\hat{\beta}) = (X'X)^{-1}X' \Sigma X (X'X)^{-1}\] Default: $\Sigma=\sigma^2I$

Robust: $\hat{\Sigma}=\hat{u}^2I$

# YOUR CODE HERE
N <- nrow(m1$model)
K <- summary(m1)$df[1]

# add intercept column, remove dv column 
X <- cbind(rep(1,N), m1$model[,-1]) 
colnames(X)[1] <- "Intercept"
X <- as.matrix(X)

# default standard error 
se_default <- sqrt(diag((sum(m1$residuals^2) / m1$df.residual) * (solve(t(X) %*% X))))

# robust standard error (plug squared residuals in)
se_robust <- sqrt((diag(solve(t(X) %*% X) %*% (t(X) %*% diag(m1$residuals^2) %*% X) %*% solve(t(X) %*% X))))

# t-values 
t_values <- m1$coefficients / se_robust

# p-values 
p_values <- 2 * (1 - pt(abs(t_values), m1$df.residual))

# print  
round(data.frame(coefficient = coef(m1), 
           se.robust = se_robust, 
           t.value = t_values, 
           p.value = p_values), 5)

##             coefficient se.robust  t.value p.value
## (Intercept)     0.50927   0.01242 41.00648 0.00000
## female          0.04559   0.01273  3.58134 0.00035
## educS          -0.04686   0.01534 -3.05488 0.00229
## educPS         -0.07817   0.01718 -4.54978 0.00001
## income_1000    -0.00011   0.00015 -0.73599 0.46184
## statusquo       0.37662   0.00581 64.79568 0.00000
## inc_sq          0.00028   0.00008  3.38100 0.00074

Using sandwich/car

library(sandwich)
library(car)

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:arm':
## 
##     logit

HC0_sw <- sqrt(diag(vcovHC(m1, type="HC0")))
HC0_car <- sqrt(diag(hccm(m1, type="hc0")))

HC3_sw <- sqrt(diag(vcovHC(m1))) # default is HC3
HC3_car <- sqrt(diag(hccm(m1, type="hc3")))

round(data.frame(coefficient = coef(m1), 
                 default = se_default,
                 robust = se_robust, 
                 HC0 = HC0_sw,
                 HC3 = HC3_sw), 5) # round to 5 decimal places

##             coefficient default  robust     HC0     HC3
## (Intercept)     0.50927 0.01231 0.01242 0.01242 0.01246
## female          0.04559 0.01262 0.01273 0.01273 0.01277
## educS          -0.04686 0.01455 0.01534 0.01534 0.01538
## educPS         -0.07817 0.01966 0.01718 0.01718 0.01725
## income_1000    -0.00011 0.00017 0.00015 0.00015 0.00015
## statusquo       0.37662 0.00775 0.00581 0.00581 0.00584
## inc_sq          0.00028 0.00014 0.00008 0.00008 0.00008

In marginaleffects

library(marginaleffects)

## Warning: package 'marginaleffects' was built under R version 4.3.3

options(marginaleffects_print_omit = c("s.value"))

# new model with interaction term 
m1me <- lm(vote_yes ~ female + educS + educPS + income_1000*statusquo, data=chile)

# estimate ME of status quo at range of income

cme <- slopes(m1me, variables = c("statusquo"), newdata = datagrid(income_1000 = seq(min(chile$income_1000), max(chile$income_1000), 1))) # default SE 

cme_HC3 <- slopes(m1me, variables = c("statusquo"), newdata = datagrid(income_1000 = seq(min(chile$income_1000), max(chile$income_1000), 1)), vcov = "HC3") # robust SE 

# plot ME and SE for range of income
plot(cme$income_1000, cme$estimate,
     main = "CME of Status Quo Across Income",
     ylab="CME of SQ", xlab="Income (1000 Pesos)",
     type = "l", ylim=c(0.35,0.5), xlim=c(0,200))

# add default CI 
lines(cme$income_1000, cme$estimate + 1.96*cme$std.error, lty=2) 
lines(cme$income_1000, cme$estimate - 1.96*cme$std.error, lty=2) 

# add robust CI 
lines(cme$income_1000, cme$estimate + 1.96*cme_HC3$std.error, lty=2,col="red") 
lines(cme$income_1000, cme$estimate - 1.96*cme_HC3$std.error, lty=2,col="red")

Diagnosing Heteroskedasticity

F-Test (use the squared residuals as an estimate of error variance and check the extent to which they can be explained by IV)

# regress squared residuals on predictors
m1_res_1 <- lm(m1$residuals^2 ~ m1$model[,2] + m1$model[,3] + m1$model[,4] + m1$model[,5] + m1$model[,6] + m1$model[,7])

# F-stat: tests whether at least one IV significantly explains variation in the squared residuals (heteroskedasticity)
# p-value: the probability of obtaining the observed F-statistic if the null hypothesis (homoskedasticity) were true
summary(m1_res_1)

## 
## Call:
## lm(formula = m1$residuals^2 ~ m1$model[, 2] + m1$model[, 3] + 
##     m1$model[, 4] + m1$model[, 5] + m1$model[, 6] + m1$model[, 
##     7])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.08438 -0.06635 -0.04907 -0.01298  1.10611 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.705e-02  7.051e-03  10.927  < 2e-16 ***
## m1$model[, 2] -7.878e-04  7.230e-03  -0.109  0.91324    
## m1$model[, 3] -5.386e-03  8.336e-03  -0.646  0.51827    
## m1$model[, 4] -2.973e-02  1.126e-02  -2.640  0.00836 ** 
## m1$model[, 5] -6.907e-05  9.532e-05  -0.725  0.46881    
## m1$model[, 6] -6.300e-03  4.437e-03  -1.420  0.15580    
## m1$model[, 7]  8.001e-05  7.734e-05   1.034  0.30106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1476 on 1696 degrees of freedom
## Multiple R-squared:  0.007883,   Adjusted R-squared:  0.004373 
## F-statistic: 2.246 on 6 and 1696 DF,  p-value: 0.03658

Breusch-Pagen Test (regress squared residuals on IV)

# B-P test, use chi-squared test chi2 = N*R2, on K df
N <- sum(summary(m1_res_1)$df[1:2]) # num observations
R2 <- summary(m1_res_1)$r.squared # explained variance 
chi2 = N*R2 # test statistic 
K = summary(m1_res_1)$df[1] - 1 # num parameters 

1 - pchisq(q = chi2, df = K) # p-value

## [1] 0.03677072

#install.packages("lmtest")
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(m1)

## 
##  studentized Breusch-Pagan test
## 
## data:  m1
## BP = 13.424, df = 6, p-value = 0.03677

Fixed Effects

Add group as categorical variable. Since we have removed what is shared across observations within groups, the errors are now independent, and SEs are unbiased

Include fixed effects for region.

m1_fe <- lm(vote_yes ~ female + educS + educPS + income_1000 + statusquo + inc_sq + as.factor(region), data=chile)
summary(m1_fe)

## 
## Call:
## lm(formula = vote_yes ~ female + educS + educPS + income_1000 + 
##     statusquo + inc_sq + as.factor(region), data = chile)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07664 -0.10303 -0.02570  0.06746  1.03074 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.5096379  0.0168658  30.217  < 2e-16 ***
## female               0.0451365  0.0126461   3.569 0.000368 ***
## educS               -0.0470365  0.0145864  -3.225 0.001285 ** 
## educPS              -0.0774576  0.0197061  -3.931 8.81e-05 ***
## income_1000         -0.0001380  0.0001693  -0.815 0.414934    
## statusquo            0.3775280  0.0078460  48.117  < 2e-16 ***
## inc_sq               0.0002703  0.0001354   1.997 0.045998 *  
## as.factor(region)M   0.0362238  0.0376889   0.961 0.336627    
## as.factor(region)N  -0.0032260  0.0217449  -0.148 0.882080    
## as.factor(region)S  -0.0081963  0.0179584  -0.456 0.648156    
## as.factor(region)SA  0.0070432  0.0173374   0.406 0.684618    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2578 on 1692 degrees of freedom
## Multiple R-squared:  0.7358, Adjusted R-squared:  0.7342 
## F-statistic: 471.2 on 10 and 1692 DF,  p-value: < 2.2e-16

Clustered Standard Errors

# with sandwich 
sqrt(diag(sandwich::vcovCL(m1, cluster = ~ region, type = "HC1")))

##  (Intercept)       female        educS       educPS  income_1000    statusquo 
## 1.031738e-02 9.027897e-03 1.058711e-02 7.394449e-03 1.149234e-04 6.275046e-03 
##       inc_sq 
## 4.535374e-05

sqrt(diag(sandwich::vcovCL(m1, cluster = ~ region, type = "HC3")))

##  (Intercept)       female        educS       educPS  income_1000    statusquo 
## 1.157760e-02 1.075783e-02 1.224346e-02 8.672368e-03 1.621388e-04 7.085268e-03 
##       inc_sq 
## 5.087155e-05

Lab 8