##########################################################################
## An Introduction to Empirical Legal Studies
## 	Chapter 9 Replication File
##
## Revised on 3/21/2014 KEC	  
##
## This work is licensed under a Creative Commons Attribution 4.0
## International License. See http://creativecommons.org/licenses/by/4.0
## for more information.
##
## Authors
##	Lee Epstein, Washington University in St. Louis
##  and Andrew D. Martin, University of Michigan
##########################################################################

## Chapter 9

## load packages
library(ggplot2)
library(foreign)
library(mvtnorm)
library(boot)
source("ggplotTemplate.R")

## Figure 9.1 Nomogram of multiple regression parameters and 95% confidence 
## intervals for analysis of sentence length (in months) in Table 9.1. The 
## dots show the regression parameter estimates and the horizontal lines, 
## the 95% confidence intervals
## Section 9.1, p. 204

## Load the ICT data and flag the rows with a (non-missing) guilty verdict
ict <- read.dta("ictData.dta")
condition <- !is.na(ict$verdict) & ict$verdict == "guilty"

## Assemble the analysis variables for the guilty rows only, then keep the
## complete cases
regressionData <- local({
  guiltyOnly <- function(column) column[condition]
  na.omit(data.frame(
    sentence  = guiltyOnly(ict$sentence),
    numGuil   = guiltyOnly(ict$numGuil),
    genocide  = guiltyOnly(ict$genocide),
    crimAg    = guiltyOnly(ict$crimAg),
    warCrimes = guiltyOnly(ict$warCrimes),
    mfTotal   = guiltyOnly(ict$mfTotal),
    afTotal   = guiltyOnly(ict$afTotal)
  ))
})

## make nomogram for multiple regression
## Fit the linear model of sentence on numGuil, afTotal, mfTotal, genocide
## and crimAg, and print its summary.
fullModel <- lm(sentence ~ numGuil+afTotal+mfTotal+genocide+crimAg, 
                data=regressionData)
print(summary(fullModel))

## Collect the estimates and standard errors (column 2 of the summary
## coefficient table) and build the 95% interval bounds. The same critical
## value (1.96) is used on both sides so the interval is symmetric.
coefData <- data.frame(coef=coef(fullModel), se=coef(summary(fullModel))[,2])
coefData$lows <- coefData$coef - 1.96 * coefData$se
coefData$highs <- coefData$coef + 1.96 * coefData$se

## Display labels. The label vector is positional, so it must follow the
## order in which lm() returns the coefficients: the intercept, then the
## formula terms left to right (numGuil, afTotal, mfTotal, genocide, crimAg).
## The levels only control the order in which the rows are drawn on the
## axis; they are reversed because the plot is later flipped.
coefData$names <- ordered(c("Intercept", "Number Guilty",
                            "Number of Aggravating Factors", 
                            "Number of Mitigating Factors",
                            "Genocide", "Crimes Against Humanity"),
                          levels= rev(c("Intercept", 
                                        "Number of Aggravating Factors", 
                                        "Number of Mitigating Factors",
                                        "Number Guilty", "Genocide", 
                                        "Crimes Against Humanity"
                                   )))

## Point-range plot of each coefficient with its interval bounds. The
## aesthetics refer to the columns of coefData by bare name (not
## coefData$...), so they are looked up in the data passed to ggplot().
nomogram <- ggplot(coefData, aes(y = coef, x=names, ymin=lows, ymax=highs))
## coord_flip() puts the variables on the vertical axis; the reference line
## at zero marks "no association".
nomogram <- nomogram + geom_pointrange(size=.7) + coord_flip() + 
  geom_hline(yintercept=0) +
  ylab("Multiple Regression Coefficient") + xlab("Independent Variable")

## show plot and save it
print(nomogram)
ggsave("nomogram.pdf", nomogram, height=5, width=6) 

## Figure 9.2 Anscombe's Quartet. The lines in each panel are linear regression 
## estimates. Each line has the same slope and intercept, and the residual 
## standard deviation and R-squared are the same for each panel. Only for 
## Dataset 1 would the linear regression be appropriate
## cite: Anscombe, Francis J. (1973) Graphs in statistical analysis. 
## American Statistician, 27, 17–21.
## Section 9.2, p. 206

## Stack the four x/y pairs of the built-in anscombe dataset into long
## format, with a factor identifying which dataset each row came from
anscombe2 <- data.frame(
  X     = unlist(anscombe[c("x1", "x2", "x3", "x4")], use.names = FALSE),
  Y     = unlist(anscombe[c("y1", "y2", "y3", "y4")], use.names = FALSE),
  group = gl(4, nrow(anscombe),
             labels = c("Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4"))
)

## One scatterplot panel per dataset, each overlaid with the same line
## (intercept 3, slope 0.5)
anscombePlot <- ggplot(anscombe2, aes(x = X, y = Y)) +
  geom_point(size = 2) +
  facet_wrap(~ group) +
  geom_abline(intercept = 3, slope = 0.5)

## display the figure, then write it to disk
print(anscombePlot)
ggsave("anscombePlot.pdf", anscombePlot, width = 8, height = 8)

## Figure 9.3 Linear regression of hypothetical Y on X, demonstrating what can 
## go wrong if we don't include the right independent variables 
## Section 9.2, p. 208

## Hypothetical data: two groups of four points each. Within each group y
## rises one-for-one with x, but the second group sits further right and lower.
x1 <- as.numeric(1:4)
y1 <- 5 + x1

x2 <- 7 + x1
y2 <- x2 - 7

simpData <- data.frame(
  x     = c(x1, x2),
  y     = c(y1, y2),
  label = factor(rep(c("Men", "Women"), each = 4))
)

## First figure: pool both groups and fit a single least squares line
simpPlot1 <- ggplot(simpData, aes(x = x, y = y)) +
  geom_point(size = 3) +
  xlab("X") +
  ylab("Y") +
  stat_smooth(method = "lm", se = FALSE, colour = "black")

## show plot and save it
print(simpPlot1)
ggsave("simpPlot1.pdf", simpPlot1, width = 5, height = 5)

## Figure 9.4 Linear regression of hypothetical Y on X controlling for sex,
## illustrating Simpson's paradox. The two lines represent the least squares 
## line when simultaneously controlling for sex
## Section 9.2, p. 209

## do group-by-group
## Same scatterplot as the pooled figure, but faceted by label (one panel per
## group, stacked in a single column) so that a separate least squares line
## is fitted within each panel. The full simpData is used directly; no
## subsetting is needed.
simpPlot2 <- ggplot(simpData, aes(x,y)) + geom_point(size=3) + 
  xlab("X") + ylab("Y")
simpPlot2 <- simpPlot2 + facet_wrap(~label, ncol=1) + 
  stat_smooth(method="lm", se=FALSE, colour="black")

## show plot and save it
print(simpPlot2)
ggsave("simpPlot2.pdf", simpPlot2, height=8, width=5)

## Figure 9.5 Scatterplot of the votes of ICJ judges against the per capita 
## gross domestic product of the judge's country, with a regression line
## Section 9.3, p. 214

## Load the ICJ data and rescale judgeGDP by a factor of 10,000
icj <- read.dta("icjData.dta")
icj$jGDP <- icj$judgeGDP / 1e4

## Logistic regression of decision1 on the rescaled GDP variable
logitModel <- glm(decision1 ~ jGDP, data = icj, family = binomial(logit))

## make necessary changes to data
## Keep only the two plotted variables and drop rows where either is missing.
## (Built in a single step; an intermediate copy without na.omit() would be
## overwritten immediately and is not needed.)
icjPlot <- na.omit(data.frame(decision1=icj$decision1, jGDP=icj$jGDP))

## Semi-transparent points (so overplotted observations show as darker
## marks) with a linear least squares line on top.
logitScatter <- ggplot(data=icjPlot, aes(x=jGDP, y=decision1)) + 
  geom_point(size=2, alpha=.2)
logitScatter <- logitScatter + stat_smooth(method="lm", se=FALSE, colour="black")
logitScatter <- logitScatter + xlab("Judge Per Capita GDP ($10,000 in 1996)")
logitScatter <- logitScatter + ylab("Judge Rules in Favor of Applicant")

## show plot and save it
print(logitScatter)
ggsave("logitScatter.pdf", logitScatter, height=8, width=8)

## Figure 9.6 Logistic Regression Curve
## Section 9.3, p. 215

## Grid of x values from -4 to 4 (step 0.01) and the inverse-logit of each
ruler <- local({
  x <- seq(from = -4, to = 4, by = 0.01)
  data.frame(x = x, y = inv.logit(x))
})

## Draw the inverse-logit curve as a single line
logitPlot <- ggplot(data = ruler, aes(x = x, y = y)) +
  geom_line(size = 1) +
  ylab("Cumulative Distribution Function") +
  xlab("X")

## display the figure, then write it to disk
print(logitPlot)
ggsave("logitPlot.pdf", logitPlot, width = 5, height = 5)

## Figure 9.7 Predicted probabilities of an ICJ judge vote in favor of plaintiff 
## as a function of per capita judge GDP, based on the estimates in Table 9.2. 
## The grey region denotes 95% confidence interval
## Section 9.3, p. 218

## Reload the ICJ data and rescale judgeGDP by a factor of 10,000, so this
## section can be run on its own
icj <- read.dta("icjData.dta")
icj$jGDP <- icj$judgeGDP / 1e4

## Logistic regression of decision1 on the rescaled GDP variable
logitModel <- glm(decision1 ~ jGDP, data = icj, family = binomial(logit))

## put together the predicted probability figure for the range of X
## set up ruler: a fine grid of jGDP values and the fitted probability at each
predictions <- data.frame(jGDP=seq(0,3.8,0.001))
predictions$probs <- predict(logitModel, newdata=predictions, type="response")

## Simulation-based 95% band: draw coefficient vectors from a multivariate
## normal centred on the estimates with the estimated covariance matrix,
## compute the implied probability curve for each draw, and take pointwise
## 2.5% / 97.5% quantiles across draws.

## Fixed (arbitrary) seed so the simulated band is the same on every run.
set.seed(20140321)

beta <- coef(logitModel)
## NOTE: this variable shares its name with the vcov() function. Calls such
## as vcov(model) keep working because R looks for a function when a name is
## used in call position.
vcov <- vcov(logitModel)
sims <- 5000

## Draw all coefficient vectors in one call (one row per simulation) rather
## than calling rmvnorm() once per iteration.
betaDraws <- rmvnorm(sims, mean=beta, sigma=vcov)

## One row per simulation, one column per grid point
holder <- matrix(NA_real_, sims, nrow(predictions))
for(i in seq_len(sims)) {
  holder[i,] <- inv.logit(betaDraws[i,1] + betaDraws[i,2] * predictions$jGDP)
}
predictions$bottoms <- apply(holder, 2, quantile, probs=0.025)
predictions$tops <- apply(holder, 2, quantile, probs=0.975)

## Shaded band between the simulated lower and upper bounds, with the fitted
## probability curve drawn on top; the vertical axis is limited to 0.2-0.6
logitPred <- ggplot(predictions, aes(x = jGDP, y = probs)) +
  geom_ribbon(aes(x = jGDP, ymin = bottoms, ymax = tops),
              fill = "grey60", alpha = 0.5) +
  xlab("Judge Per Capita GDP ($10,000 in 1996)") +
  ylab("Probability Judge Rules in Favor of Applicant") +
  geom_line(size = 1) +
  ylim(0.2, 0.6)

## display the figure, then write it to disk
print(logitPred)
ggsave("logitPred.pdf", logitPred, width = 5, height = 5)