##########################################################################
## An Introduction to Empirical Legal Studies
## 	Chapter 8 Replication File
##
## Revised on 3/21/2014 KEC	  
##
## This work is licensed under a Creative Commons Attribution 4.0
## International License. See http://creativecommons.org/licenses/by/4.0
## for more information.
##
## Authors
##	Lee Epstein, Washington University in St. Louis
##  and Andrew D. Martin, University of Michigan
##########################################################################

## Chapter 8

## load packages
library(ggplot2)
library(foreign)
source("ggplotTemplate.R")

## Figure 8.1 Scatterplot of hypothetical sentence length
## Section 8.1, p. 174

## build the hypothetical data: 30 observations with 1 to 30 guilty counts
## and sentences that rise in steps of 20, from 120 up to 700
counts <- 1:30
sentences <- seq(120, 700, by=20)
straightData <- data.frame(counts=counts, sentences=sentences)

## make the scatterplot
straightScatter <- ggplot(data=straightData, aes(x=counts, y=sentences)) + 
  geom_point(size=2)
## the y-axis label reads "months", the same wording used on the other
## sentence-length figures in this file
straightScatter <- straightScatter + 
  xlab("Number of Guilty Counts") + ylab("Sentence (months)")
## widen the x-axis slightly so the first and last points are not clipped
straightScatter <- straightScatter + xlim(0,31)

## show plot and save it
print(straightScatter)
ggsave("straightScatter.pdf", straightScatter, height=5, width=5)

## Figure 8.2 Scatterplot of hypothetical sentence length data with a 
## best-fitting line
## Section 8.1, p. 175

## overlay the line with intercept 100 and slope 20 on the existing
## straightScatter plot
lineIntercept <- 100
lineSlope <- 20
straightScatterLine <- straightScatter +
  geom_abline(slope=lineSlope, intercept=lineIntercept)

## display the figure, then write it to a 5 x 5 PDF
print(straightScatterLine)
ggsave(filename="straightScatterLine.pdf", plot=straightScatterLine,
       width=5, height=5)

## Figure 8.3 Scatterplot of sentence length on number of guilty counts
## Section 8.2, p. 177

## load the Stata data file
ict <- read.dta("ictData.dta")

## keep only the rows with a guilty verdict; a missing verdict never
## qualifies, so the mask contains no NA values
condition <- !is.na(ict$verdict) & ict$verdict=="guilty"
guiltyCases <- ict[condition, ]

## two-column data set for the plot, with incomplete rows removed
regressionData <- na.omit(data.frame(sentence=guiltyCases$sentence,
                                     numGuil=guiltyCases$numGuil))

## make the scatterplot: semi-transparent points so overlapping
## observations show up darker
realScatter <- ggplot(data=regressionData, aes(x=numGuil, y=sentence)) +
  geom_point(size=2, alpha=.2) +
  labs(x="Number of Guilty Counts", y="Sentence (months)") +
  ylim(0,670) +
  xlim(0,31)

## display the figure, then write it to a 5 x 5 PDF
print(realScatter)
ggsave(filename="realScatter.pdf", plot=realScatter, width=5, height=5)

## Figure 8.4 Scatterplot of sentence length on number of guilty counts with 
## linear regression line, estimated using ordinary least squares
## Section 8.2, p. 177

## linear-model smoother drawn in black without a confidence band
olsLayer <- stat_smooth(method="lm", se=FALSE, colour="black")

## add the fitted line to the existing realScatter plot
realScatterLine <- realScatter + olsLayer

## display the figure, then write it to a 5 x 5 PDF
print(realScatterLine)
ggsave(filename="realScatterLine.pdf", plot=realScatterLine,
       width=5, height=5)

## Figure 8.5 Demonstration of ordinary least squares with some hypothetical data.
## Each panel shows the same scatterplot with different lines and, thus, different 
## sets of residuals. The lower right-hand panel is the best fitting line
## Section 8.2, p. 180

## hypothetical X values (50 observations)
## NOTE: the name `exp` shadows base R's exp() as a variable; it is kept
## because the code further down refers to it by this name
exp <- c(5, 5, 4, 11, 16, 4, 5, 5, 5, 6, 7, 5, 3, 3, 7, 4, 4, 3, 5, 
         6, 10, 4, 6, 6, 4, 7, 6, 4, 5, 3, 4, 4, 6, 7, 3, 5, 5, 2, 4, 
         4, 3, 3, 1, 9, 3, 6, 2, 4, 1, 3)
## jitter() adds random noise, so fix the seed first; otherwise the points
## (and therefore the figure) change on every run
set.seed(123456)
exp <- jitter(exp)
## hypothetical Y values (50 observations, paired with exp by position)
inc <- c(52.7, 54.5, 45.7, 80, 81.3, 49.4, 41.8, 44.8, 43, 38.5, 53, 
         46.8, 46.2, 40.1, 52.6, 37.7, 45.2, 51.8, 40, 43.8, 59.2, 44.4, 
         49.7, 58.2, 26.6, 50.4, 46.7, 50.4, 52.8, 39.6, 48.6, 44.4, 58.7, 
         45.8, 36.8, 41.1, 35, 28.5, 48.9, 52.8, 46.3, 34.5, 38.7, 66.6, 
         46.6, 36.2, 26.1, 45.1, 35.3, 34.3)

## Predicted values along a straight line.
##   slope, intercept: coefficients of the line
##   exp:              X values at which to evaluate the line
## Returns intercept + slope * exp, computed elementwise.
predictions <- function(slope, intercept, exp) {
  intercept + slope * exp
}

## put together data for the four scenarios
## each scenario vector is c(intercept, slope); the label is the panel title
scen1 <- c(70, -3)
scen1lab <- "Slope -3, Intercept 70"
scen2 <- c(55, 0.5)
scen2lab <- "Slope 0.5, Intercept 55"
scen3 <- c(10, 5)
scen3lab <- "Slope 5, Intercept 10"
scen4 <- c(29.924, 3.324)
scen4lab <- "Slope 3.3, Intercept 30"
scenLabels <- c(scen1lab, scen2lab, scen3lab, scen4lab)

## this is set up to show four in one panel: the same points are stacked
## four times, each copy paired with one scenario's predictions and label
regressionData <- data.frame(experience=rep(exp, 4), income=rep(inc,4),
                             predictions=c(predictions(scen1[2], scen1[1], exp),
                                           predictions(scen2[2], scen2[1], exp),
                                           predictions(scen3[2], scen3[1], exp),
                                           predictions(scen4[2], scen4[1], exp)),
                             labels=ordered(rep(scenLabels, each=length(exp)),
                                            levels=scenLabels))

## one row per scenario, holding the line to draw in that scenario's panel.
## The `labels` column uses the same ordered levels as regressionData so
## facet_wrap() routes each line to its own panel.
scenLines <- data.frame(intercept=c(scen1[1], scen2[1], scen3[1], scen4[1]),
                        slope=c(scen1[2], scen2[2], scen3[2], scen4[2]),
                        labels=ordered(scenLabels, levels=scenLabels))

## make the scatterplot; the line ranges run from each observed Y to its
## predicted value, i.e. they draw the residuals
olsDemo <- ggplot(data=regressionData, aes(x=experience, y=income, ymin=income, 
                                           ymax=predictions))
olsDemo <- olsDemo + geom_point(size=2) + xlab("X") 
olsDemo <- olsDemo + ylab("Y") + geom_linerange(colour="grey40")
olsDemo <- olsDemo + facet_wrap(~labels, ncol=2)
## slope and intercept are mapped from scenLines rather than passed as
## arguments: when geom_abline() receives slope=/intercept= directly it
## ignores `data`, which would draw all four lines in every panel
olsDemo <- olsDemo + geom_abline(data=scenLines,
                                 aes(slope=slope, intercept=intercept),
                                 size=lineSize)

## show plot and save it
print(olsDemo)
ggsave("olsDemo.pdf", olsDemo, height=10, width=10)

## Figure 8.6 Two linear regression models with similar slopes and intercepts. 
## For Model 1, R-squared = 0.85 and standard error = 1.9. For Model 2, 
## R-squared = 0.4 and standard error = 6.0
## Section 8.3, p. 188

## fix the seed so the simulated data are identical on every run
set.seed(123456)

## simulation settings: sample size, intercept and slope of the line, and
## the error standard deviation used for each of the two models
N <- 50
alpha <- -2
beta <- 1.5
sigma1 <- 2
sigma2 <- 6
model1string <- "Model One"
model2string <- "Model Two"

## draw X once, then the two noise vectors (the order of the random draws
## matters: X first, then model one's noise, then model two's)
X <- runif(N, min=0, max=10)
noise1 <- rnorm(N, mean=0, sd=sigma1)
noise2 <- rnorm(N, mean=0, sd=sigma2)
Y1 <- alpha + beta * X + noise1
Y2 <- alpha + beta * X + noise2

## stack the two models into one long data frame, labelled by model
modelNames <- c(model1string, model2string)
modelFitData <- data.frame(
  X = c(X, X),
  Y = c(Y1, Y2),
  model = factor(rep(modelNames, each=N), levels=modelNames, ordered=TRUE)
  )

## make the figure: one facet per model, stacked in a single column, each
## with its points and a linear-model line without a confidence band
modelFitPlot <- ggplot(data=modelFitData, aes(x=X, y=Y)) +
  geom_point(size=2) +
  labs(x="X", y="Y") +
  facet_wrap(~model, ncol=1) +
  stat_smooth(method="lm", se=FALSE, colour="black")

## display the figure, then write it to a 5 wide x 7 high PDF
print(modelFitPlot)
ggsave(filename="modelFitPlot.pdf", plot=modelFitPlot, width=5, height=7)

## fit each model by OLS, then print the two regression summaries
modelOneFit <- lm(Y1 ~ X)
modelTwoFit <- lm(Y2 ~ X)
print(summary(modelOneFit))
print(summary(modelTwoFit))

## Figure 8.7 Prediction plot for linear regression of sentence length (in months) 
## on the number of counts on which the defendant was convicted. Grey region shows 
## 95% confidence interval for the expected value
## Section 8.3, p. 189

## load the Stata data file and keep only the rows with a guilty verdict
## (a missing verdict never qualifies)
ict <- read.dta("ictData.dta")
condition <- !is.na(ict$verdict) & ict$verdict=="guilty"
guiltyCases <- ict[condition, ]

## collect the seven variables, then drop every row with a missing value
## in any of them (so all seven columns affect which rows are kept)
regressionData <- na.omit(data.frame(sentence=guiltyCases$sentence,
                                     numGuil=guiltyCases$numGuil,
                                     genocide=guiltyCases$genocide,
                                     crimAg=guiltyCases$crimAg,
                                     warCrimes=guiltyCases$warCrimes,
                                     mfTotal=guiltyCases$mfTotal,
                                     afTotal=guiltyCases$afTotal))

## make predicted plot for simple linear regression
## NOTE: to do this for a multiple regression would require setting up the ranges
## manually like the logit example

## the smoother is added before the points so the points are drawn on top
## of the fitted line and its grey band
regPred <- ggplot(regressionData, aes(x=numGuil, y=sentence)) +
  stat_smooth(method="lm", colour="black", size=1, alpha=0.5,
              fill="grey60") +
  geom_point(size=2, alpha=0.2) +
  labs(x="Number of Guilty Counts", y="Sentence (months)") +
  ylim(0,880) +
  xlim(0,31)

## display the figure, then write it to a 5 x 5 PDF
print(regPred)
ggsave(filename="regPred.pdf", plot=regPred, width=5, height=5)