Full repo: https://github.com/jrfarrer/stats701_hw1/
Published file: https://jrfarrer.github.io/stats701_hw1/
Begin by setting up the R session, creating a logger function, and loading packages.
# Set the seed for reproducibility
set.seed(44)
# Set the locale of the session so languages other than English can be used
# (NOTE(review): "en_US.UTF-8" assumes a Unix-like OS; invisible() hides the
# returned locale string)
invisible(Sys.setlocale("LC_ALL", "en_US.UTF-8"))
# Print formatting: 4 significant digits and a wide (220-char) console.
# NOTE(review): this does not suppress scientific notation; that would
# require options(scipen = ...)
options(digits = 4, width = 220)
# Create a logger function
# Log a timestamped message.
#
# Args:
#   msg:   character string to log
#   level: severity label embedded in the log line (default "info")
#   file:  connection or filename passed to cat() (default stdout())
#
# Fixes vs original: the default `file = log_file` referenced an undefined
# global and was then ignored (cat hard-coded stdout()); the timestamp format
# "%H:%M:%S.%OS" printed the seconds twice (e.g. "23:35:18.18"), so %OS3 is
# used for seconds with millisecond precision instead.
logger <- function(msg, level = "info", file = stdout()) {
  cat(paste0("[", format(Sys.time(), "%Y-%m-%d %H:%M:%OS3"), "][", level, "] ", msg, "\n"),
      file = file)
}
# Project directory layout: data/, code/, and viz/ under the project root.
base_dir <- ''
data_dir <- paste0(base_dir, "data/")
code_dir <- paste0(base_dir, "code/")
viz_dir <- paste0(base_dir, "viz/")
# Create each directory if it does not already exist (showWarnings = FALSE
# makes re-running the script a no-op)
for (dir in c(data_dir, code_dir, viz_dir)) {
  dir.create(dir, showWarnings = FALSE)
}
# Create a function that will be used to load/install packages
# Load a package, installing it first if necessary.
#
# Args:
#   p: package name as a character string
#
# "DT" is special-cased: versions <= 0.1 are upgraded from GitHub; all other
# missing packages are installed from CRAN. The outcome is written to the log
# either way.
fn_load_packages <- function(p) {
  if (!is.element(p, installed.packages()[, 1]) || (p == "DT" && !(packageVersion(p) > "0.1"))) {
    if (p == "DT") {
      devtools::install_github('rstudio/DT')
    } else {
      install.packages(p, dep = TRUE, repos = 'http://cran.us.r-project.org')
    }
  }
  # require() returns FALSE (rather than erroring) on a failed load, which
  # lets us log the outcome in both cases
  loaded <- suppressPackageStartupMessages(require(p, character.only = TRUE))
  if (loaded) {
    logger(paste0("Loaded package ", p, " version ", packageVersion(p)))
  } else {
    logger(paste0("Unable to load package ", p))
  }
}
# Create a vector of packages
# Create a vector of packages required by the analysis
packages <- c('tidyverse','ggthemes','knitr','readxl','broom','forecast','stringr',
'ISLR','GGally','gridExtra','leaps','extrafont','pander')
# Use function to load the required packages
# (invisible() suppresses lapply's NULL-list return; logging happens inside)
invisible(lapply(packages, fn_load_packages))
[2016-10-16 23:35:18.18][info] Loaded package tidyverse version 1.0.0
[2016-10-16 23:35:18.18][info] Loaded package ggthemes version 3.2.0
[2016-10-16 23:35:18.18][info] Loaded package knitr version 1.14
[2016-10-16 23:35:18.18][info] Loaded package readxl version 0.1.1
[2016-10-16 23:35:18.18][info] Loaded package broom version 0.4.1
[2016-10-16 23:35:18.18][info] Loaded package forecast version 7.1
[2016-10-16 23:35:18.18][info] Loaded package stringr version 1.1.0
[2016-10-16 23:35:19.19][info] Loaded package ISLR version 1.0
[2016-10-16 23:35:19.19][info] Loaded package GGally version 1.2.0
[2016-10-16 23:35:19.19][info] Loaded package gridExtra version 2.2.1
[2016-10-16 23:35:19.19][info] Loaded package leaps version 2.9
[2016-10-16 23:35:19.19][info] Loaded package extrafont version 0.17
[2016-10-16 23:35:19.19][info] Loaded package pander version 0.6.0
# To load the second font, run the following two lines of code and add the
# name of the user to the users_v vector below
# system(paste0("cp -r ",viz_dir,"fonts/. ~/Library/Fonts/")) # instantaneous
# font_import() # takes approximately 5-10 min
users_v <- c("Jordan")
# Create a color palette
# (named vector of FiveThirtyEight colors from the ggthemes package)
pal538 <- ggthemes_data$fivethirtyeight
# Create a theme to use throughout the analysis
# ggplot2 theme used throughout the analysis (FiveThirtyEight-inspired).
#
# Args:
#   base_size:   base font size (currently only documentation; text size is
#                fixed at 9 to preserve the original look)
#   base_family: font family; DecimaMonoPro for known project users
#                (users_v), Helvetica otherwise
#
# Fix vs original: base_family was accepted but ignored -- the ifelse() was
# recomputed inline for text and title. It is now honored; the default value
# is unchanged, so default behavior is identical.
theme_jrf <- function(base_size = 8, base_family = ifelse(Sys.info()[['user']] %in% users_v, "DecimaMonoPro", "Helvetica")) {
  theme(
    plot.background = element_rect(fill = "#F0F0F0", colour = "#606063"),
    panel.background = element_rect(fill = "#F0F0F0", colour = NA),
    panel.border = element_blank(),
    panel.grid.major = element_line(colour = "#D7D7D8"),
    panel.grid.minor = element_line(colour = "#D7D7D8", size = 0.25),
    # NOTE(review): panel.margin was renamed panel.spacing in ggplot2 2.0;
    # update these three entries when upgrading ggplot2.
    panel.margin = unit(0.25, "lines"),
    panel.margin.x = NULL,
    panel.margin.y = NULL,
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_text(colour = "#A0A0A3"),
    axis.text.x = element_text(vjust = 1, family = 'Helvetica', colour = '#3C3C3C'),
    axis.text.y = element_text(hjust = 1, family = 'Helvetica', colour = '#3C3C3C'),
    legend.background = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = 'bold', colour = '#3C3C3C', hjust = 0),
    # Honor the base_family argument instead of recomputing the ifelse()
    text = element_text(size = 9, family = base_family),
    title = element_text(family = base_family)
  )
}
Let’s use Hadley’s readr package to load the dataset, using the col_names argument to set the column names of the tibble.
# Load the csv with meaningful column names
# skip = 1 drops the export's original header row; col_names supplies tidy
# lower-case names for all 35 columns of the MTURK export
survey_results <- read_csv(paste0(data_dir,'Survey_results_final.csv'), skip = 1,
col_names = c('hitid','hittypeid','title','description','keywords',
'reward','creationtime','maxassignments','requesterannotation',
'assignmentdurationinseconds','autoapprovaldelayinseconds',
'expiration','numberofsimilarhits','lifetimeinseconds',
'assignmentid','workerid','assignmentstatus','accepttime',
'submittime','autoapprovaltime','approvaltime','rejectiontime',
'requesterfeedback','worktime','lifetimeapprovalrate',
'last30daysapprovalrate','last7daysapprovalrate','age',
'education','gender','income','sirius','wharton','approve','reject'))
# Print a few records in the tibble
# (only the analysis variables; the rest are MTURK bookkeeping columns)
survey_results %>%
select(age, education, gender, income, sirius, wharton, worktime)
# Put into a new tibble we'll use for cleaning (there will be a final later)
survey_results_cleaning <- survey_results
We’ll sequentially clean each of the primary variables of the dataset and create exploratory summaries.
Let’s quickly summarize the age variable, noting that it is a character.
survey_results_cleaning %>% group_by(age) %>% summarise(cnt = n()) %>% arrange(age)
We correct some errant values, using our judgement as data analysts and plot a histogram.
# Fix three known bad age entries before converting to integer:
# 'Eighteen (18)' -> "18", 'female' -> NA (not an age), "27`" -> "27"
survey_results_cleaning <-
survey_results %>%
mutate(
age2 = ifelse(age == 'Eighteen (18)', "18", ifelse(age == 'female', NA, ifelse(age == "27`", "27", age)))
, age2 = as.integer(age2)
)
# Histogram of cleaned ages; the two open red circles flag the remaining
# implausible values (4 and 223), which are handled in the next step
ggplot(survey_results_cleaning, aes(x = age2)) +
geom_point(aes(x = 4, y = 1), shape = 1, colour = pal538['red'], fill = NA, size = 6, stroke = 1) +
geom_point(aes(x = 223, y = 1), shape = 1, colour = pal538['red'], fill = NA, size = 6, stroke = 1) +
geom_histogram(binwidth = 1, fill = pal538['blue']) +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
labs(title = "Age", y = "Count", x = "Age (years)")
It looks like we still missed some bad values.
sort(unique(survey_results_cleaning$age2))
[1] 4 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
[29] 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 65 66 67 68 69 74 76 223
We fix those too and plot the histogram.
# Set the implausible ages (4 and 223) to NA
survey_results_cleaning <-
survey_results_cleaning %>%
mutate(
age3 = ifelse(age2 %in% c(4, 223), NA, age2)
)
# Re-plot the age histogram with the bad values removed
ggplot(survey_results_cleaning, aes(x = age3)) + geom_histogram(binwidth = 1, fill = pal538['blue']) +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
labs(title = "Age", y = "Count", x = "Age (years)")
Let’s look at the unique values and counts.
survey_results_cleaning %>% group_by(education) %>% summarise(cnt = n()) %>% arrange(education)
It appears that 0 respondents left the survey on the default which read ‘select one’. We’ll update that to ‘Other’ and modify this variable to be a factor.
# Recode the survey default ("select one") as "Other" and convert education
# to a factor with levels ordered from lowest attainment to highest,
# with "Other" last
survey_results_cleaning <-
survey_results_cleaning %>%
mutate(
education2 = ifelse(education == "select one", "Other", education)
, education2 = factor(education2, levels = c('Less than 12 years; no high school diploma'
, 'High school graduate (or equivalent)'
, 'Some college, no diploma; or Associate’s degree'
, 'Bachelor’s degree or other 4-year degree'
, 'Graduate or professional degree'
, 'Other'))
)
# Check the recoded counts
survey_results_cleaning %>% group_by(education2) %>% summarise(cnt = n()) %>% arrange(education2)
We’ll summarize the gender variable.
survey_results_cleaning %>% group_by(gender) %>% summarise(cnt = n()) %>% arrange(gender)
We update this to be a factor.
# Convert gender to a factor and show counts plus proportions
survey_results_cleaning <-
survey_results_cleaning %>%
mutate(gender2 = as.factor(gender))
survey_results_cleaning %>%
group_by(gender2) %>%
summarise(cnt = n()) %>%
arrange(gender2) %>%
mutate(prop = cnt / sum(cnt))
survey_results_cleaning %>% group_by(income) %>% summarise(cnt = n()) %>% arrange(income)
Let’s convert this to a factor variable.
# Convert income to a factor with levels ordered from lowest band to highest
survey_results_cleaning <-
survey_results_cleaning %>%
mutate(
income2 = factor(income, levels = c('Less than $15,000'
, '$15,000 - $30,000'
, '$30,000 - $50,000'
, '$50,000 - $75,000'
, '$75,000 - $150,000'
, 'Above $150,000'))
)
# Check the recoded counts
survey_results_cleaning %>% group_by(income2) %>% summarise(cnt = n()) %>% arrange(income2)
survey_results_cleaning %>% group_by(sirius) %>% summarise(cnt = n()) %>% arrange(sirius)
survey_results_cleaning %>% group_by(wharton) %>% summarise(cnt = n()) %>% arrange(wharton)
Let’s convert these to factors for better analysis capabilities.
# Convert the Yes/No responses to factors ("Yes" listed first so it is the
# first factor level) and check the counts
survey_results_cleaning <-
survey_results_cleaning %>%
mutate(
sirius2 = factor(sirius, levels = c("Yes","No"))
, wharton2 = factor(wharton, levels = c("Yes","No"))
)
survey_results_cleaning %>% group_by(sirius2) %>% summarise(cnt = n()) %>% arrange(sirius2)
survey_results_cleaning %>% group_by(wharton2) %>% summarise(cnt = n()) %>% arrange(wharton2)
# Histogram of time spent completing the survey (seconds)
ggplot(survey_results_cleaning, aes(x = worktime)) + geom_histogram(binwidth = 1, fill = pal538['blue']) +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
labs(title = "Worktime", y = "Count", x = "Worktime in Seconds")
We select and rename the columns.
# Keep only the cleaned columns, renaming them back to their original names.
# select() renames in the same step (new_name = old_name), replacing the
# separate rename() call; column order is unchanged.
survey_results_cleaning <-
  survey_results_cleaning %>%
  select(
    age = age3
    , education = education2
    , gender = gender2
    , income = income2
    , sirius = sirius2
    , wharton = wharton2
    , worktime
  )
Let’s review the records with missing data.
# Rows with any NA across the cleaned columns
survey_results_cleaning[!complete.cases(survey_results_cleaning), ] %>% print(width = Inf)
We will remove the 7 records that have NAs for sirius or wharton. Without information about the response, we will have trouble making an estimate of p, the proportion of Sirius listeners who listened to Business Radio Powered by the Wharton School.
# The records to be dropped: missing sirius and/or wharton responses
survey_results_cleaning %>%
filter(is.na(sirius) | is.na(wharton))
We put together a final data frame.
# Final analysis dataset: keep only rows where both key responses are present
survey_results_final <-
survey_results_cleaning %>%
filter(!is.na(sirius) & !is.na(wharton))
We previously listened to the podcast Planet Money’s episode about Amazon’s Mechanical Turk program.
# Summary table of the final dataset (pander renders it for the report)
pander(summary(survey_results_final), missing = "", split.table = Inf)
age | education | gender | income | sirius | wharton | worktime |
---|---|---|---|---|---|---|
Min. :18.0 | Less than 12 years; no high school diploma : 10 | Female: 742 | Less than $15,000 :209 | Yes:1358 | Yes: 70 | Min. : 8.0 |
1st Qu.:23.0 | High school graduate (or equivalent) :192 | Male :1009 | $15,000 - $30,000 :366 | No : 399 | No :1687 | 1st Qu.: 17.0 |
Median :28.0 | Some college, no diploma; or Associate’s degree:743 | NA’s : 6 | $30,000 - $50,000 :428 | Median : 21.0 | ||
Mean :30.4 | Bachelor’s degree or other 4-year degree :613 | $50,000 - $75,000 :376 | Mean : 22.5 | |||
3rd Qu.:34.0 | Graduate or professional degree :180 | $75,000 - $150,000:327 | 3rd Qu.: 26.0 | |||
Max. :76.0 | Other : 19 | Above $150,000 : 47 | Max. :108.0 | |||
NA’s :3 | NA’s : 4 |
Variable | Class | Description |
---|---|---|
Age | Integer | The age in years of the survey respondent |
Education | Factor | Level of education attained by the survey respondent |
Gender | Factor | Gender indicated by the survey respondent (Male or Female) |
Income | Factor | Income level provided by the survey respondent |
Sirius | Factor | Response to “Have you ever listened to Sirius Radio?” |
Wharton | Factor | Response to “Have you ever listened to Sirius Business Radio by Wharton?” |
Worktime | Integer | Number of seconds spent completing the survey |
On the surface, we have no reason to believe that the MTURK dataset could be representative of the US population. Knowledge of MTURK is not universal and attracts particular types of individuals willing to perform many small tasks for a minor reward (from Planet Money podcast).
First, we quickly see that the proportion of Sirius listeners is much higher than the given proportion. If the US population is 321.4 million, then the proportion of Sirius listeners is
\[\frac{51.6}{321.4} = 0.1605\]
However, we quickly see that in the survey data from MTURK, the proportion of Sirius listeners is much higher.
# Proportion of respondents who say they have listened to Sirius
(sirius_prop <- survey_results_final %>% summarise(prop_sirius = sum(sirius == "Yes") / n()))
A much higher percentage of MTURK respondents say that they have listened to Sirius.
Second, in order to answer the question “Does this appear to be a random sample from the US population?” empirically we can look at the four characteristics in our final dataset
For age and gender, we download a table called “Population by Age” from US Census Bureau’s Current Population Survey in 2012.
# Download "Population by Age" from the Census Bureau's 2012 CPS.
# NOTE(review): hard-coded government URL -- verify it still resolves, and
# consider caching the file under data_dir for reproducibility.
census_age_gender <- read_csv(url("http://www.census.gov/population/age/data/files/2012/2012gender_table1.csv"),
skip = 6,
col_names = c("age", "all","all_percent","male","male_percent","female","female_percent"),
col_types = cols(age = col_character(),
all = col_number(),
all_percent = col_number(),
male = col_number(),
male_percent = col_number(),
female = col_number(),
female_percent = col_number())
)
census_age_gender
We will need to bucket our MTURK dataset to match the categories of the Census Bureau’s. In doing so we remove the 109 records with an age under 20 or without a listed gender.
# Bucket the MTURK respondents into the census 10-year age bands
# ("20 to 29 years", "30 to 39 years", ...) by decade of age.
# Respondents under 20 or without a gender are dropped to match the census table.
actual <-
survey_results_final %>%
filter(age >= 20 & !is.na(gender)) %>%
mutate(age_bucket = paste0(floor(age / 10), "0 to ", floor(age / 10),"9 years")) %>%
group_by(age_bucket, gender) %>%
summarise(
n = n()
) %>%
ungroup() %>%
mutate(source = "Actual") %>%
select(source, age_bucket, gender, n)
# Sample size after filtering; used below to scale the census counts
actual_size <- sum(actual$n)
actual
Next we clean the Census Bureau’s dataset and scale the expected number of individuals to our dataset size of 1645.
# Clean the census table and scale its counts to the MTURK sample size.
expected <-
census_age_gender %>%
filter(row_number() <= 19) %>% # keep only the data rows
select(age, male, female) %>%
mutate(age = gsub("\\.","", age)) %>% # strip leader dots from row labels
filter(!(age %in% c('Under 5 years','All ages','5 to 9 years','10 to 14 years','15 to 19 years'))) %>%
mutate(age_bucket = paste0(substring(age,1,1), "0 to ", substring(age,1,1),"9 years")) %>% # decade bucket from leading digit
mutate(age_bucket = ifelse(age_bucket == "80 to 89 years", "80 years plus", age_bucket)) %>%
select(-age) %>%
gather(gender, n, -age_bucket) %>% # long form: one row per bucket x gender
group_by(age_bucket, gender) %>%
summarise(n = sum(n)) %>%
ungroup() %>%
mutate(gender = paste0(toupper(substring(gender,1,1)), substring(gender, 2, 999))) %>% # "male" -> "Male"
mutate(percent = n / sum(n)) %>%
mutate(Expected = actual_size * percent) %>% # scale to the MTURK sample size
select(age_bucket, gender, Expected) %>%
gather(source, n, -age_bucket, -gender) %>%
select(source, age_bucket, gender, n)
expected
Then we can combine the two.
# Stack the actual and expected counts into one long data frame.
# bind_rows() is the right verb here: union() is a set operation that
# deduplicates rows and does not guarantee row order, which could silently
# drop a bucket if an actual and expected count ever coincided.
actual_expected <-
  bind_rows(actual, expected) %>%
  mutate(
    source = factor(source, levels = c("Actual","Expected"))
    , gender = factor(gender, levels = c("Male","Female"))
  )
We find that the MTURK sample is younger and more male than the US population. For example, in a sample of 1645 we would expect to find about 156 males aged 20 to 29 years; however, the MTURK sample contains 559 such males, roughly 403 more than expected. In addition, in the US population we would expect roughly 51.8% females and 48.2% males, whereas the MTURK sample is 43.5% female and 56.5% male.
# Side-by-side bars of actual vs expected counts by age band and gender
actual_expected %>%
ggplot(aes(x = source, y = n, fill = source)) +
geom_bar(stat = "identity") +
coord_flip() +
facet_grid(age_bucket ~ gender, switch = "y") +
theme_jrf() +
labs(title = "MTURK is younger and more male than US Population",
y = paste0("Respondents to Survey (", actual_size, ")"), x = NULL) +
scale_fill_manual(values = c(Actual = pal538['blue'][[1]], Expected = pal538['red'][[1]])) +
guides(fill = FALSE) +
geom_text(aes(label = round(n, 0)), hjust = 0, family = "DecimaMonoPro") + # NOTE(review): hard-coded font; other plots fall back to Helvetica for non-project users
scale_y_continuous(expand = c(0.02, 40)) +
theme(strip.text.y = element_text(size = 6))
Looking at education, we download data the US Census Bureau’s Current Population Report that shows statistics on educational attainment. The data is by age and gender, but we aggregate the age section to the total population to compare to the MTURK sample. The table below shows expected vs actual proportions.
# Download the Census Bureau's educational attainment table (2015 CPS).
# NOTE(review): hard-coded government URL -- verify it still resolves.
census_edu <- read_csv(url("https://www.census.gov/hhes/socdemo/education/data/cps/2015/Table%201-01.csv"),
skip = 5)
# Map the census attainment categories onto the survey's six education
# levels and compute expected proportions.
# NOTE(review): the nested ifelse() chain would read better as
# dplyr::case_when(); left as-is to preserve behavior exactly.
edu_expected <-
census_edu %>%
select(1, 3:17) %>%
filter(row_number() %in% c(2:15)) %>% # keep the age-detail rows only
select(-X1) %>%
mutate(`Doctoral degree` = as.integer(gsub(",","",`Doctoral degree`))) %>% # strip thousands separators
summarise_each(funs(sum(., na.rm =TRUE))) %>% # total each category across age rows
gather(education, n) %>%
mutate(
education = ifelse(education == "None", "Other",
ifelse(education %in% c("1st - 4th grade","5th - 6th grade","7th - 8th grade","9th grade",
"10th grade","11th grade /2"), "Less than 12 years; no high school diploma",
ifelse(education == "High school graduate", "High school graduate (or equivalent)",
ifelse(education %in% c("Some college, no degree","Associate's degree, occupational",
"Associate's degree, academic"),
"Some college, no diploma; or Associate’s degree",
ifelse(education == "Bachelor's degree", "Bachelor’s degree or other 4-year degree",
ifelse(education %in% c("Master's degree","Professional degree","Doctoral degree"),
"Graduate or professional degree", NA))))))
, education = factor(education, levels = c('Less than 12 years; no high school diploma'
, 'High school graduate (or equivalent)'
, 'Some college, no diploma; or Associate’s degree'
, 'Bachelor’s degree or other 4-year degree'
, 'Graduate or professional degree'
, 'Other'))
) %>%
group_by(education) %>%
summarise(expected_n = sum(n)) %>%
ungroup() %>%
mutate(expected = expected_n / sum(expected_n))
# Actual education proportions from the MTURK sample
edu_actual <-
survey_results_final %>%
group_by(education) %>%
summarise(actual_n = n()) %>%
ungroup() %>%
mutate(actual = actual_n / sum(actual_n))
# Expected (census) vs actual (MTURK) proportions, formatted as percentages
comparison_tbl_edu <-
inner_join(edu_expected, edu_actual, by = c("education")) %>%
mutate(
expected = paste0(round(100*expected,1), "%")
, actual = paste0(round(100*actual,1), "%")
) %>%
select(-actual_n, -expected_n)
comparison_tbl_edu
We find that the MTURK sample over-indexes on people who have been to college or graduated from college. Notably, in a sample of the US population we would expect to find 27.8% of people to have ‘Some college, no diploma; or Associate’s degree’, but in the MTURK sample 42.3% fit this category of educational attainment.
We use a proportions test to determine if the proportions are indeed different.
# Chi-square test of equal proportions across the six education groups
# (rows = groups; columns = census counts vs sample counts)
edu_matrix <- inner_join(edu_expected, edu_actual, by = c("education")) %>% select(expected_n, actual_n) %>% as.matrix
(edu_prop_test <- prop.test(edu_matrix))
6-sample test for equality of proportions without continuity correction
data: edu_matrix
X-squared = 760, df = 5, p-value <2e-16
alternative hypothesis: two.sided
sample estimates:
prop 1 prop 2 prop 3 prop 4 prop 5 prop 6
0.9999 0.9991 0.9962 0.9955 0.9977 0.9926
Using 6-sample test for equality of proportions without continuity correction we have strong evidence against the null hypothesis that the proportions in the education groups are the same. This provides further evidence that the MTURK sample does not represent the US population.
Looking at income, we download from the US Census Bureau statistics on personal income.
# Download the Census Bureau personal-income workbook into data_dir
# (mode = "wb" is required so the binary .xls survives on Windows)
download.file("http://www2.census.gov/programs-surveys/cps/tables/pinc-01/2016/pinc01_1_1_1.xls",
destfile = paste0(data_dir, 'pinc01_1_1_1.xls'), mode = "wb")
trying URL 'http://www2.census.gov/programs-surveys/cps/tables/pinc-01/2016/pinc01_1_1_1.xls'
Content type 'application/vnd.ms-excel' length 54784 bytes (53 KB)
==================================================
downloaded 53 KB
income <- read_excel(paste0(data_dir, 'pinc01_1_1_1.xls'), skip = 8)
# Collapse the workbook's fine-grained income columns into the survey's
# bands; the survey's top two bands are later merged into "Above $75,000".
# NOTE(review): the row_number() cut points (6, 12, 20, 30) encode the
# column positions of the $15k/$30k/$50k/$75k boundaries in this specific
# workbook layout -- re-check them if the source file format changes.
income_expected <-
income[, c(4:44)] %>%
filter(row_number() == 2) %>%
gather(income, n) %>%
mutate(
n = as.integer(n)
) %>%
select(income, n) %>%
mutate(
income = ifelse(row_number() <= 6, "Less than $15,000",
ifelse(row_number() <= 12, "$15,000 - $30,000",
ifelse(row_number() <= 20, "$30,000 - $50,000",
ifelse(row_number() <= 30, "$50,000 - $75,000",
"Above $75,000"))))
, income = factor(income, levels = c("Less than $15,000","$15,000 - $30,000", "$30,000 - $50,000",
"$50,000 - $75,000", "Above $75,000"))
) %>%
group_by(income) %>%
summarise(
n = sum(n)
) %>%
ungroup() %>%
mutate(expected = n / sum(n)) %>%
mutate(expected_n = n) %>%
select(income, expected_n, expected)
# Actual income distribution from the survey, with the top two bands merged
# into "Above $75,000" to match the census buckets
income_actual <-
survey_results_final %>%
filter(!is.na(income)) %>%
mutate(
income = as.character(income)
, income = ifelse(income %in% c("$75,000 - $150,000","Above $150,000"), "Above $75,000", income)
, income = factor(income, levels = c("Less than $15,000","$15,000 - $30,000", "$30,000 - $50,000",
"$50,000 - $75,000", "Above $75,000"))
) %>%
group_by(income) %>%
summarise(
n = n()
) %>%
ungroup() %>%
mutate(actual = n / sum(n)) %>%
mutate(actual_n = n) %>%
select(income, actual_n, actual)
# Expected vs actual income proportions, formatted as percentages
comparison_tbl_income <-
inner_join(income_expected, income_actual, by = c("income")) %>%
mutate(
expected = paste0(round(100*expected,1), "%")
, actual = paste0(round(100*actual,1), "%")
) %>%
select(-actual_n, -expected_n)
comparison_tbl_income
Looking at the table above we see there is a smaller percentage of lower income respondents than expected (26.7% expected vs. 11.9% actual). In addition, there is a larger percentage of high earning respondents than expected (15.7% expected vs. 21.3% actual).
We use a proportions test to determine if the proportions are indeed different.
# Chi-square test of equal proportions across the five income bands
income_matrix <- inner_join(income_expected, income_actual, by = c("income")) %>% select(expected_n, actual_n) %>% as.matrix
(income_prop_test <- prop.test(income_matrix))
5-sample test for equality of proportions without continuity correction
data: income_matrix
X-squared = 260, df = 4, p-value <2e-16
alternative hypothesis: two.sided
sample estimates:
prop 1 prop 2 prop 3 prop 4 prop 5
0.9966 0.9930 0.9910 0.9883 0.9896
Using 5-sample test for equality of proportions without continuity correction we have strong evidence against the null hypothesis that the proportions in the income groups are the same. This provides further evidence that the MTURK sample does not represent the US population.
Though we might be concerned that our sample does not represent the MTURK population as a whole we have no evidence to support this from the data provided. There should be concern that someone who opts to participate in a survey about satellite radio might be more likely to be a satellite radio listener (unless MTURK workers are much more likely to be Sirius listeners). However, we have no evidence to support this claim.
In thinking about this question we read the articles “Who are these people?” Evaluating the demographic characteristics and political preferences of MTurk survey respondents and “Demographics of Mechanical Turk”.
In order to estimate the number of Wharton listeners in the US we will create a 95% confidence interval of the proportion of Wharton listeners in the MTURK dataset and multiply this by the Sirius radio listeners (51.6 million).
\[\hat{p} \pm z \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}\]
# Estimate p = proportion of Sirius listeners who listen to Wharton,
# using only the respondents who answered "Yes" to the Sirius question.
sirius_listeners <- survey_results_final %>% filter(sirius == "Yes")
n_sirius <- nrow(sirius_listeners)
p_hat <-
  sirius_listeners %>%
  summarise(p_hat = sum(wharton == "Yes") / n()) %>%
  unlist()
# 95% CI for p. NOTE(review): the original used nrow(survey_results_final)
# (the full sample of 1757) in both the critical value and the standard
# error, but p_hat is computed on the Sirius subset, so n_sirius is the
# correct sample size. Downstream prose numbers may shift slightly.
se_p <- sqrt(p_hat * (1 - p_hat) / n_sirius)
t_crit <- qt(0.975, n_sirius - 1)
ci2 <- c(p_hat - t_crit * se_p, p_hat + t_crit * se_p)
# Scale to the 51.6 million US Sirius listeners (results in millions)
pop_p <- p_hat * 51.6
pop_ci <- round(ci2 * 51.6, 2)
We estimate the sample proportion to be 0.0501 and the 95% confidence interval to be
\[(0.0399, 0.0603)\]
Thus we estimate the size of the Wharton listeners in the US to be 2.58 million and the 95% confidence interval to be (in millions)
\[(2.06, 3.11)\]
We conclude that the MTURK sample does not appear to be a random sample from the US population, given the over-representation of Sirius listeners and its age, gender, income, and education characteristics. However, assuming that the sample represents the population, we estimate that there are between 2.06 and 3.11 million listeners of “Business Radio Powered by the Wharton School”.
# Simulate 40 (x, y) pairs from y = 1 + 1.2x + e, e ~ N(0, sd = 2), and plot
x <- seq(0, 1, length = 40)
y <- 1 + 1.2*x + rnorm(40, mean = 0, sd = 2)
ggplot(data_frame(x, y), aes(x = x, y = y)) + geom_point(colour = pal538['blue']) +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
labs(title = "Scatterplot of (x,y) pairs", y = "y", x = "x")
We use the the lm function to create a linear model.
# Fit least squares and display the tidy coefficient table
fit1 <- lm(y ~ x)
tidy(fit1) %>% pander()
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 0.002587 | 0.6573 | 0.003936 | 0.9969 |
x | 2.963 | 1.131 | 2.619 | 0.01259 |
We find the LS estimates to be \(b_0=0.0026\) and \(b_1=2.963\) (see the coefficient table above). Next we overlay the LS equation on the scatterplot.
ggplot(data = fit1$model, aes(x = x, y = y)) + geom_point(colour = pal538['blue']) +
geom_smooth(method="lm", se = TRUE, colour = pal538['red']) +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
labs(title = "LS Equation", y = "y", x = "x")
The 95% confidence interval for \(\beta_1\) is \[2.963 \pm 1.96\times 1.131\] or \[(0.75, 5.18)\]
This 95% confidence interval does indeed contain the true \(\beta_1\) which is \(1.2\).
The RSE is approximately 2.1, which is very close to the true standard deviation of the error of \(\sigma = 2\).
We begin with the given simulation code chunk:
# Simulate n_sim datasets from the same model and record, for each, the LS
# estimate of beta1 and its 95% confidence interval.
x <- seq(0, 1, length = 40)
n_sim <- 100
b1 <- numeric(n_sim) # nsim many LS estimates of beta1 (=1.2)
upper_ci <- numeric(n_sim) # upper bound (comment was swapped in original)
lower_ci <- numeric(n_sim) # lower bound (comment was swapped in original)
t_star <- qt(0.975, 38) # t critical value, df = n - 2 = 38
# Carry out the simulation
for (i in 1:n_sim){
y <- 1 + 1.2 * x + rnorm(40, sd = 2)
lse <- lm(y ~ x)
lse_out <- summary(lse)$coefficients
se <- lse_out[2, 2]
b1[i] <- lse_out[2, 1]
upper_ci[i] <- b1[i] + t_star * se
lower_ci[i] <- b1[i] - t_star * se
}
We will summarize \(\beta_1\).
summary(b1) %>% pander()
Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
---|---|---|---|---|---|
-0.931 | 0.682 | 1.22 | 1.29 | 1.86 | 3.94 |
# Histogram of the 100 LS estimates of beta1, with the true value marked
ggplot(data = data_frame(b1 = b1), aes(x = b1)) + geom_histogram(binwidth = 0.2, fill = pal538['blue']) +
geom_vline(xintercept = 1.2, colour = pal538['red']) +
geom_label(aes(x = 1.2, y = Inf, label = 'beta[1] == 1.2'),
vjust = "inward", hjust = "inward", parse = TRUE, colour = pal538['red']) +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
labs(title = expression("Histogram of LS estimates "~b[1]~" of "~beta[1]), y = "Count", x = expression(b[1]))
The sampling distribution does agree with the theory as most of the LS estimate of \(\beta_1\) are close to 1.2.
# One row per simulated interval; covers = whether the CI contains beta1 = 1.2
ci <- data_frame(n = 1:100, b1 = b1 , lower_ci = lower_ci, upper_ci = upper_ci,
covers = factor(ifelse(lower_ci < 1.2 & upper_ci > 1.2, "Yes", "No"), levels = c("Yes", "No")))
We find that 97 out of 100 95% confidence intervals cover the true \(\beta_1\). We show this graphically below, where the red intervals do not cover the true \(\beta_1\) and the green intervals do cover the true \(\beta_1\).
# Plot the 100 intervals; colour marks whether each covers the true beta1
ggplot(data = ci) +
geom_vline(xintercept = 1.2) +
geom_segment(aes(x = lower_ci, xend = upper_ci, y = n, yend = n, colour = covers)) +
labs(title = "100 Sample Confidence Intervals", y = NULL, x = expression(beta[1])) +
geom_label(aes(x = 1.2, y = Inf, label = 'beta[1] == 1.2'), vjust = "inward", hjust = "inward", parse = TRUE) +
guides(color = guide_legend(title = expression("Covers "~beta[1]~"?"))) +
theme(legend.position = 'bottom') +
theme_jrf() +
scale_x_continuous(expand = c(0.05, 0.01)) + scale_y_continuous(expand = c(0.02, 0.01)) +
scale_colour_manual(values = c('Yes' = pal538['green'][[1]], 'No' = pal538['red'][[1]]))
We begin by loading and tidying the ML Pay dataset.
# Read in the ML Pay dataset
ml_pay <- read_csv(paste0(data_dir, "MLPayData_Total.csv"))
# Let's tidy the dataset into long form: one row per team x year x metric.
# The year is extracted from the raw column name; columns starting with "p"
# are payroll, names containing ".pct" are winning percentage, the rest wins.
# NOTE(review): "." in the str_detect pattern is an unescaped regex wildcard;
# "\\.pct" would be stricter, though it makes no difference for these names.
ml_pay2 <-
ml_pay %>%
rename(team = Team.name.2014) %>%
gather(metric_raw, value, -payroll, -avgwin, -team) %>%
mutate(
year = as.factor(str_extract(metric_raw, "(\\d)+"))
, metric = ifelse(substring(metric_raw,1,1) == "p", "payroll",
ifelse(str_detect(metric_raw, ".pct"), "avgwin", "wins"))
) %>%
select(team, year, metric, value, payroll, avgwin)
ml_pay2
Let’s do a few data quality checks, where we ensure there are 30 teams per year and there are no missing values.
# Data quality checks on the tidied dataset
# Show there are 30 unique teams per year
ml_pay2 %>%
group_by(year) %>%
summarise(
n = n()
, n_distinct = n_distinct(team)
)
# Show that there are no missing values
ml_pay2 %>%
summarise(
na = sum(is.na(value))
, nan = sum(is.nan(value))
)
For the 17 years between 1998 and 2014, we summarize the payroll of the 30 teams.
# Five-number-plus-mean summary of payroll by year
ml_pay2 %>%
filter(metric == "payroll") %>%
group_by(year) %>%
summarise(
min = min(value)
, p25 = quantile(value, .25)
, p50 = quantile(value, .5)
, mean = mean(value)
, p75 = quantile(value, .75)
, max = max(value)
)
The box-plot below show there was a general growth in payroll spending over the 17 years in the MLB. The outlier at the high end of the payroll scale is the New York Yankees.
# Payroll distribution by year (the high outlier is the Yankees)
ml_pay2 %>%
filter(metric == "payroll") %>%
ggplot(aes(year, value)) + geom_boxplot(fill = pal538['blue']) +
theme_jrf() +
labs(title = "Payroll Growth", y = "Team Payroll ($m)", x = NULL)
Let’s identify what the year-over-year (yoy) growth in payroll has been by team.
# Average year-over-year payroll growth by team.
ml_pay2 %>%
  filter(metric == "payroll") %>%
  arrange(team, year) %>%
  group_by(team) %>%
  mutate(
    # lag() compares each year to the previous one within each team,
    # thanks to the group_by above; the first year per team yields NA
    yoy_growth = (value - lag(value)) / lag(value)
  ) %>%
  # (redundant second group_by(team) removed -- grouping persists after mutate)
  summarise(
    avg_yoy_growth = mean(yoy_growth, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  arrange(desc(avg_yoy_growth)) %>%
  print(n = 30)
Let’s summarize this as the yoy growth overall.
# Overall average yoy payroll growth: the mean of the per-team averages.
avg_yoy_growth <-
  ml_pay2 %>%
  filter(metric == "payroll") %>%
  arrange(team, year) %>%
  group_by(team) %>%
  mutate(
    yoy_growth = (value - lag(value)) / lag(value)
  ) %>%
  # (redundant second group_by(team) removed -- grouping persists after mutate)
  summarise(
    avg_yoy_growth = mean(yoy_growth, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  summarise(
    avg_yoy_growth = mean(avg_yoy_growth)
  ) %>%
  unlist()
avg_yoy_growth
avg_yoy_growth
0.1042
Next, we summarize the winning percentage for the 17 years. This is not particularly meaningful, but it is a way to identify any errant values.
# Five-number-plus-mean summary of winning percentage by year
ml_pay2 %>%
filter(metric == "avgwin") %>%
group_by(year) %>%
summarise(
min = min(value)
, p25 = quantile(value, .25)
, p50 = quantile(value, .5)
, mean = mean(value)
, p75 = quantile(value, .75)
, max = max(value)
)
The box-plot below shows the dispersion of winning percentage over time. You can see the dot in 2003 is the Detroit Tigers who lost more games than any American League team in history (43-119).
# Winning-percentage distribution by year (2003 low outlier: Detroit Tigers)
ml_pay2 %>%
filter(metric == "avgwin") %>%
ggplot(aes(year, value)) + geom_boxplot(fill = pal538['blue']) +
scale_y_continuous(labels = scales::percent) +
theme_jrf() +
labs(title = "Winning Percentage", y = "Winning Percentage", x = NULL)
Let’s summarize the two variables across time to get an idea of where the values fall.
# Overall (all years pooled) summary of the two analysis metrics
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
group_by(metric) %>%
summarise(
min = min(value)
, p25 = quantile(value, .25)
, p50 = quantile(value, .5)
, mean = mean(value)
, p75 = quantile(value, .75)
, max = max(value)
) %>%
pander()
metric | min | p25 | p50 | mean | p75 | max |
---|---|---|---|---|---|---|
avgwin | 0.2654 | 0.4444 | 0.50 | 0.5 | 0.5556 | 0.716 |
payroll | 8.3170 | 51.3329 | 73.34 | 78.1 | 94.9997 | 235.295 |
Next, let’s look at scatter plots of payroll vs winning percentage over the 17 years. This plot helps highlight the fact that average payroll increases over time.
# Scatter of payroll vs winning percentage, faceted by year.
# The data argument to geom_text() is a magrittr functional sequence
# (". %>% ..."): ggplot2 applies it to the plot's data to compute the
# per-facet correlation labels.
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
select(-payroll, -avgwin) %>%
spread(metric, value) %>%
ggplot(aes(x = payroll, y = avgwin)) + facet_wrap(~ year) +
geom_point(colour = pal538['blue'], alpha = 0.75) +
geom_smooth(method = "lm", se = FALSE, colour = pal538['red']) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Payroll vs Winning Percentage", y = "Winning Percentage", x = "Payroll ($m)") +
theme_jrf() +
geom_text(data =
. %>%
group_by(year) %>%
summarise(
cor = cor(payroll, avgwin)
),
aes(x = 170, y = .7, label = paste0("cor = ", round(cor, 3))),
size = 3
)
# Average Pearson correlation between payroll and winning percentage
# across the 17 yearly cross-sections
avg_person_cor <-
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
select(-payroll, -avgwin) %>%
spread(metric, value) %>%
group_by(year) %>%
summarise(
cor = cor(payroll, avgwin)
) %>%
ungroup() %>%
summarise(
cor = mean(cor)
) %>%
unlist()
# Avg Pearson Correlation (NOTE(review): the variable name carries the same
# "person"-for-"Pearson" typo; left unchanged in case later chunks use it)
avg_person_cor
cor
0.4402
Let’s show the trends in the two variable by team.
# Spaghetti plot: one line per team, faceted into payroll (top) and
# winning-percentage (bottom) panels with independent y scales.
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
ggplot(aes(x = year, y = value, group = team)) +
facet_grid(metric ~ ., scales = "free_y", switch = "y",
labeller = ggplot2::labeller(metric = c(avgwin = "Winning Percentage", payroll = "Payroll"))) +
geom_line(colour = pal538['blue']) +
guides(color = FALSE) +
theme_jrf() +
labs(title = "Payroll and Winning Percentage by Team", y = NULL, x = NULL)
Summary of Exploratory Analysis:
Let’s build a linear model to predict winning percentage for each of the 17 years. The best way to do this is using nested data frames (tidyr), purrr, and broom.
# Fit a separate lm(avgwin ~ payroll) for each season using nested data
# frames: one row per year, with that year's data and fitted model held
# in list-columns.
lm_by_year <-
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
select(-payroll, -avgwin) %>%
spread(metric, value) %>%
group_by(year) %>%
nest() %>%
mutate(
model = purrr::map(data, ~ lm(avgwin ~ payroll, data = .))
)
Below is a summary of each model. It appears that some of the models are significant at 95% confidence level, but a number of models are not, notably 2012, 2014, and 2000. If we look back at the Payroll vs Winning Percentage plot, we can see that correlation for these years are lower than others.
# One-row fit summary (R^2, F statistic, p-value, ...) per season via
# broom::glance on each stored model
lm_by_year %>%
unnest(model %>% purrr::map(broom::glance)) %>%
select(year, r.squared, adj.r.squared, sigma, statistic, p.value)
Below is the full summary of the model for 1998 and note that the results match the 1998 record above.
# Full summary of the first per-season model (1998)
summary(lm_by_year$model[[1]])
Call:
lm(formula = avgwin ~ payroll, data = .)
Residuals:
Min 1Q Median 3Q Max
-0.12418 -0.03151 -0.00042 0.02347 0.11439
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.350656 0.025803 13.59 7.5e-14 ***
payroll 0.003635 0.000579 6.27 8.8e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0546 on 28 degrees of freedom
Multiple R-squared: 0.584, Adjusted R-squared: 0.57
F-statistic: 39.4 on 1 and 28 DF, p-value: 8.78e-07
However, before we interpret these models let’s check our model assumptions, aside from linearity, we need to check (1) normality and (2) equal variance of the residuals.
The normal Q-Q plots below show that the residuals are approximately normal.
# Normal Q-Q plot of standardized residuals for each season's model.
# The per-facet reference line is computed the same way qqline() does:
# through the first and third quartiles of the residuals against the
# corresponding standard-normal quantiles.
lm_by_year %>%
unnest(model %>% purrr::map(broom::augment)) %>%
ggplot() +
facet_wrap(~ year) +
stat_qq(aes(sample = .std.resid), colour = pal538['blue']) +
geom_abline(data =
. %>%
group_by(year) %>%
summarise(
slope = diff(quantile(.std.resid, c(0.25, 0.75))) / diff(qnorm(c(0.25, 0.75)))
, int = quantile(.std.resid, c(0.25, 0.75))[1L] -
(diff(quantile(.std.resid, c(0.25, 0.75))) /
diff(qnorm(c(0.25, 0.75)))) * qnorm(c(0.25, 0.75))[1L]
),
aes(slope = slope, intercept = int), alpha = 0.5
) +
theme_jrf() +
scale_x_continuous(labels = NULL) +
labs(title = "Normal Q-Q", y = "Standardized Residuals", x = "Theoretical Quantiles")
The fitted values vs residuals plots show approximately equal variance of the residuals (i.e. no heteroscedasticity).
# Fitted-vs-residuals diagnostic per season, with a loess smoother and a
# dashed zero line, to eyeball heteroscedasticity.
lm_by_year %>%
unnest(model %>% purrr::map(broom::augment)) %>%
ggplot(aes(x = .fitted, y = .resid)) +
facet_wrap(~ year, scale = "free_x") +
geom_point(colour = pal538['blue']) +
geom_smooth(method = "loess", colour = pal538['red'], se = FALSE, size = .25, alpha = 0.5) +
geom_hline(yintercept = 0, alpha = 0.5, linetype = 'dashed', color = 'black') +
theme_jrf() +
scale_x_continuous(labels = NULL) +
labs(title = "Fitted Values vs Residuals", y = "Residuals", x = "Fitted Values")
Having checked the model assumptions, we can look at the \(\beta\) that have been estimated by the models. Below are the 34 coefficients (17 models with an intercept term and a coefficient for payroll). The p-values show that for many of the estimated coefficients we do not have enough evidence to reject the null hypothesis that the coefficients differ from 0.
# All 34 coefficient estimates (17 models x 2 terms) via broom::tidy
lm_by_year %>%
unnest(model %>% purrr::map(broom::tidy)) %>%
print(n = 34)
We find that in some years, payroll is a significant variable in predicting winning percentage while in others it is not. We might consider using previous years payroll to predict winning percentage.
Using the aggregated data provided in MLPayData_Total.csv we create a linear regression to predict average winning percentage.
# Pooled model: one observation per team (17-year total payroll and
# average winning percentage, deduplicated from the long table)
fit1 <- lm(avgwin ~ payroll, data = ml_pay2 %>% select(team, payroll, avgwin) %>% distinct())
summary(fit1)
Call:
lm(formula = avgwin ~ payroll, data = ml_pay2 %>% select(team,
payroll, avgwin) %>% distinct())
Residuals:
Min 1Q Median 3Q Max
-0.04003 -0.01749 0.00094 0.01095 0.07030
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4226 0.0153 27.56 < 2e-16 ***
payroll 0.0614 0.0117 5.23 1.5e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.027 on 28 degrees of freedom
Multiple R-squared: 0.494, Adjusted R-squared: 0.476
F-statistic: 27.4 on 1 and 28 DF, p-value: 1.47e-05
We find that model is significant with an F-statistic of 27.38.
# 95% prediction interval for the Red Sox' average winning percentage
red_sox <- ml_pay2 %>% select(team, payroll, avgwin) %>% distinct() %>% filter(team == "Boston Red Sox")
(red_sox_interval <- predict(fit1, red_sox, interval = "prediction", level = .95))
fit lwr upr
1 0.5436 0.4848 0.6025
The 95% prediction interval for the Boston Red Sox is \((0.4848, 0.6025)\) and their winning percentage is \(0.5487\). In other words, the model does quite well in predicting the Boston Red Sox’s winning percentage over the 17 year period.
# 95% prediction interval for the Athletics' average winning percentage
oakland <- ml_pay2 %>% select(team, payroll, avgwin) %>% distinct() %>% filter(team == "Oakland Athletics")
(oakland_interval <- predict(fit1, oakland, interval = "prediction", level = .95))
fit lwr upr
1 0.4742 0.4172 0.5312
The 95% prediction interval for the Oakland Athletics is \((0.4172, 0.5312)\) and their winning percentage is \(0.5445\). In other words, the model under-predicts the Oakland Athletics' winning percentage over the 17 year period, as it falls outside the prediction interval. This was an expected result, as Billy Beane was the general manager for the A's during this period.
To build a model to best predict the winning percentage in 2014, we’ll use the payroll and winning percentage from previous years. We will use the last 5 years of winning percentages and payroll figures. We use our domain knowledge to assume that data more than 5 years back will not have an influence on the current season.
# Build one row per team for 2014, with 1- to 5-season lags of payroll
# and winning percentage as candidate predictors.
last_5years <-
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
select(-payroll, -avgwin) %>%
arrange(team, year) %>%
spread(metric, value) %>%
# lag() within each team so a lag never crosses team boundaries
group_by(team) %>%
mutate(
payroll_lag1 = lag(payroll, 1)
, payroll_lag2 = lag(payroll, 2)
, payroll_lag3 = lag(payroll, 3)
, payroll_lag4 = lag(payroll, 4)
, payroll_lag5 = lag(payroll, 5)
, avgwin_lag1 = lag(avgwin, 1)
, avgwin_lag2 = lag(avgwin, 2)
, avgwin_lag3 = lag(avgwin, 3)
, avgwin_lag4 = lag(avgwin, 4)
, avgwin_lag5 = lag(avgwin, 5)
) %>%
ungroup() %>%
filter(year == 2014) %>%
select(-payroll, -year)
# Full model: 2014 winning percentage on all ten lagged predictors
summary(lm(avgwin ~ . -team, data = last_5years))
Call:
lm(formula = avgwin ~ . - team, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.09422 -0.01651 0.00883 0.02237 0.06299
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.525603 0.115091 4.57 0.00021 ***
payroll_lag1 0.000450 0.000339 1.33 0.20019
payroll_lag2 0.000828 0.000626 1.32 0.20133
payroll_lag3 -0.000859 0.000772 -1.11 0.27985
payroll_lag4 0.000035 0.000898 0.04 0.96933
payroll_lag5 0.000340 0.000770 0.44 0.66390
avgwin_lag1 0.088470 0.162141 0.55 0.59167
avgwin_lag2 0.515698 0.179432 2.87 0.00972 **
avgwin_lag3 -0.528842 0.216923 -2.44 0.02477 *
avgwin_lag4 -0.338486 0.190392 -1.78 0.09144 .
avgwin_lag5 0.050099 0.214129 0.23 0.81751
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0463 on 19 degrees of freedom
Multiple R-squared: 0.6, Adjusted R-squared: 0.39
F-statistic: 2.85 on 10 and 19 DF, p-value: 0.0237
We will iteratively remove the explanatory variable that has the largest p-value for the coefficient estimate.
# Backward elimination step 1: drop payroll_lag4 (largest p-value)
summary(lm(avgwin ~ . -team -payroll_lag4, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.09415 -0.01685 0.00889 0.02227 0.06232
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.526261 0.110967 4.74 0.00012 ***
payroll_lag1 0.000449 0.000329 1.36 0.18831
payroll_lag2 0.000832 0.000601 1.39 0.18127
payroll_lag3 -0.000845 0.000669 -1.26 0.22096
payroll_lag5 0.000363 0.000488 0.74 0.46594
avgwin_lag1 0.089320 0.156605 0.57 0.57479
avgwin_lag2 0.513650 0.167223 3.07 0.00602 **
avgwin_lag3 -0.530114 0.209032 -2.54 0.01966 *
avgwin_lag4 -0.337122 0.182412 -1.85 0.07943 .
avgwin_lag5 0.049045 0.207041 0.24 0.81516
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0451 on 20 degrees of freedom
Multiple R-squared: 0.6, Adjusted R-squared: 0.42
F-statistic: 3.33 on 9 and 20 DF, p-value: 0.0119
# Backward elimination step 2: also drop avgwin_lag5
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5,
data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.09380 -0.01754 0.00645 0.01984 0.06502
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.538249 0.096510 5.58 1.6e-05 ***
payroll_lag1 0.000457 0.000320 1.43 0.1682
payroll_lag2 0.000870 0.000566 1.54 0.1390
payroll_lag3 -0.000837 0.000653 -1.28 0.2137
payroll_lag5 0.000364 0.000477 0.76 0.4536
avgwin_lag1 0.091051 0.152878 0.60 0.5578
avgwin_lag2 0.506139 0.160457 3.15 0.0048 **
avgwin_lag3 -0.532122 0.204112 -2.61 0.0165 *
avgwin_lag4 -0.315147 0.153493 -2.05 0.0527 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0441 on 21 degrees of freedom
Multiple R-squared: 0.599, Adjusted R-squared: 0.446
F-statistic: 3.92 on 8 and 21 DF, p-value: 0.00566
# Backward elimination step 3: also drop avgwin_lag1
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5 -avgwin_lag1, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5 -
avgwin_lag1, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.09901 -0.01594 0.00531 0.02218 0.06506
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.558474 0.089004 6.27 2.6e-06 ***
payroll_lag1 0.000509 0.000304 1.67 0.1082
payroll_lag2 0.000883 0.000557 1.58 0.1272
payroll_lag3 -0.000940 0.000621 -1.51 0.1442
payroll_lag5 0.000390 0.000468 0.83 0.4136
avgwin_lag2 0.540367 0.147599 3.66 0.0014 **
avgwin_lag3 -0.507848 0.197046 -2.58 0.0172 *
avgwin_lag4 -0.321684 0.150838 -2.13 0.0444 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0434 on 22 degrees of freedom
Multiple R-squared: 0.592, Adjusted R-squared: 0.462
F-statistic: 4.56 on 7 and 22 DF, p-value: 0.00282
# Backward elimination step 4: also drop payroll_lag5
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5 -avgwin_lag1 -payroll_lag5, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5 -
avgwin_lag1 - payroll_lag5, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.10148 -0.01826 0.00261 0.02746 0.06409
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.586540 0.081836 7.17 2.7e-07 ***
payroll_lag1 0.000552 0.000297 1.86 0.0764 .
payroll_lag2 0.000748 0.000530 1.41 0.1711
payroll_lag3 -0.000604 0.000469 -1.29 0.2104
avgwin_lag2 0.504779 0.140343 3.60 0.0015 **
avgwin_lag3 -0.464948 0.188934 -2.46 0.0218 *
avgwin_lag4 -0.361271 0.142207 -2.54 0.0183 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0432 on 23 degrees of freedom
Multiple R-squared: 0.579, Adjusted R-squared: 0.47
F-statistic: 5.28 on 6 and 23 DF, p-value: 0.00151
# Backward elimination step 5: also drop payroll_lag3
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5 -avgwin_lag1 -payroll_lag5 -payroll_lag3, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5 -
avgwin_lag1 - payroll_lag5 - payroll_lag3, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.10542 -0.01801 0.00232 0.02926 0.06465
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.570319 0.081966 6.96 3.4e-07 ***
payroll_lag1 0.000392 0.000274 1.43 0.1653
payroll_lag2 0.000244 0.000362 0.67 0.5063
avgwin_lag2 0.519764 0.141771 3.67 0.0012 **
avgwin_lag3 -0.369298 0.176113 -2.10 0.0467 *
avgwin_lag4 -0.419934 0.136562 -3.08 0.0052 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0437 on 24 degrees of freedom
Multiple R-squared: 0.549, Adjusted R-squared: 0.455
F-statistic: 5.84 on 5 and 24 DF, p-value: 0.00115
# Backward elimination step 6: also drop payroll_lag2
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5 -avgwin_lag1 -payroll_lag5 -payroll_lag3 -payroll_lag2, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5 -
avgwin_lag1 - payroll_lag5 - payroll_lag3 - payroll_lag2,
data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.11094 -0.01508 0.00388 0.02677 0.07620
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.565351 0.080740 7.00 2.4e-07 ***
payroll_lag1 0.000499 0.000221 2.25 0.0333 *
avgwin_lag2 0.486773 0.131613 3.70 0.0011 **
avgwin_lag3 -0.324439 0.161292 -2.01 0.0552 .
avgwin_lag4 -0.396055 0.130451 -3.04 0.0055 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0433 on 25 degrees of freedom
Multiple R-squared: 0.54, Adjusted R-squared: 0.467
F-statistic: 7.35 on 4 and 25 DF, p-value: 0.000467
# Backward elimination step 7: also drop avgwin_lag3
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5 -avgwin_lag1 -payroll_lag5 -payroll_lag3 -payroll_lag2 -avgwin_lag3, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5 -
avgwin_lag1 - payroll_lag5 - payroll_lag3 - payroll_lag2 -
avgwin_lag3, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.14646 -0.01807 0.00922 0.02539 0.07564
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.508958 0.080029 6.36 9.8e-07 ***
payroll_lag1 0.000305 0.000211 1.45 0.1598
avgwin_lag2 0.383337 0.128052 2.99 0.0060 **
avgwin_lag4 -0.464237 0.133145 -3.49 0.0018 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0457 on 26 degrees of freedom
Multiple R-squared: 0.466, Adjusted R-squared: 0.404
F-statistic: 7.56 on 3 and 26 DF, p-value: 0.000853
# Backward elimination step 8: also drop payroll_lag1, leaving only
# avgwin_lag2 and avgwin_lag4
summary(lm(avgwin ~ . -team -payroll_lag4 -avgwin_lag5 -avgwin_lag1 -payroll_lag5 -payroll_lag3 -payroll_lag2 -avgwin_lag3 -payroll_lag1, data = last_5years))
Call:
lm(formula = avgwin ~ . - team - payroll_lag4 - avgwin_lag5 -
avgwin_lag1 - payroll_lag5 - payroll_lag3 - payroll_lag2 -
avgwin_lag3 - payroll_lag1, data = last_5years)
Residuals:
Min 1Q Median 3Q Max
-0.14569 -0.01120 0.00467 0.02812 0.07996
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.4791 0.0789 6.07 1.7e-06 ***
avgwin_lag2 0.4543 0.1207 3.77 0.00082 ***
avgwin_lag4 -0.4126 0.1308 -3.15 0.00393 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0466 on 27 degrees of freedom
Multiple R-squared: 0.423, Adjusted R-squared: 0.38
F-statistic: 9.9 on 2 and 27 DF, p-value: 0.000597
Using this process, we would select a model with the two explanatory variables of the average winning percentage from 2 years and 4 years ago. This model seems rather arbitrary because there is nothing particularly meaningful about 2 years or 4 years ago.
Perhaps a better solution is to build features from the previous years. Below we attempt to use simple exponential smoothing (\(\alpha = 0.6\), to weight recent values more) of payroll and winning percentage over 3 years.
alpha <- 0.6
nyears <- 3
# Forecast the next value of a series via simple exponential smoothing.
#
# x  numeric vector of historical values (one rolling window)
# n  minimum number of non-NA observations required; defaults to the
#    global `nyears` so existing single-argument callers are unchanged
# a  smoothing parameter passed to forecast::ses(); defaults to the
#    global `alpha`
#
# Returns the one-step-ahead point forecast, or NA_real_ when the window
# holds fewer than `n` usable observations.
fn_ses_forecast <- function(x, n = nyears, a = alpha) {
  if (sum(!is.na(x)) < n) {
    return(NA_real_)
  }
  # ses() returns a forecast object; coercing to data.frame puts the
  # point forecast in column 1, and [1] keeps the first horizon only
  data.frame(ses(x, alpha = a, initial = 'simple'))[,1][1]
}
# For each team, apply SES over the prior 3 seasons (window -3..-1, i.e.
# excluding the current year) of payroll and winning percentage, then
# keep the 2014 rows as the modeling frame.
last_2years <-
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
select(-payroll, -avgwin) %>%
arrange(team, year) %>%
spread(metric, value) %>%
group_by(team) %>%
mutate(
payroll_ses = rollapply(payroll, FUN = fn_ses_forecast,
width = list(-nyears:-1), fill = NA, by.column = TRUE, align = "right")
, avgwin_ses = rollapply(avgwin, FUN = fn_ses_forecast,
width = list(-nyears:-1), fill = NA, by.column = TRUE, align = "right")
) %>%
ungroup() %>%
# regrouped so the lag stays within each team
group_by(team) %>%
mutate(
payroll_ses_lag1 = lag(payroll_ses,1)
, avgwin_ses_lag1 = lag(avgwin_ses,1)
) %>%
ungroup() %>%
filter(year == 2014) %>%
select(team, avgwin, payroll_ses, avgwin_ses, payroll_ses_lag1, avgwin_ses_lag1)
last_2years
Despite our efforts above, we find that this is not very helpful. We settle on a model that contains the 3-year smoothed payroll figures.
# AIC-based stepwise selection over the SES features; outer parentheses
# both assign and print the chosen model
(ses_fit <- step(lm(avgwin ~ . -team, data = last_2years)))
Start: AIC=-167.8
avgwin ~ (team + payroll_ses + avgwin_ses + payroll_ses_lag1 +
avgwin_ses_lag1) - team
Df Sum of Sq RSS AIC
- avgwin_ses_lag1 1 0.00030 0.0804 -170
- avgwin_ses 1 0.00354 0.0837 -168
- payroll_ses_lag1 1 0.00532 0.0855 -168
- payroll_ses 1 0.00542 0.0855 -168
<none> 0.0801 -168
Step: AIC=-169.7
avgwin ~ payroll_ses + avgwin_ses + payroll_ses_lag1
Df Sum of Sq RSS AIC
- avgwin_ses 1 0.00474 0.0852 -170
- payroll_ses 1 0.00521 0.0856 -170
<none> 0.0804 -170
- payroll_ses_lag1 1 0.00561 0.0860 -170
Step: AIC=-169.9
avgwin ~ payroll_ses + payroll_ses_lag1
Df Sum of Sq RSS AIC
<none> 0.0852 -170
- payroll_ses_lag1 1 0.0134 0.0986 -168
- payroll_ses 1 0.0166 0.1018 -167
Call:
lm(formula = avgwin ~ payroll_ses + payroll_ses_lag1, data = last_2years)
Coefficients:
(Intercept) payroll_ses payroll_ses_lag1
0.49336 0.00124 -0.00123
Using this model, we can predict the winning percentage for the teams in 2015. To do this we just need to change the width of our rolling simple exponential smoothing function to include 2014 data as this was not being used in the previous prediction of 2014.
# Same SES features, but with the rolling window shifted forward one
# season (width -2..0) so the 2014 data is included when predicting 2015.
last_2years_2015 <-
ml_pay2 %>%
filter(metric %in% c("payroll","avgwin")) %>%
select(-payroll, -avgwin) %>%
arrange(team, year) %>%
spread(metric, value) %>%
group_by(team) %>%
mutate(
payroll_ses = rollapply(payroll, FUN = fn_ses_forecast,
width = list((-nyears+1):0), fill = NA, by.column = TRUE, align = "right")
, avgwin_ses = rollapply(avgwin, FUN = fn_ses_forecast,
width = list((-nyears+1):0), fill = NA, by.column = TRUE, align = "right")
) %>%
ungroup() %>%
group_by(team) %>%
mutate(
payroll_ses_lag1 = lag(payroll_ses,1)
, avgwin_ses_lag1 = lag(avgwin_ses,1)
) %>%
ungroup() %>%
filter(year == 2014) %>%
select(team, avgwin, payroll_ses, avgwin_ses, payroll_ses_lag1, avgwin_ses_lag1)
# Predicted 2015 winning percentage for each team from the chosen model
cbind(
last_2years_2015 %>% select(team),
prediction_2015 = predict(ses_fit, last_2years_2015)
) %>% tbl_df %>%
print(n = 30)
We can start off with a basic pairs plot, but it's difficult to read.
# Base-graphics scatterplot matrix of the raw ISLR Auto data
pairs(Auto)
But, we can do much better than that using ggpairs. Here we can learn much more about our dataset.
# Type the ISLR Auto data properly:
#  - cylinders becomes a factor (cylinder count is categorical)
#  - year (two-digit) becomes a four-digit integer, e.g. 70 -> 1970
#  - year2 is a factor copy of the four-digit year
#  - origin codes 1/2/3 are labelled American/European/Japanese
# BUG FIX: year2 previously re-prepended "19" to the already four-digit
# year (mutate() evaluates sequentially), yielding levels like "191970"
# as seen in the published summary output.
Auto_proper <-
Auto %>% tbl_df %>%
mutate(
cylinders = as.factor(cylinders)
, year = as.integer(paste0("19", year))
, year2 = as.factor(year)
, origin = factor(origin, labels = c('American', 'European', 'Japanese'))
, name = as.character(name)
)
# ggpairs matrix on the typed data, dropping the id-like name column and
# the duplicate factor copy of year
ggpairs(Auto_proper %>% select(-name, -year2)) + theme_jrf()
From this plot alone, we can glean a lot information about the Auto dataset.
More points can be made but this is a strong starting point.
Before going much further, let’s check to ensure we have no missing data.
# complete.cases() flags rows with no NAs; a count of 0 incomplete rows
# confirms there is no missing data
sum(!complete.cases(Auto_proper))
[1] 0
We can do some summary statistics to get a feel of the bounds of the variables
# Per-column summaries: quartiles for numeric columns, level counts for
# factors, length/class for character columns
sapply(Auto_proper, summary)
$mpg
Min. 1st Qu. Median Mean 3rd Qu. Max.
9.0 17.0 22.8 23.4 29.0 46.6
$cylinders
3 4 5 6 8
4 199 3 83 103
$displacement
Min. 1st Qu. Median Mean 3rd Qu. Max.
68 105 151 194 276 455
$horsepower
Min. 1st Qu. Median Mean 3rd Qu. Max.
46.0 75.0 93.5 104.0 126.0 230.0
$weight
Min. 1st Qu. Median Mean 3rd Qu. Max.
1610 2230 2800 2980 3610 5140
$acceleration
Min. 1st Qu. Median Mean 3rd Qu. Max.
8.0 13.8 15.5 15.5 17.0 24.8
$year
Min. 1st Qu. Median Mean 3rd Qu. Max.
1970 1970 1980 1980 1980 1980
$origin
American European Japanese
245 68 79
$name
Length Class Mode
392 character character
$year2
191970 191971 191972 191973 191974 191975 191976 191977 191978 191979 191980 191981 191982
29 27 28 40 26 30 34 28 36 29 27 28 30
# Simple regression: MPG on model year alone
auto_fit1 <- lm(mpg ~ year, data = Auto_proper)
summary(auto_fit1)
Call:
lm(formula = mpg ~ year, data = Auto_proper)
Residuals:
Min 1Q Median 3Q Max
-12.021 -5.441 -0.441 4.974 18.209
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.41e+03 1.73e+02 -13.9 <2e-16 ***
year 1.23e+00 8.74e-02 14.1 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.36 on 390 degrees of freedom
Multiple R-squared: 0.337, Adjusted R-squared: 0.335
F-statistic: 198 on 1 and 390 DF, p-value: <2e-16
We find that model year is a significant variable at the 0.05 level. We have strong evidence against the hypothesis that the coefficient associated with year is equal to 0 (P-value = 1.076e-36).
We estimate that for each additional year (car being newer) a car's MPG increases by 1.23. For example, for a car with model year 1980 we estimate 28.3912 mpg and for a car with model year 1981 we estimate 29.6212. The difference a year makes in the estimate is \(29.6212 - 28.3912 = 1.23\).
# Add horsepower as a second predictor alongside model year
auto_fit2 <- lm(mpg ~ horsepower + year, data = Auto_proper)
summary(auto_fit2)
Call:
lm(formula = mpg ~ horsepower + year, data = Auto_proper)
Residuals:
Min 1Q Median 3Q Max
-12.077 -3.078 -0.431 2.588 15.315
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.26e+03 1.31e+02 -9.61 <2e-16 ***
horsepower -1.32e-01 6.34e-03 -20.76 <2e-16 ***
year 6.57e-01 6.63e-02 9.92 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.39 on 389 degrees of freedom
Multiple R-squared: 0.685, Adjusted R-squared: 0.684
F-statistic: 424 on 2 and 389 DF, p-value: <2e-16
Year is still significant at the 0.05 level. We have strong evidence against the hypothesis that the coefficient associated with year is equal to 0 (P-value = 7.994e-21).
For cars with the same horsepower, we estimate that with each additional year (car being newer) a car's MPG increases by 0.6573.
We show the two confidence intervals for the coefficient of year between the two models.
confint(auto_fit1, "year", level = 0.95)
2.5 % 97.5 %
year 1.058 1.402
confint(auto_fit2, "year", level = 0.95)
2.5 % 97.5 %
year 0.527 0.7875
These two confidence intervals are different. To a non-statistician, we would describe this difference as
In our first model to predict MPG, we only use the model year of the car. In our second model, we include horsepower which explains part of the variation in MPG between cars. In other words, the effect of the model year on MPG is smaller when we include the variation explained by horsepower.
# Interaction model: horsepower * year expands to both main effects plus
# the horsepower:year interaction term
auto_fit3 <- lm(mpg ~ horsepower * year, data = Auto_proper)
summary(auto_fit3)
Call:
lm(formula = mpg ~ horsepower * year, data = Auto_proper)
Residuals:
Min 1Q Median 3Q Max
-12.349 -2.451 -0.456 2.406 14.444
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.29e+03 3.19e+02 -13.5 <2e-16 ***
horsepower 3.14e+01 3.08e+00 10.2 <2e-16 ***
year 2.19e+00 1.61e-01 13.6 <2e-16 ***
horsepower:year -1.60e-02 1.56e-03 -10.2 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.9 on 388 degrees of freedom
Multiple R-squared: 0.752, Adjusted R-squared: 0.75
F-statistic: 393 on 3 and 388 DF, p-value: <2e-16
The interaction term is significant at the 0.05 level. We have strong evidence against the hypothesis that the coefficient associated with the interaction of year and horsepower is equal to 0 (P-value = 7.367e-22)
Now the effect of year cannot be interpreted uniformly when holding the other variable horsepower constant as it depends on the value of horsepower. Thus, we show the effect of a 1 year increase in model year (one year newer), on the 25th percentile, median, and 75th percentile horsepower values in our dataset.
Horsepower = 75 | Horsepower = 93.5 | Horsepower = 126 | |
---|---|---|---|
Effect of 1 year increase in model year | 0.9951 | 0.6999 | 0.1812 |
We have the cylinder variable coded as a categorical variable because the number of cylinders is a characteristic of the car, rather than a feature that can be easily changed. In other words, a 1 unit change in cylinders is not really meaningful, as most engines are made with cylinder counts in multiples of 2.
# MPG distribution by cylinder count, dropping the rare 3- and
# 5-cylinder cars (only 4 and 3 observations respectively)
Auto_proper %>%
filter(!(cylinders %in% c("3","5"))) %>%
ggplot(aes(x = cylinders, y = mpg, fill = cylinders)) +
scale_fill_manual("Cylinders", values = c('4' = pal538['blue'][[1]], '6' = pal538['green'][[1]], '8' = pal538['red'][[1]])) +
geom_boxplot() +
theme_jrf() +
labs(title = "MPG vs Cylinders", x = "Cylinders", y = "MPG")
Per the question, we will use cylinders as an integer (not categorical).
# Model 1: cylinders treated as a quantitative (integer) predictor
auto_fit4 <- lm(mpg ~ horsepower + as.integer(cylinders), data = Auto_proper)
summary(auto_fit4)
Call:
lm(formula = mpg ~ horsepower + as.integer(cylinders), data = Auto_proper)
Residuals:
Min 1Q Median 3Q Max
-12.392 -2.965 -0.318 2.149 16.634
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 40.72614 0.65869 61.83 <2e-16 ***
horsepower -0.08617 0.00985 -8.75 <2e-16 ***
as.integer(cylinders) -2.57965 0.28486 -9.06 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.46 on 389 degrees of freedom
Multiple R-squared: 0.675, Adjusted R-squared: 0.673
F-statistic: 403 on 2 and 389 DF, p-value: <2e-16
Cylinders is significant at the 0.01 level. We have strong evidence against the hypothesis that the coefficient associated with cylinders is equal to 0 (P-value = 6.634e-18).
We estimate that, holding horsepower constant, for each additional cylinder in the car, the car's MPG is 2.5796 lower.
# Model 2: cylinders as a factor — one dummy per level (baseline: 3)
auto_fit5 <- lm(mpg ~ horsepower + cylinders, data = Auto_proper)
summary(auto_fit5)
Call:
lm(formula = mpg ~ horsepower + cylinders, data = Auto_proper)
Residuals:
Min 1Q Median 3Q Max
-9.59 -2.71 -0.61 1.90 16.33
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 30.7761 2.4128 12.76 <2e-16 ***
horsepower -0.1030 0.0113 -9.09 <2e-16 ***
cylinders4 6.5734 2.1692 3.03 0.0026 **
cylinders5 5.0737 3.2666 1.55 0.1212
cylinders6 -0.3441 2.1858 -0.16 0.8750
cylinders8 0.4974 2.2764 0.22 0.8272
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.27 on 386 degrees of freedom
Multiple R-squared: 0.705, Adjusted R-squared: 0.701
F-statistic: 184 on 5 and 386 DF, p-value: <2e-16
Cylinders is significant at the 0.01 level. If one of the coefficients of the factor levels in the model is significant, then the variable as whole is significant. We then use ANOVA to compare the two models.
# Nested-model F test: does the factor coding of cylinders explain
# significantly more variation than the integer coding?
anova(auto_fit4, auto_fit5)
Analysis of Variance Table
Model 1: mpg ~ horsepower + as.integer(cylinders)
Model 2: mpg ~ horsepower + cylinders
Res.Df RSS Df Sum of Sq F Pr(>F)
1 389 7752
2 386 7037 3 715 13.1 3.8e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
We have strong evidence that the model using a categorical variable for cylinders better explains the variation in MPG than the model that uses a quantitative variable for cylinders (P-value = 3.782e-08).
The fundamental difference between model 1 and model 2 is that model 1 assumes that there can be an incremental increase in cylinders, whereas model 2 assumes that there are distinct cylinder-count categories. In reality, you cannot increase a car's cylinders by 0.25, so model 1 is not practically valid. Model 2 recognizes the nature of the cylinder variable and how cars are made, thus being a practically applicable model.
The plots below provide a comparison between the two models.
# Grid of candidate cylinder values used to draw model-1 regression lines
cylinders <- seq(from = 2, to = 8, by = 0.25)
# Model 1 treats cylinders as quantitative: each cylinder value shifts
# the intercept by the cylinders coefficient while the horsepower slope
# is shared across all lines.
int_line <- data_frame(
cylinders = cylinders
, int = coef(auto_fit4)[[1]] + coef(auto_fit4)[[3]]*cylinders
, slope = coef(auto_fit4)[[2]]
)
g1 <-
ggplot(Auto_proper, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.5) +
geom_abline(data = int_line, aes(intercept = int, slope = slope, colour = as.factor(cylinders))) +
theme_jrf() +
# typo fix in the displayed title: "Quantative" -> "Quantitative"
labs(title = "Model 1: Quantitative Cylinders", x = "Horsepower", y = "MPG") +
guides(colour = guide_legend(title = "Cylinders (sample)"))
# Model 2 treats cylinders as categorical: one fitted line per level
g2 <-
ggplot(Auto_proper, aes(x = horsepower, y = mpg, colour = cylinders)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
theme_jrf() +
labs(title = "Model 2: Categorical Cylinders", x = "Horsepower", y = "MPG") +
scale_colour_manual("Cylinders", values = c('3' = "#ffff00",
'4' = pal538['blue'][[1]],
'5' = pal538['dkgray'][[1]],
'6' = pal538['green'][[1]],
'8' = pal538['red'][[1]]))
# Side-by-side comparison of the two model fits
grid.arrange(g1, g2, ncol = 2)
First we make a dataframe of the car that we will predict MPG.
# Hypothetical 1983 car whose MPG we want to predict; cylinders must be
# a factor carrying the same level set used when fitting
future_car <- data_frame(
year = 1983
, length = 180
, cylinders = factor(8, levels = c(3,4,5,6,8))
, displacement = 350
, horsepower = 260
, weight = 4000
)
Reviewing the diagonal from the pairs plot in the exploratory analysis, we note that the continuous predictors and the dependent variable MPG are all somewhat normally distributed, and we decide not to perform any transformations.
We will use the leaps package with each of the 3 methods. With each we will:
# Best-subset selection: exhaustive search over all predictors
# (name and year2 excluded; up to 11 terms in the model)
auto_fit6 <- regsubsets(mpg ~ ., data = Auto_proper %>% select(-name, -year2), nvmax = 11, method="exhaustive")
auto_fit6_sum <- summary(auto_fit6)
# Matrix showing which predictors enter at each model size
as_data_frame(auto_fit6_sum$outmat) %>% print(width = Inf)
# Hoist the optimal model size under each criterion so each which.min/
# which.max is computed once instead of three times
opt6 <- c(which.min(auto_fit6_sum$cp), which.min(auto_fit6_sum$bic), which.max(auto_fit6_sum$adjr2))
# Plot Cp, BIC and adjusted R^2 against the number of predictors,
# labelling the optimum of each criterion
data_frame(
  predictors = seq_along(auto_fit6_sum$cp)
  , cp = auto_fit6_sum$cp
  , bic = auto_fit6_sum$bic
  , adjr2 = auto_fit6_sum$adjr2
) %>%
  gather(metric, value, -predictors) %>%
  mutate(metric = factor(metric, levels = c("cp","bic","adjr2"))) %>%
  ggplot(aes(x = predictors, y = value, colour = metric)) +
  facet_grid(metric ~ ., scale = "free_y", switch = "y",
             labeller = ggplot2::labeller(metric = c(cp = "Cp", bic = "BIC", adjr2 = "Adjusted R^2"))) +
  geom_vline(xintercept = 3, alpha = 0.5) + geom_line() + geom_point() +
  geom_label(data = data_frame(
    predictors = opt6
    , metric = factor(c("cp","bic","adjr2"), levels = c("cp","bic","adjr2"))
    , value = c(min(auto_fit6_sum$cp), min(auto_fit6_sum$bic), max(auto_fit6_sum$adjr2))
    , label = paste0("Optimal\nd=", opt6)
    , vjust = c(-.5, -.5, 1.25)
  ), aes(x = predictors, y = value, label = label, vjust = vjust), family = "DecimaMonoPro") +
  theme_jrf() +
  labs(title = "Exhaustive Search", x = "# of Predictors", y = NULL) +
  # callout for the visually-identified elbow at 3 predictors
  geom_label(data = data_frame(x = 3, y = 300, metric = factor(c("cp"), levels = c("cp","bic","adjr2")),
             label = "Elbow with\n3 predictors"), aes(x=x,y=y,label=label), colour = "black", hjust = -.1,
             family = "DecimaMonoPro") +
  scale_colour_manual(guide = FALSE, values = c(pal538['red'][[1]], pal538['green'][[1]], pal538['blue'][[1]]))
# Best-subset selection: forward stepwise search (same setup as exhaustive)
auto_fit7 <- regsubsets(mpg ~ ., data = Auto_proper %>% select(-name, -year2), nvmax = 11, method="forward")
auto_fit7_sum <- summary(auto_fit7)
# Matrix showing which predictors enter at each model size
as_data_frame(auto_fit7_sum$outmat) %>% print(width = Inf)
# Hoist the optimal model size under each criterion (computed once)
opt7 <- c(which.min(auto_fit7_sum$cp), which.min(auto_fit7_sum$bic), which.max(auto_fit7_sum$adjr2))
# Plot Cp, BIC and adjusted R^2 against the number of predictors
data_frame(
  predictors = seq_along(auto_fit7_sum$cp)
  , cp = auto_fit7_sum$cp
  , bic = auto_fit7_sum$bic
  , adjr2 = auto_fit7_sum$adjr2
) %>%
  gather(metric, value, -predictors) %>%
  mutate(metric = factor(metric, levels = c("cp","bic","adjr2"))) %>%
  ggplot(aes(x = predictors, y = value, colour = metric)) +
  facet_grid(metric ~ ., scale = "free_y", switch = "y",
             labeller = ggplot2::labeller(metric = c(cp = "Cp", bic = "BIC", adjr2 = "Adjusted R^2"))) +
  geom_vline(xintercept = 3, alpha = 0.5) + geom_line() + geom_point() +
  geom_label(data = data_frame(
    predictors = opt7
    , metric = factor(c("cp","bic","adjr2"), levels = c("cp","bic","adjr2"))
    , value = c(min(auto_fit7_sum$cp), min(auto_fit7_sum$bic), max(auto_fit7_sum$adjr2))
    , label = paste0("Optimal\nd=", opt7)
    , vjust = c(-.5, -.5, 1.25)
  ), aes(x = predictors, y = value, label = label, vjust = vjust), family = "DecimaMonoPro") +
  theme_jrf() +
  labs(title = "Forward Search", x = "# of Predictors", y = NULL) +
  # callout for the visually-identified elbow at 3 predictors
  geom_label(data = data_frame(x = 3, y = 300, metric = factor(c("cp"), levels = c("cp","bic","adjr2")),
             label = "Elbow with\n3 predictors"), aes(x=x,y=y,label=label), colour = "black", hjust = -.1,
             family = "DecimaMonoPro") +
  scale_colour_manual(guide = FALSE, values = c(pal538['red'][[1]], pal538['green'][[1]], pal538['blue'][[1]]))
# Best-subset selection: backward stepwise search
auto_fit8 <- regsubsets(mpg ~ ., data = Auto_proper %>% select(-name, -year2), nvmax = 11, method="backward")
# BUG FIX: the original summarized auto_fit7 (the forward fit) here, so the
# "backward" table and plot actually showed forward-search results
auto_fit8_sum <- summary(auto_fit8)
# Matrix showing which predictors enter at each model size
as_data_frame(auto_fit8_sum$outmat) %>% print(width = Inf)
# Hoist the optimal model size under each criterion (computed once)
opt8 <- c(which.min(auto_fit8_sum$cp), which.min(auto_fit8_sum$bic), which.max(auto_fit8_sum$adjr2))
# Plot Cp, BIC and adjusted R^2 against the number of predictors
data_frame(
  predictors = seq_along(auto_fit8_sum$cp)
  , cp = auto_fit8_sum$cp
  , bic = auto_fit8_sum$bic
  , adjr2 = auto_fit8_sum$adjr2
) %>%
  gather(metric, value, -predictors) %>%
  mutate(metric = factor(metric, levels = c("cp","bic","adjr2"))) %>%
  ggplot(aes(x = predictors, y = value, colour = metric)) +
  facet_grid(metric ~ ., scale = "free_y", switch = "y",
             labeller = ggplot2::labeller(metric = c(cp = "Cp", bic = "BIC", adjr2 = "Adjusted R^2"))) +
  geom_vline(xintercept = 3, alpha = 0.5) + geom_line() + geom_point() +
  geom_label(data = data_frame(
    predictors = opt8
    , metric = factor(c("cp","bic","adjr2"), levels = c("cp","bic","adjr2"))
    , value = c(min(auto_fit8_sum$cp), min(auto_fit8_sum$bic), max(auto_fit8_sum$adjr2))
    , label = paste0("Optimal\nd=", opt8)
    , vjust = c(-.5, -.5, 1.25)
  ), aes(x = predictors, y = value, label = label, vjust = vjust), family = "DecimaMonoPro") +
  theme_jrf() +
  labs(title = "Backward Search", x = "# of Predictors", y = NULL) +
  # callout for the visually-identified elbow at 3 predictors
  geom_label(data = data_frame(x = 3, y = 300, metric = factor(c("cp"), levels = c("cp","bic","adjr2")),
             label = "Elbow with\n3 predictors"), aes(x=x,y=y,label=label), colour = "black", hjust = -.1,
             family = "DecimaMonoPro") +
  scale_colour_manual(guide = FALSE, values = c(pal538['red'][[1]], pal538['green'][[1]], pal538['blue'][[1]]))
In all 3 methods, we find that there is an elbow in the information criteria at 3 predictors. These three predictors are
Regarding (3), this indicates that we might want to try creating a binary variable, whether or not the car is 6 cylinders. We will create 4 models
Model 1: Cylinders - all levels
# Flag 6-cylinder cars with a binary indicator column
Auto_proper2 <- Auto_proper %>%
  mutate(is_6cylinder = cylinders == 6)
# Model 1: cylinders kept as a factor with all of its levels
auto_fit9 <- lm(mpg ~ weight + year + cylinders, data = Auto_proper2)
summary(auto_fit9)
Call:
lm(formula = mpg ~ weight + year + cylinders, data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-8.618 -2.047 -0.129 1.772 13.882
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.45e+03 9.32e+01 -15.55 < 2e-16 ***
weight -6.17e-03 4.38e-04 -14.06 < 2e-16 ***
year 7.51e-01 4.71e-02 15.94 < 2e-16 ***
cylinders4 7.01e+00 1.62e+00 4.33 1.9e-05 ***
cylinders5 8.53e+00 2.47e+00 3.45 0.00061 ***
cylinders6 4.04e+00 1.68e+00 2.41 0.01649 *
cylinders8 6.19e+00 1.80e+00 3.44 0.00064 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.2 on 385 degrees of freedom
Multiple R-squared: 0.834, Adjusted R-squared: 0.832
F-statistic: 323 on 6 and 385 DF, p-value: <2e-16
Model 2: Binary 6-cylinder
# Model 2: replace the full cylinder factor with the binary 6-cylinder flag
auto_fit10 <- lm(mpg ~ weight + year + is_6cylinder, data = Auto_proper2)
summary(auto_fit10)
Call:
lm(formula = mpg ~ weight + year + is_6cylinder, data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-9.196 -2.005 -0.116 1.824 13.929
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.48e+03 9.36e+01 -15.78 < 2e-16 ***
weight -6.45e-03 2.07e-04 -31.15 < 2e-16 ***
year 7.69e-01 4.73e-02 16.27 < 2e-16 ***
is_6cylinderTRUE -2.54e+00 4.09e-01 -6.22 1.3e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.27 on 388 degrees of freedom
Multiple R-squared: 0.826, Adjusted R-squared: 0.824
F-statistic: 612 on 3 and 388 DF, p-value: <2e-16
Model 3: Binary 6-cylinder & Horsepower
# Model 3: binary 6-cylinder flag plus horsepower
auto_fit11 <- lm(mpg ~ weight + year + is_6cylinder + horsepower, data = Auto_proper2)
summary(auto_fit11)
Call:
lm(formula = mpg ~ weight + year + is_6cylinder + horsepower,
data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-8.942 -2.027 -0.059 1.757 13.851
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.39e+03 9.78e+01 -14.24 < 2e-16 ***
weight -5.47e-03 4.13e-04 -13.27 < 2e-16 ***
year 7.27e-01 4.94e-02 14.71 < 2e-16 ***
is_6cylinderTRUE -2.92e+00 4.28e-01 -6.81 3.7e-11 ***
horsepower -2.57e-02 9.43e-03 -2.72 0.0067 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.25 on 387 degrees of freedom
Multiple R-squared: 0.829, Adjusted R-squared: 0.827
F-statistic: 469 on 4 and 387 DF, p-value: <2e-16
Model 4: Cylinders - all levels & Horsepower
# Model 4: full cylinder factor plus horsepower
auto_fit12 <- lm(mpg ~ weight + year + cylinders + horsepower, data = Auto_proper2)
summary(auto_fit12)
Call:
lm(formula = mpg ~ weight + year + cylinders + horsepower, data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-8.723 -1.996 -0.079 1.773 13.781
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.39e+03 9.63e+01 -14.47 < 2e-16 ***
weight -5.64e-03 4.99e-04 -11.30 < 2e-16 ***
year 7.23e-01 4.87e-02 14.86 < 2e-16 ***
cylinders4 6.65e+00 1.62e+00 4.10 5e-05 ***
cylinders5 7.90e+00 2.48e+00 3.19 0.00155 **
cylinders6 3.68e+00 1.68e+00 2.19 0.02889 *
cylinders8 6.52e+00 1.80e+00 3.63 0.00032 ***
horsepower -2.16e-02 9.94e-03 -2.17 0.03070 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.19 on 384 degrees of freedom
Multiple R-squared: 0.836, Adjusted R-squared: 0.833
F-statistic: 280 on 7 and 384 DF, p-value: <2e-16
Now that we have 4 models, we can compare the AIC (equivalent to Mallows' Cp in linear regression) and BIC. If there is not a significant difference, we will use the simplest model (model 2).
# AIC/BIC comparison table for the four candidate models
model_labels <- c("1: Cylinders - all levels", "2: Binary 6-cylinder",
                  "3: Binary 6-cylinder & Horsepower","4: Cylinders - all levels & Horsepower")
data_frame(
  Model = model_labels,
  AIC = AIC(auto_fit9, auto_fit10, auto_fit11, auto_fit12)$AIC,
  BIC = BIC(auto_fit9, auto_fit10, auto_fit11, auto_fit12)$BIC
) %>%
  kable()
Model | AIC | BIC |
---|---|---|
1: Cylinders - all levels | 2034 | 2066 |
2: Binary 6-cylinder | 2048 | 2068 |
3: Binary 6-cylinder & Horsepower | 2042 | 2066 |
4: Cylinders - all levels & Horsepower | 2031 | 2067 |
Model 2 has higher AIC and BIC values compared to the other 3 models, indicating it explains less variation in MPG. However, the difference is not large and it is the simplest model. We will use model 2 as our final model.
We notice that we might want a quadratic term for the predictor weight by looking at the following charts. We may be concerned about overfitting, particularly for 6-cylinder cars (i.e. 1970 and 1982). However, we know that we will be predicting the MPG of an 8-cylinder car.
# Weight vs MPG by model year, with per-group quadratic fits split on the
# 6-cylinder flag, as visual evidence for adding a weight^2 term.
# FIX: the original called theme_jrf(base_size), passing a bare `base_size`
# symbol that is never defined in this document; every other plot calls
# theme_jrf() with no arguments, so we do the same here.
Auto_proper2 %>%
  select(mpg, weight, year, cylinders, is_6cylinder) %>%
  ggplot(aes(x = weight, y = mpg, colour = is_6cylinder)) +
  facet_wrap(~ year) +
  geom_point(alpha = 0.5) +
  theme_jrf() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(title = "Evidence for adding Quadratic Term for Weight", x = "Weight", y = "MPG") +
  scale_colour_manual("Is 6 Cylinder?", values = c('TRUE' = pal538['green'][[1]], "FALSE" = pal538['red'][[1]])) +
  guides(colour = guide_legend(reverse = TRUE)) +
  theme(legend.position = 'bottom')
We create a model to add in the quadratic term for weight. We see that the binary variable for whether the car is a 6-cylinder is now only marginally significant.
# Model 5: quadratic weight term alongside the binary 6-cylinder flag
auto_fit13 <- lm(mpg ~ weight + I(weight^2) + year + is_6cylinder, data = Auto_proper2)
summary(auto_fit13)
Call:
lm(formula = mpg ~ weight + I(weight^2) + year + is_6cylinder,
data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-9.501 -1.660 -0.126 1.556 13.164
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.57e+03 8.72e+01 -17.98 < 2e-16 ***
weight -2.01e-02 1.67e-03 -12.01 < 2e-16 ***
I(weight^2) 2.12e-06 2.59e-07 8.21 3.5e-15 ***
year 8.26e-01 4.42e-02 18.67 < 2e-16 ***
is_6cylinderTRUE -7.52e-01 4.36e-01 -1.72 0.086 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.02 on 387 degrees of freedom
Multiple R-squared: 0.851, Adjusted R-squared: 0.85
F-statistic: 554 on 4 and 387 DF, p-value: <2e-16
Let’s remove the binary predictor.
# Model 6: quadratic weight and year only (binary indicator dropped)
auto_fit14 <- lm(mpg ~ weight + I(weight^2) + year, data = Auto_proper2)
summary(auto_fit14)
Call:
lm(formula = mpg ~ weight + I(weight^2) + year, data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-9.456 -1.708 -0.173 1.519 13.179
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.57e+03 8.74e+01 -18.0 <2e-16 ***
weight -2.15e-02 1.44e-03 -14.9 <2e-16 ***
I(weight^2) 2.35e-06 2.25e-07 10.4 <2e-16 ***
year 8.29e-01 4.43e-02 18.7 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.03 on 388 degrees of freedom
Multiple R-squared: 0.85, Adjusted R-squared: 0.849
F-statistic: 734 on 3 and 388 DF, p-value: <2e-16
When we plot this model the results look great.
# Visualize the quadratic weight fit per model year, without the
# 6-cylinder split, to sanity-check model 6
Auto_proper2 %>%
  select(mpg, weight, year, cylinders, is_6cylinder) %>%
  ggplot(aes(x = weight, y = mpg)) +
  facet_wrap(~ year) +
  geom_point(alpha = 0.5, color = pal538['blue'][[1]]) +
  theme_jrf() +
  geom_smooth(se = FALSE, method = "lm", formula = y ~ x + I(x^2), colour = pal538['red'][[1]]) +
  labs(x = "Weight", y = "MPG", title = "Removing the Binary Is 6-cylinder Variable")
Let’s compare the AIC and BIC values for all of these models.
# AIC/BIC comparison table across all six candidate models
model_labels_all <- c("1: Cylinders - all levels", "2: Binary 6-cylinder",
                      "3: Binary 6-cylinder & Horsepower","4: Cylinders - all levels & Horsepower",
                      "5: Binary 6-cylinder and Quadratic Weight", "6: Quadratic Weight Only")
data_frame(
  Model = model_labels_all,
  AIC = AIC(auto_fit9, auto_fit10, auto_fit11, auto_fit12, auto_fit13, auto_fit14)$AIC,
  BIC = BIC(auto_fit9, auto_fit10, auto_fit11, auto_fit12, auto_fit13, auto_fit14)$BIC
) %>%
  kable()
Model | AIC | BIC |
---|---|---|
1: Cylinders - all levels | 2034 | 2066 |
2: Binary 6-cylinder | 2048 | 2068 |
3: Binary 6-cylinder & Horsepower | 2042 | 2066 |
4: Cylinders - all levels & Horsepower | 2031 | 2067 |
5: Binary 6-cylinder and Quadratic Weight | 1987 | 2011 |
6: Quadratic Weight Only | 1988 | 2008 |
There is only a slight information gain with the binary 6-cylinder variable and we believe this to be overfitting. We will proceed with model 6. Let’s check what would happen if we used regsubsets
with the quadratic term.
# Exhaustive search again, now offering I(weight^2) as a candidate predictor.
# regsubsets() prints "Reordering variables and trying again:" here --
# presumably because the expanded design matrix has linearly dependent
# columns; TODO confirm against the leaps documentation.
auto_fit15 <- regsubsets(mpg ~ . + I(weight^2), data = Auto_proper2 %>% select(-name, -year2), nvmax = 11, method="exhaustive")
Reordering variables and trying again:
# Summarize the search and show which predictors enter at each subset size
auto_fit15_sum <- summary(auto_fit15)
auto_fit15_sum$outmat %>% as_data_frame() %>% print(width = Inf)
# Plot Cp, BIC and adjusted R^2 for the quadratic-weight search.
# Hoist the optimal model size under each criterion so each which.min/
# which.max is computed once instead of three times.
opt15 <- c(which.min(auto_fit15_sum$cp), which.min(auto_fit15_sum$bic), which.max(auto_fit15_sum$adjr2))
data_frame(
  predictors = seq_along(auto_fit15_sum$cp)
  , cp = auto_fit15_sum$cp
  , bic = auto_fit15_sum$bic
  , adjr2 = auto_fit15_sum$adjr2
) %>%
  gather(metric, value, -predictors) %>%
  mutate(metric = factor(metric, levels = c("cp","bic","adjr2"))) %>%
  ggplot(aes(x = predictors, y = value, colour = metric)) +
  facet_grid(metric ~ ., scale = "free_y", switch = "y",
             labeller = ggplot2::labeller(metric = c(cp = "Cp", bic = "BIC", adjr2 = "Adjusted R^2"))) +
  geom_vline(xintercept = 3, alpha = 0.5) + geom_line() + geom_point() +
  geom_label(data = data_frame(
    predictors = opt15
    , metric = factor(c("cp","bic","adjr2"), levels = c("cp","bic","adjr2"))
    , value = c(min(auto_fit15_sum$cp), min(auto_fit15_sum$bic), max(auto_fit15_sum$adjr2))
    , label = paste0("Optimal\nd=", opt15)
    , vjust = c(-.5, -.5, 1.25)
  ), aes(x = predictors, y = value, label = label, vjust = vjust), family = "DecimaMonoPro") +
  theme_jrf() +
  labs(title = "Exhaustive Search with Quadratic Weight", x = "# of Predictors", y = NULL) +
  # callout for the visually-identified elbow at 3 predictors
  geom_label(data = data_frame(x = 3, y = 300, metric = factor(c("cp"), levels = c("cp","bic","adjr2")),
             label = "Elbow with\n3 predictors"), aes(x=x,y=y,label=label), colour = "black", hjust = -.1,
             family = "DecimaMonoPro") +
  scale_colour_manual(guide = FALSE, values = c(pal538['red'][[1]], pal538['green'][[1]], pal538['blue'][[1]]))
The result confirms our thinking: a 3 predictor model with a quadratic weight term.
Our final model to predict MPG based on the predictors in the Auto dataset is
\[MPG = \beta_0 + \beta_1 Weight + \beta_2 Weight^2 + \beta_3 year\]
# Store the final model's summary and autoprint it
auto_fit_final <- summary(auto_fit14)
auto_fit_final
Call:
lm(formula = mpg ~ weight + I(weight^2) + year, data = Auto_proper2)
Residuals:
Min 1Q Median 3Q Max
-9.456 -1.708 -0.173 1.519 13.179
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.57e+03 8.74e+01 -18.0 <2e-16 ***
weight -2.15e-02 1.44e-03 -14.9 <2e-16 ***
I(weight^2) 2.35e-06 2.25e-07 10.4 <2e-16 ***
year 8.29e-01 4.43e-02 18.7 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.03 on 388 degrees of freedom
Multiple R-squared: 0.85, Adjusted R-squared: 0.849
F-statistic: 734 on 3 and 388 DF, p-value: <2e-16
Thus the model is
\[MPG = -1572.84 - 0.02\,Weight + 2.3\times 10^{-6}\,Weight^2 + 0.83\,Year\]
The normal Q-Q plot shows that residuals might not come from a normal distribution at the tails, but all together is somewhat normal.
# Normal Q-Q plot of the model's standardized residuals, with a reference
# line drawn through the first and third sample quartiles (the same line
# that qqline() would produce).
data_frame(std.resid = rstandard(auto_fit14)) %>%
ggplot() +
stat_qq(aes(sample = std.resid), colour = pal538['blue']) +
# `. %>% summarise(...)` is a magrittr functional sequence; ggplot applies
# it to the plot data, so slope/intercept are computed from std.resid
geom_abline(data =
. %>%
summarise(
# slope: ratio of the interquartile spread of the sample to that of
# the standard normal quantiles
slope = diff(quantile(std.resid, c(0.25, 0.75))) / diff(qnorm(c(0.25, 0.75)))
# intercept: anchor the line at the first-quartile point
, int = quantile(std.resid, c(0.25, 0.75))[1L] -
(diff(quantile(std.resid, c(0.25, 0.75))) /
diff(qnorm(c(0.25, 0.75)))) * qnorm(c(0.25, 0.75))[1L]
),
aes(slope = slope, intercept = int), alpha = 0.5
) +
theme_jrf() +
scale_x_continuous(labels = NULL) +
labs(title = "Normal Q-Q", y = "Standardized Residuals", x = "Theoretical Quantiles")
In addition, the Shapiro-Wilk test shows that we have evidence that the residuals do not come from a normal distribution.
# Formal normality test on the standardized residuals (H0: normally distributed)
shapiro.test(rstandard(auto_fit14))
Shapiro-Wilk normality test
data: rstandard(auto_fit14)
W = 0.95, p-value = 4e-10
The fitted values vs residuals plots show approximately equal variance of the residuals (i.e. no heteroscedasticity).
# Residuals vs fitted values for the final model; a loess smoother is
# overlaid to reveal any remaining structure, and a dashed zero line
# provides the reference for equal-variance residuals
diagnostics <- data_frame(
  fitted = auto_fit14$fitted.values,
  resid = auto_fit14$residuals
)
diagnostics %>%
  ggplot(aes(x = fitted, y = resid)) +
  geom_point(colour = pal538['blue']) +
  geom_smooth(method = "loess", colour = pal538['red'], se = FALSE, size = .25, alpha = 0.5) +
  geom_hline(yintercept = 0, alpha = 0.5, linetype = 'dashed', color = 'black') +
  theme_jrf() +
  scale_x_continuous(labels = NULL) +
  labs(title = "Fitted Values vs Residuals", y = "Residuals", x = "Fitted Values")
The effects associated with weight are difficult to describe because of the quadratic term. Holding the effect of weight constant, each additional year of car (newer model years), a car’s MPG increases by 0.8289.
Finally, we can find a 95% prediction interval for the car built in 1983.
# 95% prediction interval (predict's default level) for the hypothetical 1983 car
future_car_pred <- predict(auto_fit14, future_car, interval = "prediction")
The prediction interval for the MPG of this car is
\[(16.2702, 28.3166)\]