Download the .Rmd directly at the top-right.
Full repo: https://github.com/jrfarrer/stats701_hw2/
Published file: https://jrfarrer.github.io/stats701_hw2/
# Set the seed for reproducibility
set.seed(44)
# Set the locale of the session so languages other than English can be used
invisible(Sys.setlocale("LC_ALL", "en_US.UTF-8"))
# Prevent printing in scientific notation
options(digits = 4, width = 220)
# Create a logger function
logger <- function(msg, level = "info", file = stdout()) {
  cat(paste0("[", format(Sys.time(), "%Y-%m-%d %H:%M:%OS3"), "][", level, "] ", msg, "\n"), file = file)
}
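# Example usage (the timestamp shown is illustrative):
# logger("Data load complete")
# [2017-02-19 10:15:30.123][info] Data load complete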
# Set the project directory
base_dir <- ''
data_dir <- paste0(base_dir, "data/")
code_dir <- paste0(base_dir, "code/")
viz_dir <- paste0(base_dir, "viz/")
dir.create(data_dir, showWarnings = FALSE)
dir.create(code_dir, showWarnings = FALSE)
dir.create(viz_dir, showWarnings = FALSE)
# To use the second font, run the following two lines of code and add the user's name to the vector below
# system(paste0("cp -r ",viz_dir,"fonts/. ~/Library/Fonts/")) # instantaneous
# font_import() # takes approximately 5-10 min
users_v <- c("Jordan")
# Create a color palette
pal538 <- ggthemes_data$fivethirtyeight
# Create a theme to use throughout the analysis
theme_jrf <- function(base_size = 8, base_family = ifelse(Sys.info()[['user']] %in% users_v, "DecimaMonoPro", "Helvetica")) {
  theme(
    plot.background = element_rect(fill = "#F0F0F0", colour = "#606063"),
    panel.background = element_rect(fill = "#F0F0F0", colour = NA),
    panel.border = element_blank(),
    panel.grid.major = element_line(colour = "#D7D7D8"),
    panel.grid.minor = element_line(colour = "#D7D7D8", size = 0.25),
    panel.margin = unit(0.25, "lines"),
    panel.margin.x = NULL,
    panel.margin.y = NULL,
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title = element_text(colour = "#A0A0A3"),
    axis.text.x = element_text(vjust = 1, family = 'Helvetica', colour = '#3C3C3C'),
    axis.text.y = element_text(hjust = 1, family = 'Helvetica', colour = '#3C3C3C'),
    legend.background = element_blank(),
    legend.key = element_blank(),
    plot.title = element_text(face = 'bold', colour = '#3C3C3C', hjust = 0),
    text = element_text(size = 9, family = ifelse(Sys.info()[['user']] %in% users_v, "DecimaMonoPro", "Helvetica")),
    title = element_text(family = ifelse(Sys.info()[['user']] %in% users_v, "DecimaMonoPro", "Helvetica"))
  )
}
# Load the csv with meaningful column names
framingham_raw <- read.table(paste0(data_dir, "Framingham.dat"), sep = ",", header = TRUE, as.is = TRUE) %>% as_tibble
framingham_cleaning <-
framingham_raw %>%
rename(hd = `Heart.Disease.`)
framingham_cleaning <-
framingham_cleaning %>%
setNames(tolower(names(.))) %>%
mutate(
sex = factor(tolower(sex))
, hd = factor(hd)
)
We load the Framingham.dat file and summarize the 8 variables and 1407 records.
head(framingham_cleaning)
We then remove 14 observations that have missing values.
framingham_cleaning2 <-
framingham_cleaning %>%
filter(complete.cases(.))
pander(summary(framingham_cleaning2), missing = "", split.table = Inf)
hd | age | sex | sbp | dbp | chol | frw | cig |
---|---|---|---|---|---|---|---|
0:1086 | Min. :45.0 | female:730 | Min. : 90 | Min. : 50.0 | Min. : 96 | Min. : 52 | Min. : 0.00 |
1: 307 | 1st Qu.:48.0 | male :663 | 1st Qu.:130 | 1st Qu.: 80.0 | 1st Qu.:200 | 1st Qu.: 94 | 1st Qu.: 0.00 |
 | Median :52.0 | | Median :142 | Median : 90.0 | Median :230 | Median :103 | Median : 0.00 |
 | Mean :52.4 | | Mean :148 | Mean : 90.2 | Mean :235 | Mean :105 | Mean : 8.04 |
 | 3rd Qu.:56.0 | | 3rd Qu.:160 | 3rd Qu.: 98.0 | 3rd Qu.:264 | 3rd Qu.:114 | 3rd Qu.:20.00 |
 | Max. :62.0 | | Max. :300 | Max. :160.0 | Max. :430 | Max. :222 | Max. :60.00 |
framingham_final <- framingham_cleaning2
To get a better idea of this dataset, we create a pairs plot colored by observations without heart disease (hd = 0) and with heart disease (hd = 1).
fpairs <- ggpairs(framingham_final, mapping = aes(colour = hd))
for (row in seq_len(fpairs$nrow)) {
  for (col in seq_len(fpairs$ncol)) {
    fpairs[row, col] <- fpairs[row, col] +
      scale_colour_manual(values = c(pal538['blue'][[1]], pal538['red'][[1]])) +
      scale_fill_manual(values = c(pal538['blue'][[1]], pal538['red'][[1]]))
  }
}
fpairs
For the variable sex, we create a frequency table.
pander(descr::CrossTable(framingham_final$hd, framingham_final$sex, prop.chisq=FALSE, prop.t = FALSE, chisq = TRUE, dnn = c("HD","Sex")), split.table = Inf, big.mark = ',')
HD | Sex: female | Sex: male | Total |
---|---|---|---|
0 (N / Row % / Col %) | 610 / 56.17% / 83.56% | 476 / 43.83% / 71.79% | 1086 (77.96%) |
1 (N / Row % / Col %) | 120 / 39.09% / 16.44% | 187 / 60.91% / 28.21% | 307 (22.04%) |
Total | 730 (52.40%) | 663 (47.60%) | 1393 |
We fit a logistic regression model to predict heart disease with only the explanatory variable systolic blood pressure (SBP).
fit1 <- glm(hd ~ sbp, data = framingham_final, family = binomial('logit'))
tidy(fit1) %>% pander(caption = paste0(as.character(summary(fit1)$call)[3],': ', as.character(summary(fit1)$call)[2]))
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | -3.655 | 0.3479 | -10.51 | 8.075e-26 |
sbp | 0.01581 | 0.002222 | 7.118 | 1.097e-12 |
glance(fit1) %>% pander()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|
1469 | 1392 | -708.7 | 1421 | 1432 | 1417 | 1391 |
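On the odds scale, the sbp coefficient is easier to interpret after exponentiating: each additional mmHg of systolic blood pressure multiplies the odds of heart disease by roughly exp(0.01581) ≈ 1.016. A minimal check:
exp(coef(fit1)['sbp'])       # odds multiplier per 1 mmHg increase in sbp, ~1.016
exp(10 * coef(fit1)['sbp'])  # per 10 mmHg, ~1.17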
The next variable we choose will have the largest \(|z|\) statistic, or equivalently the smallest \(p\)-value. We create 6 models of the form hd ~ sbp + var and extract the summary row for each candidate variable var.
vars_rotating <- names(framingham_final)[!(names(framingham_final) %in% c("hd","sbp"))]
best_second_var <- data_frame()
for(var in vars_rotating) {
best_second_var <- bind_rows(best_second_var,
tidy(glm(paste0('hd ~ sbp + ', var), data = framingham_final, family = binomial('logit'))) %>%
filter(row_number() == 3) %>%
rename(`z.statistic` = statistic)
)
}
best_second_var %>%
arrange(p.value)
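The loop above can also be written without explicit iteration; a sketch using purrr (assuming it is loaded alongside the other tidyverse packages):
best_second_var <- purrr::map_df(vars_rotating, function(var) {
  tidy(glm(paste0('hd ~ sbp + ', var), data = framingham_final, family = binomial('logit'))) %>%
    filter(row_number() == 3) %>%   # keep the row for the added variable
    rename(z.statistic = statistic)
})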
We see that sex will be the most important addition on top of sbp.
fit2 <- glm(hd ~ sbp + sex, data = framingham_final, family = binomial('logit'))
fit2_summary <- summary(fit2)
tidy(fit2) %>% pander(caption = paste0(as.character(summary(fit2)$call)[3],': ', as.character(summary(fit2)$call)[2]))
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | -4.57 | 0.3897 | -11.73 | 9.289e-32 |
sbp | 0.01872 | 0.002324 | 8.053 | 8.071e-16 |
sexmale | 0.9034 | 0.1398 | 6.464 | 1.02e-10 |
glance(fit2) %>% pander()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|
1469 | 1392 | -686.9 | 1380 | 1395 | 1374 | 1390 |
The residual deviance of fit2 is always smaller than that of fit1 because fit1 is nested within fit2 and thus has more residual degrees of freedom.
We perform the Wald Test which tests whether the set of terms is significant.
confint.default(fit2) %>% pander()
term | 2.5 % | 97.5 % |
---|---|---|
(Intercept) | -5.334 | -3.806 |
sbp | 0.01416 | 0.02327 |
sexmale | 0.6295 | 1.177 |
wald_test <- wald.test(b = coef(fit2), Sigma = vcov(fit2), Terms = 1:3)
wald_test$result$chi2 %>% as_tibble() %>% pander(caption = "Wald Test")
statistic | value |
---|---|
chi2 | 393.9 |
df | 3.0 |
P | 0.0 |
The Wald Test for a single variable is the same test performed by the function summary. Below we run the Wald Test for just the term sex: it yields a \(p\)-value of \(1.0198 \times 10^{-10}\). Note that its statistic is the z-value squared (\(6.464^2 = 41.7831\)) and the \(p\)-values are the same.
wald_test <- wald.test(b = coef(fit2), Sigma = vcov(fit2), Terms = 3)
Next we perform the likelihood ratio test (chi-squared).
chi_sq_fit2 <- anova(fit2, test = "Chisq")
tidy(chi_sq_fit2) %>% pander(missing = '', caption = 'Analysis of Deviance Table')
term | df | Deviance | Resid. Df | Resid. Dev | Pr(>Chi) |
---|---|---|---|---|---|
NULL | | | 1392 | 1469 | |
sbp | 1 | 51.86 | 1391 | 1417 | 5.949e-13 |
sex | 1 | 43.7 | 1390 | 1374 | 3.828e-11 |
The \(p\)-value for adding the variable sex is \(3.8276 \times 10^{-11}\). This is not identical to the Wald Test \(p\)-value, but both the Wald Test and the likelihood ratio test agree that sex is a significant addition to the model.
Using backward selection, we are left with a model with 4 predictors: sbp, sex, age, and chol.
fit7 <- glm(hd ~ sbp+sex+age+cig+chol+frw+dbp, data = framingham_final, family = binomial('logit'))
fit6 <- glm(hd ~ sbp+sex+age+cig+chol+frw, data = framingham_final, family = binomial('logit'))
fit5 <- glm(hd ~ sbp+sex+age+cig+chol, data = framingham_final, family = binomial('logit'))
fit4 <- glm(hd ~ sbp+sex+age+chol, data = framingham_final, family = binomial('logit'))
tidy(fit4) %>% pander(caption = paste0(as.character(summary(fit4)$call)[3],': ', as.character(summary(fit4)$call)[2]))
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | -8.409 | 0.9086 | -9.255 | 2.151e-20 |
sbp | 0.01696 | 0.002362 | 7.179 | 7.021e-13 |
sexmale | 0.9899 | 0.1451 | 6.824 | 8.842e-12 |
age | 0.05664 | 0.0145 | 3.906 | 9.377e-05 |
chol | 0.00448 | 0.001495 | 2.996 | 0.002737 |
glance(fit4) %>% pander()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|
1469 | 1392 | -674.5 | 1359 | 1385 | 1349 | 1388 |
Next we use exhaustive search to find the best model under the AIC criterion.
Xy <- bind_cols(model.matrix(hd ~.+0, framingham_final) %>% as_tibble(),
framingham_final %>% select(hd))
exhaustive_model <- bestglm(as.data.frame(Xy), family = binomial('logit'),
method = "exhaustive", IC = "AIC", nvmax = 10)
Morgan-Tatar search since family is non-gaussian.
exhaustive_model$BestModels
This method is not guaranteed to include only variables with \(p\)-values less than 0.05. In this example, the best model includes frw which the glm summary shows is not significant at the 0.05 level. Thus, the best model returned by exhaustive search is not the same as the model returned by backwards elimination.
final_model <- exhaustive_model$BestModel
tidy(final_model) %>% pander(caption = "Best model returned by exhaustive search")
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | -9.228 | 0.9962 | -9.263 | 1.979e-20 |
age | 0.06153 | 0.01478 | 4.164 | 3.122e-05 |
sexmale | 0.9113 | 0.1571 | 5.8 | 6.633e-09 |
sbp | 0.01597 | 0.002487 | 6.42 | 1.366e-10 |
chol | 0.004493 | 0.001503 | 2.99 | 0.002794 |
frw | 0.006039 | 0.004004 | 1.508 | 0.1315 |
cig | 0.01228 | 0.006088 | 2.017 | 0.04369 |
glance(final_model) %>% pander()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|
1469 | 1392 | -671.6 | 1357 | 1394 | 1343 | 1386 |
The final logistic regression model predicts whether or not an individual will have heart disease (hd) with 6 variables. The six variables are listed in the order of importance to the model below:
summary_vars <-
data_frame(
variable = rownames(coef(summary(final_model))[-1, ])
, p_value = coef(summary(final_model))[-1, 4]
) %>%
mutate(
estimate = coef(summary(final_model))[-1, 1]
, odds = exp(estimate)
, prob = odds / (1 + odds)
, percentage = paste0(round(prob * 100, 2), "%")
) %>%
arrange(p_value)
summary_vars %>% select(-percentage)
Holding all other variables constant, each estimate above is the change in the log-odds of heart disease for a one-unit increase in that variable, and the odds column gives the corresponding multiplicative effect on the odds.
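For example, the age coefficient implies that, with the other predictors fixed, one additional year of age multiplies the odds of heart disease by about exp(0.0615) ≈ 1.063, i.e. roughly a 6.3% increase in the odds:
exp(coef(final_model)['age'])  # ~1.063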
liz <- data_frame(age = 50, sexmale = 0, sbp = 110, dbp = 80, chol = 180, frw = 105, cig = 0)
liz_predict <- predict(final_model, liz, type = 'response')
The probability that Liz will have heart disease under our final model is approximately 0.0496.
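As a sanity check, the same probability can be recovered on the link scale, since plogis() maps a logit to a probability:
liz_logit <- predict(final_model, liz, type = 'link')  # ~ -2.9523
plogis(liz_logit)                                      # ~ 0.0496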
fit1_classificer <-
data_frame(
pred = framingham_final %>% mutate(pred = ifelse(sbp > 176, 1, 0)) %>% select(pred) %>% unlist(),
true = framingham_final %>% select(hd) %>% unlist()
) %>%
mutate(type = ifelse(pred == true, ifelse(pred == 0, "tn", "tp"), ifelse(pred == 0, "fn", "fp"))) %>%
group_by(type) %>%
count() %>%
spread(type, n) %>%
summarise(fpr = fp / (fp + tn), tpr = tp / (tp + fn))
The ROC curve is used to assess the accuracy of a continuous variable for predicting a binary outcome. In the ROC below, the continuous variable is systolic blood pressure (sbp) and the binary outcome is whether the individual will have heart disease (hd).
plot_df <-
data_frame(
hd = framingham_final %>% mutate(hd = as.integer(hd) - 1) %>% select(hd) %>% unlist()
, fitted_fit1 = fit1$fitted
, fitted_fit2 = fit2$fitted
)
#p1 <-
ggplot(plot_df, aes(d = hd, m = fitted_fit1)) +
geom_roc() + style_roc() +
theme_jrf() +
labs(title = "ROC for fit1") +
geom_vline(xintercept = 0.1) +
geom_label(data = data_frame(hd = 0.11, fitted_fit1 = 0.8, label = "False Positive\nRate = 0.1"),
aes(x = hd, y = fitted_fit1, label = label), hjust = 'left') +
geom_point(data = data_frame(hd = fit1_classificer$fpr, fitted_fit1 = fit1_classificer$tpr),
aes(x = hd, y = fitted_fit1), colour = pal538['red'][[1]]) +
geom_text(data = data_frame(hd = fit1_classificer$fpr + 0.02, fitted_fit1 = fit1_classificer$tpr, text = "SBP > 176"),
aes(x = hd, y = fitted_fit1, label = text), colour = pal538['red'][[1]], hjust = 0.1)
The classifier with a false positive rate below 0.1 and the highest possible true positive rate is: if SBP > 176, then HD = 1, else HD = 0.
plot_df2 <-
plot_df %>%
gather(model, fitted, -hd) %>%
mutate(model = gsub("fitted_","", model))
ggplot(plot_df2, aes(d = hd, m = fitted, colour = model)) +
geom_roc() + style_roc() +
theme_jrf() +
labs(title = "ROC for fit1 and fit2") +
scale_colour_manual("Model", values = c('fit1' = pal538['blue'][[1]], 'fit2' = pal538['red'][[1]]))
The ROC curve for fit2 always contains the ROC curve for fit1. The AUC of the ROC for fit2 is 0.6801 and the AUC for fit1 is 0.6357. We expected the AUC for fit2 to be larger than the AUC for fit1 because fit1 is nested within fit2.
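The quoted AUC values can be computed directly from the fitted probabilities; a sketch using the pROC package (which is also used later in this analysis):
pROC::auc(pROC::roc(plot_df$hd, plot_df$fitted_fit1))  # ~0.6357
pROC::auc(pROC::roc(plot_df$hd, plot_df$fitted_fit2))  # ~0.6801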
values_df <-
plot_df2 %>%
mutate(pred = as.integer(fitted > .5)) %>%
mutate(type = ifelse(pred == hd, ifelse(pred == 0, "tn", "tp"), ifelse(pred == 0, "fn", "fp"))) %>%
group_by(model, type) %>%
count() %>%
ungroup() %>%
spread(type, n) %>%
mutate(
positive_prediction_value = tp / (tp + fp)
, negative_prediction_value = tn / (tn + fn)
)
values_df %>%
select(model, positive_prediction_value, negative_prediction_value) %>%
pander()
model | positive_prediction_value | negative_prediction_value |
---|---|---|
fit1 | 0.4500 | 0.7830 |
fit2 | 0.4722 | 0.7863 |
fit2 is more desirable if we care about Positive Prediction Value because the PPV is 0.4722 for fit2 vs 0.4500 for fit1.
ppnp_vs_threshold_df <- data_frame()
prob_thresholds <- seq(from = 0, to = 1, by = 0.01)
for (threshold in prob_thresholds) {
ppnp_vs_threshold_df <-
bind_rows(
ppnp_vs_threshold_df,
plot_df2 %>%
mutate(pred = as.integer(fitted > threshold)) %>%
mutate(
tn = as.integer(pred == hd & pred == 0)
, tp = as.integer(pred == hd & pred != 0)
, fn = as.integer(pred != hd & pred == 0)
, fp = as.integer(pred != hd & pred != 0)
) %>%
group_by(model) %>%
summarise(
tn = sum(tn)
, tp = sum(tp)
, fn = sum(fn)
, fp = sum(fp)
) %>%
mutate(
positive_prediction_value = tp / (tp + fp)
, negative_prediction_value = tn / (tn + fn)
) %>%
mutate(
threshold = threshold
, ppnp = positive_prediction_value / negative_prediction_value
) %>%
select(threshold, model, positive_prediction_value, negative_prediction_value, ppnp)
)
}
ggplot(ppnp_vs_threshold_df, aes(x = threshold, y = ppnp, colour = as.factor(model))) +
geom_line() + geom_point() +
theme_jrf() +
scale_colour_manual("Model", values = c('fit1' = pal538['blue'][[1]], 'fit2' = pal538['red'][[1]])) +
labs(title = "PP/NP vs Probability Threshold",
x = "Probability Threshold", y = "Positive Prediction Value / Negative Prediction Value")
ppnp_vs_threshold_df %>%
select(-ppnp) %>%
gather(metric, value, -threshold, -model) %>%
ggplot(aes(x = threshold, y = value, colour = model)) + facet_grid(. ~ metric) +
geom_line() + geom_point() +
theme_jrf() +
scale_colour_manual("Model", values = c('fit1' = pal538['blue'][[1]], 'fit2' = pal538['red'][[1]])) +
labs(title = "Positive Prediction Value & Negative Prediction Value",
x = "Probability Threshold", y = "Prediction Value") +
theme(legend.position = 'bottom')
I would choose fit2, but fit1 and fit2 are nearly the same. I would want to use fit2 with a probability threshold slightly above 0.5.
If the risk ratio is \(\frac{a_{1,0}}{a_{0,1}} = 10\), then \(\frac{a_{0,1}}{a_{1,0}} = 0.1\). The Bayes rule classifies \(\hat{y} = 1\) when \(P(Y=1|x) > \frac{0.1}{1 + 0.1} = 0.0909\), or equivalently when \(logit > log(\frac{0.0909}{1 - 0.0909}) = log(0.1) = -2.3026\).
We have the logit from 1 b), thus the linear boundary for the Bayes classifier is
\[-9.2279 + 0.0615age + 0.9113sexmale + 0.016sbp + 0.0045chol + 0.006frw + 0.0123cig \\ = log(0.1) = -2.3026\]
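The same arithmetic as a small sketch in R:
a10 <- 10
a01 <- 1
p_star <- a01 / (a01 + a10)   # posterior threshold: 1/11 ~ 0.0909
log(p_star / (1 - p_star))    # logit threshold: log(0.1) ~ -2.3026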
mce_df <-
data_frame(
hd = framingham_final$hd
, fitted = final_model$fitted.values
, pred = as.integer(fitted > 0.0909)
, a10 = 10 * ifelse(hd == 1 & pred != 1, 1, 0)
, a01 = 1 * ifelse(hd == 0 & pred != 0, 1, 0)
, a10_a01 = a10 + a01
)
mce <- sum(mce_df$a10_a01) / nrow(mce_df)
We use the formula for weighted misclassification error
\[MCE=\frac{a_{10} \sum_{y_i=1} I\{y_i \neq \hat{y}_i\} + a_{01} \sum_{y_i=0} I\{y_i \neq \hat{y}_i\}}{n}\]
For the risk ratio of \(\frac{a_{1,0}}{a_{0,1}} = 10\), the MCE is 0.7143.
liz_pred_logit <- predict(final_model, liz, type = 'link')
For Liz, the logit is -2.9523, which is below the threshold of \(log(0.1) = -2.3026\), so we classify her as 0; she is predicted not to have heart disease.
Below is a plot of the posterior thresholds by the associated weighted misclassification rates using the two risk ratios \(\frac{a_{1,0}}{a_{0,1}} = 10\) and \(\frac{a_{1,0}}{a_{0,1}} = 1\).
threshold_mce_df <- data_frame()
posterior_thresholds <- seq(
from = round(exp(min(final_model$fitted)) / (1 + exp(min(final_model$fitted))), 2) - 0.01,
to = round(exp(max(final_model$fitted)) / (1 + exp(max(final_model$fitted))), 2),
length.out = 40)
for (threshold in posterior_thresholds) {
threshold_mce_df <-
bind_rows(
threshold_mce_df,
data_frame(
hd = framingham_final$hd
, fitted = final_model$fitted.values
, pred = as.integer(fitted > log(threshold / (1 - threshold)))
, a10_10 = 10 * ifelse(hd == 1 & pred != 1, 1, 0)
, a10_1 = 1 * ifelse(hd == 1 & pred != 1, 1, 0)
, a01 = 1 * ifelse(hd == 0 & pred != 0, 1, 0)
, a10_a01_10 = a10_10 + a01
, a10_a01_1 = a10_1 + a01
) %>%
summarise(
mce_10 = sum(a10_a01_10) / n()
, mce_1 = sum(a10_a01_1) / n()
) %>%
mutate(threshold = threshold) %>%
select(threshold, mce_10, mce_1)
)
}
threshold_mce_df2 <-
threshold_mce_df %>%
select(threshold, mce_10, mce_1) %>%
gather(risk_ratio, mce, -threshold) %>%
mutate(risk_ratio = gsub("mce_","",risk_ratio))
ggplot(threshold_mce_df2, aes(x = threshold, y = mce, colour = as.factor(risk_ratio))) +
geom_line() + geom_point() +
theme_jrf() +
scale_colour_manual("Risk Ratio", values = c('10' = pal538['blue'][[1]], '1' = pal538['red'][[1]])) +
labs(title = "Misclassification Error (MCE) vs Posterior Threshold",
x = "Posterior Threshold", y = "Weighted Misclassification Error (MCE)")
The range of the x-axis is from 0.5 to 0.7, which reflects the range of the fitted values from the final model for 1 b). We know from the model that the fitted values range from 0.0272 to 0.8301, which when exponentiated and converted to a probability gives roughly 0.5 to 0.7.
When \(\frac{a_{1,0}}{a_{0,1}} = 10\), the Bayes rule classifier does not perform well for large thresholds. When the threshold is 0.5, the rule classifies every observation as hd = 1, which means the false-negative term \(a_{10} \sum_{y_i=1} I\{y_i \neq \hat{y}_i\}\) goes to 0. The misclassification error initially goes down, but then there is a huge penalty for false negatives.
The Bayes rule classifier with \(\frac{a_{1,0}}{a_{0,1}} = 1\) performs much better, and indeed its misclassification rate decreases as the threshold increases.
bills_train <- read.csv(paste0(data_dir, 'Bills.subset.csv'), row.names = 1, na.strings=c("","NA")) %>% as_tibble()
bills_test <- read.csv(paste0(data_dir, 'Bills.subset.test.csv'), row.names = 1) %>% as_tibble()
Our objective is to predict whether or not a bill will pass the floor of the Pennsylvania State House based on characteristics of the bill. Some of the characteristics we use in this analysis are the number of amendments the bill had, the session in which the bill was introduced, and the number of cosponsors. Using training/test datasets and cross-validation, we built a logistic regression model for this prediction exercise. However, a bill passing the State House is a rare event: less than 7% of introduced bills have passed the House since the 2009-2010 session. As a result, our final model is effective at identifying bills that will not pass, but is less effective at identifying bills that will pass.
Collectively, the bills dataset contains 8011 observations split into training (7011) and test (1000) sets.
pass_vals <- c("bill:passed", "governor:signed", "governor:received")
bills_train2 <-
bills_train %>%
mutate(
status2 = as.factor(as.numeric(status %in% pass_vals))
, dow = factor(day.of.week.introduced, labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
, is_sponsor_in_leadership2 = as.factor(is_sponsor_in_leadership)
)
bills_test2 <-
bills_test %>%
mutate(
status2 = as.factor(as.numeric(status %in% pass_vals))
, dow = factor(day.of.week.introduced, labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
, is_sponsor_in_leadership2 = as.factor(is_sponsor_in_leadership)
)
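Since the identical recoding is applied to the training and test sets, a small helper function (a sketch) would avoid the duplication:
recode_bills <- function(df) {
  df %>%
    mutate(
      status2 = as.factor(as.numeric(status %in% pass_vals))
      , dow = factor(day.of.week.introduced, labels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat"))
      , is_sponsor_in_leadership2 = as.factor(is_sponsor_in_leadership)
    )
}
# bills_train2 <- recode_bills(bills_train); bills_test2 <- recode_bills(bills_test)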
We find that 364 observations in the training set are missing at least one value (1 in the test set). Let’s look by feature.
bills_train2 %>%
summarise_all(funs(sum(is.na(.)))) %>%
gather(feature, NAs) %>%
arrange(desc(NAs))
There are 5 values of status that are NA. Based on the assignment’s comment that “All other outcomes are failures,” we will assume these are failures to pass rather than missing values. We will remove the observations without a day of week (from both the training and test datasets) so that different models are comparable.
bills_train3 <-
bills_train2 %>%
select(-originating_committee) %>%
filter(complete.cases(.))
bills_test3 <-
bills_test2 %>%
select(-originating_committee) %>%
filter(complete.cases(.))
pander(summary(bills_train3), missing = "", split.table = 150)
bill_id | sponsor_party | session | num_cosponsors | num_d_cosponsors | num_r_cosponsors | title_word_count |
---|---|---|---|---|---|---|
HB-1-2009-2010 : 1 | Democratic:3212 | 2009-2010 :2738 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 6 |
HB-1-2011-2012 : 1 | Republican:3692 | 2009-2010 Special Session #1 (Transportation): 0 | 1st Qu.: 11.0 | 1st Qu.: 3.0 | 1st Qu.: 3.0 | 1st Qu.: 22 |
HB-10-2009-2010 : 1 | | 2011-2012 :2678 | Median : 20.0 | Median : 8.0 | Median : 8.0 | Median : 27 |
HB-10-2011-2012 : 1 | | 2013-2014 :1488 | Mean : 23.5 | Mean :10.5 | Mean :13.1 | Mean : 34 |
HB-100-2009-2010: 1 | | | 3rd Qu.: 32.0 | 3rd Qu.:15.0 | 3rd Qu.:19.0 | 3rd Qu.: 34 |
HB-100-2011-2012: 1 | | | Max. :165.0 | Max. :90.0 | Max. :99.0 | Max. :751 |
(Other) :6898 | | | | | | |

day.of.week.introduced | num_amendments | status | is_sponsor_in_leadership | num_originating_committee_cosponsors |
---|---|---|---|---|
Min. :1.00 | Min. :0.000 | committee:referred:6015 | Min. :0.000 | Min. : 0.00 |
1st Qu.:2.00 | 1st Qu.:0.000 | governor:signed : 427 | 1st Qu.:0.000 | 1st Qu.: 0.00 |
Median :3.00 | Median :0.000 | bill:reading:1 : 232 | Median :1.000 | Median : 1.00 |
Mean :2.72 | Mean :0.179 | committee:passed : 106 | Mean :0.599 | Mean : 2.06 |
3rd Qu.:4.00 | 3rd Qu.:0.000 | bill:reading:2 : 72 | 3rd Qu.:1.000 | 3rd Qu.: 3.00 |
Max. :6.00 | Max. :8.000 | bill:passed : 31 | Max. :1.000 | Max. :19.00 |
 | | (Other) : 21 | | |

num_originating_committee_cosponsors_r | num_originating_committee_cosponsors_d | status2 | dow | is_sponsor_in_leadership2 |
---|---|---|---|---|
Min. : 0.00 | Min. :0.000 | 0:6442 | Mon:1628 | 0:2770 |
1st Qu.: 0.00 | 1st Qu.:0.000 | 1: 462 | Tue:1630 | 1:4134 |
Median : 1.00 | Median :0.000 | | Wed:1634 | |
Mean : 1.34 | Mean :0.712 | | Thu:1104 | |
3rd Qu.: 2.00 | 3rd Qu.:1.000 | | Fri: 892 | |
Max. :13.00 | Max. :8.000 | | Sat: 16 | |
To explore the data further we create a pairs plot.
bill_pairs <- ggpairs(bills_train3 %>% select(status2, sponsor_party, session, is_sponsor_in_leadership2, dow),
mapping = aes(colour = status2))
for (row in seq_len(bill_pairs$nrow)) {
  for (col in seq_len(bill_pairs$ncol)) {
    bill_pairs[row, col] <- bill_pairs[row, col] +
      scale_colour_manual(values = c(pal538['blue'][[1]], pal538['red'][[1]])) +
      scale_fill_manual(values = c(pal538['blue'][[1]], pal538['red'][[1]]))
  }
}
bill_pairs
bill_pairs2 <- ggpairs(bills_train3 %>% select(status2, num_cosponsors, num_d_cosponsors, num_r_cosponsors, title_word_count, num_amendments, num_originating_committee_cosponsors, num_originating_committee_cosponsors_r, num_originating_committee_cosponsors_d), mapping = aes(colour = status2))
for (row in seq_len(bill_pairs2$nrow)) {
  for (col in seq_len(bill_pairs2$ncol)) {
    bill_pairs2[row, col] <- bill_pairs2[row, col] +
      scale_colour_manual(values = c(pal538['blue'][[1]], pal538['red'][[1]])) +
      scale_fill_manual(values = c(pal538['blue'][[1]], pal538['red'][[1]]))
  }
}
bill_pairs2
The pairs plots suggest several relationships, which we explore further below.
Given the partisanship of the Pennsylvania legislature, we would imagine that a bill’s success likely depends on the sponsor’s party. We create a frequency table and perform Pearson’s chi-squared test.
pander(descr::CrossTable(bills_train3$status2, bills_train3$sponsor_party, prop.chisq=FALSE, prop.t = FALSE, chisq = TRUE, dnn = c("Status","Sponsor Party")), split.table = Inf, big.mark = ',')
Status | Sponsor Party: Democratic | Sponsor Party: Republican | Total |
---|---|---|---|
0 (N / Row % / Col %) | 3082 / 47.84% / 95.95% | 3360 / 52.16% / 91.01% | 6442 (93.31%) |
1 (N / Row % / Col %) | 130 / 28.14% / 4.05% | 332 / 71.86% / 8.99% | 462 (6.69%) |
Total | 3212 (46.52%) | 3692 (53.48%) | 6904 |
pander(chisq.test(bills_train3$status2, bills_train3$sponsor_party), caption = "Pearson's Chi-squared test with Yates' continuity correction")
Test statistic | df | P value |
---|---|---|
66.48 | 1 | 3.533e-16 *** |
We also look at the relationship between the number of amendments and whether the bills passed. We note that bills with at least one amendment had a 31.15% pass rate vs 2.94% for bills without an amendment.
bills_train3 %>%
group_by(num_amendments, status2) %>%
count() %>%
spread(status2, n) %>%
mutate(`Total` = as.integer(ifelse(is.na(`0`), 0 ,`0`)) + `1`) %>%
rename(`Amendments | Status` = num_amendments) %>%
pander(missing = '')
Amendments | Status 0 | Status 1 | Total |
---|---|---|---|
0 | 5810 | 176 | 5986 |
1 | 554 | 146 | 700 |
2 | 65 | 84 | 149 |
3 | 11 | 37 | 48 |
4 | 2 | 11 | 13 |
5 | | 5 | 5 |
6 | | 2 | 2 |
8 | | 1 | 1 |
bills_train3 %>%
mutate(amendments = ifelse(num_amendments == 0, "0 Amendments", "1+ Amendments")) %>%
group_by(amendments) %>%
summarise(pass_rate = sum(status2 == 1) / n()) %>%
ungroup() %>%
spread(amendments, pass_rate) %>%
pander()
0 Amendments | 1+ Amendments |
---|---|
0.0294 | 0.3115 |
Many of the continuous explanatory variables are derivatives of one another, so we would expect to see some collinearity among them. The correlation matrix below shows that many variables are very closely positively correlated with each other.
bills_train3 %>%
select(num_cosponsors, num_d_cosponsors, num_r_cosponsors, title_word_count, num_amendments,
num_originating_committee_cosponsors, num_originating_committee_cosponsors_r,
num_originating_committee_cosponsors_d) %>%
cor() %>%
#corrplot(method="square", tl.cex = 1/par("cex"), tl.col = 'black', tl.pos = 'l')
corrplot(type = "upper", tl.pos = "td", method = "square", tl.cex = 0.5, tl.col = 'black', diag = FALSE)
We quickly notice that a feature for the bipartisan support of a bill is missing. In a state like Pennsylvania, whether or not a bill passes is likely a function of its bipartisanship. We can create an indicator feature for whether a bill has both Democratic and Republican sponsors.
bills_train4 <-
bills_train3 %>%
mutate(is_bipartisan = factor(ifelse(num_d_cosponsors > 0 & num_r_cosponsors > 0, 1, 0)))
bills_test4 <-
bills_test3 %>%
mutate(is_bipartisan = factor(ifelse(num_d_cosponsors > 0 & num_r_cosponsors > 0, 1, 0)))
bipartisan_tbl <- table(bills_train4$status2, bills_train4$is_bipartisan)
pander(descr::CrossTable(bills_train4$status2, bills_train4$is_bipartisan, prop.chisq=FALSE, prop.t = FALSE, chisq = TRUE, dnn = c("Status","Bipartisan")), split.table = Inf, big.mark = ',')
Status | Bipartisan: 0 | Bipartisan: 1 | Total |
---|---|---|---|
0 (N / Row % / Col %) | 1043 / 16.19% / 91.81% | 5399 / 83.81% / 93.60% | 6442 (93.31%) |
1 (N / Row % / Col %) | 93 / 20.13% / 8.19% | 369 / 79.87% / 6.40% | 462 (6.69%) |
Total | 1136 (16.45%) | 5768 (83.55%) | 6904 |
We see that 79.87% of the bills that do pass have bipartisan support.
We begin by building a model with all the predictors we think could affect the status of a bill. As expected from our exploratory analysis, we find that the number of cosponsors from each party and the number of party members on each originating committee are highly correlated with the total number of cosponsors and the total number of originating-committee cosponsors.
We remove these and continue with backward selection until all the variables in the logistic regression model are significant at the 0.01 significance level. Our initial model is shown below.
bills_fit1 <- glm(status2 ~ session + dow + is_sponsor_in_leadership2 + sponsor_party + is_bipartisan +
num_cosponsors + num_d_cosponsors + num_r_cosponsors + title_word_count + num_amendments +
num_originating_committee_cosponsors + num_originating_committee_cosponsors_r +
num_originating_committee_cosponsors_d, data = bills_train4, family = binomial)
bills_fit2 <- glm(status2 ~ session + dow + is_sponsor_in_leadership2 + sponsor_party + is_bipartisan +
num_cosponsors + num_d_cosponsors + title_word_count + num_amendments +
num_originating_committee_cosponsors + num_originating_committee_cosponsors_r,
data = bills_train4, family = binomial)
bills_fit3 <- glm(status2 ~ session + dow + is_sponsor_in_leadership2 + sponsor_party + is_bipartisan +
num_cosponsors + title_word_count + num_amendments +
num_originating_committee_cosponsors + num_originating_committee_cosponsors_r,
data = bills_train4, family = binomial)
bills_fit4 <- glm(status2 ~ session + dow + is_sponsor_in_leadership2 + sponsor_party + is_bipartisan +
num_cosponsors + title_word_count + num_amendments +
num_originating_committee_cosponsors, data = bills_train4, family = binomial)
bills_fit5 <- glm(status2 ~ session + is_sponsor_in_leadership2 + sponsor_party +
num_cosponsors + title_word_count + num_amendments + is_bipartisan +
num_originating_committee_cosponsors, data = bills_train4, family = binomial)
bills_fit6 <- glm(status2 ~ session + is_sponsor_in_leadership2 + sponsor_party +
title_word_count + num_amendments + is_bipartisan +
num_originating_committee_cosponsors, data = bills_train4, family = binomial)
bills_fit7 <- glm(status2 ~ session + sponsor_party + title_word_count + num_amendments + is_bipartisan +
num_originating_committee_cosponsors, data = bills_train4, family = binomial)
bills_fit8 <- glm(status2 ~ session + sponsor_party + title_word_count + num_amendments + is_bipartisan,
data = bills_train4, family = binomial)
tidy(bills_fit8) %>% pander(caption = paste0(as.character(summary(bills_fit8)$call)[3],': ', as.character(summary(bills_fit8)$call)[2]))
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | -3.931 | 0.1742 | -22.56 | 9.9e-113 |
session2011-2012 | 0.4799 | 0.1391 | 3.45 | 0.0005601 |
session2013-2014 | 0.4677 | 0.1571 | 2.977 | 0.002911 |
sponsor_partyRepublican | 0.7379 | 0.1287 | 5.734 | 9.825e-09 |
title_word_count | 0.004378 | 0.001094 | 4.001 | 6.312e-05 |
num_amendments | 1.82 | 0.07698 | 23.64 | 1.516e-123 |
is_bipartisan1 | -0.5002 | 0.143 | -3.498 | 0.000468 |
glance(bills_fit8) %>% pander()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
---|---|---|---|---|---|---|
3391 | 6903 | -1243 | 2500 | 2548 | 2486 | 6897 |
We use the glmulti package to select the best model based on AIC. We tried bestglm, but it would not finish locally even with only a few explanatory variables. We start with a more limited set of variables, excluding the breakdowns by Republican and Democrat because they are linear combinations of num_cosponsors and num_originating_committee_cosponsors.
best_glm_bills <- glmulti::glmulti(status2 ~ session + dow + is_sponsor_in_leadership2 + sponsor_party +
num_cosponsors + title_word_count + num_amendments + is_bipartisan +
num_originating_committee_cosponsors,
data = bills_train4, family = binomial, crit = aic, level = 1, plotty = FALSE, report = FALSE)
glmulti returns a best model with the formula status2 ~ 1 + session + sponsor_party + is_bipartisan + title_word_count + num_amendments + num_originating_committee_cosponsors.
We find that the best model has an AIC of 2496.2076, which is only slightly smaller than the AIC of the final model we found from backwards selection, 2500.442.
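Because bills_fit7 refits exactly the formula that glmulti selected, the AIC comparison can be verified directly:
AIC(bills_fit8)  # backwards-selection model, ~2500.44
AIC(bills_fit7)  # same formula as the glmulti best model, ~2496.21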
plot(best_glm_bills, type = 's')
We plot the ROC curves for the 8 models we have created. The first 6 ROC curves, in gray, are from the backwards selection process. The blue ROC curve is the final model from backwards selection and the red ROC curve is the best glm from the glmulti package. Notably, the curves are relatively similar. This is expected because the same model type (logistic regression) is being used, as opposed to another classification model (CART, SVM, etc.).
bills_plot_roc_df <-
bills_train4 %>%
bind_cols(
data_frame(
backwards = bills_fit8$fitted
, glmulti = bills_fit7$fitted  # bills_fit7 has the same formula as the glmulti best model
, other1 = bills_fit6$fitted
, other2 = bills_fit5$fitted
, other3 = bills_fit4$fitted
, other4 = bills_fit3$fitted
, other5 = bills_fit2$fitted
, other6 = bills_fit1$fitted
)
) %>%
select(status = status2, backwards, glmulti, starts_with("other")) %>%
gather(model, fitted, -status) %>%
mutate(model = factor(model,
levels = c("other1","other2","other3","other4","other5", "other6","backwards","glmulti"))) %>%
mutate(status = as.integer(status) - 1)
ggplot(bills_plot_roc_df, aes(d = status, m = fitted, colour = model)) +
geom_roc() + style_roc() +
theme_jrf() +
labs(title = "ROC for Backwards Selection and glmulti Models") +
scale_colour_manual("Model", values = c('backwards' = pal538['blue'][[1]], 'glmulti' = pal538['red'][[1]],
'other1' = pal538['medgray'][[1]], 'other2' = pal538['medgray'][[1]],
'other3' = pal538['medgray'][[1]], 'other4' = pal538['medgray'][[1]],
'other5' = pal538['medgray'][[1]], 'other6' = pal538['medgray'][[1]]
))
Let’s review the AUC for the 8 curves.
data_frame(
model = c("other1","other2","other3","other4","other5", "other6","backwards","glmulti")
, AUC = c(auc(roc(bills_train4$status2, bills_fit1$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit2$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit3$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit4$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit5$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit6$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit8$fitted))[1]
, auc(roc(bills_train4$status2, bills_fit7$fitted))[1])
) %>%
pander()
model | AUC |
---|---|
other1 | 0.8450 |
other2 | 0.8450 |
other3 | 0.8457 |
other4 | 0.8465 |
other5 | 0.8479 |
other6 | 0.8480 |
backwards | 0.8451 |
glmulti | 0.8482 |
We perform \(K\)-fold cross validation on the training dataset with \(K = 10\) for the two models we have focused on: backwards and glmulti.
data_frame(
model = c('backwards','glmulti')
, `CV estimate of Prediction Error` = c(boot::cv.glm(bills_train4, bills_fit8, K = 10)$delta[[1]],
boot::cv.glm(bills_train4, bills_fit7, K = 10)$delta[[1]])
) %>%
pander()
model | CV estimate of Prediction Error |
---|---|
backwards | 0.04737 |
glmulti | 0.04766 |
We see these are incredibly similar models and would be satisfied with either one. The \(K\)-fold cross-validation was a means to check that we were not overfitting.
We propose a loss function with \(\frac{a_{1,0}}{a_{0,1}} = 5\) in order to penalize false negatives heavily. A bill passing is a very rare event (unfortunately), so we want to preserve the negative predictive value by keeping false negatives rare. The weighted MCE is essentially the same for the two models.
data_frame(
status = bills_train4$status2
, fitted_back = bills_fit8$fitted.values
, fitted_glmulti = bills_fit7$fitted.values
, pred_back = as.integer(fitted_back > 0.2 / (1 + 0.2))
, pred_glmulti = as.integer(fitted_glmulti > 0.2 / (1 + 0.2))
, a10_back = 5 * ifelse(status == 1 & pred_back != 1, 1, 0)
, a10_glmulti = 5 * ifelse(status == 1 & pred_glmulti != 1, 1, 0)
, a01_back = 1 * ifelse(status == 0 & pred_back != 0, 1, 0)
, a01_glmulti = 1 * ifelse(status == 0 & pred_glmulti != 0, 1, 0)
, a10_a01_back = a10_back + a01_back
, a10_a01_glmulti = a10_glmulti + a01_glmulti
) %>%
summarise(
`MCE - Backwards` = sum(a10_a01_back) / n()
, `MCE - glmulti` = sum(a10_a01_glmulti) / n()
) %>%
pander()
MCE - Backwards | MCE - glmulti |
---|---|
0.2152 | 0.2152 |
The confusion table for the backwards selection model is below.
confusion_bills <- table(bills_train4$status2, ifelse(predict(bills_fit8, type = 'response') > .5, 1, 0))
pander(descr::CrossTable(bills_train4$status2, ifelse(predict(bills_fit8, type = 'response') > .5, 1, 0), prop.chisq=FALSE, prop.r = F, prop.c = F, prop.t = TRUE, chisq = TRUE, dnn = c("Status","Prediction")), split.table = Inf, big.mark = ',')
Status | Prediction: 0 | Prediction: 1 | Total |
---|---|---|---|
0 (N / Total %) | 6392 / 92.58% | 50 / 0.72% | 6442 |
1 (N / Total %) | 339 / 4.91% | 123 / 1.78% | 462 |
Total | 6731 | 173 | 6904 |
We see that while this model does a good job of predicting when a bill will not pass (specificity \(= \frac{TN}{TN + FP} = 0.9922\)), it does not achieve the sensitivity we would hope for (\(\frac{TP}{TP + FN} = 0.2662\)).
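These rates follow directly from the confusion table; a quick sketch using confusion_bills from above:
tn <- confusion_bills["0", "0"]; fp <- confusion_bills["0", "1"]
fn <- confusion_bills["1", "0"]; tp <- confusion_bills["1", "1"]
tn / (tn + fp)  # specificity, ~0.9922
tp / (tp + fn)  # sensitivity, ~0.2662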
As our final model we select the model identified by the backwards selection procedure. We chose this model over the one identified by the glmulti package because it contains fewer explanatory variables but has nearly the same AIC value (2500.442 vs 2496.2076).
We now use the test dataset to find the MCE.
tidy_bills_fit8 <-
tidy(bills_fit8) %>%
mutate(term = gsub("_","\\\\,", term))
bills_test_pred <- ifelse(predict(bills_fit8, bills_test4, type = "response") > 0.5, 1, 0)
confusion_test_bills <- table(bills_test4$status2, bills_test_pred)
pander(descr::CrossTable(bills_test4$status2, bills_test_pred, prop.chisq=FALSE, prop.r = F, prop.c = F, prop.t = TRUE, chisq = TRUE, dnn = c("Status","Prediction")), split.table = Inf, big.mark = ',')
Status | Prediction: 0 | Prediction: 1 | Total |
---|---|---|---|
0 (N / Total %) | 918 / 91.89% | 13 / 1.30% | 931 |
1 (N / Total %) | 50 / 5.01% | 18 / 1.80% | 68 |
Total | 968 | 31 | 999 |
We see that the unweighted MCE is 0.0631. However, once again the model is good at predicting when a bill will not pass but has difficulty identifying the bills that do pass (sensitivity \(= \frac{TP}{TP + FN} = 0.2647\)).
The final model is
\[\text{logit}(\hat{p}_i) = -3.9313 + 0.4799session2011\text{-}2012 + 0.4677session2013\text{-}2014 + 0.7379sponsor\,partyRepublican + \\ 0.0044title\,word\,count + 1.8198num\,amendments - 0.5002is\,bipartisan1\]
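To illustrate how the model scores a new bill, here is a hypothetical example (the bill's characteristics are invented for illustration):
new_bill <- data.frame(
  session = "2013-2014", sponsor_party = "Republican",
  title_word_count = 30, num_amendments = 2,
  is_bipartisan = factor(1, levels = c(0, 1))
)
predict(bills_fit8, new_bill, type = 'response')  # ~0.63 under these assumed values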
summary_vars_bills <-
data_frame(
variable = rownames(coef(summary(bills_fit8))[-1, ])
, p_value = coef(summary(bills_fit8))[-1, 4]
) %>%
mutate(
estimate = coef(summary(bills_fit8))[-1, 1]
, odds = exp(estimate)
, prob = odds / (1 + odds)
, percentage = paste0(round(prob * 100, 2), "%")
) %>%
arrange(p_value)
summary_vars_bills %>% select(-percentage)
Holding all other variables constant, each estimate above is the change in the log-odds of passage for a one-unit increase in (or the presence of) that feature.
In considering how to improve this model further, we would want to consider gathering additional features about the bills.
We think that such features could potentially help improve the classifier. However, we also recognize that a different family of classifiers might be helpful for this type of problem, particularly CART.