1.Preparing the data
library(readr)
knitr::opts_chunk$set(echo = TRUE)
data<-read.csv("Downloads/bank.csv", sep = ";")
df <- data.frame(data)
Using RF and LPM to predict y with 100 bootstrap loops: 1. Report the test AUC and its 95% CI for both models. 2. Report the variable importance by MDI for the top 6 predictors from 100 runs. (You need to save each random forest model in your 100 loops). 3. What’s the OOB confusion matrix after 100 runs of random forest. 4. Comparing the test AUCs and OOB AUCs. Note: Attribute rf$votes reports OOB results for the training data.
library(PASWR)
## Warning: package 'PASWR' was built under R version 4.2.3
## Loading required package: lattice
library(ROCR)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(rpart)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#RF
dff <- df[complete.cases(df),] # for rf
n <- 10
B <- 100
AUCRF <- c()
auc.lpm <- c()
AUCoob <- c()
for (i in 1:n) {
ind <- sample(nrow(dff), nrow(dff), replace = TRUE)
train2 <- dff[ind, ]
test2 <- dff[-ind, ]
#RF model
model3 <- randomForest(y~.,
ntree = B, data = train2) # RF
phat3 <- predict(model3, test2, type = "prob")
# Fit the linear probability model
lpm <- glm(y ~ ., data = train2, family = binomial(link = "probit"), maxit = 1000) #had a warning that the algorithm did not converge so i added the maxit
phat4 <- predict(lpm, newdata = test2, type = "response")
#phatoob <- model3$votes
#AUC3 for RF
pred_rocr3 <- prediction(phat3[,2], test2$y)
auc_ROCR3 <- performance(pred_rocr3, measure = "auc")
AUCRF[i] <- auc_ROCR3@y.values[[1]]
#OOBAUC
#pred_oob <- prediction(phatoob[,2], model3$predicted) #check this i dont think its correct
#auc_oob <- performance(pred_oob, measure = "auc")
#AUCoob[i] <- auc_oob@y.values[[1]]
#AUClpm FOR lpm
pred_rocr4 <- prediction(phat4, test2$y)
auc_ROCR4 <- performance(pred_rocr4, measure = "auc")
auc.lpm[i] <- auc_ROCR4@y.values[[1]]
}
mean(AUCRF)
## [1] 0.9145768
sd(AUCRF)
## [1] 0.008897272
mean(auc.lpm)
## [1] 0.8901227
sd(auc.lpm)
## [1] 0.01022751
#oob
#mean(AUCoob)
#sd(AUCoob)
# Compute the 95% CI for the AUCs
ci.rf <- quantile(AUCRF, c(0.025, 0.975))
ci.lpm <- quantile(auc.lpm, c(0.025, 0.975))
ci.rf
## 2.5% 97.5%
## 0.9041017 0.9276355
ci.lpm
## 2.5% 97.5%
## 0.8730853 0.9043056
#Reporting the variable importance by MDI for the top 6 predictors from 100 runs.
imp <- model3$importance
top6 <- imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE)[1:6], , drop = FALSE]
print(top6)
## MeanDecreaseGini
## duration 253.36965
## month 115.66402
## balance 85.69167
## age 83.59149
## day 80.60128
## job 78.90272
3.OOB confusion matrix after 100 runs of random forest.
# ADD THE CONFUSI0n TABLE
yhat <- ifelse(phat3 > 0.5, 1, 0)
# 5. Build the confusion table.
ct <- table(test2$y, yhat[,2])
# This function take `ct` and make a better table
gct <- function(x) {
conf_table <- matrix(0, 2, 2)
conf_table[1, 1] <- ifelse(sum(dim(x)) > 3, x[2, 2], 0)
conf_table[2, 2] <- ifelse(sum(dim(x)) > 3, x[1, 1], 0)
conf_table[1, 2] <- ifelse(sum(dim(x)) > 3, x[2, 1], 0)
conf_table[2, 1] <- ifelse(sum(dim(x)) > 3, x[1, 2], 0)
colnames(conf_table) <- c("Y=1", "Y=0")
rownames(conf_table) <- c("phat=1", "phat=0")
conf_table
}
cont <- gct(ct)
cont
## Y=1 Y=0
## phat=1 74 116
## phat=0 35 1440
#Compare the test AUCs and OOB AUCs. Note that attribute rf$votes reports OOB results for the training data.
FF <- model3$votes
FF
AUCOOB <- c()
pred_rocrr <- prediction(FF[,2], model3$y)
auc_ROCROOB <- performance(pred_rocrr, measure = "auc")
AUCOOB[i] <- auc_ROCROOB@y.values[[1]]
mean(AUCOOB)
## [1] NA
Yy <-ifelse(df$y == "yes", 1, 0)
newdata <- cbind(df, Yy)
newdata <- newdata[, -17]
newdata$Yy <- as.numeric(newdata$Yy)
## Full grid search with external boostrapping
library(gbm)
## Warning: package 'gbm' was built under R version 4.2.3
## Loaded gbm 2.1.8.1
library(dplyr)
h <- seq(0.01, 0.1, 0.01) #SHRINKAGE
B <- seq(100, 300, 50) #how many treees
D <- 1:2 #depth
grid <- as.data.frame(expand.grid(D, B, h))
colnames(grid) <- c("D", "B", "h")
grid$AUC <- rep(0, nrow(grid))
auc <-c()
for(i in 1:nrow(grid)) {
auc <- c()
for (j in 1:2) {
try({
ind <- sample(nrow(newdata), nrow(newdata), replace = TRUE)
train <- newdata[ind,]
val <- newdata[-ind,]
boos <- gbm(
Yy ~ .,
distribution = "bernoulli",
interaction.depth = grid[i, 1],
n.trees = grid[i, 2],
shrinkage = grid[i, 3],
bag.fraction = 1,
data = train
)
prboos <- predict(boos, val, n.trees = grid[i, 2], type = "response")
pred_rocr <- prediction(prboos, val$Yy)
auc_rocr <- performance(pred_rocr, measure = "auc")
auc[j] <- auc_rocr@y.values[[1]]
}, silent = TRUE
)
}
grid$AUC[i] <- mean(auc)
}
best <- head(arrange(grid, desc(AUC)))
best
## D B h AUC
## 1 2 200 0.07 0.9142666
## 2 2 250 0.07 0.9098041
## 3 2 150 0.09 0.9096120
## 4 2 250 0.05 0.9088931
## 5 2 300 0.05 0.9085689
## 6 2 200 0.05 0.9071892
plot(best)
#best predictors
summary.gbm(boos)
## var rel.inf
## duration duration 38.2875060
## month month 25.5405588
## poutcome poutcome 13.4454036
## job job 4.6218397
## pdays pdays 3.7136039
## balance balance 3.1189176
## age age 3.0186497
## day day 2.5445164
## previous previous 1.3900406
## contact contact 1.2641744
## marital marital 1.1132746
## campaign campaign 0.7975131
## education education 0.5685828
## housing housing 0.3489388
## default default 0.1746398
## loan loan 0.0518403
#ADABOOST
#change y to 1, -1 but 1,0 with gbm
##EVERYTHING IS THE SAME EXCEPT THE DISTRIBUTION
library(gbm)
h <- seq(0.01, 0.1, 0.01) #SHRINKAGE
B <- seq(100, 300, 50) #how many treees
D <- 1:2 #depth
grid <- as.data.frame(expand.grid(D, B, h))
colnames(grid) <- c("D", "B", "h")
grid$AUC <- rep(0, nrow(grid))
mse <-c()
sdmse <-c()
for(i in 1:nrow(grid)) {
auc <- c()
for (j in 1:10) {
try({
ind <- sample(nrow(newdata), nrow(newdata), replace = TRUE)
train <- newdata[ind,]
val <- newdata[-ind,]
model_ada <- gbm(
Yy ~ .,
distribution = "adaboost",
interaction.depth = grid[i, 1],
n.trees = grid[i, 2],
shrinkage = grid[i, 3],
bag.fraction = 1,
data = train
)
prboos <- predict(boos, val, n.trees = grid[i, 2], type = "response")
pred_rocr <- prediction(prboos, val$Yy)
auc_rocr <- performance(pred_rocr, measure = "auc")
auc[j] <- auc_rocr@y.values[[1]]
}, silent = TRUE
)
}
grid$AUC[i] <- mean(auc)
}
best1 <- head(arrange(grid, desc(AUC)))
best1
## D B h AUC
## 1 1 300 0.09 0.9409896
## 2 1 250 0.07 0.9402929
## 3 2 300 0.03 0.9395809
## 4 1 300 0.02 0.9383634
## 5 1 300 0.03 0.9382302
## 6 2 300 0.01 0.9378086
#best predictions
summary.gbm(model_ada)
## var rel.inf
## duration duration 44.2651091
## month month 21.3367858
## poutcome poutcome 15.3910881
## balance balance 3.8550096
## pdays pdays 3.3117235
## contact contact 2.7888737
## job job 2.5910143
## age age 1.8989087
## day day 1.8526680
## marital marital 1.0615889
## education education 0.4834577
## default default 0.3587141
## loan loan 0.2987125
## previous previous 0.2134945
## campaign campaign 0.1928412
## housing housing 0.1000102
##JOUSBOOST USING 1,-1
Y <-ifelse(df$y == "yes", 1, -1)
bstdata <- cbind(df, Y)
bstdata <- bstdata[, -17]
#DF$Y <- ifelse(df$y == 0, -1, 1)
#adaboost X's as a matrix
library(mltools)
## Warning: package 'mltools' was built under R version 4.2.3
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
df_hot <- one_hot(as.data.table(bstdata))
library(JOUSBoost)
## Warning: package 'JOUSBoost' was built under R version 4.2.3
## JOUSBoost 2.1.0
#MAUC <- c()
aucc <- c()
for (i in 1:10) {
ind <- sample(nrow(df_hot), nrow(df_hot), replace = TRUE)
train <- df_hot[ind, ]
val <- df_hot[-ind,]
ada <- adaboost(as.matrix(train[, -"Y"]),
train$Y, tree_depth = 1,
n_rounds = 100)
phat <- predict(ada, val, type = "prob")
pred_rocR <- prediction(phat, val$Y)
auc_ROCR <- performance(pred_rocR, measure = "auc")
aucc[i] <- auc_ROCR@y.values[[1]]
}
mean(aucc)
## [1] 0.8847019
sd(aucc)
## [1] 0.007089726
#MAUC[r] <- mean(aucc)