1.Preparing the data

library(readr)
knitr::opts_chunk$set(echo = TRUE)

data<-read.csv("Downloads/bank.csv", sep = ";")
df <- data.frame(data)
  1. Random Forest and LPM

Using RF and LPM to predict y with 100 bootstrap loops: 1. Report the test AUC and its 95% CI for both models. 2. Report the variable importance by MDI for the top 6 predictors from 100 runs. (You need to save each random forest model in your 100 loops). 3. What’s the OOB confusion matrix after 100 runs of random forest. 4. Comparing the test AUCs and OOB AUCs. Note: Attribute rf$votes reports OOB results for the training data.

library(PASWR)
## Warning: package 'PASWR' was built under R version 4.2.3
## Loading required package: lattice
library(ROCR)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(rpart)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#RF
dff <- df[complete.cases(df),] # for rf

n <- 10
B <- 100
AUCRF <- c()
auc.lpm <- c()
AUCoob <- c()

for (i in 1:n) {
  ind <- sample(nrow(dff), nrow(dff), replace = TRUE)
  train2 <- dff[ind, ]
  test2 <- dff[-ind, ]
  #RF model
  model3 <- randomForest(y~.,
                         ntree = B, data = train2) # RF    
  
  phat3 <- predict(model3, test2, type = "prob")
  
  # Fit the linear probability model
  lpm <- glm(y ~ ., data = train2, family = binomial(link = "probit"), maxit =   1000) #had a warning that the algorithm did not converge so i added the maxit
  
  phat4 <- predict(lpm, newdata = test2, type = "response")
  #phatoob <- model3$votes
  
  #AUC3 for RF
  pred_rocr3 <- prediction(phat3[,2], test2$y)
  auc_ROCR3 <- performance(pred_rocr3, measure = "auc")
  AUCRF[i] <- auc_ROCR3@y.values[[1]]
  
  #OOBAUC
  #pred_oob <- prediction(phatoob[,2], model3$predicted) #check this i dont   think its correct
  #auc_oob <- performance(pred_oob, measure = "auc")
  #AUCoob[i] <- auc_oob@y.values[[1]]
  
  #AUClpm FOR lpm
  pred_rocr4 <- prediction(phat4, test2$y)
  auc_ROCR4 <- performance(pred_rocr4, measure = "auc")
  auc.lpm[i] <- auc_ROCR4@y.values[[1]]
  

}


mean(AUCRF)
## [1] 0.9145768
sd(AUCRF)
## [1] 0.008897272
mean(auc.lpm)
## [1] 0.8901227
sd(auc.lpm)
## [1] 0.01022751
#oob
#mean(AUCoob)
#sd(AUCoob)
# Compute the 95% CI for the AUCs
ci.rf <- quantile(AUCRF, c(0.025, 0.975))
ci.lpm <- quantile(auc.lpm, c(0.025, 0.975))


ci.rf
##      2.5%     97.5% 
## 0.9041017 0.9276355
ci.lpm
##      2.5%     97.5% 
## 0.8730853 0.9043056

#Reporting the variable importance by MDI for the top 6 predictors from 100 runs.

imp <- model3$importance
top6 <- imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE)[1:6], , drop = FALSE]
print(top6)
##          MeanDecreaseGini
## duration        253.36965
## month           115.66402
## balance          85.69167
## age              83.59149
## day              80.60128
## job              78.90272

3.OOB confusion matrix after 100 runs of random forest.

# ADD THE CONFUSI0n TABLE
yhat <- ifelse(phat3 > 0.5, 1, 0)

# 5. Build the confusion table.

ct <- table(test2$y, yhat[,2])

# This function take `ct` and make a better table
gct <- function(x) {
  conf_table <- matrix(0, 2, 2)
  conf_table[1, 1] <- ifelse(sum(dim(x)) > 3, x[2, 2], 0)
  conf_table[2, 2] <- ifelse(sum(dim(x)) > 3, x[1, 1], 0)
  conf_table[1, 2] <- ifelse(sum(dim(x)) > 3, x[2, 1], 0)
  conf_table[2, 1] <- ifelse(sum(dim(x)) > 3, x[1, 2], 0)
  colnames(conf_table) <- c("Y=1", "Y=0")
  rownames(conf_table) <- c("phat=1", "phat=0")
  conf_table
}

cont <- gct(ct)
cont
##        Y=1  Y=0
## phat=1  74  116
## phat=0  35 1440

#Compare the test AUCs and OOB AUCs. Note that attribute rf$votes reports OOB results for the training data.

FF <- model3$votes
FF
AUCOOB <- c()

pred_rocrr <- prediction(FF[,2], model3$y)
  auc_ROCROOB <- performance(pred_rocrr, measure = "auc")
  AUCOOB[i] <- auc_ROCROOB@y.values[[1]]

mean(AUCOOB)
## [1] NA
Yy <-ifelse(df$y == "yes", 1, 0)
newdata <- cbind(df, Yy)
newdata <- newdata[, -17]


newdata$Yy <- as.numeric(newdata$Yy)
## Full grid search with external boostrapping

library(gbm)
## Warning: package 'gbm' was built under R version 4.2.3
## Loaded gbm 2.1.8.1
library(dplyr)

h <- seq(0.01, 0.1, 0.01) #SHRINKAGE
B <- seq(100, 300, 50) #how many treees
D <- 1:2 #depth
grid <- as.data.frame(expand.grid(D, B, h))
colnames(grid) <- c("D", "B", "h")
grid$AUC <- rep(0, nrow(grid))

auc <-c()


for(i in 1:nrow(grid)) {
  auc <- c()
  for (j in 1:2) {
    try({
      ind <- sample(nrow(newdata), nrow(newdata), replace = TRUE)
      train <- newdata[ind,]
      val <- newdata[-ind,] 
      boos <- gbm(
        Yy ~ .,
        distribution = "bernoulli",
        interaction.depth = grid[i, 1],
        n.trees = grid[i, 2],
        shrinkage = grid[i, 3],
        bag.fraction = 1,
        data = train
      )
      prboos <- predict(boos, val, n.trees = grid[i, 2], type = "response")
      
      pred_rocr <- prediction(prboos, val$Yy)
      auc_rocr <- performance(pred_rocr, measure = "auc")
      auc[j] <- auc_rocr@y.values[[1]]
    }, silent = TRUE
    )
  }
  grid$AUC[i] <- mean(auc)
}
best <- head(arrange(grid, desc(AUC)))
best
##   D   B    h       AUC
## 1 2 200 0.07 0.9142666
## 2 2 250 0.07 0.9098041
## 3 2 150 0.09 0.9096120
## 4 2 250 0.05 0.9088931
## 5 2 300 0.05 0.9085689
## 6 2 200 0.05 0.9071892
plot(best)

#best predictors
summary.gbm(boos)

##                 var    rel.inf
## duration   duration 38.2875060
## month         month 25.5405588
## poutcome   poutcome 13.4454036
## job             job  4.6218397
## pdays         pdays  3.7136039
## balance     balance  3.1189176
## age             age  3.0186497
## day             day  2.5445164
## previous   previous  1.3900406
## contact     contact  1.2641744
## marital     marital  1.1132746
## campaign   campaign  0.7975131
## education education  0.5685828
## housing     housing  0.3489388
## default     default  0.1746398
## loan           loan  0.0518403
  1. Apply AdaBoost.M1 with gbm(adaboost) and adaboost() from the JOUSBoost package. For gbm, follow the same 2 steps in (3) and report your results. For JOUSBoost, make sure that you follow the requirements of adaboost().
#ADABOOST 
#change y to 1, -1 but 1,0 with gbm
##EVERYTHING IS THE SAME EXCEPT THE DISTRIBUTION

library(gbm)

h <- seq(0.01, 0.1, 0.01) #SHRINKAGE
B <- seq(100, 300, 50) #how many treees
D <- 1:2 #depth
grid <- as.data.frame(expand.grid(D, B, h))
colnames(grid) <- c("D", "B", "h")
grid$AUC <- rep(0, nrow(grid))

mse <-c()
sdmse <-c()

for(i in 1:nrow(grid)) {
  auc <- c()
  for (j in 1:10) {
    try({
      ind <- sample(nrow(newdata), nrow(newdata), replace = TRUE)
      train <- newdata[ind,]
      val <- newdata[-ind,]
      model_ada <- gbm(
        Yy ~ .,
        distribution = "adaboost",
        interaction.depth = grid[i, 1],
        n.trees = grid[i, 2],
        shrinkage = grid[i, 3],
        bag.fraction = 1,
        data = train
      )
      prboos <- predict(boos, val, n.trees = grid[i, 2], type = "response")
      
      pred_rocr <- prediction(prboos, val$Yy)
      auc_rocr <- performance(pred_rocr, measure = "auc")
      auc[j] <- auc_rocr@y.values[[1]]
    }, silent = TRUE
    )
  }
  grid$AUC[i] <- mean(auc)
}
best1 <- head(arrange(grid, desc(AUC)))
best1
##   D   B    h       AUC
## 1 1 300 0.09 0.9409896
## 2 1 250 0.07 0.9402929
## 3 2 300 0.03 0.9395809
## 4 1 300 0.02 0.9383634
## 5 1 300 0.03 0.9382302
## 6 2 300 0.01 0.9378086
#best predictions
summary.gbm(model_ada)

##                 var    rel.inf
## duration   duration 44.2651091
## month         month 21.3367858
## poutcome   poutcome 15.3910881
## balance     balance  3.8550096
## pdays         pdays  3.3117235
## contact     contact  2.7888737
## job             job  2.5910143
## age             age  1.8989087
## day             day  1.8526680
## marital     marital  1.0615889
## education education  0.4834577
## default     default  0.3587141
## loan           loan  0.2987125
## previous   previous  0.2134945
## campaign   campaign  0.1928412
## housing     housing  0.1000102
##JOUSBOOST USING 1,-1
Y <-ifelse(df$y == "yes", 1, -1)
bstdata <- cbind(df, Y)
bstdata <- bstdata[, -17]

#DF$Y <- ifelse(df$y == 0, -1, 1)

#adaboost X's as a matrix
library(mltools)
## Warning: package 'mltools' was built under R version 4.2.3
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
df_hot <- one_hot(as.data.table(bstdata))

library(JOUSBoost)
## Warning: package 'JOUSBoost' was built under R version 4.2.3
## JOUSBoost 2.1.0
#MAUC <- c()
aucc <- c()

for (i in 1:10) {
  ind <- sample(nrow(df_hot), nrow(df_hot), replace = TRUE)
  train <- df_hot[ind, ]
  val <- df_hot[-ind,]
  
  ada <- adaboost(as.matrix(train[, -"Y"]),
                  train$Y, tree_depth = 1,
                  n_rounds = 100)
  
  phat <- predict(ada, val, type = "prob")
    
    pred_rocR <- prediction(phat, val$Y)
    auc_ROCR <- performance(pred_rocR, measure = "auc")
    aucc[i] <- auc_ROCR@y.values[[1]]
}
mean(aucc)
## [1] 0.8847019
sd(aucc)
## [1] 0.007089726
#MAUC[r] <- mean(aucc)