Portuguese banking institution Problem

1.Preparing the data

library(readr)
knitr::opts_chunk$set(echo = TRUE)

data<-read.csv("Downloads/bank.csv", sep = ";")

df <- data.frame(data)

Random Forest and LPM

Using RF and LPM to predict y with 100 bootstrap loops: 1. Report the test AUC and its 95% CI for both models. 2. Report the variable importance by MDI for the top 6 predictors from 100 runs. (You need to save each random forest model in your 100 loops). 3. What’s the OOB confusion matrix after 100 runs of random forest. 4. Comparing the test AUCs and OOB AUCs. Note: Attribute rf$votes reports OOB results for the training data.

library(PASWR)

## Warning: package 'PASWR' was built under R version 4.2.3

## Loading required package: lattice

library(ROCR)
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.2.3

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

library(rpart)
library(ISLR)

## Warning: package 'ISLR' was built under R version 4.2.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#RF
dff <- df[complete.cases(df),] # for rf

n <- 10
B <- 100
AUCRF <- c()
auc.lpm <- c()
AUCoob <- c()

for (i in 1:n) {
  ind <- sample(nrow(dff), nrow(dff), replace = TRUE)
  train2 <- dff[ind, ]
  test2 <- dff[-ind, ]
  #RF model
  model3 <- randomForest(y~.,
                         ntree = B, data = train2) # RF    
  
  phat3 <- predict(model3, test2, type = "prob")
  
  # Fit the linear probability model
  lpm <- glm(y ~ ., data = train2, family = binomial(link = "probit"), maxit =   1000) #had a warning that the algorithm did not converge so i added the maxit
  
  phat4 <- predict(lpm, newdata = test2, type = "response")
  #phatoob <- model3$votes
  
  #AUC3 for RF
  pred_rocr3 <- prediction(phat3[,2], test2$y)
  auc_ROCR3 <- performance(pred_rocr3, measure = "auc")
  AUCRF[i] <- auc_ROCR3@y.values[[1]]
  
  #OOBAUC
  #pred_oob <- prediction(phatoob[,2], model3$predicted) #check this i dont   think its correct
  #auc_oob <- performance(pred_oob, measure = "auc")
  #AUCoob[i] <- auc_oob@y.values[[1]]
  
  #AUClpm FOR lpm
  pred_rocr4 <- prediction(phat4, test2$y)
  auc_ROCR4 <- performance(pred_rocr4, measure = "auc")
  auc.lpm[i] <- auc_ROCR4@y.values[[1]]
  

}


mean(AUCRF)

## [1] 0.9145768

sd(AUCRF)

## [1] 0.008897272

mean(auc.lpm)

## [1] 0.8901227

sd(auc.lpm)

## [1] 0.01022751

#oob
#mean(AUCoob)
#sd(AUCoob)

# Compute the 95% CI for the AUCs
ci.rf <- quantile(AUCRF, c(0.025, 0.975))
ci.lpm <- quantile(auc.lpm, c(0.025, 0.975))


ci.rf

##      2.5%     97.5% 
## 0.9041017 0.9276355

ci.lpm

##      2.5%     97.5% 
## 0.8730853 0.9043056

#Reporting the variable importance by MDI for the top 6 predictors from 100 runs.

imp <- model3$importance
top6 <- imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE)[1:6], , drop = FALSE]
print(top6)

##          MeanDecreaseGini
## duration        253.36965
## month           115.66402
## balance          85.69167
## age              83.59149
## day              80.60128
## job              78.90272

3.OOB confusion matrix after 100 runs of random forest.

# ADD THE CONFUSI0n TABLE
yhat <- ifelse(phat3 > 0.5, 1, 0)

# 5. Build the confusion table.

ct <- table(test2$y, yhat[,2])

# This function take `ct` and make a better table
gct <- function(x) {
  conf_table <- matrix(0, 2, 2)
  conf_table[1, 1] <- ifelse(sum(dim(x)) > 3, x[2, 2], 0)
  conf_table[2, 2] <- ifelse(sum(dim(x)) > 3, x[1, 1], 0)
  conf_table[1, 2] <- ifelse(sum(dim(x)) > 3, x[2, 1], 0)
  conf_table[2, 1] <- ifelse(sum(dim(x)) > 3, x[1, 2], 0)
  colnames(conf_table) <- c("Y=1", "Y=0")
  rownames(conf_table) <- c("phat=1", "phat=0")
  conf_table
}

cont <- gct(ct)
cont

##        Y=1  Y=0
## phat=1  74  116
## phat=0  35 1440

#Compare the test AUCs and OOB AUCs. Note that attribute rf$votes reports OOB results for the training data.

FF <- model3$votes
FF

AUCOOB <- c()

pred_rocrr <- prediction(FF[,2], model3$y)
  auc_ROCROOB <- performance(pred_rocrr, measure = "auc")
  AUCOOB[i] <- auc_ROCROOB@y.values[[1]]

mean(AUCOOB)

## [1] NA

Yy <-ifelse(df$y == "yes", 1, 0)
newdata <- cbind(df, Yy)
newdata <- newdata[, -17]


newdata$Yy <- as.numeric(newdata$Yy)

## Full grid search with external boostrapping

library(gbm)

## Warning: package 'gbm' was built under R version 4.2.3

## Loaded gbm 2.1.8.1

library(dplyr)

h <- seq(0.01, 0.1, 0.01) #SHRINKAGE
B <- seq(100, 300, 50) #how many treees
D <- 1:2 #depth
grid <- as.data.frame(expand.grid(D, B, h))
colnames(grid) <- c("D", "B", "h")
grid$AUC <- rep(0, nrow(grid))

auc <-c()


for(i in 1:nrow(grid)) {
  auc <- c()
  for (j in 1:2) {
    try({
      ind <- sample(nrow(newdata), nrow(newdata), replace = TRUE)
      train <- newdata[ind,]
      val <- newdata[-ind,] 
      boos <- gbm(
        Yy ~ .,
        distribution = "bernoulli",
        interaction.depth = grid[i, 1],
        n.trees = grid[i, 2],
        shrinkage = grid[i, 3],
        bag.fraction = 1,
        data = train
      )
      prboos <- predict(boos, val, n.trees = grid[i, 2], type = "response")
      
      pred_rocr <- prediction(prboos, val$Yy)
      auc_rocr <- performance(pred_rocr, measure = "auc")
      auc[j] <- auc_rocr@y.values[[1]]
    }, silent = TRUE
    )
  }
  grid$AUC[i] <- mean(auc)
}
best <- head(arrange(grid, desc(AUC)))
best

##   D   B    h       AUC
## 1 2 200 0.07 0.9142666
## 2 2 250 0.07 0.9098041
## 3 2 150 0.09 0.9096120
## 4 2 250 0.05 0.9088931
## 5 2 300 0.05 0.9085689
## 6 2 200 0.05 0.9071892

plot(best)

#best predictors
summary.gbm(boos)

##                 var    rel.inf
## duration   duration 38.2875060
## month         month 25.5405588
## poutcome   poutcome 13.4454036
## job             job  4.6218397
## pdays         pdays  3.7136039
## balance     balance  3.1189176
## age             age  3.0186497
## day             day  2.5445164
## previous   previous  1.3900406
## contact     contact  1.2641744
## marital     marital  1.1132746
## campaign   campaign  0.7975131
## education education  0.5685828
## housing     housing  0.3489388
## default     default  0.1746398
## loan           loan  0.0518403

Apply AdaBoost.M1 with gbm(adaboost) and adaboost() from the JOUSBoost package. For gbm, follow the same 2 steps in (3) and report your results. For JOUSBoost, make sure that you follow the requirements of adaboost().

#ADABOOST 
#change y to 1, -1 but 1,0 with gbm
##EVERYTHING IS THE SAME EXCEPT THE DISTRIBUTION

library(gbm)

h <- seq(0.01, 0.1, 0.01) #SHRINKAGE
B <- seq(100, 300, 50) #how many treees
D <- 1:2 #depth
grid <- as.data.frame(expand.grid(D, B, h))
colnames(grid) <- c("D", "B", "h")
grid$AUC <- rep(0, nrow(grid))

mse <-c()
sdmse <-c()

for(i in 1:nrow(grid)) {
  auc <- c()
  for (j in 1:10) {
    try({
      ind <- sample(nrow(newdata), nrow(newdata), replace = TRUE)
      train <- newdata[ind,]
      val <- newdata[-ind,]
      model_ada <- gbm(
        Yy ~ .,
        distribution = "adaboost",
        interaction.depth = grid[i, 1],
        n.trees = grid[i, 2],
        shrinkage = grid[i, 3],
        bag.fraction = 1,
        data = train
      )
      prboos <- predict(boos, val, n.trees = grid[i, 2], type = "response")
      
      pred_rocr <- prediction(prboos, val$Yy)
      auc_rocr <- performance(pred_rocr, measure = "auc")
      auc[j] <- auc_rocr@y.values[[1]]
    }, silent = TRUE
    )
  }
  grid$AUC[i] <- mean(auc)
}
best1 <- head(arrange(grid, desc(AUC)))
best1

##   D   B    h       AUC
## 1 1 300 0.09 0.9409896
## 2 1 250 0.07 0.9402929
## 3 2 300 0.03 0.9395809
## 4 1 300 0.02 0.9383634
## 5 1 300 0.03 0.9382302
## 6 2 300 0.01 0.9378086

#best predictions
summary.gbm(model_ada)

##                 var    rel.inf
## duration   duration 44.2651091
## month         month 21.3367858
## poutcome   poutcome 15.3910881
## balance     balance  3.8550096
## pdays         pdays  3.3117235
## contact     contact  2.7888737
## job             job  2.5910143
## age             age  1.8989087
## day             day  1.8526680
## marital     marital  1.0615889
## education education  0.4834577
## default     default  0.3587141
## loan           loan  0.2987125
## previous   previous  0.2134945
## campaign   campaign  0.1928412
## housing     housing  0.1000102

##JOUSBOOST USING 1,-1
Y <-ifelse(df$y == "yes", 1, -1)
bstdata <- cbind(df, Y)
bstdata <- bstdata[, -17]

#DF$Y <- ifelse(df$y == 0, -1, 1)

#adaboost X's as a matrix
library(mltools)

## Warning: package 'mltools' was built under R version 4.2.3

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

df_hot <- one_hot(as.data.table(bstdata))

library(JOUSBoost)

## Warning: package 'JOUSBoost' was built under R version 4.2.3

## JOUSBoost 2.1.0

#MAUC <- c()
aucc <- c()

for (i in 1:10) {
  ind <- sample(nrow(df_hot), nrow(df_hot), replace = TRUE)
  train <- df_hot[ind, ]
  val <- df_hot[-ind,]
  
  ada <- adaboost(as.matrix(train[, -"Y"]),
                  train$Y, tree_depth = 1,
                  n_rounds = 100)
  
  phat <- predict(ada, val, type = "prob")
    
    pred_rocR <- prediction(phat, val$Y)
    auc_ROCR <- performance(pred_rocR, measure = "auc")
    aucc[i] <- auc_ROCR@y.values[[1]]
}

mean(aucc)

## [1] 0.8847019

sd(aucc)

## [1] 0.007089726

#MAUC[r] <- mean(aucc)

Portuguese banking institution Problem

Sourabh Joshi

2023-04-17