PART 1: Downloading data and pre-processing
# Load necessary libraries
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.2.3
# Set the working directory to the location of the CSV files
setwd("D:/Sourabh")
white_wine <- read.csv("winequality-white.csv", sep = ";")
red_wine <- read.csv("winequality-red.csv", sep = ";")
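If the two CSVs are not already on disk, they can be fetched straight from the UCI Machine Learning Repository first (a sketch, not part of the original run; the URLs assume UCI's long-standing layout for the Wine Quality data set):
# Fetch the Wine Quality CSVs from UCI (skip if the files already exist locally)
base_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality"
download.file(file.path(base_url, "winequality-white.csv"), "winequality-white.csv")
download.file(file.path(base_url, "winequality-red.csv"), "winequality-red.csv")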
# Check if there are any missing values
anyNA(white_wine)
## [1] FALSE
anyNA(red_wine)
## [1] FALSE
# Checking that no variables are stored as character (all should be numeric)
str(white_wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
str(red_wine)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
Neither data frame has any missing values, and every variable is numeric or integer, so the data is clean.
# Copy the data frame for the regression task
white_wine_reg <- white_wine
# Part 2: Predicting the wine quality
# Random forests also estimate which variables are important for prediction.
# By drawing a random sample of features at each split, they grow a more
# diverse set of trees, which lessens tree correlation relative to bagged
# trees and often dramatically increases predictive power.
# Set up train and test data for the random forest and the linear model and
# compare their mean RMSPE, treating LM as the parametric model and the
# random forest as the non-parametric one.
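The per-split feature sampling described above is governed by randomForest's mtry argument, which for regression defaults to floor(p/3). A minimal sketch of setting it explicitly (rf_demo is illustrative only, not part of the comparison below):
p <- ncol(white_wine_reg) - 1   # 11 candidate predictors
rf_demo <- randomForest(quality ~ ., data = white_wine_reg,
                        ntree = 100, mtry = floor(p / 3))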
RMSPE_reg <- c()
RMSPE_lm_reg<-c()
n = 10
for (i in 1:n) {
  # Bootstrap sample; the out-of-bag rows serve as the test set
  # (results vary from run to run without set.seed())
  ind_reg <- sample(nrow(white_wine_reg), nrow(white_wine_reg), replace = TRUE)
  train_reg <- white_wine_reg[ind_reg, ]
  test_reg <- white_wine_reg[-ind_reg, ]
  model_reg <- randomForest(quality ~ ., ntree = 10, data = train_reg)
  yhat_reg <- predict(model_reg, test_reg)
  RMSPE_reg[i] <- sqrt(mean((test_reg$quality - yhat_reg)^2))
  model_lm_reg <- lm(quality ~ ., train_reg)
  yhat_lm_reg <- predict(model_lm_reg, test_reg)
  RMSPE_lm_reg[i] <- sqrt(mean((test_reg$quality - yhat_lm_reg)^2))
}
mean(RMSPE_reg)
## [1] 0.6560953
sd(RMSPE_reg)
## [1] 0.01466302
mean(RMSPE_lm_reg)
## [1] 0.7558754
sd(RMSPE_lm_reg)
## [1] 0.009927987
# The mean RMSPE of the random forest is lower than that of the linear model,
# so the random forest is the more suitable model here
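Because each iteration scores both models on the same bootstrap split, the two RMSPE vectors are paired, so a paired t-test (a sketch, not part of the original run) can confirm that the gap is systematic rather than sampling noise:
# Paired comparison of the 10 bootstrap RMSPEs
t.test(RMSPE_reg, RMSPE_lm_reg, paired = TRUE)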
# Exploring important variables in the data
library(randomForestExplainer)
## Warning: package 'randomForestExplainer' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
varImpPlot(model_reg)
importance_frame <- measure_importance(model_reg)
## [1] "Warning: your forest does not contain information on local importance so 'mse_increase' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
importance_frame
## variable mean_min_depth no_of_nodes node_purity_increase
## 1 alcohol 1.8 1146 547.8835
## 2 chlorides 1.6 1155 375.7166
## 3 citric.acid 2.5 1193 243.2070
## 4 density 1.6 1287 534.4405
## 5 fixed.acidity 2.6 1085 232.6449
## 6 free.sulfur.dioxide 2.2 1232 363.0204
## 7 pH 3.9 1234 244.9899
## 8 residual.sugar 2.8 1256 307.1883
## 9 sulphates 3.5 1214 228.5781
## 10 total.sulfur.dioxide 3.5 1294 260.8903
## 11 volatile.acidity 2.3 1127 412.6473
## no_of_trees times_a_root p_value
## 1 10 0 0.957289056
## 2 10 5 0.925654604
## 3 10 0 0.612692173
## 4 10 3 0.005681307
## 5 10 0 0.999845391
## 6 10 0 0.186597699
## 7 10 0 0.170916171
## 8 10 0 0.053791156
## 9 10 0 0.363636651
## 10 10 0 0.003084565
## 11 10 2 0.989406364
# Alcohol shows the largest node-purity increase, making it the strongest
# predictor of quality in this data set
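The measure_importance() warning above notes that mse_increase requires local importance; a sketch of regrowing the forest with localImp = TRUE, as the message suggests:
# Regrow the last forest with local importance so that measure_importance()
# can also report mse_increase
model_reg_imp <- randomForest(quality ~ ., ntree = 10, data = train_reg,
                              localImp = TRUE)
measure_importance(model_reg_imp)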
# Assigning categories for classification and changing them into factors
white_wine_class <- white_wine
white_wine_class$class <- ifelse(white_wine_class$quality > mean(white_wine$quality),
                                 "More-than Average", "Less-than Average")
white_wine_class$class <- as.factor(white_wine_class$class)
white_wine_class <- white_wine_class[, -12]   # drop the numeric quality column
anyNA(white_wine_class)
## [1] FALSE
white_wine_class_lm <- white_wine
white_wine_class_lm$class <- ifelse(white_wine_class_lm$quality > mean(white_wine$quality), 1, 0)
white_wine_class_lm <- white_wine_class_lm[, -12]   # drop the numeric quality column
anyNA(white_wine_class_lm)
## [1] FALSE
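Before modelling, it is worth checking how balanced the two classes are; a quick look (not in the original run):
# Distribution of the binary target
table(white_wine_class$class)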
# Checking which model is more effective
AUC_class <- c()
AUC_lm_class<-c()
n = 10
for (i in 1:n) {
  # Bootstrap samples; the out-of-bag rows serve as the test sets
  ind_class <- sample(nrow(white_wine_class), nrow(white_wine_class), replace = TRUE)
  train_class <- white_wine_class[ind_class, ]
  test_class <- white_wine_class[-ind_class, ]
  ind_class_lm <- sample(nrow(white_wine_class_lm), nrow(white_wine_class_lm), replace = TRUE)
  train_class_lm <- white_wine_class_lm[ind_class_lm, ]
  test_class_lm <- white_wine_class_lm[-ind_class_lm, ]
  model_class <- randomForest(class ~ ., ntree = 10, data = train_class)
  phat_class <- predict(model_class, test_class, type = "prob")
  pred_class <- prediction(phat_class[, 2], test_class$class)
  auc_ROCR1 <- performance(pred_class, measure = "auc")
  AUC_class[i] <- auc_ROCR1@y.values[[1]]
  model_lm_class <- lm(class ~ ., train_class_lm)
  phat_lm_class <- predict(model_lm_class, test_class_lm, type = "response")
  pred_lm_class <- prediction(phat_lm_class, test_class_lm$class)
  auc_ROCR2 <- performance(pred_lm_class, measure = "auc")
  AUC_lm_class[i] <- auc_ROCR2@y.values[[1]]
}
mean(AUC_class)
## [1] 0.8620797
sd(AUC_class)
## [1] 0.009849
mean(AUC_lm_class)
## [1] 0.7933796
sd(AUC_lm_class)
## [1] 0.006656446
# The random forest clearly beats the linear probability model on AUC
# (about 0.86 vs. 0.79), so we build its confusion table
yhat_class <- ifelse(phat_class > 0.5, 1, 0)
ct_class <- table(test_class$class, yhat_class[, 2])
# Helper that rearranges a raw `table(truth, prediction)` into the usual
# layout: rows are predictions (phat = 1 / phat = 0), columns are true labels
gct <- function(x) {
  conf_table <- matrix(0, 2, 2)
  conf_table[1, 1] <- ifelse(sum(dim(x)) > 3, x[2, 2], 0)  # true positives
  conf_table[2, 2] <- ifelse(sum(dim(x)) > 3, x[1, 1], 0)  # true negatives
  conf_table[1, 2] <- ifelse(sum(dim(x)) > 3, x[1, 2], 0)  # false positives
  conf_table[2, 1] <- ifelse(sum(dim(x)) > 3, x[2, 1], 0)  # false negatives
  colnames(conf_table) <- c("Y=1", "Y=0")
  rownames(conf_table) <- c("phat=1", "phat=0")
  conf_table
}
cont <- gct(ct_class)
cont
## Y=1 Y=0
## phat=1 1036 191
## phat=0 152 436
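From this layout the standard classification metrics follow directly; a short sketch using the counts above:
TP <- cont[1, 1]; FP <- cont[1, 2]   # predicted positive
FN <- cont[2, 1]; TN <- cont[2, 2]   # predicted negative
c(accuracy    = (TP + TN) / sum(cont),
  sensitivity = TP / (TP + FN),      # true-positive rate
  specificity = TN / (TN + FP))      # true-negative rate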
library(randomForestExplainer)
varImpPlot(model_class)
importance_frame <- measure_importance(model_class)
## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
importance_frame
## variable mean_min_depth no_of_nodes gini_decrease no_of_trees
## 1 alcohol 1.1 395 360.1959 10
## 2 chlorides 2.8 417 163.9199 10
## 3 citric.acid 2.7 453 166.4152 10
## 4 density 1.8 486 252.8606 10
## 5 fixed.acidity 3.7 435 142.7943 10
## 6 free.sulfur.dioxide 2.1 483 214.1847 10
## 7 pH 2.6 475 166.3128 10
## 8 residual.sugar 2.2 445 174.6393 10
## 9 sulphates 3.2 434 145.5809 10
## 10 total.sulfur.dioxide 3.6 459 145.8733 10
## 11 volatile.acidity 2.0 394 277.3969 10
## times_a_root p_value
## 1 3 0.99314521
## 2 2 0.90974914
## 3 0 0.32096542
## 4 3 0.01870164
## 5 0 0.66694749
## 6 0 0.02644681
## 7 0 0.06101242
## 8 0 0.47293167
## 9 0 0.68496508
## 10 0 0.22319612
## 11 2 0.99407175
# Alcohol again has the smallest mean minimal depth and the largest Gini
# decrease, so it is the strongest predictor for classification as well
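ROCR can also draw the full ROC curve behind these AUC numbers; a sketch for the last bootstrap iteration's classifier:
roc_class <- performance(pred_class, measure = "tpr", x.measure = "fpr")
plot(roc_class, colorize = TRUE)
abline(0, 1, lty = 2)   # diagonal = random guessing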
# Part 3: Distinguishing red from white wine
# Data pre-processing
white_wine$label <- 1   # 1 = white
red_wine$label <- 0     # 0 = red
wine <- rbind(white_wine, red_wine)
wine$label <- as.factor(wine$label)
anyNA(wine)
## [1] FALSE
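The combined data set is imbalanced (4,898 white vs. 1,599 red wines), which is worth keeping in mind when reading the confusion table later:
table(wine$label)   # 0 = red (1599), 1 = white (4898)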
str(wine)
## 'data.frame': 6497 obs. of 13 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
## $ label : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
wine_lm<-wine
wine_lm$label <- as.numeric(as.character(wine_lm$label))
anyNA(wine_lm)
## [1] FALSE
str(wine_lm)
## 'data.frame': 6497 obs. of 13 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
## $ label : num 1 1 1 1 1 1 1 1 1 1 ...
AUC_wine <- c()
AUC_lm_wine<-c()
n = 10
for (i in 1:n) {
  # Bootstrap samples; the out-of-bag rows serve as the test sets
  ind_wine <- sample(nrow(wine), nrow(wine), replace = TRUE)
  train_wine <- wine[ind_wine, ]
  test_wine <- wine[-ind_wine, ]
  ind_wine_lm <- sample(nrow(wine_lm), nrow(wine_lm), replace = TRUE)
  train_wine_lm <- wine_lm[ind_wine_lm, ]
  test_wine_lm <- wine_lm[-ind_wine_lm, ]
  model_wine <- randomForest(label ~ ., ntree = 10, data = train_wine)
  phat_wine <- predict(model_wine, test_wine, type = "prob")
  pred_wine <- prediction(phat_wine[, 2], test_wine$label)
  auc_ROCR1 <- performance(pred_wine, measure = "auc")
  AUC_wine[i] <- auc_ROCR1@y.values[[1]]
  model_lm_wine <- lm(label ~ ., train_wine_lm)
  phat_lm_wine <- predict(model_lm_wine, test_wine_lm, type = "response")
  pred_lm_wine <- prediction(phat_lm_wine, test_wine_lm$label)
  auc_ROCR2 <- performance(pred_lm_wine, measure = "auc")
  AUC_lm_wine[i] <- auc_ROCR2@y.values[[1]]
}
mean(AUC_wine)
## [1] 0.997189
sd(AUC_wine)
## [1] 0.00134241
mean(AUC_lm_wine)
## [1] 0.9955816
sd(AUC_lm_wine)
## [1] 0.0009960452
# Both models achieve near-perfect AUCs: red and white wines are almost
# perfectly separable from these physico-chemical features
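Note that lm() on a 0/1 outcome is a linear probability model; a logistic regression would be the more conventional parametric benchmark. A minimal sketch (glm() may warn about fitted probabilities of 0 or 1 on data this separable):
model_glm_wine <- glm(label ~ ., data = train_wine_lm, family = binomial)
phat_glm_wine <- predict(model_glm_wine, test_wine_lm, type = "response")
performance(prediction(phat_glm_wine, test_wine_lm$label), "auc")@y.values[[1]]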
#Creating confusion matrix
yhat_wine <- ifelse(phat_wine > 0.5, 1, 0)
# Build the confusion table for the red/white classifier
ct_wine <- table(test_wine$label, yhat_wine[, 2])
# Reuse the `gct` helper defined in Part 2
cont2 <- gct(ct_wine)
cont2
## Y=1 Y=0
## phat=1 1785 12
## phat=0 10 565
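Only 22 of the 2,372 out-of-bag wines are misclassified; the overall error rate can be read straight off the table:
(cont2[1, 2] + cont2[2, 1]) / sum(cont2)   # roughly 0.9% misclassified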
#Exploring important variables
library(randomForestExplainer)
varImpPlot(model_wine)
importance_frame <- measure_importance(model_wine)
## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
importance_frame
## variable mean_min_depth no_of_nodes gini_decrease no_of_trees
## 1 alcohol 3.8 56 16.584161 10
## 2 chlorides 1.9 112 501.646682 10
## 3 citric.acid 3.7 45 22.060794 10
## 4 density 2.7 80 119.866160 10
## 5 fixed.acidity 2.7 74 79.119521 10
## 6 free.sulfur.dioxide 3.6 42 61.803530 10
## 7 pH 3.9 62 45.225919 10
## 8 quality 5.6 18 6.188488 8
## 9 residual.sugar 2.8 62 113.751486 10
## 10 sulphates 2.4 67 93.278515 10
## 11 total.sulfur.dioxide 1.2 108 908.772461 10
## 12 volatile.acidity 1.2 65 467.494618 10
## times_a_root p_value
## 1 0 9.124527e-01
## 2 2 3.351274e-08
## 3 0 9.981025e-01
## 4 1 4.322513e-02
## 5 0 1.643212e-01
## 6 0 9.995711e-01
## 7 0 7.108857e-01
## 8 0 1.000000e+00
## 9 0 7.108857e-01
## 10 0 4.630260e-01
## 11 3 3.563545e-07
## 12 4 5.654755e-01
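Unlike Parts 1 and 2, alcohol plays almost no role here: total.sulfur.dioxide, chlorides, and volatile.acidity show the largest Gini decreases, so they drive the red/white separation. randomForestExplainer can visualise this ranking; a brief sketch:
# Minimal-depth distribution across trees for each variable
min_depth_frame <- min_depth_distribution(model_wine)
plot_min_depth_distribution(min_depth_frame)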