PART 1: Downloading data and pre-processing

# Load necessary libraries
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.2.3

Extracting the data and saving it to a data frame

# Setting the working directory to the location of the CSV files
setwd("D:/Sourabh")

# The UCI wine-quality files are semicolon-separated
white_wine <- read.csv("winequality-white.csv", sep = ";")
red_wine <- read.csv("winequality-red.csv", sep = ";")
# Check if there are any missing values
anyNA(white_wine)
## [1] FALSE
anyNA(red_wine)
## [1] FALSE
# Checking whether any variable is stored as character
str(white_wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
str(red_wine)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

Neither data frame has any missing values, and every variable is numeric or integer, so the data is clean.

# Copying the data frame for the regression task
white_wine_reg <- white_wine

PART 2: Predicting the wine quality

Random forests also estimate which variables matter for the prediction. Because each split considers only a random sample of the features, the forest grows a more diverse set of trees; this reduces the correlation between trees relative to plain bagging and often dramatically increases predictive power.
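The size of that per-split feature sample is controlled by randomForest's mtry argument (the regression default is p/3). A minimal sketch, only to illustrate the knob; it is not part of the analysis below:

p <- ncol(white_wine) - 1                     # number of predictors
rf_sketch <- randomForest(quality ~ ., data = white_wine,
                          ntree = 100,
                          mtry = floor(p / 3))  # features tried at each split
rf_sketch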

# Setting up bootstrap train/test splits for the random forest and the linear
# model, then comparing the mean RMSPE of each.
# The linear model serves as the parametric benchmark; the random forest is
# the non-parametric alternative.
RMSPE_reg <- c()
RMSPE_lm_reg<-c()
n = 10
for (i in 1:n) {
  # Bootstrap sample: rows drawn with replacement form the training set;
  # never-drawn (out-of-bag) rows form the test set
  ind_reg <- sample(nrow(white_wine_reg), nrow(white_wine_reg), replace = TRUE)
  train_reg <- white_wine_reg[ind_reg, ]
  test_reg <- white_wine_reg[-ind_reg, ]

  model_reg <- randomForest(quality ~ ., ntree = 10, data = train_reg)
  yhat_reg <- predict(model_reg, test_reg)
  RMSPE_reg[i] <- sqrt(mean((test_reg$quality - yhat_reg)^2))

  model_lm_reg <- lm(quality ~ ., train_reg)
  yhat_lm_reg <- predict(model_lm_reg, test_reg)
  RMSPE_lm_reg[i] <- sqrt(mean((test_reg$quality - yhat_lm_reg)^2))
}

mean(RMSPE_reg)
## [1] 0.6560953
sd(RMSPE_reg)
## [1] 0.01466302
mean(RMSPE_lm_reg)
## [1] 0.7558754
sd(RMSPE_lm_reg)
## [1] 0.009927987

The mean RMSPE of the random forest (0.656) is lower than that of the linear model (0.756), so the random forest is the more suitable model for predicting quality.
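To check that this gap is larger than the bootstrap noise, the two RMSPE vectors can be compared pairwise, since each replication used the same split for both models. A sketch, not run in the original analysis:

# Paired comparison of the per-replication RMSPEs
t.test(RMSPE_reg, RMSPE_lm_reg, paired = TRUE)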

# Exploring important variables in the data
library(randomForestExplainer)
## Warning: package 'randomForestExplainer' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
varImpPlot(model_reg)

importance_frame <- measure_importance(model_reg)
## [1] "Warning: your forest does not contain information on local importance so 'mse_increase' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
importance_frame
##                variable mean_min_depth no_of_nodes node_purity_increase
## 1               alcohol            1.8        1146             547.8835
## 2             chlorides            1.6        1155             375.7166
## 3           citric.acid            2.5        1193             243.2070
## 4               density            1.6        1287             534.4405
## 5         fixed.acidity            2.6        1085             232.6449
## 6   free.sulfur.dioxide            2.2        1232             363.0204
## 7                    pH            3.9        1234             244.9899
## 8        residual.sugar            2.8        1256             307.1883
## 9             sulphates            3.5        1214             228.5781
## 10 total.sulfur.dioxide            3.5        1294             260.8903
## 11     volatile.acidity            2.3        1127             412.6473
##    no_of_trees times_a_root     p_value
## 1           10            0 0.957289056
## 2           10            5 0.925654604
## 3           10            0 0.612692173
## 4           10            3 0.005681307
## 5           10            0 0.999845391
## 6           10            0 0.186597699
## 7           10            0 0.170916171
## 8           10            0 0.053791156
## 9           10            0 0.363636651
## 10          10            0 0.003084565
## 11          10            2 0.989406364

Alcohol shows the largest node-purity increase in the importance table above, so it is the strongest predictor of quality in this dataset.
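To see how alcohol relates to predicted quality, randomForest's partialPlot() draws its partial dependence. A sketch using the model from the last bootstrap replication:

# Partial dependence of predicted quality on alcohol
partialPlot(model_reg, pred.data = train_reg, x.var = "alcohol",
            main = "Partial dependence of quality on alcohol")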

# Assigning categories for classification and converting the label to a factor
white_wine_class <- white_wine
white_wine_class$class <- ifelse(white_wine_class$quality > mean(white_wine$quality),
                                 'More-than Average', 'Less-than Average')
white_wine_class$class <- as.factor(white_wine_class$class)
# Drop the original quality column (column 12) so it cannot leak into the model
white_wine_class <- white_wine_class[, -12]
anyNA(white_wine_class)
## [1] FALSE

# Numeric 0/1 version of the same label for the linear probability model
white_wine_class_lm <- white_wine
white_wine_class_lm$class <- ifelse(white_wine_class_lm$quality > mean(white_wine$quality), 1, 0)
white_wine_class_lm <- white_wine_class_lm[, -12]
anyNA(white_wine_class_lm)
## [1] FALSE
# Checking which model is more effective
AUC_class <- c()
AUC_lm_class<-c()
n = 10
for (i in 1:n) {
  # Bootstrap split for the random forest (factor label)
  ind_class <- sample(nrow(white_wine_class), nrow(white_wine_class), replace = TRUE)
  train_class <- white_wine_class[ind_class, ]
  test_class <- white_wine_class[-ind_class, ]

  # Independent bootstrap split for the linear probability model (numeric label)
  ind_class_lm <- sample(nrow(white_wine_class_lm), nrow(white_wine_class_lm), replace = TRUE)
  train_class_lm <- white_wine_class_lm[ind_class_lm, ]
  test_class_lm <- white_wine_class_lm[-ind_class_lm, ]

  model_class <- randomForest(class ~ ., ntree = 10, data = train_class)
  phat_class <- predict(model_class, test_class, type = "prob")

  pred_class <- prediction(phat_class[, 2], test_class$class)
  auc_ROCR1 <- performance(pred_class, measure = "auc")
  AUC_class[i] <- auc_ROCR1@y.values[[1]]

  # Linear probability model: lm() on the 0/1 label
  model_lm_class <- lm(class ~ ., train_class_lm)
  phat_lm_class <- predict(model_lm_class, test_class_lm, type = "response")

  pred_lm_class <- prediction(phat_lm_class, test_class_lm$class)
  auc_ROCR2 <- performance(pred_lm_class, measure = "auc")
  AUC_lm_class[i] <- auc_ROCR2@y.values[[1]]
}

mean(AUC_class)
## [1] 0.8620797
sd(AUC_class) 
## [1] 0.009849
mean(AUC_lm_class)
## [1] 0.7933796
sd(AUC_lm_class)
## [1] 0.006656446

Comparing the AUCs, the random forest (mean 0.862) clearly outperforms the linear probability model (mean 0.793) on this classification task.
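The AUC summarizes the whole ROC curve; with the ROCR objects already in hand, the curve itself is one call away. A sketch based on the last bootstrap replication:

# ROC curve for the final random-forest replication
perf_class <- performance(pred_class, measure = "tpr", x.measure = "fpr")
plot(perf_class, main = "ROC: random forest classifier")
abline(a = 0, b = 1, lty = 2)  # diagonal = chance performance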

# Classify as 1 when the predicted probability exceeds 0.5
yhat_class <- ifelse(phat_class > 0.5, 1, 0)

# Build the confusion table
ct_class <- table(test_class$class, yhat_class[, 2])

# This function takes `ct` and rearranges it into a labelled confusion table
gct <- function(x) {
  conf_table <- matrix(0, 2, 2)
  # Only fill in counts when both classes appear in the raw table (2 x 2)
  conf_table[1, 1] <- ifelse(sum(dim(x)) > 3, x[2, 2], 0)
  conf_table[2, 2] <- ifelse(sum(dim(x)) > 3, x[1, 1], 0)
  conf_table[1, 2] <- ifelse(sum(dim(x)) > 3, x[2, 1], 0)
  conf_table[2, 1] <- ifelse(sum(dim(x)) > 3, x[1, 2], 0)
  colnames(conf_table) <- c("Y=1", "Y=0")
  rownames(conf_table) <- c("phat=1", "phat=0")
  conf_table
}

cont <- gct(ct_class)
cont
##         Y=1 Y=0
## phat=1 1036 191
## phat=0  152 436
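From this table the usual classification metrics follow directly. A sketch using the `cont` table above:

# Accuracy, sensitivity and specificity from the relabelled confusion table
TP <- cont[1, 1]; FP <- cont[1, 2]
FN <- cont[2, 1]; TN <- cont[2, 2]
c(accuracy    = (TP + TN) / sum(cont),
  sensitivity = TP / (TP + FN),
  specificity = TN / (TN + FP))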
# Exploring important variables for the classification model
varImpPlot(model_class)

importance_frame <- measure_importance(model_class)
## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
importance_frame
##                variable mean_min_depth no_of_nodes gini_decrease no_of_trees
## 1               alcohol            1.1         395      360.1959          10
## 2             chlorides            2.8         417      163.9199          10
## 3           citric.acid            2.7         453      166.4152          10
## 4               density            1.8         486      252.8606          10
## 5         fixed.acidity            3.7         435      142.7943          10
## 6   free.sulfur.dioxide            2.1         483      214.1847          10
## 7                    pH            2.6         475      166.3128          10
## 8        residual.sugar            2.2         445      174.6393          10
## 9             sulphates            3.2         434      145.5809          10
## 10 total.sulfur.dioxide            3.6         459      145.8733          10
## 11     volatile.acidity            2.0         394      277.3969          10
##    times_a_root    p_value
## 1             3 0.99314521
## 2             2 0.90974914
## 3             0 0.32096542
## 4             3 0.01870164
## 5             0 0.66694749
## 6             0 0.02644681
## 7             0 0.06101242
## 8             0 0.47293167
## 9             0 0.68496508
## 10            0 0.22319612
## 11            2 0.99407175

Alcohol is again the strongest predictor: it has the lowest mean minimal depth (1.1) and the largest Gini decrease (360.2) in the table above.
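randomForestExplainer can also show where each variable first appears in the trees. A sketch using its min_depth_distribution(); variables that tend to split near the root matter most:

# Distribution of minimal depth per variable across the 10 trees
md_frame <- min_depth_distribution(model_class)
plot_min_depth_distribution(md_frame)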

PART 3: Combining the datasets and pre-processing

# Label white wines 1 and red wines 0, then stack the two data frames
white_wine$label <- 1
red_wine$label <- 0
wine <- rbind(white_wine, red_wine)
wine$label <- as.factor(wine$label)
anyNA(wine)
## [1] FALSE
str(wine)
## 'data.frame':    6497 obs. of  13 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ label               : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Numeric 0/1 copy of the label for the linear probability model
wine_lm <- wine
wine_lm$label <- as.numeric(as.character(wine_lm$label))
anyNA(wine_lm)
## [1] FALSE
str(wine_lm)
## 'data.frame':    6497 obs. of  13 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ label               : num  1 1 1 1 1 1 1 1 1 1 ...

Checking the predictions

AUC_wine <- c()
AUC_lm_wine<-c()
n = 10
for (i in 1:n) {
  # Bootstrap split for the random forest (factor label)
  ind_wine <- sample(nrow(wine), nrow(wine), replace = TRUE)
  train_wine <- wine[ind_wine, ]
  test_wine <- wine[-ind_wine, ]

  # Independent bootstrap split for the linear probability model (numeric label)
  ind_wine_lm <- sample(nrow(wine_lm), nrow(wine_lm), replace = TRUE)
  train_wine_lm <- wine_lm[ind_wine_lm, ]
  test_wine_lm <- wine_lm[-ind_wine_lm, ]

  model_wine <- randomForest(label ~ ., ntree = 10, data = train_wine)
  phat_wine <- predict(model_wine, test_wine, type = "prob")

  pred_wine <- prediction(phat_wine[, 2], test_wine$label)
  auc_ROCR1 <- performance(pred_wine, measure = "auc")
  AUC_wine[i] <- auc_ROCR1@y.values[[1]]

  model_lm_wine <- lm(label ~ ., train_wine_lm)
  phat_lm_wine <- predict(model_lm_wine, test_wine_lm, type = "response")

  pred_lm_wine <- prediction(phat_lm_wine, test_wine_lm$label)
  auc_ROCR2 <- performance(pred_lm_wine, measure = "auc")
  AUC_lm_wine[i] <- auc_ROCR2@y.values[[1]]
}

mean(AUC_wine)
## [1] 0.997189
sd(AUC_wine) 
## [1] 0.00134241
mean(AUC_lm_wine)
## [1] 0.9955816
sd(AUC_lm_wine)
## [1] 0.0009960452

Both models achieve nearly identical predictive performance here, with mean AUCs above 0.99.

Creating the confusion matrix

# Classify as white (1) when the predicted probability exceeds 0.5
yhat_wine <- ifelse(phat_wine > 0.5, 1, 0)

# Build the confusion table
ct_wine <- table(test_wine$label, yhat_wine[, 2])

# Reusing the gct() helper defined in Part 2

cont2 <- gct(ct_wine)
cont2
##         Y=1 Y=0
## phat=1 1785  12
## phat=0   10 565

Exploring important variables

varImpPlot(model_wine)

importance_frame <- measure_importance(model_wine)
## [1] "Warning: your forest does not contain information on local importance so 'accuracy_decrease' measure cannot be extracted. To add it regrow the forest with the option localImp = TRUE and run this function again."
importance_frame
##                variable mean_min_depth no_of_nodes gini_decrease no_of_trees
## 1               alcohol            3.8          56     16.584161          10
## 2             chlorides            1.9         112    501.646682          10
## 3           citric.acid            3.7          45     22.060794          10
## 4               density            2.7          80    119.866160          10
## 5         fixed.acidity            2.7          74     79.119521          10
## 6   free.sulfur.dioxide            3.6          42     61.803530          10
## 7                    pH            3.9          62     45.225919          10
## 8               quality            5.6          18      6.188488           8
## 9        residual.sugar            2.8          62    113.751486          10
## 10            sulphates            2.4          67     93.278515          10
## 11 total.sulfur.dioxide            1.2         108    908.772461          10
## 12     volatile.acidity            1.2          65    467.494618          10
##    times_a_root      p_value
## 1             0 9.124527e-01
## 2             2 3.351274e-08
## 3             0 9.981025e-01
## 4             1 4.322513e-02
## 5             0 1.643212e-01
## 6             0 9.995711e-01
## 7             0 7.108857e-01
## 8             0 1.000000e+00
## 9             0 7.108857e-01
## 10            0 4.630260e-01
## 11            3 3.563545e-07
## 12            4 5.654755e-01
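As the measure_importance() warnings note, the permutation-based importance measures are only available when the forest is grown with localImp = TRUE. A sketch of the regrown forest, not run here:

# Regrow the forest with local importance enabled, as the warning recommends
model_wine_imp <- randomForest(label ~ ., ntree = 10, data = train_wine,
                               localImp = TRUE)
measure_importance(model_wine_imp)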