CSE3506 Essentials of Data Analytics
Name        : Shivam Batra
Reg. No.    : 19BPS1131
Lab Exercise: 5
Objective: Applying logistic regression to predict red wine.
Methods -
1.   Exploratory data analysis (EDA)
2.   Data preparation
3.   Modeling -> Logistic regression
4.   ROC Curve
STEPS:
#Importing the dataset
data <- read.csv('winequality-red.csv', sep = ';')
str(data)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity      : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid      : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides         : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density          : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH            : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates          : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol          : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality         : int 5 5 5 6 5 5 5 7 7 5 ...
#Format outcome variable
data$quality <- ifelse(data$quality >= 7, 1, 0)
data$quality <- factor(data$quality, levels = c(0, 1))
#Descriptive statistics
summary(data)
##   fixed.acidity volatile.acidity citric.acid residual.sugar
##   Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
##   1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
##   Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
##   Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
##   3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
##   Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
##     chlorides    free.sulfur.dioxide total.sulfur.dioxide density
##   Min. :0.01200 Min. : 1.00          Min. : 6.00      Min. :0.9901
##   1st Qu.:0.07000 1st Qu.: 7.00        1st Qu.: 22.00    1st Qu.:0.9956
##   Median :0.07900 Median :14.00           Median : 38.00     Median :0.9968
##   Mean :0.08747 Mean :15.87              Mean : 46.47     Mean :0.9967
##   3rd Qu.:0.09000 3rd Qu.:21.00          3rd Qu.: 62.00    3rd Qu.:0.9978
##   Max. :0.61100 Max. :72.00            Max. :289.00      Max. :1.0037
##       pH       sulphates      alcohol quality
##   Min. :2.740 Min. :0.3300 Min. : 8.40 0:1382
##   1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1: 217
##   Median :3.310 Median :0.6200 Median :10.20
##   Mean :3.311 Mean :0.6581 Mean :10.42
##   3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
##   Max. :4.010 Max. :2.0000 Max. :14.90
Univariate analysis
  #Dependent variable
    #Frequency plot
par(mfrow=c(1,1))
barplot(table(data[[12]]),
         main = sprintf('Frequency plot of the variable: %s',
                        colnames(data[12])),
         xlab = colnames(data[12]),
         ylab = 'Frequency')
#Check class BIAS
table(data$quality)
##
## 0 1
## 1382 217
round(prop.table((table(data$quality))),2)
##
## 0 1
## 0.86 0.14
#Independent variable
     #Boxplots
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
   boxplot(x = data[i],
            horizontal = TRUE,
            main = sprintf('Boxplot of the variable: %s',
                           colnames(data[i])),
            xlab = colnames(data[i]))
}
#Histograms
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
   hist(x = data[[i]],
        main = sprintf('Histogram of the variable: %s',
                     colnames(data[i])),
        xlab = colnames(data[i]))
}
                                                            Bivariate analysis
 #Correlation matrix
library(ggcorrplot)
## Loading required package: ggplot2
ggcorrplot(round(cor(data[-12]), 2),
           type = "lower",
           lab = TRUE,
           title =
             'Correlation matrix of the red wine quality dataset')
Data preparation
#Missing values
sum(is.na(data))
## [1] 0
#Outliers
   #Identifing outliers
is_outlier <- function(x) {
   return(x < quantile(x, 0.25) - 1.5 * IQR(x) |
                x > quantile(x, 0.75) + 1.5 * IQR(x))
}
outlier <- data.frame(variable = character(),
                           sum_outliers = integer(),
                           stringsAsFactors=FALSE)
for (j in 1:(length(data)-1)){
   variable <- colnames(data[j])
   for (i in data[j]){
      sum_outliers <- sum(is_outlier(i))
   }
   row <- data.frame(variable,sum_outliers)
   outlier <- rbind(outlier, row)
}
outlier
##          variable sum_outliers
## 1     fixed.acidity      49
## 2 volatile.acidity        19
## 3       citric.acid     1
## 4    residual.sugar       155
## 5        chlorides     112
## 6 free.sulfur.dioxide       30
## 7 total.sulfur.dioxide       55
## 8          density     45
## 9             pH      35
## 10        sulphates       59
## 11          alcohol     13
#Identifying the percentage of outliers
for (i in 1:nrow(outlier)){
   if (outlier[i,2]/nrow(data) * 100 >= 5){
      print(paste(outlier[i,1],
                      '=',
                      round(outlier[i,2]/nrow(data) * 100, digits = 2),
                      '%'))
   }
}
## [1] "residual.sugar = 9.69 %"
## [1] "chlorides = 7 %"
#Inputting outlier values
for (i in 4:5){
   for (j in 1:nrow(data)){
      if (data[[j, i]] > as.numeric(quantile(data[[i]], 0.75) +
                                      1.5 * IQR(data[[i]]))){
         if (i == 4){
            data[[j, i]] <- round(mean(data[[i]]), digits = 2)
         } else{
            data[[j, i]] <- round(mean(data[[i]]), digits = 3)
         }
      }
   }
}
Modeling
#Splitting the dataset into the Training set and Test set
  #Stratified sample
data_ones <- data[which(data$quality == 1), ]
data_zeros <- data[which(data$quality == 0), ]
#Train data
set.seed(123)
train_ones_rows <- sample(1:nrow(data_ones), 0.8*nrow(data_ones))
train_zeros_rows <- sample(1:nrow(data_zeros), 0.8*nrow(data_ones))
train_ones <- data_ones[train_ones_rows, ]
train_zeros <- data_zeros[train_zeros_rows, ]
train_set <- rbind(train_ones, train_zeros)
table(train_set$quality)
##
## 0 1
## 173 173
#Test Data
test_ones <- data_ones[-train_ones_rows, ]
test_zeros <- data_zeros[-train_zeros_rows, ]
test_set <- rbind(test_ones, test_zeros)
table(test_set$quality)
##
## 0 1
## 1209 44
Logistic regression
#Logistic Regression
lr = glm(formula = quality ~.,
            data = train_set,
            family = binomial)
#Predictions
prob_pred = predict(lr,
                       type = 'response',
                       newdata = test_set[-12])
library(InformationValue)
optCutOff <- optimalCutoff(test_set$quality, prob_pred)[1]
y_pred = ifelse(prob_pred > optCutOff, 1, 0)
#Making the confusion matrix
cm_lr = table(test_set[, 12], y_pred)
cm_lr
## y_pred
##   0 1
## 0 1207 2
## 1 43 1
#Accuracy
accuracy_lr = (cm_lr[1,1] + cm_lr[1,1])/
  (cm_lr[1,1] + cm_lr[1,1] + cm_lr[2,1] + cm_lr[1,2])
accuracy_lr
## [1] 0.9816999
#ROC curve
library(ROSE)
## Loaded ROSE 0.0-4
par(mfrow = c(1, 1))
roc.curve(test_set$quality, y_pred)
## Area under the curve (AUC): 0.511