# project3
#LOADING LIBRARY
library(readr)
library(readxl)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ROCR)
library(ineq)
library(StatMeasures)
library(rattle)
#LOADING DATA
# Read the customer records from the Excel workbook in the working directory.
# The data frame is deliberately not attach()-ed: its columns are renamed,
# converted and dropped further down, and an attached copy would not follow
# those changes while still masking other objects of the same name.
Customerdata <- read_excel("Thera Bank_Personal_Loan_Modelling-dataset-1.xlsx")
#Exploratory Data Analysis
#1 column name treatment
names(Customerdata)
## [1] "ID" "Age (in years)"
## [3] "Experience (in years)" "Income (in K/month)"
## [5] "ZIP Code" "Family members"
## [7] "CCAvg" "Education"
## [9] "Mortgage" "Personal Loan"
## [11] "Securities Account" "CD Account"
## [13] "Online" "CreditCard"
# Turn the raw spreadsheet headers into syntactically valid R names
# (spaces and brackets become dots, e.g. "Age (in years)" -> Age..in.years.).
names(Customerdata) <- make.names(names(Customerdata))
# 2 Data overview
head(Customerdata)
## # A tibble: 6 x 14
## ID Age..in.years. Experience..in.~ Income..in.K.mo~ ZIP.Code
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 25 1 49 91107
## 2 2 45 19 34 90089
## 3 3 39 15 11 94720
## 4 4 35 9 100 94112
## 5 5 35 8 45 91330
## 6 6 37 13 29 92121
## # ... with 9 more variables: Family.members <dbl>, CCAvg <dbl>,
## # Education <dbl>, Mortgage <dbl>, Personal.Loan <dbl>,
## # Securities.Account <dbl>, CD.Account <dbl>, Online <dbl>,
## # CreditCard <dbl>
summary(Customerdata)
## ID Age..in.years. Experience..in.years. Income..in.K.month.
## Min. : 1 Min. :23.00 Min. :-3.0 Min. : 8.00
## 1st Qu.:1251 1st Qu.:35.00 1st Qu.:10.0 1st Qu.: 39.00
## Median :2500 Median :45.00 Median :20.0 Median : 64.00
## Mean :2500 Mean :45.34 Mean :20.1 Mean : 73.77
## 3rd Qu.:3750 3rd Qu.:55.00 3rd Qu.:30.0 3rd Qu.: 98.00
## Max. :5000 Max. :67.00 Max. :43.0 Max. :224.00
##
## ZIP.Code Family.members CCAvg Education
## Min. : 9307 Min. :1.000 Min. : 0.000 Min. :1.000
## 1st Qu.:91911 1st Qu.:1.000 1st Qu.: 0.700 1st Qu.:1.000
## Median :93437 Median :2.000 Median : 1.500 Median :2.000
## Mean :93153 Mean :2.397 Mean : 1.938 Mean :1.881
## 3rd Qu.:94608 3rd Qu.:3.000 3rd Qu.: 2.500 3rd Qu.:3.000
## Max. :96651 Max. :4.000 Max. :10.000 Max. :3.000
## NA's :18
## Mortgage Personal.Loan Securities.Account CD.Account
## Min. : 0.0 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0 Median :0.000 Median :0.0000 Median :0.0000
## Mean : 56.5 Mean :0.096 Mean :0.1044 Mean :0.0604
## 3rd Qu.:101.0 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :635.0 Max. :1.000 Max. :1.0000 Max. :1.0000
##
## Online CreditCard
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :0.000
## Mean :0.5968 Mean :0.294
## 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.000
##
str(Customerdata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5000 obs. of 14 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age..in.years. : num 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience..in.years.: num 1 19 15 9 8 13 27 24 10 9 ...
## $ Income..in.K.month. : num 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : num 91107 90089 94720 94112 91330 ...
## $ Family.members : num 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : num 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account : num 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : num 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : num 0 0 0 0 1 0 0 1 0 0 ...
# Education and the 0/1 indicator columns are categories, not quantities, so
# they are converted to factors in one pass before modelling.
factor_cols <- c(
  "Education", "Personal.Loan", "Securities.Account",
  "CD.Account", "Online", "CreditCard"
)
Customerdata[factor_cols] <- lapply(Customerdata[factor_cols], as.factor)
str(Customerdata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5000 obs. of 14 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Age..in.years. : num 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience..in.years.: num 1 19 15 9 8 13 27 24 10 9 ...
## $ Income..in.K.month. : num 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : num 91107 90089 94720 94112 91330 ...
## $ Family.members : num 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : num 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ Securities.Account : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 2 1 2 1 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 1 ...
# Drop the row identifier and the ZIP code. The columns are removed by name
# rather than by position so that a changed column order in the workbook can
# never silently remove a different variable.
Customerdata <- Customerdata[, !(names(Customerdata) %in% c("ID", "ZIP.Code"))]
# 4 Checking the data set for missing values
library(mice)
library(VIM)
# is.na() dispatches to the data-frame method itself; the method is not meant
# to be called directly.
any(is.na(Customerdata))
## [1] TRUE
# Impute the missing entries with mice: 3 imputed data sets, fixed seed so the
# imputation is reproducible.
impute <- mice(Customerdata, m = 3, seed = 123)
##
## iter imp variable
## 1 1 Family.members
## 1 2 Family.members
## 1 3 Family.members
## 2 1 Family.members
## 2 2 Family.members
## 2 3 Family.members
## 3 1 Family.members
## 3 2 Family.members
## 3 3 Family.members
## 4 1 Family.members
## 4 2 Family.members
## 4 3 Family.members
## 5 1 Family.members
## 5 2 Family.members
## 5 3 Family.members
print(impute)
## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## Age..in.years. Experience..in.years. Income..in.K.month.
## "" "" ""
## Family.members CCAvg Education
## "pmm" "" ""
## Mortgage Personal.Loan Securities.Account
## "" "" ""
## CD.Account Online CreditCard
## "" "" ""
## PredictorMatrix:
## Age..in.years. Experience..in.years.
## Age..in.years. 0 1
## Experience..in.years. 1 0
## Income..in.K.month. 1 1
## Family.members 1 1
## CCAvg 1 1
## Education 1 1
## Income..in.K.month. Family.members CCAvg Education
## Age..in.years. 1 1 1 1
## Experience..in.years. 1 1 1 1
## Income..in.K.month. 0 1 1 1
## Family.members 1 0 1 1
## CCAvg 1 1 0 1
## Education 1 1 1 0
## Mortgage Personal.Loan Securities.Account CD.Account
## Age..in.years. 1 1 1 1
## Experience..in.years. 1 1 1 1
## Income..in.K.month. 1 1 1 1
## Family.members 1 1 1 1
## CCAvg 1 1 1 1
## Education 1 1 1 1
## Online CreditCard
## Age..in.years. 1 1
## Experience..in.years. 1 1
## Income..in.K.month. 1 1
## Family.members 1 1
## CCAvg 1 1
## Education 1 1
# Use the first of the imputed data sets as the working data and confirm that
# no missing values remain.
newdata <- complete(impute, 1)
any(is.na(newdata))
## [1] FALSE
# 3 Checking for outliers
boxplot(newdata)
# 5 Negative value treatment: negative experience values are replaced by
# their absolute value.
newdata[["Experience..in.years."]] <- abs(newdata[["Experience..in.years."]])
summary(newdata)
## Age..in.years. Experience..in.years. Income..in.K.month. Family.members
## Min. :23.00 Min. : 0.00 Min. : 8.00 Min. :1.000
## 1st Qu.:35.00 1st Qu.:10.00 1st Qu.: 39.00 1st Qu.:1.000
## Median :45.00 Median :20.00 Median : 64.00 Median :2.000
## Mean :45.34 Mean :20.13 Mean : 73.77 Mean :2.396
## 3rd Qu.:55.00 3rd Qu.:30.00 3rd Qu.: 98.00 3rd Qu.:3.000
## Max. :67.00 Max. :43.00 Max. :224.00 Max. :4.000
## CCAvg Education Mortgage Personal.Loan
## Min. : 0.000 1:2096 Min. : 0.0 0:4520
## 1st Qu.: 0.700 2:1403 1st Qu.: 0.0 1: 480
## Median : 1.500 3:1501 Median : 0.0
## Mean : 1.938 Mean : 56.5
## 3rd Qu.: 2.500 3rd Qu.:101.0
## Max. :10.000 Max. :635.0
## Securities.Account CD.Account Online CreditCard
## 0:4478 0:4698 0:2016 0:3530
## 1: 522 1: 302 1:2984 1:1470
##
##
##
##
#Decision Tree.
# Splitting data into train and test data.
# sample.split() expects the vector of class labels, not the data frame.
# Given the data frame it returns only one flag per column (12 here), which
# subset() then recycles over the rows: the split follows a fixed repeating
# row pattern instead of being random per row and stratified. Passing the
# response gives one flag per row and keeps the loan / no-loan ratio the same
# in both parts.
# NOTE: console output recorded further down was produced with the old split
# and will change when the script is re-run.
seed <- 2000
set.seed(seed)
sample <- sample.split(newdata$Personal.Loan, SplitRatio = 0.7)
train_data <- subset(newdata, sample == TRUE)
test_data <- subset(newdata, sample == FALSE)
nrow(train_data)
## [1] 3333
nrow(test_data)
## [1] 1667
#Checking Response variable
table(train_data$Personal.Loan)
##
## 0 1
## 3025 308
str(train_data)
## 'data.frame': 3333 obs. of 12 variables:
## $ Age..in.years. : num 25 45 39 35 37 34 65 29 48 59 ...
## $ Experience..in.years.: num 1 19 15 9 13 9 39 5 23 32 ...
## $ Income..in.K.month. : num 49 34 11 100 29 180 105 45 114 40 ...
## $ Family.members : num 4 3 1 1 4 1 4 3 2 4 ...
## $ CCAvg : num 1.6 1.5 1 2.7 0.4 8.9 2.4 0.1 3.8 2.5 ...
## $ Education : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 3 3 2 3 2 ...
## $ Mortgage : num 0 0 0 0 155 0 0 0 0 0 ...
## $ Personal.Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## $ Securities.Account : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 2 1 ...
## $ CD.Account : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Online : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 2 ...
## $ CreditCard : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# Separate working copies per model, so columns added for one model (e.g. the
# prediction columns written into DT later) do not end up in the other's data.
DT <- train_data # decision tree, training data
RF <- train_data # random forest, training data
DS <- test_data # decision tree, test data
RS <- test_data # random forest, test data
# Building the CART model: a deliberately large tree (cp = 0) with 10-fold
# cross-validation, to be inspected through its complexity table.
Model1 <- rpart(
  formula = Personal.Loan ~ .,
  data = DT,
  method = "class",
  cp = 0,
  minsplit = 100,
  minbucket = 10,
  xval = 10
)
fancyRpartPlot(Model1)
Model1
## n= 3333
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3333 308 0 (0.907590759 0.092409241)
## 2) Income..in.K.month.< 113.5 2671 56 0 (0.979034070 0.020965930)
## 4) CCAvg< 2.95 2475 11 0 (0.995555556 0.004444444) *
## 5) CCAvg>=2.95 196 45 0 (0.770408163 0.229591837)
## 10) CD.Account=0 178 32 0 (0.820224719 0.179775281) *
## 11) CD.Account=1 18 5 1 (0.277777778 0.722222222) *
## 3) Income..in.K.month.>=113.5 662 252 0 (0.619335347 0.380664653)
## 6) Education=1 449 50 0 (0.888641425 0.111358575)
## 12) Family.members< 2.5 399 0 0 (1.000000000 0.000000000) *
## 13) Family.members>=2.5 50 0 1 (0.000000000 1.000000000) *
## 7) Education=2,3 213 11 1 (0.051643192 0.948356808)
## 14) Income..in.K.month.< 116.5 18 7 0 (0.611111111 0.388888889) *
## 15) Income..in.K.month.>=116.5 195 0 1 (0.000000000 1.000000000) *
#Complexity parameter chart
printcp(Model1)
##
## Classification tree:
## rpart(formula = Personal.Loan ~ ., data = DT, method = "class",
## cp = 0, minsplit = 100, minbucket = 10, xval = 10)
##
## Variables actually used in tree construction:
## [1] CCAvg CD.Account Education
## [4] Family.members Income..in.K.month.
##
## Root node error: 308/3333 = 0.092409
##
## n= 3333
##
## CP nsplit rel error xerror xstd
## 1 0.310065 0 1.00000 1.00000 0.054284
## 2 0.162338 2 0.37987 0.48052 0.038612
## 3 0.012987 3 0.21753 0.24351 0.027800
## 4 0.000000 6 0.17857 0.22727 0.026878
# Cross-validated error against the complexity parameter for the full tree.
plotcp(Model1)
# Smaller tree: same formula and data, but a complexity penalty of 0.04 and
# at least 100 observations per leaf.
Pmodel <- rpart(
  formula = Personal.Loan ~ .,
  data = DT,
  method = "class",
  cp = 0.04,
  minsplit = 100,
  minbucket = 100,
  xval = 10
)
fancyRpartPlot(Pmodel)
Pmodel
## n= 3333
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3333 308 0 (0.90759076 0.09240924)
## 2) Income..in.K.month.< 113.5 2671 56 0 (0.97903407 0.02096593) *
## 3) Income..in.K.month.>=113.5 662 252 0 (0.61933535 0.38066465)
## 6) Education=1 449 50 0 (0.88864143 0.11135857) *
## 7) Education=2,3 213 11 1 (0.05164319 0.94835681) *
#Cart Model output Explanation
#The root node shows that about 91% of the training customers did not accept the
#personal loan (class 0) and about 9% accepted it (class 1).
#Monthly income is the first variable that is split on in the decision tree, hence it is
#the most important variable for building a strategy.
#The segment with monthly income >= 113.5 and Education = 2 or 3 holds about 6% of the
#training customers (213 of 3333); about 95% of them accepted the personal loan.
#In the unpruned tree, the segment with monthly income >= 113.5, Education = 1 and
#Family.members >= 2.5 holds about 2% of the training customers (50 of 3333); all of
#them accepted the personal loan.
#Prediction
# Score the training data with the pruned tree. predict() for rpart models
# takes the data through `newdata`; the former `data =` argument does not
# exist and was silently ignored.
DT$Prediction <- predict(Pmodel, newdata = DT, type = "class")
# Two-column matrix of class probabilities (column 2 = class "1").
DT$probability <- predict(Pmodel, newdata = DT, type = "prob")
tbl <- table(Actual = DT$Personal.Loan, prediction = DT$Prediction)
# Accuracy = correctly classified cases (diagonal) / all cases. It is read
# from the table itself; the previously hard-coded count of 252 true positives
# did not match the confusion matrix (202).
sum(diag(tbl)) / sum(tbl)
## [1] 0.9648965
Z <- Pmodel
# Accuracy of the above model is 96.5 %.
#PREDICTION USING SAME MODEL IN TEST DATA.
# Score the held-out test data with the same pruned tree and cross-tabulate
# actual against predicted classes.
DS$Predict.class <- predict(Pmodel, newdata = DS, type = "class")
tbl1 <- table(Actual.test = DS$Personal.Loan, prediction.test = DS$Predict.class)
tbl1
## prediction.test
## Actual.test 0 1
## 0 1489 6
## 1 47 125
# Test accuracy, computed from the table. The previously hard-coded count of
# 144 true positives did not match the confusion matrix (125).
sum(diag(tbl1)) / sum(tbl1)
## [1] 0.9682064
#Decision Tree Model Performance and Validation-Train Data.
#Confusion Matrix
# Confusion matrix of the pruned tree on the training data (`newdata` is the
# argument predict() actually reads; `data =` was silently ignored).
DT$Prediction <- predict(Pmodel, newdata = DT, type = "class")
tbl <- table(Actual = DT$Personal.Loan, prediction = DT$Prediction)
tbl
## prediction
## Actual 0 1
## 0 3014 11
## 1 106 202
# Every metric below is read from `tbl` (rows = actual, columns = predicted).
# The earlier hard-coded counts (252 true positives, 56 false negatives) did
# not match the matrix above (202 and 106).
#1)Accuracy Of Model
sum(diag(tbl)) / sum(tbl)
## [1] 0.9648965
#2)Classification error
(tbl["0", "1"] + tbl["1", "0"]) / sum(tbl)
## [1] 0.03510351
#3)Sensitivity (True Positive Rate)
tbl["1", "1"] / sum(tbl["1", ])
## [1] 0.6558442
#4)Specificity (True Negative Rate)
tbl["0", "0"] / sum(tbl["0", ])
## [1] 0.9963636
# ROC analysis on the training data, using the predicted probability of
# class "1" as the score.
pobj <- prediction(DT$probability[, 2], DT$Personal.Loan)
perf <- performance(pobj, "tpr", "fpr")
plot(perf, main = "ROC curve")
# KS statistic = largest gap between true-positive and false-positive rate.
# Both rates live in the performance object `perf` (y.values = tpr,
# x.values = fpr). The prediction object has no y.values, so reading them
# from `pobj` produced NULL and a KS of -Inf, as in the recorded output.
KS <- max(perf@y.values[[1]] - perf@x.values[[1]])
auc <- performance(pobj, "auc")
auc <- as.numeric(auc@y.values)
print(KS)
## [1] -Inf
auc
## [1] 0.8842803
# Gini coefficient of the predicted class-"1" probabilities. Only column 2 is
# used: DT$probability is a two-column matrix, and passing it whole mixed the
# class-"0" and class-"1" probabilities into one vector. The measure is named
# through `type`; the former positional "gini" landed in `parameter`.
# NOTE: the value recorded below was produced by the old call.
gini <- ineq(DT$probability[, 2], type = "Gini")
print(gini)
## [1] 0.4767402
#Decision Tree Model Performance and Validation-Test Data.
#Confusion Matrix
# Class and probability predictions of the pruned tree on the test data.
DS$Predict.class <- predict(Pmodel, newdata = DS, type = "class")
DS$probability1 <- predict(Pmodel, newdata = DS, type = "prob")
tbl1 <- table(Actual.test = DS$Personal.Loan, prediction.test = DS$Predict.class)
tbl1
## prediction.test
## Actual.test 0 1
## 0 1489 6
## 1 47 125
# Every metric below is read from `tbl1` (rows = actual, columns = predicted).
# The earlier hard-coded counts (144 true positives, 28 false negatives) did
# not match the matrix above (125 and 47).
#1)Accuracy Of Model
sum(diag(tbl1)) / sum(tbl1)
## [1] 0.9682064
#2)Classification error
(tbl1["0", "1"] + tbl1["1", "0"]) / sum(tbl1)
## [1] 0.03179364
#3)Sensitivity (True Positive Rate)
tbl1["1", "1"] / sum(tbl1["1", ])
## [1] 0.7267442
#4)Specificity (True Negative Rate)
tbl1["0", "0"] / sum(tbl1["0", ])
## [1] 0.9959866
# ROC curve on the test data, scored by the probability of class "1".
pobj1 <- prediction(DS$probability1[, 2], DS$Personal.Loan)
perf1 <- performance(pobj1, "tpr", "fpr")
plot(perf1, main = "ROC curve")
# Random forest
# Share of loan takers (class "1") in the training data.
print(mean(RF$Personal.Loan == 1))
# Forest of 501 trees, 3 candidate variables per split, terminal nodes of at
# least 10 observations, with variable importance recorded. The argument name
# `importance` had been broken across two lines, which does not parse.
rndforest <- randomForest(
  Personal.Loan ~ .,
  data = RF,
  ntree = 501,
  mtry = 3,
  nodesize = 10,
  importance = TRUE
)
rndforest
##
## Call:
## randomForest(formula = Personal.Loan ~ ., data = RF, ntree = 501,
## mtry = 3, nodesize = 10, importance = TRUE)
## Type of random forest: classification
## Number of trees: 501
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 1.56%
## Confusion matrix:
## 0 1 class.error
## 0 3018 7 0.00231405
## 1 45 263 0.14610390
print(rndforest$err.rate)
## OOB 0 1
## [1,] 0.02977667 0.020720721 0.1313131
## [2,] 0.02933738 0.015512465 0.1744186
## [3,] 0.03137570 0.017621145 0.1759259
## [4,] 0.03066378 0.015013829 0.1950207
## [5,] 0.02920443 0.014380531 0.1797753
## [6,] 0.02917602 0.012707377 0.1923077
## [7,] 0.02753442 0.012064805 0.1796610
## [8,] 0.02718567 0.011576439 0.1800000
## [9,] 0.02687023 0.010781671 0.1824104
## [10,] 0.02363636 0.008018710 0.1758958
# Error rates against the number of trees for the 501-tree forest.
plot(rndforest)
# Refit with 101 trees, all other settings unchanged (presumably chosen from
# the error plot above -- TODO confirm). The argument name `importance` had
# been broken across two lines, which does not parse.
rndforest <- randomForest(
  Personal.Loan ~ .,
  data = RF,
  ntree = 101,
  mtry = 3,
  nodesize = 10,
  importance = TRUE
)
print(rndforest$err.rate)
## OOB 0 1
## [1,] 0.04358553 0.019090909 0.2758621
## [2,] 0.03241895 0.012700166 0.2164948
## [3,] 0.02891566 0.011968085 0.1923077
## [4,] 0.03797922 0.014624506 0.2643678
## [5,] 0.03268846 0.011037528 0.2428571
## [6,] 0.03160920 0.009513742 0.2448980
## [7,] 0.03031250 0.010344828 0.2233333
## [8,] 0.02709360 0.010522743 0.1887417
## [9,] 0.02440513 0.008739496 0.1782178
## [10,] 0.02605271 0.008344459 0.2000000
plot(rndforest)
# Tuning mtry
set.seed(seed)
# The predictor matrix must not contain the response: with Personal.Loan left
# inside `x` the forest predicts the target from itself, which is why the
# recorded OOB error was 0%.
# stepFactor must be greater than 1 for a search to happen: mtry is divided
# and multiplied by it at each step, so a factor of 1 never changes mtry and
# the search stops immediately in both directions.
X <- tuneRF(
  x = RF[, names(RF) != "Personal.Loan"],
  y = RF$Personal.Loan,
  mtryStart = 3,
  stepFactor = 1.5,
  ntreeTry = 500,
  trace = TRUE,
  plot = TRUE,
  doBest = TRUE,
  nodesize = 5,
  importance = TRUE
)
## mtry = 3 OOB error = 0%
## Searching left ...
## Searching right ...