11/10/2024, 23:20 ml3.ipynb - Colab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Churn_Modelling.csv')
df.head(10)
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 1 15634602 Hargrave 619 France Female 42 2 0.00 1 1 1
1 2 15647311 Hill 608 Spain Female 41 1 83807.86 1 0 1
2 3 15619304 Onio 502 France Female 42 8 159660.80 3 1 0
3 4 15701354 Boni 699 France Female 39 1 0.00 2 0 0
4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82 1 1 1
5 6 15574012 Chu 645 Spain Male 44 8 113755.78 2 1 0
6 7 15592531 Bartlett 822 France Male 50 7 0.00 2 1 1
7 8 15656148 Obinna 376 Germany Female 29 4 115046.74 4 1 0
8 9 15792365 He 501 France Male 44 4 142051.0 2 0 1
df.isnull().sum()
0
RowNumber 0
CustomerId 0
Surname 0
CreditScore 0
Geography 0
Gender 0
Age 0
Tenure 0
Balance 0
NumOfProducts 0
HasCrCard 0
IsActiveMember 0
EstimatedSalary 0
Exited 0
df.describe().T
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 1/
11/10/2024, ml3.ipynb -
23:20 Colab
count mean std min 25% 50% 75% max
RowNumber 10000.0 5.000500e+03 2886.895680 1.00 2500.75 5.000500e+03 7.500250e+03 10000.00
CustomerId 10000.0 1.569094e+07 71936.186123 15565701.00 15628528.25 1.569074e+07 1.575323e+07 15815690.00
CreditScore 10000.0 6.505288e+02 96.653299 350.00 584.00 6.520000e+02 7.180000e+02 850.00
Age 10000.0 3.892180e+01 10.487806 18.00 32.00 3.700000e+01 4.400000e+01 92.00
Tenure 10000.0 5.012800e+00 2.892174 0.00 3.00 5.000000e+00 7.000000e+00 10.00
Balance 10000.0 7.648589e+04 62397.405202 0.00 0.00 9.719854e+04 1.276442e+05 250898.09
NumOfProducts 10000.0 1.530200e+00 0.581654 1.00 1.00 1.000000e+00 2.000000e+00 4.00
HasCrCard 10000.0 7.055000e-01 0.455840 0.00 0.00 1.000000e+00 1.000000e+00 1.00
IsActiveMember 10000.0 5.151000e-01 0.499797 0.00 0.00 1.000000e+00 1.000000e+00 1.00
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
# Column Non-Null Count Dtype
0 RowNumber 10000 non-null int64
1 CustomerId 10000 non-null int64
2 Surname 10000 non-null object
3 CreditScore 10000 non-null int64
4 Geography 10000 non-null object
5 Gender 10000 non-null object
6 Age 10000 non-null int64
7 Tenure 10000 non-null int64
8 Balance 10000 non-null float64
9 NumOfProducts 10000 non-null int64
10 HasCrCard 10000 non-null int64
11 IsActiveMember 10000 non-null int64
12 EstimatedSalary 10000 non-null float64
13 Exited 10000 non-null int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
df[df['Balance']==0.0]
RowNumber CustomerI Surname CreditScor Geography Gender Age Tenure Balance NumOfProduct HasCrCard IsActiveM
d e s e
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 2/
0 1 15634602 Hargrave 619 France Female 42 2 0.0 1 1
3617 rows × 14 columns
11/10/2024, ml3.ipynb -
23:20 Colab
sns.barplot(x='Exited',y='Balance',data=df)
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 3/
11/10/2024, ml3.ipynb -
23:20 Colab
<Axes: xlabel='Exited', ylabel='Balance'>
b_zero=df[df['Balance']==0.0]
b_zero.head()
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 1 15634602 Hargrave 619 France Female 42 2 0.0 1 1 1
3 4 15701354 Boni 699 France Female 39 1 0.0 2 0 0
6 7 15592531 Bartlett 822 France Male 50 7 0.0 2 1 1
11 12 15737173 Andrews 497 Spain Male 24 3 0.0 2 1 0
12 13 15632264 Kay 476 France Female 34 10 0.0 2 1 0
df['Geography'].unique()
array(['France', 'Spain', 'Germany'], dtype=object)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Geography'] = le.fit_transform(df['Geography'])
df.head(10)
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 1 15634602 Hargrave 619 0 Female 42 2 0.00 1 1 1
1 2 15647311 Hill 608 2 Female 41 1 83807.86 1 0 1
2 3 15619304 Onio 502 0 Female 42 8 159660.80 3 1 0
3 4 15701354 Boni 699 0 Female 39 1 0.00 2 0 0
4 5 15737888 Mitchell 850 2 Female 43 2 125510.82 1 1 1
5 6 15574012 Chu 645 2 Male 44 8 113755.78 2 1 0
6 7 15592531 Bartlett 822 0 Male 50 7 0.00 2 1 1
7 8 15656148 Obinna 376 1 Female 29 4 115046.74 4 1 0
8 9 15792365 He 501 0 Male 44 4 142051.0 2 0 1
df['Gender'] = le.fit_transform(df['Gender'])
df
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 4/
11/10/2024, ml3.ipynb -
23:20 Colab
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember
0 1 15634602 Hargrave 619 0 0 42 2 0.00 1 1
1 2 15647311 Hill 608 2 0 41 1 83807.86 1 0
2 3 15619304 Onio 502 0 0 42 8 159660.80 3 1
3 4 15701354 Boni 699 0 0 39 1 0.00 2 0
4 5 15737888 Mitchell 850 2 0 43 2 125510.82 1 1
... ... ... ... ... ... ... ... ... ... ... ...
9995 9996 15606229 Obijiaku 771 0 1 39 5 0.00 2 1
9996 9997 15569892 Johnstone 516 0 1 35 10 57369.61 1 1
9997 9998 15584532 Liu 709 0 0 36 7 0.00 1 0
9998 9999 15682355 Sabbatini 772 1 1 42 3 75075.31 2 1
10000 rows × 14 columns
df.drop(['RowNumber','CustomerId','Surname'],axis=1,inplace=True)
df
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 619 0 0 42 2 0.00 1 1 1 101348.88 1
1 608 2 0 41 1 83807.86 1 0 1 112542.58 0
2 502 0 0 42 8 159660.80 3 1 0 113931.57 1
3 699 0 0 39 1 0.00 2 0 0 93826.63 0
4 850 2 0 43 2 125510.82 1 1 1 79084.10 0
... ... ... ... ... ... ... ... ... ... ... ...
9995 771 0 1 39 5 0.00 2 1 0 96270.64 0
9996 516 0 1 35 10 57369.61 1 1 1 101699.77 0
9997 709 0 0 36 7 0.00 1 0 1 42085.58 1
9998 772 1 1 42 3 75075.31 2 1 0 92888.52 1
9999 792 0 0 28 4 130142.79 1 1 0 38190.78 0
10000 rows × 11 columns
df[(df['Balance']==0) & (df['Exited']==0) & (df['IsActiveMember']==0)]
CreditScore Geography Gender Age Tenure Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
3 699 0 0 39 1 0.0 2 0 0 93826.63 0
11 497 2 1 24 3 0.0 2 1 0 76390.01 0
12 476 0 0 34 10 0.0 2 1 0 26260.98 0
1424 rows × 11 columns
df.describe().T
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 5/
11/10/2024, ml3.ipynb -
23:20 Colab
count mean std min 25% 50% 75% max
CreditScore 10000.0 650.528800 96.653299 350.00 584.00 652.000 718.0000 850.00
Geography 10000.0 0.746300 0.827529 0.00 0.00 0.000 1.0000 2.00
Gender 10000.0 0.545700 0.497932 0.00 0.00 1.000 1.0000 1.00
Age 10000.0 38.921800 10.487806 18.00 32.00 37.000 44.0000 92.00
Tenure 10000.0 5.012800 2.892174 0.00 3.00 5.000 7.0000 10.00
Balance 10000.0 76485.889288 62397.405202 0.00 0.00 97198.540 127644.2400 250898.09
NumOfProducts 10000.0 1.530200 0.581654 1.00 1.00 1.000 2.0000 4.00
HasCrCard 10000.0 0.705500 0.455840 0.00 0.00 1.000 1.0000 1.00
IsActiveMember 10000.0 0.515100 0.499797 0.00 0.00 1.000 1.0000 1.00
EstimatedSalary 10000.0 100090.239881 57510.492818 11.58 51002.11 100193.915 149388.2475 199992.48
Exited 10000.0 0.203700 0.402769 0.00 0.00 0.000 0.0000 1.00
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df.corr(),annot=True)
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 6/
11/10/2024, ml3.ipynb -
23:20 Colab
<Axes: >
sns.pairplot(df)
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 7/
11/10/2024, ml3.ipynb -
23:20 Colab
<seaborn.axisgrid.PairGrid at 0x7b32832bf8b0>
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 8/
11/10/2024, ml3.ipynb -
23:20 Colab
sns.histplot(df['CreditScore'])
<Axes: xlabel='CreditScore', ylabel='Count'>
sns.histplot(df['Geography'])
<Axes: xlabel='Geography', ylabel='Count'>
sns.histplot(df['Balance'])
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 9/
11/10/2024, ml3.ipynb -
23:20 Colab
<Axes: xlabel='Balance', ylabel='Count'>
sns.histplot(df['EstimatedSalary'])
<Axes: xlabel='EstimatedSalary', ylabel='Count'>
sns.histplot(df['NumOfProducts'])
<Axes: xlabel='NumOfProducts', ylabel='Count'>
from sklearn.model_selection import train_test_split
x = df.drop('Exited',axis=1)
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 1
11/10/2024, ml3.ipynb -
y = df['Exited']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train,y_train)
▾ GaussianNB i
GaussianNB()
y_pred = nb.predict(x_test)
y_pred
array([0, 0, 0, ..., 0, 0, 0])
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(classification_report(y_test,y_pred))
precision recall f1-score support
0 0.81 0.97 0.88 1607
1 0.33 0.07 0.11 393
accuracy 0.79 2000
macro avg 0.57 0.52 0.50 2000
weighted avg 0.72 0.79 0.73 2000
accuracy_score(y_test,y_pred)
0.79
cm = confusion_matrix(y_test,y_pred)
cm
array([[1553, 54],
[ 366, 27]])
sns.heatmap(cm,annot=True,fmt='d')
<Axes: >
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
▾ DecisionTreeClassifier i
DecisionTreeClassifier()
https://colab.research.google.com/drive/1Lr6payo0UxiyJ7Is_FNsk2D2LDxbwPX- 10