Assgn_06_ML.ipynb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
df = pd.read_csv("Cancer_data.csv")
print(df.head())
df.info()
   mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  diagnosis
0        17.99         10.38          122.80     1001.0          0.11840          0
1        20.57         17.77          132.90     1326.0          0.08474          0
2        19.69         21.25          130.00     1203.0          0.10960          0
3        11.42         20.38           77.58      386.1          0.14250          0
4        20.29         14.34          135.10     1297.0          0.10030          0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   mean_radius      564 non-null    float64
 1   mean_texture     562 non-null    float64
 2   mean_perimeter   563 non-null    float64
 3   mean_area        562 non-null    float64
 4   mean_smoothness  566 non-null    float64
 5   diagnosis        569 non-null    int64
dtypes: float64(5), int64(1)
memory usage: 26.8 KB
# Impute missing values with each column's mean
df.fillna(df.mean(), inplace=True)
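# Quick check (a sketch, not part of the original run): confirm the imputation
# left no missing values behind
print(df.isnull().sum())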
# Independent and dependent features
x = df.iloc[:, :-1]
y = df['diagnosis'] # Target variable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=2)
# Post-pruning => initially taking some parameters (a small max_depth to start)
treemodel = DecisionTreeClassifier(max_depth=2)
treemodel.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=2)
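# Sketch (assumes the same split above; names path/pruned_scores are hypothetical):
# fixing max_depth before fitting is strictly pre-pruning — scikit-learn's true
# post-pruning grows the full tree and then prunes it back via ccp_alpha.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(X_train, y_train)
pruned_scores = []
for alpha in path.ccp_alphas[:-1]:  # the largest alpha would prune the tree to its root
    t = DecisionTreeClassifier(ccp_alpha=alpha, random_state=0).fit(X_train, y_train)
    pruned_scores.append(t.score(X_test, y_test))  # a held-out validation split would be cleaner
print(max(pruned_scores))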
# Plot
from sklearn import tree
plt.figure(figsize=(10, 10))
tree.plot_tree(treemodel, filled=True)
[Figure: depth-2 decision tree.
 Root: x[3] <= 697.8, gini = 0.463, samples = 398, value = [145, 253].
 Left (True): x[2] <= 90.365, gini = 0.217, samples = 283, value = [35, 248];
   leaves: gini = 0.126 (236 samples, [16, 220]) and gini = 0.482 (47 samples, [19, 28]).
 Right (False): x[1] <= 16.575, gini = 0.083, samples = 115, value = [110, 5];
   leaves: gini = 0.496 (11 samples, [6, 5]) and gini = 0.0 (104 samples, [104, 0]).]
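# Sketch: passing feature names makes the splits above self-describing
# (x[1] = mean_texture, x[2] = mean_perimeter, x[3] = mean_area)
plt.figure(figsize=(10, 10))
tree.plot_tree(treemodel, filled=True, feature_names=list(x.columns))
plt.show()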
# Prediction
y_pred = treemodel.predict(X_test)
y_pred
array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1])
# Accuracy
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(y_test, y_pred)
print(score)
0.8771929824561403
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.91      0.76      0.83        67
           1       0.86      0.95      0.90       104

    accuracy                           0.88       171
   macro avg       0.89      0.86      0.87       171
weighted avg       0.88      0.88      0.87       171
# Pre-pruning using GridSearchCV
parameter = {'criterion': ['gini', 'entropy'], 'max_depth': [4, 5, 6, 7, 8, 9],
             'splitter': ['best', 'random'],
             'max_features': ['auto', 'sqrt', 'log2']}  # 'auto' is no longer valid in recent scikit-learn -> the failed fits below
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(treemodel, param_grid=parameter, cv=5, scoring='accuracy')
cv.fit(X_train, y_train)
/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py:528: FitFailedWarning:
120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
estimator._validate_params()
File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
validate_parameter_constraints(
File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of DecisionTreeClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_search.py:1108: UserWarning: One or more of the test scores are non-finite:
nan nan 0.89946203 0.8518038 0.87939873 0.90202532
nan nan 0.88446203 0.87943038 0.89443038 0.90449367
nan nan 0.88458861 0.87708861 0.88702532 0.86183544
nan nan 0.89199367 0.88446203 0.88949367 0.89433544
nan nan 0.85917722 0.91205696 0.87439873 0.88949367
nan nan 0.87949367 0.88202532 0.89202532 0.84702532
nan nan 0.86936709 0.87183544 0.90455696 0.86689873
nan nan 0.88949367 0.86677215 0.88449367 0.88449367
nan nan 0.89458861 0.91699367 0.87436709 0.89699367
nan nan 0.89202532 0.88446203 0.88705696 0.89696203
nan nan 0.86949367 0.90205696 0.87686709 0.90205696]
warnings.warn(
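# Sketch (hypothetical names clean_grid/cv_clean): the 120 failed fits come from
# max_features='auto', which recent scikit-learn rejects; dropping it yields a
# grid with no nan scores.
clean_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [4, 5, 6, 7, 8, 9],
              'splitter': ['best', 'random'], 'max_features': ['sqrt', 'log2', None]}
cv_clean = GridSearchCV(DecisionTreeClassifier(), param_grid=clean_grid,
                        cv=5, scoring='accuracy')
# cv_clean.fit(X_train, y_train)  # the outputs below still come from the original grid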
cv.best_params_
{'criterion': 'entropy',
'max_depth': 7,
'max_features': 'sqrt',
'splitter': 'random'}
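# Sketch: GridSearchCV also exposes the refit model and its mean CV accuracy
print(cv.best_score_)           # mean cross-validated accuracy of the best combination
best_tree = cv.best_estimator_  # already refit on all of X_train with best_params_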
y_pred = cv.predict(X_test)
y_pred
array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1])
score = accuracy_score(y_test, y_pred)
print(score)
0.8888888888888888
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.93      0.78      0.85        67
           1       0.87      0.96      0.91       104

    accuracy                           0.89       171
   macro avg       0.90      0.87      0.88       171
weighted avg       0.89      0.89      0.89       171
# Random Forest
from sklearn.ensemble import RandomForestClassifier
X = x.values # Convert to numpy array for Random Forest
y = y.values
# Split again for Random Forest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Create and fit Random Forest
rf = RandomForestClassifier(n_estimators=4, n_jobs=-1)
rf.fit(X_train, y_train)
RandomForestClassifier(n_estimators=4, n_jobs=-1)
# Check accuracy of Random Forest
rf.score(X_test, y_test)
0.9202127659574468
# Re-fit 10 times to see the score variance that comes from bootstrap randomness
for _ in range(10):
    rf = RandomForestClassifier(n_estimators=4, n_jobs=-1)
    rf.fit(X_train, y_train)
    print(rf.score(X_test, y_test))
0.898936170212766
0.9308510638297872
0.9202127659574468
0.8776595744680851
0.8936170212765957
0.8670212765957447
0.898936170212766
0.9095744680851063
0.8670212765957447
0.925531914893617
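# Sketch: the spread above comes from using only 4 trees; more estimators plus
# a fixed random_state give a stable (and usually higher) score
rf_stable = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_stable.fit(X_train, y_train)
print(rf_stable.score(X_test, y_test))  # value not recorded in the original run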
from sklearn.svm import SVC
svm_linear = SVC(kernel='linear')
svm_rbf = SVC(kernel='rbf')
# Kernel functions handle non-linear decision boundaries by implicitly
# mapping the data from a low-dimensional to a higher-dimensional space
svm_linear.fit(X_train, y_train)
svm_rbf.fit(X_train, y_train)
SVC()
# Accuracy score
y_pred_linear = svm_linear.predict(X_test)
y_pred_rbf = svm_rbf.predict(X_test)
accuracy_linear = accuracy_score(y_test, y_pred_linear)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
accuracy_linear, accuracy_rbf
(0.925531914893617, 0.8882978723404256)
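# Sketch: SVMs (the rbf kernel especially) are scale-sensitive, and these features
# span very different ranges (mean_area ~1000 vs mean_smoothness ~0.1); scaling
# inside a pipeline typically lifts the rbf score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svm_scaled = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
svm_scaled.fit(X_train, y_train)
print(accuracy_score(y_test, svm_scaled.predict(X_test)))  # value not recorded in the original run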