In [1]: import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_breast_cancer
cancer_data = load_breast_cancer()
In [2]: list(cancer_data.target_names)
Out[2]: ['malignant', 'benign']
In [3]: cancer_data.feature_names
Out[3]: array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension'], dtype='<U23')
In [4]: cancer_df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
cancer_df.head()
Out[4]:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  mean fractal dimension  ...  worst radius  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
5 rows × 30 columns
In [5]: from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cancer_df,
cancer_data.target,
test_size = 0.33,
random_state = 42)
In [6]: from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
In [7]: # Create Decision Tree classifer object
dtc = DecisionTreeClassifier()
# Train Decision Tree Classifer
model = dtc.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = model.predict(X_test)
In [8]: # Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.925531914893617
In [9]: from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
Out[9]: array([[ 63, 4],
[ 10, 111]], dtype=int64)
In [10]: import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
matrix = plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Reds)
In [11]: from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.85 0.91 0.88 67
1 0.95 0.91 0.93 121
accuracy 0.91 188
macro avg 0.90 0.91 0.90 188
weighted avg 0.91 0.91 0.91 188