@@ -1,7 +1,7 @@
 """
-=========================================================
+================
 The Iris Dataset
-=========================================================
+================
 This dataset consists of 3 different types of irises'
 (Setosa, Versicolour, and Virginica) petal and sepal
 lengths, stored in a 150x4 numpy.ndarray
@@ -19,57 +19,71 @@
 # Modified for documentation by Jaques Grobler
 # License: BSD 3 clause
 
-import matplotlib.pyplot as plt
-
-# unused but required import for doing 3d projections with matplotlib < 3.2
-import mpl_toolkits.mplot3d  # noqa: F401
-
+# %%
+# Loading the iris dataset
+# ------------------------
 from sklearn import datasets
-from sklearn.decomposition import PCA
 
-# import some data to play with
 iris = datasets.load_iris()
-X = iris.data[:, :2]  # we only take the first two features.
-y = iris.target
 
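Aside: load_iris() returns a Bunch object whose attributes hold the data matrix, the integer labels, and the human-readable names that the rewritten plotting code below relies on. A minimal sketch for inspecting it (the values in the comments are what the standard dataset yields):

    from sklearn import datasets

    iris = datasets.load_iris()
    print(iris.data.shape)     # (150, 4): 150 flowers x 4 measurements
    print(iris.feature_names)  # ['sepal length (cm)', 'sepal width (cm)', ...]
    print(iris.target_names)   # ['setosa' 'versicolor' 'virginica']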
-x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
-y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
 
-plt.figure(2, figsize=(8, 6))
-plt.clf()
+# %%
+# Scatter Plot of the Iris dataset
+# --------------------------------
+import matplotlib.pyplot as plt
+
+_, ax = plt.subplots()
+scatter = ax.scatter(iris.data[:, 0], iris.data[:, 1], c=iris.target)
+ax.set(xlabel=iris.feature_names[0], ylabel=iris.feature_names[1])
+_ = ax.legend(
+    scatter.legend_elements()[0], iris.target_names, loc="lower right", title="Classes"
+)
 
-# Plot the training points
-plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor="k")
-plt.xlabel("Sepal length")
-plt.ylabel("Sepal width")
+# %%
+# Each point in the scatter plot refers to one of the 150 iris flowers
+# in the dataset, with the color indicating their respective type
+# (Setosa, Versicolour, and Virginica).
+# You can already see a pattern regarding the Setosa type, which is
+# easily identifiable based on its short and wide sepal. Considering
+# only these two dimensions, sepal width and length, there is still
+# overlap between the Versicolour and Virginica types.
+
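The pattern described in that comment block can be checked numerically. A minimal sketch, reusing the iris object loaded in the snippet above, that prints each class's sepal ranges; the one-dimensional ranges all overlap, so it is the combination of short length and large width that makes Setosa stand out in the scatter plot:

    for label, name in enumerate(iris.target_names):
        sepal = iris.data[iris.target == label, :2]  # sepal length, sepal width
        print(
            name,
            "length %.1f-%.1f" % (sepal[:, 0].min(), sepal[:, 0].max()),
            "width %.1f-%.1f" % (sepal[:, 1].min(), sepal[:, 1].max()),
        )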
+# %%
+# Plot a PCA representation
+# -------------------------
+# Let's apply a Principal Component Analysis (PCA) to the iris dataset
+# and then plot the irises across the first three PCA dimensions.
+# This will allow us to better differentiate between the three types!
 
-plt.xlim(x_min, x_max)
-plt.ylim(y_min, y_max)
-plt.xticks(())
-plt.yticks(())
+# unused but required import for doing 3d projections with matplotlib < 3.2
+import mpl_toolkits.mplot3d  # noqa: F401
+
+from sklearn.decomposition import PCA
 
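The comment on that import is worth unpacking: on matplotlib older than 3.2, importing mpl_toolkits.mplot3d is what registers the "3d" projection, while newer versions register it automatically. A version-guarded variant of the same import, a sketch assuming the packaging distribution is available (the unconditional import kept in the diff is harmless either way):

    import matplotlib
    from packaging.version import Version

    if Version(matplotlib.__version__) < Version("3.2"):
        # On older matplotlib, this import registers the "3d" projection.
        import mpl_toolkits.mplot3d  # noqa: F401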
-# To getter a better understanding of interaction of the dimensions
-# plot the first three PCA dimensions
 fig = plt.figure(1, figsize=(8, 6))
 ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
 
 X_reduced = PCA(n_components=3).fit_transform(iris.data)
 ax.scatter(
     X_reduced[:, 0],
     X_reduced[:, 1],
     X_reduced[:, 2],
-    c=y,
-    cmap=plt.cm.Set1,
-    edgecolor="k",
+    c=iris.target,
     s=40,
 )
 
-ax.set_title("First three PCA directions")
-ax.set_xlabel("1st eigenvector")
+ax.set_title("First three PCA dimensions")
+ax.set_xlabel("1st Eigenvector")
 ax.xaxis.set_ticklabels([])
-ax.set_ylabel("2nd eigenvector")
+ax.set_ylabel("2nd Eigenvector")
 ax.yaxis.set_ticklabels([])
-ax.set_zlabel("3rd eigenvector")
+ax.set_zlabel("3rd Eigenvector")
 ax.zaxis.set_ticklabels([])
 
 plt.show()
+
+# %%
+# PCA will create 3 new features that are a linear combination of the
+# 4 original features. In addition, this transformation maximizes the
+# variance. With this transformation, we see that we can identify each
+# species using only the first feature (i.e., the first principal component).
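The variance claim in that closing comment can be made concrete with the fitted estimator's explained_variance_ratio_ attribute. A minimal standalone sketch (the printed values are approximate):

    from sklearn import datasets
    from sklearn.decomposition import PCA

    iris = datasets.load_iris()
    pca = PCA(n_components=3).fit(iris.data)
    # Share of the total variance captured by each component; the first one
    # dominates, which is why it alone nearly separates the three species.
    print(pca.explained_variance_ratio_)  # approx. [0.92, 0.05, 0.02]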