L3_Classification_RandomForest - Jupyter Notebook
# Loading dataset
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets, metrics, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Load the built-in Iris dataset: 4 measurement features per sample
# (sepal/petal length and width, in cm) and 3 species classes (0, 1, 2).
iris = datasets.load_iris()
In [2]:
In [3]:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
In [4]:
# Build a DataFrame of the independent (feature) columns from iris.data.
# NOTE: no `columns=` argument is passed, so the column labels default to
# the integers 0-3 (as the head() output below confirms).
dataset = pd.DataFrame(iris.data)
In [5]:
# Show the first five rows. A bare trailing expression uses Jupyter's rich
# (HTML) rendering, which is preferred over print() for DataFrames.
dataset.head()
0 1 2 3
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [6]:
# Attach the target as a new 'species' column; iris.target holds the class
# codes 0, 1, 2 (setosa, versicolor, virginica respectively).
dataset = dataset.assign(species=iris.target)
In [7]:
# Inspect the full frame via rich display rather than print(); Jupyter
# truncates the middle rows automatically, keeping the output compact.
dataset
In [8]:
# Separate features from the target: every column except the last one is a
# feature; the last column ('species') is the label.
feature_cols = dataset.columns[:-1]
X = dataset.loc[:, feature_cols]
y = dataset[dataset.columns[-1]]
In [9]:
# The export lost the cells that created the train/test split and the
# classifier, leaving `clf` and `X_train` undefined; reconstruct them here.
# 70/30 split, then a 100-tree random forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# The original run reported accuracy of about 0.911 on all four features.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
In [10]:
# Classify one hand-crafted sample; feature order matches iris.feature_names:
# [sepal length, sepal width, petal length, petal width] in cm.
clf.predict([[3, 3, 2, 2]])
Out[10]:
array([0])
In [11]:
# This implies it is the setosa flower type (class 0), since the data set
# contains three species/classes: Setosa, Versicolor, and Virginica.
In [12]:
# Classify a second hand-crafted sample (same feature order as above).
# The original run returned class 2, i.e. virginica.
clf.predict([[3, 5, 5, 2]])
Out[12]:
array([2])
In [13]:
# Now we will also compute the feature importances of the IRIS dataset, which can be used for feature selection.
In [14]:
# Re-fit the forest on the training split; a fitted model is required before
# reading feature_importances_ in the next cell.
clf.fit(X_train, y_train)
Out[14]:
In [15]:
# The cell that computed feature_imp was lost in the export; rebuild it:
# one importance score per feature from the fitted forest, sorted descending.
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
feature_imp
Out[15]:
In [16]:
# Keep only the three most important features — petal length (column 2),
# petal width (column 3) and sepal length (column 0) — dropping sepal width.
# BUG FIX: `dataset` has integer column labels 0-3, so the previous
# dataset[['petallength', 'petalwidth', 'sepallength']] raised a KeyError;
# select by position instead.
X = dataset.iloc[:, [2, 3, 0]]
y = dataset['species']

# Re-split on the reduced feature set (the split cell was lost in export).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# The original run reported accuracy of about 0.933 on three features.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
In [18]:
# We can see that after removing the least important feature (sepal width), the accuracy increased.
# This is because we removed misleading data and noise, resulting in higher accuracy.
# Using fewer features also reduces the training time.
In [19]:
# Visualize the first decision tree of the forest; estimators_ is indexed
# 0..99 for a 100-tree forest, so the first tree is estimators_[0].
# NOTE(review): the plotting call was lost in the export (only the figure
# creation survived); sklearn.tree.plot_tree is the standard way to render it.
fig = plt.figure(figsize=(25, 20))
tree.plot_tree(clf.estimators_[0], filled=True)
plt.show()
In [21]:
# Visualize the second decision tree (estimators_[1]) the same way.
# NOTE(review): the plotting call was lost in the export; reconstructed
# with sklearn.tree.plot_tree to match the previous cell.
fig = plt.figure(figsize=(25, 20))
tree.plot_tree(clf.estimators_[1], filled=True)
plt.show()
In [ ]: