3_ID3_algorithm_updated
January 20, 2025
1 Experiment 3
2 Write a program to demonstrate the working of a decision
tree based on the ID3 algorithm. Use an appropriate data set for
building the decision tree, and apply this knowledge to classify
a new sample.
import pandas as pd
import numpy as np
# Load the classic "play tennis" dataset; expects tennis.csv in the
# working directory with columns Outlook, Temperature, Humidity,
# Windy and the class label PlayTennis (see the printed table below).
df_tennis = pd.read_csv("tennis.csv")
df_tennis
[10]: Outlook Temperature Humidity Windy PlayTennis
0 Sunny Hot High Weak No
1 Sunny Hot High Strong No
2 Overcast Hot High Weak Yes
3 Rainy Mild High Weak Yes
4 Rainy Cool Normal Weak Yes
5 Rainy Cool Normal Strong No
6 Overcast Cool Normal Strong Yes
7 Sunny Mild High Weak No
8 Sunny Cool Normal Weak Yes
9 Rainy Mild Normal Weak Yes
10 Sunny Mild Normal Strong Yes
11 Overcast Mild High Strong Yes
12 Overcast Hot Normal Weak Yes
13 Rainy Mild High Strong No
from collections import Counter


def entropy_list(a_list):
    """Return the Shannon entropy (in bits) of the label distribution in *a_list*.

    Counts how often each distinct value occurs, converts the counts to
    probabilities, and delegates the actual entropy computation to
    ``entropy`` (defined in the next cell).
    """
    counts = Counter(a_list)
    total = float(len(a_list))
    probabilities = [c / total for c in counts.values()]
    return entropy(probabilities)
1
import math


def entropy(probs):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Parameters
    ----------
    probs : iterable of float
        Probabilities of each outcome (should sum to 1).

    Returns
    -------
    float
        ``sum(-p * log2(p))`` over all probabilities. Zero probabilities
        are skipped, since lim(p->0) p*log(p) = 0 — the original version
        raised ``ValueError: math domain error`` on a 0 entry.
    """
    return sum(-p * math.log(p, 2) for p in probs if p > 0)
def info_gain(df, split, target, trace=0):
    """Information gain of splitting *df* on column *split* w.r.t. *target*.

    Computed as H(target) minus the weighted average entropy of the
    target within each group produced by grouping on *split*. The
    ``trace`` parameter is accepted for interface compatibility but is
    unused (as in the original).
    """
    total_rows = float(len(df.index))

    # Expected entropy after the split: each group's entropy weighted
    # by the fraction of rows that fall into that group.
    remainder = 0.0
    for _, subset in df.groupby(split):
        weight = len(subset) / total_rows
        remainder += weight * entropy_list(subset[target])

    # Gain = entropy before the split - expected entropy after it.
    return entropy_list(df[target]) - remainder
def id3(df, target, attribute_name, default_class=None):
    """Build an ID3 decision tree from *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Training instances.
    target : str
        Name of the class-label column.
    attribute_name : list of str
        Candidate attributes still available for splitting.
    default_class : optional
        Label to return for an empty subset / exhausted attributes.

    Returns
    -------
    Either a class label (leaf) or a nested dict of the form
    ``{attribute: {value: subtree_or_label, ...}}``.
    """
    cnt = Counter(df[target])
    if len(cnt) == 1:
        # Pure node: every instance has the same label.
        return next(iter(cnt))
    elif df.empty or (not attribute_name):
        return default_class
    else:
        # Majority class of this node becomes the fallback label.
        # Bug fix: the original used max(cnt.keys()), i.e. the
        # lexicographically greatest label, not the most frequent one.
        default_class = cnt.most_common(1)[0][0]

        # Split on the attribute with the highest information gain.
        gains = [info_gain(df, attr, target) for attr in attribute_name]
        best_attr = attribute_name[gains.index(max(gains))]

        tree = {best_attr: {}}
        remaining_attr = [a for a in attribute_name if a != best_attr]
        for attr_val, data_subset in df.groupby(best_attr):
            tree[best_attr][attr_val] = id3(
                data_subset, target, remaining_attr, default_class
            )
        return tree
def classify(instance, tree, default=None):
    """Classify one *instance* by walking a decision *tree*.

    Parameters
    ----------
    instance : mapping (e.g. a dict or pandas Series)
        Maps attribute names to the instance's values.
    tree : dict or label
        Tree of the form ``{attribute: {value: subtree_or_label}}``.
    default : optional
        Returned when the instance's attribute value has no branch
        in the tree (an unseen value).

    Returns
    -------
    The predicted class label, or *default* for an unmatched branch.
    """
    attribute = next(iter(tree))  # root attribute of this (sub)tree
    branches = tree[attribute]
    if instance[attribute] in branches:
        result = branches[instance[attribute]]
        if isinstance(result, dict):
            # Bug fix: propagate *default* into the recursion; the
            # original dropped it, so unseen values deep in the tree
            # returned None instead of the caller's default.
            return classify(instance, result, default)
        return result
    return default
# Every column except the class label is a candidate split attribute.
attribute_names = list(df_tennis.columns)
attribute_names.remove('PlayTennis')

# Hold out the last 4 rows for testing and train on the rest.
# Fixes vs. the original: iloc[1:-4] silently dropped row 0 from
# training (and the comments claimed "thousand" instances), and
# .copy() on the test slice avoids the SettingWithCopyWarning that
# the original raised when assigning the 'predicted2' column.
training_data = df_tennis.iloc[:-4]
test_data = df_tennis.iloc[-4:].copy()

train_tree = id3(training_data, 'PlayTennis', attribute_names)
print("\n\nThe Resultant Decision train_tree is :\n")
print(train_tree)

# Predict each held-out row; 'Yes' is the default for unseen branches.
test_data['predicted2'] = test_data.apply(classify, axis=1, args=(train_tree, 'Yes'))

print('\n\n Training the model for a few samples, and again predicting \'Playtennis\' for remaining attribute')
print('The Accuracy for new trained data is : '
      + str(sum(test_data['PlayTennis'] == test_data['predicted2'])
            / (1.0 * len(test_data.index))))
The Resultant Decision train_tree is :
{'Outlook': {'Overcast': 'Yes', 'Rainy': {'Windy': {'Strong': 'No', 'Weak':
'Yes'}}, 'Sunny': {'Temperature': {'Cool': 'Yes', 'Hot': 'No', 'Mild': 'No'}}}}
Training the model for a few samples, and again predicting 'Playtennis' for
remaining attribute
The Accuracy for new trained data is : 0.75
C:\Users\Admin\AppData\Local\Temp\ipykernel_4940\150528394.py:8:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-
docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
test_data['predicted2'] =
test_data.apply(classify,axis=1,args=(train_tree,'Yes') )
[ ]: