1+ """
2+ ============================================================================
3+ Demonstration of randomized search to compare classifier performance
4+ ============================================================================
An important step in classifier performance comparison is hyperparameter optimization. Here, we specify the classifier models we want to tune and a dictionary of hyperparameter ranges (preferably similar, for fairness in comparison) for each classifier. Then, we find the optimal hyperparameters through a function that uses RandomizedSearchCV and refit the optimized models to obtain accuracies. We can see clearly in the plot that the optimized models perform better than the default-parameter models. On the dataset we use in this example, the car dataset from OpenML-CC18, SPORF also performs better than RF overall.
6+ """
7+ print (__doc__ )
8+
9+ from sklearn .model_selection import RandomizedSearchCV
10+ from sklearn .model_selection import GridSearchCV
11+
12+ import numpy as np
13+ import math
14+ from rerf .rerfClassifier import rerfClassifier
15+ from sklearn .ensemble import RandomForestClassifier
16+ import openml
17+ from sklearn .model_selection import train_test_split
18+ from sklearn import metrics
19+ from warnings import simplefilter
20+ simplefilter (action = 'ignore' , category = FutureWarning )
21+ from warnings import simplefilter
22+ simplefilter (action = 'ignore' , category = FutureWarning )
23+
24+ import matplotlib
25+ import matplotlib .pyplot as plt
26+ import numpy as np
27+
def hyperparameter_optimization_random(X, y, *argv, n_iter=10, cv=10, random_state=None):
    """
    Find the best hyperparameters for each classifier using RandomizedSearchCV.

    Parameters
    ----------
    X : numpy.ndarray
        Input data, shape (n_samples, n_features)
    y : numpy.ndarray
        Output data, shape (n_samples, n_outputs)
    *argv : tuples (classifier, hyperparameters)
        Any number of (classifier, hyperparameters) tuples:

        classifier : sklearn-compliant classifier
            For example sklearn.ensemble.RandomForestClassifier, rerf.rerfClassifier, etc
        hyperparameters : dictionary of hyperparameter ranges
            See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html.
    n_iter : int, keyword-only, default 10
        Number of parameter settings sampled per classifier.
    cv : int or cross-validation splitter, keyword-only, default 10
        Cross-validation strategy forwarded to RandomizedSearchCV.
    random_state : int, RandomState instance or None, keyword-only, default None
        Forwarded to RandomizedSearchCV; set it to make the sampling repeatable.

    Returns
    -------
    clf_best_params : dictionary
        Maps each classifier instance that was passed in to the
        ``best_params_`` dictionary found for it. Empty when no
        (classifier, hyperparameters) tuple is given.
    """
    # Nothing to tune: return early without touching scikit-learn at all.
    if not argv:
        return {}

    import inspect  # local import: only needed for the version probe below

    search_kwargs = {"n_iter": n_iter, "cv": cv, "random_state": random_state}

    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it raises TypeError. Keep iid=False on versions that still
    # accept it and omit it everywhere else.
    if "iid" in inspect.signature(RandomizedSearchCV).parameters:
        search_kwargs["iid"] = False

    clf_best_params = {}

    # Iterate over all (classifier, hyperparameters) pairs
    for clf, params in argv:
        # Run randomized search
        random_search = RandomizedSearchCV(
            clf, param_distributions=params, **search_kwargs
        )
        random_search.fit(X, y)

        # Save results, keyed by the estimator instance itself
        clf_best_params[clf] = random_search.best_params_

    return clf_best_params
68+
69+ ###############################################################################
70+ # Building classifiers and specifying parameter ranges to sample from
71+ # ----------------------------------------------------------
72+ #
73+
# get some data
import os  # local to this script section: only used to read the optional API key

task_id = 146821  # car

# Never commit credentials: take the OpenML API key from the environment
# instead of hard-coding it. The key is only set when the variable is present.
# NOTE(review): downloading public tasks presumably works without a key —
# confirm against the OpenML client documentation.
_openml_api_key = os.environ.get("OPENML_APIKEY")
if _openml_api_key:
    openml.config.apikey = _openml_api_key

# NOTE(review): the suite object is fetched but not used further down this script.
benchmark_suite = openml.study.get_suite('OpenML-CC18')
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y()

# dataset dimensions, used below to size the hyperparameter ranges
n_samples, n_features = np.shape(X)[0], np.shape(X)[1]
82+
# build a classifier
rerf = rerfClassifier()

# specify max_depth and min_sample_splits ranges
# max_depth candidates: ten evenly spaced values between 2 and n_samples,
# rounded to integers and de-duplicated, plus None as one extra candidate.
max_depth_array_rerf = np.unique(
    np.round(np.linspace(2, n_samples, 10))
).astype(int)
max_depth_range_rerf = np.append(max_depth_array_rerf, None)

# min_samples_split candidates: values from 1 up to ln(n_samples) in steps of
# (ln(n_samples) - 2) / 10, rounded to integers and de-duplicated.
# NOTE(review): this range starts at 1 while the RandomForest range further
# down starts at 2 with the same step — confirm the difference is intended.
min_sample_splits_range_rerf = np.unique(
    np.round(
        np.arange(1, math.log(n_samples), (math.log(n_samples) - 2) / 10)
    )
).astype(int)

# specify parameters and distributions to sample from
rerf_param_dict = {
    "n_estimators": np.arange(50, 550, 50),
    "max_depth": max_depth_range_rerf,
    "min_samples_split": min_sample_splits_range_rerf,
    "feature_combinations": [1, 2, 3, 4, 5],
    "max_features": ["sqrt", "log2", None, n_features ** 2],
}
100+
# second model under comparison: a RandomForestClassifier with default settings
rf = RandomForestClassifier()

# candidate tree depths: ten evenly spaced values from 2 to n_samples, rounded
# to integers and de-duplicated, with None added as one more candidate
max_depth_array_rf = np.unique(np.linspace(2, n_samples, 10).round()).astype(int)
max_depth_range_rf = np.append(max_depth_array_rf, None)

# candidate min_samples_split values: from 2 towards ln(n_samples) in steps of
# (ln(n_samples) - 2) / 10, rounded to integers and de-duplicated
min_sample_splits_range_rf = np.unique(
    np.arange(2, math.log(n_samples), (math.log(n_samples) - 2) / 10).round()
).astype(int)

# search space handed to the randomized search for the random forest
rf_param_dict = dict(
    n_estimators=np.arange(50, 550, 50),
    max_depth=max_depth_range_rf,
    min_samples_split=min_sample_splits_range_rf,
    max_features=["sqrt", "log2", None],
)
117+
118+ ###############################################################################
119+ # Obtaining best parameters dictionary and refitting
120+ # ----------------------------------------------------------
121+ #
122+
# train test split — done BEFORE tuning so that the held-out test samples play
# no part in choosing hyperparameters (tuning on all of X, y and then scoring
# on a subset of it would leak test data into the "optimized" models)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# randomized search on the training portion only
best_params = hyperparameter_optimization_random(
    X_train, y_train, (rerf, rerf_param_dict), (rf, rf_param_dict)
)
print(best_params)

# separate each classifier's param dict; the result is keyed by the estimator
# instance, so look it up by key rather than relying on dictionary ordering
rerf_best_params = best_params[rerf]
rf_best_params = best_params[rf]

# get accuracies of optimized and default models
rerf_opti = rerfClassifier(**rerf_best_params)
rerf_opti.fit(X_train, y_train)
rerf_pred_opti = rerf_opti.predict(X_test)
rerf_acc_opti = metrics.accuracy_score(y_test, rerf_pred_opti)

rerf_default = rerfClassifier()
rerf_default.fit(X_train, y_train)
rerf_pred_default = rerf_default.predict(X_test)
rerf_acc_default = metrics.accuracy_score(y_test, rerf_pred_default)

rf_opti = RandomForestClassifier(**rf_best_params)
rf_opti.fit(X_train, y_train)
rf_pred_opti = rf_opti.predict(X_test)
rf_acc_opti = metrics.accuracy_score(y_test, rf_pred_opti)

rf_default = RandomForestClassifier()
rf_default.fit(X_train, y_train)
rf_pred_default = rf_default.predict(X_test)
rf_acc_default = metrics.accuracy_score(y_test, rf_pred_default)
153+
154+ ###############################################################################
155+ # Plotting the result
156+ # -------------------
157+
# x-axis groups: one tick per classifier
labels = ['SPORF', 'RF']

# Each bar series must hold one value per x-axis group, in the same order as
# `labels`. Grouping the values per classifier instead would draw the tuned RF
# bar under the "SPORF" tick with the "Default" legend colour.
optimized_acc = [rerf_acc_opti, rf_acc_opti]
default_acc = [rerf_acc_default, rf_acc_default]

x = np.arange(len(labels))  # tick positions
width = 0.35  # bar width; the two series sit side by side around each tick

fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, optimized_acc, width, label='Optimized')
rects2 = ax.bar(x + width / 2, default_acc, width, label='Default')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy of Optimized/Default SPORF and RF Models on car Dataset')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
175+
def autolabel(rects):
    """Write each bar's height, rounded to three decimals, just above the bar.

    Annotations are drawn on the module-level ``ax`` axes.
    """
    for bar in rects:
        # value shown on the plot; also used as the anchor's y coordinate
        shown_height = float('%.3f' % (bar.get_height()))
        center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            '{}'.format(shown_height),
            xy=(center_x, shown_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )
# annotate every bar of both series with its height
for bar_group in (rects1, rects2):
    autolabel(bar_group)

# tighten the layout, restrict the y-axis to the 0.9-1 band, then display
fig.tight_layout()
plt.ylim((.9, 1))

plt.show()
0 commit comments