ENH: Add get_feature_names to PCA
commit 4fd22e1 (1 parent: ea9896e)

File tree: sklearn/decomposition/pca.py · sklearn/decomposition/tests/test_pca.py

2 files changed: +90 −0 lines changed

sklearn/decomposition/pca.py

Lines changed: 74 additions & 0 deletions
@@ -278,6 +278,80 @@ def __init__(self, n_components=None, copy=True, whiten=False,
        self.iterated_power = iterated_power
        self.random_state = random_state

    def get_feature_names(self, input_features=None, show_coef=False):
        """Return the dominant feature names for each component.

        Parameters
        ----------
        input_features : list of string, shape (n_features), optional
            String names for the input features, if available. By default,
            "x0", "x1", ... "xn_features" is used.

        show_coef : boolean or integer, default False
            If True, return each principal component as a linear combination
            of the input features. If False, return only the component names.
            If an integer n, return the top n contributions to each component,
            sorted by absolute value.

        Returns
        -------
        output_feature_names : list of string, shape (n_components)
            If show_coef is True, each name shows the contributions of the
            input features; if False, it is just the component name.

        Examples
        --------
        >>> import numpy as np
        >>> from sklearn.decomposition import PCA
        >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        >>> pca = PCA(n_components=2).fit(X)
        >>> pca.get_feature_names(show_coef=True)
        ['0.84*x0 + 0.54*x1', '0.54*x0 - 0.84*x1']
        >>> pca.get_feature_names(show_coef=1)
        ['0.84*x0', '-0.84*x1']
        >>> pca.get_feature_names()
        ['pc0', 'pc1']
        """
        check_is_fitted(self, 'components_')

        n_features = self.components_.shape[1]
        components = self.components_

        if input_features is None:
            input_features = ['x%d' % i for i in range(n_features)]
        elif len(input_features) != n_features:
            raise ValueError("Length of input_features is {0} but it must "
                             "equal the number of features when fitted: "
                             "{1}.".format(len(input_features), n_features))

        def name_generator(coefficients, names):
            # Format the leading term with its own sign, then append the
            # remaining terms as "+/- |coef|*name".
            yield "{0:.2g}*{1}".format(coefficients[0], names[0])
            for c, n in zip(coefficients[1:], names[1:]):
                yield "{0:s} {1:.2g}*{2}".format('-' if c < 0 else '+',
                                                 abs(c), n)

        if show_coef is True:
            feature_names = [' '.join(name_generator(components[i],
                                                     input_features))
                             for i in range(self.n_components)]
        elif show_coef is False:
            feature_names = ['pc{0}'.format(i)
                             for i in range(self.n_components)]
        elif isinstance(show_coef, six.integer_types):
            if show_coef < 1 or show_coef > n_features:
                raise ValueError("show_coef is {0} but it must be between "
                                 "1 and the number of features "
                                 "{1}.".format(show_coef, n_features))
            # Indices of the largest absolute loadings, per component.
            contribution = np.argsort(np.abs(components), axis=1)[:, ::-1]
            required = contribution[:, :show_coef]
            input_features = np.asarray(input_features)
            feature_names = [' '.join(name_generator(components[i][required[i]],
                                                     input_features[required[i]]))
                             for i in range(self.n_components)]
        else:
            raise ValueError("show_coef must be an integer or boolean.")
        return feature_names

    def fit(self, X, y=None):
        """Fit the model with X.
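For readers skimming the diff: the integer `show_coef` branch amounts to ranking each component's loadings by absolute value and formatting the top entries. Below is a minimal standalone sketch of the `show_coef=1` case only, outside the class; it uses plain numpy on a fitted PCA, and the variable names are illustrative rather than part of the commit.

import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=2).fit(X)

# Index of the largest-|loading| input feature for each component.
top_idx = np.argmax(np.abs(pca.components_), axis=1)
names = ['{0:.2g}*x{1}'.format(pca.components_[i, j], j)
         for i, j in enumerate(top_idx)]
print(names)  # expected to match pca.get_feature_names(show_coef=1),
              # i.e. ['0.84*x0', '-0.84*x1'] for this X (signs can flip
              # depending on the SVD solver)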

sklearn/decomposition/tests/test_pca.py

Lines changed: 16 additions & 0 deletions
@@ -11,6 +11,7 @@
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_less
from sklearn.utils.testing import assert_array_equal

from sklearn import datasets
from sklearn.decomposition import PCA
@@ -507,3 +508,18 @@ def fit_deprecated(X):
    assert_warns_message(DeprecationWarning, depr_message, fit_deprecated, X)
    Y_pca = PCA(svd_solver='randomized', random_state=0).fit_transform(X)
    assert_array_almost_equal(Y, Y_pca)


def test_get_feature_names():
    X1 = np.array([[-1, -1, 3], [-2, -1, 1], [-3, -2, -1], [1, 1, 2]])
    pca = PCA(n_components=2).fit(X1)
    assert_array_equal(pca.get_feature_names(), ['pc0', 'pc1'])
    assert_array_equal(pca.get_feature_names(show_coef=True),
                       ['-0.66*x0 - 0.46*x1 - 0.59*x2',
                        '0.38*x0 + 0.47*x1 - 0.79*x2'])
    assert_array_equal(pca.get_feature_names(show_coef=1),
                       ['-0.66*x0', '-0.79*x2'])
    # Raise an error when len(input_features) != n_features.
    assert_raises(ValueError, pca.get_feature_names, ['a'])
    # Raise an error when show_coef is greater than n_features.
    assert_raises(ValueError, pca.get_feature_names, None, 4)
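The literal strings asserted above are just the two fitted components formatted to two significant digits. A quick sanity-check snippet (not part of the commit; exact signs can depend on the underlying solver) to see where they come from:

import numpy as np
from sklearn.decomposition import PCA

X1 = np.array([[-1, -1, 3], [-2, -1, 1], [-3, -2, -1], [1, 1, 2]])
pca = PCA(n_components=2).fit(X1)
# Rows hold the loadings the formatted names are built from; per the test
# above they should round to roughly [-0.66, -0.46, -0.59] and
# [0.38, 0.47, -0.79].
print(np.round(pca.components_, 2))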

0 commit comments