Merge pull request #6173 from dsquareindia/featurehasher_fix · scikit-learn/scikit-learn@c2eaf75 · GitHub
[go: up one dir, main page]

Skip to content

Commit c2eaf75

Browse files
committed
Merge pull request #6173 from dsquareindia/featurehasher_fix
[MRG+1] Fix: FeatureHasher now accepts string values
2 parents 113ee40 + eb242c2 commit c2eaf75

File tree

3 files changed

+37
-2
lines changed

3 files changed

+37
-2
lines changed

doc/whats_new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ New features
5050
Enhancements
5151
............
5252

53+
- :class:`feature_extraction.FeatureHasher` now accepts string values.
54+
(`#6173 <https://github.com/scikit-learn/scikit-learn/pull/6173>`_) By `Ryad Zenine`_
55+
and `Devashish Deshpande`_.
56+
5357
- The cross-validation iterators are replaced by cross-validation splitters
5458
available from :mod:`model_selection`. These expose a ``split`` method
5559
that takes in the data and yields a generator for the different splits.
@@ -4121,3 +4125,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
41214125
.. _Jonathan Arfa: https://github.com/jarfa
41224126

41234127
.. _Anish Shah: https://github.com/AnishShah
4128+
4129+
.. _Ryad Zenine: https://github.com/ryadzenine

sklearn/feature_extraction/_hashing.pyx

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ from libc.stdlib cimport abs
88
cimport numpy as np
99
import numpy as np
1010

11+
from ..externals.six import string_types
12+
1113
from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
1214

1315
np.import_array()
@@ -43,7 +45,12 @@ def transform(raw_X, Py_ssize_t n_features, dtype):
4345

4446
for x in raw_X:
4547
for f, v in x:
46-
value = v
48+
if isinstance(v, string_types):
49+
f = "%s%s%s" % (f, '=', v)
50+
value = 1
51+
else:
52+
value = v
53+
4754
if value == 0:
4855
continue
4956

@@ -53,6 +60,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype):
5360
# all exceptions. Add "except *" there?
5461
elif not isinstance(f, bytes):
5562
raise TypeError("feature names must be strings")
63+
5664
h = murmurhash3_bytes_s32(f, 0)
5765

5866
array.resize_smart(indices, len(indices) + 1)

sklearn/feature_extraction/tests/test_feature_hasher.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ def test_feature_hasher_dicts():
1212
h = FeatureHasher(n_features=16)
1313
assert_equal("dict", h.input_type)
1414

15-
raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
15+
raw_X = [{"foo": "bar", "dada": 42, "tzara": 37},
16+
{"foo": "baz", "gaga": u"string1"}]
1617
X1 = FeatureHasher(n_features=16).transform(raw_X)
1718
gen = (iter(d.items()) for d in raw_X)
1819
X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
@@ -53,6 +54,26 @@ def test_feature_hasher_pairs():
5354
assert_equal([1, 3, 4], x2_nz)
5455

5556

57+
def test_feature_hasher_pairs_with_string_values():
58+
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
59+
{"baz": u"abc", "quux": 4, "foo": -1}])
60+
h = FeatureHasher(n_features=16, input_type="pair")
61+
x1, x2 = h.transform(raw_X).toarray()
62+
x1_nz = sorted(np.abs(x1[x1 != 0]))
63+
x2_nz = sorted(np.abs(x2[x2 != 0]))
64+
assert_equal([1, 1], x1_nz)
65+
assert_equal([1, 1, 4], x2_nz)
66+
67+
raw_X = (iter(d.items()) for d in [{"bax": "abc"},
68+
{"bax": "abc"}])
69+
x1, x2 = h.transform(raw_X).toarray()
70+
x1_nz = np.abs(x1[x1 != 0])
71+
x2_nz = np.abs(x2[x2 != 0])
72+
assert_equal([1], x1_nz)
73+
assert_equal([1], x2_nz)
74+
assert_equal(x1, x2)
75+
76+
5677
def test_hash_empty_input():
5778
n_features = 16
5879
raw_X = [[], (), iter(range(0))]

0 commit comments

Comments
 (0)
0