From eb242c26d9223833497677584f33a4db4e96a9d1 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Fri, 15 Jan 2016 21:46:31 +0530 Subject: [PATCH] ENH: FeatureHasher now accepts string values. --- doc/whats_new.rst | 6 +++++ sklearn/feature_extraction/_hashing.pyx | 10 +++++++- .../tests/test_feature_hasher.py | 23 ++++++++++++++++++- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index def4de24fffd7..78a4d46ef67ed 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -50,6 +50,10 @@ New features Enhancements ............ + - :class:`feature_extraction.FeatureHasher` now accepts string values. + (`#6173 <https://github.com/scikit-learn/scikit-learn/pull/6173>`_) By `Ryad Zenine`_ + and `Devashish Deshpande`_. + - The cross-validation iterators are replaced by cross-validation splitters available from :mod:`model_selection`. These expose a ``split`` method that takes in the data and yields a generator for the different splits. @@ -4121,3 +4125,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Jonathan Arfa: https://github.com/jarfa .. _Anish Shah: https://github.com/AnishShah + +.. _Ryad Zenine: https://github.com/ryadzenine diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx index 98600c9aa35e4..201082e94cbf3 100644 --- a/sklearn/feature_extraction/_hashing.pyx +++ b/sklearn/feature_extraction/_hashing.pyx @@ -8,6 +8,8 @@ from libc.stdlib cimport abs cimport numpy as np import numpy as np +from ..externals.six import string_types + from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32 np.import_array() @@ -43,7 +45,12 @@ def transform(raw_X, Py_ssize_t n_features, dtype): for x in raw_X: for f, v in x: - value = v + if isinstance(v, string_types): + f = "%s%s%s" % (f, '=', v) + value = 1 + else: + value = v + if value == 0: continue @@ -53,6 +60,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype): # all exceptions. Add "except *" there? 
elif not isinstance(f, bytes): raise TypeError("feature names must be strings") + h = murmurhash3_bytes_s32(f, 0) array.resize_smart(indices, len(indices) + 1) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index c12919762aa14..34024d59bb1ba 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -12,7 +12,8 @@ def test_feature_hasher_dicts(): h = FeatureHasher(n_features=16) assert_equal("dict", h.input_type) - raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}] + raw_X = [{"foo": "bar", "dada": 42, "tzara": 37}, + {"foo": "baz", "gaga": u"string1"}] X1 = FeatureHasher(n_features=16).transform(raw_X) gen = (iter(d.items()) for d in raw_X) X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen) @@ -53,6 +54,26 @@ def test_feature_hasher_pairs(): assert_equal([1, 3, 4], x2_nz) +def test_feature_hasher_pairs_with_string_values(): + raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"}, + {"baz": u"abc", "quux": 4, "foo": -1}]) + h = FeatureHasher(n_features=16, input_type="pair") + x1, x2 = h.transform(raw_X).toarray() + x1_nz = sorted(np.abs(x1[x1 != 0])) + x2_nz = sorted(np.abs(x2[x2 != 0])) + assert_equal([1, 1], x1_nz) + assert_equal([1, 1, 4], x2_nz) + + raw_X = (iter(d.items()) for d in [{"bax": "abc"}, + {"bax": "abc"}]) + x1, x2 = h.transform(raw_X).toarray() + x1_nz = np.abs(x1[x1 != 0]) + x2_nz = np.abs(x2[x2 != 0]) + assert_equal([1], x1_nz) + assert_equal([1], x2_nz) + assert_equal(x1, x2) + + def test_hash_empty_input(): n_features = 16 raw_X = [[], (), iter(range(0))]