From eb242c26d9223833497677584f33a4db4e96a9d1 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Fri, 15 Jan 2016 21:46:31 +0530
Subject: [PATCH] ENH: FeatureHasher now accepts string values.

---
 doc/whats_new.rst                             |  6 +++++
 sklearn/feature_extraction/_hashing.pyx       | 10 +++++++-
 .../tests/test_feature_hasher.py              | 23 ++++++++++++++++++-
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index def4de24fffd7..78a4d46ef67ed 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -50,6 +50,10 @@ New features
 Enhancements
 ............
 
+   - :class:`feature_extraction.FeatureHasher` now accepts string values; a string
+     value is hashed as the feature ``name=value`` with a value of 1. (`#6173 <https://github.com/scikit-learn/scikit-learn/pull/6173>`_)
+     By `Ryad Zenine`_ and `Devashish Deshpande`_.
+
    - The cross-validation iterators are replaced by cross-validation splitters
      available from :mod:`model_selection`. These expose a ``split`` method
      that takes in the data and yields a generator for the different splits.
@@ -4121,3 +4125,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson.
 .. _Jonathan Arfa: https://github.com/jarfa
 
 .. _Anish Shah: https://github.com/AnishShah
+
+.. _Ryad Zenine: https://github.com/ryadzenine
diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index 98600c9aa35e4..201082e94cbf3 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -8,6 +8,8 @@ from libc.stdlib cimport abs
 cimport numpy as np
 import numpy as np
 
+from ..externals.six import string_types
+
 from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
 
 np.import_array()
@@ -43,7 +45,12 @@ def transform(raw_X, Py_ssize_t n_features, dtype):
 
     for x in raw_X:
         for f, v in x:
-            value = v
+            if isinstance(v, string_types):
+                f = "%s%s%s" % (f, '=', v)
+                value = 1
+            else:
+                value = v
+
             if value == 0:
                 continue
 
@@ -53,6 +60,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype):
             # all exceptions. Add "except *" there?
             elif not isinstance(f, bytes):
                 raise TypeError("feature names must be strings")
+
             h = murmurhash3_bytes_s32(f, 0)
 
             array.resize_smart(indices, len(indices) + 1)
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index c12919762aa14..34024d59bb1ba 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -12,7 +12,8 @@ def test_feature_hasher_dicts():
     h = FeatureHasher(n_features=16)
     assert_equal("dict", h.input_type)
 
-    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
+    raw_X = [{"foo": "bar", "dada": 42, "tzara": 37},
+             {"foo": "baz", "gaga": u"string1"}]
     X1 = FeatureHasher(n_features=16).transform(raw_X)
     gen = (iter(d.items()) for d in raw_X)
     X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
@@ -53,6 +54,26 @@ def test_feature_hasher_pairs():
     assert_equal([1, 3, 4], x2_nz)
 
 
+def test_feature_hasher_pairs_with_string_values():
+    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
+                                       {"baz": u"abc", "quux": 4, "foo": -1}])
+    h = FeatureHasher(n_features=16, input_type="pair")
+    x1, x2 = h.transform(raw_X).toarray()
+    x1_nz = sorted(np.abs(x1[x1 != 0]))  # compare magnitudes of non-zeros only
+    x2_nz = sorted(np.abs(x2[x2 != 0]))
+    assert_equal([1, 1], x1_nz)  # the string value "a" is expected to count as 1
+    assert_equal([1, 1, 4], x2_nz)  # u"abc" counts as 1; abs() maps -1 to 1
+
+    raw_X = (iter(d.items()) for d in [{"bax": "abc"},
+                                       {"bax": "abc"}])
+    x1, x2 = h.transform(raw_X).toarray()
+    x1_nz = np.abs(x1[x1 != 0])
+    x2_nz = np.abs(x2[x2 != 0])
+    assert_equal([1], x1_nz)  # one string-valued pair -> a single non-zero entry
+    assert_equal([1], x2_nz)
+    assert_equal(x1, x2)  # identical input rows must give identical output rows
+
+
 def test_hash_empty_input():
     n_features = 16
     raw_X = [[], (), iter(range(0))]