Add: allow dropped_first to be any hashable type

pandas-dev · jreback · Jun 30, 2022 · Jun 9, 2021 · Jun 9, 2021 · Jun 14, 2021
commit 0cf35d8d8511fca4838438c540dd056488d32a1c
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -1101,7 +1101,7 @@ def from_dummies(
     data: DataFrame,
     subset: None | Index | list[Hashable] = None,
     sep: None | str | dict[str, str] = None,
-    dropped_first: None | str | dict[str, str] = None,
+    dropped_first: None | Hashable | dict[str, Hashable] = None,
 ) -> DataFrame:
     """
     Create a categorical `DataFrame` from a `DataFrame` of dummy variables.
@@ -1123,7 +1123,7 @@ def from_dummies(
         you can strip the underscore by specifying sep='_'.
         Alternatively, pass a dictionary to map prefix separators to prefixes if
         multiple and/or mixed separators are used in the column names.
-    dropped_fist : None, str or dict of str, default None
+    dropped_first : None, Hashable or dict of Hashables, default None
         The implied value the dummy takes when all values are zero.
         Can be a a single value for all variables or a dict directly mapping the
         dropped value to a prefix of a variable.
@@ -1219,7 +1219,7 @@ def from_dummies(
                     f"First instance column: {col}"
                 )
     elif isinstance(sep, str):
-        variables_slice: dict[str, list] = {}
+        variables_slice = {}
         for col in data_to_decode.columns:
             prefix = col.split(sep)[0]
             if len(prefix) == len(col):
@@ -1250,24 +1250,24 @@ def check_len(item, name) -> None:
     if dropped_first:
         if isinstance(dropped_first, dict):
             check_len(dropped_first, "dropped_first")
-        elif isinstance(dropped_first, str):
+        elif isinstance(dropped_first, Hashable):
             dropped_first = dict(
                 zip(variables_slice, [dropped_first] * len(variables_slice))
             )
         else:
             raise TypeError(
-                f"Expected 'dropped_first' to be of type 'str' or 'dict'; "
+                f"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; "
                 f"Received 'dropped_first' of type: {type(dropped_first).__name__}"
             )
 
     cat_data = {}
     for prefix, prefix_slice in variables_slice.items():
         if sep is None:
             cats = subset.copy()
-        elif isinstance(sep, str):
-            cats = [col[len(prefix + sep) :] for col in prefix_slice]
         elif isinstance(sep, dict):
             cats = [col[len(prefix + sep[prefix]) :] for col in prefix_slice]
+        else:
+            cats = [col[len(prefix + sep) :] for col in prefix_slice]
         assigned = data_to_decode[prefix_slice].sum(axis=1)
         if any(assigned > 1):
             raise ValueError(

diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py
@@ -103,8 +103,8 @@ def test_from_dummies_no_prefix_wrong_dropped_first_type():
     with pytest.raises(
         TypeError,
         match=(
-            r"Expected \'dropped_first\' to be of type \'str\' or \'dict\'; "
-            r"Received \'dropped_first\' of type: list"
+            r"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; "
+            r"Received 'dropped_first' of type: list"
         ),
     ):
         from_dummies(dummies, dropped_first=["c", "d"])
@@ -263,15 +263,14 @@ def test_from_dummies_with_prefix_dropped_first_str(dummies_with_unassigned):
 
 
 def test_from_dummies_with_prefix_dropped_first_wrong_type(dummies_with_unassigned):
-
     with pytest.raises(
         TypeError,
         match=(
-            r"Expected 'dropped_first' to be of type 'str' or 'dict'; "
-            r"Received 'dropped_first' of type: tuple"
+            r"Expected 'dropped_first' to be of type 'Hashable' or 'dict'; "
+            r"Received 'dropped_first' of type: list"
         ),
     ):
-        from_dummies(dummies_with_unassigned, sep="_", dropped_first=("x", "y"))
+        from_dummies(dummies_with_unassigned, sep="_", dropped_first=["x", "y"])
 
 
 def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned):
@@ -284,6 +283,26 @@ def test_from_dummies_with_prefix_dropped_first_dict(dummies_with_unassigned):
     tm.assert_frame_equal(result, expected)
 
 
+def test_from_dummies_with_prefix_dropped_first_int_and_float(dummies_with_unassigned):
+    expected = DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]})
+    result = from_dummies(
+        dummies_with_unassigned,
+        sep="_",
+        dropped_first={"col2": 1, "col1": 2.5},
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_from_dummies_with_prefix_dropped_first_bool_and_none(dummies_with_unassigned):
+    expected = DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]})
+    result = from_dummies(
+        dummies_with_unassigned,
+        sep="_",
+        dropped_first={"col2": None, "col1": False},
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_from_dummies_with_prefix_dropped_first_dict_not_complete(
     dummies_with_unassigned,
 ):