-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
Initial draft: from_dummies #41902
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initial draft: from_dummies #41902
Changes from 1 commit
f3e6afe
c7c5588
d06540f
1fa4e8a
c7f8ec8
3cc98ca
0e131c6
9f74dc7
442b340
38cf04d
8eccfab
fd027c5
106ff3c
2019228
be39c05
d406227
61a25e0
1d104f8
5bcfbb4
ca6200e
bf17cdb
92b5dae
c2cd747
dc50464
4d9cfd0
82d6743
153202d
d3dd9f7
e6ec175
ee6025d
4e741c8
1b4a8e9
90177be
d58c668
46457fa
131f42b
1af65ac
6dacf53
61edd30
04f360c
7ff2f3b
56ea182
39a0199
e05fe3f
23f6c07
7190879
012a1dd
52ed909
d8e4743
0cf35d8
b9303bc
3207534
8089fe5
55ad274
1b17815
00c7b05
07ba536
bbe41d0
329394b
b83ac6a
1f5e1dc
8a3421b
16cdaa0
174df1f
e45d3f8
e83faed
1e12e6a
24e9899
c8e7a7d
0ac8fff
6af6cad
54fdcbd
ced3ed0
6db7744
c84d973
842d335
8f91012
84d5bd8
fd0f985
6230d0f
84a60f7
c78ef2a
52a9dea
bc658ba
9fbca72
2581fc9
85a0ed8
5b74039
015ee94
66c0292
30b8ff1
b261656
555825b
9d6e571
9f1bb8e
dc52985
e7d6828
ae9f3d2
a59ed4e
66c7a64
76221f8
7fa66b3
536f9c5
530889e
6536c65
1272a23
fd3b115
bd5a118
f7d08d0
c32e514
0fda02f
62b09ae
1dcdd9a
3c00690
4425b4a
dc144f7
15503b0
61a348b
f06a45c
f3a0f83
23c133f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1100,7 +1100,7 @@ def get_empty_frame(data) -> DataFrame: | |
def from_dummies( | ||
data: DataFrame, | ||
sep: None | str = None, | ||
implied_value: None | Hashable | dict[str, Hashable] = None, | ||
implied_category: None | Hashable | dict[str, Hashable] = None, | ||
) -> DataFrame: | ||
""" | ||
Create a categorical `DataFrame` from a `DataFrame` of dummy variables. | ||
|
@@ -1116,10 +1116,10 @@ def from_dummies( | |
character indicating the separation of the categorical names from the prefixes. | ||
For example, if your column names are 'prefix_A' and 'prefix_B', | ||
you can strip the underscore by specifying sep='_'. | ||
implied_value : None, Hashable or dict of Hashables, default None | ||
The implied value the dummy takes when all values are zero. | ||
implied_category : None, Hashable or dict of Hashables, default None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a non-obvious name. What has been considered here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's meant to be the inverse of […]. Not sure what a clearer name could be - […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought about using […]. To remove the ambiguity of what implies the […] |
||
The implied category the dummy takes when all values are zero. | ||
Can be a single value for all variables or a dict directly mapping the | ||
implied values to a prefix of a variable. | ||
implied categories to a prefix of a variable. | ||
|
||
Returns | ||
------- | ||
|
@@ -1160,7 +1160,7 @@ def from_dummies( | |
... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0], | ||
... "col2_c": [0, 0, 0]}) | ||
|
||
>>> pd.from_dummies(df, sep="_", implied_value={"col1": "d", "col2": "e"}) | ||
>>> pd.from_dummies(df, sep="_", implied_category={"col1": "d", "col2": "e"}) | ||
col1 col2 | ||
0 a b | ||
1 b a | ||
|
@@ -1199,7 +1199,7 @@ def from_dummies( | |
f"Received 'sep' of type: {type(sep).__name__}" | ||
) | ||
|
||
# validate number of implied_value | ||
# validate number of implied_category | ||
def check_len(item, name) -> None: | ||
if not len(item) == len(variables_slice): | ||
len_msg = ( | ||
|
@@ -1209,18 +1209,19 @@ def check_len(item, name) -> None: | |
) | ||
raise ValueError(len_msg) | ||
|
||
if implied_value: | ||
if isinstance(implied_value, dict): | ||
check_len(implied_value, "implied_value") | ||
elif isinstance(implied_value, Hashable): | ||
implied_value = dict( | ||
zip(variables_slice, [implied_value] * len(variables_slice)) | ||
if implied_category: | ||
if isinstance(implied_category, dict): | ||
check_len(implied_category, "implied_category") | ||
elif isinstance(implied_category, Hashable): | ||
implied_category = dict( | ||
zip(variables_slice, [implied_category] * len(variables_slice)) | ||
) | ||
else: | ||
raise TypeError( | ||
f"Expected 'implied_value' to be of type " | ||
f"Expected 'implied_category' to be of type " | ||
f"'None', 'Hashable', or 'dict'; " | ||
f"Received 'implied_value' of type: {type(implied_value).__name__}" | ||
f"Received 'implied_category' of type: " | ||
f"{type(implied_category).__name__}" | ||
) | ||
|
||
cat_data = {} | ||
|
@@ -1238,8 +1239,8 @@ def check_len(item, name) -> None: | |
f"First instance in row: {assigned.argmax()}" | ||
pckSF marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
elif any(assigned == 0): | ||
if isinstance(implied_value, dict): | ||
cats.append(implied_value[prefix]) | ||
if isinstance(implied_category, dict): | ||
cats.append(implied_category[prefix]) | ||
else: | ||
raise ValueError( | ||
f"Dummy DataFrame contains unassigned value(s); " | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we should consider moving get_dummies / from_dummies to a separate file (in /reshape), could be a precursor PR.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like that idea to improve clarity. What would be an elegant and obvious name for a collection of "reshape operations that change the data representation" - maybe `transform`? Or would we rather collect more categorical/dummy-specific operations instead? For me the first option seems more intuitive. I will think about a name - `/reshape/transform.py` could cause confusion with the `.transform` method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`one_hot_encoding.py`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
or if it's supposed to be a dummy operations file: `dummy_coding.py`?