BUG: `query` on columns with characters like # in its name by aram-cedarwood · Pull Request #59296 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

BUG: query on columns with characters like # in its name #59296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ad0e9b8
escape unescape sharp, single quote, double quote
aram-cedarwood Jul 21, 2024
41fb2c8
parametrize and add tests
aram-cedarwood Jul 21, 2024
c756fc3
reinstate text in docs, shorten some lines
aram-cedarwood Jul 21, 2024
d4707b6
update whatsnew
aram-cedarwood Jul 21, 2024
ba50d91
minor: double space to single space
aram-cedarwood Jul 21, 2024
aaffbba
move to parsing.py, split better, add tests
aram-cedarwood Jul 27, 2024
0c75550
clean up
aram-cedarwood Jul 27, 2024
90c5dbc
remove old comment
aram-cedarwood Jul 27, 2024
c0ee651
test names
aram-cedarwood Jul 27, 2024
b7dc1a8
minor test change
aram-cedarwood Jul 27, 2024
164e3c5
improve splitting
aram-cedarwood Jul 27, 2024
4040370
fix splitting
aram-cedarwood Jul 27, 2024
148d1ed
improve splitting
aram-cedarwood Jul 27, 2024
990d0d3
add tests
aram-cedarwood Jul 27, 2024
e674eb8
edit docstring and comments
aram-cedarwood Jul 28, 2024
6a0ac72
minor test change
aram-cedarwood Jul 28, 2024
f2126b3
escape backticks
aram-cedarwood Jul 28, 2024
168f56c
escape backticks properly
aram-cedarwood Jul 29, 2024
810c82b
comment
aram-cedarwood Jul 30, 2024
86947b2
fix tests
aram-cedarwood Jul 30, 2024
e99db1c
GH 49633: special characters
aram-cedarwood Aug 6, 2024
a005f13
add noqa
aram-cedarwood Aug 6, 2024
a77a215
update docstring,
aram-cedarwood Aug 6, 2024
daf2c37
unmatched backtick or quote can raise SyntaxError OR TokenError
aram-cedarwood Aug 6, 2024
984431b
change splitting
aram-cedarwood Aug 7, 2024
b0833c0
remove repeated
aram-cedarwood Aug 7, 2024
5e0631d
collect chars in a list
aram-cedarwood Aug 9, 2024
d3669c7
add issue 49633 to whatsnew
aram-cedarwood Aug 9, 2024
87ded7c
atone for my typing sins :)
aram-cedarwood Aug 10, 2024
ad18c87
exclude test_query_eval.py for rst-inline-touching-normal in .pre-com…
aram-cedarwood Aug 10, 2024
b16d8c1
Merge branch 'main' into query_column_with_sharp_sign_in_name
aram-cedarwood Aug 10, 2024
173f399
tests: add decorators for Future Infer Strings job
aram-cedarwood Aug 12, 2024
9ee2231
pre-commit exclude
aram-cedarwood Aug 13, 2024
334ee63
Merge branch 'main' into query_column_with_sharp_sign_in_name
aram-cedarwood Aug 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
escape unescape sharp, single quote, double quote
  • Loading branch information
aram-cedarwood committed Aug 9, 2024
commit ad0e9b8b5968f2c06b12569067395e3fbed5ed0d
41 changes: 32 additions & 9 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
cast,
overload,
)
import urllib.parse
import warnings

import numpy as np
Expand Down Expand Up @@ -4559,14 +4560,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
For other characters that fall outside the ASCII range (U+0001..U+007F)
and those that are not further specified in PEP 3131,
the query parser will raise an error.
This excludes whitespace different than the space character,
but also the hashtag (as it is used for comments) and the backtick
itself (backtick can also not be escaped).

In a special case, quotes that make a pair around a backtick can
confuse the parser.
For example, ```it's` > `that's``` will raise an error,
as it forms a quoted string (``'s > `that'``) with a backtick inside.
This excludes whitespace different than the space character
and the backtick itself (backtick cannot be escaped).

See also the `Python documentation about lexical analysis
<https://docs.python.org/3/reference/lexical_analysis.html>`__
Expand Down Expand Up @@ -4620,7 +4615,35 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
raise ValueError(msg)
kwargs["level"] = kwargs.pop("level", 0) + 1
kwargs["target"] = None
res = self.eval(expr, **kwargs)

# GH 59285
if any(("#" in col) or ("'" in col) or ('"' in col) for col in self.columns):
# Create a copy of `self` with column names escaped
escaped_self = self.copy()
escaped_self.columns = [
urllib.parse.quote(col) for col in escaped_self.columns
]

# In expr, escape column names between backticks
column_name_to_escaped_name = {
col: urllib.parse.quote(col) for col in self.columns
}
escaped_expr = "`".join(
(column_name_to_escaped_name.get(token, token) if (i % 2) else token)
for i, token in enumerate(expr.split("`"))
)

# eval
escaped_res = escaped_self.eval(escaped_expr, **kwargs)

# If `res` is a Series or DataFrame, unescape names
res = escaped_res.copy()
if isinstance(res, Series) and res.name:
res.name = urllib.parse.unquote(res.name)
elif isinstance(res, DataFrame):
res.columns = [urllib.parse.unquote(col) for col in res.columns]
else:
res = self.eval(expr, **kwargs)

try:
result = self.loc[res]
Expand Down
58 changes: 58 additions & 0 deletions pandas/tests/computation/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1978,6 +1978,64 @@ def test_eval_no_support_column_name(request, column):
tm.assert_frame_equal(result, expected)


def test_query_on_column_name_with_hashtag_character():
# GH 59285
df = DataFrame((1, 2, 3), columns=["a#"])
result = df.query("`a#` < 2")
expected = df[df["a#"] < 2]
tm.assert_frame_equal(result, expected)


def test_query_on_expr_with_comment():
# GH 59285
df = DataFrame((1, 2, 3), columns=["a#"])
result = df.query("`a#` < 2 # This is a comment")
expected = df[df["a#"] < 2]
tm.assert_frame_equal(result, expected)


def test_query_on_column_names_with_single_quote_character():
df = DataFrame(
[
{"it's": 1, "that's": 2},
{"it's": 3, "that's": 4},
{"it's": -1, "that's": -2},
{"it's": -3, "that's": -4},
]
)
result = df.query("`it's` < `that's`")
expected = df[df["it's"] < df["that's"]]
tm.assert_frame_equal(result, expected)


def test_query_on_column_names_with_double_quote_character():
df = DataFrame(
[
{'it"s': 1, 'that"s': 2},
{'it"s': 3, 'that"s': 4},
{'it"s': -1, 'that"s': -2},
{'it"s': -3, 'that"s': -4},
]
)
result = df.query('`it"s` < `that"s`')
expected = df[df['it"s'] < df['that"s']]
tm.assert_frame_equal(result, expected)


def test_query_on_column_names_with_single_quote_and_double_quote_character():
df = DataFrame(
[
{"it's": 1, 'that\'s "nice"': 2},
{"it's": 3, 'that\'s "nice"': 4},
{"it's": -1, 'that\'s "nice"': -2},
{"it's": -3, 'that\'s "nice"': -4},
]
)
result = df.query("`it's` < `that's \"nice\"`")
expected = df[df["it's"] < df['that\'s "nice"']]
tm.assert_frame_equal(result, expected)


def test_set_inplace():
# https://github.com/pandas-dev/pandas/issues/47449
# Ensure we don't only update the DataFrame inplace, but also the actual
Expand Down
0