ENH: Quoting column names containing spaces with backticks to use them in query and eval. by hwalinga · Pull Request #24955 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH: Quoting column names containing spaces with backticks to use them in query and eval. #24955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Backtick quotes are now tokenized. More tests and pytest fixtures
  • Loading branch information
hwalinga committed Feb 15, 2019
commit bfebb9dc690c2e7a60d12c20c2938eae46dc0953
14 changes: 13 additions & 1 deletion pandas/core/computation/common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import numpy as np

from pandas.compat import reduce
from pandas.compat import reduce, string_types

import pandas as pd


# Sentinel token number assigned to backtick-quoted names while tokenizing an
# expression; such tokens are later rewritten to ``tokenize.NAME`` tokens.
# The value is one that Python's tokenizer probably will never use.
_BACKTICK_QUOTED_STRING = 100


def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
if isinstance(s, (np.bytes_, bytes)):
Expand All @@ -22,5 +26,13 @@ def _result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def clean_column_name_with_spaces(name):
    """Rewrite a column name that contains spaces into a space-free name.

    Parameters
    ----------
    name : object
        A column label. Only string labels containing at least one space
        are rewritten.

    Returns
    -------
    object
        ``name`` itself when it is not a string or contains no space.
        Otherwise a new string in which every space is replaced by an
        underscore and which is prefixed with ``_BACKTICK_QUOTED_STRING_``.
    """
    # Non-string labels are never inspected for spaces (short-circuit).
    if isinstance(name, string_types) and " " in name:
        return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_")
    return name


class NameResolutionError(NameError):
pass
44 changes: 40 additions & 4 deletions pandas/core/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import ast
from functools import partial
import itertools as it
import operator as op
import tokenize

import numpy as np
Expand All @@ -13,6 +15,8 @@
from pandas import compat
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING, clean_column_name_with_spaces)
from pandas.core.computation.ops import (
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
Expand All @@ -31,7 +35,13 @@ def tokenize_string(source):
A Python source code string
"""
line_reader = StringIO(source).readline
for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
token_generator = tokenize.generate_tokens(line_reader)
for toknum, tokval, _, _, _ in token_generator:
if tokval == '`':
tokval = " ".join(it.takewhile(
lambda tokval: tokval != '`',
map(op.itemgetter(1), token_generator)))
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval


Expand Down Expand Up @@ -102,6 +112,30 @@ def _replace_locals(tok):
return toknum, tokval


def _clean_spaces_backtick_quoted_names(tok):
    """Turn a backtick-quoted token into an ordinary name token.

    Backtick-quoted strings are marked by the token number
    ``_BACKTICK_QUOTED_STRING``. The value of such a token is passed through
    :func:`clean_column_name_with_spaces` and the token is re-tagged as
    ``tokenize.NAME``. Every other token is returned unchanged.

    Parameters
    ----------
    tok : tuple of int, str
        ``(toknum, tokval)`` pair; ints correspond to the all caps constants
        in the tokenize module (or to ``_BACKTICK_QUOTED_STRING``).

    Returns
    -------
    t : tuple of int, str
        Either the input token or the replacement ``(tokenize.NAME, name)``
        pair.
    """
    toknum, tokval = tok
    if toknum != _BACKTICK_QUOTED_STRING:
        return toknum, tokval
    return tokenize.NAME, clean_column_name_with_spaces(tokval)


def _compose2(f, g):
    """Compose 2 callables.

    Parameters
    ----------
    f : callable
        Outer callable; receives the result of ``g`` as its only argument.
    g : callable
        Inner callable; receives all positional and keyword arguments.

    Returns
    -------
    callable
        A callable computing ``f(g(*args, **kwargs))``.
    """
    return lambda *args, **kwargs: f(g(*args, **kwargs))
Expand All @@ -114,7 +148,8 @@ def _compose(*funcs):


def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
_rewrite_assign)):
_rewrite_assign,
_clean_spaces_backtick_quoted_names)):
"""Compose a collection of tokenization functions

Parameters
Expand Down Expand Up @@ -711,8 +746,9 @@ def visitor(x, y):
class PandasExprVisitor(BaseExprVisitor):

def __init__(self, env, engine, parser,
preparser=partial(_preparse, f=_compose(_replace_locals,
_replace_booleans))):
preparser=partial(_preparse, f=_compose(
_replace_locals, _replace_booleans,
_clean_spaces_backtick_quoted_names))):
super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)


Expand Down
9 changes: 8 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
PY36, raise_with_traceback,
string_and_binary_types)
from pandas.compat.numpy import function as nv
from pandas.core.computation.common import clean_column_name_with_spaces
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
Expand Down Expand Up @@ -3160,7 +3161,13 @@ def eval(self, expr, inplace=False, **kwargs):
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
index_resolvers = self._get_index_resolvers()
resolvers = dict(self.iteritems()), index_resolvers
# column names with spaces are altered so that they can be referred
# to by backtick quoting.
# Also see _clean_spaces_backtick_quoted_names from
# pandas/core/computation/expr.py
column_resolvers = {clean_column_name_with_spaces(k): v
for k, v in self.iteritems()}
resolvers = column_resolvers, index_resolvers
if 'target' not in kwargs:
kwargs['target'] = self
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
Expand Down
60 changes: 40 additions & 20 deletions pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,30 +1034,50 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):

class TestDataFrameQueryBacktickQuoting(object):

def setup_method(self, method):
self.df = DataFrame({'A': [1, 2, 3],
'B B': [3, 2, 1],
'C C': [4, 5, 6]})

def teardown_method(self, method):
del self.df

def test_single_backtick_variable_query(self):
res = self.df.query('1 < `B B`')
expect = self.df[1 < self.df['B B']]
@pytest.fixture(scope='class')
def df(self):
yield DataFrame({'A': [1, 2, 3],
'B B': [3, 2, 1],
'C C': [4, 5, 6],
'C_C': [8, 9, 10],
'D_D D': [11, 1, 101]})

def test_single_backtick_variable_query(self, df):
res = df.query('1 < `B B`')
expect = df[1 < df['B B']]
assert_frame_equal(res, expect)

def test_two_backtick_variables_query(self):
res = self.df.query('1 < `B B` and 4 < `C C`')
expect = self.df[(1 < self.df['B B']) & (4 < self.df['C C'])]
def test_two_backtick_variables_query(self, df):
res = df.query('1 < `B B` and 4 < `C C`')
expect = df[(1 < df['B B']) & (4 < df['C C'])]
assert_frame_equal(res, expect)

def test_single_backtick_variable_expr(self):
res = self.df.eval('A + `B B`')
expect = self.df['A'] + self.df['B B']
def test_single_backtick_variable_expr(self, df):
res = df.eval('A + `B B`')
expect = df['A'] + df['B B']
assert_series_equal(res, expect)

def test_two_backtick_variables_expr(self, df):
res = df.eval('`B B` + `C C`')
expect = df['B B'] + df['C C']
assert_series_equal(res, expect)

def test_already_underscore_variable(self, df):
res = df.eval('`C_C` + A')
expect = df['C_C'] + df['A']
assert_series_equal(res, expect)

def test_same_name_but_underscores(self, df):
res = df.eval('C_C + `C C`')
expect = df['C_C'] + df['C C']
assert_series_equal(res, expect)

def test_mixed_underscores_and_spaces(self, df):
res = df.eval('A + `D_D D`')
expect = df['A'] + df['D_D D']
assert_series_equal(res, expect)

def test_two_backtick_variables_expr(self):
res = self.df.eval('`B B` + `C C`')
expect = self.df['B B'] + self.df['C C']
def test_backtick_quote_name_with_no_spaces(self, df):
    # Renamed with the ``test_`` prefix: without it pytest does not collect
    # the method, so this check never ran.
    # Backtick-quoting a column name that has no spaces must resolve to the
    # column of that exact name.
    res = df.eval('A + `C_C`')
    expect = df['A'] + df['C_C']
    assert_series_equal(res, expect)
0