Backtick quotes are now tokenized. More tests and pytest fixtures

pandas-dev · jreback · Mar 20, 2019 · Jan 26, 2019 · Jan 26, 2019 · Feb 15, 2019
commit bfebb9dc690c2e7a60d12c20c2938eae46dc0953
diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py
@@ -1,10 +1,14 @@
 import numpy as np
 
-from pandas.compat import reduce
+from pandas.compat import reduce, string_types
 
 import pandas as pd
 
 
+# A token value Python's tokenizer probably will never use.
+_BACKTICK_QUOTED_STRING = 100
+
+
 def _ensure_decoded(s):
     """ if we have bytes, decode them to unicode """
     if isinstance(s, (np.bytes_, bytes)):
@@ -22,5 +26,13 @@ def _result_type_many(*arrays_and_dtypes):
         return reduce(np.result_type, arrays_and_dtypes)
 
 
+def clean_column_name_with_spaces(name):
+    """Return non-string or space-free names unchanged; otherwise replace
+    spaces with underscores and prepend ``_BACKTICK_QUOTED_STRING_``."""
+    if not isinstance(name, string_types) or " " not in name:
+        return name
+    return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_")
+
+
 class NameResolutionError(NameError):
     pass
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
@@ -3,6 +3,8 @@
 
 import ast
 from functools import partial
+import itertools as it
+import operator as op
 import tokenize
 
 import numpy as np
@@ -13,6 +15,8 @@
 from pandas import compat
 from pandas.core import common as com
 from pandas.core.base import StringMixin
+from pandas.core.computation.common import (
+    _BACKTICK_QUOTED_STRING, clean_column_name_with_spaces)
 from pandas.core.computation.ops import (
     _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
     UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
@@ -31,7 +35,13 @@ def tokenize_string(source):
         A Python source code string
     """
     line_reader = StringIO(source).readline
-    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
+    token_generator = tokenize.generate_tokens(line_reader)
+    for toknum, tokval, _, _, _ in token_generator:
+        if tokval == '`':
+            tokval = " ".join(it.takewhile(
+                lambda tokval: tokval != '`',
+                map(op.itemgetter(1), token_generator)))
+            toknum = _BACKTICK_QUOTED_STRING
         yield toknum, tokval
 
 
@@ -102,6 +112,30 @@ def _replace_locals(tok):
     return toknum, tokval
 
 
+def _clean_spaces_backtick_quoted_names(tok):
+    """Clean up a column name if surrounded by backticks.
+
+    Backtick quoted strings are indicated by a certain toknum value. If a
+    token is a backtick quoted string it will be processed by
+    :func:`clean_column_name_with_spaces` so that the parser can find this
+    string when the query is executed. See also :meth:`DataFrame.eval`.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == _BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, clean_column_name_with_spaces(tokval)
+    return toknum, tokval
+
+
 def _compose2(f, g):
     """Compose 2 callables"""
     return lambda *args, **kwargs: f(g(*args, **kwargs))
@@ -114,7 +148,8 @@ def _compose(*funcs):
 
 
 def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
-                                 _rewrite_assign)):
+                                 _rewrite_assign,
+                                 _clean_spaces_backtick_quoted_names)):
     """Compose a collection of tokenization functions
 
     Parameters
@@ -711,8 +746,9 @@ def visitor(x, y):
 class PandasExprVisitor(BaseExprVisitor):
 
     def __init__(self, env, engine, parser,
-                 preparser=partial(_preparse, f=_compose(_replace_locals,
-                                                         _replace_booleans))):
+                 preparser=partial(_preparse, f=_compose(
+                     _replace_locals, _replace_booleans,
+                     _clean_spaces_backtick_quoted_names))):
         super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
 
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -36,6 +36,7 @@
                            PY36, raise_with_traceback,
                            string_and_binary_types)
 from pandas.compat.numpy import function as nv
+from pandas.core.computation.common import clean_column_name_with_spaces
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
@@ -3160,7 +3161,13 @@ def eval(self, expr, inplace=False, **kwargs):
         kwargs['level'] = kwargs.pop('level', 0) + 1
         if resolvers is None:
             index_resolvers = self._get_index_resolvers()
-            resolvers = dict(self.iteritems()), index_resolvers
+            # column names with spaces are altered so that they can be referred
+            # to by backtick quoting.
+            # Also see _clean_spaces_backtick_quoted_names from
+            # pandas/core/computation/expr.py
+            column_resolvers = {clean_column_name_with_spaces(k): v
+                                for k, v in self.iteritems()}
+            resolvers = column_resolvers, index_resolvers
         if 'target' not in kwargs:
             kwargs['target'] = self
         kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)

diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
@@ -1034,30 +1034,50 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):
 
 class TestDataFrameQueryBacktickQuoting(object):
 
-    def setup_method(self, method):
-        self.df = DataFrame({'A': [1, 2, 3],
-                             'B B': [3, 2, 1],
-                             'C C': [4, 5, 6]})
-
-    def teardown_method(self, method):
-        del self.df
-
-    def test_single_backtick_variable_query(self):
-        res = self.df.query('1 < `B B`')
-        expect = self.df[1 < self.df['B B']]
+    @pytest.fixture(scope='class')
+    def df(self):
+        yield DataFrame({'A': [1, 2, 3],
+                         'B B': [3, 2, 1],
+                         'C C': [4, 5, 6],
+                         'C_C': [8, 9, 10],
+                         'D_D D': [11, 1, 101]})
+
+    def test_single_backtick_variable_query(self, df):
+        res = df.query('1 < `B B`')
+        expect = df[1 < df['B B']]
         assert_frame_equal(res, expect)
 
-    def test_two_backtick_variables_query(self):
-        res = self.df.query('1 < `B B` and 4 < `C C`')
-        expect = self.df[(1 < self.df['B B']) & (4 < self.df['C C'])]
+    def test_two_backtick_variables_query(self, df):
+        res = df.query('1 < `B B` and 4 < `C C`')
+        expect = df[(1 < df['B B']) & (4 < df['C C'])]
         assert_frame_equal(res, expect)
 
-    def test_single_backtick_variable_expr(self):
-        res = self.df.eval('A + `B B`')
-        expect = self.df['A'] + self.df['B B']
+    def test_single_backtick_variable_expr(self, df):
+        res = df.eval('A + `B B`')
+        expect = df['A'] + df['B B']
+        assert_series_equal(res, expect)
+
+    def test_two_backtick_variables_expr(self, df):
+        res = df.eval('`B B` + `C C`')
+        expect = df['B B'] + df['C C']
+        assert_series_equal(res, expect)
+
+    def test_already_underscore_variable(self, df):
+        res = df.eval('`C_C` + A')
+        expect = df['C_C'] + df['A']
+        assert_series_equal(res, expect)
+
+    def test_same_name_but_underscores(self, df):
+        res = df.eval('C_C + `C C`')
+        expect = df['C_C'] + df['C C']
+        assert_series_equal(res, expect)
+
+    def test_mixed_underscores_and_spaces(self, df):
+        res = df.eval('A + `D_D D`')
+        expect = df['A'] + df['D_D D']
         assert_series_equal(res, expect)
 
-    def test_two_backtick_variables_expr(self):
-        res = self.df.eval('`B B` + `C C`')
-        expect = self.df['B B'] + self.df['C C']
+    def test_backtick_quote_name_with_no_spaces(self, df):
+        res = df.eval('A + `C_C`')  # backticks around a space-free name
+        expect = df['A'] + df['C_C']
         assert_series_equal(res, expect)