Backtick quotes are now tokenized. More tests and pytest fixtures · pandas-dev/pandas@bfebb9d · GitHub
[go: up one dir, main page]

Skip to content

Commit bfebb9d

Browse files
committed
Backtick quotes are now tokenized. More tests and pytest fixtures
1 parent 22686fd commit bfebb9d

File tree

4 files changed

+101
-26
lines changed
  • pandas
    • core
      • computation
        • expr.py
  • tests/frame
  • 4 files changed

    +101
    -26
    lines changed

    pandas/core/computation/common.py

    Lines changed: 13 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -1,10 +1,14 @@
    11
    import numpy as np
    22

    3-
    from pandas.compat import reduce
    3+
    from pandas.compat import reduce, string_types
    44

    55
    import pandas as pd
    66

    77

    8+
    # A token value Python's tokenizer probably will never use.
    9+
    _BACKTICK_QUOTED_STRING = 100
    10+
    11+
    812
    def _ensure_decoded(s):
    913
    """ if we have bytes, decode them to unicode """
    1014
    if isinstance(s, (np.bytes_, bytes)):
    @@ -22,5 +26,13 @@ def _result_type_many(*arrays_and_dtypes):
    2226
    return reduce(np.result_type, arrays_and_dtypes)
    2327

    2428

    29+
    def clean_column_name_with_spaces(name):
    30+
    """Check if name contains any spaces, if it contains any spaces
    31+
    the spaces will be replaced by underscores and a prefix is added."""
    32+
    if not isinstance(name, string_types) or " " not in name:
    33+
    return name
    34+
    return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_")
    35+
    36+
    2537
    class NameResolutionError(NameError):
    2638
    pass

    pandas/core/computation/expr.py

    Lines changed: 40 additions & 4 deletions
    Original file line numberDiff line numberDiff line change
    @@ -3,6 +3,8 @@
    33

    44
    import ast
    55
    from functools import partial
    6+
    import itertools as it
    7+
    import operator as op
    68
    import tokenize
    79

    810
    import numpy as np
    @@ -13,6 +15,8 @@
    1315
    from pandas import compat
    1416
    from pandas.core import common as com
    1517
    from pandas.core.base import StringMixin
    18+
    from pandas.core.computation.common import (
    19+
    _BACKTICK_QUOTED_STRING, clean_column_name_with_spaces)
    1620
    from pandas.core.computation.ops import (
    1721
    _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
    1822
    UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
    @@ -31,7 +35,13 @@ def tokenize_string(source):
    3135
    A Python source code string
    3236
    """
    3337
    line_reader = StringIO(source).readline
    34-
    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
    38+
    token_generator = tokenize.generate_tokens(line_reader)
    39+
    for toknum, tokval, _, _, _ in token_generator:
    40+
    if tokval == '`':
    41+
    tokval = " ".join(it.takewhile(
    42+
    lambda tokval: tokval != '`',
    43+
    map(op.itemgetter(1), token_generator)))
    44+
    toknum = _BACKTICK_QUOTED_STRING
    3545
    yield toknum, tokval
    3646

    3747

    @@ -102,6 +112,30 @@ def _replace_locals(tok):
    102112
    return toknum, tokval
    103113

    104114

    115+
    def _clean_spaces_backtick_quoted_names(tok):
    116+
    """Clean up a column name if surrounded by backticks.
    117+
    118+
    Backtick quoted strings are indicated by a certain toknum value. If a string
    119+
    is a backtick quoted token it will be processed by
    120+
    :func:`clean_column_name_with_spaces` so that the parser can find this
    121+
    string when the query is executed. See also :meth:`DataFrame.eval`.
    122+
    123+
    Parameters
    124+
    ----------
    125+
    tok : tuple of int, str
    126+
    ints correspond to the all caps constants in the tokenize module
    127+
    128+
    Returns
    129+
    -------
    130+
    t : tuple of int, str
    131+
    Either the input token or the replacement values
    132+
    """
    133+
    toknum, tokval = tok
    134+
    if toknum == _BACKTICK_QUOTED_STRING:
    135+
    return tokenize.NAME, clean_column_name_with_spaces(tokval)
    136+
    return toknum, tokval
    137+
    138+
    105139
    def _compose2(f, g):
    106140
    """Compose 2 callables"""
    107141
    return lambda *args, **kwargs: f(g(*args, **kwargs))
    @@ -114,7 +148,8 @@ def _compose(*funcs):
    114148

    115149

    116150
    def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
    117-
    _rewrite_assign)):
    151+
    _rewrite_assign,
    152+
    _clean_spaces_backtick_quoted_names)):
    118153
    """Compose a collection of tokenization functions
    119154
    120155
    Parameters
    @@ -711,8 +746,9 @@ def visitor(x, y):
    711746
    class PandasExprVisitor(BaseExprVisitor):
    712747

    713748
    def __init__(self, env, engine, parser,
    714-
    preparser=partial(_preparse, f=_compose(_replace_locals,
    715-
    _replace_booleans))):
    749+
    preparser=partial(_preparse, f=_compose(
    750+
    _replace_locals, _replace_booleans,
    751+
    _clean_spaces_backtick_quoted_names))):
    716752
    super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
    717753

    718754

    pandas/core/frame.py

    Lines changed: 8 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -36,6 +36,7 @@
    3636
    PY36, raise_with_traceback,
    3737
    string_and_binary_types)
    3838
    from pandas.compat.numpy import function as nv
    39+
    from pandas.core.computation.common import clean_column_name_with_spaces
    3940
    from pandas.core.dtypes.cast import (
    4041
    maybe_upcast,
    4142
    cast_scalar_to_array,
    @@ -3160,7 +3161,13 @@ def eval(self, expr, inplace=False, **kwargs):
    31603161
    kwargs['level'] = kwargs.pop('level', 0) + 1
    31613162
    if resolvers is None:
    31623163
    index_resolvers = self._get_index_resolvers()
    3163-
    resolvers = dict(self.iteritems()), index_resolvers
    3164+
    # column names with spaces are altered so that they can be referred
    3165+
    # to by backtick quoting.
    3166+
    # Also see _clean_spaces_backtick_quoted_names from
    3167+
    # pandas/core/computation/expr.py
    3168+
    column_resolvers = {clean_column_name_with_spaces(k): v
    3169+
    for k, v in self.iteritems()}
    3170+
    resolvers = column_resolvers, index_resolvers
    31643171
    if 'target' not in kwargs:
    31653172
    kwargs['target'] = self
    31663173
    kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)

    pandas/tests/frame/test_query_eval.py

    Lines changed: 40 additions & 20 deletions
    Original file line numberDiff line numberDiff line change
    @@ -1034,30 +1034,50 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):
    10341034

    10351035
    class TestDataFrameQueryBacktickQuoting(object):
    10361036

    1037-
    def setup_method(self, method):
    1038-
    self.df = DataFrame({'A': [1, 2, 3],
    1039-
    'B B': [3, 2, 1],
    1040-
    'C C': [4, 5, 6]})
    1041-
    1042-
    def teardown_method(self, method):
    1043-
    del self.df
    1044-
    1045-
    def test_single_backtick_variable_query(self):
    1046-
    res = self.df.query('1 < `B B`')
    1047-
    expect = self.df[1 < self.df['B B']]
    1037+
    @pytest.fixture(scope='class')
    1038+
    def df(self):
    1039+
    yield DataFrame({'A': [1, 2, 3],
    1040+
    'B B': [3, 2, 1],
    1041+
    'C C': [4, 5, 6],
    1042+
    'C_C': [8, 9, 10],
    1043+
    'D_D D': [11, 1, 101]})
    1044+
    1045+
    def test_single_backtick_variable_query(self, df):
    1046+
    res = df.query('1 < `B B`')
    1047+
    expect = df[1 < df['B B']]
    10481048
    assert_frame_equal(res, expect)
    10491049

    1050-
    def test_two_backtick_variables_query(self):
    1051-
    res = self.df.query('1 < `B B` and 4 < `C C`')
    1052-
    expect = self.df[(1 < self.df['B B']) & (4 < self.df['C C'])]
    1050+
    def test_two_backtick_variables_query(self, df):
    1051+
    res = df.query('1 < `B B` and 4 < `C C`')
    1052+
    expect = df[(1 < df['B B']) & (4 < df['C C'])]
    10531053
    assert_frame_equal(res, expect)
    10541054

    1055-
    def test_single_backtick_variable_expr(self):
    1056-
    res = self.df.eval('A + `B B`')
    1057-
    expect = self.df['A'] + self.df['B B']
    1055+
    def test_single_backtick_variable_expr(self, df):
    1056+
    res = df.eval('A + `B B`')
    1057+
    expect = df['A'] + df['B B']
    1058+
    assert_series_equal(res, expect)
    1059+
    1060+
    def test_two_backtick_variables_expr(self, df):
    1061+
    res = df.eval('`B B` + `C C`')
    1062+
    expect = df['B B'] + df['C C']
    1063+
    assert_series_equal(res, expect)
    1064+
    1065+
    def test_already_underscore_variable(self, df):
    1066+
    res = df.eval('`C_C` + A')
    1067+
    expect = df['C_C'] + df['A']
    1068+
    assert_series_equal(res, expect)
    1069+
    1070+
    def test_same_name_but_underscores(self, df):
    1071+
    res = df.eval('C_C + `C C`')
    1072+
    expect = df['C_C'] + df['C C']
    1073+
    assert_series_equal(res, expect)
    1074+
    1075+
    def test_mixed_underscores_and_spaces(self, df):
    1076+
    res = df.eval('A + `D_D D`')
    1077+
    expect = df['A'] + df['D_D D']
    10581078
    assert_series_equal(res, expect)
    10591079

    1060-
    def test_two_backtick_variables_expr(self):
    1061-
    res = self.df.eval('`B B` + `C C`')
    1062-
    expect = self.df['B B'] + self.df['C C']
    1080+
    def backtick_quote_name_with_no_spaces(self, df):
    1081+
    res = df.eval('A + `C_C`')
    1082+
    expect = df['A'] + df['C_C']
    10631083
    assert_series_equal(res, expect)

    0 commit comments

    Comments
     (0)
    0