ENH: Quoting column names containing spaces with backticks to use them in query and eval. by hwalinga · Pull Request #24955 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH: Quoting column names containing spaces with backticks to use them in query and eval. #24955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
10000
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
More clarity in comments; Moved column resolver to class; Use uuid
  • Loading branch information
hwalinga committed Mar 10, 2019
commit 63c25bfe0f0dff747b4975428930dd722f80268c
19 changes: 6 additions & 13 deletions pandas/core/computation/common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import uuid

import numpy as np

from pandas.compat import reduce, string_types
Expand Down Expand Up @@ -25,24 +27,15 @@ def _result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def _clean_column_name_with_spaces(name):
def _remove_spaces_column_name(name):
"""Check if name contains any spaces, if it contains any spaces
the spaces will be removed and an underscore suffix is added."""
if not isinstance(name, string_types) or " " not in name:
return name
return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_")


def _get_column_resolvers(dataFrame):
"""Return the axis resolvers of a dataframe.

Column names with spaces are 'cleaned up' so that they can be referred to
by backtick quoting. See also :func:`_clean_spaces_backtick_quoted_names`
from :mod:`pandas.core.computation`
"""

return {_clean_column_name_with_spaces(k): v for k, v
in dataFrame.iteritems()}
# uuid3 will provide a unique string that can be independently reproduced.
return name.replace(" ", "_") + "_" + \
str(uuid.uuid3(uuid.NAMESPACE_DNS, name)).replace("-", "")


class NameResolutionError(NameError):
Expand Down
14 changes: 9 additions & 5 deletions pandas/core/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING, _clean_column_name_with_spaces)
_BACKTICK_QUOTED_STRING, _remove_spaces_column_name)
from pandas.core.computation.ops import (
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
Expand All @@ -36,6 +36,10 @@ def tokenize_string(source):
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
if tokval == '`':
tokval = " ".join(it.takewhile(
Expand Down Expand Up @@ -117,9 +121,9 @@ def _clean_spaces_backtick_quoted_names(tok):

Backtick quoted strings are indicated by a certain tokval value. If a string
is a backtick quoted token it will be processed by
:func:`_clean_column_name_with_spaces` so that the parser can find this
string when the query is executed. See also :func:`_get_column_resolvers`
used in :meth:`DataFrame.eval`.
:func:`_remove_spaces_column_name` so that the parser can find this
string when the query is executed.
See also :meth:`NDFrame._get_space_character_free_column_resolver`.

Parameters
----------
Expand All @@ -133,7 +137,7 @@ def _clean_spaces_backtick_quoted_names(tok):
"""
toknum, tokval = tok
if toknum == _BACKTICK_QUOTED_STRING:
return tokenize.NAME, _clean_column_name_with_spaces(tokval)
return tokenize.NAME, _remove_spaces_column_name(tokval)
return toknum, tokval


Expand Down
10 changes: 6 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2971,7 +2971,10 @@ def query(self, expr, inplace=False, **kwargs):
.. versionadded:: 0.25.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add an example in the Examples section as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, but don't know what this means:

1 Warnings found:
No extended summary found
Docstring for "pandas.DataFrame.query" correct. :)


You can refer to column names that contain spaces by surrounding
them in backticks like ```a a` + b``.
them in backticks.

For example, if one of your columns is called ``a a`` and you want
to sum it with ``b``, your query should be ```a a` + b``.

inplace : bool
Whether the query should modify the data in place or return
Expand Down Expand Up @@ -3165,10 +3168,9 @@ def eval(self, expr, inplace=False, **kwargs):
resolvers = kwargs.pop('resolvers', None)
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
from pandas.core.computation.common import _get_column_resolvers

index_resolvers = self._get_index_resolvers()
column_resolvers = _get_column_resolvers(self)
column_resolvers = \
self._get_space_character_free_column_resolvers()
resolvers = column_resolvers, index_resolvers
if 'target' not in kwargs:
kwargs['target'] = self
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.computation.common import _remove_spaces_column_name
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import this locally in the function (we have some restrictions on importing from computation).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

from pandas.core.index import (
Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index)
from pandas.core.indexes.datetimes import DatetimeIndex
Expand Down Expand Up @@ -423,6 +424,17 @@ def _get_index_resolvers(self):
d.update(self._get_axis_resolvers(axis_name))
return d

def _get_space_character_free_column_resolvers(self):
"""Return the space character free column resolvers of a dataframe.

Column names with spaces are 'cleaned up' so that they can be referred
to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""

return {_remove_spaces_column_name(k): v for k, v
in self.iteritems()}

@property
def _info_axis(self):
return getattr(self, self._info_axis_name)
Expand Down
0