8000 fix: Improve escaping of literals and identifiers by TrevorBergeron · Pull Request #682 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
Merged
14 changes: 7 additions & 7 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2096,7 +2096,7 @@ def _get_rows_as_json_values(self) -> Block:
)

column_names.append(serialized_column_name)
column_names_csv = sql.csv(column_names, quoted=True)
column_names_csv = sql.csv(map(sql.simple_literal, column_names))

# index columns count
index_columns_count = len(self.index_columns)
Expand All @@ -2108,22 +2108,22 @@ def _get_rows_as_json_values(self) -> Block:

# types of the columns to serialize for the row
column_types = list(self.index.dtypes) + list(self.dtypes)
column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True)
column_types_csv = sql.csv(
[sql.simple_literal(str(typ)) for typ in column_types]
)

# row dtype to use for deserializing the row as pandas series
pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types)
if pandas_row_dtype is None:
pandas_row_dtype = "object"
pandas_row_dtype = sql.quote(str(pandas_row_dtype))
pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype))

# create a json column representing row through SQL manipulation
row_json_column_name = guid.generate_guid()
select_columns = (
[ordering_column_name] + list(self.index_columns) + [row_json_column_name]
)
select_columns_csv = sql.csv(
[sql.column_reference(col) for col in select_columns]
)
select_columns_csv = sql.csv([sql.identifier(col) for col in select_columns])
json_sql = f"""\
With T0 AS (
{textwrap.indent(expr_sql, " ")}
Expand All @@ -2136,7 +2136,7 @@ def _get_rows_as_json_values(self) -> Block:
"values", [{column_references_csv}],
"indexlength", {index_columns_count},
"dtype", {pandas_row_dtype}
) AS {row_json_column_name} FROM T0
) AS {sql.identifier(row_json_column_name)} FROM T0
)
SELECT {select_columns_csv} FROM T1
"""
Expand Down
28 changes: 5 additions & 23 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
import abc
import functools
import itertools
import textwrap
import typing
from typing import Collection, Iterable, Literal, Optional, Sequence
from typing import Collection, Literal, Optional, Sequence

import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
Expand All @@ -40,6 +39,7 @@
OrderingExpression,
)
import bigframes.core.schema as schemata
import bigframes.core.sql
from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec
import bigframes.dtypes
import bigframes.operations.aggregations as agg_ops
Expand Down Expand Up @@ -821,15 +821,13 @@ def to_sql(
)
)
output_columns = [
col_id_overrides.get(col) if (col in col_id_overrides) else col
for col in baked_ir.column_ids
col_id_overrides.get(col, col) for col in baked_ir.column_ids
]
selection = ", ".join(map(lambda col_id: f"`{col_id}`", output_columns))
sql = bigframes.core.sql.select_from(output_columns, sql)

sql = textwrap.dedent(f"SELECT {selection}\n" "FROM (\n" f"{sql}\n" ")\n")
# Single row frames may not have any ordering columns
if len(baked_ir._ordering.all_ordering_columns) > 0:
order_by_clause = baked_ir._ordering_clause(
order_by_clause = bigframes.core.sql.ordering_clause(
baked_ir._ordering.all_ordering_columns
)
sql += f"{order_by_clause}\n"
Expand All @@ -843,22 +841,6 @@ def to_sql(
)
return typing.cast(str, sql)

def _ordering_clause(self, ordering: Iterable[OrderingExpression]) -> str:
parts = []
for col_ref in ordering:
asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC"
null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST"
ordering_expr = col_ref.scalar_expression
# We don't know how to compile scalar expressions in isolation
if ordering_expr.is_const:
# Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here.
continue
if not isinstance(ordering_expr, ex.UnboundVariableExpression):
raise ValueError("Expected direct column reference.")
part = f"`{ordering_expr.id}` {asc_desc} {null_clause}"
parts.append(part)
return f"ORDER BY {' ,'.join(parts)}"

def _to_ibis_expr(
self,
*,
Expand Down
155 changes: 129 additions & 26 deletions bigframes/core/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,49 +11,152 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

"""
Utility functions for SQL construction.
"""

from typing import Iterable
import datetime
import math
import textwrap
from typing import Iterable, TYPE_CHECKING

# Literals and identifiers matching this pattern can be unquoted
unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$"

def quote(value: str):
"""Return quoted input string."""

# Let's use repr which also escapes any special characters
#
# >>> for val in [
# ... "123",
# ... "str with no special chars",
# ... "str with special chars.,'\"/\\"
# ... ]:
# ... print(f"{val} -> {repr(val)}")
# ...
# 123 -> '123'
# str with no special chars -> 'str with no special chars'
# str with special chars.,'"/\ -> 'str with special chars.,\'"/\\'
if TYPE_CHECKING:
import google.cloud.bigquery as bigquery

return repr(value)
import bigframes.core.ordering


def column_reference(column_name: str):
### Writing SQL Values (literals, column references, table references, etc.)
def simple_literal(value: str | int | bool | float):
"""Return quoted input string."""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
if isinstance(value, str):
# Single quoting seems to work nicer with ibis than double quoting
return f"'{escape_special_characters(value)}'"
elif isinstance(value, (bool, int)):
return str(value)
elif isinstance(value, float):
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals
if math.isnan(value):
return 'CAST("nan" as FLOAT)'
if value == math.inf:
return 'CAST("+inf" as FLOAT)'
if value == -math.inf:
return 'CAST("-inf" as FLOAT)'
return str(value)
else:
raise ValueError(f"Cannot produce literal for {value}")


def multi_literal(*values: str):
literal_strings = [simple_literal(i) for i in values]
return "(" + ", ".join(literal_strings) + ")"


def identifier(id: str) -> str:
"""Return a string representing column reference in a SQL."""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
# Just always escape, otherwise need to check against every reserved sql keyword
return f"`{escape_special_characters(id)}`"


def escape_special_characters(value: str):
"""Escapes all special charactesrs"""
# https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals
trans_table = str.maketrans(
{
"\a": r"\a",
"\b": r"\b",
"\f": r"\f",
"\n": r"\n",
"\r": r"\r",
"\t": r"\t",
"\v": r"\v",
"\\": r"\\",
"?": r"\?",
'"': r"\"",
"'": r"\'",
"`": r"\`",
}
)
return value.translate(trans_table)


def cast_as_string(column_name: str) -> str:
"""Return a string representing string casting of a column."""

return f"`{column_name}`"
return f"CAST({identifier(column_name)} AS STRING)"


def cast_as_string(column_name: str):
"""Return a string representing string casting of a column."""
def csv(values: Iterable[str]) -> str:
"""Return a string of comma separated values."""
return ", ".join(values)

return f"CAST({column_reference(column_name)} AS STRING)"

def table_reference(table_ref: bigquery.TableReference) -> str:
return f"`{escape_special_characters(table_ref.project)}`.`{escape_special_characters(table_ref.dataset_id)}`.`{escape_special_characters(table_ref.table_id)}`"

def csv(values: Iterable[str], quoted=False):
"""Return a string of comma separated values."""

if quoted:
values = [quote(val) for val in values]
def infix_op(opname: str, left_arg: str, right_arg: str):
# Maybe should add parentheses??
return f"{left_arg} {opname} {right_arg}"

return ", ".join(values)

### Writing SELECT expressions
def select_from(columns: Iterable[str], subquery: str, distinct: bool = False):
selection = ", ".join(map(identifier, columns))
distinct_clause = "DISTINCT " if distinct else ""

return textwrap.dedent(
f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{subquery}\n" ")\n"
)


def select_table(table_ref: bigquery.TableReference):
return textwrap.dedent(f"SELECT * FROM {table_reference(table_ref)}")


def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str:
is_unique_sql = f"""WITH full_table AS (
{select_from(columns, table_sql)}
),
distinct_table AS (
{select_from(columns, table_sql, distinct=True)}
)

SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`,
(SELECT COUNT(*) FROM distinct_table) AS `distinct_count`
"""
return is_unique_sql


def ordering_clause(
ordering: Iterable[bigframes.core.ordering.OrderingExpression],
) -> str:
import bigframes.core.expression

parts = []
for col_ref in ordering:
asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC"
null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST"
ordering_expr = col_ref.scalar_expression
# We don't know how to compile scalar expressions in isolation
if ordering_expr.is_const:
# Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here.
continue
assert isinstance(
ordering_expr, bigframes.core.expression.UnboundVariableExpression
)
part = f"`{ordering_expr.id}` {asc_desc} {null_clause}"
parts.append(part)
return f"ORDER BY {' ,'.join(parts)}"


def snapshot_clause(time_travel_timestamp: datetime.datetime):
return f"FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())})"
38 changes: 25 additions & 13 deletions bigframes/session/_io/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

import bigframes
from bigframes.core import log_adapter
import bigframes.core.sql
import bigframes.formatting_helpers as formatting_helpers

IO_ORDERING_ID = "bqdf_row_nums"
Expand Down Expand Up @@ -353,7 +354,7 @@ def to_query(
else:
select_clause = "SELECT *"

where_clause = ""
filter_string = ""
if filters:
valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = {
"in": "IN",
Expand All @@ -373,12 +374,11 @@ def to_query(
):
filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters])

or_expressions = []
for group in filters:
if not isinstance(group, Iterable):
group = [group]

and_expressions = []
and_expression = ""
for filter_item in group:
if not isinstance(filter_item, tuple) or (len(filter_item) != 3):
raise ValueError(
Expand All @@ -397,17 +397,29 @@ def to_query(

operator_str = valid_operators[operator]

column_ref = bigframes.core.sql.identifier(column)
if operator_str in ["IN", "NOT IN"]:
value_list = ", ".join([repr(v) for v in value])
expression = f"`{column}` {operator_str} ({value_list})"
value_literal = bigframes.core.sql.multi_literal(*value)
else:
expression = f"`{column}` {operator_str} {repr(value)}"
and_expressions.append(expression)

or_expressions.append(" AND ".join(and_expressions))
value_literal = bigframes.core.sql.simple_literal(value)
expression = bigframes.core.sql.infix_op(
operator_str, column_ref, value_literal
)
if and_expression:
and_expression = bigframes.core.sql.infix_op(
"AND", and_expression, expression
)
else:
and_expression = expression

if or_expressions:
where_clause = " WHERE " + " OR ".join(or_expressions)
if filter_string:
filter_string = bigframes.core.sql.infix_op(
"OR", filter_string, and_expression
)
else:
filter_string = and_expression

full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}"
return full_query
if filter_string:
return f"{select_clause} FROM {sub_query} AS sub WHERE {filter_string}"
else:
return f"{select_clause} FROM {sub_query} AS sub"
Loading
0