diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py
index 03d1a0a2de..93f072973c 100644
--- a/bigframes/core/compile/sqlglot/compiler.py
+++ b/bigframes/core/compile/sqlglot/compiler.py
@@ -261,6 +261,12 @@ def compile_explode(
         columns = tuple(ref.id.sql for ref in node.column_ids)
         return child.explode(columns, offsets_col)
 
+    @_compile_node.register
+    def compile_random_sample(
+        self, node: nodes.RandomSampleNode, child: ir.SQLGlotIR
+    ) -> ir.SQLGlotIR:
+        return child.sample(node.fraction)
+
 
 def _replace_unsupported_ops(node: nodes.BigFrameNode):
     node = nodes.bottom_up(node, rewrite.rewrite_slice)
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
index d5902fa6fc..c0bed4090c 100644
--- a/bigframes/core/compile/sqlglot/sqlglot_ir.py
+++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -25,7 +25,7 @@
 import sqlglot.expressions as sge
 
 from bigframes import dtypes
-from bigframes.core import guid
+from bigframes.core import guid, utils
 from bigframes.core.compile.sqlglot.expressions import typed_expr
 import bigframes.core.compile.sqlglot.sqlglot_types as sgt
 import bigframes.core.local_data as local_data
@@ -71,7 +71,10 @@ def from_pyarrow(
         schema: bf_schema.ArraySchema,
         uid_gen: guid.SequentialUIDGenerator,
     ) -> SQLGlotIR:
-        """Builds SQLGlot expression from pyarrow table."""
+        """Builds a SQLGlot expression from a pyarrow table.
+
+        This is used to represent in-memory data as a SQL query.
+        """
         dtype_expr = sge.DataType(
             this=sge.DataType.Type.STRUCT,
             expressions=[
@@ -117,6 +120,16 @@ def from_table(
         alias_names: typing.Sequence[str],
         uid_gen: guid.SequentialUIDGenerator,
     ) -> SQLGlotIR:
+        """Builds a SQLGlotIR expression from a BigQuery table.
+
+        Args:
+            project_id (str): The project ID of the BigQuery table.
+            dataset_id (str): The dataset ID of the BigQuery table.
+            table_id (str): The table ID of the BigQuery table.
+            col_names (typing.Sequence[str]): The names of the columns to select.
+            alias_names (typing.Sequence[str]): The aliases for the selected columns.
+            uid_gen (guid.SequentialUIDGenerator): A generator for unique identifiers.
+        """
         selections = [
             sge.Alias(
                 this=sge.to_identifier(col_name, quoted=cls.quoted),
@@ -137,7 +150,7 @@ def from_query_string(
         cls,
         query_string: str,
     ) -> SQLGlotIR:
-        """Builds SQLGlot expression from a query string"""
+        """Builds a SQLGlot expression from a query string."""
         uid_gen: guid.SequentialUIDGenerator = guid.SequentialUIDGenerator()
         cte_name = sge.to_identifier(
             next(uid_gen.get_uid_stream("bfcte_")), quoted=cls.quoted
@@ -157,7 +170,7 @@ def from_union(
         output_ids: typing.Sequence[str],
         uid_gen: guid.SequentialUIDGenerator,
     ) -> SQLGlotIR:
-        """Builds SQLGlot expression by union of multiple select expressions."""
+        """Builds a SQLGlot expression by unioning multiple select expressions."""
         assert (
             len(list(selects)) >= 2
         ), f"At least two select expressions must be provided, but got {selects}."
@@ -205,6 +218,7 @@ def select(
         self,
         selected_cols: tuple[tuple[str, sge.Expression], ...],
     ) -> SQLGlotIR:
+        """Replaces the current SELECT clause with the given selected columns."""
         selections = [
             sge.Alias(
                 this=expr,
@@ -213,15 +227,41 @@
             for id, expr in selected_cols
         ]
 
-        new_expr, _ = self._encapsulate_as_cte()
+        new_expr = _select_to_cte(
+            self.expr,
+            sge.to_identifier(
+                next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+            ),
+        )
         new_expr = new_expr.select(*selections, append=False)
         return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
 
+    def project(
+        self,
+        projected_cols: tuple[tuple[str, sge.Expression], ...],
+    ) -> SQLGlotIR:
+        """Adds new columns to the SELECT clause."""
+        projected_cols_expr = [
+            sge.Alias(
+                this=expr,
+                alias=sge.to_identifier(id, quoted=self.quoted),
+            )
+            for id, expr in projected_cols
+        ]
+        new_expr = _select_to_cte(
+            self.expr,
+            sge.to_identifier(
+                next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+            ),
+        )
+        new_expr = new_expr.select(*projected_cols_expr, append=True)
+        return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
+
     def order_by(
         self,
         ordering: tuple[sge.Ordered, ...],
     ) -> SQLGlotIR:
-        """Adds ORDER BY clause to the query."""
+        """Adds an ORDER BY clause to the query."""
         if len(ordering) == 0:
             return SQLGlotIR(expr=self.expr.copy(), uid_gen=self.uid_gen)
         new_expr = self.expr.order_by(*ordering)
@@ -231,34 +271,24 @@ def limit(
         self,
         limit: int | None,
     ) -> SQLGlotIR:
-        """Adds LIMIT clause to the query."""
+        """Adds a LIMIT clause to the query."""
         if limit is not None:
             new_expr = self.expr.limit(limit)
         else:
             new_expr = self.expr.copy()
         return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
 
-    def project(
-        self,
-        projected_cols: tuple[tuple[str, sge.Expression], ...],
-    ) -> SQLGlotIR:
-        projected_cols_expr = [
-            sge.Alias(
-                this=expr,
-                alias=sge.to_identifier(id, quoted=self.quoted),
-            )
-            for id, expr in projected_cols
-        ]
-        new_expr, _ = self._encapsulate_as_cte()
-        new_expr = new_expr.select(*projected_cols_expr, append=True)
-        return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
-
     def filter(
         self,
         condition: sge.Expression,
     ) -> SQLGlotIR:
-        """Filters the query with the given condition."""
-        new_expr, _ = self._encapsulate_as_cte()
+        """Filters the query by adding a WHERE clause."""
+        new_expr = _select_to_cte(
+            self.expr,
+            sge.to_identifier(
+                next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+            ),
+        )
         return SQLGlotIR(
             expr=new_expr.where(condition, append=False), uid_gen=self.uid_gen
         )
@@ -272,8 +302,15 @@ def join(
         joins_nulls: bool = True,
     ) -> SQLGlotIR:
         """Joins the current query with another SQLGlotIR instance."""
-        left_select, left_table = self._encapsulate_as_cte()
-        right_select, right_table = right._encapsulate_as_cte()
+        left_cte_name = sge.to_identifier(
+            next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+        )
+        right_cte_name = sge.to_identifier(
+            next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+        )
+
+        left_select = _select_to_cte(self.expr, left_cte_name)
+        right_select = _select_to_cte(right.expr, right_cte_name)
 
         left_ctes = left_select.args.pop("with", [])
         right_ctes = right_select.args.pop("with", [])
@@ -288,17 +325,50 @@
         new_expr = (
             sge.Select()
             .select(sge.Star())
-            .from_(left_table)
-            .join(right_table, on=join_on, join_type=join_type_str)
+            .from_(sge.Table(this=left_cte_name))
+            .join(sge.Table(this=right_cte_name), on=join_on, join_type=join_type_str)
         )
         new_expr.set("with", sge.With(expressions=merged_ctes))
         return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
 
+    def explode(
+        self,
+        column_names: tuple[str, ...],
+        offsets_col: typing.Optional[str],
+    ) -> SQLGlotIR:
+        """Unnests one or more array columns."""
+        num_columns = len(list(column_names))
+        assert num_columns > 0, "At least one column must be provided for explode."
+        if num_columns == 1:
+            return self._explode_single_column(column_names[0], offsets_col)
+        else:
+            return self._explode_multiple_columns(column_names, offsets_col)
+
+    def sample(self, fraction: float) -> SQLGlotIR:
+        """Uniformly samples a fraction of the rows."""
+        uuid_col = sge.to_identifier(
+            next(self.uid_gen.get_uid_stream("bfcol_")), quoted=self.quoted
+        )
+        uuid_expr = sge.Alias(this=sge.func("RAND"), alias=uuid_col)
+        condition = sge.LT(
+            this=uuid_col,
+            expression=_literal(fraction, dtypes.FLOAT_DTYPE),
+        )
+
+        new_cte_name = sge.to_identifier(
+            next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+        )
+        new_expr = _select_to_cte(
+            self.expr.select(uuid_expr, append=True), new_cte_name
+        ).where(condition, append=False)
+        return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
+
     def insert(
         self,
         destination: bigquery.TableReference,
     ) -> str:
+        """Generates an INSERT INTO SQL statement from the current SELECT clause."""
         return sge.insert(self.expr.subquery(), _table(destination)).sql(
             dialect=self.dialect, pretty=self.pretty
         )
@@ -307,6 +377,9 @@ def replace(
         self,
         destination: bigquery.TableReference,
     ) -> str:
+        """Generates a MERGE statement that replaces the destination table's
+        contents with the current SELECT clause.
+        """
         # Workaround for SQLGlot breaking change:
         # https://github.com/tobymao/sqlglot/pull/4495
         whens_expr = [
@@ -325,23 +398,10 @@
         ).sql(dialect=self.dialect, pretty=self.pretty)
         return f"{merge_str}\n{whens_str}"
 
-    def explode(
-        self,
-        column_names: tuple[str, ...],
-        offsets_col: typing.Optional[str],
-    ) -> SQLGlotIR:
-        num_columns = len(list(column_names))
-        assert num_columns > 0, "At least one column must be provided for explode."
-        if num_columns == 1:
-            return self._explode_single_column(column_names[0], offsets_col)
-        else:
-            return self._explode_multiple_columns(column_names, offsets_col)
-
     def _explode_single_column(
         self, column_name: str, offsets_col: typing.Optional[str]
     ) -> SQLGlotIR:
         """Helper method to handle the case of exploding a single column."""
-
         offset = (
             sge.to_identifier(offsets_col, quoted=self.quoted) if offsets_col else None
         )
@@ -358,7 +418,12 @@ def _explode_single_column(
         # TODO: "CROSS" if not keep_empty else "LEFT"
         # TODO: overlaps_with_parent to replace existing column.
-        new_expr, _ = self._encapsulate_as_cte()
+        new_expr = _select_to_cte(
+            self.expr,
+            sge.to_identifier(
+                next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+            ),
+        )
         new_expr = new_expr.select(selection, append=False).join(
             unnest_expr, join_type="CROSS"
         )
@@ -408,33 +473,32 @@ def _explode_multiple_columns(
                 for column in columns
             ]
         )
-        new_expr, _ = self._encapsulate_as_cte()
+        new_expr = _select_to_cte(
+            self.expr,
+            sge.to_identifier(
+                next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
+            ),
+        )
         new_expr = new_expr.select(selection, append=False).join(
             unnest_expr, join_type="CROSS"
         )
         return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
 
-    def _encapsulate_as_cte(
-        self,
-    ) -> typing.Tuple[sge.Select, sge.Table]:
-        """Transforms a given sge.Select query by pushing its main SELECT statement
-        into a new CTE and then generates a 'SELECT * FROM new_cte_name'
-        for the new query."""
-        select_expr = self.expr.copy()
-        existing_ctes = select_expr.args.pop("with", [])
-        new_cte_name = sge.to_identifier(
-            next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted
-        )
-        new_cte = sge.CTE(
-            this=select_expr,
-            alias=new_cte_name,
-        )
-        new_with_clause = sge.With(expressions=[*existing_ctes, new_cte])
-        new_table_expr = sge.Table(this=new_cte_name)
-        new_select_expr = sge.Select().select(sge.Star()).from_(new_table_expr)
-        new_select_expr.set("with", new_with_clause)
-        return new_select_expr, new_table_expr
+
+def _select_to_cte(expr: sge.Select, cte_name: sge.Identifier) -> sge.Select:
+    """Transforms a given sge.Select query by pushing its main SELECT statement
+    into a new CTE and then generating a 'SELECT * FROM cte_name' query
+    over it."""
+    select_expr = expr.copy()
+    existing_ctes = select_expr.args.pop("with", [])
+    new_cte = sge.CTE(
+        this=select_expr,
+        alias=cte_name,
+    )
+    new_with_clause = sge.With(expressions=[*existing_ctes, new_cte])
+    new_select_expr = sge.Select().select(sge.Star()).from_(sge.Table(this=cte_name))
+    new_select_expr.set("with", new_with_clause)
+    return new_select_expr
 
 
 def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
@@ -454,6 +518,8 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
         return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt))
     elif dtype == dtypes.JSON_DTYPE:
         return sge.ParseJSON(this=sge.convert(str(value)))
+    elif dtype == dtypes.TIMEDELTA_DTYPE:
+        return sge.convert(utils.timedelta_to_micros(value))
     elif dtypes.is_struct_like(dtype):
         items = [
             _literal(value=value[field_name], dtype=field_dtype).as_(
diff --git a/bigframes/core/compile/sqlglot/sqlglot_types.py b/bigframes/core/compile/sqlglot/sqlglot_types.py
index 0cfeaae3e9..5b0f70077d 100644
--- a/bigframes/core/compile/sqlglot/sqlglot_types.py
+++ b/bigframes/core/compile/sqlglot/sqlglot_types.py
@@ -59,6 +59,8 @@ def from_bigframes_dtype(
             return "JSON"
         elif bigframes_dtype == bigframes.dtypes.GEO_DTYPE:
             return "GEOGRAPHY"
+        elif bigframes_dtype == bigframes.dtypes.TIMEDELTA_DTYPE:
+            return "INT64"
         elif isinstance(bigframes_dtype, pd.ArrowDtype):
             if pa.types.is_list(bigframes_dtype.pyarrow_dtype):
                 inner_bigframes_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype(
diff --git a/tests/unit/core/compile/sqlglot/conftest.py b/tests/unit/core/compile/sqlglot/conftest.py
index 645daddd46..754c19ac90 100644
--- a/tests/unit/core/compile/sqlglot/conftest.py
+++ b/tests/unit/core/compile/sqlglot/conftest.py
@@ -21,6 +21,7 @@
 import pytest
 
 from bigframes import dtypes
+import bigframes.core as core
 import bigframes.pandas as bpd
 import bigframes.testing.mocks as mocks
 import bigframes.testing.utils
@@ -115,6 +116,16 @@ def scalar_types_pandas_df() -> pd.DataFrame:
     return df
 
 
+@pytest.fixture(scope="module")
+def scalar_types_array_value(
+    scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session
+) -> core.ArrayValue:
+    managed_data_source = core.local_data.ManagedArrowTable.from_pandas(
+        scalar_types_pandas_df
+    )
+    return core.ArrayValue.from_managed(managed_data_source, compiler_session)
+
+
 @pytest.fixture(scope="session")
 def nested_structs_types_table_schema() -> typing.Sequence[bigquery.SchemaField]:
     return [
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_random_sample/test_compile_random_sample/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_random_sample/test_compile_random_sample/out.sql
new file mode 100644
index 0000000000..aae34716d8
--- /dev/null
+++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_random_sample/test_compile_random_sample/out.sql
@@ -0,0 +1,184 @@
+WITH `bfcte_0` AS (
+  SELECT
+    *,
+    RAND() AS `bfcol_16`
+  FROM UNNEST(ARRAY>[STRUCT(
+    TRUE,
+    CAST(b'Hello, World!' AS BYTES),
+    CAST('2021-07-21' AS DATE),
+    CAST('2021-07-21T11:39:45' AS DATETIME),
+    ST_GEOGFROMTEXT('POINT(-122.0838511 37.3860517)'),
+    123456789,
+    0,
+    CAST(1.234567890 AS NUMERIC),
+    1.25,
+    0,
+    0,
+    'Hello, World!',
+    CAST('11:41:43.076160' AS TIME),
+    CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP),
+    4,
+    0
+  ), STRUCT(
+    FALSE,
+    CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES),
+    CAST('1991-02-03' AS DATE),
+    CAST('1991-01-02T03:45:06' AS DATETIME),
+    ST_GEOGFROMTEXT('POINT(-71.104 42.315)'),
+    -987654321,
+    1,
+    CAST(1.234567890 AS NUMERIC),
+    2.51,
+    1,
+    1,
+    'こんにちは',
+    CAST('11:14:34.701606' AS TIME),
+    CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP),
+    -1000000,
+    1
+  ), STRUCT(
+    TRUE,
+    CAST(b'\xc2\xa1Hola Mundo!' AS BYTES),
+    CAST('2023-03-01' AS DATE),
+    CAST('2023-03-01T10:55:13' AS DATETIME),
+    ST_GEOGFROMTEXT('POINT(-0.124474760143016 51.5007826749545)'),
+    314159,
+    0,
+    CAST(101.101010100 AS NUMERIC),
+    25000000000.0,
+    2,
+    2,
+    ' ¡Hola Mundo! ',
+    CAST('23:59:59.999999' AS TIME),
+    CAST('2023-03-01T10:55:13.250125+00:00' AS TIMESTAMP),
+    0,
+    2
+  ), STRUCT(
+    CAST(NULL AS BOOLEAN),
+    CAST(NULL AS BYTES),
+    CAST(NULL AS DATE),
+    CAST(NULL AS DATETIME),
+    CAST(NULL AS GEOGRAPHY),
+    CAST(NULL AS INT64),
+    1,
+    CAST(NULL AS NUMERIC),
+    CAST(NULL AS FLOAT64),
+    3,
+    3,
+    CAST(NULL AS STRING),
+    CAST(NULL AS TIME),
+    CAST(NULL AS TIMESTAMP),
+    CAST(NULL AS INT64),
+    3
+  ), STRUCT(
+    FALSE,
+    CAST(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf' AS BYTES),
+    CAST('2021-07-21' AS DATE),
+    CAST(NULL AS DATETIME),
+    CAST(NULL AS GEOGRAPHY),
+    -234892,
+    -2345,
+    CAST(NULL AS NUMERIC),
+    CAST(NULL AS FLOAT64),
+    4,
+    4,
+    'Hello, World!',
+    CAST(NULL AS TIME),
+    CAST(NULL AS TIMESTAMP),
+    31540000000000,
+    4
+  ), STRUCT(
+    FALSE,
+    CAST(b'G\xc3\xbcten Tag' AS BYTES),
+    CAST('1980-03-14' AS DATE),
+    CAST('1980-03-14T15:16:17' AS DATETIME),
+    CAST(NULL AS GEOGRAPHY),
+    55555,
+    0,
+    CAST(5.555555000 AS NUMERIC),
+    555.555,
+    5,
+    5,
+    'Güten Tag!',
+    CAST('15:16:17.181921' AS TIME),
+    CAST('1980-03-14T15:16:17.181921+00:00' AS TIMESTAMP),
+    4,
+    5
+  ), STRUCT(
+    TRUE,
+    CAST(b'Hello\tBigFrames!\x07' AS BYTES),
+    CAST('2023-05-23' AS DATE),
+    CAST('2023-05-23T11:37:01' AS DATETIME),
+    ST_GEOGFROMTEXT('LINESTRING(-0.127959 51.507728, -0.127026 51.507473)'),
+    101202303,
+    2,
+    CAST(-10.090807000 AS NUMERIC),
+    -123.456,
+    6,
+    6,
+    'capitalize, This ',
+    CAST('01:02:03.456789' AS TIME),
+    CAST('2023-05-23T11:42:55.000001+00:00' AS TIMESTAMP),
+    CAST(NULL AS INT64),
+    6
+  ), STRUCT(
+    TRUE,
+    CAST(NULL AS BYTES),
+    CAST('2038-01-20' AS DATE),
+    CAST('2038-01-19T03:14:08' AS DATETIME),
+    CAST(NULL AS GEOGRAPHY),
+    -214748367,
+    2,
+    CAST(11111111.100000000 AS NUMERIC),
+    42.42,
+    7,
+    7,
+    ' سلام',
+    CAST('12:00:00.000001' AS TIME),
+    CAST('2038-01-19T03:14:17.999999+00:00' AS TIMESTAMP),
+    4,
+    7
+  ), STRUCT(
+    FALSE,
+    CAST(NULL AS BYTES),
+    CAST(NULL AS DATE),
+    CAST(NULL AS DATETIME),
+    CAST(NULL AS GEOGRAPHY),
+    2,
+    1,
+    CAST(NULL AS NUMERIC),
+    6.87,
+    8,
+    8,
+    'T',
+    CAST(NULL AS TIME),
+    CAST(NULL AS TIMESTAMP),
+    432000000000,
+    8
+  )])
+), `bfcte_1` AS (
+  SELECT
+    *
+  FROM `bfcte_0`
+  WHERE
+    `bfcol_16` < 0.1
+)
+SELECT
+  `bfcol_0` AS `bool_col`,
+  `bfcol_1` AS `bytes_col`,
+  `bfcol_2` AS `date_col`,
+  `bfcol_3` AS `datetime_col`,
+  `bfcol_4` AS `geography_col`,
+  `bfcol_5` AS `int64_col`,
+  `bfcol_6` AS `int64_too`,
+  `bfcol_7` AS `numeric_col`,
+  `bfcol_8` AS `float64_col`,
+  `bfcol_9` AS `rowindex`,
+  `bfcol_10` AS `rowindex_2`,
+  `bfcol_11` AS `string_col`,
+  `bfcol_12` AS `time_col`,
+  `bfcol_13` AS `timestamp_col`,
+  `bfcol_14` AS `duration_col`
+FROM `bfcte_1`
+ORDER BY
+  `bfcol_15` ASC NULLS LAST
\ No newline at end of file
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql
index 70d73db6a7..2b080b0b7c 100644
--- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql
+++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql
@@ -1,7 +1,7 @@
 WITH `bfcte_0` AS (
   SELECT
     *
-  FROM UNNEST(ARRAY>[STRUCT(
+  FROM UNNEST(ARRAY>[STRUCT(
     0,
     TRUE,
     CAST(b'Hello, World!' AS BYTES),
@@ -17,6 +17,7 @@ WITH `bfcte_0` AS (
     'Hello, World!',
     CAST('11:41:43.076160' AS TIME),
     CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP),
+    4,
     0
   ), STRUCT(
     1,
@@ -34,6 +35,7 @@
     'こんにちは',
     CAST('11:14:34.701606' AS TIME),
     CAST('2021-07-21T17:43:43.945289+00:00' AS TIMESTAMP),
+    -1000000,
     1
   ), STRUCT(
     2,
@@ -51,6 +53,7 @@
     ' ¡Hola Mundo! ',
     CAST('23:59:59.999999' AS TIME),
     CAST('2023-03-01T10:55:13.250125+00:00' AS TIMESTAMP),
+    0,
     2
   ), STRUCT(
     3,
@@ -68,6 +71,7 @@
     CAST(NULL AS STRING),
     CAST(NULL AS TIME),
     CAST(NULL AS TIMESTAMP),
+    CAST(NULL AS INT64),
     3
   ), STRUCT(
     4,
@@ -85,6 +89,7 @@
     'Hello, World!',
     CAST(NULL AS TIME),
     CAST(NULL AS TIMESTAMP),
+    31540000000000,
     4
   ), STRUCT(
     5,
@@ -102,6 +107,7 @@
     'Güten Tag!',
     CAST('15:16:17.181921' AS TIME),
     CAST('1980-03-14T15:16:17.181921+00:00' AS TIMESTAMP),
+    4,
     5
   ), STRUCT(
     6,
@@ -119,6 +125,7 @@
     'capitalize, This ',
     CAST('01:02:03.456789' AS TIME),
     CAST('2023-05-23T11:42:55.000001+00:00' AS TIMESTAMP),
+    CAST(NULL AS INT64),
     6
   ), STRUCT(
     7,
@@ -136,6 +143,7 @@
     ' سلام',
     CAST('12:00:00.000001' AS TIME),
     CAST('2038-01-19T03:14:17.999999+00:00' AS TIMESTAMP),
+    4,
     7
   ), STRUCT(
     8,
@@ -153,6 +161,7 @@
     'T',
     CAST(NULL AS TIME),
     CAST(NULL AS TIMESTAMP),
+    432000000000,
     8
   )])
 )
@@ -171,7 +180,8 @@
   `bfcol_11` AS `rowindex_2`,
   `bfcol_12` AS `string_col`,
   `bfcol_13` AS `time_col`,
-  `bfcol_14` AS `timestamp_col`
+  `bfcol_14` AS `timestamp_col`,
+  `bfcol_15` AS `duration_col`
 FROM `bfcte_0`
 ORDER BY
-  `bfcol_15` ASC NULLS LAST
\ No newline at end of file
+  `bfcol_16` ASC NULLS LAST
\ No newline at end of file
diff --git a/tests/unit/core/compile/sqlglot/test_compile_random_sample.py b/tests/unit/core/compile/sqlglot/test_compile_random_sample.py
new file mode 100644
index 0000000000..6e333f0421
--- /dev/null
+++ b/tests/unit/core/compile/sqlglot/test_compile_random_sample.py
@@ -0,0 +1,35 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from bigframes.core import nodes
+import bigframes.core as core
+import bigframes.core.compile.sqlglot as sqlglot
+
+pytest.importorskip("pytest_snapshot")
+
+
+def test_compile_random_sample(
+    scalar_types_array_value: core.ArrayValue,
+    snapshot,
+):
+    """This test verifies the SQL compilation of a RandomSampleNode.
+
+    Because BigFrames doesn't expose a public API for creating a random sample
+    operation, this test constructs the node directly and then compiles it to SQL.
+    """
+    node = nodes.RandomSampleNode(scalar_types_array_value.node, fraction=0.1)
+    sql = sqlglot.compiler.SQLGlotCompiler().compile(node)
+    snapshot.assert_match(sql, "out.sql")
diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py
index 6f8a2050e5..7307fd9b4e 100644
--- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py
+++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py
@@ -24,8 +24,6 @@ def test_compile_readlocal(
     scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot
 ):
-    # Durations not yet supported
-    scalar_types_pandas_df = scalar_types_pandas_df.drop(["duration_col"], axis=1)
     bf_df = bpd.DataFrame(scalar_types_pandas_df, session=compiler_session)
     snapshot.assert_match(bf_df.sql, "out.sql")
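
Note on the new `SQLGlotIR.sample`: it composes the shared `_select_to_cte` wrapping with a `RAND()` projection. The method appends a `RAND()` column to the current SELECT, pushes the whole query into a CTE, and filters the outer `SELECT *` on that column. Below is a minimal standalone sqlglot sketch of the same shape; the `src`, `bfcol_rand`, and `bfcte_sample` names are illustrative, not taken from this patch.

    import sqlglot.expressions as sge

    # An arbitrary starting SELECT; in the compiler this is SQLGlotIR.expr.
    base = sge.Select().select(sge.Star()).from_(
        sge.Table(this=sge.to_identifier("src", quoted=True))
    )

    # Step 1: append a RAND() column (sample() draws a fresh "bfcol_" uid for it).
    rand_col = sge.to_identifier("bfcol_rand", quoted=True)
    sampled = base.select(sge.Alias(this=sge.func("RAND"), alias=rand_col), append=True)

    # Step 2: push the query into a CTE, mirroring _select_to_cte.
    cte_name = sge.to_identifier("bfcte_sample", quoted=True)
    outer = sge.Select().select(sge.Star()).from_(sge.Table(this=cte_name))
    outer.set("with", sge.With(expressions=[sge.CTE(this=sampled, alias=cte_name)]))

    # Step 3: filter the outer SELECT on the random column.
    outer = outer.where(sge.LT(this=rand_col, expression=sge.convert(0.1)), append=False)

    # Renders roughly:
    #   WITH `bfcte_sample` AS (SELECT *, RAND() AS `bfcol_rand` FROM `src`)
    #   SELECT * FROM `bfcte_sample` WHERE `bfcol_rand` < 0.1
    print(outer.sql(dialect="bigquery", pretty=True))

Filtering on `RAND() < fraction` is per-row Bernoulli sampling, which is why the new snapshot's `bfcte_1` stage filters `bfcol_16` < 0.1.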
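
Note on the TIMEDELTA plumbing: `sqlglot_types` now maps `TIMEDELTA_DTYPE` to `INT64`, and `_literal` routes timedelta values through `utils.timedelta_to_micros`, so durations are embedded in SQL as whole microseconds (see the `duration_col` literals in both snapshots). A quick sanity check of that encoding, assuming `timedelta_to_micros` is equivalent to floor division by one microsecond; only its call site appears in this patch, so that equivalence is an assumption:

    from datetime import timedelta

    def timedelta_to_micros_sketch(td: timedelta) -> int:
        # Assumed equivalent of bigframes.core.utils.timedelta_to_micros:
        # the duration expressed as a whole number of microseconds.
        return td // timedelta(microseconds=1)

    # Matches the five-day duration row in the new snapshot.
    assert timedelta_to_micros_sketch(timedelta(days=5)) == 432_000_000_000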