From 7116e3e60ef0bb5ce70e7752351e76e29351e2fa Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Tue, 19 Mar 2024 21:28:09 +0000
Subject: [PATCH 1/4] fix: re-enable to_csv and to_json related tests

---
 tests/system/small/test_dataframe_io.py | 12 ++++++++----
 tests/system/small/test_series.py       |  6 ++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index adc729565e..afb97c3595 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -115,7 +115,6 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
         pd.testing.assert_series_equal(actual, expected)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
@@ -154,6 +153,7 @@ def test_to_csv_index(
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
+        storage_options=dict(expand=True),
     )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
@@ -164,7 +164,6 @@
     pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_csv_tabs(
     scalars_dfs: Tuple[bigframes.dataframe.DataFrame, pd.DataFrame],
     gcs_folder: str,
@@ -194,6 +193,7 @@
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
+        storage_options=dict(expand=True),
     )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
@@ -415,7 +415,6 @@ def test_to_json_index_invalid_lines(
         scalars_df.to_json(path, index=index)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
@@ -435,7 +434,12 @@ def test_to_json_index_records_orient(
     """ Test the `to_json` API with `orient` is `records` and `lines` is True"""
     scalars_df.to_json(path, index=index, orient="records", lines=True)
 
-    gcs_df = pd.read_json(path, lines=True, convert_dates=["datetime_col"])
+    gcs_df = pd.read_json(
+        path,
+        lines=True,
+        convert_dates=["datetime_col"],
+        storage_options=dict(expand=True),
+    )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index f63ea977ff..3c593a75bf 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2390,11 +2390,10 @@ def test_to_frame(scalars_dfs):
     assert_pandas_df_equal(bf_result, pd_result)
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_json*.jsonl"
     scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
-    gcs_df = pd.read_json(path, lines=True)
+    gcs_df = pd.read_json(path, lines=True, storage_options=dict(expand=True))
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -2404,11 +2403,10 @@
     )
 
 
-@pytest.mark.skip(reason="Disable to unblock kokoro tests")
 def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_csv*.csv"
    scalars_df_index["int64_col"].to_csv(path)
-    gcs_df = pd.read_csv(path)
+    gcs_df = pd.read_csv(path, storage_options=dict(expand=True))
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),

From b867d24008c5d01df5727a6efe8996f28f017f99 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Wed, 20 Mar 2024 03:32:09 +0000
Subject: [PATCH 2/4] fix gcs file path

---
 tests/system/small/test_dataframe_io.py | 9 +++------
 tests/system/small/test_series.py       | 4 ++--
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index afb97c3595..401798d8aa 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -149,11 +149,10 @@ def test_to_csv_index(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path,
+        path.replace("*", "000000000000"),
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
-        storage_options=dict(expand=True),
     )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
@@ -188,12 +187,11 @@ def test_to_csv_tabs(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path,
+        path.replace("*", "000000000000"),
         sep="\t",
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
-        storage_options=dict(expand=True),
     )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
@@ -435,10 +433,9 @@
     scalars_df.to_json(path, index=index, orient="records", lines=True)
 
     gcs_df = pd.read_json(
-        path,
+        path.replace("*", "000000000000"),
         lines=True,
         convert_dates=["datetime_col"],
-        storage_options=dict(expand=True),
     )
     convert_pandas_dtypes(gcs_df, bytes_col=True)
     if index and scalars_df.index.name is not None:
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 3c593a75bf..06db72700c 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -2393,7 +2393,7 @@ def test_to_frame(scalars_dfs):
 def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_json*.jsonl"
     scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
-    gcs_df = pd.read_json(path, lines=True, storage_options=dict(expand=True))
+    gcs_df = pd.read_json(path.replace("*", "000000000000"), lines=True)
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -2406,7 +2406,7 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
 def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_csv*.csv"
     scalars_df_index["int64_col"].to_csv(path)
-    gcs_df = pd.read_csv(path, storage_options=dict(expand=True))
+    gcs_df = pd.read_csv(path.replace("*", "000000000000"))
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),

From 48c8038433de27e49670c9f2d8e955b3cbb1b771 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Wed, 20 Mar 2024 18:03:44 +0000
Subject: [PATCH 3/4] add global FIRST_GCS_FILE_SUFFIX

---
 tests/system/small/test_dataframe_io.py | 14 +++++++++-----
 tests/system/small/test_encryption.py   |  3 ++-
 tests/system/small/test_series.py       |  5 +++--
 tests/system/small/test_session.py      | 22 ++++++++++------------
 tests/system/utils.py                   |  2 ++
 5 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 401798d8aa..a5fd29ab92 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -19,7 +19,11 @@
 import pyarrow as pa
 import pytest
 
-from tests.system.utils import assert_pandas_df_equal, convert_pandas_dtypes
+from tests.system.utils import (
+    assert_pandas_df_equal,
+    convert_pandas_dtypes,
+    FIRST_GCS_FILE_SUFFIX,
+)
 
 try:
     import pandas_gbq  # type: ignore
@@ -149,7 +153,7 @@ def test_to_csv_index(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path.replace("*", "000000000000"),
+        path.replace("*", FIRST_GCS_FILE_SUFFIX),
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
@@ -187,7 +191,7 @@ def test_to_csv_tabs(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path.replace("*", "000000000000"),
+        path.replace("*", FIRST_GCS_FILE_SUFFIX),
         sep="\t",
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
@@ -433,7 +437,7 @@
     scalars_df.to_json(path, index=index, orient="records", lines=True)
 
     gcs_df = pd.read_json(
-        path.replace("*", "000000000000"),
+        path.replace("*", FIRST_GCS_FILE_SUFFIX),
         lines=True,
         convert_dates=["datetime_col"],
     )
@@ -475,7 +479,7 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
     # table.
     scalars_df.to_parquet(path, index=index)
 
-    gcs_df = pd.read_parquet(path.replace("*", "000000000000"))
+    gcs_df = pd.read_parquet(path.replace("*", FIRST_GCS_FILE_SUFFIX))
     convert_pandas_dtypes(gcs_df, bytes_col=False)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py
index f13d2b9e1a..568a236b22 100644
--- a/tests/system/small/test_encryption.py
+++ b/tests/system/small/test_encryption.py
@@ -19,6 +19,7 @@
 
 import bigframes
 import bigframes.ml.linear_model
+from tests.system.utils import FIRST_GCS_FILE_SUFFIX
 
 
 @pytest.fixture(scope="module")
@@ -160,7 +161,7 @@ def test_read_csv_gcs(
     # Create a csv in gcs
     write_path = gcs_folder + "test_read_csv_gcs_bigquery_engine*.csv"
     read_path = (
-        write_path.replace("*", "000000000000") if engine is None else write_path
+        write_path.replace("*", FIRST_GCS_FILE_SUFFIX) if engine is None else write_path
     )
 
     scalars_df_index.to_csv(write_path)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 06db72700c..04f986407e 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -27,6 +27,7 @@
 from tests.system.utils import (
     assert_pandas_df_equal,
     assert_series_equal,
+    FIRST_GCS_FILE_SUFFIX,
     skip_legacy_pandas,
 )
 
@@ -2394,7 +2394,7 @@ def test_to_frame(scalars_dfs):
 def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_json*.jsonl"
     scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
-    gcs_df = pd.read_json(path.replace("*", "000000000000"), lines=True)
+    gcs_df = pd.read_json(path.replace("*", FIRST_GCS_FILE_SUFFIX), lines=True)
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -2407,7 +2407,7 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
 def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_csv*.csv"
     scalars_df_index["int64_col"].to_csv(path)
-    gcs_df = pd.read_csv(path.replace("*", "000000000000"))
+    gcs_df = pd.read_csv(path.replace("*", FIRST_GCS_FILE_SUFFIX))
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index d0cd24e2be..b6634fc295 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -30,9 +30,7 @@
 import bigframes.dataframe
 import bigframes.dtypes
 import bigframes.ml.linear_model
-from tests.system.utils import skip_legacy_pandas
-
-FIRST_FILE = "000000000000"
+from tests.system.utils import FIRST_GCS_FILE_SUFFIX, skip_legacy_pandas
 
 
 def test_read_gbq_tokyo(
@@ -442,7 +440,7 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder):
         path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv"
     else:
         path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df.to_csv(path, index=False)
     dtype = scalars_df.dtypes.to_dict()
     dtype.pop("geography_col")
@@ -641,7 +639,7 @@ def test_read_csv_default_engine_throws_not_implemented_error(
         gcs_folder
         + "test_read_csv_gcs_default_engine_throws_not_implemented_error*.csv"
     )
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df_index.to_csv(path)
     with pytest.raises(NotImplementedError, match=match):
         session.read_csv(read_path, **kwargs)
@@ -649,7 +647,7 @@
 
 def test_read_csv_gcs_default_engine_w_header(session, scalars_df_index, gcs_folder):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_header*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df_index.to_csv(path)
 
     # Skips header=N rows, normally considers the N+1th row as the header, but overridden by
@@ -716,7 +714,7 @@ def test_read_csv_gcs_default_engine_w_index_col_name(
     session, scalars_df_default_index, gcs_folder
 ):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_name*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df_default_index.to_csv(path)
 
     df = session.read_csv(read_path, index_col="rowindex")
@@ -731,7 +729,7 @@ def test_read_csv_gcs_default_engine_w_index_col_index(
     session, scalars_df_default_index, gcs_folder
 ):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_index*.csv"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df_default_index.to_csv(path)
 
     index_col = scalars_df_default_index.columns.to_list().index("rowindex")
@@ -790,7 +788,7 @@ def test_read_csv_local_default_engine_w_index_col_index(
 def test_read_csv_gcs_w_usecols(session, scalars_df_index, gcs_folder, engine):
     path = gcs_folder + "test_read_csv_gcs_w_usecols"
     path = path + "_default_engine*.csv" if engine is None else path + "_bq_engine*.csv"
-    read_path = path.replace("*", FIRST_FILE) if engine is None else path
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX) if engine is None else path
     scalars_df_index.to_csv(path)
 
     # df should only have 1 column which is bool_col.
@@ -902,7 +900,7 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder, e
 
     # Only bigquery engine for reads supports wildcards in path name.
     if engine != "bigquery":
-        path = path.replace("*", "000000000000")
+        path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
 
     df_out = (
         session.read_parquet(path, engine=engine)
@@ -1012,7 +1010,7 @@ def test_read_parquet_gcs_compression_not_supported(
 def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     path = gcs_folder + "test_read_json_gcs_bq_engine_w_index*.json"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df.to_json(path, index=False, lines=True, orient="records")
 
     df = session.read_json(read_path, lines=True, orient="records", engine="bigquery")
@@ -1036,7 +1034,7 @@ def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
 def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     path = gcs_folder + "test_read_json_gcs_default_engine_w_index*.json"
-    read_path = path.replace("*", FIRST_FILE)
+    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
     scalars_df.to_json(
         path,
         index=False,
diff --git a/tests/system/utils.py b/tests/system/utils.py
index 8ea49ed7e2..a6444fff8f 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -28,6 +28,8 @@
 
 from bigframes.functions import remote_function
 
+FIRST_GCS_FILE_SUFFIX = "000000000000"
+
 
 def skip_legacy_pandas(test):
     @functools.wraps(test)

From 708e23400740463de991fda1ccd2540bfa62edd5 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Wed, 20 Mar 2024 19:11:20 +0000
Subject: [PATCH 4/4] avoid importing individual functions from tests.system.utils

---
 tests/system/small/test_dataframe_io.py | 32 +++++++++++--------------
 tests/system/small/test_encryption.py   |  4 ++--
 tests/system/small/test_series.py       |  6 ++---
 tests/system/small/test_session.py      | 24 +++++++++----------
 tests/system/utils.py                   |  6 +++--
 5 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index a5fd29ab92..10d7408790 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -19,11 +19,7 @@
 import pyarrow as pa
 import pytest
 
-from tests.system.utils import (
-    assert_pandas_df_equal,
-    convert_pandas_dtypes,
-    FIRST_GCS_FILE_SUFFIX,
-)
+from tests.system import utils
 
 try:
     import pandas_gbq  # type: ignore
@@ -153,12 +149,12 @@ def test_to_csv_index(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path.replace("*", FIRST_GCS_FILE_SUFFIX),
+        utils.get_first_file_from_wildcard(path),
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
     )
-    convert_pandas_dtypes(gcs_df, bytes_col=True)
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
 
     scalars_pandas_df = scalars_pandas_df.copy()
@@ -191,13 +187,13 @@ def test_to_csv_tabs(
     # read_csv will decode into bytes inproperly, convert_pandas_dtypes will encode properly from string
     dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
-        path.replace("*", FIRST_GCS_FILE_SUFFIX),
+        utils.get_first_file_from_wildcard(path),
         sep="\t",
         dtype=dtype,
         date_format={"timestamp_col": "YYYY-MM-DD HH:MM:SS Z"},
         index_col=index_col,
     )
-    convert_pandas_dtypes(gcs_df, bytes_col=True)
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
     gcs_df.index.name = scalars_df.index.name
 
     scalars_pandas_df = scalars_pandas_df.copy()
@@ -231,7 +227,7 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index):
     else:
         df_out = df_out.sort_values("rowindex_2").reset_index(drop=True)
 
-    convert_pandas_dtypes(df_out, bytes_col=False)
+    utils.convert_pandas_dtypes(df_out, bytes_col=False)
     # pd.read_gbq interpets bytes_col as object, reconvert to pyarrow binary
     df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary()))
     expected = scalars_pandas_df.copy()
@@ -437,11 +433,11 @@ def test_to_json_index_records_orient(
     scalars_df.to_json(path, index=index, orient="records", lines=True)
 
     gcs_df = pd.read_json(
-        path.replace("*", FIRST_GCS_FILE_SUFFIX),
+        utils.get_first_file_from_wildcard(path),
         lines=True,
         convert_dates=["datetime_col"],
     )
-    convert_pandas_dtypes(gcs_df, bytes_col=True)
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=True)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
 
@@ -479,7 +475,7 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
     # table.
     scalars_df.to_parquet(path, index=index)
 
-    gcs_df = pd.read_parquet(path.replace("*", FIRST_GCS_FILE_SUFFIX))
-    convert_pandas_dtypes(gcs_df, bytes_col=False)
+    gcs_df = pd.read_parquet(utils.get_first_file_from_wildcard(path))
+    utils.convert_pandas_dtypes(gcs_df, bytes_col=False)
     if index and scalars_df.index.name is not None:
         gcs_df = gcs_df.set_index(scalars_df.index.name)
@@ -512,7 +508,7 @@ def test_to_sql_query_unnamed_index_included(
     pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
     roundtrip = session.read_gbq(sql, index_col=idx_ids)
     roundtrip.index.names = [None]
-    assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
+    utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False)
 
 
 def test_to_sql_query_named_index_included(
@@ -529,7 +525,7 @@ def test_to_sql_query_named_index_included(
     pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True)
 
     roundtrip = session.read_gbq(sql, index_col=idx_ids)
-    assert_pandas_df_equal(roundtrip.to_pandas(), pd_df)
+    utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df)
 
 
 def test_to_sql_query_unnamed_index_excluded(
@@ -544,7 +540,7 @@ def test_to_sql_query_unnamed_index_excluded(
     pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
 
     roundtrip = session.read_gbq(sql)
-    assert_pandas_df_equal(
+    utils.assert_pandas_df_equal(
         roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
     )
@@ -563,6 +559,6 @@ def test_to_sql_query_named_index_excluded(
         "rowindex_2", drop=True
     ).reset_index(drop=True)
     roundtrip = session.read_gbq(sql)
-    assert_pandas_df_equal(
+    utils.assert_pandas_df_equal(
         roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
     )
diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py
index 568a236b22..70d2ce381f 100644
--- a/tests/system/small/test_encryption.py
+++ b/tests/system/small/test_encryption.py
@@ -19,7 +19,7 @@
 
 import bigframes
 import bigframes.ml.linear_model
-from tests.system.utils import FIRST_GCS_FILE_SUFFIX
+from tests.system import utils
 
 
 @pytest.fixture(scope="module")
@@ -161,7 +161,7 @@ def test_read_csv_gcs(
     # Create a csv in gcs
     write_path = gcs_folder + "test_read_csv_gcs_bigquery_engine*.csv"
     read_path = (
-        write_path.replace("*", FIRST_GCS_FILE_SUFFIX) if engine is None else write_path
+        utils.get_first_file_from_wildcard(write_path) if engine is None else write_path
     )
 
     scalars_df_index.to_csv(write_path)
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 04f986407e..dcb47d8c60 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -27,7 +27,7 @@
 from tests.system.utils import (
     assert_pandas_df_equal,
     assert_series_equal,
-    FIRST_GCS_FILE_SUFFIX,
+    get_first_file_from_wildcard,
     skip_legacy_pandas,
 )
 
@@ -2394,7 +2394,7 @@ def test_to_frame(scalars_dfs):
 def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_json*.jsonl"
     scalars_df_index["int64_col"].to_json(path, lines=True, orient="records")
-    gcs_df = pd.read_json(path.replace("*", FIRST_GCS_FILE_SUFFIX), lines=True)
+    gcs_df = pd.read_json(get_first_file_from_wildcard(path), lines=True)
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
@@ -2407,7 +2407,7 @@ def test_to_json(gcs_folder, scalars_df_index, scalars_pandas_df_index):
 def test_to_csv(gcs_folder, scalars_df_index, scalars_pandas_df_index):
     path = gcs_folder + "test_series_to_csv*.csv"
     scalars_df_index["int64_col"].to_csv(path)
-    gcs_df = pd.read_csv(path.replace("*", FIRST_GCS_FILE_SUFFIX))
+    gcs_df = pd.read_csv(get_first_file_from_wildcard(path))
 
     pd.testing.assert_series_equal(
         gcs_df["int64_col"].astype(pd.Int64Dtype()),
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index b6634fc295..c6702aa032 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -30,7 +30,7 @@
 import bigframes.dataframe
 import bigframes.dtypes
 import bigframes.ml.linear_model
-from tests.system.utils import FIRST_GCS_FILE_SUFFIX, skip_legacy_pandas
+from tests.system import utils
 
 
 def test_read_gbq_tokyo(
@@ -433,14 +433,14 @@ def test_read_pandas_tokyo(
     pd.testing.assert_frame_equal(result, expected)
 
 
-@skip_legacy_pandas
+@utils.skip_legacy_pandas
 def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     if scalars_df.index.name is not None:
         path = gcs_folder + "test_read_csv_gcs_default_engine_w_index*.csv"
     else:
         path = gcs_folder + "test_read_csv_gcs_default_engine_wo_index*.csv"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df.to_csv(path, index=False)
     dtype = scalars_df.dtypes.to_dict()
     dtype.pop("geography_col")
@@ -490,7 +490,7 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder):
         pytest.param("\t", id="custom_sep"),
     ],
 )
-@skip_legacy_pandas
+@utils.skip_legacy_pandas
 def test_read_csv_local_default_engine(session, scalars_dfs, sep):
     scalars_df, scalars_pandas_df = scalars_dfs
     with tempfile.TemporaryDirectory() as dir:
@@ -639,7 +639,7 @@ def test_read_csv_default_engine_throws_not_implemented_error(
         gcs_folder
         + "test_read_csv_gcs_default_engine_throws_not_implemented_error*.csv"
     )
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_index.to_csv(path)
     with pytest.raises(NotImplementedError, match=match):
         session.read_csv(read_path, **kwargs)
@@ -647,7 +647,7 @@
 
 def test_read_csv_gcs_default_engine_w_header(session, scalars_df_index, gcs_folder):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_header*.csv"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_index.to_csv(path)
 
     # Skips header=N rows, normally considers the N+1th row as the header, but overridden by
@@ -714,7 +714,7 @@ def test_read_csv_gcs_default_engine_w_index_col_name(
     session, scalars_df_default_index, gcs_folder
 ):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_name*.csv"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_default_index.to_csv(path)
 
     df = session.read_csv(read_path, index_col="rowindex")
@@ -729,7 +729,7 @@ def test_read_csv_gcs_default_engine_w_index_col_index(
     session, scalars_df_default_index, gcs_folder
 ):
     path = gcs_folder + "test_read_csv_gcs_default_engine_w_index_col_index*.csv"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df_default_index.to_csv(path)
 
     index_col = scalars_df_default_index.columns.to_list().index("rowindex")
@@ -788,7 +788,7 @@ def test_read_csv_local_default_engine_w_index_col_index(
 def test_read_csv_gcs_w_usecols(session, scalars_df_index, gcs_folder, engine):
     path = gcs_folder + "test_read_csv_gcs_w_usecols"
     path = path + "_default_engine*.csv" if engine is None else path + "_bq_engine*.csv"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX) if engine is None else path
+    read_path = utils.get_first_file_from_wildcard(path) if engine is None else path
     scalars_df_index.to_csv(path)
 
     # df should only have 1 column which is bool_col.
@@ -900,7 +900,7 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder, e
 
     # Only bigquery engine for reads supports wildcards in path name.
     if engine != "bigquery":
-        path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+        path = utils.get_first_file_from_wildcard(path)
 
     df_out = (
         session.read_parquet(path, engine=engine)
@@ -1010,7 +1010,7 @@ def test_read_parquet_gcs_compression_not_supported(
 def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     path = gcs_folder + "test_read_json_gcs_bq_engine_w_index*.json"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df.to_json(path, index=False, lines=True, orient="records")
 
     df = session.read_json(read_path, lines=True, orient="records", engine="bigquery")
@@ -1034,7 +1034,7 @@ def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
 def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder):
     scalars_df, _ = scalars_dfs
     path = gcs_folder + "test_read_json_gcs_default_engine_w_index*.json"
-    read_path = path.replace("*", FIRST_GCS_FILE_SUFFIX)
+    read_path = utils.get_first_file_from_wildcard(path)
     scalars_df.to_json(
         path,
         index=False,
diff --git a/tests/system/utils.py b/tests/system/utils.py
index a6444fff8f..e40502e6f2 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -28,8 +28,6 @@
 
 from bigframes.functions import remote_function
 
-FIRST_GCS_FILE_SUFFIX = "000000000000"
-
 
 def skip_legacy_pandas(test):
     @functools.wraps(test)
@@ -306,3 +304,7 @@ def delete_cloud_function(
     request = functions_v2.DeleteFunctionRequest(name=full_name)
     operation = functions_client.delete_function(request=request)
     return operation
+
+
+def get_first_file_from_wildcard(path):
+    return path.replace("*", "000000000000")
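
For context on the pattern this series converges on: BigQuery export jobs that target a GCS URI containing a "*" wildcard shard their output and substitute a zero-padded twelve-digit file number for the wildcard, so the first shard always ends in 000000000000. A minimal standalone sketch of the new helper and the write-then-read-back step it supports follows; the bucket and object names here are hypothetical, not part of the patches.

    def get_first_file_from_wildcard(path: str) -> str:
        # BigQuery replaces "*" with a zero-padded shard number starting at
        # 000000000000, so this resolves a wildcard export path to the first
        # file the export job wrote.
        return path.replace("*", "000000000000")

    # Hypothetical usage mirroring the tests: export through the wildcard
    # URI, then read back the first shard that the export produced.
    wildcard_path = "gs://example-bucket/test_series_to_csv*.csv"
    assert get_first_file_from_wildcard(wildcard_path) == (
        "gs://example-bucket/test_series_to_csv000000000000.csv"
    )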