8000 Autofetch table schema on load if not provided · googleapis/google-cloud-python@adeb233 · GitHub
[go: up one dir, main page]

Skip to content

Commit adeb233

Browse files
committed
Autofetch table schema on load if not provided
1 parent 86bb5cf commit adeb233

File tree

2 files changed

+139
-5
lines changed

2 files changed

+139
-5
lines changed

bigquery/google/cloud/bigquery/client.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,6 +1531,29 @@ def load_table_from_dataframe(
15311531
if location is None:
15321532
location = self.location
15331533

1534+
# If table schema is not provided, we try to fetch the existing table
1535+
# schema, and check if dataframe schema is compatible with it.
1536+
if not job_config.schema:
1537+
try:
1538+
table = self.get_table(destination)
1539+
except google.api_core.exceptions.NotFound:
1540+
table = None
1541+
else:
1542+
table_col_names = {field.name for field in table.schema}
1543+
dframe_col_names = set(dataframe.columns)
1544+
1545+
in_dframe_only = dframe_col_names - table_col_names
1546+
if in_dframe_only:
1547+
raise ValueError(
1548+
"Dataframe contains columns that are not present in "
1549+
"table: {}".format(in_dframe_only)
1550+
)
1551+
1552+
# schema fields not present in the dataframe are not needed
1553+
job_config.schema = [
1554+
field for field in table.schema if field.name in dframe_col_names
1555+
]
1556+
15341557
job_config.schema = _pandas_helpers.dataframe_to_bq_schema(
15351558
dataframe, job_config.schema
15361559
)

bigquery/tests/unit/test_client.py

Lines changed: 116 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5228,15 +5228,23 @@ def test_load_table_from_file_bad_mode(self):
52285228
def test_load_table_from_dataframe(self):
52295229
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
52305230
from google.cloud.bigquery import job
5231+
from google.cloud.bigquery.schema import SchemaField
52315232

52325233
client = self._make_client()
52335234
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
52345235
dataframe = pandas.DataFrame(records)
52355236

5237+
get_table_patch = mock.patch(
5238+
"google.cloud.bigquery.client.Client.get_table",
5239+
autospec=True,
5240+
return_value=mock.Mock(
5241+
schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")]
5242+
),
5243+
)
52365244
load_patch = mock.patch(
52375245
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
52385246
)
5239-
with load_patch as load_table_from_file:
5247+
with load_patch as load_table_from_file, get_table_patch:
52405248
client.load_table_from_dataframe(dataframe, self.TABLE_REF)
52415249

52425250
load_table_from_file.assert_called_once_with(
@@ -5263,15 +5271,23 @@ def test_load_table_from_dataframe(self):
52635271
def test_load_table_from_dataframe_w_client_location(self):
52645272
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
52655273
from google.cloud.bigquery import job
5274+
from google.cloud.bigquery.schema import SchemaField
52665275

52675276
client = self._make_client(location=self.LOCATION)
52685277
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
52695278
dataframe = pandas.DataFrame(records)
52705279

5280+
get_table_patch = mock.patch(
5281+
"google.cloud.bigquery.client.Client.get_table",
5282+
autospec=True,
5283+
return_value=mock.Mock(
5284+
schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")]
5285+
),
5286+
)
52715287
load_patch = mock.patch(
52725288
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
52735289
)
5274-
with load_patch as load_table_from_file:
5290+
with load_patch as load_table_from_file, get_table_patch:
52755291
client.load_table_from_dataframe(dataframe, self.TABLE_REF)
52765292

52775293
load_table_from_file.assert_called_once_with(
@@ -5298,16 +5314,24 @@ def test_load_table_from_dataframe_w_client_location(self):
52985314
def test_load_table_from_dataframe_w_custom_job_config(self):
52995315
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
53005316
from google.cloud.bigquery import job
5317+
from google.cloud.bigquery.schema import SchemaField
53015318

53025319
client = self._make_client()
53035320
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
53045321
dataframe = pandas.DataFrame(records)
53055322
job_config = job.LoadJobConfig()
53065323

5324+
get_table_patch = mock.patch(
5325+
"google.cloud.bigquery.client.Client.get_table",
5326+
autospec=True,
5327+
return_value=mock.Mock(
5328+
schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")]
5329+
),
5330+
)
53075331
load_patch = mock.patch(
53085332
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
53095333
)
5310-
with load_patch as load_table_from_file:
5334+
with load_patch as load_table_from_file, get_table_patch:
53115335
client.load_table_from_dataframe(
53125336
dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION
53135337
)
@@ -5370,7 +5394,20 @@ def test_load_table_from_dataframe_w_automatic_schema(self):
53705394
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
53715395
)
53725396

5373-
with load_patch as load_table_from_file:
5397+
get_table_patch = mock.patch(
5398+
"google.cloud.bigquery.client.Client.get_table",
5399+
autospec=True,
5400+
return_value=mock.Mock(
5401+
schema=[
5402+
SchemaField("int_col", "INTEGER"),
5403+
SchemaField("float_col", "FLOAT"),
5404+
SchemaField("bool_col", "BOOLEAN"),
5405+
SchemaField("dt_col", "DATETIME"),
5406+
SchemaField("ts_col", "TIMESTAMP"),
5407+
]
5408+
),
5409+
)
5410+
with load_patch as load_table_from_file, get_table_patch:
53745411
client.load_table_from_dataframe(
53755412
dataframe, self.TABLE_REF, location=self.LOCATION
53765413
)
@@ -5398,6 +5435,71 @@ def test_load_table_from_dataframe_w_automatic_schema(self):
53985435
SchemaField("ts_col", "TIMESTAMP"),
53995436
)
54005437

5438+
@unittest.skipIf(pandas is None, "Requires `pandas`")
5439+
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
5440+
def test_load_table_from_dataframe_unknown_df_columns(self):
5441+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
5442+
from google.cloud.bigquery import job
5443+
from google.cloud.bigquery.schema import SchemaField
5444+
5445+
client = self._make_client()
5446+
records = [{"id": 1, "typo_age": 100}, {"id": 2, "typo_age": 60}]
5447+
dataframe = pandas.DataFrame(records)
5448+
5449+
get_table_patch = mock.patch(
5450+
"google.cloud.bigquery.client.Client.get_table",
5451+
autospec=True,
5452+
return_value=mock.Mock(
5453+
schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")]
5454+
),
5455+
)
5456+
load_patch = mock.patch(
5457+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
5458+
)
5459+
with pytest.raises(ValueError) as exc_info, load_patch, get_table_patch:
5460+
client.load_table_from_dataframe(dataframe, self.TABLE_REF)
5461+
5462+
err_msg = str(exc_info.value)
5463+
assert "Dataframe contains columns that are not present in table" in err_msg
5464+
assert "typo_age" in err_msg
5465+
assert "id" not in err_msg
5466+
5467+
@unittest.skipIf(pandas is None, "Requires `pandas`")
5468+
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
5469+
def test_load_table_from_dataframe_unknown_table(self):
5470+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
5471+
from google.cloud.bigquery import job
5472+
from google.cloud.bigquery.schema import SchemaField
5473+
5474+
client = self._make_client()
5475+
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
5476+
dataframe = pandas.DataFrame(records)
5477+
5478+
get_table_patch = mock.patch(
5479+
"google.cloud.bigquery.client.Client.get_table",
5480+
autospec=True,
5481+
side_effect=google.api_core.exceptions.NotFound("Table not found"),
5482+
)
5483+
load_patch = mock.patch(
5484+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
5485+
)
5486+
with load_patch as load_table_from_file, get_table_patch:
5487+
# there should be no error
5488+
client.load_table_from_dataframe(dataframe, self.TABLE_REF)
5489+
5490+
load_table_from_file.assert_called_once_with(
5491+
client,
5492+
mock.ANY,
5493+
self.TABLE_REF,
5494+
num_retries=_DEFAULT_NUM_RETRIES,
5495+
rewind=True,
5496+
job_id=mock.ANY,
5497+
job_id_prefix=None,
5498+
location=None,
5499+
project=None,
5500+
job_config=mock.ANY,
5501+
)
5502+
54015503
@unittest.skipIf(pandas is None, "Requires `pandas`")
54025504
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
54035505
def test_load_table_from_dataframe_struct_fields_error(self):
@@ -5686,10 +5788,19 @@ def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self):
56865788
@unittest.skipIf(pandas is None, "Requires `pandas`")
56875789
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
56885790
def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self):
5791+
from google.cloud.bigquery.schema import SchemaField
5792+
56895793
client = self._make_client()
56905794
records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
56915795
dataframe = pandas.DataFrame(records)
56925796

5797+
get_table_patch = mock.patch(
5798+
"google.cloud.bigquery.client.Client.get_table",
5799+
autospec=True,
5800+
return_value=mock.Mock(
5801+
schema=[SchemaField("id", "INTEGER"), SchemaField("age", "INTEGER")]
5802+
),
5803+
)
56935804
load_patch = mock.patch(
56945805
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
56955806
)
@@ -5698,7 +5809,7 @@ def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self):
56985809
dataframe, "to_parquet", wraps=dataframe.to_parquet
56995810
)
57005811

5701-
with load_patch, pyarrow_patch, to_parquet_patch as to_parquet_spy:
5812+
with load_patch, get_table_patch, pyarrow_patch, to_parquet_patch as to_parquet_spy:
57025813
client.load_table_from_dataframe(
57035814
dataframe,
57045815
self.TABLE_REF,

0 commit comments

Comments (0)