Skip to content

Commit 119f19b

Browse files
committed
feat(bigquery): add create_bqstorage_client param to to_dataframe and to_arrow
When the `create_bqstorage_client` parameter is set to `True`, the BigQuery client constructs a BigQuery Storage API client for you. This removes the need for boilerplate code to manually construct both clients explicitly with the same credentials. Does this make the `bqstorage_client` parameter unnecessary? In most cases, yes, but there are a few cases where we'll want to continue using it. * When partner tools use `to_dataframe`, they should continue to use `bqstorage_client` so that they can set the correct amended user-agent strings. * When a developer needs to override the default API endpoint for the BQ Storage API, they'll need to manually supply a `bqstorage_client`.
1 parent 3e8fbae commit 119f19b

File tree

9 files changed

+304
-8
lines changed

9 files changed

+304
-8
lines changed

bigquery/google/cloud/bigquery/client.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,19 @@ def dataset(self, dataset_id, project=None):
341341

342342
return DatasetReference(project, dataset_id)
343343

344+
def _create_bqstorage_client(self):
    """Construct a BigQuery Storage API client from this client's credentials.

    Returns:
        google.cloud.bigquery_storage_v1beta1.BigQueryStorageClient:
            A BigQuery Storage API client.
    """
    # Imported lazily so that google-cloud-bigquery-storage remains an
    # optional dependency of this package.
    from google.cloud import bigquery_storage_v1beta1

    storage_client = bigquery_storage_v1beta1.BigQueryStorageClient(
        credentials=self._credentials
    )
    return storage_client
356+
344357
def create_dataset(self, dataset, exists_ok=False, retry=DEFAULT_RETRY):
345358
"""API call: create the dataset via a POST request.
346359

bigquery/google/cloud/bigquery/job.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3109,7 +3109,12 @@ def result(
31093109

31103110
# If changing the signature of this method, make sure to apply the same
31113111
# changes to table.RowIterator.to_arrow()
3112-
def to_arrow(self, progress_bar_type=None, bqstorage_client=None):
3112+
def to_arrow(
3113+
self,
3114+
progress_bar_type=None,
3115+
bqstorage_client=None,
3116+
create_bqstorage_client=False,
3117+
):
31133118
"""[Beta] Create a class:`pyarrow.Table` by loading all pages of a
31143119
table or query.
31153120
@@ -3142,6 +3147,16 @@ def to_arrow(self, progress_bar_type=None, bqstorage_client=None):
31423147
31433148
Reading from a specific partition or snapshot is not
31443149
currently supported by this method.
3150+
create_bqstorage_client (bool):
3151+
**Beta Feature** Optional. If ``True``, create a BigQuery
3152+
Storage API client using the default API settings. The
3153+
BigQuery Storage API is a faster way to fetch rows from
3154+
BigQuery. See the ``bqstorage_client`` parameter for more
3155+
information.
3156+
3157+
This argument does nothing if ``bqstorage_client`` is supplied.
3158+
3159+
..versionadded:: 1.22.0
31453160
31463161
Returns:
31473162
pyarrow.Table
@@ -3156,12 +3171,20 @@ def to_arrow(self, progress_bar_type=None, bqstorage_client=None):
31563171
..versionadded:: 1.17.0
31573172
"""
31583173
return self.result().to_arrow(
3159-
progress_bar_type=progress_bar_type, bqstorage_client=bqstorage_client
3174+
progress_bar_type=progress_bar_type,
3175+
bqstorage_client=bqstorage_client,
3176+
create_bqstorage_client=create_bqstorage_client,
31603177
)
31613178

31623179
# If changing the signature of this method, make sure to apply the same
31633180
# changes to table.RowIterator.to_dataframe()
3164-
def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=None):
3181+
def to_dataframe(
3182+
self,
3183+
bqstorage_client=None,
3184+
dtypes=None,
3185+
progress_bar_type=None,
3186+
create_bqstorage_client=False,
3187+
):
31653188
"""Return a pandas DataFrame from a QueryJob
31663189
31673190
Args:
@@ -3194,6 +3217,16 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=Non
31943217
for details.
31953218
31963219
..versionadded:: 1.11.0
3220+
create_bqstorage_client (bool):
3221+
**Beta Feature** Optional. If ``True``, create a BigQuery
3222+
Storage API client using the default API settings. The
3223+
BigQuery Storage API is a faster way to fetch rows from
3224+
BigQuery. See the ``bqstorage_client`` parameter for more
3225+
information.
3226+
3227+
This argument does nothing if ``bqstorage_client`` is supplied.
3228+
3229+
..versionadded:: 1.22.0
31973230
31983231
Returns:
31993232
A :class:`~pandas.DataFrame` populated with row data and column
@@ -3207,6 +3240,7 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=Non
32073240
bqstorage_client=bqstorage_client,
32083241
dtypes=dtypes,
32093242
progress_bar_type=progress_bar_type,
3243+
create_bqstorage_client=create_bqstorage_client,
32103244
)
32113245

32123246
def __iter__(self):

bigquery/google/cloud/bigquery/table.py

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,7 +1440,12 @@ def _to_arrow_iterable(self, bqstorage_client=None):
14401440

14411441
# If changing the signature of this method, make sure to apply the same
14421442
# changes to job.QueryJob.to_arrow()
1443-
def to_arrow(self, progress_bar_type=None, bqstorage_client=None):
1443+
def to_arrow(
1444+
self,
1445+
progress_bar_type=None,
1446+
bqstorage_client=None,
1447+
create_bqstorage_client=False,
1448+
):
14441449
"""[Beta] Create a class:`pyarrow.Table` by loading all pages of a
14451450
table or query.
14461451
@@ -1473,6 +1478,16 @@ def to_arrow(self, progress_bar_type=None, bqstorage_client=None):
14731478
14741479
Reading from a specific partition or snapshot is not
14751480
currently supported by this method.
1481+
create_bqstorage_client (bool):
1482+
**Beta Feature** Optional. If ``True``, create a BigQuery
1483+
Storage API client using the default API settings. The
1484+
BigQuery Storage API is a faster way to fetch rows from
1485+
BigQuery. See the ``bqstorage_client`` parameter for more
1486+
information.
1487+
1488+
This argument does nothing if ``bqstorage_client`` is supplied.
1489+
1490+
..versionadded:: 1.22.0
14761491
14771492
Returns:
14781493
pyarrow.Table
@@ -1488,6 +1503,9 @@ def to_arrow(self, progress_bar_type=None, bqstorage_client=None):
14881503
if pyarrow is None:
14891504
raise ValueError(_NO_PYARROW_ERROR)
14901505

1506+
if not bqstorage_client and create_bqstorage_client:
1507+
bqstorage_client = self.client._create_bqstorage_client()
1508+
14911509
progress_bar = self._get_progress_bar(progress_bar_type)
14921510

14931511
record_batches = []
@@ -1542,14 +1560,20 @@ def _to_dataframe_iterable(self, bqstorage_client=None, dtypes=None):
15421560

15431561
# If changing the signature of this method, make sure to apply the same
15441562
# changes to job.QueryJob.to_dataframe()
1545-
def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=None):
1563+
def to_dataframe(
1564+
self,
1565+
bqstorage_client=None,
1566+
dtypes=None,
1567+
progress_bar_type=None,
1568+
create_bqstorage_client=False,
1569+
):
15461570
"""Create a pandas DataFrame by loading all pages of a query.
15471571
15481572
Args:
15491573
bqstorage_client (google.cloud.bigquery_storage_v1beta1.BigQueryStorageClient):
15501574
**Beta Feature** Optional. A BigQuery Storage API client. If
15511575
supplied, use the faster BigQuery Storage API to fetch rows
1552-
from BigQuery. This API is a billable API.
1576+
from BigQuery.
15531577
15541578
This method requires the ``pyarrow`` and
15551579
``google-cloud-bigquery-storage`` libraries.
@@ -1586,6 +1610,16 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=Non
15861610
progress bar as a graphical dialog box.
15871611
15881612
..versionadded:: 1.11.0
1613+
create_bqstorage_client (bool):
1614+
**Beta Feature** Optional. If ``True``, create a BigQuery
1615+
Storage API client using the default API settings. The
1616+
BigQuery Storage API is a faster way to fetch rows from
1617+
BigQuery. See the ``bqstorage_client`` parameter for more
1618+
information.
1619+
1620+
This argument does nothing if ``bqstorage_client`` is supplied.
1621+
1622+
..versionadded:: 1.22.0
15891623
15901624
Returns:
15911625
pandas.DataFrame:
@@ -1605,6 +1639,9 @@ def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=Non
16051639
if dtypes is None:
16061640
dtypes = {}
16071641

1642+
if not bqstorage_client and create_bqstorage_client:
1643+
bqstorage_client = self.client._create_bqstorage_client()
1644+
16081645
if bqstorage_client and self.max_results is not None:
16091646
warnings.warn(
16101647
"Cannot use bqstorage_client if max_results is set, "
@@ -1651,11 +1688,18 @@ class _EmptyRowIterator(object):
16511688
pages = ()
16521689
total_rows = 0
16531690

1654-
def to_arrow(self, progress_bar_type=None):
1691+
def to_arrow(
1692+
self,
1693+
progress_bar_type=None,
1694+
bqstorage_client=None,
1695+
create_bqstorage_client=False,
1696+
):
16551697
"""[Beta] Create an empty class:`pyarrow.Table`.
16561698
16571699
Args:
16581700
progress_bar_type (Optional[str]): Ignored. Added for compatibility with RowIterator.
1701+
bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
1702+
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
16591703
16601704
Returns:
16611705
pyarrow.Table: An empty :class:`pyarrow.Table`.
@@ -1664,13 +1708,20 @@ def to_arrow(self, progress_bar_type=None):
16641708
raise ValueError(_NO_PYARROW_ERROR)
16651709
return pyarrow.Table.from_arrays(())
16661710

1667-
def to_dataframe(self, bqstorage_client=None, dtypes=None, progress_bar_type=None):
1711+
def to_dataframe(
1712+
self,
1713+
bqstorage_client=None,
1714+
dtypes=None,
1715+
progress_bar_type=None,
1716+
create_bqstorage_client=False,
1717+
):
16681718
"""Create an empty dataframe.
16691719
16701720
Args:
16711721
bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
16721722
dtypes (Any): Ignored. Added for compatibility with RowIterator.
16731723
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
1724+
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
16741725
16751726
Returns:
16761727
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright 2019 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def download_public_data(client):
    """Download a full public table into a pandas DataFrame via list_rows."""
    # [START bigquery_pandas_public_data]
    # TODO(developer): Import the client library.
    # from google.cloud import bigquery

    # TODO(developer): Construct a BigQuery client object.
    # client = bigquery.Client()

    # TODO(developer): Set table_id to the fully-qualified table ID in standard
    # SQL format, including the project ID and dataset ID.
    table_id = "bigquery-public-data.usa_names.usa_1910_current"

    # Use the BigQuery Storage API to speed-up downloads of large tables.
    rows = client.list_rows(table_id)
    dataframe = rows.to_dataframe(create_bqstorage_client=True)

    print(dataframe.info())
    # [END bigquery_pandas_public_data]
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright 2019 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def download_public_data_sandbox(client):
    """Download query results over the BigQuery Storage API (Sandbox-friendly)."""
    # [START bigquery_pandas_public_data_sandbox]
    # TODO(developer): Import the client library.
    # from google.cloud import bigquery

    # TODO(developer): Construct a BigQuery client object.
    # client = bigquery.Client()

    # `SELECT *` is an anti-pattern in BigQuery because it is cheaper and
    # faster to use the BigQuery Storage API directly, but BigQuery Sandbox
    # users can only use the BigQuery Storage API to download query results.
    query_string = "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_current`"

    # Use the BigQuery Storage API to speed-up downloads of large tables.
    query_job = client.query(query_string)
    dataframe = query_job.to_dataframe(create_bqstorage_client=True)

    print(dataframe.info())
    # [END bigquery_pandas_public_data_sandbox]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright 2019 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from .. import download_public_data
16+
17+
18+
def test_download_public_data(capsys, client):
    """Run the sample and check that the expected columns appear in its output."""
    download_public_data.download_public_data(client)

    out, _ = capsys.readouterr()
    # `DataFrame.info()` output should mention each column of the table.
    for column in ("year", "gender", "name"):
        assert column in out
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright 2019 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from .. import download_public_data_sandbox
16+
17+
18+
def test_download_public_data_sandbox(capsys, client):
    """Run the sandbox sample and check the expected columns appear in its output."""
    download_public_data_sandbox.download_public_data_sandbox(client)

    out, _ = capsys.readouterr()
    # `DataFrame.info()` output should mention each column of the query result.
    for column in ("year", "gender", "name"):
        assert column in out

bigquery/tests/unit/test_client.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@
4545
import google.cloud._helpers
4646
from google.cloud import bigquery_v2
4747
from google.cloud.bigquery.dataset import DatasetReference
48+
49+
try:
50+
from google.cloud import bigquery_storage_v1beta1
51+
except (ImportError, AttributeError): # pragma: NO COVER
52+
bigquery_storage_v1beta1 = None
4853
from tests.unit.helpers import make_connection
4954

5055

@@ -531,6 +536,26 @@ def test_get_dataset(self):
531536
)
532537
self.assertEqual(dataset.dataset_id, self.DS_ID)
533538

539+
@unittest.skipIf(
    bigquery_storage_v1beta1 is None, "Requires `google-cloud-bigquery-storage`"
)
def test_create_bqstorage_client(self):
    """_create_bqstorage_client builds a BQ Storage client with the same credentials."""
    # Stub the BQ Storage client class so no real network client is built.
    fake_instance = object()
    mock_client_class = mock.create_autospec(
        bigquery_storage_v1beta1.BigQueryStorageClient
    )
    mock_client_class.return_value = fake_instance

    creds = _make_credentials()
    client = self._make_one(project=self.PROJECT, credentials=creds)

    patch_target = "google.cloud.bigquery_storage_v1beta1.BigQueryStorageClient"
    with mock.patch(patch_target, mock_client_class):
        result = client._create_bqstorage_client()

    self.assertIs(result, fake_instance)
    # The storage client must be constructed with this client's credentials.
    mock_client_class.assert_called_once_with(credentials=creds)
558+
534559
def test_create_dataset_minimal(self):
535560
from google.cloud.bigquery.dataset import Dataset
536561

0 commit comments

Comments
 (0)