From c2c5ba35e3fbb62fddec8307028efb4762909ca9 Mon Sep 17 00:00:00 2001 From: Tomo Suzuki Date: Wed, 18 Feb 2026 15:44:37 -0500 Subject: [PATCH 1/5] chore: replace old partner teams with updated names (#1029) This PR replaces @googleapis/api-bigquery-dataframe with @googleapis/bigquery-dataframe-team and @googleapis/yoshi-python/@googleapis/python-core-client-libraries with @googleapis/cloud-sdk-python-team. b/478003109 --- .github/CODEOWNERS | 8 ++++---- .github/blunderbuss.yml | 12 ++++++------ .repo-metadata.json | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2adbf6bd..2b1fa159 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,8 +5,8 @@ # https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-syntax # Note: This file is autogenerated. To make changes to the codeowner team, please update .repo-metadata.json. -# @googleapis/yoshi-python @googleapis/python-core-client-libraries @googleapis/api-bigquery-dataframe are the default owners for changes in this repo -* @googleapis/yoshi-python @googleapis/python-core-client-libraries @googleapis/api-bigquery-dataframe +# @googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team are the default owners for changes in this repo +* @googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team -# @googleapis/python-samples-reviewers @googleapis/python-core-client-libraries @googleapis/api-bigquery-dataframe are the default owners for samples changes -/samples/ @googleapis/python-samples-reviewers @googleapis/python-core-client-libraries @googleapis/api-bigquery-dataframe +# @googleapis/python-samples-reviewers @googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team are the default owners for samples changes +/samples/ @googleapis/python-samples-reviewers @googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team diff --git a/.github/blunderbuss.yml 
b/.github/blunderbuss.yml index 1aff5244..e13e171a 100644 --- a/.github/blunderbuss.yml +++ b/.github/blunderbuss.yml @@ -4,17 +4,17 @@ # Note: This file is autogenerated. To make changes to the assignee # team, please update `codeowner_team` in `.repo-metadata.json`. assign_issues: - - googleapis/python-core-client-libraries - - googleapis/api-bigquery-dataframe + - googleapis/cloud-sdk-python-team + - googleapis/bigquery-dataframe-team assign_issues_by: - labels: - "samples" to: - googleapis/python-samples-reviewers - - googleapis/python-core-client-libraries - - googleapis/api-bigquery-dataframe + - googleapis/cloud-sdk-python-team + - googleapis/bigquery-dataframe-team assign_prs: - - googleapis/python-core-client-libraries - - googleapis/api-bigquery-dataframe + - googleapis/cloud-sdk-python-team + - googleapis/bigquery-dataframe-team diff --git a/.repo-metadata.json b/.repo-metadata.json index 463c427b..eda7112d 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -11,5 +11,5 @@ "distribution_name": "pandas-gbq", "api_id": "bigquery.googleapis.com", "default_version": "", - "codeowner_team": "@googleapis/python-core-client-libraries @googleapis/api-bigquery-dataframe" + "codeowner_team": "@googleapis/cloud-sdk-python-team @googleapis/bigquery-dataframe-team" } From 72f772ec7b36ba081ed18ef4409f8262a412a1b9 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 20 Feb 2026 10:40:06 -0800 Subject: [PATCH 2/5] docs: Update bigframes links to new homepage (#1028) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-pandas/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- docs/index.rst | 4 ++-- pandas_gbq/core/read.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 14fcde28..130c734b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,8 +23,8 @@ Note: The canonical version of this documentation can always be found on the `BigQuery sandbox `__ to try the service for free. - Also, consider using BigQuery DataFrames - (`bit.ly/bigframes-intro <https://bit.ly/bigframes-intro>`__) + Also, consider using `BigQuery DataFrames + <https://dataframes.bigquery.dev>`__ to process large results with pandas compatible APIs with transparent SQL pushdown to BigQuery engine. This provides an opportunity to save on costs and improve performance. diff --git a/pandas_gbq/core/read.py b/pandas_gbq/core/read.py index bc089002..ecd4e00c 100644 --- a/pandas_gbq/core/read.py +++ b/pandas_gbq/core/read.py @@ -146,7 +146,7 @@ def download_results( num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB warnings.warn( f"Recommendation: Your results are {num_gib:.1f} GiB. " - "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro)" + "Consider using BigQuery DataFrames (https://dataframes.bigquery.dev)" "to process large results with pandas compatible APIs with transparent SQL " "pushdown to BigQuery engine. This provides an opportunity to save on costs " "and improve performance. " From 4ea2d57f45c6685c13b8258923b4326fdd3b71d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 2 Mar 2026 15:29:04 -0600 Subject: [PATCH 3/5] fix(deps): support pandas 3.0 (prerelease) (#998) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-pandas/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- setup.py | 2 +- tests/system/test_to_gbq.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 66d5edf2..f47bf04b 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ "setuptools", "db-dtypes >=1.0.4,<2.0.0", "numpy >=1.18.1", - "pandas >=1.1.4, <3.0.0", + "pandas >=1.1.4", "pyarrow >= 4.0.0", # See https://arrow.apache.org/release/22.0.0.html "pyarrow >= 22.0.0; python_version >= '3.14'", diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index a398b9ad..b2349792 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -155,7 +155,7 @@ def test_series_round_trip( dtype="boolean", ), "object_col": pandas.Series( - [False, None, True], + [False, pandas.NA, True], dtype="object", ), } @@ -365,7 +365,10 @@ def test_series_round_trip( # google-cloud-bigquery versions 1.x and 2.x, but not 3.x. # https://github.com/googleapis/python-bigquery-pandas/issues/365 "datetime_col": [ - datetime.datetime(1, 1, 1), + # CSV loader in BigQuery currently requires leading 0s + # for TIMESTAMP but not DATETIME. See internal issue + # b/467399807. 
+ datetime.datetime(1000, 1, 1), datetime.datetime(1970, 1, 1), datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), ], From 16d08696f251781f5d84c509844ef05f98ec8dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 5 Mar 2026 15:35:15 -0600 Subject: [PATCH 4/5] feat: support biglake tables in pandas_gbq.sample (#1014) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-pandas/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .coveragerc | 4 +- .github/workflows/unittest.yml | 2 +- .gitignore | 1 + noxfile.py | 2 +- pandas_gbq/core/biglake.py | 81 +++++++ pandas_gbq/core/resource_references.py | 67 ++++++ pandas_gbq/core/sample.py | 203 ++++++++++++------ pandas_gbq/gbq.py | 2 +- tests/unit/core/__init__.py | 3 + tests/unit/core/test_biglake.py | 54 +++++ .../core/test_core_resource_references.py | 74 +++++++ tests/unit/{ => core}/test_core_sample.py | 193 ++++++++++++----- tests/unit/test_to_gbq.py | 8 +- 13 files changed, 572 insertions(+), 122 deletions(-) create mode 100644 pandas_gbq/core/biglake.py create mode 100644 pandas_gbq/core/resource_references.py create mode 100644 tests/unit/core/__init__.py create mode 100644 tests/unit/core/test_biglake.py create mode 100644 tests/unit/core/test_core_resource_references.py rename tests/unit/{ => core}/test_core_sample.py (66%) diff --git a/.coveragerc b/.coveragerc index b5577cdd..8b850b94 100644 --- a/.coveragerc +++ b/.coveragerc @@ -14,7 +14,6 @@ # See the License for 
the specific language governing permissions and # limitations under the License. -# Generated by synthtool. DO NOT EDIT! [run] branch = True omit = @@ -22,7 +21,7 @@ omit = google/cloud/__init__.py [report] -fail_under = 96 +fail_under = 95 show_missing = True exclude_lines = # Re-enable the standard pragma @@ -34,6 +33,5 @@ exclude_lines = omit = */gapic/*.py */proto/*.py - */core/*.py */site-packages/*.py google/cloud/__init__.py diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 3a22f126..57ad3b58 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -58,4 +58,4 @@ jobs: run: | find .coverage-results -type f -name '*.zip' -exec unzip {} \; coverage combine .coverage-results/**/.coverage* - coverage report --show-missing --fail-under=96 + coverage report --show-missing --fail-under=95 diff --git a/.gitignore b/.gitignore index d083ea1d..f388882d 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,7 @@ docs.metadata # Virtual environment env/ venv/ +.venv/ # Test logs coverage.xml diff --git a/noxfile.py b/noxfile.py index 8ed65438..8ec19e84 100644 --- a/noxfile.py +++ b/noxfile.py @@ -405,7 +405,7 @@ def cover(session): test runs (not system test runs), and then erases coverage data. """ session.install("coverage", "pytest-cov") - session.run("coverage", "report", "--show-missing", "--fail-under=96") + session.run("coverage", "report", "--show-missing", "--fail-under=95") # Make sure there is no dead code in our test directories. session.run( diff --git a/pandas_gbq/core/biglake.py b/pandas_gbq/core/biglake.py new file mode 100644 index 00000000..63a40147 --- /dev/null +++ b/pandas_gbq/core/biglake.py @@ -0,0 +1,81 @@ +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +""" +Utilities for working with BigLake tables. 
+""" + +# TODO(tswast): Synchronize with bigframes/session/iceberg.py, which uses +# pyiceberg and the BigLake APIs, rather than relying on dry run. + +from __future__ import annotations + +import dataclasses +from typing import Sequence + +import google.cloud.bigquery + +import pandas_gbq.core.resource_references + + +_DRY_RUN_TEMPLATE = """ +SELECT * +FROM `{project}.{catalog}.{namespace}.{table}` +""" + + +_COUNT_TEMPLATE = """ +SELECT COUNT(*) as total_rows +FROM `{project}.{catalog}.{namespace}.{table}` +""" + + +@dataclasses.dataclass(frozen=True) +class BigLakeTableMetadata: + schema: Sequence[google.cloud.bigquery.SchemaField] + num_rows: int + + +def get_table_metadata( + *, + reference: pandas_gbq.core.resource_references.BigLakeTableId, + bqclient: google.cloud.bigquery.Client, +) -> BigLakeTableMetadata: + """ + Get the schema for a BigLake table. + + Currently, this does some BigQuery queries. In the future, we'll want to get + other metadata like the number of rows and storage bytes so that we can do a + more accurate estimate of how many rows to sample. 
+ """ + dry_run_config = google.cloud.bigquery.QueryJobConfig(dry_run=True) + query = _DRY_RUN_TEMPLATE.format( + project=reference.project, + catalog=reference.catalog, + namespace=".".join(reference.namespace), + table=reference.table, + ) + job = bqclient.query(query, job_config=dry_run_config) + job.result() + schema = job.schema + + count_rows = list( + bqclient.query_and_wait( + _COUNT_TEMPLATE.format( + project=reference.project, + catalog=reference.catalog, + namespace=".".join(reference.namespace), + table=reference.table, + ) + ) + ) + assert ( + len(count_rows) == 1 + ), "got unexpected query response when determining number of rows" + total_rows = count_rows[0].total_rows + + return BigLakeTableMetadata( + schema=schema if schema is not None else [], + num_rows=total_rows, + ) diff --git a/pandas_gbq/core/resource_references.py b/pandas_gbq/core/resource_references.py new file mode 100644 index 00000000..d8f93e83 --- /dev/null +++ b/pandas_gbq/core/resource_references.py @@ -0,0 +1,67 @@ +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import dataclasses +import re +from typing import Union + + +_TABLE_REFEREENCE_PATTERN = re.compile( + # In the past, organizations could prefix their project IDs with a domain + # name. Such projects still exist, especially at Google. + r"^(?P<legacy_domain>[^:]+:)?" + r"(?P<project>[^.]+)\." + # Match dataset or catalog + namespace. + # + # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support + # this without catastrophic backtracking by moving the trailing "." to the + # table group. + r"(?P<inner_parts>.*)" + # Table names can't contain ".", as that's used as the separator. + r"\.(?P<table>[^.]+)$" +) + + +@dataclasses.dataclass(frozen=True) +class BigLakeTableId: + project: str + catalog: str + namespace: tuple[str, ...] 
+ table: str + + +@dataclasses.dataclass(frozen=True) +class BigQueryTableId: + project_id: str + dataset_id: str + table_id: str + + +def parse_table_id(table_id: str) -> Union[BigLakeTableId, BigQueryTableId]: + """Turn a string into a BigLakeTableId or BigQueryTableId. + + Raises: + ValueError: If the table ID is invalid. + """ + regex_match = _TABLE_REFEREENCE_PATTERN.match(table_id) + if not regex_match: + raise ValueError(f"Invalid table ID: {table_id}") + + inner_parts = regex_match.group("inner_parts").split(".") + if any(part == "" for part in inner_parts): + raise ValueError(f"Invalid table ID: {table_id}") + + if len(inner_parts) == 1: + return BigQueryTableId( + project_id=regex_match.group("project"), + dataset_id=inner_parts[0], + table_id=regex_match.group("table"), + ) + + return BigLakeTableId( + project=regex_match.group("project"), + catalog=inner_parts[0], + namespace=tuple(inner_parts[1:]), + table=regex_match.group("table"), + ) diff --git a/pandas_gbq/core/sample.py b/pandas_gbq/core/sample.py index 49eee4b5..8268c76e 100644 --- a/pandas_gbq/core/sample.py +++ b/pandas_gbq/core/sample.py @@ -1,11 +1,11 @@ -# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. from __future__ import annotations import typing -from typing import Optional, Sequence, cast +from typing import Optional, Sequence, Union import google.cloud.bigquery import google.cloud.bigquery.table @@ -14,7 +14,9 @@ import pandas_gbq.constants import pandas_gbq.core.read +import pandas_gbq.core.biglake import pandas_gbq.gbq_connector +import pandas_gbq.core.resource_references # Only import at module-level at type checking time to avoid circular # dependencies in the pandas package, which has an optional dependency on @@ -52,7 +54,6 @@ # TODO(tswast): Choose an estimate based on actual BigQuery stats. 
_ARRAY_LENGTH_ESTIMATE = 5 _UNKNOWN_TYPE_SIZE_ESTIMATE = 4 -_MAX_ROW_BYTES = 100 * pandas_gbq.constants.BYTES_IN_MIB _MAX_AUTO_TARGET_BYTES = 1 * pandas_gbq.constants.BYTES_IN_GIB @@ -61,15 +62,15 @@ def _calculate_target_bytes(target_mb: Optional[int]) -> int: return target_mb * pandas_gbq.constants.BYTES_IN_MIB mem = psutil.virtual_memory() - return min(_MAX_AUTO_TARGET_BYTES, max(_MAX_ROW_BYTES, mem.available // 4)) + return min(_MAX_AUTO_TARGET_BYTES, mem.available // 4) def _estimate_limit( *, - target_bytes: int, - table_bytes: Optional[int], - table_rows: Optional[int], fields: Sequence[google.cloud.bigquery.SchemaField], + target_bytes: int, + table_bytes: Optional[int] = None, + table_rows: Optional[int] = None, ) -> int: if table_bytes and table_rows: proportion = target_bytes / table_bytes @@ -118,8 +119,8 @@ def _estimate_row_bytes(fields: Sequence[google.cloud.bigquery.SchemaField]) -> Returns: An integer representing the estimated total row size in logical bytes. """ - total_size = min( - _MAX_ROW_BYTES, + total_size = max( + 1, sum(_estimate_field_bytes(field) for field in fields), ) return total_size @@ -129,7 +130,7 @@ def _download_results_in_parallel( rows: google.cloud.bigquery.table.RowIterator, *, bqclient: google.cloud.bigquery.Client, - progress_bar_type: Optional[str] = None, + progress_bar_type: Union[str, None] = None, use_bqstorage_api: bool = True, ): table_reference = getattr(rows, "_table", None) @@ -156,18 +157,19 @@ def _download_results_in_parallel( def _sample_with_tablesample( - table: google.cloud.bigquery.Table, + table_id: str, *, bqclient: google.cloud.bigquery.Client, proportion: float, target_row_count: int, - progress_bar_type: Optional[str] = None, + progress_bar_type: Union[str, None] = None, use_bqstorage_api: bool = True, ) -> Optional[pandas.DataFrame]: + sample_percent = min(100, max(1, int(proportion * 100))) query = f""" SELECT * - FROM `{table.project}.{table.dataset_id}.{table.table_id}` - TABLESAMPLE SYSTEM 
({float(proportion) * 100.0} PERCENT) + FROM `{table_id}` t + TABLESAMPLE SYSTEM ({sample_percent} PERCENT) ORDER BY RAND() DESC LIMIT {int(target_row_count)}; """ @@ -181,16 +183,16 @@ def _sample_with_tablesample( def _sample_with_limit( - table: google.cloud.bigquery.Table, + table_id: str, *, bqclient: google.cloud.bigquery.Client, target_row_count: int, - progress_bar_type: Optional[str] = None, + progress_bar_type: Union[str, None] = None, use_bqstorage_api: bool = True, ) -> Optional[pandas.DataFrame]: query = f""" SELECT * - FROM `{table.project}.{table.dataset_id}.{table.table_id}` + FROM `{table_id}` ORDER BY RAND() DESC LIMIT {int(target_row_count)}; """ @@ -203,13 +205,121 @@ def _sample_with_limit( ) +def _sample_biglake_table( + *, + reference: pandas_gbq.core.resource_references.BigLakeTableId, + bqclient: google.cloud.bigquery.Client, + target_bytes: int, + progress_bar_type: Union[str, None], + use_bqstorage_api: bool, +) -> Optional[pandas.DataFrame]: + metadata = pandas_gbq.core.biglake.get_table_metadata( + reference=reference, + bqclient=bqclient, + ) + total_rows = metadata.num_rows + + # Avoid divide by 0 when calculating proportions. + if total_rows == 0: + total_rows = 1 + + target_row_count = _estimate_limit( + target_bytes=target_bytes, + fields=metadata.schema, + table_rows=total_rows, + ) + proportion = max(0.01, target_row_count / total_rows) + + # BigLake tables should always support table sample, since they are backed + # by parquet files. 
+ return _sample_with_tablesample( + f"{reference.project}.{reference.catalog}.{'.'.join(reference.namespace)}.{reference.table}", + bqclient=bqclient, + proportion=proportion, + target_row_count=target_row_count, + progress_bar_type=progress_bar_type, + use_bqstorage_api=use_bqstorage_api, + ) + + +def _sample_bq_table( + *, + reference: pandas_gbq.core.resource_references.BigQueryTableId, + bqclient: google.cloud.bigquery.Client, + target_bytes: int, + progress_bar_type: Union[str, None], + use_bqstorage_api: bool, +) -> Optional[pandas.DataFrame]: + table = bqclient.get_table( + google.cloud.bigquery.TableReference( + google.cloud.bigquery.DatasetReference( + reference.project_id, reference.dataset_id + ), + reference.table_id, + ) + ) + num_rows = table.num_rows + num_bytes = table.num_bytes + table_type = table.table_type + + # Some tables such as views report 0 despite actually having rows. + if num_bytes == 0: + num_bytes = None + + # Table is small enough to download the whole thing. + if ( + table_type in _READ_API_ELIGIBLE_TYPES + and num_bytes is not None + and num_bytes <= target_bytes + ): + rows_iter = bqclient.list_rows(table) + return pandas_gbq.core.read.download_results( + rows_iter, + bqclient=bqclient, + progress_bar_type=progress_bar_type, + warn_on_large_results=False, + max_results=None, + user_dtypes=None, + use_bqstorage_api=use_bqstorage_api, + ) + + target_row_count = _estimate_limit( + target_bytes=target_bytes, + table_bytes=num_bytes, + table_rows=num_rows, + fields=table.schema, + ) + + # Table is eligible for TABLESAMPLE. 
+ if num_bytes is not None and table_type in _TABLESAMPLE_ELIGIBLE_TYPES: + proportion = target_bytes / num_bytes + return _sample_with_tablesample( + f"{table.project}.{table.dataset_id}.{table.table_id}", + bqclient=bqclient, + proportion=proportion, + target_row_count=target_row_count, + progress_bar_type=progress_bar_type, + use_bqstorage_api=use_bqstorage_api, + ) + + # Not eligible for TABLESAMPLE or reading directly, so take a random sample + # with a full table scan. + return _sample_with_limit( + f"{table.project}.{table.dataset_id}.{table.table_id}", + bqclient=bqclient, + target_row_count=target_row_count, + progress_bar_type=progress_bar_type, + use_bqstorage_api=use_bqstorage_api, + ) + + def sample( table_id: str, *, target_mb: Optional[int] = None, credentials: Optional[google.oauth2.credentials.Credentials] = None, billing_project_id: Optional[str] = None, - progress_bar_type: Optional[str] = None, + progress_bar_type: Union[str, None] = None, use_bqstorage_api: bool = True, ) -> Optional[pandas.DataFrame]: """Sample a BigQuery table, attempting to limit the amount of data read. @@ -265,59 +375,24 @@ def sample( connector = pandas_gbq.gbq_connector.GbqConnector( project_id=billing_project_id, credentials=credentials ) - credentials = cast(google.oauth2.credentials.Credentials, connector.credentials) bqclient = connector.get_client() - table = bqclient.get_table(table_id) - num_rows = table.num_rows - num_bytes = table.num_bytes - table_type = table.table_type - # Some tables such as views report 0 despite actually having rows. - if num_bytes == 0: - num_bytes = None - - # Table is small enough to download the whole thing. - if ( - table_type in _READ_API_ELIGIBLE_TYPES - and num_bytes is not None - and num_bytes <= target_bytes - ): - rows_iter = bqclient.list_rows(table) - return pandas_gbq.core.read.download_results( - rows_iter, + # BigLake tables can't be read directly by the BQ Storage Read API, so make + # sure we run a query first. 
+ reference = pandas_gbq.core.resource_references.parse_table_id(table_id) + if isinstance(reference, pandas_gbq.core.resource_references.BigLakeTableId): + return _sample_biglake_table( + reference=reference, bqclient=bqclient, + target_bytes=target_bytes, progress_bar_type=progress_bar_type, - warn_on_large_results=False, - max_results=None, - user_dtypes=None, use_bqstorage_api=use_bqstorage_api, ) - - target_row_count = _estimate_limit( - target_bytes=target_bytes, - table_bytes=num_bytes, - table_rows=num_rows, - fields=table.schema, - ) - - # Table is eligible for TABLESAMPLE. - if num_bytes is not None and table_type in _TABLESAMPLE_ELIGIBLE_TYPES: - proportion = target_bytes / num_bytes - return _sample_with_tablesample( - table, + else: + return _sample_bq_table( + reference=reference, bqclient=bqclient, - proportion=proportion, - target_row_count=target_row_count, + target_bytes=target_bytes, progress_bar_type=progress_bar_type, use_bqstorage_api=use_bqstorage_api, ) - - # Not eligible for TABLESAMPLE or reading directly, so take a random sample - # with a full table scan. - return _sample_with_limit( - table, - bqclient=bqclient, - target_row_count=target_row_count, - progress_bar_type=progress_bar_type, - use_bqstorage_api=use_bqstorage_api, - ) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 69aabedb..6bdf475e 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -419,7 +419,7 @@ def to_gbq( destination_table : str Name of table to be written, in the form ``dataset.tablename`` or ``project.dataset.tablename``. - clustering_columns : pandas.Index | Iterable[Hashable], optional + clustering_columns: pandas.Index | Iterable[Hashable] Specifies the columns for clustering in the BigQuery table. time_partitioning_column : str, optional Specifies the column for time-based partitioning in the BigQuery table. 
diff --git a/tests/unit/core/__init__.py b/tests/unit/core/__init__.py new file mode 100644 index 00000000..cff30452 --- /dev/null +++ b/tests/unit/core/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. diff --git a/tests/unit/core/test_biglake.py b/tests/unit/core/test_biglake.py new file mode 100644 index 00000000..7373363b --- /dev/null +++ b/tests/unit/core/test_biglake.py @@ -0,0 +1,54 @@ +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections +from unittest import mock + +import google.cloud.bigquery + +from pandas_gbq.core import biglake +from pandas_gbq.core import resource_references + + +def test_get_table_metadata(mock_bigquery_client): + reference = resource_references.BigLakeTableId( + "my-project", "my-catalog", ("my-schema",), "my-table" + ) + schema = [ + google.cloud.bigquery.SchemaField("col1", "STRING"), + ] + job_mock = mock.create_autospec(google.cloud.bigquery.QueryJob) + job_mock.schema = schema + mock_bigquery_client.query.return_value = job_mock + Row = collections.namedtuple("Row", ["total_rows"]) + mock_bigquery_client.query_and_wait.return_value = [Row(total_rows=123)] + + metadata = biglake.get_table_metadata( + reference=reference, bqclient=mock_bigquery_client + ) + + assert metadata.schema == schema + assert metadata.num_rows == 123 + mock_bigquery_client.query.assert_called_once() + mock_bigquery_client.query_and_wait.assert_called_once() + + +def test_get_table_metadata_no_schema(mock_bigquery_client): + reference = resource_references.BigLakeTableId( + "my-project", "my-catalog", ("my-schema",), "my-table" + ) + job_mock = mock.create_autospec(google.cloud.bigquery.QueryJob) + job_mock.schema = None + mock_bigquery_client.query.return_value = job_mock + Row = 
collections.namedtuple("Row", ["total_rows"]) + mock_bigquery_client.query_and_wait.return_value = [Row(total_rows=456)] + + metadata = biglake.get_table_metadata( + reference=reference, bqclient=mock_bigquery_client + ) + + assert metadata.schema == [] + assert metadata.num_rows == 456 + mock_bigquery_client.query.assert_called_once() + mock_bigquery_client.query_and_wait.assert_called_once() diff --git a/tests/unit/core/test_core_resource_references.py b/tests/unit/core/test_core_resource_references.py new file mode 100644 index 00000000..e8f21f09 --- /dev/null +++ b/tests/unit/core/test_core_resource_references.py @@ -0,0 +1,74 @@ +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import pytest + +from pandas_gbq.core import resource_references + + +@pytest.mark.parametrize( + ["table_id", "expected"], + [ + ( + "my-project.my_dataset.my_table", + resource_references.BigQueryTableId( + project_id="my-project", + dataset_id="my_dataset", + table_id="my_table", + ), + ), + ( + "google.com:my-project.my_dataset.my_table", + resource_references.BigQueryTableId( + project_id="my-project", + dataset_id="my_dataset", + table_id="my_table", + ), + ), + ( + "my-project.my_catalog.my_table", + resource_references.BigQueryTableId( + project_id="my-project", + dataset_id="my_catalog", + table_id="my_table", + ), + ), + ( + "my-project.my_catalog.my_namespace.my_table", + resource_references.BigLakeTableId( + project="my-project", + catalog="my_catalog", + namespace=("my_namespace",), + table="my_table", + ), + ), + ( + "my-project.my_catalog.my_namespace1.my_namespace2.my_table", + resource_references.BigLakeTableId( + project="my-project", + catalog="my_catalog", + namespace=("my_namespace1", "my_namespace2"), + table="my_table", + ), + ), + ], +) +def test_parse_table_id_valid(table_id, expected): + result = resource_references.parse_table_id(table_id) + 
assert result == expected + + +@pytest.mark.parametrize( + "table_id", + [ + "my-project", + "my-project.my_dataset", + ".my_dataset.my_table", + "my-project.my_dataset.", + "my-project..my_table", + ], +) +def test_parse_table_id_invalid(table_id): + with pytest.raises(ValueError): + resource_references.parse_table_id(table_id) diff --git a/tests/unit/test_core_sample.py b/tests/unit/core/test_core_sample.py similarity index 66% rename from tests/unit/test_core_sample.py rename to tests/unit/core/test_core_sample.py index 5e5a15e7..44baea29 100644 --- a/tests/unit/test_core_sample.py +++ b/tests/unit/core/test_core_sample.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 pandas-gbq Authors All rights reserved. +# Copyright (c) 2026 pandas-gbq Authors All rights reserved. # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. @@ -6,6 +6,7 @@ from unittest import mock import google.cloud.bigquery +import google.cloud.bigquery.table import pytest import pandas_gbq.constants @@ -71,17 +72,9 @@ ), # 0 google.cloud.bigquery.SchemaField("simple_int", "INT64"), # 8 ], - 8, # 0 + 8 + 9, # 1 + 8 id="empty-struct", ), - pytest.param( - [ - google.cloud.bigquery.SchemaField("bytes", "BYTES"), - ] - * 9_999, - pandas_gbq.core.sample._MAX_ROW_BYTES, - id="many-bytes", - ), # Case 8: Complex Mix (Combining multiple cases) pytest.param( [ @@ -127,21 +120,21 @@ def test_calculate_target_bytes_with_available_memory(mock_virtual_memory): available_memory = 2 * pandas_gbq.constants.BYTES_IN_GIB # 2 GB mock_virtual_memory.return_value = mock.Mock(available=available_memory) - # Expected bytes is available memory / 4, as it falls between _MAX_ROW_BYTES and _MAX_AUTO_TARGET_BYTES + # Expected bytes is available memory / 4. 
expected_bytes = available_memory // 4 actual_bytes = pandas_gbq.core.sample._calculate_target_bytes(None) assert actual_bytes == expected_bytes @mock.patch("psutil.virtual_memory") -def test_calculate_target_bytes_low_memory_uses_max_row_bytes(mock_virtual_memory): +def test_calculate_target_bytes_low_memory(mock_virtual_memory): # Mock psutil.virtual_memory to return a mock object with an 'available' attribute. # Set available memory to a low value. available_memory = 100 # 100 bytes mock_virtual_memory.return_value = mock.Mock(available=available_memory) - # Expected bytes should be _MAX_ROW_BYTES because available // 4 is less. - expected_bytes = pandas_gbq.core.sample._MAX_ROW_BYTES + # Expected bytes should be low value // 4. + expected_bytes = 25 actual_bytes = pandas_gbq.core.sample._calculate_target_bytes(None) assert actual_bytes == expected_bytes @@ -206,17 +199,52 @@ def test_estimate_limit(target_bytes, table_bytes, table_rows, fields, expected_ @mock.patch("pandas_gbq.core.read.download_results") -def test_sample_with_tablesample(mock_download_results, mock_bigquery_client): - mock_table = mock.Mock(spec=google.cloud.bigquery.Table) - mock_table.project = "test-project" - mock_table.dataset_id = "test_dataset" - mock_table.table_id = "test_table" +def test_download_results_in_parallel_with_table( + mock_download_results, mock_bigquery_client +): + rows = mock.Mock(spec=google.cloud.bigquery.table.RowIterator) + rows._table = "table" + rows._schema = "schema" + pandas_gbq.core.sample._download_results_in_parallel( + rows, bqclient=mock_bigquery_client + ) + mock_bigquery_client.list_rows.assert_called_once_with( + "table", selected_fields="schema" + ) + mock_download_results.assert_called_once() + +@mock.patch("pandas_gbq.core.read.download_results") +def test_download_results_in_parallel_no_table( + mock_download_results, mock_bigquery_client +): + rows = mock.Mock(spec=google.cloud.bigquery.table.RowIterator) + rows._table = None + rows._schema = 
None + pandas_gbq.core.sample._download_results_in_parallel( + rows, bqclient=mock_bigquery_client + ) + mock_bigquery_client.list_rows.assert_not_called() + mock_download_results.assert_called_once_with( + rows, + bqclient=mock_bigquery_client, + progress_bar_type=None, + warn_on_large_results=False, + max_results=None, + user_dtypes=None, + use_bqstorage_api=True, + ) + + +@mock.patch("pandas_gbq.core.sample._download_results_in_parallel") +def test_sample_with_tablesample( + mock_download_results_in_parallel, mock_bigquery_client +): proportion = 0.1 target_row_count = 100 pandas_gbq.core.sample._sample_with_tablesample( - mock_table, + "test-project.test_dataset.test_table", bqclient=mock_bigquery_client, proportion=proportion, target_row_count=target_row_count, @@ -224,27 +252,27 @@ def test_sample_with_tablesample(mock_download_results, mock_bigquery_client): mock_bigquery_client.query_and_wait.assert_called_once() query = mock_bigquery_client.query_and_wait.call_args[0][0] - assert "TABLESAMPLE SYSTEM (10.0 PERCENT)" in query + assert "TABLESAMPLE SYSTEM (10 PERCENT)" in query assert "LIMIT 100" in query - assert ( - f"FROM `{mock_table.project}.{mock_table.dataset_id}.{mock_table.table_id}`" - in query - ) - - mock_download_results.assert_called_once() + assert "FROM `test-project.test_dataset.test_table`" in query + # The mock for query_and_wait returns a mock RowIterator, which is then + # passed to _download_results_in_parallel. 
+ mock_results = mock_bigquery_client.query_and_wait.return_value + mock_download_results_in_parallel.assert_called_with( + mock_results, + bqclient=mock_bigquery_client, + progress_bar_type=None, + use_bqstorage_api=True, + ) -@mock.patch("pandas_gbq.core.read.download_results") -def test_sample_with_limit(mock_download_results, mock_bigquery_client): - mock_table = mock.Mock(spec=google.cloud.bigquery.Table) - mock_table.project = "test-project" - mock_table.dataset_id = "test_dataset" - mock_table.table_id = "test_table" +@mock.patch("pandas_gbq.core.sample._download_results_in_parallel") +def test_sample_with_limit(mock_download_results_in_parallel, mock_bigquery_client): target_row_count = 200 pandas_gbq.core.sample._sample_with_limit( - mock_table, + "test-project.test_dataset.test_table", bqclient=mock_bigquery_client, target_row_count=target_row_count, ) @@ -253,12 +281,17 @@ def test_sample_with_limit(mock_download_results, mock_bigquery_client): query = mock_bigquery_client.query_and_wait.call_args[0][0] assert "TABLESAMPLE" not in query assert "LIMIT 200" in query - assert ( - f"FROM `{mock_table.project}.{mock_table.dataset_id}.{mock_table.table_id}`" - in query - ) + assert "FROM `test-project.test_dataset.test_table`" in query - mock_download_results.assert_called_once() + # The mock for query_and_wait returns a mock RowIterator, which is then + # passed to _download_results_in_parallel. 
+ mock_results = mock_bigquery_client.query_and_wait.return_value + mock_download_results_in_parallel.assert_called_with( + mock_results, + bqclient=mock_bigquery_client, + progress_bar_type=None, + use_bqstorage_api=True, + ) @pytest.fixture @@ -270,15 +303,70 @@ def mock_gbq_connector(mock_bigquery_client): yield mock_connector +@mock.patch("pandas_gbq.core.biglake.get_table_metadata") +@mock.patch("pandas_gbq.core.sample._sample_with_tablesample") +def test_sample_biglake_table( + mock_sample_with_tablesample, + mock_get_table_metadata, + mock_gbq_connector, + mock_bigquery_client, +): + mock_metadata = mock.Mock() + mock_metadata.num_rows = 1000 + mock_metadata.schema = [google.cloud.bigquery.SchemaField("col1", "INT64")] + mock_get_table_metadata.return_value = mock_metadata + table_id = "p.c.d.t" + + with mock.patch( + "pandas_gbq.core.sample._calculate_target_bytes", return_value=1000 + ): + pandas_gbq.core.sample.sample(table_id, billing_project_id="p") + + mock_sample_with_tablesample.assert_called_once() + # 1000 target bytes / 8 bytes/row in schema = 125 target_row_count + # 125 / 1000 rows = 0.125 proportion. 
+ # min(100, max(1, 0.125 * 100)) = 12.5 -> 12 + # So proportion should be about 0.125 + # Let's check the args + args, kwargs = mock_sample_with_tablesample.call_args + assert kwargs["proportion"] > 0.1 + assert kwargs["target_row_count"] == 125 + + +@mock.patch("pandas_gbq.core.biglake.get_table_metadata") +@mock.patch("pandas_gbq.core.sample._sample_with_tablesample") +def test_sample_biglake_table_zero_rows( + mock_sample_with_tablesample, + mock_get_table_metadata, + mock_gbq_connector, + mock_bigquery_client, +): + mock_metadata = mock.Mock() + mock_metadata.num_rows = 0 + mock_metadata.schema = [google.cloud.bigquery.SchemaField("col1", "INT64")] + mock_get_table_metadata.return_value = mock_metadata + table_id = "p.c.d.t" + + with mock.patch( + "pandas_gbq.core.sample._calculate_target_bytes", return_value=1000 + ): + pandas_gbq.core.sample.sample(table_id, billing_project_id="p") + + mock_sample_with_tablesample.assert_called_once() + args, kwargs = mock_sample_with_tablesample.call_args + # Avoid division by zero + assert kwargs["proportion"] > 0.0 + + @mock.patch("pandas_gbq.core.read.download_results") def test_sample_small_table_downloads_all( mock_download_results, mock_gbq_connector, mock_bigquery_client ): mock_table = mock.Mock(spec=google.cloud.bigquery.Table) - type(mock_table).table_type = mock.PropertyMock(return_value="TABLE") - type(mock_table).num_bytes = mock.PropertyMock(return_value=1000) - type(mock_table).num_rows = mock.PropertyMock(return_value=10) - type(mock_table).schema = mock.PropertyMock(return_value=[]) + type(mock_table).table_type = "TABLE" + type(mock_table).num_bytes = 1000 + type(mock_table).num_rows = 10 + type(mock_table).schema = [] mock_bigquery_client.get_table.return_value = mock_table with mock.patch( @@ -297,12 +385,13 @@ def test_sample_uses_tablesample( mock_sample_with_tablesample, mock_gbq_connector, mock_bigquery_client ): mock_table = mock.Mock(spec=google.cloud.bigquery.Table) - type(mock_table).table_type = 
mock.PropertyMock(return_value="TABLE") - type(mock_table).num_bytes = mock.PropertyMock(return_value=1_000_000_000_000) - type(mock_table).num_rows = mock.PropertyMock(return_value=1_000) - type(mock_table).schema = mock.PropertyMock( - return_value=[google.cloud.bigquery.SchemaField("col1", "INT64")] - ) + type(mock_table).project = "my-project" + type(mock_table).dataset_id = "my_dataset" + type(mock_table).table_id = "my_table" + type(mock_table).table_type = "TABLE" + type(mock_table).num_bytes = 1_000_000_000_000 + type(mock_table).num_rows = 1_000 + type(mock_table).schema = [google.cloud.bigquery.SchemaField("col1", "INT64")] mock_bigquery_client.get_table.return_value = mock_table pandas_gbq.core.sample.sample("my-project.my_dataset.my_table", target_mb=1) @@ -315,6 +404,9 @@ def test_sample_uses_limit_fallback( mock_sample_with_limit, mock_gbq_connector, mock_bigquery_client ): mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + mock_table.project = "my-project" + mock_table.dataset_id = "my_dataset" + mock_table.table_id = "my_table" mock_table.num_bytes = 10000 mock_table.num_rows = 100 mock_table.table_type = "VIEW" # Not eligible for TABLESAMPLE @@ -334,6 +426,9 @@ def test_sample_uses_limit_fallback_no_bytes( mock_sample_with_limit, mock_gbq_connector, mock_bigquery_client ): mock_table = mock.Mock(spec=google.cloud.bigquery.Table) + mock_table.project = "my-project" + mock_table.dataset_id = "my_dataset" + mock_table.table_id = "my_table" mock_table.num_bytes = None # num_bytes can be None mock_table.num_rows = 100 mock_table.table_type = "TABLE" diff --git a/tests/unit/test_to_gbq.py b/tests/unit/test_to_gbq.py index 6f00ccf8..1736dab5 100644 --- a/tests/unit/test_to_gbq.py +++ b/tests/unit/test_to_gbq.py @@ -188,6 +188,7 @@ def test_to_gbq_with_if_exists_unknown(): ) +@mock.patch.dict(os.environ, {}, clear=True) @pytest.mark.parametrize( "user_agent,rfc9110_delimiter,expected", [ @@ -216,7 +217,8 @@ def test_create_user_agent(user_agent, 
rfc9110_delimiter, expected): def test_create_user_agent_vscode(): from pandas_gbq.gbq_connector import create_user_agent - assert create_user_agent() == f"pandas-{pd.__version__} vscode" + result = create_user_agent() + assert f"pandas-{pd.__version__} vscode" in result @mock.patch.dict(os.environ, {"VSCODE_PID": "1234"}, clear=True) @@ -239,9 +241,9 @@ def test_create_user_agent_vscode_plugin(): f.write("{}") with mock.patch("pathlib.Path.home", return_value=user_home): + result = create_user_agent() assert ( - create_user_agent() - == f"pandas-{pd.__version__} vscode googlecloudtools.cloudcode" + f"pandas-{pd.__version__} vscode googlecloudtools.cloudcode" in result ) From 746c61a0edc84149592a8b7f589e839a025111fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 5 Mar 2026 16:13:53 -0600 Subject: [PATCH 5/5] chore: librarian release pull request: 20260305T213939Z (#1033) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.8.3 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620
pandas-gbq: v0.34.0

## [v0.34.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.33.0...v0.34.0) (2026-03-05)

### Features

* support biglake tables in pandas_gbq.sample (#1014) ([16d08696](https://github.com/googleapis/python-bigquery-pandas/commit/16d08696))

### Bug Fixes

* support pandas 3.0 (prerelease) (#998) ([4ea2d57f](https://github.com/googleapis/python-bigquery-pandas/commit/4ea2d57f))

### Documentation

* Update bigframes links to new homepage (#1028) ([72f772ec](https://github.com/googleapis/python-bigquery-pandas/commit/72f772ec))
--- .librarian/state.yaml | 2 +- CHANGELOG.md | 17 +++++++++++++++++ pandas_gbq/version.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 6a632065..a8754e62 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:c8612d3fffb3f6a32353b2d1abd16b61e87811866f7ec9d65b59b02eb452a620 libraries: - id: pandas-gbq - version: 0.33.0 + version: 0.34.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index f9ddd457..506b33ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,23 @@ [1]: https://pypi.org/project/pandas-gbq/#history +## [0.34.0](https://github.com/googleapis/google-cloud-python/compare/pandas-gbq-v0.33.0...pandas-gbq-v0.34.0) (2026-03-05) + + +### Documentation + +* Update bigframes links to new homepage (#1028) ([72f772ec7b36ba081ed18ef4409f8262a412a1b9](https://github.com/googleapis/google-cloud-python/commit/72f772ec7b36ba081ed18ef4409f8262a412a1b9)) + + +### Features + +* support biglake tables in pandas_gbq.sample (#1014) ([16d08696f251781f5d84c509844ef05f98ec8dd0](https://github.com/googleapis/google-cloud-python/commit/16d08696f251781f5d84c509844ef05f98ec8dd0)) + + +### Bug Fixes + +* support pandas 3.0 (prerelease) (#998) ([4ea2d57f45c6685c13b8258923b4326fdd3b71d1](https://github.com/googleapis/google-cloud-python/commit/4ea2d57f45c6685c13b8258923b4326fdd3b71d1)) + ## [0.33.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.32.0...v0.33.0) (2026-01-05) diff --git a/pandas_gbq/version.py b/pandas_gbq/version.py index 38cee1fb..df2e1fc4 100644 --- a/pandas_gbq/version.py +++ b/pandas_gbq/version.py @@ -2,4 +2,4 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -__version__ = "0.33.0" +__version__ = "0.34.0"