From 62d83f3d89b5603396831353682018626098e7cd Mon Sep 17 00:00:00 2001
From: Jennifer Davis
Date: Tue, 31 Mar 2026 17:26:00 -0700
Subject: [PATCH] feat(bigquery/dataframes): add BigFrames samples and test configuration

Signed-off-by: Jennifer Davis
---
 bigquery/dataframes/noxfile_config.py         | 31 +++++++++
 bigquery/dataframes/read_gbq_function.py      | 51 +++++++++++++++
 bigquery/dataframes/read_gbq_function_test.py | 64 +++++++++++++++++++
 bigquery/dataframes/requirements-test.txt     |  5 ++
 bigquery/dataframes/requirements.txt          |  3 +
 bigquery/dataframes/sql_scalar.py             | 50 +++++++++++++++
 bigquery/dataframes/sql_scalar_test.py        | 30 +++++++++
 7 files changed, 234 insertions(+)
 create mode 100644 bigquery/dataframes/noxfile_config.py
 create mode 100644 bigquery/dataframes/read_gbq_function.py
 create mode 100644 bigquery/dataframes/read_gbq_function_test.py
 create mode 100644 bigquery/dataframes/requirements-test.txt
 create mode 100644 bigquery/dataframes/requirements.txt
 create mode 100644 bigquery/dataframes/sql_scalar.py
 create mode 100644 bigquery/dataframes/sql_scalar_test.py

diff --git a/bigquery/dataframes/noxfile_config.py b/bigquery/dataframes/noxfile_config.py
new file mode 100644
index 00000000000..ae46e2a97f5
--- /dev/null
+++ b/bigquery/dataframes/noxfile_config.py
@@ -0,0 +1,31 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TEST_CONFIG_OVERRIDE = {
+    # Opt out of testing for the Python versions listed here. 3.9 is skipped due
+    # to a pyarrow compilation failure. NOTE(review): no reason is given for 3.11.
+    "ignored_versions": ["2.7", "3.6", "3.9", "3.11"],
+    # Old samples are opted out of enforcing Python type hints, but all new
+    # samples should feature them, so enforcement is switched on here.
+    "enforce_type_hints": True,
+    # An envvar key for determining the project id to use. Change it
+    # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
+    # build specific Cloud project. You can also use your own string
+    # to use your own Cloud project.
+    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
+    # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
+    # A dictionary you want to inject into your test. Don't put any
+    # secrets here. These values will override predefined values.
+    "envs": {},
+}
diff --git a/bigquery/dataframes/read_gbq_function.py b/bigquery/dataframes/read_gbq_function.py
new file mode 100644
index 00000000000..38e323ef72a
--- /dev/null
+++ b/bigquery/dataframes/read_gbq_function.py
@@ -0,0 +1,51 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Registers and applies existing BigQuery User-Defined Functions (UDFs) to DataFrames.
+
+Enables the reuse of existing BigQuery SQL or JavaScript UDFs as callable
+objects within a BigQuery DataFrames session.
+"""
+
+# [START bigquery_dataframes_read_gbq_function]
+import bigframes.pandas as bpd
+
+
+def use_read_gbq_function(project_id: str, function_id: str) -> None:
+    bpd.options.bigquery.project = project_id
+    bpd.options.bigquery.location = "US"
+
+    # Register an existing BigQuery UDF so it can be called from the DataFrame API.
+    # The function must have an explicit return type in its BigQuery definition.
+    # In production, use functions deployed to your own project for stability.
+    extract_title = bpd.read_gbq_function(function_id)
+
+    df = bpd.DataFrame(
+        {
+            "book_xml": [
+                "<book><title>The Great Gatsby</title></book>",
+                "<book><title>1984</title></book>",
+                "<book><title>Brave New World</title></book>",
+            ]
+        }
+    )
+
+    # Use apply to call the registered BigQuery function for each row.
+    # This executes the logic in BigQuery rather than locally.
+    df["title"] = df["book_xml"].apply(extract_title)
+
+    print(df[["title"]].to_pandas())
+
+
+# [END bigquery_dataframes_read_gbq_function]
diff --git a/bigquery/dataframes/read_gbq_function_test.py b/bigquery/dataframes/read_gbq_function_test.py
new file mode 100644
index 00000000000..63c8e148a6e
--- /dev/null
+++ b/bigquery/dataframes/read_gbq_function_test.py
@@ -0,0 +1,64 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Generator
+
+from google.cloud import bigquery
+import pytest
+import test_utils.prefixer
+
+import read_gbq_function
+
+PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
+prefixer = test_utils.prefixer.Prefixer("python-docs-samples", "bigquery/dataframes")
+
+
+@pytest.fixture(scope="module")
+def bq_client() -> bigquery.Client:
+    return bigquery.Client(project=PROJECT_ID)
+
+
+@pytest.fixture(scope="module")
+def dataset_id(bq_client: bigquery.Client) -> Generator[str, None, None]:
+    dataset_name = prefixer.create_prefix().replace("-", "_")
+    dataset_id = f"{PROJECT_ID}.{dataset_name}"
+    dataset = bigquery.Dataset(dataset_id)
+    dataset.location = "US"
+    bq_client.create_dataset(dataset)
+    yield dataset_id
+    bq_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
+
+
+@pytest.fixture(scope="module")
+def udf_id(bq_client: bigquery.Client, dataset_id: str) -> str:
+    function_id = f"{dataset_id}.extract_title"
+    query = f"""
+    CREATE OR REPLACE FUNCTION `{function_id}`(xml STRING) RETURNS STRING AS (
+        SAFE.REGEXP_EXTRACT(xml, r'<title>(.*?)</title>')
+    );
+    """
+    bq_client.query(query).result()
+    return function_id
+
+
+def test_use_read_gbq_function(
+    capsys: pytest.CaptureFixture[str], udf_id: str
+) -> None:
+    read_gbq_function.use_read_gbq_function(PROJECT_ID, udf_id)
+    out, _ = capsys.readouterr()
+
+    assert "The Great Gatsby" in out
+    assert "1984" in out
+    assert "Brave New World" in out
diff --git a/bigquery/dataframes/requirements-test.txt b/bigquery/dataframes/requirements-test.txt
new file mode 100644
index 00000000000..abb3c5a680c
--- /dev/null
+++ b/bigquery/dataframes/requirements-test.txt
@@ -0,0 +1,5 @@
+pytest
+pytest-asyncio
+google-cloud-bigquery
+bigframes
+google-cloud-testutils
diff --git a/bigquery/dataframes/requirements.txt b/bigquery/dataframes/requirements.txt
new file mode 100644
index 00000000000..241dae3a997
--- /dev/null
+++ b/bigquery/dataframes/requirements.txt
@@ -0,0 +1,3 @@
+google-cloud-bigquery-storage
+bigframes
+google-cloud-bigquery
diff --git a/bigquery/dataframes/sql_scalar.py b/bigquery/dataframes/sql_scalar.py
new file mode 100644
index 00000000000..cf77d10f9ba
--- /dev/null
+++ b/bigquery/dataframes/sql_scalar.py
@@ -0,0 +1,50 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extracts data from XML strings using SQL scalar functions in BigQuery DataFrames.
+
+Demonstrates using BigQuery SQL expressions directly within a DataFrame
+transformation for efficient server-side processing.
+"""
+
+# [START bigquery_dataframes_sql_scalar]
+import bigframes.bigquery as bbq
+import bigframes.pandas as bpd
+
+
+def create_sql_scalar_extraction(project_id: str) -> None:
+    bpd.options.bigquery.project = project_id
+    bpd.options.bigquery.location = "US"
+
+    df = bpd.DataFrame(
+        {
+            "book_xml": [
+                "<book><title>The Great Gatsby</title></book>",
+                "<book><title>1984</title></book>",
+                "<book><title>Brave New World</title></book>",
+            ]
+        }
+    )
+
+    # Use bbq.sql_scalar to execute arbitrary SQL expressions directly in BigQuery.
+    # The {0} placeholder refers to the first Series in the provided list.
+    df["title"] = bbq.sql_scalar(
+        "SAFE.REGEXP_EXTRACT({0}, r'<title>(.*?)</title>')",
+        [df["book_xml"]],
+    )
+
+    print(df[["title"]].to_pandas())
+
+
+# [END bigquery_dataframes_sql_scalar]
diff --git a/bigquery/dataframes/sql_scalar_test.py b/bigquery/dataframes/sql_scalar_test.py
new file mode 100644
index 00000000000..8509e5b992b
--- /dev/null
+++ b/bigquery/dataframes/sql_scalar_test.py
@@ -0,0 +1,30 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+
+import sql_scalar
+
+PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
+
+
+def test_create_sql_scalar_extraction(capsys: pytest.CaptureFixture[str]) -> None:
+    sql_scalar.create_sql_scalar_extraction(PROJECT_ID)
+    out, _ = capsys.readouterr()
+
+    assert "The Great Gatsby" in out
+    assert "1984" in out
+    assert "Brave New World" in out