diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 518cec63125..2455f7abc4c 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python: ['3.9', '3.10', '3.11', '3.12', '3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.7/common.cfg deleted file mode 100644 index 09d7af02ba9..00000000000 --- a/.kokoro/samples/python3.7/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.7" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py37" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. 
-build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.7/continuous.cfg b/.kokoro/samples/python3.7/continuous.cfg deleted file mode 100644 index a1c8d9759c8..00000000000 --- a/.kokoro/samples/python3.7/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.7/periodic-head.cfg b/.kokoro/samples/python3.7/periodic-head.cfg deleted file mode 100644 index 123a35fbd3d..00000000000 --- a/.kokoro/samples/python3.7/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.7/periodic.cfg b/.kokoro/samples/python3.7/periodic.cfg deleted file mode 100644 index 71cd1e597e3..00000000000 --- a/.kokoro/samples/python3.7/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.7/presubmit.cfg b/.kokoro/samples/python3.7/presubmit.cfg deleted file mode 100644 index a1c8d9759c8..00000000000 --- a/.kokoro/samples/python3.7/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.8/common.cfg deleted file mode 100644 index 976d9ce8c5c..00000000000 --- a/.kokoro/samples/python3.8/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will 
be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.8" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py38" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. -build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.8/continuous.cfg b/.kokoro/samples/python3.8/continuous.cfg deleted file mode 100644 index a1c8d9759c8..00000000000 --- a/.kokoro/samples/python3.8/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.8/periodic-head.cfg b/.kokoro/samples/python3.8/periodic-head.cfg deleted file mode 100644 index 123a35fbd3d..00000000000 --- a/.kokoro/samples/python3.8/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.8/periodic.cfg b/.kokoro/samples/python3.8/periodic.cfg deleted file mode 100644 index 71cd1e597e3..00000000000 --- 
a/.kokoro/samples/python3.8/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.8/presubmit.cfg b/.kokoro/samples/python3.8/presubmit.cfg deleted file mode 100644 index a1c8d9759c8..00000000000 --- a/.kokoro/samples/python3.8/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.9/common.cfg b/.kokoro/samples/python3.9/common.cfg deleted file mode 100644 index 603cfffa280..00000000000 --- a/.kokoro/samples/python3.9/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.9" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py39" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. 
-build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.9/continuous.cfg b/.kokoro/samples/python3.9/continuous.cfg deleted file mode 100644 index a1c8d9759c8..00000000000 --- a/.kokoro/samples/python3.9/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.9/periodic-head.cfg b/.kokoro/samples/python3.9/periodic-head.cfg deleted file mode 100644 index 123a35fbd3d..00000000000 --- a/.kokoro/samples/python3.9/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.9/periodic.cfg b/.kokoro/samples/python3.9/periodic.cfg deleted file mode 100644 index 71cd1e597e3..00000000000 --- a/.kokoro/samples/python3.9/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.9/presubmit.cfg b/.kokoro/samples/python3.9/presubmit.cfg deleted file mode 100644 index a1c8d9759c8..00000000000 --- a/.kokoro/samples/python3.9/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/test-samples-impl.sh b/.kokoro/test-samples-impl.sh index 53e365bc4e7..97cdc9c13fe 100755 --- a/.kokoro/test-samples-impl.sh +++ b/.kokoro/test-samples-impl.sh @@ -34,7 +34,7 @@ env | grep KOKORO # Install nox # `virtualenv==20.26.6` is added for Python 3.7 compatibility 
-python3.9 -m pip install --upgrade --quiet nox virtualenv==20.26.6 +python3.10 -m pip install --upgrade --quiet nox virtualenv==20.26.6 # Use secrets acessor service account to get secrets if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then @@ -77,7 +77,7 @@ for file in samples/**/requirements.txt; do echo "------------------------------------------------------------" # Use nox to execute the tests for the project. - python3.9 -m nox -s "$RUN_TESTS_SESSION" + python3.10 -m nox -s "$RUN_TESTS_SESSION" EXIT=$? # If this is a periodic build, send the test log to the FlakyBot. diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 21903a51248..70eb19713cd 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677 libraries: - id: bigframes - version: 2.34.0 + version: 2.35.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index f54231f5401..874fcb0d04b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,32 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.34.0...v2.35.0) (2026-02-07) + + +### Documentation + +* fix cast method shown on public docs (#2436) ([ad0f33c65ee01409826c381ae0f70aad65bb6a27](https://github.com/googleapis/python-bigquery-dataframes/commit/ad0f33c65ee01409826c381ae0f70aad65bb6a27)) + + +### Features + +* remove redundant "started." 
messages from progress output (#2440) ([2017cc2f27f0a432af46f60b3286b231caa4a98b](https://github.com/googleapis/python-bigquery-dataframes/commit/2017cc2f27f0a432af46f60b3286b231caa4a98b)) +* Add bigframes.pandas.col with basic operators (#2405) ([12741677c0391efb5d05281fc756445ccbb1387e](https://github.com/googleapis/python-bigquery-dataframes/commit/12741677c0391efb5d05281fc756445ccbb1387e)) +* Disable progress bars in Anywidget mode (#2444) ([4e2689a1c975c4cabaf36b7d0817dcbedc926853](https://github.com/googleapis/python-bigquery-dataframes/commit/4e2689a1c975c4cabaf36b7d0817dcbedc926853)) +* Disable progress bars in Anywidget mode to reduce notebook clutter (#2437) ([853240daf45301ad534c635c8955cb6ce91d23c2](https://github.com/googleapis/python-bigquery-dataframes/commit/853240daf45301ad534c635c8955cb6ce91d23c2)) +* add bigquery.ai.generate_text function (#2433) ([5bd0029a99e7653843de4ac7d57370c9dffeed4d](https://github.com/googleapis/python-bigquery-dataframes/commit/5bd0029a99e7653843de4ac7d57370c9dffeed4d)) +* Add a bigframes cell magic for ipython (#2395) ([e6de52ded6c5091275a936dec36f01a6cf701233](https://github.com/googleapis/python-bigquery-dataframes/commit/e6de52ded6c5091275a936dec36f01a6cf701233)) +* add `bigframes.bigquery.ai.generate_embedding` (#2343) ([e91536c8a5b2d8d896767510ced80c6fd2a68a97](https://github.com/googleapis/python-bigquery-dataframes/commit/e91536c8a5b2d8d896767510ced80c6fd2a68a97)) +* add bigframe.bigquery.load_data function (#2426) ([4b0f13b2fe10fa5b07d3ca3b7cb1ae1cb95030c7](https://github.com/googleapis/python-bigquery-dataframes/commit/4b0f13b2fe10fa5b07d3ca3b7cb1ae1cb95030c7)) + + +### Bug Fixes + +* suppress JSONDtypeWarning in Anywidget mode and clean up progress output (#2441) ([e0d185ad2c0245b17eac315f71152a46c6da41bb](https://github.com/googleapis/python-bigquery-dataframes/commit/e0d185ad2c0245b17eac315f71152a46c6da41bb)) +* exlcude gcsfs 2026.2.0 (#2445) 
([311de31e79227408515f087dafbab7edc54ddf1b](https://github.com/googleapis/python-bigquery-dataframes/commit/311de31e79227408515f087dafbab7edc54ddf1b)) +* always display the results in the `%%bqsql` cell magics output (#2439) ([2d973b54550f30429dbd10894f78db7bb0c57345](https://github.com/googleapis/python-bigquery-dataframes/commit/2d973b54550f30429dbd10894f78db7bb0c57345)) + ## [2.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.33.0...v2.34.0) (2026-02-02) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 5374e7e3770..7ac410bbf7a 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: - 3.9, 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows. + 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -148,7 +148,7 @@ Running System Tests .. note:: - System tests are only configured to run under Python 3.9, 3.11, 3.12 and 3.13. + System tests are only configured to run under Python 3.10, 3.11, 3.12 and 3.13. For expediency, we do not run them in older versions of Python 3. This alone will not run the tests. You'll need to change some local @@ -258,13 +258,11 @@ Supported Python Versions We support: -- `Python 3.9`_ - `Python 3.10`_ - `Python 3.11`_ - `Python 3.12`_ - `Python 3.13`_ -.. _Python 3.9: https://docs.python.org/3.9/ .. _Python 3.10: https://docs.python.org/3.10/ .. _Python 3.11: https://docs.python.org/3.11/ .. _Python 3.12: https://docs.python.org/3.12/ @@ -276,7 +274,7 @@ Supported versions can be found in our ``noxfile.py`` `config`_. .. _config: https://github.com/googleapis/python-bigquery-dataframes/blob/main/noxfile.py -We also explicitly decided to support Python 3 beginning with version 3.9. +We also explicitly decided to support Python 3 beginning with version 3.10. 
Reasons for this include: - Encouraging use of newest versions of Python 3 diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 240608ebc2d..a3a9b4e4c77 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -14,13 +14,40 @@ """BigQuery DataFrames provides a DataFrame API scaled by the BigQuery engine.""" -from bigframes._config import option_context, options -from bigframes._config.bigquery_options import BigQueryOptions -from bigframes.core.global_session import close_session, get_global_session -import bigframes.enums as enums -import bigframes.exceptions as exceptions -from bigframes.session import connect, Session -from bigframes.version import __version__ +import warnings + +# Suppress Python version support warnings from google-cloud libraries. +# These are particularly noisy in Colab which still uses Python 3.10. +warnings.filterwarnings( + "ignore", + category=FutureWarning, + message=".*Google will stop supporting.*Python.*", +) + +from bigframes._config import option_context, options # noqa: E402 +from bigframes._config.bigquery_options import BigQueryOptions # noqa: E402 +from bigframes.core.global_session import ( # noqa: E402 + close_session, + get_global_session, +) +import bigframes.enums as enums # noqa: E402 +import bigframes.exceptions as exceptions # noqa: E402 +from bigframes.session import connect, Session # noqa: E402 +from bigframes.version import __version__ # noqa: E402 + +_MAGIC_NAMES = ["bqsql"] + + +def load_ipython_extension(ipython): + """Called by IPython when this module is loaded as an IPython extension.""" + # Requires IPython to be installed for import to succeed + from bigframes._magics import _cell_magic + + for magic_name in _MAGIC_NAMES: + ipython.register_magic_function( + _cell_magic, magic_kind="cell", magic_name=magic_name + ) + __all__ = [ "options", diff --git a/bigframes/_magics.py b/bigframes/_magics.py new file mode 100644 index 00000000000..613f71219be --- /dev/null +++ b/bigframes/_magics.py 
@@ -0,0 +1,55 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from IPython.core import magic_arguments # type: ignore +from IPython.core.getipython import get_ipython +from IPython.display import display + +import bigframes.pandas + + +@magic_arguments.magic_arguments() +@magic_arguments.argument( + "destination_var", + nargs="?", + help=("If provided, save the output to this variable instead of displaying it."), +) +@magic_arguments.argument( + "--dry_run", + action="store_true", + default=False, + help=( + "Sets query to be a dry run to estimate costs. " + "Defaults to executing the query instead of dry run if this argument is not used." + "Does not work with engine 'bigframes'. 
" + ), +) +def _cell_magic(line, cell): + ipython = get_ipython() + args = magic_arguments.parse_argstring(_cell_magic, line) + if not cell: + print("Query is missing.") + return + pyformat_args = ipython.user_ns + dataframe = bigframes.pandas._read_gbq_colab( + cell, pyformat_args=pyformat_args, dry_run=args.dry_run + ) + if args.destination_var: + ipython.push({args.destination_var: dataframe}) + + with bigframes.option_context( + "display.repr_mode", + "anywidget", + ): + display(dataframe) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 150fe5efc0c..e02e80cd1fb 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -43,6 +43,7 @@ st_regionstats, st_simplify, ) +from bigframes.bigquery._operations.io import load_data from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -107,6 +108,8 @@ struct, # table ops create_external_table, + # io ops + load_data, ] _module = sys.modules[__name__] @@ -160,6 +163,8 @@ "struct", # table ops "create_external_table", + # io ops + "load_data", # Modules / SQL namespaces "ai", "ml", diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index fd7dafe95fc..bc2ab8dd206 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -19,15 +19,17 @@ from __future__ import annotations import json -from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union +from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union import pandas as pd from bigframes import clients, dataframe, dtypes from bigframes import pandas as bpd from bigframes import series, session +from bigframes.bigquery._operations import utils as bq_utils from bigframes.core import convert from bigframes.core.logging import log_adapter +import bigframes.core.sql.literals from bigframes.ml import core as ml_core from bigframes.operations import ai_ops, output_schemas @@ 
-388,6 +390,217 @@ def generate_double( return series_list[0]._apply_nary_op(operator, series_list[1:]) +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_embedding( + model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + *, + output_dimensionality: Optional[int] = None, + task_type: Optional[str] = None, + start_second: Optional[float] = None, + end_second: Optional[float] = None, + interval_seconds: Optional[float] = None, + trial_id: Optional[int] = None, +) -> dataframe.DataFrame: + """ + Creates embeddings that describe an entity—for example, a piece of text or an image. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]}) + >>> bbq.ai.generate_embedding( + ... "project.dataset.model_name", + ... df + ... ) # doctest: +SKIP + + Args: + model (bigframes.ml.base.BaseEstimator or str): + The model to use for text embedding. + data (bigframes.pandas.DataFrame or bigframes.pandas.Series): + The data to generate embeddings for. If a Series is provided, it is + treated as the 'content' column. If a DataFrame is provided, it + must contain a 'content' column, or you must rename the column you + wish to embed to 'content'. + output_dimensionality (int, optional): + An INT64 value that specifies the number of dimensions to use when + generating embeddings. For example, if you specify 256 AS + output_dimensionality, then the embedding output column contains a + 256-dimensional embedding for each input value. To find the + supported range of output dimensions, read about the available + `Google text embedding models `_. + task_type (str, optional): + A STRING literal that specifies the intended downstream application to + help the model produce better quality embeddings. 
For a list of + supported task types and how to choose which one to use, see `Choose an + embeddings task type `_. + start_second (float, optional): + The second in the video at which to start the embedding. The default value is 0. + end_second (float, optional): + The second in the video at which to end the embedding. The default value is 120. + interval_seconds (float, optional): + The interval to use when creating embeddings. The default value is 16. + trial_id (int, optional): + An INT64 value that identifies the hyperparameter tuning trial that + you want the function to evaluate. The function uses the optimal + trial by default. Only specify this argument if you ran + hyperparameter tuning when creating the model. + + Returns: + bigframes.pandas.DataFrame: + A new DataFrame with the generated embeddings. See the `SQL + reference for AI.GENERATE_EMBEDDING + `_ + for details. + """ + data = _to_dataframe(data, series_rename="content") + model_name, session = bq_utils.get_model_name_and_session(model, data) + table_sql = bq_utils.to_sql(data) + + struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {} + if output_dimensionality is not None: + struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality + if task_type is not None: + struct_fields["TASK_TYPE"] = task_type + if start_second is not None: + struct_fields["START_SECOND"] = start_second + if end_second is not None: + struct_fields["END_SECOND"] = end_second + if interval_seconds is not None: + struct_fields["INTERVAL_SECONDS"] = interval_seconds + if trial_id is not None: + struct_fields["TRIAL_ID"] = trial_id + + # Construct the TVF query + query = f""" + SELECT * + FROM AI.GENERATE_EMBEDDING( + MODEL `{model_name}`, + ({table_sql}), + {bigframes.core.sql.literals.struct_literal(struct_fields)} + ) + """ + + if session is None: + return bpd.read_gbq_query(query) + else: + return session.read_gbq_query(query) + + +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def 
generate_text( + model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + *, + temperature: Optional[float] = None, + max_output_tokens: Optional[int] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + stop_sequences: Optional[List[str]] = None, + ground_with_google_search: Optional[bool] = None, + request_type: Optional[str] = None, +) -> dataframe.DataFrame: + """ + Generates text using a BigQuery ML model. + + See the `BigQuery ML GENERATE_TEXT function syntax + `_ + for additional reference. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> df = bpd.DataFrame({"prompt": ["write a poem about apples"]}) + >>> bbq.ai.generate_text( + ... "project.dataset.model_name", + ... df + ... ) # doctest: +SKIP + + Args: + model (bigframes.ml.base.BaseEstimator or str): + The model to use for text generation. + data (bigframes.pandas.DataFrame or bigframes.pandas.Series): + The data to generate embeddings for. If a Series is provided, it is + treated as the 'content' column. If a DataFrame is provided, it + must contain a 'content' column, or you must rename the column you + wish to embed to 'content'. + temperature (float, optional): + A FLOAT64 value that is used for sampling promiscuity. The value + must be in the range ``[0.0, 1.0]``. A lower temperature works well + for prompts that expect a more deterministic and less open-ended + or creative response, while a higher temperature can lead to more + diverse or creative results. A temperature of ``0`` is + deterministic, meaning that the highest probability response is + always selected. + max_output_tokens (int, optional): + An INT64 value that sets the maximum number of tokens in the + generated text. + top_k (int, optional): + An INT64 value that changes how the model selects tokens for + output. 
A ``top_k`` of ``1`` means the next selected token is the + most probable among all tokens in the model's vocabulary. A + ``top_k`` of ``3`` means that the next token is selected from + among the three most probable tokens by using temperature. The + default value is ``40``. + top_p (float, optional): + A FLOAT64 value that changes how the model selects tokens for + output. Tokens are selected from most probable to least probable + until the sum of their probabilities equals the ``top_p`` value. + For example, if tokens A, B, and C have a probability of 0.3, 0.2, + and 0.1 and the ``top_p`` value is ``0.5``, then the model will + select either A or B as the next token by using temperature. The + default value is ``0.95``. + stop_sequences (List[str], optional): + An ARRAY value that contains the stop sequences for the model. + ground_with_google_search (bool, optional): + A BOOL value that determines whether to ground the model with Google Search. + request_type (str, optional): + A STRING value that contains the request type for the model. + + Returns: + bigframes.pandas.DataFrame: + The generated text. 
+ """ + data = _to_dataframe(data, series_rename="prompt") + model_name, session = bq_utils.get_model_name_and_session(model, data) + table_sql = bq_utils.to_sql(data) + + struct_fields: Dict[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], + ] = {} + if temperature is not None: + struct_fields["TEMPERATURE"] = temperature + if max_output_tokens is not None: + struct_fields["MAX_OUTPUT_TOKENS"] = max_output_tokens + if top_k is not None: + struct_fields["TOP_K"] = top_k + if top_p is not None: + struct_fields["TOP_P"] = top_p + if stop_sequences is not None: + struct_fields["STEP_SEQUENCES"] = stop_sequences + if ground_with_google_search is not None: + struct_fields["GROUND_WITH_GOOGLE_SEARCH"] = ground_with_google_search + if request_type is not None: + struct_fields["REQUEST_TYPE"] = request_type + + query = f""" + SELECT * + FROM AI.GENERATE_TEXT( + MODEL `{model_name}`, + ({table_sql}), + {bigframes.core.sql.literals.struct_literal(struct_fields)} + ) + """ + + if session is None: + return bpd.read_gbq_query(query) + else: + return session.read_gbq_query(query) + + @log_adapter.method_logger(custom_base_name="bigquery_ai") def if_( prompt: PROMPT_TYPE, @@ -703,3 +916,20 @@ def _resolve_connection_id(series: series.Series, connection_id: str | None): series._session._project, series._session._location, ) + + +def _to_dataframe( + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + series_rename: str, +) -> dataframe.DataFrame: + if isinstance(data, (pd.DataFrame, pd.Series)): + data = bpd.read_pandas(data) + + if isinstance(data, series.Series): + data = data.copy() + data.name = series_rename + return data.to_frame() + elif isinstance(data, dataframe.DataFrame): + return data + + raise ValueError(f"Unsupported data type: {type(data)}") diff --git a/bigframes/bigquery/_operations/io.py b/bigframes/bigquery/_operations/io.py new file mode 100644 index 00000000000..daf28e6aedd --- /dev/null +++ 
b/bigframes/bigquery/_operations/io.py @@ -0,0 +1,94 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Mapping, Optional, Union + +import pandas as pd + +from bigframes.bigquery._operations.table import _get_table_metadata +import bigframes.core.logging.log_adapter as log_adapter +import bigframes.core.sql.io +import bigframes.session + + +@log_adapter.method_logger(custom_base_name="bigquery_io") +def load_data( + table_name: str, + *, + write_disposition: str = "INTO", + columns: Optional[Mapping[str, str]] = None, + partition_by: Optional[list[str]] = None, + cluster_by: Optional[list[str]] = None, + table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None, + from_files_options: Mapping[str, Union[str, int, float, bool, list]], + with_partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, + session: Optional[bigframes.session.Session] = None, +) -> pd.Series: + """ + Loads data into a BigQuery table. + See the `BigQuery LOAD DATA DDL syntax + `_ + for additional reference. + Args: + table_name (str): + The name of the table in BigQuery. + write_disposition (str, default "INTO"): + Whether to replace the table if it already exists ("OVERWRITE") or append to it ("INTO"). + columns (Mapping[str, str], optional): + The table's schema. 
+ partition_by (list[str], optional): + A list of partition expressions to partition the table by. See https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#partition_expression. + cluster_by (list[str], optional): + A list of columns to cluster the table by. + table_options (Mapping[str, Union[str, int, float, bool, list]], optional): + The table options. + from_files_options (Mapping[str, Union[str, int, float, bool, list]]): + The options for loading data from files. + with_partition_columns (Mapping[str, str], optional): + The table's partition columns. + connection_name (str, optional): + The connection to use for the table. + session (bigframes.session.Session, optional): + The session to use. If not provided, the default session is used. + Returns: + pandas.Series: + A Series with object dtype containing the table metadata. Reference + the `BigQuery Table REST API reference + `_ + for available fields. + """ + import bigframes.pandas as bpd + + sql = bigframes.core.sql.io.load_data_ddl( + table_name=table_name, + write_disposition=write_disposition, + columns=columns, + partition_by=partition_by, + cluster_by=cluster_by, + table_options=table_options, + from_files_options=from_files_options, + with_partition_columns=with_partition_columns, + connection_name=connection_name, + ) + + if session is None: + bpd.read_gbq_query(sql) + session = bpd.get_global_session() + else: + session.read_gbq_query(sql) + + return _get_table_metadata(bqclient=session.bqclient, table_name=table_name) diff --git a/bigframes/bigquery/_operations/ml.py b/bigframes/bigquery/_operations/ml.py index cc5a961af74..d5b1786b258 100644 --- a/bigframes/bigquery/_operations/ml.py +++ b/bigframes/bigquery/_operations/ml.py @@ -14,12 +14,13 @@ from __future__ import annotations -from typing import cast, List, Mapping, Optional, Union +from typing import List, Mapping, Optional, Union import bigframes_vendored.constants import google.cloud.bigquery import pandas as 
pd +from bigframes.bigquery._operations import utils import bigframes.core.logging.log_adapter as log_adapter import bigframes.core.sql.ml import bigframes.dataframe as dataframe @@ -27,53 +28,6 @@ import bigframes.session -# Helper to convert DataFrame to SQL string -def _to_sql(df_or_sql: Union[pd.DataFrame, dataframe.DataFrame, str]) -> str: - import bigframes.pandas as bpd - - if isinstance(df_or_sql, str): - return df_or_sql - - if isinstance(df_or_sql, pd.DataFrame): - bf_df = bpd.read_pandas(df_or_sql) - else: - bf_df = cast(dataframe.DataFrame, df_or_sql) - - # Cache dataframes to make sure base table is not a snapshot. - # Cached dataframe creates a full copy, never uses snapshot. - # This is a workaround for internal issue b/310266666. - bf_df.cache() - sql, _, _ = bf_df._to_sql_query(include_index=False) - return sql - - -def _get_model_name_and_session( - model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], - # Other dataframe arguments to extract session from - *dataframes: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]], -) -> tuple[str, Optional[bigframes.session.Session]]: - if isinstance(model, pd.Series): - try: - model_ref = model["modelReference"] - model_name = f"{model_ref['projectId']}.{model_ref['datasetId']}.{model_ref['modelId']}" # type: ignore - except KeyError: - raise ValueError("modelReference must be present in the pandas Series.") - elif isinstance(model, str): - model_name = model - else: - if model._bqml_model is None: - raise ValueError("Model must be fitted to be used in ML operations.") - return model._bqml_model.model_name, model._bqml_model.session - - session = None - for df in dataframes: - if isinstance(df, dataframe.DataFrame): - session = df._session - break - - return model_name, session - - def _get_model_metadata( *, bqclient: google.cloud.bigquery.Client, @@ -143,8 +97,12 @@ def create_model( """ import bigframes.pandas as bpd - training_data_sql = _to_sql(training_data) if training_data is not None 
else None - custom_holiday_sql = _to_sql(custom_holiday) if custom_holiday is not None else None + training_data_sql = ( + utils.to_sql(training_data) if training_data is not None else None + ) + custom_holiday_sql = ( + utils.to_sql(custom_holiday) if custom_holiday is not None else None + ) # Determine session from DataFrames if not provided if session is None: @@ -227,8 +185,8 @@ def evaluate( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) if input_ is not None else None + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) if input_ is not None else None sql = bigframes.core.sql.ml.evaluate( model_name=model_name, @@ -281,8 +239,8 @@ def predict( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.predict( model_name=model_name, @@ -340,8 +298,8 @@ def explain_predict( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.explain_predict( model_name=model_name, @@ -383,7 +341,7 @@ def global_explain( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model) + model_name, session = utils.get_model_name_and_session(model) sql = bigframes.core.sql.ml.global_explain( model_name=model_name, class_level_explain=class_level_explain, @@ -419,8 +377,8 @@ def transform( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + 
table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.transform( model_name=model_name, @@ -500,8 +458,8 @@ def generate_text( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.generate_text( model_name=model_name, @@ -565,8 +523,8 @@ def generate_embedding( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.generate_embedding( model_name=model_name, diff --git a/bigframes/bigquery/_operations/utils.py b/bigframes/bigquery/_operations/utils.py new file mode 100644 index 00000000000..f94616786e3 --- /dev/null +++ b/bigframes/bigquery/_operations/utils.py @@ -0,0 +1,70 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import cast, Optional, Union + +import pandas as pd + +import bigframes +from bigframes import dataframe +from bigframes.ml import base as ml_base + + +def get_model_name_and_session( + model: Union[ml_base.BaseEstimator, str, pd.Series], + # Other dataframe arguments to extract session from + *dataframes: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]], +) -> tuple[str, Optional[bigframes.session.Session]]: + if isinstance(model, pd.Series): + try: + model_ref = model["modelReference"] + model_name = f"{model_ref['projectId']}.{model_ref['datasetId']}.{model_ref['modelId']}" # type: ignore + except KeyError: + raise ValueError("modelReference must be present in the pandas Series.") + elif isinstance(model, str): + model_name = model + else: + if model._bqml_model is None: + raise ValueError("Model must be fitted to be used in ML operations.") + return model._bqml_model.model_name, model._bqml_model.session + + session = None + for df in dataframes: + if isinstance(df, dataframe.DataFrame): + session = df._session + break + + return model_name, session + + +def to_sql(df_or_sql: Union[pd.DataFrame, dataframe.DataFrame, str]) -> str: + """ + Helper to convert DataFrame to SQL string + """ + import bigframes.pandas as bpd + + if isinstance(df_or_sql, str): + return df_or_sql + + if isinstance(df_or_sql, pd.DataFrame): + bf_df = bpd.read_pandas(df_or_sql) + else: + bf_df = cast(dataframe.DataFrame, df_or_sql) + + # Cache dataframes to make sure base table is not a snapshot. + # Cached dataframe creates a full copy, never uses snapshot. + # This is a workaround for internal issue b/310266666. 
+ bf_df.cache() + sql, _, _ = bf_df._to_sql_query(include_index=False) + return sql diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index 3af52205a65..053ee7352a8 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -22,7 +22,9 @@ generate, generate_bool, generate_double, + generate_embedding, generate_int, + generate_text, if_, score, ) @@ -33,7 +35,9 @@ "generate", "generate_bool", "generate_double", + "generate_embedding", "generate_int", + "generate_text", "if_", "score", ] diff --git a/bigframes/core/agg_expressions.py b/bigframes/core/agg_expressions.py index 125e3fef630..a26a9cfe087 100644 --- a/bigframes/core/agg_expressions.py +++ b/bigframes/core/agg_expressions.py @@ -19,7 +19,7 @@ import functools import itertools import typing -from typing import Callable, Mapping, Tuple, TypeVar +from typing import Callable, Hashable, Mapping, Tuple, TypeVar from bigframes import dtypes from bigframes.core import expression, window_spec @@ -68,7 +68,7 @@ def children(self) -> Tuple[expression.Expression, ...]: return self.inputs @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return tuple( itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @@ -92,7 +92,7 @@ def transform_children( def bind_variables( self: TExpression, - bindings: Mapping[str, expression.Expression], + bindings: Mapping[Hashable, expression.Expression], allow_partial_bindings: bool = False, ) -> TExpression: return self.transform_children( @@ -192,7 +192,7 @@ def children(self) -> Tuple[expression.Expression, ...]: return self.inputs @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return tuple( itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @@ -216,7 +216,7 @@ def transform_children( def bind_variables( self: WindowExpression, - bindings: Mapping[str, 
expression.Expression], + bindings: Mapping[Hashable, expression.Expression], allow_partial_bindings: bool = False, ) -> WindowExpression: return self.transform_children( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5bac1a06f1e..ff7f2b9899b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -140,6 +140,7 @@ def __init__( column_labels: typing.Union[pd.Index, typing.Iterable[Label]], index_labels: typing.Union[pd.Index, typing.Iterable[Label], None] = None, *, + value_columns: Optional[Iterable[str]] = None, transpose_cache: Optional[Block] = None, ): """Construct a block object, will create default index if no index columns specified.""" @@ -158,7 +159,13 @@ def __init__( if index_labels else tuple([None for _ in index_columns]) ) - self._expr = self._normalize_expression(expr, self._index_columns) + if value_columns is None: + value_columns = [ + col_id for col_id in expr.column_ids if col_id not in index_columns + ] + self._expr = self._normalize_expression( + expr, self._index_columns, value_columns + ) # Use pandas index to more easily replicate column indexing, especially for hierarchical column index self._column_labels = ( column_labels.copy() @@ -1114,13 +1121,15 @@ def project_exprs( labels: Union[Sequence[Label], pd.Index], drop=False, ) -> Block: - new_array, _ = self.expr.compute_values(exprs) + new_array, new_cols = self.expr.compute_values(exprs) if drop: new_array = new_array.drop_columns(self.value_columns) + new_val_cols = new_cols if drop else (*self.value_columns, *new_cols) return Block( new_array, index_columns=self.index_columns, + value_columns=new_val_cols, column_labels=labels if drop else self.column_labels.append(pd.Index(labels)), @@ -1542,17 +1551,13 @@ def _get_labels_for_columns(self, column_ids: typing.Sequence[str]) -> pd.Index: def _normalize_expression( self, expr: core.ArrayValue, - index_columns: typing.Sequence[str], - assert_value_size: typing.Optional[int] = None, + 
index_columns: Iterable[str], + value_columns: Iterable[str], ): """Normalizes expression by moving index columns to left.""" - value_columns = [ - col_id for col_id in expr.column_ids if col_id not in index_columns - ] - if (assert_value_size is not None) and ( - len(value_columns) != assert_value_size - ): - raise ValueError("Unexpected number of value columns.") + normalized_ids = (*index_columns, *value_columns) + if tuple(expr.column_ids) == normalized_ids: + return expr return expr.select_columns([*index_columns, *value_columns]) def grouped_head( diff --git a/bigframes/core/col.py b/bigframes/core/col.py new file mode 100644 index 00000000000..60b24d5e837 --- /dev/null +++ b/bigframes/core/col.py @@ -0,0 +1,126 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import dataclasses +from typing import Any, Hashable + +import bigframes_vendored.pandas.core.col as pd_col + +import bigframes.core.expression as bf_expression +import bigframes.operations as bf_ops + + +# Not to be confused with the Expression class in `bigframes.core.expressions` +# Name collision unintended +@dataclasses.dataclass(frozen=True) +class Expression: + __doc__ = pd_col.Expression.__doc__ + + _value: bf_expression.Expression + + def _apply_unary(self, op: bf_ops.UnaryOp) -> Expression: + return Expression(op.as_expr(self._value)) + + def _apply_binary(self, other: Any, op: bf_ops.BinaryOp, reverse: bool = False): + if isinstance(other, Expression): + other_value = other._value + else: + other_value = bf_expression.const(other) + if reverse: + return Expression(op.as_expr(other_value, self._value)) + else: + return Expression(op.as_expr(self._value, other_value)) + + def __add__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.add_op) + + def __radd__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.add_op, reverse=True) + + def __sub__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.sub_op) + + def __rsub__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.sub_op, reverse=True) + + def __mul__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mul_op) + + def __rmul__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mul_op, reverse=True) + + def __truediv__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.div_op) + + def __rtruediv__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.div_op, reverse=True) + + def __floordiv__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.floordiv_op) + + def __rfloordiv__(self, other: Any) -> Expression: + return self._apply_binary(other, 
bf_ops.floordiv_op, reverse=True) + + def __ge__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.ge_op) + + def __gt__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.gt_op) + + def __le__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.le_op) + + def __lt__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.lt_op) + + def __eq__(self, other: object) -> Expression: # type: ignore + return self._apply_binary(other, bf_ops.eq_op) + + def __ne__(self, other: object) -> Expression: # type: ignore + return self._apply_binary(other, bf_ops.ne_op) + + def __mod__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mod_op) + + def __rmod__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mod_op, reverse=True) + + def __and__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.and_op) + + def __rand__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.and_op, reverse=True) + + def __or__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.or_op) + + def __ror__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.or_op, reverse=True) + + def __xor__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.xor_op) + + def __rxor__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.xor_op, reverse=True) + + def __invert__(self) -> Expression: + return self._apply_unary(bf_ops.invert_op) + + +def col(col_name: Hashable) -> Expression: + return Expression(bf_expression.free_var(col_name)) + + +col.__doc__ = pd_col.col.__doc__ diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 89bcb9b9207..a1c25bdc73c 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -19,7 +19,7 @@ import functools import itertools import typing -from typing import 
Callable, Generator, Mapping, TypeVar, Union +from typing import Callable, Generator, Hashable, Mapping, TypeVar, Union import pandas as pd @@ -39,7 +39,7 @@ def deref(name: str) -> DerefOp: return DerefOp(ids.ColumnId(name)) -def free_var(id: str) -> UnboundVariableExpression: +def free_var(id: Hashable) -> UnboundVariableExpression: return UnboundVariableExpression(id) @@ -52,7 +52,7 @@ class Expression(abc.ABC): """An expression represents a computation taking N scalar inputs and producing a single output scalar.""" @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return () @property @@ -116,7 +116,9 @@ def bind_refs( @abc.abstractmethod def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: """Replace variables with expression given in `bindings`. @@ -191,7 +193,9 @@ def output_type(self) -> dtypes.ExpressionType: return self.dtype def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: return self @@ -226,10 +230,10 @@ def transform_children(self, t: Callable[[Expression], Expression]) -> Expressio class UnboundVariableExpression(Expression): """A variable expression representing an unbound variable.""" - id: str + id: Hashable @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return (self.id,) @property @@ -256,7 +260,9 @@ def bind_refs( return self def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: if self.id in bindings.keys(): return bindings[self.id] @@ 
-304,7 +310,9 @@ def output_type(self) -> dtypes.ExpressionType: raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: return self @@ -373,7 +381,7 @@ def column_references( ) @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return tuple( itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @@ -408,7 +416,9 @@ def output_type(self) -> dtypes.ExpressionType: return self.op.output_type(*input_types) def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> OpExpression: return OpExpression( self.op, diff --git a/bigframes/core/sql/io.py b/bigframes/core/sql/io.py new file mode 100644 index 00000000000..9e1a549a64f --- /dev/null +++ b/bigframes/core/sql/io.py @@ -0,0 +1,87 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import Mapping, Optional, Union + + +def load_data_ddl( + table_name: str, + *, + write_disposition: str = "INTO", + columns: Optional[Mapping[str, str]] = None, + partition_by: Optional[list[str]] = None, + cluster_by: Optional[list[str]] = None, + table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None, + from_files_options: Mapping[str, Union[str, int, float, bool, list]], + with_partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, +) -> str: + """Generates the LOAD DATA DDL statement.""" + statement = ["LOAD DATA"] + statement.append(write_disposition) + statement.append(table_name) + + if columns: + column_defs = ", ".join([f"{name} {typ}" for name, typ in columns.items()]) + statement.append(f"({column_defs})") + + if partition_by: + statement.append(f"PARTITION BY {', '.join(partition_by)}") + + if cluster_by: + statement.append(f"CLUSTER BY {', '.join(cluster_by)}") + + if table_options: + opts = [] + for key, value in table_options.items(): + if isinstance(value, str): + value_sql = repr(value) + opts.append(f"{key} = {value_sql}") + elif isinstance(value, bool): + opts.append(f"{key} = {str(value).upper()}") + elif isinstance(value, list): + list_str = ", ".join([repr(v) for v in value]) + opts.append(f"{key} = [{list_str}]") + else: + opts.append(f"{key} = {value}") + options_str = ", ".join(opts) + statement.append(f"OPTIONS ({options_str})") + + opts = [] + for key, value in from_files_options.items(): + if isinstance(value, str): + value_sql = repr(value) + opts.append(f"{key} = {value_sql}") + elif isinstance(value, bool): + opts.append(f"{key} = {str(value).upper()}") + elif isinstance(value, list): + list_str = ", ".join([repr(v) for v in value]) + opts.append(f"{key} = [{list_str}]") + else: + opts.append(f"{key} = {value}") + options_str = ", ".join(opts) + statement.append(f"FROM FILES ({options_str})") + + if 
with_partition_columns: + part_defs = ", ".join( + [f"{name} {typ}" for name, typ in with_partition_columns.items()] + ) + statement.append(f"WITH PARTITION COLUMNS ({part_defs})") + + if connection_name: + statement.append(f"WITH CONNECTION `{connection_name}`") + + return " ".join(statement) diff --git a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py new file mode 100644 index 00000000000..59c81977315 --- /dev/null +++ b/bigframes/core/sql/literals.py @@ -0,0 +1,58 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import collections.abc +import json +from typing import Any, List, Mapping, Union + +import bigframes.core.sql + +STRUCT_VALUES = Union[ + str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any] +] +STRUCT_TYPE = Mapping[str, STRUCT_VALUES] + + +def struct_literal(struct_options: STRUCT_TYPE) -> str: + rendered_options = [] + for option_name, option_value in struct_options.items(): + if option_name == "model_params": + json_str = json.dumps(option_value) + # Escape single quotes for SQL string literal + sql_json_str = json_str.replace("'", "''") + rendered_val = f"JSON'{sql_json_str}'" + elif isinstance(option_value, collections.abc.Mapping): + struct_body = ", ".join( + [ + f"{bigframes.core.sql.simple_literal(v)} AS {k}" + for k, v in option_value.items() + ] + ) + rendered_val = f"STRUCT({struct_body})" + elif isinstance(option_value, list): + rendered_val = ( + "[" + + ", ".join( + [bigframes.core.sql.simple_literal(v) for v in option_value] + ) + + "]" + ) + elif isinstance(option_value, bool): + rendered_val = str(option_value).lower() + else: + rendered_val = bigframes.core.sql.simple_literal(option_value) + rendered_options.append(f"{rendered_val} AS {option_name}") + return f"STRUCT({', '.join(rendered_options)})" diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index d77c5aa4a0b..a2a4d32ae84 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -14,12 +14,11 @@ from __future__ import annotations -import collections.abc -import json from typing import Any, Dict, List, Mapping, Optional, Union import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql +import bigframes.core.sql.literals def create_model_ddl( @@ -109,36 +108,7 @@ def _build_struct_sql( ) -> str: if not struct_options: return "" - - rendered_options = [] - for option_name, option_value in struct_options.items(): - if option_name == "model_params": - json_str = json.dumps(option_value) 
- # Escape single quotes for SQL string literal - sql_json_str = json_str.replace("'", "''") - rendered_val = f"JSON'{sql_json_str}'" - elif isinstance(option_value, collections.abc.Mapping): - struct_body = ", ".join( - [ - f"{bigframes.core.sql.simple_literal(v)} AS {k}" - for k, v in option_value.items() - ] - ) - rendered_val = f"STRUCT({struct_body})" - elif isinstance(option_value, list): - rendered_val = ( - "[" - + ", ".join( - [bigframes.core.sql.simple_literal(v) for v in option_value] - ) - + "]" - ) - elif isinstance(option_value, bool): - rendered_val = str(option_value).lower() - else: - rendered_val = bigframes.core.sql.simple_literal(option_value) - rendered_options.append(f"{rendered_val} AS {option_name}") - return f", STRUCT({', '.join(rendered_options)})" + return f", {bigframes.core.sql.literals.struct_literal(struct_options)}" def evaluate( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index e1ad4f3e75d..b195ce9902d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -58,6 +58,7 @@ from bigframes.core import agg_expressions import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.col import bigframes.core.convert import bigframes.core.explode import bigframes.core.expression as ex @@ -94,7 +95,13 @@ import bigframes.session SingleItemValue = Union[ - bigframes.series.Series, int, float, str, pandas.Timedelta, Callable + bigframes.series.Series, + int, + float, + str, + pandas.Timedelta, + Callable, + bigframes.core.col.Expression, ] MultiItemValue = Union[ "DataFrame", Sequence[int | float | str | pandas.Timedelta | Callable] @@ -325,7 +332,7 @@ def dtypes(self) -> pandas.Series: @property def columns(self) -> pandas.Index: - return self.dtypes.index + return self._block.column_labels @columns.setter def columns(self, labels: pandas.Index): @@ -2236,6 +2243,13 @@ def _assign_single_item( ) -> DataFrame: if isinstance(v, bigframes.series.Series): return 
self._assign_series_join_on_index(k, v) + elif isinstance(v, bigframes.core.col.Expression): + label_to_col_ref = { + label: ex.deref(id) for id, label in self._block.col_id_to_label.items() + } + resolved_expr = v._value.bind_variables(label_to_col_ref) + block = self._block.project_block_exprs([resolved_expr], labels=[k]) + return DataFrame(block) elif isinstance(v, bigframes.dataframe.DataFrame): v_df_col_count = len(v._block.value_columns) if v_df_col_count != 1: diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index be0d2b45d09..40d04a1d713 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -23,6 +23,7 @@ import threading from typing import Any, Iterator, Optional import uuid +import warnings import pandas as pd @@ -111,16 +112,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self.page_size = initial_page_size self.max_columns = initial_max_columns - # TODO(b/469861913): Nested columns from structs (e.g., 'struct_col.name') are not currently sortable. - # TODO(b/463754889): Support non-string column labels for sorting. - if all(isinstance(col, str) for col in dataframe.columns): - self.orderable_columns = [ - str(col_name) - for col_name, dtype in dataframe.dtypes.items() - if dtypes.is_orderable(dtype) - ] - else: - self.orderable_columns = [] + self.orderable_columns = self._get_orderable_columns(dataframe) self._initial_load() @@ -128,30 +120,49 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Also used as a guard to prevent observers from firing during initialization. self._initial_load_complete = True + def _get_orderable_columns( + self, dataframe: bigframes.dataframe.DataFrame + ) -> list[str]: + """Determine which columns can be used for client-side sorting.""" + # TODO(b/469861913): Nested columns from structs (e.g., 'struct_col.name') are not currently sortable. + # TODO(b/463754889): Support non-string column labels for sorting. 
+ if not all(isinstance(col, str) for col in dataframe.columns): + return [] + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", bigframes.exceptions.JSONDtypeWarning) + warnings.simplefilter("ignore", category=FutureWarning) + return [ + str(col_name) + for col_name, dtype in dataframe.dtypes.items() + if dtypes.is_orderable(dtype) + ] + def _initial_load(self) -> None: """Get initial data and row count.""" # obtain the row counts # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` # before we get here so that the count might already be cached. - self._reset_batches_for_new_page_size() + with bigframes.option_context("display.progress_bar", None): + self._reset_batches_for_new_page_size() - if self._batches is None: - self._error_message = ( - "Could not retrieve data batches. Data might be unavailable or " - "an error occurred." - ) - self.row_count = None - elif self._batches.total_rows is None: - # Total rows is unknown, this is an expected state. - # TODO(b/461536343): Cheaply discover if we have exactly 1 page. - # There are cases where total rows is not set, but there are no additional - # pages. We could disable the "next" button in these cases. - self.row_count = None - else: - self.row_count = self._batches.total_rows - - # get the initial page - self._set_table_html() + if self._batches is None: + self._error_message = ( + "Could not retrieve data batches. Data might be unavailable or " + "an error occurred." + ) + self.row_count = None + elif self._batches.total_rows is None: + # Total rows is unknown, this is an expected state. + # TODO(b/461536343): Cheaply discover if we have exactly 1 page. + # There are cases where total rows is not set, but there are no additional + # pages. We could disable the "next" button in these cases. 
+ self.row_count = None + else: + self.row_count = self._batches.total_rows + + # get the initial page + self._set_table_html() @traitlets.observe("_initial_load_complete") def _on_initial_load_complete(self, change: dict[str, Any]): @@ -274,14 +285,17 @@ def _reset_batch_cache(self) -> None: def _reset_batches_for_new_page_size(self) -> None: """Reset the batch iterator when page size changes.""" - self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + with bigframes.option_context("display.progress_bar", None): + self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) self._reset_batch_cache() def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" new_page = None - with self._setting_html_lock: + with self._setting_html_lock, bigframes.option_context( + "display.progress_bar", None + ): if self._error_message: self.table_html = ( f"
" diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 6102d1512c6..ef34985c8e8 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -363,7 +363,13 @@ def repr_mimebundle( if opts.repr_mode == "anywidget": try: - return get_anywidget_bundle(obj, include=include, exclude=exclude) + with bigframes.option_context("display.progress_bar", None): + with warnings.catch_warnings(): + warnings.simplefilter( + "ignore", category=bigframes.exceptions.JSONDtypeWarning + ) + warnings.simplefilter("ignore", category=FutureWarning) + return get_anywidget_bundle(obj, include=include, exclude=exclude) except ImportError: # Anywidget is an optional dependency, so warn rather than fail. # TODO(shuowei): When Anywidget becomes the default for all repr modes, diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 1e3cdabdaf6..094493818de 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -27,8 +27,6 @@ import humanize if TYPE_CHECKING: - from IPython import display - import bigframes.core.events GenericJob = Union[ @@ -134,16 +132,14 @@ def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): return res -current_display: Optional[display.HTML] = None current_display_id: Optional[str] = None -previous_display_html: str = "" def progress_callback( event: bigframes.core.events.Event, ): """Displays a progress bar while the query is running""" - global current_display, current_display_id, previous_display_html + global current_display_id try: import bigframes._config @@ -162,59 +158,44 @@ def progress_callback( if progress_bar == "notebook": import IPython.display as display - if ( - isinstance(event, bigframes.core.events.ExecutionStarted) - or current_display is None - or current_display_id is None - ): - previous_display_html = "" - current_display_id = str(random.random()) - current_display = display.HTML("Starting.") - display.display( - current_display, - 
display_id=current_display_id, - ) + display_html = None + + if isinstance(event, bigframes.core.events.ExecutionStarted): + # Start a new context for progress output. + current_display_id = None + + elif isinstance(event, bigframes.core.events.BigQuerySentEvent): + display_html = render_bqquery_sent_event_html(event) - if isinstance(event, bigframes.core.events.BigQuerySentEvent): - previous_display_html = render_bqquery_sent_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): - previous_display_html = render_bqquery_retry_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) + display_html = render_bqquery_retry_event_html(event) + elif isinstance(event, bigframes.core.events.BigQueryReceivedEvent): - previous_display_html = render_bqquery_received_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) + display_html = render_bqquery_received_event_html(event) + elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): - previous_display_html = render_bqquery_finished_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) - elif isinstance(event, bigframes.core.events.ExecutionFinished): - if previous_display_html: + display_html = render_bqquery_finished_event_html(event) + + elif isinstance(event, bigframes.core.events.SessionClosed): + display_html = f"Session {event.session_id} closed." + + if display_html: + if current_display_id: display.update_display( - display.HTML(f"✅ Completed. 
{previous_display_html}"), + display.HTML(display_html), + display_id=current_display_id, + ) + else: + current_display_id = str(random.random()) + display.display( + display.HTML(display_html), display_id=current_display_id, ) - elif isinstance(event, bigframes.core.events.SessionClosed): - display.update_display( - display.HTML(f"Session {event.session_id} closed."), - display_id=current_display_id, - ) elif progress_bar == "terminal": - if isinstance(event, bigframes.core.events.ExecutionStarted): - print("Starting execution.") - elif isinstance(event, bigframes.core.events.BigQuerySentEvent): + message = None + + if isinstance(event, bigframes.core.events.BigQuerySentEvent): message = render_bqquery_sent_event_plaintext(event) print(message) elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): @@ -226,8 +207,6 @@ def progress_callback( elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): message = render_bqquery_finished_event_plaintext(event) print(message) - elif isinstance(event, bigframes.core.events.ExecutionFinished): - print("Execution done.") def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 9b38702ccea..3f6ccecaa2b 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -24,7 +24,8 @@ """ import abc -from typing import cast, Optional, TypeVar, Union +import typing +from typing import Optional, TypeVar, Union import warnings import bigframes_vendored.sklearn.base @@ -133,7 +134,7 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: self._bqml_model = self._create_bqml_model() # type: ignore except AttributeError: raise RuntimeError("A model must be trained before register.") - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) self._bqml_model.register(vertex_ai_model_id) return self @@ -286,7 +287,7 @@ def _predict_and_retry( 
bpd.concat([df_result, df_succ]) if df_result is not None else df_succ ) - df_result = cast( + df_result = typing.cast( bpd.DataFrame, bpd.concat([df_result, df_fail]) if df_result is not None else df_fail, ) @@ -306,7 +307,7 @@ def _extract_output_names(self): output_names = [] for transform_col in self._bqml_model._model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) + transform_col_dict = typing.cast(dict, transform_col) # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index d638e026e45..f8244fb0d81 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -21,7 +21,7 @@ import re import types import typing -from typing import cast, Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union from bigframes_vendored import constants import bigframes_vendored.sklearn.compose._column_transformer @@ -218,7 +218,7 @@ def camel_to_snake(name): output_names = [] for transform_col in bq_model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) + transform_col_dict = typing.cast(dict, transform_col) # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue @@ -282,7 +282,7 @@ def _merge( return self # SQLScalarColumnTransformer only work inside ColumnTransformer feature_columns_sorted = sorted( [ - cast(str, feature_column.name) + typing.cast(str, feature_column.name) for feature_column in bq_model.feature_columns ] ) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4dbc1a5fa30..620843fb6e2 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -18,7 +18,8 @@ import dataclasses import datetime -from typing import Callable, cast, Iterable, Mapping, Optional, Union +import typing +from typing import Callable, Iterable, Mapping, Optional, Union import uuid from google.cloud 
import bigquery @@ -376,7 +377,7 @@ def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: if vertex_ai_model_id is None: # vertex id needs to start with letters. https://cloud.google.com/vertex-ai/docs/general/resource-naming - vertex_ai_model_id = "bigframes_" + cast(str, self._model.model_id) + vertex_ai_model_id = "bigframes_" + typing.cast(str, self._model.model_id) # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models. # The possibility of conflicts should be low. diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 295649ed7f5..56b5d6735c9 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -16,7 +16,8 @@ from __future__ import annotations -from typing import cast, Mapping, Optional +import typing +from typing import Mapping, Optional from google.cloud import bigquery @@ -78,7 +79,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) (X,) = utils.batch_convert_to_dataframe(X) @@ -99,7 +100,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) @@ -157,7 +158,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - 
self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) @@ -178,7 +179,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) @@ -276,7 +277,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) @@ -297,7 +298,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index f4e60f3f9d4..585599c9b6c 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,7 +16,8 @@ from __future__ import annotations -from typing import cast, Iterable, Literal, Mapping, Optional, Union +import typing +from typing import Iterable, Literal, Mapping, Optional, Union import warnings import bigframes_vendored.constants as constants @@ -252,7 +253,7 @@ def predict( if len(X.columns) == 1: # BQML identified 
the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "content"}) options: dict = {} @@ -391,7 +392,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "content"}) # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input @@ -604,7 +605,10 @@ def fit( options["prompt_col"] = X.columns.tolist()[0] self._bqml_model = self._bqml_model_factory.create_llm_remote_model( - X, y, options=options, connection_name=cast(str, self.connection_name) + X, + y, + options=options, + connection_name=typing.cast(str, self.connection_name), ) return self @@ -735,7 +739,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "prompt"}) options: dict = { @@ -820,8 +824,8 @@ def score( ) # BQML identified the column by name - X_col_label = cast(blocks.Label, X.columns[0]) - y_col_label = cast(blocks.Label, y.columns[0]) + X_col_label = typing.cast(blocks.Label, X.columns[0]) + y_col_label = typing.cast(blocks.Label, y.columns[0]) X = X.rename(columns={X_col_label: "input_text"}) y = y.rename(columns={y_col_label: "output_text"}) @@ -1033,7 +1037,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "prompt"}) options = { diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 5adfb03b7f5..3d23fbf5684 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -20,7 +20,8 @@ import inspect from itertools import chain import time -from 
typing import cast, Generator, List, Optional, Union +import typing +from typing import Generator, List, Optional, Union import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation @@ -99,10 +100,10 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra train_dfs.append(train) test_dfs.append(test) - train_df = cast( + train_df = typing.cast( bpd.DataFrame, bpd.concat(train_dfs).drop(columns="bigframes_stratify_col") ) - test_df = cast( + test_df = typing.cast( bpd.DataFrame, bpd.concat(test_dfs).drop(columns="bigframes_stratify_col") ) return [train_df, test_df] diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 8bf89b08387..22a3e7e2227 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import cast, Iterable, List, Literal, Optional, Union +from typing import Iterable, List, Literal, Optional, Union import bigframes_vendored.sklearn.preprocessing._data import bigframes_vendored.sklearn.preprocessing._discretization @@ -470,7 +470,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[OneHotEncoder, str]: s = sql[sql.find("(") + 1 : sql.find(")")] col_label, drop_str, top_k, frequency_threshold = s.split(", ") drop = ( - cast(Literal["most_frequent"], "most_frequent") + typing.cast(Literal["most_frequent"], "most_frequent") if drop_str.lower() == "'most_frequent'" else None ) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 5fe83302638..eee710b2882 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -205,7 +205,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.TIMEDELTA_DTYPE if dtypes.is_numeric(input_types[0]): - if 
pd.api.types.is_bool_dtype(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): # type: ignore return dtypes.INT_DTYPE return input_types[0] @@ -224,7 +224,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT # These will change if median is changed to exact implementation. if not dtypes.is_orderable(input_types[0]): raise TypeError(f"Type {input_types[0]} is not orderable") - if pd.api.types.is_bool_dtype(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): # type: ignore return dtypes.INT_DTYPE else: return input_types[0] diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 9da2204a713..a70e319747a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -27,6 +27,7 @@ import pandas import bigframes._config as config +from bigframes.core.col import col import bigframes.core.global_session as global_session import bigframes.core.indexes from bigframes.core.logging import log_adapter @@ -415,6 +416,7 @@ def reset_session(): "clean_up_by_session_id", "concat", "crosstab", + "col", "cut", "deploy_remote_function", "deploy_udf", diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 483bc5e530d..fade0558ac9 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -49,6 +49,7 @@ import pyarrow as pa import bigframes._config as config +import bigframes._importing import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.dataframe @@ -353,11 +354,14 @@ def _read_gbq_colab( ) _set_default_session_location_if_possible_deferred_query(create_query) if not config.options.bigquery._session_started: - with warnings.catch_warnings(): - # Don't warning about Polars in SQL cell. - # Related to b/437090788. + # Don't warning about Polars in SQL cell. + # Related to b/437090788. 
+ try: + bigframes._importing.import_polars() warnings.simplefilter("ignore", bigframes.exceptions.PreviewWarning) config.options.bigquery.enable_polars_execution = True + except ImportError: + pass # don't fail if polars isn't available return global_session.with_default_session( bigframes.session.Session._read_gbq_colab, diff --git a/bigframes/version.py b/bigframes/version.py index a6862ee201c..c5b120dc239 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.34.0" +__version__ = "2.35.0" # {x-release-please-start-date} -__release_date__ = "2026-02-02" +__release_date__ = "2026-02-07" # {x-release-please-end} diff --git a/notebooks/getting_started/magics.ipynb b/notebooks/getting_started/magics.ipynb new file mode 100644 index 00000000000..1f2cf7a409b --- /dev/null +++ b/notebooks/getting_started/magics.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "91edcf7b", + "metadata": {}, + "source": [ + "# %%bqsql cell magics\n", + "\n", + "The BigQuery DataFrames (aka BigFrames) package provides a `%%bqsql` cell magics for Jupyter environments.\n", + "\n", + "To use it, first activate the extension:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "98cd0489", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext bigframes" + ] + }, + { + "cell_type": "markdown", + "id": "f18fdc63", + "metadata": {}, + "source": [ + "Now, use the magics by including SQL in the body." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "269c5862", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes. 
[Job bigframes-dev:US.job_UVe7FsupxF3CbYuLcLT7fpw9dozg details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1e2fb7b019754d31b11323a054f97f47", + "version_major": 2, + "version_minor": 1 + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stategenderyearnamenumber
0HIF1999Ariana10
1HIF2002Jordyn10
2HIF2006Mya10
3HIF2010Jordyn10
4HIM1921Nobuo10
5HIM1925Ralph10
6HIM1926Hisao10
7HIM1927Moses10
8HIM1933Larry10
9HIM1933Alfredo10
\n", + "

10 rows × 5 columns

\n", + "
[5552452 rows x 5 columns in total]" + ], + "text/plain": [ + "state gender year name number\n", + " HI F 1999 Ariana 10\n", + " HI F 2002 Jordyn 10\n", + " HI F 2006 Mya 10\n", + " HI F 2010 Jordyn 10\n", + " HI M 1921 Nobuo 10\n", + " HI M 1925 Ralph 10\n", + " HI M 1926 Hisao 10\n", + " HI M 1927 Moses 10\n", + " HI M 1933 Larry 10\n", + " HI M 1933 Alfredo 10\n", + "...\n", + "\n", + "[5552452 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%bqsql\n", + "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`" + ] + }, + { + "cell_type": "markdown", + "id": "8771e10f", + "metadata": {}, + "source": [ + "The output DataFrame can be saved to a variable." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "30bb6327", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes. [Job bigframes-dev:US.c142adf3-cd95-42da-bbdc-c176b36b934f details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%bqsql mydf\n", + "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`" + ] + }, + { + "cell_type": "markdown", + "id": "533e2e9e", + "metadata": {}, + "source": [ + "You can chain cells together using format strings. DataFrame objects are automatically turned into table expressions." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6a8a8123", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 88.1 MB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c4889de9296440428de90defb5c58070", + "version_major": 2, + "version_minor": 1 + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_countname
0304036Tracy
1293876Travis
2203784Troy
3150127Trevor
496397Tristan
589996Tracey
665546Trinity
750112Traci
849657Trenton
945692Trent
\n", + "

10 rows × 2 columns

\n", + "
[238 rows x 2 columns in total]" + ], + "text/plain": [ + " total_count name\n", + "0 304036 Tracy\n", + "1 293876 Travis\n", + "2 203784 Troy\n", + "3 150127 Trevor\n", + "4 96397 Tristan\n", + "5 89996 Tracey\n", + "6 65546 Trinity\n", + "7 50112 Traci\n", + "8 49657 Trenton\n", + "9 45692 Trent\n", + "...\n", + "\n", + "[238 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%bqsql\n", + "SELECT sum(number) as total_count, name\n", + "FROM {mydf}\n", + "WHERE name LIKE 'Tr%'\n", + "GROUP BY name\n", + "ORDER BY total_count DESC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2a17078", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/noxfile.py b/noxfile.py index 00ada18a469..a8a1a84987e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -123,7 +123,7 @@ # TODO(tswast): Consider removing this when unit_noextras and cover is run # from GitHub actions. "unit_noextras", - "system-3.9", # No extras. + "system-3.10", # No extras. f"system-{LATEST_FULLY_SUPPORTED_PYTHON}", # All extras. "cover", # TODO(b/401609005): remove @@ -661,9 +661,7 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( # version, the first version we test with in the unit tests sessions has a # constraints file containing all dependencies and extras. 
with open( - CURRENT_DIRECTORY - / "testing" - / f"constraints-{UNIT_TEST_PYTHON_VERSIONS[0]}.txt", + CURRENT_DIRECTORY / "testing" / f"constraints-{DEFAULT_PYTHON_VERSION}.txt", encoding="utf-8", ) as constraints_file: constraints_text = constraints_file.read() diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 6e366b6854e..6abecd0ac40 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -31,10 +31,8 @@ def api_coverage_df(): reason="Issues with installing sklearn for this test in python 3.13", ) def test_api_coverage_produces_expected_schema(api_coverage_df): - if sys.version.split(".")[:2] == ["3", "9"]: - pytest.skip( - "Python 3.9 uses older pandas without good microsecond timestamp support." - ) + # Older pandas has different timestamp default precision + pytest.importorskip("pandas", minversion="2.0.0") pandas.testing.assert_series_equal( api_coverage_df.dtypes, @@ -56,6 +54,8 @@ def test_api_coverage_produces_expected_schema(api_coverage_df): "release_version": "string", }, ), + # String dtype behavior not consistent across pandas versions + check_dtype=False, ) diff --git a/setup.py b/setup.py index 090b1035364..d393375d142 100644 --- a/setup.py +++ b/setup.py @@ -33,10 +33,10 @@ # 'Development Status :: 5 - Production/Stable' release_status = "Development Status :: 5 - Production/Stable" dependencies = [ - # please keep these in sync with the minimum versions in testing/constraints-3.9.txt + # please keep these in sync with the minimum versions in testing/constraints-3.10.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", - "gcsfs >=2023.3.0, !=2025.5.0", + "gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", "google-cloud-bigquery[bqstorage,pandas] >=3.36.0", @@ -133,7 +133,6 @@ "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: 
Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -149,7 +148,7 @@ "bigframes_vendored": "third_party/bigframes_vendored", }, packages=packages, - python_requires=">=3.9", + python_requires=">=3.10", include_package_data=True, zip_safe=False, ) diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 4810d6461b8..b74ff81063b 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -1,18 +1,39 @@ # When we drop Python 3.9, # please keep these in sync with the minimum versions in setup.py -google-auth==2.27.0 -ipykernel==5.5.6 -ipython==7.34.0 -notebook==6.5.5 -pandas==2.1.4 -pandas-stubs==2.1.4.231227 -portpicker==1.5.2 -requests==2.32.3 -tornado==6.3.3 -absl-py==1.4.0 -debugpy==1.6.6 +cloudpickle==2.0.0 +fsspec==2023.3.0 +gcsfs==2023.3.0 +geopandas==0.12.2 +google-auth==2.15.0 +google-cloud-bigtable==2.24.0 +google-cloud-pubsub==2.21.4 +google-cloud-bigquery==3.36.0 +google-cloud-functions==1.12.0 +google-cloud-bigquery-connection==1.12.0 +google-cloud-iam==2.12.1 +google-cloud-resource-manager==1.10.3 +google-cloud-storage==2.0.0 +grpc-google-iam-v1==0.14.2 +numpy==1.24.0 +pandas==1.5.3 +pandas-gbq==0.26.1 +pyarrow==15.0.2 +pydata-google-auth==1.8.2 +requests==2.27.1 +scikit-learn==1.2.2 +shapely==1.8.5 +tabulate==0.9 +ipywidgets==7.7.1 +humanize==4.6.0 matplotlib==3.7.1 -psutil==5.9.5 -seaborn==0.13.1 -traitlets==5.7.1 -polars==1.21.0 +db-dtypes==1.4.2 +# For vendored ibis-framework. 
+atpublic==2.3 +python-dateutil==2.8.2 +pytz==2022.7 +toolz==0.11 +typing-extensions==4.5.0 +rich==12.4.4 +# For anywidget mode +anywidget>=0.9.18 +traitlets==5.0.0 diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py new file mode 100644 index 00000000000..e318a8a720f --- /dev/null +++ b/tests/system/large/bigquery/test_ai.py @@ -0,0 +1,96 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.bigquery import ai, ml +import bigframes.pandas as bpd + + +@pytest.fixture(scope="session") +def embedding_model(bq_connection, dataset_id): + model_name = f"{dataset_id}.embedding_model" + return ml.create_model( + model_name=model_name, + options={"endpoint": "gemini-embedding-001"}, + connection_name=bq_connection, + ) + + +@pytest.fixture(scope="session") +def text_model(bq_connection, dataset_id): + model_name = f"{dataset_id}.text_model" + return ml.create_model( + model_name=model_name, + options={"endpoint": "gemini-2.5-flash"}, + connection_name=bq_connection, + ) + + +def test_generate_embedding(embedding_model): + df = bpd.DataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + ] + } + ) + + result = ai.generate_embedding(embedding_model, df) + + assert len(result) == 2 + assert "embedding" in result.columns + assert "statistics" in result.columns + assert "status" in result.columns + + +def test_generate_embedding_with_options(embedding_model): + df 
= bpd.DataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + ] + } + ) + + result = ai.generate_embedding( + embedding_model, df, task_type="RETRIEVAL_DOCUMENT", output_dimensionality=256 + ) + + assert len(result) == 2 + embedding = result["embedding"].to_pandas() + assert len(embedding[0]) == 256 + + +def test_generate_text(text_model): + df = bpd.DataFrame({"prompt": ["Dog", "Cat"]}) + + result = ai.generate_text(text_model, df) + + assert len(result) == 2 + assert "result" in result.columns + assert "statistics" in result.columns + assert "full_response" in result.columns + assert "status" in result.columns + + +def test_generate_text_with_options(text_model): + df = bpd.DataFrame({"prompt": ["Dog", "Cat"]}) + + result = ai.generate_text(text_model, df, max_output_tokens=1) + + # It basically asserts that the results are still returned. + assert len(result) == 2 diff --git a/tests/system/large/bigquery/test_io.py b/tests/system/large/bigquery/test_io.py new file mode 100644 index 00000000000..024c6174709 --- /dev/null +++ b/tests/system/large/bigquery/test_io.py @@ -0,0 +1,39 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for for the specific language governing permissions and +# limitations under the License. 
+ +import bigframes.bigquery as bbq + + +def test_load_data(session, dataset_id): + table_name = f"{dataset_id}.test_load_data" + uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" + + # Create the external table + table = bbq.load_data( + table_name, + columns={ + "name": "STRING", + "post_abbr": "STRING", + }, + from_files_options={"format": "CSV", "uris": [uri], "skip_leading_rows": 1}, + session=session, + ) + assert table is not None + + # Read the table to verify + import bigframes.pandas as bpd + + bf_df = bpd.read_gbq(table_name) + pd_df = bf_df.to_pandas() + assert len(pd_df) > 0 diff --git a/tests/system/large/bigquery/test_ml.py b/tests/system/large/bigquery/test_ml.py index 22011199feb..20a62ae2b64 100644 --- a/tests/system/large/bigquery/test_ml.py +++ b/tests/system/large/bigquery/test_ml.py @@ -62,3 +62,30 @@ def test_generate_embedding_with_options(embedding_model): assert "ml_generate_embedding_status" in result.columns embedding = result["ml_generate_embedding_result"].to_pandas() assert len(embedding[0]) == 256 + + +def test_create_model_linear_regression(dataset_id): + df = bpd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) + model_name = f"{dataset_id}.linear_regression_model" + + result = ml.create_model( + model_name=model_name, + options={"model_type": "LINEAR_REG", "input_label_cols": ["y"]}, + training_data=df, + ) + + assert result["modelType"] == "LINEAR_REGRESSION" + + +def test_create_model_with_transform(dataset_id): + df = bpd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) + model_name = f"{dataset_id}.transform_model" + + result = ml.create_model( + model_name=model_name, + options={"model_type": "LINEAR_REG", "input_label_cols": ["y"]}, + training_data=df, + transform=["x * 2 AS x_doubled", "y"], + ) + + assert result["modelType"] == "LINEAR_REGRESSION" diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 7963fabd0b6..6c7d8121005 100644 --- 
a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -26,6 +26,8 @@ from bigframes import dtypes import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + @pytest.fixture(scope="function") def images_output_folder() -> Generator[str, None, None]: diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 102d6083822..c89fb4c6e6e 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -20,6 +20,9 @@ import bigframes import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + + idisplay = pytest.importorskip("IPython.display") diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py index 47d4d2aa04f..f63de38a8ce 100644 --- a/tests/system/small/blob/test_properties.py +++ b/tests/system/small/blob/test_properties.py @@ -13,10 +13,13 @@ # limitations under the License. import pandas as pd +import pytest import bigframes.dtypes as dtypes import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + def test_blob_uri(images_uris: list[str], images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.uri().to_pandas() diff --git a/tests/system/small/blob/test_urls.py b/tests/system/small/blob/test_urls.py index 02a76587f5f..b2dd6604343 100644 --- a/tests/system/small/blob/test_urls.py +++ b/tests/system/small/blob/test_urls.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import pytest + import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + def test_blob_read_url(images_mm_df: bpd.DataFrame): urls = images_mm_df["blob_col"].blob.read_url() diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0f7b782b66d..fa82cce6054 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5754,16 +5754,9 @@ def test_df_dot_operator_series( ) -# TODO(tswast): We may be able to re-enable this test after we break large -# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427 -@pytest.mark.skipif( - sys.version_info >= (3, 12), - # See: https://github.com/python/cpython/issues/112282 - reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", -) def test_recursion_limit(scalars_df_index): scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - for i in range(400): + for i in range(250): scalars_df_index = scalars_df_index + 4 scalars_df_index.to_pandas() @@ -5964,7 +5957,7 @@ def test_resample_with_column( scalars_df_index, scalars_pandas_df_index, on, rule, origin ): # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") + pytest.importorskip("pandas", minversion="2.2.0") bf_result = ( scalars_df_index.resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 579e7cd414d..1d0e05f5ccf 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -123,7 +123,7 @@ def test_dataframe_groupby_rank( scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct ): # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") + pytest.importorskip("pandas", minversion="2.2.0") 
col_names = ["int64_too", "float64_col", "int64_col", "string_col"] bf_result = ( scalars_df_index[col_names] diff --git a/tests/system/small/test_magics.py b/tests/system/small/test_magics.py new file mode 100644 index 00000000000..91ada5b9e34 --- /dev/null +++ b/tests/system/small/test_magics.py @@ -0,0 +1,100 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd + +IPython = pytest.importorskip("IPython") + + +MAGIC_NAME = "bqsql" + + +@pytest.fixture(scope="module") +def ip(): + """Provides a persistent IPython shell instance for the test session.""" + from IPython.testing.globalipapp import get_ipython + + shell = get_ipython() + shell.extension_manager.load_extension("bigframes") + return shell + + +def test_magic_select_lit_to_var(ip): + bigframes.close_session() + + line = "dst_var" + cell_body = "SELECT 3" + + ip.run_cell_magic(MAGIC_NAME, line, cell_body) + + assert "dst_var" in ip.user_ns + result_df = ip.user_ns["dst_var"] + assert result_df.shape == (1, 1) + assert result_df.loc[0, 0] == 3 + + +def test_magic_select_lit_dry_run(ip): + bigframes.close_session() + + line = "dst_var --dry_run" + cell_body = "SELECT 3" + + ip.run_cell_magic(MAGIC_NAME, line, cell_body) + + assert "dst_var" in ip.user_ns + result_df = ip.user_ns["dst_var"] + assert result_df.totalBytesProcessed == 0 + + +def test_magic_select_lit_display(ip): + from 
IPython.utils.capture import capture_output + + bigframes.close_session() + + cell_body = "SELECT 3" + + with capture_output() as io: + ip.run_cell_magic(MAGIC_NAME, "", cell_body) + assert len(io.outputs) > 0 + # Check that the output has data, regardless of the format (html, plain, etc) + available_formats = io.outputs[0].data.keys() + assert len(available_formats) > 0 + + +def test_magic_select_interpolate(ip): + bigframes.close_session() + df = bpd.read_pandas( + pd.DataFrame({"col_a": [1, 2, 3, 4, 5, 6], "col_b": [1, 2, 1, 3, 1, 2]}) + ) + const_val = 1 + + ip.push({"df": df, "const_val": const_val}) + + query = """ + SELECT + SUM(col_a) AS total + FROM + {df} + WHERE col_b={const_val} + """ + + ip.run_cell_magic(MAGIC_NAME, "dst_var", query) + + assert "dst_var" in ip.user_ns + result_df = ip.user_ns["dst_var"] + assert result_df.shape == (1, 1) + assert result_df.loc[0, 0] == 9 diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index a95c9623e52..f5408dc323d 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3885,9 +3885,9 @@ def test_date_time_astype_int( assert bf_result.dtype == "Int64" -def test_string_astype_int(): - pd_series = pd.Series(["4", "-7", "0", " -03"]) - bf_series = series.Series(pd_series) +def test_string_astype_int(session): + pd_series = pd.Series(["4", "-7", "0", "-03"]) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype("Int64") bf_result = bf_series.astype("Int64").to_pandas() @@ -3895,12 +3895,12 @@ def test_string_astype_int(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_float(): +def test_string_astype_float(session): pd_series = pd.Series( - ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] + ["1", "-1", "-0", "000", "-03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] ) - bf_series = series.Series(pd_series) + bf_series = 
series.Series(pd_series, session=session) pd_result = pd_series.astype("Float64") bf_result = bf_series.astype("Float64").to_pandas() @@ -3908,7 +3908,7 @@ def test_string_astype_float(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_date(): +def test_string_astype_date(session): if int(pa.__version__.split(".")[0]) < 15: pytest.skip( "Avoid pyarrow.lib.ArrowNotImplementedError: " @@ -3919,7 +3919,7 @@ def test_string_astype_date(): pd.ArrowDtype(pa.string()) ) - bf_series = series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) # TODO(b/340885567): fix type error pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore @@ -3928,12 +3928,12 @@ def test_string_astype_date(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_datetime(): +def test_string_astype_datetime(session): pd_series = pd.Series( ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"] ).astype(pd.ArrowDtype(pa.string())) - bf_series = series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() @@ -3941,7 +3941,7 @@ def test_string_astype_datetime(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_timestamp(): +def test_string_astype_timestamp(session): pd_series = pd.Series( [ "2014-08-15 08:15:12+00:00", @@ -3950,7 +3950,7 @@ def test_string_astype_timestamp(): ] ).astype(pd.ArrowDtype(pa.string())) - bf_series = series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) bf_result = bf_series.astype( @@ -3960,13 +3960,14 @@ def test_string_astype_timestamp(): pd.testing.assert_series_equal(bf_result, pd_result, 
check_index_type=False) -def test_timestamp_astype_string(): +def test_timestamp_astype_string(session): bf_series = series.Series( [ "2014-08-15 08:15:12+00:00", "2015-08-15 08:15:12.654754+05:00", "2016-02-29 00:00:00+08:00", - ] + ], + session=session, ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) expected_result = pd.Series( @@ -3985,9 +3986,9 @@ def test_timestamp_astype_string(): @pytest.mark.parametrize("errors", ["raise", "null"]) -def test_float_astype_json(errors): +def test_float_astype_json(errors, session): data = ["1.25", "2500000000", None, "-12323.24"] - bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) assert bf_result.dtype == dtypes.JSON_DTYPE @@ -3997,9 +3998,9 @@ def test_float_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) -def test_float_astype_json_str(): +def test_float_astype_json_str(session): data = ["1.25", "2500000000", None, "-12323.24"] - bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE, session=session) bf_result = bf_series.astype("json") assert bf_result.dtype == dtypes.JSON_DTYPE @@ -4010,14 +4011,14 @@ def test_float_astype_json_str(): @pytest.mark.parametrize("errors", ["raise", "null"]) -def test_string_astype_json(errors): +def test_string_astype_json(errors, session): data = [ "1", None, '["1","3","5"]', '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) assert bf_result.dtype == dtypes.JSON_DTYPE @@ -4026,9 +4027,9 @@ def test_string_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) -def test_string_astype_json_in_safe_mode(): 
+def test_string_astype_json_in_safe_mode(session): data = ["this is not a valid json string"] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") assert bf_result.dtype == dtypes.JSON_DTYPE @@ -4037,9 +4038,9 @@ def test_string_astype_json_in_safe_mode(): pd.testing.assert_series_equal(bf_result.to_pandas(), expected) -def test_string_astype_json_raise_error(): +def test_string_astype_json_raise_error(session): data = ["this is not a valid json string"] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE, session=session) with pytest.raises( google.api_core.exceptions.BadRequest, match="syntax error while parsing value", @@ -4063,8 +4064,8 @@ def test_string_astype_json_raise_error(): ), ], ) -def test_json_astype_others(data, to_type, errors): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) +def test_json_astype_others(data, to_type, errors, session): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE, session=session) bf_result = bf_series.astype(to_type, errors=errors) assert bf_result.dtype == to_type @@ -4084,8 +4085,8 @@ def test_json_astype_others(data, to_type, errors): pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), ], ) -def test_json_astype_others_raise_error(data, to_type): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) +def test_json_astype_others_raise_error(data, to_type, session): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE, session=session) with pytest.raises(google.api_core.exceptions.BadRequest): bf_series.astype(to_type, errors="raise").to_pandas() @@ -4099,8 +4100,8 @@ def test_json_astype_others_raise_error(data, to_type): pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), ], ) -def test_json_astype_others_in_safe_mode(data, to_type): - bf_series = 
series.Series(data, dtype=dtypes.JSON_DTYPE) +def test_json_astype_others_in_safe_mode(data, to_type, session): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE, session=session) bf_result = bf_series.astype(to_type, errors="null") assert bf_result.dtype == to_type @@ -4414,8 +4415,8 @@ def test_query_job_setters(scalars_dfs): ([1, 1, 1, 1, 1],), ], ) -def test_is_monotonic_increasing(series_input): - scalars_df = series.Series(series_input, dtype=pd.Int64Dtype()) +def test_is_monotonic_increasing(series_input, session): + scalars_df = series.Series(series_input, dtype=pd.Int64Dtype(), session=session) scalars_pandas_df = pd.Series(series_input, dtype=pd.Int64Dtype()) assert ( scalars_df.is_monotonic_increasing == scalars_pandas_df.is_monotonic_increasing @@ -4433,8 +4434,8 @@ def test_is_monotonic_increasing(series_input): ([1, 1, 1, 1, 1],), ], ) -def test_is_monotonic_decreasing(series_input): - scalars_df = series.Series(series_input) +def test_is_monotonic_decreasing(series_input, session): + scalars_df = series.Series(series_input, session=session) scalars_pandas_df = pd.Series(series_input) assert ( scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing diff --git a/tests/unit/bigquery/_operations/test_io.py b/tests/unit/bigquery/_operations/test_io.py new file mode 100644 index 00000000000..97b38f86495 --- /dev/null +++ b/tests/unit/bigquery/_operations/test_io.py @@ -0,0 +1,41 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pytest + +import bigframes.bigquery._operations.io +import bigframes.core.sql.io +import bigframes.session + + +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + +@mock.patch("bigframes.bigquery._operations.io._get_table_metadata") +def test_load_data(get_table_metadata_mock, mock_session): + bigframes.bigquery._operations.io.load_data( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + session=mock_session, + ) + mock_session.read_gbq_query.assert_called_once() + generated_sql = mock_session.read_gbq_query.call_args[0][0] + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert generated_sql == expected + get_table_metadata_mock.assert_called_once() diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py new file mode 100644 index 00000000000..0be32b9e8a5 --- /dev/null +++ b/tests/unit/bigquery/test_ai.py @@ -0,0 +1,244 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +import pandas as pd +import pytest + +import bigframes.bigquery as bbq +import bigframes.dataframe +import bigframes.series +import bigframes.session + + +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + +@pytest.fixture +def mock_dataframe(mock_session): + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT * FROM my_table" + df._to_sql_query.return_value = ("SELECT * FROM my_table", None, None) + return df + + +@pytest.fixture +def mock_embedding_series(mock_session): + series = mock.create_autospec(spec=bigframes.series.Series) + series._session = mock_session + # Mock to_frame to return a mock dataframe + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT my_col AS content FROM my_table" + df._to_sql_query.return_value = ( + "SELECT my_col AS content FROM my_table", + None, + None, + ) + series.copy.return_value = series + series.to_frame.return_value = df + return series + + +@pytest.fixture +def mock_text_series(mock_session): + series = mock.create_autospec(spec=bigframes.series.Series) + series._session = mock_session + # Mock to_frame to return a mock dataframe + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT my_col AS prompt FROM my_table" + df._to_sql_query.return_value = ( + "SELECT my_col AS prompt FROM my_table", + None, + None, + ) + series.copy.return_value = series + series.to_frame.return_value = df + return series + + +def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_embedding( + model_name, + mock_dataframe, + output_dimensionality=256, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + + # Normalize whitespace for comparison + query = " 
".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_EMBEDDING(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT(256 AS OUTPUT_DIMENSIONALITY)" + + assert expected_part_1 in query + assert expected_part_2 in query + assert expected_part_3 in query + assert expected_part_4 in query + + +def test_generate_embedding_with_series(mock_embedding_series, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_embedding( + model_name, + mock_embedding_series, + start_second=0.0, + end_second=10.0, + interval_seconds=5.0, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "(SELECT my_col AS content FROM my_table)" in query + assert ( + "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)" + in query + ) + + +def test_generate_embedding_defaults(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_embedding( + model_name, + mock_dataframe, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "STRUCT()" in query + + +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_embedding_with_pandas_dataframe( + read_pandas_mock, mock_dataframe, mock_session +): + # This tests that pandas input path works and calls read_pandas + model_name = "project.dataset.model" + + # Mock return value of read_pandas to be a BigFrames DataFrame + read_pandas_mock.return_value = mock_dataframe + + pandas_df = pd.DataFrame({"content": ["test"]}) + + bbq.ai.generate_embedding( + model_name, + pandas_df, + ) + + read_pandas_mock.assert_called_once() + # Check that read_pandas was called with something (the pandas df) + assert 
read_pandas_mock.call_args[0][0] is pandas_df + + mock_session.read_gbq_query.assert_called_once() + + +def test_generate_text_with_dataframe(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_text( + model_name, + mock_dataframe, + max_output_tokens=256, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + + # Normalize whitespace for comparison + query = " ".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_TEXT(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT(256 AS MAX_OUTPUT_TOKENS)" + + assert expected_part_1 in query + assert expected_part_2 in query + assert expected_part_3 in query + assert expected_part_4 in query + + +def test_generate_text_with_series(mock_text_series, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_text( + model_name, + mock_text_series, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "(SELECT my_col AS prompt FROM my_table)" in query + + +def test_generate_text_defaults(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_text( + model_name, + mock_dataframe, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "STRUCT()" in query + + +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_text_with_pandas_dataframe( + read_pandas_mock, mock_dataframe, mock_session +): + # This tests that pandas input path works and calls read_pandas + model_name = "project.dataset.model" + + # Mock return value of read_pandas to be a BigFrames DataFrame + read_pandas_mock.return_value = mock_dataframe + 
+ pandas_df = pd.DataFrame({"content": ["test"]}) + + bbq.ai.generate_text( + model_name, + pandas_df, + ) + + read_pandas_mock.assert_called_once() + # Check that read_pandas was called with something (the pandas df) + assert read_pandas_mock.call_args[0][0] is pandas_df + + mock_session.read_gbq_query.assert_called_once() diff --git a/tests/unit/bigquery/test_ml.py b/tests/unit/bigquery/test_ml.py index fd774691528..e5c957767b9 100644 --- a/tests/unit/bigquery/test_ml.py +++ b/tests/unit/bigquery/test_ml.py @@ -40,31 +40,6 @@ def mock_session(): MODEL_NAME = "test-project.test-dataset.test-model" -def test_get_model_name_and_session_with_pandas_series_model_input(): - model_name, _ = ml_ops._get_model_name_and_session(MODEL_SERIES) - assert model_name == MODEL_NAME - - -def test_get_model_name_and_session_with_pandas_series_model_input_missing_model_reference(): - model_series = pd.Series({"some_other_key": "value"}) - with pytest.raises( - ValueError, match="modelReference must be present in the pandas Series" - ): - ml_ops._get_model_name_and_session(model_series) - - -@mock.patch("bigframes.pandas.read_pandas") -def test_to_sql_with_pandas_dataframe(read_pandas_mock): - df = pd.DataFrame({"col1": [1, 2, 3]}) - read_pandas_mock.return_value._to_sql_query.return_value = ( - "SELECT * FROM `pandas_df`", - [], - [], - ) - ml_ops._to_sql(df) - read_pandas_mock.assert_called_once() - - @mock.patch("bigframes.bigquery._operations.ml._get_model_metadata") @mock.patch("bigframes.pandas.read_pandas") def test_create_model_with_pandas_dataframe( diff --git a/tests/unit/core/sql/test_io.py b/tests/unit/core/sql/test_io.py new file mode 100644 index 00000000000..23e5f796e31 --- /dev/null +++ b/tests/unit/core/sql/test_io.py @@ -0,0 +1,90 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.core.sql.io + + +def test_load_data_ddl(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_overwrite(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + write_disposition="OVERWRITE", + columns={"col1": "INT64", "col2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA OVERWRITE my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_with_partition_columns(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + with_partition_columns={"part1": "DATE", "part2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*']) WITH PARTITION COLUMNS (part1 DATE, part2 STRING)" + assert sql == expected + + +def test_load_data_ddl_connection(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, +
connection_name="my-connection", + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*']) WITH CONNECTION `my-connection`" + assert sql == expected + + +def test_load_data_ddl_partition_by(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + partition_by=["date_col"], + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) PARTITION BY date_col FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_cluster_by(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + cluster_by=["cluster_col"], + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) CLUSTER BY cluster_col FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_table_options(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + table_options={"description": "my table"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (description = 'my table') FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected diff --git a/tests/unit/test_col.py b/tests/unit/test_col.py new file mode 100644 index 00000000000..e01c25ddd2c --- /dev/null +++ b/tests/unit/test_col.py @@ -0,0 +1,160 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 
(the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +import pathlib +from typing import Generator + +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd +from bigframes.testing.utils import assert_frame_equal, convert_pandas_dtypes + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="3.0.0") + + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture(scope="module") +def scalars_pandas_df_index() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + df.index.name = None + return df.set_index("rowindex").sort_index() + + +@pytest.fixture(scope="module") +def scalars_df_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_df_2_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + 
+@pytest.fixture(scope="module") +def scalars_dfs( + scalars_df_index, + scalars_pandas_df_index, +): + return scalars_df_index, scalars_pandas_df_index + + +@pytest.mark.parametrize( + ("op",), + [ + (operator.invert,), + ], +) +def test_pd_col_unary_operators(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("float64_col")), + } + pd_kwargs = { + "result": op(pd.col("float64_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("op",), + [ + (operator.add,), + (operator.sub,), + (operator.mul,), + (operator.truediv,), + (operator.floordiv,), + (operator.gt,), + (operator.lt,), + (operator.ge,), + (operator.le,), + (operator.eq,), + (operator.mod,), + ], +) +def test_pd_col_binary_operators(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("float64_col"), 2.4), + "reverse_result": op(2.4, bpd.col("float64_col")), + } + pd_kwargs = { + "result": op(pd.col("float64_col"), 2.4), # type: ignore + "reverse_result": op(2.4, pd.col("float64_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("op",), + [ + (operator.and_,), + (operator.or_,), + (operator.xor,), + ], +) +def test_pd_col_binary_bool_operators(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("bool_col"), True), + "reverse_result": op(False, bpd.col("bool_col")), + } + pd_kwargs = { + "result": op(pd.col("bool_col"), True), # type: ignore + "reverse_result": op(False, pd.col("bool_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = 
scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index 1c73d9dc6b0..263fc82e3e5 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -828,6 +828,26 @@ def test_assign_new_column(scalars_dfs): assert_frame_equal(bf_result, pd_result) +def test_assign_using_pd_col(scalars_dfs): + if pd.__version__.startswith("1.") or pd.__version__.startswith("2."): + pytest.skip("col expression interface only supported for pandas 3+") + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "new_col_1": 4 - bpd.col("int64_col"), + "new_col_2": bpd.col("int64_col") / (bpd.col("float64_col") * 0.5), + } + pd_kwargs = { + "new_col_1": 4 - pd.col("int64_col"), # type: ignore + "new_col_2": pd.col("int64_col") / (pd.col("float64_col") * 0.5), # type: ignore + } + + df = scalars_df.assign(**bf_kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + def test_assign_new_column_w_loc(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() @@ -4450,3 +4470,10 @@ def test_dataframe_explode_reserve_order(session, ignore_index, ordered): def test_dataframe_explode_xfail(col_names): df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) df.explode(col_names) + + +def test_recursion_limit_unit(scalars_df_index): + scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + for i in range(250): + scalars_df_index = scalars_df_index + 4 + scalars_df_index.to_pandas() diff --git a/third_party/bigframes_vendored/pandas/core/col.py b/third_party/bigframes_vendored/pandas/core/col.py new file mode 100644 index 00000000000..9b71293a7e3 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/col.py @@ -0,0 +1,36 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/col.py +from 
__future__ import annotations + +from collections.abc import Hashable + +from bigframes import constants + + +class Expression: + """ + Class representing a deferred column. + + This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`. + """ + + +def col(col_name: Hashable) -> Expression: + """ + Generate deferred object representing a column of a DataFrame. + + Any place which accepts ``lambda df: df[col_name]``, such as + :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept + ``pd.col(col_name)``. + + Args: + col_name (Hashable): + Column name. + + Returns: + Expression: + A deferred object representing a column of a DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +__all__ = ["Expression", "col"] diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index a6862ee201c..c5b120dc239 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.34.0" +__version__ = "2.35.0" # {x-release-please-start-date} -__release_date__ = "2026-02-02" +__release_date__ = "2026-02-07" # {x-release-please-end}