From 4b0f13b2fe10fa5b07d3ca3b7cb1ae1cb95030c7 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 3 Feb 2026 14:22:56 -0800 Subject: [PATCH 01/18] feat: add bigframe.bigquery.load_data function (#2426) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/bigquery/__init__.py | 5 ++ bigframes/bigquery/_operations/io.py | 94 ++++++++++++++++++++++ bigframes/core/sql/io.py | 87 ++++++++++++++++++++ tests/system/large/bigquery/test_io.py | 39 +++++++++ tests/unit/bigquery/_operations/test_io.py | 41 ++++++++++ tests/unit/core/sql/test_io.py | 90 +++++++++++++++++++++ 6 files changed, 356 insertions(+) create mode 100644 bigframes/bigquery/_operations/io.py create mode 100644 bigframes/core/sql/io.py create mode 100644 tests/system/large/bigquery/test_io.py create mode 100644 tests/unit/bigquery/_operations/test_io.py create mode 100644 tests/unit/core/sql/test_io.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 150fe5efc0..e02e80cd1f 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -43,6 +43,7 @@ st_regionstats, st_simplify, ) +from bigframes.bigquery._operations.io import load_data from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -107,6 +108,8 @@ struct, # table ops create_external_table, + # io ops + load_data, ] _module = 
sys.modules[__name__] @@ -160,6 +163,8 @@ "struct", # table ops "create_external_table", + # io ops + "load_data", # Modules / SQL namespaces "ai", "ml", diff --git a/bigframes/bigquery/_operations/io.py b/bigframes/bigquery/_operations/io.py new file mode 100644 index 0000000000..daf28e6aed --- /dev/null +++ b/bigframes/bigquery/_operations/io.py @@ -0,0 +1,94 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import Mapping, Optional, Union + +import pandas as pd + +from bigframes.bigquery._operations.table import _get_table_metadata +import bigframes.core.logging.log_adapter as log_adapter +import bigframes.core.sql.io +import bigframes.session + + +@log_adapter.method_logger(custom_base_name="bigquery_io") +def load_data( + table_name: str, + *, + write_disposition: str = "INTO", + columns: Optional[Mapping[str, str]] = None, + partition_by: Optional[list[str]] = None, + cluster_by: Optional[list[str]] = None, + table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None, + from_files_options: Mapping[str, Union[str, int, float, bool, list]], + with_partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, + session: Optional[bigframes.session.Session] = None, +) -> pd.Series: + """ + Loads data into a BigQuery table. + See the `BigQuery LOAD DATA DDL syntax + `_ + for additional reference. 
+ Args: + table_name (str): + The name of the table in BigQuery. + write_disposition (str, default "INTO"): + Whether to replace the table if it already exists ("OVERWRITE") or append to it ("INTO"). + columns (Mapping[str, str], optional): + The table's schema. + partition_by (list[str], optional): + A list of partition expressions to partition the table by. See https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/load-statements#partition_expression. + cluster_by (list[str], optional): + A list of columns to cluster the table by. + table_options (Mapping[str, Union[str, int, float, bool, list]], optional): + The table options. + from_files_options (Mapping[str, Union[str, int, float, bool, list]]): + The options for loading data from files. + with_partition_columns (Mapping[str, str], optional): + The table's partition columns. + connection_name (str, optional): + The connection to use for the table. + session (bigframes.session.Session, optional): + The session to use. If not provided, the default session is used. + Returns: + pandas.Series: + A Series with object dtype containing the table metadata. Reference + the `BigQuery Table REST API reference + `_ + for available fields. 
+ """ + import bigframes.pandas as bpd + + sql = bigframes.core.sql.io.load_data_ddl( + table_name=table_name, + write_disposition=write_disposition, + columns=columns, + partition_by=partition_by, + cluster_by=cluster_by, + table_options=table_options, + from_files_options=from_files_options, + with_partition_columns=with_partition_columns, + connection_name=connection_name, + ) + + if session is None: + bpd.read_gbq_query(sql) + session = bpd.get_global_session() + else: + session.read_gbq_query(sql) + + return _get_table_metadata(bqclient=session.bqclient, table_name=table_name) diff --git a/bigframes/core/sql/io.py b/bigframes/core/sql/io.py new file mode 100644 index 0000000000..9e1a549a64 --- /dev/null +++ b/bigframes/core/sql/io.py @@ -0,0 +1,87 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import Mapping, Optional, Union + + +def load_data_ddl( + table_name: str, + *, + write_disposition: str = "INTO", + columns: Optional[Mapping[str, str]] = None, + partition_by: Optional[list[str]] = None, + cluster_by: Optional[list[str]] = None, + table_options: Optional[Mapping[str, Union[str, int, float, bool, list]]] = None, + from_files_options: Mapping[str, Union[str, int, float, bool, list]], + with_partition_columns: Optional[Mapping[str, str]] = None, + connection_name: Optional[str] = None, +) -> str: + """Generates the LOAD DATA DDL statement.""" + statement = ["LOAD DATA"] + statement.append(write_disposition) + statement.append(table_name) + + if columns: + column_defs = ", ".join([f"{name} {typ}" for name, typ in columns.items()]) + statement.append(f"({column_defs})") + + if partition_by: + statement.append(f"PARTITION BY {', '.join(partition_by)}") + + if cluster_by: + statement.append(f"CLUSTER BY {', '.join(cluster_by)}") + + if table_options: + opts = [] + for key, value in table_options.items(): + if isinstance(value, str): + value_sql = repr(value) + opts.append(f"{key} = {value_sql}") + elif isinstance(value, bool): + opts.append(f"{key} = {str(value).upper()}") + elif isinstance(value, list): + list_str = ", ".join([repr(v) for v in value]) + opts.append(f"{key} = [{list_str}]") + else: + opts.append(f"{key} = {value}") + options_str = ", ".join(opts) + statement.append(f"OPTIONS ({options_str})") + + opts = [] + for key, value in from_files_options.items(): + if isinstance(value, str): + value_sql = repr(value) + opts.append(f"{key} = {value_sql}") + elif isinstance(value, bool): + opts.append(f"{key} = {str(value).upper()}") + elif isinstance(value, list): + list_str = ", ".join([repr(v) for v in value]) + opts.append(f"{key} = [{list_str}]") + else: + opts.append(f"{key} = {value}") + options_str = ", ".join(opts) + statement.append(f"FROM FILES ({options_str})") + + if 
with_partition_columns: + part_defs = ", ".join( + [f"{name} {typ}" for name, typ in with_partition_columns.items()] + ) + statement.append(f"WITH PARTITION COLUMNS ({part_defs})") + + if connection_name: + statement.append(f"WITH CONNECTION `{connection_name}`") + + return " ".join(statement) diff --git a/tests/system/large/bigquery/test_io.py b/tests/system/large/bigquery/test_io.py new file mode 100644 index 0000000000..024c617470 --- /dev/null +++ b/tests/system/large/bigquery/test_io.py @@ -0,0 +1,39 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import bigframes.bigquery as bbq + + +def test_load_data(session, dataset_id): + table_name = f"{dataset_id}.test_load_data" + uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv" + + # Create the external table + table = bbq.load_data( + table_name, + columns={ + "name": "STRING", + "post_abbr": "STRING", + }, + from_files_options={"format": "CSV", "uris": [uri], "skip_leading_rows": 1}, + session=session, + ) + assert table is not None + + # Read the table to verify + import bigframes.pandas as bpd + + bf_df = bpd.read_gbq(table_name) + pd_df = bf_df.to_pandas() + assert len(pd_df) > 0 diff --git a/tests/unit/bigquery/_operations/test_io.py b/tests/unit/bigquery/_operations/test_io.py new file mode 100644 index 0000000000..97b38f8649 --- /dev/null +++ b/tests/unit/bigquery/_operations/test_io.py @@ -0,0 +1,41 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +import pytest + +import bigframes.bigquery._operations.io +import bigframes.core.sql.io +import bigframes.session + + +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + +@mock.patch("bigframes.bigquery._operations.io._get_table_metadata") +def test_load_data(get_table_metadata_mock, mock_session): + bigframes.bigquery._operations.io.load_data( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + session=mock_session, + ) + mock_session.read_gbq_query.assert_called_once() + generated_sql = mock_session.read_gbq_query.call_args[0][0] + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert generated_sql == expected + get_table_metadata_mock.assert_called_once() diff --git a/tests/unit/core/sql/test_io.py b/tests/unit/core/sql/test_io.py new file mode 100644 index 0000000000..23e5f796e3 --- /dev/null +++ b/tests/unit/core/sql/test_io.py @@ -0,0 +1,90 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import bigframes.core.sql.io + + +def test_load_data_ddl(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_overwrite(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + write_disposition="OVERWRITE", + columns={"col1": "INT64", "col2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA OVERWRITE my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_with_partition_columns(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + with_partition_columns={"part1": "DATE", "part2": "STRING"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*']) WITH PARTITION COLUMNS (part1 DATE, part2 STRING)" + assert sql == expected + + +def test_load_data_ddl_connection(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + connection_name="my-connection", + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) FROM FILES (format = 'CSV', uris = ['gs://bucket/path*']) WITH CONNECTION `my-connection`" + assert sql == expected + + +def test_load_data_ddl_partition_by(): + sql = bigframes.core.sql.io.load_data_ddl( 
+ "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + partition_by=["date_col"], + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) PARTITION BY date_col FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_cluster_by(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + cluster_by=["cluster_col"], + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) CLUSTER BY cluster_col FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected + + +def test_load_data_ddl_table_options(): + sql = bigframes.core.sql.io.load_data_ddl( + "my-project.my_dataset.my_table", + columns={"col1": "INT64", "col2": "STRING"}, + table_options={"description": "my table"}, + from_files_options={"format": "CSV", "uris": ["gs://bucket/path*"]}, + ) + expected = "LOAD DATA INTO my-project.my_dataset.my_table (col1 INT64, col2 STRING) OPTIONS (description = 'my table') FROM FILES (format = 'CSV', uris = ['gs://bucket/path*'])" + assert sql == expected From e91536c8a5b2d8d896767510ced80c6fd2a68a97 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:22:14 -0600 Subject: [PATCH 02/18] feat: add `bigframes.bigquery.ai.generate_embedding` (#2343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement AI.GENERATE_EMBEDDING function in bigframes.bigquery.ai. 
--- *PR created automatically by Jules for task [11924477578091076513](https://jules.google.com/task/11924477578091076513) started by @tswast* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Tim Sweña Co-authored-by: Shenyang Cai --- bigframes/bigquery/_operations/ai.py | 110 +++++++++++++++++++++- bigframes/bigquery/ai.py | 2 + bigframes/core/sql/literals.py | 58 ++++++++++++ bigframes/core/sql/ml.py | 34 +------ tests/unit/bigquery/test_ai.py | 134 +++++++++++++++++++++++++++ 5 files changed, 305 insertions(+), 33 deletions(-) create mode 100644 bigframes/core/sql/literals.py create mode 100644 tests/unit/bigquery/test_ai.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index fd7dafe95f..17af5dd5cf 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -19,7 +19,7 @@ from __future__ import annotations import json -from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union +from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union import pandas as pd @@ -28,6 +28,7 @@ from bigframes import series, session from bigframes.core import convert from bigframes.core.logging import log_adapter +import bigframes.core.sql.literals from bigframes.ml import core as ml_core from bigframes.operations import ai_ops, output_schemas @@ -388,6 +389,113 @@ def generate_double( return series_list[0]._apply_nary_op(operator, series_list[1:]) +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_embedding( + model_name: str, + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + *, + output_dimensionality: Optional[int] = None, + task_type: Optional[str] = None, + start_second: Optional[float] = None, + end_second: Optional[float] = None, + interval_seconds: Optional[float] = None, + trial_id: Optional[int] = None, +) -> dataframe.DataFrame: + 
""" + Creates embeddings that describe an entity—for example, a piece of text or an image. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]}) + >>> bbq.ai.generate_embedding( + ... "project.dataset.model_name", + ... df + ... ) # doctest: +SKIP + + Args: + model_name (str): + The name of a remote model from Vertex AI, such as the + multimodalembedding@001 model. + data (bigframes.pandas.DataFrame or bigframes.pandas.Series): + The data to generate embeddings for. If a Series is provided, it is + treated as the 'content' column. If a DataFrame is provided, it + must contain a 'content' column, or you must rename the column you + wish to embed to 'content'. + output_dimensionality (int, optional): + An INT64 value that specifies the number of dimensions to use when + generating embeddings. For example, if you specify 256 AS + output_dimensionality, then the embedding output column contains a + 256-dimensional embedding for each input value. To find the + supported range of output dimensions, read about the available + `Google text embedding models `_. + task_type (str, optional): + A STRING literal that specifies the intended downstream application to + help the model produce better quality embeddings. For a list of + supported task types and how to choose which one to use, see `Choose an + embeddings task type `_. + start_second (float, optional): + The second in the video at which to start the embedding. The default value is 0. + end_second (float, optional): + The second in the video at which to end the embedding. The default value is 120. + interval_seconds (float, optional): + The interval to use when creating embeddings. The default value is 16. + trial_id (int, optional): + An INT64 value that identifies the hyperparameter tuning trial that + you want the function to evaluate. The function uses the optimal + trial by default. 
Only specify this argument if you ran + hyperparameter tuning when creating the model. + + Returns: + bigframes.pandas.DataFrame: + A new DataFrame with the generated embeddings. See the `SQL + reference for AI.GENERATE_EMBEDDING + `_ + for details. + """ + if isinstance(data, (pd.DataFrame, pd.Series)): + data = bpd.read_pandas(data) + + if isinstance(data, series.Series): + data = data.copy() + data.name = "content" + data_df = data.to_frame() + elif isinstance(data, dataframe.DataFrame): + data_df = data + else: + raise ValueError(f"Unsupported data type: {type(data)}") + + # We need to get the SQL for the input data to pass as a subquery to the TVF + source_sql = data_df.sql + + struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {} + if output_dimensionality is not None: + struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality + if task_type is not None: + struct_fields["TASK_TYPE"] = task_type + if start_second is not None: + struct_fields["START_SECOND"] = start_second + if end_second is not None: + struct_fields["END_SECOND"] = end_second + if interval_seconds is not None: + struct_fields["INTERVAL_SECONDS"] = interval_seconds + if trial_id is not None: + struct_fields["TRIAL_ID"] = trial_id + + # Construct the TVF query + query = f""" + SELECT * + FROM AI.GENERATE_EMBEDDING( + MODEL `{model_name}`, + ({source_sql}), + {bigframes.core.sql.literals.struct_literal(struct_fields)} + ) + """ + + return data_df._session.read_gbq(query) + + @log_adapter.method_logger(custom_base_name="bigquery_ai") def if_( prompt: PROMPT_TYPE, diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index 3af52205a6..b0d9b62f9b 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -22,6 +22,7 @@ generate, generate_bool, generate_double, + generate_embedding, generate_int, if_, score, @@ -33,6 +34,7 @@ "generate", "generate_bool", "generate_double", + "generate_embedding", "generate_int", "if_", "score", diff --git 
a/bigframes/core/sql/literals.py b/bigframes/core/sql/literals.py new file mode 100644 index 0000000000..59c8197731 --- /dev/null +++ b/bigframes/core/sql/literals.py @@ -0,0 +1,58 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import collections.abc +import json +from typing import Any, List, Mapping, Union + +import bigframes.core.sql + +STRUCT_VALUES = Union[ + str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any] +] +STRUCT_TYPE = Mapping[str, STRUCT_VALUES] + + +def struct_literal(struct_options: STRUCT_TYPE) -> str: + rendered_options = [] + for option_name, option_value in struct_options.items(): + if option_name == "model_params": + json_str = json.dumps(option_value) + # Escape single quotes for SQL string literal + sql_json_str = json_str.replace("'", "''") + rendered_val = f"JSON'{sql_json_str}'" + elif isinstance(option_value, collections.abc.Mapping): + struct_body = ", ".join( + [ + f"{bigframes.core.sql.simple_literal(v)} AS {k}" + for k, v in option_value.items() + ] + ) + rendered_val = f"STRUCT({struct_body})" + elif isinstance(option_value, list): + rendered_val = ( + "[" + + ", ".join( + [bigframes.core.sql.simple_literal(v) for v in option_value] + ) + + "]" + ) + elif isinstance(option_value, bool): + rendered_val = str(option_value).lower() + else: + rendered_val = bigframes.core.sql.simple_literal(option_value) + rendered_options.append(f"{rendered_val} AS 
{option_name}") + return f"STRUCT({', '.join(rendered_options)})" diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index d77c5aa4a0..a2a4d32ae8 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -14,12 +14,11 @@ from __future__ import annotations -import collections.abc -import json from typing import Any, Dict, List, Mapping, Optional, Union import bigframes.core.compile.googlesql as googlesql import bigframes.core.sql +import bigframes.core.sql.literals def create_model_ddl( @@ -109,36 +108,7 @@ def _build_struct_sql( ) -> str: if not struct_options: return "" - - rendered_options = [] - for option_name, option_value in struct_options.items(): - if option_name == "model_params": - json_str = json.dumps(option_value) - # Escape single quotes for SQL string literal - sql_json_str = json_str.replace("'", "''") - rendered_val = f"JSON'{sql_json_str}'" - elif isinstance(option_value, collections.abc.Mapping): - struct_body = ", ".join( - [ - f"{bigframes.core.sql.simple_literal(v)} AS {k}" - for k, v in option_value.items() - ] - ) - rendered_val = f"STRUCT({struct_body})" - elif isinstance(option_value, list): - rendered_val = ( - "[" - + ", ".join( - [bigframes.core.sql.simple_literal(v) for v in option_value] - ) - + "]" - ) - elif isinstance(option_value, bool): - rendered_val = str(option_value).lower() - else: - rendered_val = bigframes.core.sql.simple_literal(option_value) - rendered_options.append(f"{rendered_val} AS {option_name}") - return f", STRUCT({', '.join(rendered_options)})" + return f", {bigframes.core.sql.literals.struct_literal(struct_options)}" def evaluate( diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py new file mode 100644 index 0000000000..0f9df6cc26 --- /dev/null +++ b/tests/unit/bigquery/test_ai.py @@ -0,0 +1,134 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pandas as pd +import pytest + +import bigframes.bigquery as bbq +import bigframes.dataframe +import bigframes.series +import bigframes.session + + +@pytest.fixture +def mock_session(): + return mock.create_autospec(spec=bigframes.session.Session) + + +@pytest.fixture +def mock_dataframe(mock_session): + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT * FROM my_table" + return df + + +@pytest.fixture +def mock_series(mock_session): + series = mock.create_autospec(spec=bigframes.series.Series) + series._session = mock_session + # Mock to_frame to return a mock dataframe + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT my_col AS content FROM my_table" + series.copy.return_value = series + series.to_frame.return_value = df + return series + + +def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_embedding( + model_name, + mock_dataframe, + output_dimensionality=256, + ) + + mock_session.read_gbq.assert_called_once() + query = mock_session.read_gbq.call_args[0][0] + + # Normalize whitespace for comparison + query = " ".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_EMBEDDING(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT(256 AS OUTPUT_DIMENSIONALITY)" + + assert expected_part_1 in query + assert expected_part_2 in 
query + assert expected_part_3 in query + assert expected_part_4 in query + + +def test_generate_embedding_with_series(mock_series, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_embedding( + model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0 + ) + + mock_session.read_gbq.assert_called_once() + query = mock_session.read_gbq.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "(SELECT my_col AS content FROM my_table)" in query + assert ( + "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)" + in query + ) + + +def test_generate_embedding_defaults(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_embedding( + model_name, + mock_dataframe, + ) + + mock_session.read_gbq.assert_called_once() + query = mock_session.read_gbq.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "STRUCT()" in query + + +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_embedding_with_pandas_dataframe( + read_pandas_mock, mock_dataframe, mock_session +): + # This tests that pandas input path works and calls read_pandas + model_name = "project.dataset.model" + + # Mock return value of read_pandas to be a BigFrames DataFrame + read_pandas_mock.return_value = mock_dataframe + + pandas_df = pd.DataFrame({"content": ["test"]}) + + bbq.ai.generate_embedding( + model_name, + pandas_df, + ) + + read_pandas_mock.assert_called_once() + # Check that read_pandas was called with something (the pandas df) + assert read_pandas_mock.call_args[0][0] is pandas_df + + mock_session.read_gbq.assert_called_once() From 301272474f036aedeb74a5a8cb3091544265d659 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 4 Feb 2026 15:59:06 -0800 Subject: [PATCH 03/18] test: skip blob test (#2431) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip all 
blob tests Fixes #<481790217> 🦕 --- tests/system/large/blob/test_function.py | 2 ++ tests/system/small/blob/test_io.py | 3 +++ tests/system/small/blob/test_properties.py | 3 +++ tests/system/small/blob/test_urls.py | 4 ++++ 4 files changed, 12 insertions(+) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 7963fabd0b..6c7d812100 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -26,6 +26,8 @@ from bigframes import dtypes import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + @pytest.fixture(scope="function") def images_output_folder() -> Generator[str, None, None]: diff --git a/tests/system/small/blob/test_io.py b/tests/system/small/blob/test_io.py index 102d608382..c89fb4c6e6 100644 --- a/tests/system/small/blob/test_io.py +++ b/tests/system/small/blob/test_io.py @@ -20,6 +20,9 @@ import bigframes import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + + idisplay = pytest.importorskip("IPython.display") diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py index 47d4d2aa04..f63de38a8c 100644 --- a/tests/system/small/blob/test_properties.py +++ b/tests/system/small/blob/test_properties.py @@ -13,10 +13,13 @@ # limitations under the License. 
import pandas as pd +import pytest import bigframes.dtypes as dtypes import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + def test_blob_uri(images_uris: list[str], images_mm_df: bpd.DataFrame): actual = images_mm_df["blob_col"].blob.uri().to_pandas() diff --git a/tests/system/small/blob/test_urls.py b/tests/system/small/blob/test_urls.py index 02a76587f5..b2dd660434 100644 --- a/tests/system/small/blob/test_urls.py +++ b/tests/system/small/blob/test_urls.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest + import bigframes.pandas as bpd +pytest.skip("Skipping blob tests due to b/481790217", allow_module_level=True) + def test_blob_read_url(images_mm_df: bpd.DataFrame): urls = images_mm_df["blob_col"].blob.read_url() From 318752a612b3323aab46ea977fea6d4c02289596 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 4 Feb 2026 16:59:18 -0800 Subject: [PATCH 04/18] test: add system tests for bigquery.ml.create_model (#2432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/large/bigquery/test_ml.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/system/large/bigquery/test_ml.py b/tests/system/large/bigquery/test_ml.py index 22011199fe..20a62ae2b6 100644 --- a/tests/system/large/bigquery/test_ml.py +++ b/tests/system/large/bigquery/test_ml.py @@ -62,3 +62,30 @@ def test_generate_embedding_with_options(embedding_model): assert "ml_generate_embedding_status" in result.columns embedding = result["ml_generate_embedding_result"].to_pandas() assert len(embedding[0]) == 256 + + +def test_create_model_linear_regression(dataset_id): + df = bpd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) + model_name = f"{dataset_id}.linear_regression_model" + + result = ml.create_model( + model_name=model_name, + options={"model_type": "LINEAR_REG", "input_label_cols": ["y"]}, + training_data=df, + ) + + assert result["modelType"] == "LINEAR_REGRESSION" + + +def test_create_model_with_transform(dataset_id): + df = bpd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) + model_name = f"{dataset_id}.transform_model" + + result = ml.create_model( + model_name=model_name, + options={"model_type": "LINEAR_REG", "input_label_cols": ["y"]}, + training_data=df, + transform=["x * 2 AS x_doubled", "y"], + ) + + assert result["modelType"] == "LINEAR_REGRESSION" From e6de52ded6c5091275a936dec36f01a6cf701233 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 5 Feb 2026 07:33:47 -0800 Subject: [PATCH 05/18] feat: Add a bigframes cell magic for ipython (#2395) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/__init__.py | 14 +++++ bigframes/_magics.py | 56 +++++++++++++++++ bigframes/pandas/io/api.py | 9 ++- tests/system/small/test_magics.py | 100 ++++++++++++++++++++++++++++++ 4 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 bigframes/_magics.py create mode 100644 tests/system/small/test_magics.py diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 240608ebc2..419899a4ae 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -22,6 +22,20 @@ from bigframes.session import connect, Session from bigframes.version import __version__ +_MAGIC_NAMES = ["bqsql"] + + +def load_ipython_extension(ipython): + """Called by IPython when this module is loaded as an IPython extension.""" + # Requires IPython to be installed for import to succeed + from bigframes._magics import _cell_magic + + for magic_name in _MAGIC_NAMES: + ipython.register_magic_function( + _cell_magic, magic_kind="cell", magic_name=magic_name + ) + + __all__ = [ "options", "BigQueryOptions", diff --git a/bigframes/_magics.py b/bigframes/_magics.py new file mode 100644 index 0000000000..33205e3a37 --- /dev/null +++ b/bigframes/_magics.py @@ -0,0 +1,56 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from IPython.core import magic_arguments # type: ignore +from IPython.core.getipython import get_ipython +from IPython.display import display + +import bigframes.pandas + + +@magic_arguments.magic_arguments() +@magic_arguments.argument( + "destination_var", + nargs="?", + help=("If provided, save the output to this variable instead of displaying it."), +) +@magic_arguments.argument( + "--dry_run", + action="store_true", + default=False, + help=( + "Sets query to be a dry run to estimate costs. " + "Defaults to executing the query instead of dry run if this argument is not used." + "Does not work with engine 'bigframes'. 
" + ), +) +def _cell_magic(line, cell): + ipython = get_ipython() + args = magic_arguments.parse_argstring(_cell_magic, line) + if not cell: + print("Query is missing.") + return + pyformat_args = ipython.user_ns + dataframe = bigframes.pandas._read_gbq_colab( + cell, pyformat_args=pyformat_args, dry_run=args.dry_run + ) + if args.destination_var: + ipython.push({args.destination_var: dataframe}) + else: + with bigframes.option_context( + "display.repr_mode", + "anywidget", + ): + display(dataframe) + return diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 483bc5e530..b8ad6cbd0c 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -49,6 +49,7 @@ import pyarrow as pa import bigframes._config as config +import bigframes._importing import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.dataframe @@ -356,8 +357,12 @@ def _read_gbq_colab( with warnings.catch_warnings(): # Don't warning about Polars in SQL cell. # Related to b/437090788. - warnings.simplefilter("ignore", bigframes.exceptions.PreviewWarning) - config.options.bigquery.enable_polars_execution = True + try: + bigframes._importing.import_polars() + warnings.simplefilter("ignore", bigframes.exceptions.PreviewWarning) + config.options.bigquery.enable_polars_execution = True + except ImportError: + pass # don't fail if polars isn't available return global_session.with_default_session( bigframes.session.Session._read_gbq_colab, diff --git a/tests/system/small/test_magics.py b/tests/system/small/test_magics.py new file mode 100644 index 0000000000..91ada5b9e3 --- /dev/null +++ b/tests/system/small/test_magics.py @@ -0,0 +1,100 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd + +IPython = pytest.importorskip("IPython") + + +MAGIC_NAME = "bqsql" + + +@pytest.fixture(scope="module") +def ip(): + """Provides a persistent IPython shell instance for the test session.""" + from IPython.testing.globalipapp import get_ipython + + shell = get_ipython() + shell.extension_manager.load_extension("bigframes") + return shell + + +def test_magic_select_lit_to_var(ip): + bigframes.close_session() + + line = "dst_var" + cell_body = "SELECT 3" + + ip.run_cell_magic(MAGIC_NAME, line, cell_body) + + assert "dst_var" in ip.user_ns + result_df = ip.user_ns["dst_var"] + assert result_df.shape == (1, 1) + assert result_df.loc[0, 0] == 3 + + +def test_magic_select_lit_dry_run(ip): + bigframes.close_session() + + line = "dst_var --dry_run" + cell_body = "SELECT 3" + + ip.run_cell_magic(MAGIC_NAME, line, cell_body) + + assert "dst_var" in ip.user_ns + result_df = ip.user_ns["dst_var"] + assert result_df.totalBytesProcessed == 0 + + +def test_magic_select_lit_display(ip): + from IPython.utils.capture import capture_output + + bigframes.close_session() + + cell_body = "SELECT 3" + + with capture_output() as io: + ip.run_cell_magic(MAGIC_NAME, "", cell_body) + assert len(io.outputs) > 0 + # Check that the output has data, regardless of the format (html, plain, etc) + available_formats = io.outputs[0].data.keys() + assert len(available_formats) > 0 + + +def test_magic_select_interpolate(ip): + bigframes.close_session() + df = bpd.read_pandas( + pd.DataFrame({"col_a": [1, 
2, 3, 4, 5, 6], "col_b": [1, 2, 1, 3, 1, 2]}) + ) + const_val = 1 + + ip.push({"df": df, "const_val": const_val}) + + query = """ + SELECT + SUM(col_a) AS total + FROM + {df} + WHERE col_b={const_val} + """ + + ip.run_cell_magic(MAGIC_NAME, "dst_var", query) + + assert "dst_var" in ip.user_ns + result_df = ip.user_ns["dst_var"] + assert result_df.shape == (1, 1) + assert result_df.loc[0, 0] == 9 From 5bd0029a99e7653843de4ac7d57370c9dffeed4d Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 5 Feb 2026 10:57:29 -0800 Subject: [PATCH 06/18] feat: add bigquery.ai.generate_text function (#2433) * Added the API for ai.generate_text * Fixed SQL syntax bug of ai.generate_embedding * Refactored the code base to keep util functions organized. Fixes b/481092205 --- bigframes/bigquery/_operations/ai.py | 164 +++++++++++++++++++++--- bigframes/bigquery/_operations/ml.py | 84 +++--------- bigframes/bigquery/_operations/utils.py | 70 ++++++++++ bigframes/bigquery/ai.py | 2 + tests/system/large/bigquery/test_ai.py | 96 ++++++++++++++ tests/unit/bigquery/test_ai.py | 130 +++++++++++++++++-- tests/unit/bigquery/test_ml.py | 25 ---- 7 files changed, 452 insertions(+), 119 deletions(-) create mode 100644 bigframes/bigquery/_operations/utils.py create mode 100644 tests/system/large/bigquery/test_ai.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 17af5dd5cf..bc2ab8dd20 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -26,6 +26,7 @@ from bigframes import clients, dataframe, dtypes from bigframes import pandas as bpd from bigframes import series, session +from bigframes.bigquery._operations import utils as bq_utils from bigframes.core import convert from bigframes.core.logging import log_adapter import bigframes.core.sql.literals @@ -391,7 +392,7 @@ def generate_double( @log_adapter.method_logger(custom_base_name="bigquery_ai") def generate_embedding( - model_name: str, + model: 
Union[bigframes.ml.base.BaseEstimator, str, pd.Series], data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], *, output_dimensionality: Optional[int] = None, @@ -415,9 +416,8 @@ def generate_embedding( ... ) # doctest: +SKIP Args: - model_name (str): - The name of a remote model from Vertex AI, such as the - multimodalembedding@001 model. + model (bigframes.ml.base.BaseEstimator or str): + The model to use for text embedding. data (bigframes.pandas.DataFrame or bigframes.pandas.Series): The data to generate embeddings for. If a Series is provided, it is treated as the 'content' column. If a DataFrame is provided, it @@ -454,20 +454,9 @@ def generate_embedding( `_ for details. """ - if isinstance(data, (pd.DataFrame, pd.Series)): - data = bpd.read_pandas(data) - - if isinstance(data, series.Series): - data = data.copy() - data.name = "content" - data_df = data.to_frame() - elif isinstance(data, dataframe.DataFrame): - data_df = data - else: - raise ValueError(f"Unsupported data type: {type(data)}") - - # We need to get the SQL for the input data to pass as a subquery to the TVF - source_sql = data_df.sql + data = _to_dataframe(data, series_rename="content") + model_name, session = bq_utils.get_model_name_and_session(model, data) + table_sql = bq_utils.to_sql(data) struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {} if output_dimensionality is not None: @@ -488,12 +477,128 @@ def generate_embedding( SELECT * FROM AI.GENERATE_EMBEDDING( MODEL `{model_name}`, - ({source_sql}), - {bigframes.core.sql.literals.struct_literal(struct_fields)}) + ({table_sql}), + {bigframes.core.sql.literals.struct_literal(struct_fields)} ) """ - return data_df._session.read_gbq(query) + if session is None: + return bpd.read_gbq_query(query) + else: + return session.read_gbq_query(query) + + +@log_adapter.method_logger(custom_base_name="bigquery_ai") +def generate_text( + model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], + data: 
Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + *, + temperature: Optional[float] = None, + max_output_tokens: Optional[int] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + stop_sequences: Optional[List[str]] = None, + ground_with_google_search: Optional[bool] = None, + request_type: Optional[str] = None, +) -> dataframe.DataFrame: + """ + Generates text using a BigQuery ML model. + + See the `BigQuery ML GENERATE_TEXT function syntax + `_ + for additional reference. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> df = bpd.DataFrame({"prompt": ["write a poem about apples"]}) + >>> bbq.ai.generate_text( + ... "project.dataset.model_name", + ... df + ... ) # doctest: +SKIP + + Args: + model (bigframes.ml.base.BaseEstimator or str): + The model to use for text generation. + data (bigframes.pandas.DataFrame or bigframes.pandas.Series): + The data to generate text for. If a Series is provided, it is + treated as the 'prompt' column. If a DataFrame is provided, it + must contain a 'prompt' column, or you must rename the column that + holds the prompt to 'prompt'. + temperature (float, optional): + A FLOAT64 value that is used for sampling randomness. The value + must be in the range ``[0.0, 1.0]``. A lower temperature works well + for prompts that expect a more deterministic and less open-ended + or creative response, while a higher temperature can lead to more + diverse or creative results. A temperature of ``0`` is + deterministic, meaning that the highest probability response is + always selected. + max_output_tokens (int, optional): + An INT64 value that sets the maximum number of tokens in the + generated text. + top_k (int, optional): + An INT64 value that changes how the model selects tokens for + output. A ``top_k`` of ``1`` means the next selected token is the + most probable among all tokens in the model's vocabulary.
A + ``top_k`` of ``3`` means that the next token is selected from + among the three most probable tokens by using temperature. The + default value is ``40``. + top_p (float, optional): + A FLOAT64 value that changes how the model selects tokens for + output. Tokens are selected from most probable to least probable + until the sum of their probabilities equals the ``top_p`` value. + For example, if tokens A, B, and C have a probability of 0.3, 0.2, + and 0.1 and the ``top_p`` value is ``0.5``, then the model will + select either A or B as the next token by using temperature. The + default value is ``0.95``. + stop_sequences (List[str], optional): + An ARRAY value that contains the stop sequences for the model. + ground_with_google_search (bool, optional): + A BOOL value that determines whether to ground the model with Google Search. + request_type (str, optional): + A STRING value that contains the request type for the model. + + Returns: + bigframes.pandas.DataFrame: + The generated text. 
+ """ + data = _to_dataframe(data, series_rename="prompt") + model_name, session = bq_utils.get_model_name_and_session(model, data) + table_sql = bq_utils.to_sql(data) + + struct_fields: Dict[ + str, + Union[str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]], + ] = {} + if temperature is not None: + struct_fields["TEMPERATURE"] = temperature + if max_output_tokens is not None: + struct_fields["MAX_OUTPUT_TOKENS"] = max_output_tokens + if top_k is not None: + struct_fields["TOP_K"] = top_k + if top_p is not None: + struct_fields["TOP_P"] = top_p + if stop_sequences is not None: + struct_fields["STEP_SEQUENCES"] = stop_sequences + if ground_with_google_search is not None: + struct_fields["GROUND_WITH_GOOGLE_SEARCH"] = ground_with_google_search + if request_type is not None: + struct_fields["REQUEST_TYPE"] = request_type + + query = f""" + SELECT * + FROM AI.GENERATE_TEXT( + MODEL `{model_name}`, + ({table_sql}), + {bigframes.core.sql.literals.struct_literal(struct_fields)} + ) + """ + + if session is None: + return bpd.read_gbq_query(query) + else: + return session.read_gbq_query(query) @log_adapter.method_logger(custom_base_name="bigquery_ai") @@ -811,3 +916,20 @@ def _resolve_connection_id(series: series.Series, connection_id: str | None): series._session._project, series._session._location, ) + + +def _to_dataframe( + data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series], + series_rename: str, +) -> dataframe.DataFrame: + if isinstance(data, (pd.DataFrame, pd.Series)): + data = bpd.read_pandas(data) + + if isinstance(data, series.Series): + data = data.copy() + data.name = series_rename + return data.to_frame() + elif isinstance(data, dataframe.DataFrame): + return data + + raise ValueError(f"Unsupported data type: {type(data)}") diff --git a/bigframes/bigquery/_operations/ml.py b/bigframes/bigquery/_operations/ml.py index cc5a961af7..d5b1786b25 100644 --- a/bigframes/bigquery/_operations/ml.py +++ 
b/bigframes/bigquery/_operations/ml.py @@ -14,12 +14,13 @@ from __future__ import annotations -from typing import cast, List, Mapping, Optional, Union +from typing import List, Mapping, Optional, Union import bigframes_vendored.constants import google.cloud.bigquery import pandas as pd +from bigframes.bigquery._operations import utils import bigframes.core.logging.log_adapter as log_adapter import bigframes.core.sql.ml import bigframes.dataframe as dataframe @@ -27,53 +28,6 @@ import bigframes.session -# Helper to convert DataFrame to SQL string -def _to_sql(df_or_sql: Union[pd.DataFrame, dataframe.DataFrame, str]) -> str: - import bigframes.pandas as bpd - - if isinstance(df_or_sql, str): - return df_or_sql - - if isinstance(df_or_sql, pd.DataFrame): - bf_df = bpd.read_pandas(df_or_sql) - else: - bf_df = cast(dataframe.DataFrame, df_or_sql) - - # Cache dataframes to make sure base table is not a snapshot. - # Cached dataframe creates a full copy, never uses snapshot. - # This is a workaround for internal issue b/310266666. 
- bf_df.cache() - sql, _, _ = bf_df._to_sql_query(include_index=False) - return sql - - -def _get_model_name_and_session( - model: Union[bigframes.ml.base.BaseEstimator, str, pd.Series], - # Other dataframe arguments to extract session from - *dataframes: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]], -) -> tuple[str, Optional[bigframes.session.Session]]: - if isinstance(model, pd.Series): - try: - model_ref = model["modelReference"] - model_name = f"{model_ref['projectId']}.{model_ref['datasetId']}.{model_ref['modelId']}" # type: ignore - except KeyError: - raise ValueError("modelReference must be present in the pandas Series.") - elif isinstance(model, str): - model_name = model - else: - if model._bqml_model is None: - raise ValueError("Model must be fitted to be used in ML operations.") - return model._bqml_model.model_name, model._bqml_model.session - - session = None - for df in dataframes: - if isinstance(df, dataframe.DataFrame): - session = df._session - break - - return model_name, session - - def _get_model_metadata( *, bqclient: google.cloud.bigquery.Client, @@ -143,8 +97,12 @@ def create_model( """ import bigframes.pandas as bpd - training_data_sql = _to_sql(training_data) if training_data is not None else None - custom_holiday_sql = _to_sql(custom_holiday) if custom_holiday is not None else None + training_data_sql = ( + utils.to_sql(training_data) if training_data is not None else None + ) + custom_holiday_sql = ( + utils.to_sql(custom_holiday) if custom_holiday is not None else None + ) # Determine session from DataFrames if not provided if session is None: @@ -227,8 +185,8 @@ def evaluate( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) if input_ is not None else None + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) if input_ is not None else None sql = bigframes.core.sql.ml.evaluate( 
model_name=model_name, @@ -281,8 +239,8 @@ def predict( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.predict( model_name=model_name, @@ -340,8 +298,8 @@ def explain_predict( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.explain_predict( model_name=model_name, @@ -383,7 +341,7 @@ def global_explain( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model) + model_name, session = utils.get_model_name_and_session(model) sql = bigframes.core.sql.ml.global_explain( model_name=model_name, class_level_explain=class_level_explain, @@ -419,8 +377,8 @@ def transform( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.transform( model_name=model_name, @@ -500,8 +458,8 @@ def generate_text( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = bigframes.core.sql.ml.generate_text( model_name=model_name, @@ -565,8 +523,8 @@ def generate_embedding( """ import bigframes.pandas as bpd - model_name, session = _get_model_name_and_session(model, input_) - table_sql = _to_sql(input_) + model_name, session = utils.get_model_name_and_session(model, input_) + table_sql = utils.to_sql(input_) sql = 
bigframes.core.sql.ml.generate_embedding( model_name=model_name, diff --git a/bigframes/bigquery/_operations/utils.py b/bigframes/bigquery/_operations/utils.py new file mode 100644 index 0000000000..f94616786e --- /dev/null +++ b/bigframes/bigquery/_operations/utils.py @@ -0,0 +1,70 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import cast, Optional, Union + +import pandas as pd + +import bigframes +from bigframes import dataframe +from bigframes.ml import base as ml_base + + +def get_model_name_and_session( + model: Union[ml_base.BaseEstimator, str, pd.Series], + # Other dataframe arguments to extract session from + *dataframes: Optional[Union[pd.DataFrame, dataframe.DataFrame, str]], +) -> tuple[str, Optional[bigframes.session.Session]]: + if isinstance(model, pd.Series): + try: + model_ref = model["modelReference"] + model_name = f"{model_ref['projectId']}.{model_ref['datasetId']}.{model_ref['modelId']}" # type: ignore + except KeyError: + raise ValueError("modelReference must be present in the pandas Series.") + elif isinstance(model, str): + model_name = model + else: + if model._bqml_model is None: + raise ValueError("Model must be fitted to be used in ML operations.") + return model._bqml_model.model_name, model._bqml_model.session + + session = None + for df in dataframes: + if isinstance(df, dataframe.DataFrame): + session = df._session + break + + return model_name, session + + +def to_sql(df_or_sql: 
Union[pd.DataFrame, dataframe.DataFrame, str]) -> str: + """ + Helper to convert DataFrame to SQL string + """ + import bigframes.pandas as bpd + + if isinstance(df_or_sql, str): + return df_or_sql + + if isinstance(df_or_sql, pd.DataFrame): + bf_df = bpd.read_pandas(df_or_sql) + else: + bf_df = cast(dataframe.DataFrame, df_or_sql) + + # Cache dataframes to make sure base table is not a snapshot. + # Cached dataframe creates a full copy, never uses snapshot. + # This is a workaround for internal issue b/310266666. + bf_df.cache() + sql, _, _ = bf_df._to_sql_query(include_index=False) + return sql diff --git a/bigframes/bigquery/ai.py b/bigframes/bigquery/ai.py index b0d9b62f9b..053ee7352a 100644 --- a/bigframes/bigquery/ai.py +++ b/bigframes/bigquery/ai.py @@ -24,6 +24,7 @@ generate_double, generate_embedding, generate_int, + generate_text, if_, score, ) @@ -36,6 +37,7 @@ "generate_double", "generate_embedding", "generate_int", + "generate_text", "if_", "score", ] diff --git a/tests/system/large/bigquery/test_ai.py b/tests/system/large/bigquery/test_ai.py new file mode 100644 index 0000000000..e318a8a720 --- /dev/null +++ b/tests/system/large/bigquery/test_ai.py @@ -0,0 +1,96 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from bigframes.bigquery import ai, ml +import bigframes.pandas as bpd + + +@pytest.fixture(scope="session") +def embedding_model(bq_connection, dataset_id): + model_name = f"{dataset_id}.embedding_model" + return ml.create_model( + model_name=model_name, + options={"endpoint": "gemini-embedding-001"}, + connection_name=bq_connection, + ) + + +@pytest.fixture(scope="session") +def text_model(bq_connection, dataset_id): + model_name = f"{dataset_id}.text_model" + return ml.create_model( + model_name=model_name, + options={"endpoint": "gemini-2.5-flash"}, + connection_name=bq_connection, + ) + + +def test_generate_embedding(embedding_model): + df = bpd.DataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + ] + } + ) + + result = ai.generate_embedding(embedding_model, df) + + assert len(result) == 2 + assert "embedding" in result.columns + assert "statistics" in result.columns + assert "status" in result.columns + + +def test_generate_embedding_with_options(embedding_model): + df = bpd.DataFrame( + { + "content": [ + "What is BigQuery?", + "What is BQML?", + ] + } + ) + + result = ai.generate_embedding( + embedding_model, df, task_type="RETRIEVAL_DOCUMENT", output_dimensionality=256 + ) + + assert len(result) == 2 + embedding = result["embedding"].to_pandas() + assert len(embedding[0]) == 256 + + +def test_generate_text(text_model): + df = bpd.DataFrame({"prompt": ["Dog", "Cat"]}) + + result = ai.generate_text(text_model, df) + + assert len(result) == 2 + assert "result" in result.columns + assert "statistics" in result.columns + assert "full_response" in result.columns + assert "status" in result.columns + + +def test_generate_text_with_options(text_model): + df = bpd.DataFrame({"prompt": ["Dog", "Cat"]}) + + result = ai.generate_text(text_model, df, max_output_tokens=1) + + # It basically asserts that the results are still returned. 
+ assert len(result) == 2 diff --git a/tests/unit/bigquery/test_ai.py b/tests/unit/bigquery/test_ai.py index 0f9df6cc26..0be32b9e8a 100644 --- a/tests/unit/bigquery/test_ai.py +++ b/tests/unit/bigquery/test_ai.py @@ -33,17 +33,41 @@ def mock_dataframe(mock_session): df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) df._session = mock_session df.sql = "SELECT * FROM my_table" + df._to_sql_query.return_value = ("SELECT * FROM my_table", None, None) return df @pytest.fixture -def mock_series(mock_session): +def mock_embedding_series(mock_session): series = mock.create_autospec(spec=bigframes.series.Series) series._session = mock_session # Mock to_frame to return a mock dataframe df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) df._session = mock_session df.sql = "SELECT my_col AS content FROM my_table" + df._to_sql_query.return_value = ( + "SELECT my_col AS content FROM my_table", + None, + None, + ) + series.copy.return_value = series + series.to_frame.return_value = df + return series + + +@pytest.fixture +def mock_text_series(mock_session): + series = mock.create_autospec(spec=bigframes.series.Series) + series._session = mock_session + # Mock to_frame to return a mock dataframe + df = mock.create_autospec(spec=bigframes.dataframe.DataFrame) + df._session = mock_session + df.sql = "SELECT my_col AS prompt FROM my_table" + df._to_sql_query.return_value = ( + "SELECT my_col AS prompt FROM my_table", + None, + None, + ) series.copy.return_value = series series.to_frame.return_value = df return series @@ -58,8 +82,8 @@ def test_generate_embedding_with_dataframe(mock_dataframe, mock_session): output_dimensionality=256, ) - mock_session.read_gbq.assert_called_once() - query = mock_session.read_gbq.call_args[0][0] + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] # Normalize whitespace for comparison query = " ".join(query.split()) @@ -75,15 +99,19 @@ def 
test_generate_embedding_with_dataframe(mock_dataframe, mock_session): assert expected_part_4 in query -def test_generate_embedding_with_series(mock_series, mock_session): +def test_generate_embedding_with_series(mock_embedding_series, mock_session): model_name = "project.dataset.model" bbq.ai.generate_embedding( - model_name, mock_series, start_second=0.0, end_second=10.0, interval_seconds=5.0 + model_name, + mock_embedding_series, + start_second=0.0, + end_second=10.0, + interval_seconds=5.0, ) - mock_session.read_gbq.assert_called_once() - query = mock_session.read_gbq.call_args[0][0] + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] query = " ".join(query.split()) assert f"MODEL `{model_name}`" in query @@ -102,8 +130,8 @@ def test_generate_embedding_defaults(mock_dataframe, mock_session): mock_dataframe, ) - mock_session.read_gbq.assert_called_once() - query = mock_session.read_gbq.call_args[0][0] + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] query = " ".join(query.split()) assert f"MODEL `{model_name}`" in query @@ -131,4 +159,86 @@ def test_generate_embedding_with_pandas_dataframe( # Check that read_pandas was called with something (the pandas df) assert read_pandas_mock.call_args[0][0] is pandas_df - mock_session.read_gbq.assert_called_once() + mock_session.read_gbq_query.assert_called_once() + + +def test_generate_text_with_dataframe(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_text( + model_name, + mock_dataframe, + max_output_tokens=256, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + + # Normalize whitespace for comparison + query = " ".join(query.split()) + + expected_part_1 = "SELECT * FROM AI.GENERATE_TEXT(" + expected_part_2 = f"MODEL `{model_name}`," + expected_part_3 = "(SELECT * FROM my_table)," + expected_part_4 = "STRUCT(256 
AS MAX_OUTPUT_TOKENS)" + + assert expected_part_1 in query + assert expected_part_2 in query + assert expected_part_3 in query + assert expected_part_4 in query + + +def test_generate_text_with_series(mock_text_series, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_text( + model_name, + mock_text_series, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "(SELECT my_col AS prompt FROM my_table)" in query + + +def test_generate_text_defaults(mock_dataframe, mock_session): + model_name = "project.dataset.model" + + bbq.ai.generate_text( + model_name, + mock_dataframe, + ) + + mock_session.read_gbq_query.assert_called_once() + query = mock_session.read_gbq_query.call_args[0][0] + query = " ".join(query.split()) + + assert f"MODEL `{model_name}`" in query + assert "STRUCT()" in query + + +@mock.patch("bigframes.pandas.read_pandas") +def test_generate_text_with_pandas_dataframe( + read_pandas_mock, mock_dataframe, mock_session +): + # This tests that pandas input path works and calls read_pandas + model_name = "project.dataset.model" + + # Mock return value of read_pandas to be a BigFrames DataFrame + read_pandas_mock.return_value = mock_dataframe + + pandas_df = pd.DataFrame({"content": ["test"]}) + + bbq.ai.generate_text( + model_name, + pandas_df, + ) + + read_pandas_mock.assert_called_once() + # Check that read_pandas was called with something (the pandas df) + assert read_pandas_mock.call_args[0][0] is pandas_df + + mock_session.read_gbq_query.assert_called_once() diff --git a/tests/unit/bigquery/test_ml.py b/tests/unit/bigquery/test_ml.py index fd77469152..e5c957767b 100644 --- a/tests/unit/bigquery/test_ml.py +++ b/tests/unit/bigquery/test_ml.py @@ -40,31 +40,6 @@ def mock_session(): MODEL_NAME = "test-project.test-dataset.test-model" -def 
test_get_model_name_and_session_with_pandas_series_model_input(): - model_name, _ = ml_ops._get_model_name_and_session(MODEL_SERIES) - assert model_name == MODEL_NAME - - -def test_get_model_name_and_session_with_pandas_series_model_input_missing_model_reference(): - model_series = pd.Series({"some_other_key": "value"}) - with pytest.raises( - ValueError, match="modelReference must be present in the pandas Series" - ): - ml_ops._get_model_name_and_session(model_series) - - -@mock.patch("bigframes.pandas.read_pandas") -def test_to_sql_with_pandas_dataframe(read_pandas_mock): - df = pd.DataFrame({"col1": [1, 2, 3]}) - read_pandas_mock.return_value._to_sql_query.return_value = ( - "SELECT * FROM `pandas_df`", - [], - [], - ) - ml_ops._to_sql(df) - read_pandas_mock.assert_called_once() - - @mock.patch("bigframes.bigquery._operations.ml._get_model_metadata") @mock.patch("bigframes.pandas.read_pandas") def test_create_model_with_pandas_dataframe( From 51309dfdd920520b462141b803620a29aeca0a48 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 5 Feb 2026 11:17:56 -0800 Subject: [PATCH 07/18] deps: Drop support for python 3.9 (#2418) --- .github/workflows/unittest.yml | 2 +- .kokoro/samples/python3.7/common.cfg | 40 ---------------- .kokoro/samples/python3.7/continuous.cfg | 6 --- .kokoro/samples/python3.7/periodic-head.cfg | 11 ----- .kokoro/samples/python3.7/periodic.cfg | 6 --- .kokoro/samples/python3.7/presubmit.cfg | 6 --- .kokoro/samples/python3.8/common.cfg | 40 ---------------- .kokoro/samples/python3.8/continuous.cfg | 6 --- .kokoro/samples/python3.8/periodic-head.cfg | 11 ----- .kokoro/samples/python3.8/periodic.cfg | 6 --- .kokoro/samples/python3.8/presubmit.cfg | 6 --- .kokoro/samples/python3.9/common.cfg | 40 ---------------- .kokoro/samples/python3.9/continuous.cfg | 6 --- .kokoro/samples/python3.9/periodic-head.cfg | 11 ----- .kokoro/samples/python3.9/periodic.cfg | 6 --- .kokoro/samples/python3.9/presubmit.cfg | 6 --- .kokoro/test-samples-impl.sh | 4 
+- CONTRIBUTING.rst | 8 ++-- bigframes/core/blocks.py | 27 ++++++----- bigframes/operations/aggregations.py | 4 +- noxfile.py | 6 +-- scripts/test_publish_api_coverage.py | 8 ++-- setup.py | 5 +- testing/constraints-3.10.txt | 51 +++++++++++++++------ tests/system/small/test_dataframe.py | 11 +---- tests/system/small/test_groupby.py | 2 +- tests/unit/test_dataframe_polars.py | 7 +++ 27 files changed, 78 insertions(+), 264 deletions(-) delete mode 100644 .kokoro/samples/python3.7/common.cfg delete mode 100644 .kokoro/samples/python3.7/continuous.cfg delete mode 100644 .kokoro/samples/python3.7/periodic-head.cfg delete mode 100644 .kokoro/samples/python3.7/periodic.cfg delete mode 100644 .kokoro/samples/python3.7/presubmit.cfg delete mode 100644 .kokoro/samples/python3.8/common.cfg delete mode 100644 .kokoro/samples/python3.8/continuous.cfg delete mode 100644 .kokoro/samples/python3.8/periodic-head.cfg delete mode 100644 .kokoro/samples/python3.8/periodic.cfg delete mode 100644 .kokoro/samples/python3.8/presubmit.cfg delete mode 100644 .kokoro/samples/python3.9/common.cfg delete mode 100644 .kokoro/samples/python3.9/continuous.cfg delete mode 100644 .kokoro/samples/python3.9/periodic-head.cfg delete mode 100644 .kokoro/samples/python3.9/periodic.cfg delete mode 100644 .kokoro/samples/python3.9/presubmit.cfg diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 518cec6312..2455f7abc4 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python: ['3.9', '3.10', '3.11', '3.12', '3.13'] + python: ['3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.7/common.cfg deleted file mode 100644 index 09d7af02ba..0000000000 --- a/.kokoro/samples/python3.7/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# 
Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.7" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py37" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. -build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.7/continuous.cfg b/.kokoro/samples/python3.7/continuous.cfg deleted file mode 100644 index a1c8d9759c..0000000000 --- a/.kokoro/samples/python3.7/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.7/periodic-head.cfg b/.kokoro/samples/python3.7/periodic-head.cfg deleted file mode 100644 index 123a35fbd3..0000000000 --- a/.kokoro/samples/python3.7/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.7/periodic.cfg b/.kokoro/samples/python3.7/periodic.cfg deleted file mode 100644 index 
71cd1e597e..0000000000 --- a/.kokoro/samples/python3.7/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.7/presubmit.cfg b/.kokoro/samples/python3.7/presubmit.cfg deleted file mode 100644 index a1c8d9759c..0000000000 --- a/.kokoro/samples/python3.7/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.8/common.cfg deleted file mode 100644 index 976d9ce8c5..0000000000 --- a/.kokoro/samples/python3.8/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.8" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py38" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. 
-build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.8/continuous.cfg b/.kokoro/samples/python3.8/continuous.cfg deleted file mode 100644 index a1c8d9759c..0000000000 --- a/.kokoro/samples/python3.8/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.8/periodic-head.cfg b/.kokoro/samples/python3.8/periodic-head.cfg deleted file mode 100644 index 123a35fbd3..0000000000 --- a/.kokoro/samples/python3.8/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.8/periodic.cfg b/.kokoro/samples/python3.8/periodic.cfg deleted file mode 100644 index 71cd1e597e..0000000000 --- a/.kokoro/samples/python3.8/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.8/presubmit.cfg b/.kokoro/samples/python3.8/presubmit.cfg deleted file mode 100644 index a1c8d9759c..0000000000 --- a/.kokoro/samples/python3.8/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.9/common.cfg b/.kokoro/samples/python3.9/common.cfg deleted file mode 100644 index 603cfffa28..0000000000 --- a/.kokoro/samples/python3.9/common.cfg +++ /dev/null @@ -1,40 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here 
-action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Specify which tests to run -env_vars: { - key: "RUN_TESTS_SESSION" - value: "py-3.9" -} - -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py39" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" -} - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" -} - -# Download secrets for samples -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. -build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.9/continuous.cfg b/.kokoro/samples/python3.9/continuous.cfg deleted file mode 100644 index a1c8d9759c..0000000000 --- a/.kokoro/samples/python3.9/continuous.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/samples/python3.9/periodic-head.cfg b/.kokoro/samples/python3.9/periodic-head.cfg deleted file mode 100644 index 123a35fbd3..0000000000 --- a/.kokoro/samples/python3.9/periodic-head.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" -} diff --git a/.kokoro/samples/python3.9/periodic.cfg b/.kokoro/samples/python3.9/periodic.cfg deleted file mode 100644 index 71cd1e597e..0000000000 --- 
a/.kokoro/samples/python3.9/periodic.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "False" -} diff --git a/.kokoro/samples/python3.9/presubmit.cfg b/.kokoro/samples/python3.9/presubmit.cfg deleted file mode 100644 index a1c8d9759c..0000000000 --- a/.kokoro/samples/python3.9/presubmit.cfg +++ /dev/null @@ -1,6 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "INSTALL_LIBRARY_FROM_SOURCE" - value: "True" -} \ No newline at end of file diff --git a/.kokoro/test-samples-impl.sh b/.kokoro/test-samples-impl.sh index 53e365bc4e..97cdc9c13f 100755 --- a/.kokoro/test-samples-impl.sh +++ b/.kokoro/test-samples-impl.sh @@ -34,7 +34,7 @@ env | grep KOKORO # Install nox # `virtualenv==20.26.6` is added for Python 3.7 compatibility -python3.9 -m pip install --upgrade --quiet nox virtualenv==20.26.6 +python3.10 -m pip install --upgrade --quiet nox virtualenv==20.26.6 # Use secrets acessor service account to get secrets if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then @@ -77,7 +77,7 @@ for file in samples/**/requirements.txt; do echo "------------------------------------------------------------" # Use nox to execute the tests for the project. - python3.9 -m nox -s "$RUN_TESTS_SESSION" + python3.10 -m nox -s "$RUN_TESTS_SESSION" EXIT=$? # If this is a periodic build, send the test log to the FlakyBot. diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 5374e7e377..7ac410bbf7 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: - 3.9, 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows. + 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows. 
- The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -148,7 +148,7 @@ Running System Tests .. note:: - System tests are only configured to run under Python 3.9, 3.11, 3.12 and 3.13. + System tests are only configured to run under Python 3.10, 3.11, 3.12 and 3.13. For expediency, we do not run them in older versions of Python 3. This alone will not run the tests. You'll need to change some local @@ -258,13 +258,11 @@ Supported Python Versions We support: -- `Python 3.9`_ - `Python 3.10`_ - `Python 3.11`_ - `Python 3.12`_ - `Python 3.13`_ -.. _Python 3.9: https://docs.python.org/3.9/ .. _Python 3.10: https://docs.python.org/3.10/ .. _Python 3.11: https://docs.python.org/3.11/ .. _Python 3.12: https://docs.python.org/3.12/ @@ -276,7 +274,7 @@ Supported versions can be found in our ``noxfile.py`` `config`_. .. _config: https://github.com/googleapis/python-bigquery-dataframes/blob/main/noxfile.py -We also explicitly decided to support Python 3 beginning with version 3.9. +We also explicitly decided to support Python 3 beginning with version 3.10. 
Reasons for this include: - Encouraging use of newest versions of Python 3 diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5bac1a06f1..ff7f2b9899 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -140,6 +140,7 @@ def __init__( column_labels: typing.Union[pd.Index, typing.Iterable[Label]], index_labels: typing.Union[pd.Index, typing.Iterable[Label], None] = None, *, + value_columns: Optional[Iterable[str]] = None, transpose_cache: Optional[Block] = None, ): """Construct a block object, will create default index if no index columns specified.""" @@ -158,7 +159,13 @@ def __init__( if index_labels else tuple([None for _ in index_columns]) ) - self._expr = self._normalize_expression(expr, self._index_columns) + if value_columns is None: + value_columns = [ + col_id for col_id in expr.column_ids if col_id not in index_columns + ] + self._expr = self._normalize_expression( + expr, self._index_columns, value_columns + ) # Use pandas index to more easily replicate column indexing, especially for hierarchical column index self._column_labels = ( column_labels.copy() @@ -1114,13 +1121,15 @@ def project_exprs( labels: Union[Sequence[Label], pd.Index], drop=False, ) -> Block: - new_array, _ = self.expr.compute_values(exprs) + new_array, new_cols = self.expr.compute_values(exprs) if drop: new_array = new_array.drop_columns(self.value_columns) + new_val_cols = new_cols if drop else (*self.value_columns, *new_cols) return Block( new_array, index_columns=self.index_columns, + value_columns=new_val_cols, column_labels=labels if drop else self.column_labels.append(pd.Index(labels)), @@ -1542,17 +1551,13 @@ def _get_labels_for_columns(self, column_ids: typing.Sequence[str]) -> pd.Index: def _normalize_expression( self, expr: core.ArrayValue, - index_columns: typing.Sequence[str], - assert_value_size: typing.Optional[int] = None, + index_columns: Iterable[str], + value_columns: Iterable[str], ): """Normalizes expression by moving index 
columns to left.""" - value_columns = [ - col_id for col_id in expr.column_ids if col_id not in index_columns - ] - if (assert_value_size is not None) and ( - len(value_columns) != assert_value_size - ): - raise ValueError("Unexpected number of value columns.") + normalized_ids = (*index_columns, *value_columns) + if tuple(expr.column_ids) == normalized_ids: + return expr return expr.select_columns([*index_columns, *value_columns]) def grouped_head( diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 5fe8330263..eee710b288 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -205,7 +205,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.TIMEDELTA_DTYPE if dtypes.is_numeric(input_types[0]): - if pd.api.types.is_bool_dtype(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): # type: ignore return dtypes.INT_DTYPE return input_types[0] @@ -224,7 +224,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT # These will change if median is changed to exact implementation. if not dtypes.is_orderable(input_types[0]): raise TypeError(f"Type {input_types[0]} is not orderable") - if pd.api.types.is_bool_dtype(input_types[0]): + if pd.api.types.is_bool_dtype(input_types[0]): # type: ignore return dtypes.INT_DTYPE else: return input_types[0] diff --git a/noxfile.py b/noxfile.py index 00ada18a46..a8a1a84987 100644 --- a/noxfile.py +++ b/noxfile.py @@ -123,7 +123,7 @@ # TODO(tswast): Consider removing this when unit_noextras and cover is run # from GitHub actions. "unit_noextras", - "system-3.9", # No extras. + "system-3.10", # No extras. f"system-{LATEST_FULLY_SUPPORTED_PYTHON}", # All extras. 
"cover", # TODO(b/401609005): remove @@ -661,9 +661,7 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( # version, the first version we test with in the unit tests sessions has a # constraints file containing all dependencies and extras. with open( - CURRENT_DIRECTORY - / "testing" - / f"constraints-{UNIT_TEST_PYTHON_VERSIONS[0]}.txt", + CURRENT_DIRECTORY / "testing" / f"constraints-{DEFAULT_PYTHON_VERSION}.txt", encoding="utf-8", ) as constraints_file: constraints_text = constraints_file.read() diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 6e366b6854..6abecd0ac4 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -31,10 +31,8 @@ def api_coverage_df(): reason="Issues with installing sklearn for this test in python 3.13", ) def test_api_coverage_produces_expected_schema(api_coverage_df): - if sys.version.split(".")[:2] == ["3", "9"]: - pytest.skip( - "Python 3.9 uses older pandas without good microsecond timestamp support." 
- ) + # Older pandas has different timestamp default precision + pytest.importorskip("pandas", minversion="2.0.0") pandas.testing.assert_series_equal( api_coverage_df.dtypes, @@ -56,6 +54,8 @@ def test_api_coverage_produces_expected_schema(api_coverage_df): "release_version": "string", }, ), + # String dtype behavior not consistent across pandas versions + check_dtype=False, ) diff --git a/setup.py b/setup.py index 090b103536..38c2a6032e 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ # 'Development Status :: 5 - Production/Stable' release_status = "Development Status :: 5 - Production/Stable" dependencies = [ - # please keep these in sync with the minimum versions in testing/constraints-3.9.txt + # please keep these in sync with the minimum versions in testing/constraints-3.10.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", "gcsfs >=2023.3.0, !=2025.5.0", @@ -133,7 +133,6 @@ "License :: OSI Approved :: Apache Software License", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -149,7 +148,7 @@ "bigframes_vendored": "third_party/bigframes_vendored", }, packages=packages, - python_requires=">=3.9", + python_requires=">=3.10", include_package_data=True, zip_safe=False, ) diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 4810d6461b..b74ff81063 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -1,18 +1,39 @@ # When we drop Python 3.9, # please keep these in sync with the minimum versions in setup.py -google-auth==2.27.0 -ipykernel==5.5.6 -ipython==7.34.0 -notebook==6.5.5 -pandas==2.1.4 -pandas-stubs==2.1.4.231227 -portpicker==1.5.2 -requests==2.32.3 -tornado==6.3.3 -absl-py==1.4.0 -debugpy==1.6.6 +cloudpickle==2.0.0 +fsspec==2023.3.0 +gcsfs==2023.3.0 +geopandas==0.12.2 +google-auth==2.15.0 
+google-cloud-bigtable==2.24.0 +google-cloud-pubsub==2.21.4 +google-cloud-bigquery==3.36.0 +google-cloud-functions==1.12.0 +google-cloud-bigquery-connection==1.12.0 +google-cloud-iam==2.12.1 +google-cloud-resource-manager==1.10.3 +google-cloud-storage==2.0.0 +grpc-google-iam-v1==0.14.2 +numpy==1.24.0 +pandas==1.5.3 +pandas-gbq==0.26.1 +pyarrow==15.0.2 +pydata-google-auth==1.8.2 +requests==2.27.1 +scikit-learn==1.2.2 +shapely==1.8.5 +tabulate==0.9 +ipywidgets==7.7.1 +humanize==4.6.0 matplotlib==3.7.1 -psutil==5.9.5 -seaborn==0.13.1 -traitlets==5.7.1 -polars==1.21.0 +db-dtypes==1.4.2 +# For vendored ibis-framework. +atpublic==2.3 +python-dateutil==2.8.2 +pytz==2022.7 +toolz==0.11 +typing-extensions==4.5.0 +rich==12.4.4 +# For anywidget mode +anywidget>=0.9.18 +traitlets==5.0.0 diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0f7b782b66..fa82cce605 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5754,16 +5754,9 @@ def test_df_dot_operator_series( ) -# TODO(tswast): We may be able to re-enable this test after we break large -# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427 -@pytest.mark.skipif( - sys.version_info >= (3, 12), - # See: https://github.com/python/cpython/issues/112282 - reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", -) def test_recursion_limit(scalars_df_index): scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - for i in range(400): + for i in range(250): scalars_df_index = scalars_df_index + 4 scalars_df_index.to_pandas() @@ -5964,7 +5957,7 @@ def test_resample_with_column( scalars_df_index, scalars_pandas_df_index, on, rule, origin ): # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") + pytest.importorskip("pandas", minversion="2.2.0") bf_result = ( scalars_df_index.resample(rule=rule, on=on, 
origin=origin)[ ["int64_col", "int64_too"] diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 579e7cd414..1d0e05f5cc 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -123,7 +123,7 @@ def test_dataframe_groupby_rank( scalars_df_index, scalars_pandas_df_index, na_option, method, ascending, pct ): # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") + pytest.importorskip("pandas", minversion="2.2.0") col_names = ["int64_too", "float64_col", "int64_col", "string_col"] bf_result = ( scalars_df_index[col_names] diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index 1c73d9dc6b..a7e40469a2 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -4450,3 +4450,10 @@ def test_dataframe_explode_reserve_order(session, ignore_index, ordered): def test_dataframe_explode_xfail(col_names): df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) df.explode(col_names) + + +def test_recursion_limit_unit(scalars_df_index): + scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + for i in range(250): + scalars_df_index = scalars_df_index + 4 + scalars_df_index.to_pandas() From ad0f33c65ee01409826c381ae0f70aad65bb6a27 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 5 Feb 2026 15:00:50 -0800 Subject: [PATCH 08/18] docs: fix cast method shown on public docs (#2436) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/base.py | 9 +++++---- bigframes/ml/compose.py | 6 +++--- bigframes/ml/core.py | 5 +++-- bigframes/ml/imported.py | 15 ++++++++------- bigframes/ml/llm.py | 20 ++++++++++++-------- bigframes/ml/model_selection.py | 7 ++++--- bigframes/ml/preprocessing.py | 4 ++-- 7 files changed, 37 insertions(+), 29 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 9b38702cce..3f6ccecaa2 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -24,7 +24,8 @@ """ import abc -from typing import cast, Optional, TypeVar, Union +import typing +from typing import Optional, TypeVar, Union import warnings import bigframes_vendored.sklearn.base @@ -133,7 +134,7 @@ def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: self._bqml_model = self._create_bqml_model() # type: ignore except AttributeError: raise RuntimeError("A model must be trained before register.") - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) self._bqml_model.register(vertex_ai_model_id) return self @@ -286,7 +287,7 @@ def _predict_and_retry( bpd.concat([df_result, df_succ]) if df_result is not None else df_succ ) - df_result = cast( + df_result = typing.cast( bpd.DataFrame, bpd.concat([df_result, df_fail]) if df_result is not None else df_fail, ) @@ -306,7 +307,7 @@ def _extract_output_names(self): output_names = [] for transform_col in self._bqml_model._model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) + transform_col_dict = typing.cast(dict, transform_col) # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue diff --git a/bigframes/ml/compose.py 
b/bigframes/ml/compose.py index d638e026e4..f8244fb0d8 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -21,7 +21,7 @@ import re import types import typing -from typing import cast, Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union from bigframes_vendored import constants import bigframes_vendored.sklearn.compose._column_transformer @@ -218,7 +218,7 @@ def camel_to_snake(name): output_names = [] for transform_col in bq_model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) + transform_col_dict = typing.cast(dict, transform_col) # pass the columns that are not transformed if "transformSql" not in transform_col_dict: continue @@ -282,7 +282,7 @@ def _merge( return self # SQLScalarColumnTransformer only work inside ColumnTransformer feature_columns_sorted = sorted( [ - cast(str, feature_column.name) + typing.cast(str, feature_column.name) for feature_column in bq_model.feature_columns ] ) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4dbc1a5fa3..620843fb6e 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -18,7 +18,8 @@ import dataclasses import datetime -from typing import Callable, cast, Iterable, Mapping, Optional, Union +import typing +from typing import Callable, Iterable, Mapping, Optional, Union import uuid from google.cloud import bigquery @@ -376,7 +377,7 @@ def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: if vertex_ai_model_id is None: # vertex id needs to start with letters. https://cloud.google.com/vertex-ai/docs/general/resource-naming - vertex_ai_model_id = "bigframes_" + cast(str, self._model.model_id) + vertex_ai_model_id = "bigframes_" + typing.cast(str, self._model.model_id) # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models. 
# The possibility of conflicts should be low. diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 295649ed7f..56b5d6735c 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -16,7 +16,8 @@ from __future__ import annotations -from typing import cast, Mapping, Optional +import typing +from typing import Mapping, Optional from google.cloud import bigquery @@ -78,7 +79,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) (X,) = utils.batch_convert_to_dataframe(X) @@ -99,7 +100,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TensorFlowModel: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) @@ -157,7 +158,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) @@ -178,7 +179,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) new_model = 
self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) @@ -276,7 +277,7 @@ def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) @@ -297,7 +298,7 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel: if self.model_path is None: raise ValueError("Model GCS path must be provided.") self._bqml_model = self._create_bqml_model() - self._bqml_model = cast(core.BqmlModel, self._bqml_model) + self._bqml_model = typing.cast(core.BqmlModel, self._bqml_model) new_model = self._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index f4e60f3f9d..585599c9b6 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,7 +16,8 @@ from __future__ import annotations -from typing import cast, Iterable, Literal, Mapping, Optional, Union +import typing +from typing import Iterable, Literal, Mapping, Optional, Union import warnings import bigframes_vendored.constants as constants @@ -252,7 +253,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "content"}) options: dict = {} @@ -391,7 +392,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "content"}) # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input @@ -604,7 +605,10 @@ def fit( 
options["prompt_col"] = X.columns.tolist()[0] self._bqml_model = self._bqml_model_factory.create_llm_remote_model( - X, y, options=options, connection_name=cast(str, self.connection_name) + X, + y, + options=options, + connection_name=typing.cast(str, self.connection_name), ) return self @@ -735,7 +739,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "prompt"}) options: dict = { @@ -820,8 +824,8 @@ def score( ) # BQML identified the column by name - X_col_label = cast(blocks.Label, X.columns[0]) - y_col_label = cast(blocks.Label, y.columns[0]) + X_col_label = typing.cast(blocks.Label, X.columns[0]) + y_col_label = typing.cast(blocks.Label, y.columns[0]) X = X.rename(columns={X_col_label: "input_text"}) y = y.rename(columns={y_col_label: "output_text"}) @@ -1033,7 +1037,7 @@ def predict( if len(X.columns) == 1: # BQML identified the column by name - col_label = cast(blocks.Label, X.columns[0]) + col_label = typing.cast(blocks.Label, X.columns[0]) X = X.rename(columns={col_label: "prompt"}) options = { diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 5adfb03b7f..3d23fbf568 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -20,7 +20,8 @@ import inspect from itertools import chain import time -from typing import cast, Generator, List, Optional, Union +import typing +from typing import Generator, List, Optional, Union import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation @@ -99,10 +100,10 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra train_dfs.append(train) test_dfs.append(test) - train_df = cast( + train_df = typing.cast( bpd.DataFrame, 
bpd.concat(train_dfs).drop(columns="bigframes_stratify_col") ) - test_df = cast( + test_df = typing.cast( bpd.DataFrame, bpd.concat(test_dfs).drop(columns="bigframes_stratify_col") ) return [train_df, test_df] diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 8bf89b0838..22a3e7e222 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import cast, Iterable, List, Literal, Optional, Union +from typing import Iterable, List, Literal, Optional, Union import bigframes_vendored.sklearn.preprocessing._data import bigframes_vendored.sklearn.preprocessing._discretization @@ -470,7 +470,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[OneHotEncoder, str]: s = sql[sql.find("(") + 1 : sql.find(")")] col_label, drop_str, top_k, frequency_threshold = s.split(", ") drop = ( - cast(Literal["most_frequent"], "most_frequent") + typing.cast(Literal["most_frequent"], "most_frequent") if drop_str.lower() == "'most_frequent'" else None ) From 2d973b54550f30429dbd10894f78db7bb0c57345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 6 Feb 2026 11:51:55 -0600 Subject: [PATCH 09/18] fix: always display the results in the `%%bqsql` cell magics output (#2439) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes makes the magics function more similar to SQL Cell. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/_magics.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/bigframes/_magics.py b/bigframes/_magics.py index 33205e3a37..613f71219b 100644 --- a/bigframes/_magics.py +++ b/bigframes/_magics.py @@ -47,10 +47,9 @@ def _cell_magic(line, cell): ) if args.destination_var: ipython.push({args.destination_var: dataframe}) - else: - with bigframes.option_context( - "display.repr_mode", - "anywidget", - ): - display(dataframe) - return + + with bigframes.option_context( + "display.repr_mode", + "anywidget", + ): + display(dataframe) From 853240daf45301ad534c635c8955cb6ce91d23c2 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 6 Feb 2026 10:11:59 -0800 Subject: [PATCH 10/18] feat: Disable progress bars in Anywidget mode to reduce notebook clutter (#2437) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR improves the user experience of the interactive Anywidget display mode by automatically disabling progress bars and job logging during widget operations. Changes: - Wrapped the get_anywidget_bundle call in repr_mimebundle with option_context("display.progress_bar", None) to silence the initial widget load. - Wrapped TableWidget._initial_load and TableWidget._set_table_html methods to silence subsequent interactions like pagination and sorting. Motivation: When interacting with the TableWidget (paging, sorting), the repeated appearance of progress bars creates visual noise and clutter in the notebook output cell, distracting from the interactive data exploration experience. This change ensures a clean and seamless interface. 
Verified at: - vs code notebook: https://screencast.googleplex.com/cast/NTY2NDkzOTc4Mjk2MzIwMHwwYjU0Njc1MS03MA - colab notebook: https://screencast.googleplex.com/cast/NjIzMjM0NTEyNzQxOTkwNHxiMDM0YzM2Ni1iZQ Fixes #<482120359> 🦕 --- bigframes/core/compile/polars/compiler.py | 64 ++++- bigframes/core/compile/polars/lowering.py | 2 +- bigframes/display/anywidget.py | 39 +-- bigframes/display/html.py | 3 +- bigframes/operations/json_ops.py | 1 + bigframes/session/polars_executor.py | 2 + notebooks/dataframes/anywidget_mode.ipynb | 336 +++++++--------------- tests/unit/test_series_polars.py | 1 - 8 files changed, 195 insertions(+), 253 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 1f0ca592e5..a9fd492f98 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,6 +16,7 @@ import dataclasses import functools import itertools +import json from typing import cast, Literal, Optional, Sequence, Tuple, Type, TYPE_CHECKING import pandas as pd @@ -429,7 +430,68 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.JSONDecode) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) - return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) + target_dtype = _bigframes_dtype_to_polars_dtype(op.to_type) + if op.safe: + # Polars does not support safe JSON decoding (returning null on failure). + # We use map_elements to provide safe JSON decoding. 
+ def safe_decode(val): + if val is None: + return None + try: + decoded = json.loads(val) + except Exception: + return None + + if decoded is None: + return None + + if op.to_type == bigframes.dtypes.INT_DTYPE: + if type(decoded) is bool: + return None + if isinstance(decoded, int): + return decoded + if isinstance(decoded, float): + if decoded.is_integer(): + return int(decoded) + if isinstance(decoded, str): + try: + return int(decoded) + except Exception: + pass + return None + + if op.to_type == bigframes.dtypes.FLOAT_DTYPE: + if type(decoded) is bool: + return None + if isinstance(decoded, (int, float)): + return float(decoded) + if isinstance(decoded, str): + try: + return float(decoded) + except Exception: + pass + return None + + if op.to_type == bigframes.dtypes.BOOL_DTYPE: + if isinstance(decoded, bool): + return decoded + if isinstance(decoded, str): + if decoded.lower() == "true": + return True + if decoded.lower() == "false": + return False + return None + + if op.to_type == bigframes.dtypes.STRING_DTYPE: + if isinstance(decoded, str): + return decoded + return None + + return decoded + + return input.map_elements(safe_decode, return_dtype=target_dtype) + + return input.str.json_decode(target_dtype) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index bf617d6879..5f80904b3b 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -391,7 +391,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): return arg if arg.output_type == dtypes.JSON_DTYPE: - return json_ops.JSONDecode(cast_op.to_type).as_expr(arg) + return json_ops.JSONDecode(cast_op.to_type, safe=cast_op.safe).as_expr(arg) if ( arg.output_type == dtypes.STRING_DTYPE and cast_op.to_type == dtypes.DATETIME_DTYPE diff --git a/bigframes/display/anywidget.py 
b/bigframes/display/anywidget.py index be0d2b45d0..9bd20fce43 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -133,25 +133,26 @@ def _initial_load(self) -> None: # obtain the row counts # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` # before we get here so that the count might already be cached. - self._reset_batches_for_new_page_size() + with bigframes.option_context("display.progress_bar", None): + self._reset_batches_for_new_page_size() - if self._batches is None: - self._error_message = ( - "Could not retrieve data batches. Data might be unavailable or " - "an error occurred." - ) - self.row_count = None - elif self._batches.total_rows is None: - # Total rows is unknown, this is an expected state. - # TODO(b/461536343): Cheaply discover if we have exactly 1 page. - # There are cases where total rows is not set, but there are no additional - # pages. We could disable the "next" button in these cases. - self.row_count = None - else: - self.row_count = self._batches.total_rows + if self._batches is None: + self._error_message = ( + "Could not retrieve data batches. Data might be unavailable or " + "an error occurred." + ) + self.row_count = None + elif self._batches.total_rows is None: + # Total rows is unknown, this is an expected state. + # TODO(b/461536343): Cheaply discover if we have exactly 1 page. + # There are cases where total rows is not set, but there are no additional + # pages. We could disable the "next" button in these cases. 
+ self.row_count = None + else: + self.row_count = self._batches.total_rows - # get the initial page - self._set_table_html() + # get the initial page + self._set_table_html() @traitlets.observe("_initial_load_complete") def _on_initial_load_complete(self, change: dict[str, Any]): @@ -281,7 +282,9 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" new_page = None - with self._setting_html_lock: + with self._setting_html_lock, bigframes.option_context( + "display.progress_bar", None + ): if self._error_message: self.table_html = ( f"
" diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 6102d1512c..2bf847a259 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -363,7 +363,8 @@ def repr_mimebundle( if opts.repr_mode == "anywidget": try: - return get_anywidget_bundle(obj, include=include, exclude=exclude) + with bigframes.option_context("display.progress_bar", None): + return get_anywidget_bundle(obj, include=include, exclude=exclude) except ImportError: # Anywidget is an optional dependency, so warn rather than fail. # TODO(shuowei): When Anywidget becomes the default for all repr modes, diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 7260a79223..3d3ccfef11 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -220,6 +220,7 @@ def output_type(self, *input_types): class JSONDecode(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_decode" to_type: dtypes.Dtype + safe: bool = False def output_type(self, *input_types): input_type = input_types[0] diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 575beff8fc..ead0b0591b 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -34,6 +34,7 @@ numeric_ops, string_ops, ) +import bigframes.operations.json_ops as json_ops from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -94,6 +95,7 @@ string_ops.EndsWithOp, string_ops.StrContainsOp, string_ops.StrContainsRegexOp, + json_ops.JSONDecode, ) _COMPATIBLE_AGG_OPS = ( agg_ops.SizeOp, diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index e9491610ac..3bc12617fc 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -91,7 +91,9 @@ "outputs": [ 
{ "data": { - "text/html": [], + "text/html": [ + "Starting." + ], "text/plain": [ "" ] @@ -117,17 +119,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Annie 482\n", + " AL F 1910 Myrtle 104\n", + " AR F 1910 Lillian 56\n", + " CT F 1910 Anne 38\n", + " CT F 1910 Frances 45\n", + " FL F 1910 Margaret 53\n", + " GA F 1910 Mae 73\n", + " GA F 1910 Beatrice 96\n", + " GA F 1910 Lola 47\n", + " IA F 1910 Viola 49\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -145,30 +147,10 @@ "id": "220340b0", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6fb22be7f21f4d1dacd76dc62a1a7818", + "model_id": "c74c3719ba43489890185b5c9880acfc", "version_major": 2, "version_minor": 1 }, @@ -204,80 +186,80 @@ " AL\n", " F\n", " 1910\n", - " Lillian\n", - " 99\n", + " Hazel\n", + " 51\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Ruby\n", - " 204\n", + " Lucy\n", + " 76\n", " \n", " \n", " 2\n", - " AL\n", + " AR\n", " F\n", " 1910\n", - " Helen\n", - " 76\n", + " Nellie\n", + " 39\n", " \n", " \n", " 3\n", - " AL\n", + " AR\n", " F\n", " 1910\n", - " Eunice\n", - " 41\n", + " Lena\n", + " 40\n", " \n", " \n", " 4\n", - " AR\n", + " CO\n", " F\n", " 1910\n", - " Dora\n", - " 42\n", + " Thelma\n", + " 36\n", " \n", " \n", " 5\n", - " CA\n", + " CO\n", " F\n", " 1910\n", - " Edna\n", - " 62\n", + " Ruth\n", + " 
68\n", " \n", " \n", " 6\n", - " CA\n", + " CT\n", " F\n", " 1910\n", - " Helen\n", - " 239\n", + " Elizabeth\n", + " 86\n", " \n", " \n", " 7\n", - " CO\n", + " DC\n", " F\n", " 1910\n", - " Alice\n", - " 46\n", + " Mary\n", + " 80\n", " \n", " \n", " 8\n", " FL\n", " F\n", " 1910\n", - " Willie\n", - " 71\n", + " Annie\n", + " 101\n", " \n", " \n", " 9\n", " FL\n", " F\n", " 1910\n", - " Thelma\n", - " 65\n", + " Alma\n", + " 39\n", " \n", " \n", "\n", @@ -285,17 +267,17 @@ "
[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Lillian 99\n", - " AL F 1910 Ruby 204\n", - " AL F 1910 Helen 76\n", - " AL F 1910 Eunice 41\n", - " AR F 1910 Dora 42\n", - " CA F 1910 Edna 62\n", - " CA F 1910 Helen 239\n", - " CO F 1910 Alice 46\n", - " FL F 1910 Willie 71\n", - " FL F 1910 Thelma 65\n", + "state gender year name number\n", + " AL F 1910 Hazel 51\n", + " AL F 1910 Lucy 76\n", + " AR F 1910 Nellie 39\n", + " AR F 1910 Lena 40\n", + " CO F 1910 Thelma 36\n", + " CO F 1910 Ruth 68\n", + " CT F 1910 Elizabeth 86\n", + " DC F 1910 Mary 80\n", + " FL F 1910 Annie 101\n", + " FL F 1910 Alma 39\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -329,7 +311,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 41 seconds of slot time. [Job bigframes-dev:US.492b5260-9f44-495c-be09-2ae1324a986c details]\n", + " Query processed 171.4 MB in 35 seconds of slot time. [Job bigframes-dev:US.e15f1b34-e414-42d2-857b-926ea25947c4 details]\n", " " ], "text/plain": [ @@ -355,7 +337,9 @@ }, { "data": { - "text/html": [], + "text/html": [ + "Starting." + ], "text/plain": [ "" ] @@ -404,38 +388,10 @@ "id": "da23e0f3", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gsx0h2jHoOSYwqGKUS3lAYLf_qi3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 88.8 MB in 3 seconds of slot time. 
[Job bigframes-dev:US.job_1VivAJ2InPdg5RXjWfvAJ1B0oxO3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7d82208e7e5e40dd9dbf64c4c561cab3", + "model_id": "2ad9004bda464950ab6eda63b1b86a3a", "version_major": 2, "version_minor": 1 }, @@ -533,34 +489,6 @@ "id": "6920d49b", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 10 seconds of slot time. [Job bigframes-dev:US.job_cmNyG5sJ1IDCyFINx7teExQOZ6UQ details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_aQvP3Sn04Ss4flSLaLhm0sKzFvrd details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -571,12 +499,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "52d11291ba1d42e6b544acbd86eef6cf", + "model_id": "c1b84125429c4fbc90a22e1adbeea901", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -648,34 +576,6 @@ "id": "a9d5d13a", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 215.9 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. 
\n", - " Query processed 215.9 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -686,12 +586,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "32c61c84740d45a0ac37202a76c7c14e", + "model_id": "23fc730e004b4eec807d2829bffa3ce0", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -752,33 +652,7 @@ "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", @@ -792,7 +666,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9d60a47296214553bb10c434b5ee8330", + "model_id": "a3bf6021c6fd44299d317d3b44213b50", "version_major": 2, "version_minor": 1 }, @@ -839,24 +713,6 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", - " 29.08.018\n", - " E04H 6/12\n", - " <NA>\n", - " 18157874.1\n", - " 
21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanw√§lte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", - " EP 3 366 869 A1\n", - " \n", - " \n", - " 1\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", " 03.10.2018\n", " H05B 6/12\n", " <NA>\n", @@ -870,7 +726,7 @@ " EP 3 383 141 A2\n", " \n", " \n", - " 2\n", + " 1\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -888,7 +744,7 @@ " EP 3 382 744 A1\n", " \n", " \n", - " 3\n", + " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -906,7 +762,7 @@ " EP 3 382 553 A1\n", " \n", " \n", - " 4\n", + " 3\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -923,6 +779,24 @@ " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", + " \n", + " 4\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", + " 29.08.018\n", + " E04H 6/12\n", + " <NA>\n", + " 18157874.1\n", + " 21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanw√§lte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", + " EP 3 366 869 A1\n", + " \n", " \n", "\n", "

5 rows × 15 columns

\n", @@ -944,32 +818,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 29.08.018 E04H 6/12 18157874.1 \n", - "1 03.10.2018 H05B 6/12 18165514.3 \n", - "2 03.10.2018 H01L 21/20 18166536.5 \n", - "3 03.10.2018 G06F 11/30 18157347.8 \n", - "4 03.10.2018 A01K 31/00 18171005.4 \n", + "0 03.10.2018 H05B 6/12 18165514.3 \n", + "1 03.10.2018 H01L 21/20 18166536.5 \n", + "2 03.10.2018 G06F 11/30 18157347.8 \n", + "3 03.10.2018 A01K 31/00 18171005.4 \n", + "4 29.08.018 E04H 6/12 18157874.1 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", - "1 03.04.2018 30.03.2017 \n", - "2 16.02.2016 Scheider, Sascha et al \n", - "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "4 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "0 03.04.2018 30.03.2017 \n", + "1 16.02.2016 Scheider, Sascha et al \n", + "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "3 05.02.2015 05.02.2014 Stork Bamberger Patentanwälte \n", + "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanwälte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "1 BSH Hausgeräte GmbH Acero Acero, Jesus \n", - "2 EV Group E. Thallner GmbH Kurz, Florian \n", - "3 FUJITSU LIMITED Kukihara, Kensuke \n", - "4 Linco Food Systems A/S Thrane, Uffe \n", + "0 BSH Hausgeräte GmbH Acero Acero, Jesus \n", + "1 EV Group E. Thallner GmbH Kurz, Florian \n", + "2 FUJITSU LIMITED Kukihara, Kensuke \n", + "3 Linco Food Systems A/S Thrane, Uffe \n", + "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", "\n", " title_line_1 number \n", - "0 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", - "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", - "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... 
EP 3 382 553 A1 \n", - "4 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "0 VORRICHTUNG ZUR INDUKTIVEN ENERGIEÜBERTRAGUNG EP 3 383 141 A2 \n", + "1 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "3 MASTHÄHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "4 STEUERUNGSSYSTEM FÜR AUTOMATISCHE PARKHÄUSER EP 3 366 869 A1 \n", "\n", "[5 rows x 15 columns]" ] diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 516a46d4dd..6f04631264 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -4142,7 +4142,6 @@ def test_json_astype_others_raise_error(data, to_type): bf_series.astype(to_type, errors="raise").to_pandas() -@pytest.mark.skip(reason="AssertionError: Series NA mask are different") @pytest.mark.parametrize( ("data", "to_type"), [ From 1fc29f62996e97f41a31114e4f83e3389c441d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 6 Feb 2026 14:03:15 -0600 Subject: [PATCH 11/18] Revert "feat: Disable progress bars in Anywidget mode to reduce notebook clutter" (#2442) Reverts googleapis/python-bigquery-dataframes#2437 --- bigframes/core/compile/polars/compiler.py | 64 +---- bigframes/core/compile/polars/lowering.py | 2 +- bigframes/display/anywidget.py | 39 ++- bigframes/display/html.py | 3 +- bigframes/operations/json_ops.py | 1 - bigframes/session/polars_executor.py | 2 - notebooks/dataframes/anywidget_mode.ipynb | 336 +++++++++++++++------- tests/unit/test_series_polars.py | 1 + 8 files changed, 253 insertions(+), 195 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index a9fd492f98..1f0ca592e5 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,7 +16,6 @@ import dataclasses import functools import itertools -import json 
from typing import cast, Literal, Optional, Sequence, Tuple, Type, TYPE_CHECKING import pandas as pd @@ -430,68 +429,7 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.JSONDecode) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) - target_dtype = _bigframes_dtype_to_polars_dtype(op.to_type) - if op.safe: - # Polars does not support safe JSON decoding (returning null on failure). - # We use map_elements to provide safe JSON decoding. - def safe_decode(val): - if val is None: - return None - try: - decoded = json.loads(val) - except Exception: - return None - - if decoded is None: - return None - - if op.to_type == bigframes.dtypes.INT_DTYPE: - if type(decoded) is bool: - return None - if isinstance(decoded, int): - return decoded - if isinstance(decoded, float): - if decoded.is_integer(): - return int(decoded) - if isinstance(decoded, str): - try: - return int(decoded) - except Exception: - pass - return None - - if op.to_type == bigframes.dtypes.FLOAT_DTYPE: - if type(decoded) is bool: - return None - if isinstance(decoded, (int, float)): - return float(decoded) - if isinstance(decoded, str): - try: - return float(decoded) - except Exception: - pass - return None - - if op.to_type == bigframes.dtypes.BOOL_DTYPE: - if isinstance(decoded, bool): - return decoded - if isinstance(decoded, str): - if decoded.lower() == "true": - return True - if decoded.lower() == "false": - return False - return None - - if op.to_type == bigframes.dtypes.STRING_DTYPE: - if isinstance(decoded, str): - return decoded - return None - - return decoded - - return input.map_elements(safe_decode, return_dtype=target_dtype) - - return input.str.json_decode(target_dtype) + return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/bigframes/core/compile/polars/lowering.py 
b/bigframes/core/compile/polars/lowering.py index 5f80904b3b..bf617d6879 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -391,7 +391,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): return arg if arg.output_type == dtypes.JSON_DTYPE: - return json_ops.JSONDecode(cast_op.to_type, safe=cast_op.safe).as_expr(arg) + return json_ops.JSONDecode(cast_op.to_type).as_expr(arg) if ( arg.output_type == dtypes.STRING_DTYPE and cast_op.to_type == dtypes.DATETIME_DTYPE diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 9bd20fce43..be0d2b45d0 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -133,26 +133,25 @@ def _initial_load(self) -> None: # obtain the row counts # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` # before we get here so that the count might already be cached. - with bigframes.option_context("display.progress_bar", None): - self._reset_batches_for_new_page_size() + self._reset_batches_for_new_page_size() - if self._batches is None: - self._error_message = ( - "Could not retrieve data batches. Data might be unavailable or " - "an error occurred." - ) - self.row_count = None - elif self._batches.total_rows is None: - # Total rows is unknown, this is an expected state. - # TODO(b/461536343): Cheaply discover if we have exactly 1 page. - # There are cases where total rows is not set, but there are no additional - # pages. We could disable the "next" button in these cases. - self.row_count = None - else: - self.row_count = self._batches.total_rows + if self._batches is None: + self._error_message = ( + "Could not retrieve data batches. Data might be unavailable or " + "an error occurred." + ) + self.row_count = None + elif self._batches.total_rows is None: + # Total rows is unknown, this is an expected state. + # TODO(b/461536343): Cheaply discover if we have exactly 1 page. 
+ # There are cases where total rows is not set, but there are no additional + # pages. We could disable the "next" button in these cases. + self.row_count = None + else: + self.row_count = self._batches.total_rows - # get the initial page - self._set_table_html() + # get the initial page + self._set_table_html() @traitlets.observe("_initial_load_complete") def _on_initial_load_complete(self, change: dict[str, Any]): @@ -282,9 +281,7 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" new_page = None - with self._setting_html_lock, bigframes.option_context( - "display.progress_bar", None - ): + with self._setting_html_lock: if self._error_message: self.table_html = ( f"
" diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 2bf847a259..6102d1512c 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -363,8 +363,7 @@ def repr_mimebundle( if opts.repr_mode == "anywidget": try: - with bigframes.option_context("display.progress_bar", None): - return get_anywidget_bundle(obj, include=include, exclude=exclude) + return get_anywidget_bundle(obj, include=include, exclude=exclude) except ImportError: # Anywidget is an optional dependency, so warn rather than fail. # TODO(shuowei): When Anywidget becomes the default for all repr modes, diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 3d3ccfef11..7260a79223 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -220,7 +220,6 @@ def output_type(self, *input_types): class JSONDecode(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_decode" to_type: dtypes.Dtype - safe: bool = False def output_type(self, *input_types): input_type = input_types[0] diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index ead0b0591b..575beff8fc 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -34,7 +34,6 @@ numeric_ops, string_ops, ) -import bigframes.operations.json_ops as json_ops from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -95,7 +94,6 @@ string_ops.EndsWithOp, string_ops.StrContainsOp, string_ops.StrContainsRegexOp, - json_ops.JSONDecode, ) _COMPATIBLE_AGG_OPS = ( agg_ops.SizeOp, diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 3bc12617fc..e9491610ac 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -91,9 +91,7 @@ "outputs": [ 
{ "data": { - "text/html": [ - "Starting." - ], + "text/html": [], "text/plain": [ "" ] @@ -119,17 +117,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "state gender year name number\n", - " AL F 1910 Annie 482\n", - " AL F 1910 Myrtle 104\n", - " AR F 1910 Lillian 56\n", - " CT F 1910 Anne 38\n", - " CT F 1910 Frances 45\n", - " FL F 1910 Margaret 53\n", - " GA F 1910 Mae 73\n", - " GA F 1910 Beatrice 96\n", - " GA F 1910 Lola 47\n", - " IA F 1910 Viola 49\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]\n" @@ -147,10 +145,30 @@ "id": "220340b0", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c74c3719ba43489890185b5c9880acfc", + "model_id": "6fb22be7f21f4d1dacd76dc62a1a7818", "version_major": 2, "version_minor": 1 }, @@ -186,80 +204,80 @@ " AL\n", " F\n", " 1910\n", - " Hazel\n", - " 51\n", + " Lillian\n", + " 99\n", " \n", " \n", " 1\n", " AL\n", " F\n", " 1910\n", - " Lucy\n", - " 76\n", + " Ruby\n", + " 204\n", " \n", " \n", " 2\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Nellie\n", - " 39\n", + " Helen\n", + " 76\n", " \n", " \n", " 3\n", - " AR\n", + " AL\n", " F\n", " 1910\n", - " Lena\n", - " 40\n", + " Eunice\n", + " 41\n", " \n", " \n", " 4\n", - " CO\n", + " AR\n", " F\n", " 1910\n", - " Thelma\n", - " 36\n", + " Dora\n", + " 42\n", " \n", " \n", " 5\n", - " CO\n", + " CA\n", " F\n", " 1910\n", - " Ruth\n", - " 68\n", + " Edna\n", + " 
62\n", " \n", " \n", " 6\n", - " CT\n", + " CA\n", " F\n", " 1910\n", - " Elizabeth\n", - " 86\n", + " Helen\n", + " 239\n", " \n", " \n", " 7\n", - " DC\n", + " CO\n", " F\n", " 1910\n", - " Mary\n", - " 80\n", + " Alice\n", + " 46\n", " \n", " \n", " 8\n", " FL\n", " F\n", " 1910\n", - " Annie\n", - " 101\n", + " Willie\n", + " 71\n", " \n", " \n", " 9\n", " FL\n", " F\n", " 1910\n", - " Alma\n", - " 39\n", + " Thelma\n", + " 65\n", " \n", " \n", "\n", @@ -267,17 +285,17 @@ "
[5552452 rows x 5 columns in total]" ], "text/plain": [ - "state gender year name number\n", - " AL F 1910 Hazel 51\n", - " AL F 1910 Lucy 76\n", - " AR F 1910 Nellie 39\n", - " AR F 1910 Lena 40\n", - " CO F 1910 Thelma 36\n", - " CO F 1910 Ruth 68\n", - " CT F 1910 Elizabeth 86\n", - " DC F 1910 Mary 80\n", - " FL F 1910 Annie 101\n", - " FL F 1910 Alma 39\n", + "state gender year name number\n", + " AL F 1910 Lillian 99\n", + " AL F 1910 Ruby 204\n", + " AL F 1910 Helen 76\n", + " AL F 1910 Eunice 41\n", + " AR F 1910 Dora 42\n", + " CA F 1910 Edna 62\n", + " CA F 1910 Helen 239\n", + " CO F 1910 Alice 46\n", + " FL F 1910 Willie 71\n", + " FL F 1910 Thelma 65\n", "...\n", "\n", "[5552452 rows x 5 columns]" @@ -311,7 +329,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 171.4 MB in 35 seconds of slot time. [Job bigframes-dev:US.e15f1b34-e414-42d2-857b-926ea25947c4 details]\n", + " Query processed 171.4 MB in 41 seconds of slot time. [Job bigframes-dev:US.492b5260-9f44-495c-be09-2ae1324a986c details]\n", " " ], "text/plain": [ @@ -337,9 +355,7 @@ }, { "data": { - "text/html": [ - "Starting." - ], + "text/html": [], "text/plain": [ "" ] @@ -388,10 +404,38 @@ "id": "da23e0f3", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 88.8 MB in 2 seconds of slot time. [Job bigframes-dev:US.job_gsx0h2jHoOSYwqGKUS3lAYLf_qi3 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 88.8 MB in 3 seconds of slot time. 
[Job bigframes-dev:US.job_1VivAJ2InPdg5RXjWfvAJ1B0oxO3 details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2ad9004bda464950ab6eda63b1b86a3a", + "model_id": "7d82208e7e5e40dd9dbf64c4c561cab3", "version_major": 2, "version_minor": 1 }, @@ -489,6 +533,34 @@ "id": "6920d49b", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 215.9 MB in 10 seconds of slot time. [Job bigframes-dev:US.job_cmNyG5sJ1IDCyFINx7teExQOZ6UQ details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 215.9 MB in 8 seconds of slot time. [Job bigframes-dev:US.job_aQvP3Sn04Ss4flSLaLhm0sKzFvrd details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -499,12 +571,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c1b84125429c4fbc90a22e1adbeea901", + "model_id": "52d11291ba1d42e6b544acbd86eef6cf", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 8, @@ -576,6 +648,34 @@ "id": "a9d5d13a", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 215.9 MB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. 
\n", + " Query processed 215.9 MB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -586,12 +686,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "23fc730e004b4eec807d2829bffa3ce0", + "model_id": "32c61c84740d45a0ac37202a76c7c14e", "version_major": 2, "version_minor": 1 }, "text/plain": [ - "" + "" ] }, "execution_count": 10, @@ -652,7 +752,33 @@ "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:987: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", @@ -666,7 +792,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a3bf6021c6fd44299d317d3b44213b50", + "model_id": "9d60a47296214553bb10c434b5ee8330", "version_major": 2, "version_minor": 1 }, @@ -713,6 +839,24 @@ " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", " DE\n", + " 29.08.018\n", + " E04H 6/12\n", + " <NA>\n", + " 18157874.1\n", + " 
21.02.2018\n", + " 22.02.2017\n", + " Liedtke & Partner Patentanw√§lte\n", + " SHB Hebezeugbau GmbH\n", + " VOLGER, Alexander\n", + " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", + " EP 3 366 869 A1\n", + " \n", + " \n", + " 1\n", + " {'application_number': None, 'class_internatio...\n", + " gs://gcs-public-data--labeled-patents/espacene...\n", + " EU\n", + " DE\n", " 03.10.2018\n", " H05B 6/12\n", " <NA>\n", @@ -726,7 +870,7 @@ " EP 3 383 141 A2\n", " \n", " \n", - " 1\n", + " 2\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -744,7 +888,7 @@ " EP 3 382 744 A1\n", " \n", " \n", - " 2\n", + " 3\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -762,7 +906,7 @@ " EP 3 382 553 A1\n", " \n", " \n", - " 3\n", + " 4\n", " {'application_number': None, 'class_internatio...\n", " gs://gcs-public-data--labeled-patents/espacene...\n", " EU\n", @@ -779,24 +923,6 @@ " MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E...\n", " EP 3 381 276 A1\n", " \n", - " \n", - " 4\n", - " {'application_number': None, 'class_internatio...\n", - " gs://gcs-public-data--labeled-patents/espacene...\n", - " EU\n", - " DE\n", - " 29.08.018\n", - " E04H 6/12\n", - " <NA>\n", - " 18157874.1\n", - " 21.02.2018\n", - " 22.02.2017\n", - " Liedtke & Partner Patentanw√§lte\n", - " SHB Hebezeugbau GmbH\n", - " VOLGER, Alexander\n", - " STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER\n", - " EP 3 366 869 A1\n", - " \n", " \n", "\n", "

5 rows × 15 columns

\n", @@ -818,32 +944,32 @@ "4 gs://gcs-public-data--labeled-patents/espacene... EU DE \n", "\n", " publication_date class_international class_us application_number \\\n", - "0 03.10.2018 H05B 6/12 18165514.3 \n", - "1 03.10.2018 H01L 21/20 18166536.5 \n", - "2 03.10.2018 G06F 11/30 18157347.8 \n", - "3 03.10.2018 A01K 31/00 18171005.4 \n", - "4 29.08.018 E04H 6/12 18157874.1 \n", + "0 29.08.018 E04H 6/12 18157874.1 \n", + "1 03.10.2018 H05B 6/12 18165514.3 \n", + "2 03.10.2018 H01L 21/20 18166536.5 \n", + "3 03.10.2018 G06F 11/30 18157347.8 \n", + "4 03.10.2018 A01K 31/00 18171005.4 \n", "\n", " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 03.04.2018 30.03.2017 \n", - "1 16.02.2016 Scheider, Sascha et al \n", - "2 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "3 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", - "4 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "1 03.04.2018 30.03.2017 \n", + "2 16.02.2016 Scheider, Sascha et al \n", + "3 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "4 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", "\n", " applicant_line_1 inventor_line_1 \\\n", - "0 BSH Hausger√§te GmbH Acero Acero, Jesus \n", - "1 EV Group E. Thallner GmbH Kurz, Florian \n", - "2 FUJITSU LIMITED Kukihara, Kensuke \n", - "3 Linco Food Systems A/S Thrane, Uffe \n", - "4 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "1 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "2 EV Group E. Thallner GmbH Kurz, Florian \n", + "3 FUJITSU LIMITED Kukihara, Kensuke \n", + "4 Linco Food Systems A/S Thrane, Uffe \n", "\n", " title_line_1 number \n", - "0 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", - "1 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "2 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "3 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... 
EP 3 381 276 A1 \n", - "4 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "0 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "1 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "2 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "3 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "4 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", "\n", "[5 rows x 15 columns]" ] diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 6f04631264..516a46d4dd 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -4142,6 +4142,7 @@ def test_json_astype_others_raise_error(data, to_type): bf_series.astype(to_type, errors="raise").to_pandas() +@pytest.mark.skip(reason="AssertionError: Series NA mask are different") @pytest.mark.parametrize( ("data", "to_type"), [ From 311de31e79227408515f087dafbab7edc54ddf1b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 6 Feb 2026 14:27:42 -0800 Subject: [PATCH 12/18] fix: exlcude gcsfs 2026.2.0 (#2445) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<482422167> 🦕 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38c2a6032e..d393375d14 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ # please keep these in sync with the minimum versions in testing/constraints-3.10.txt "cloudpickle >= 2.0.0", "fsspec >=2023.3.0", - "gcsfs >=2023.3.0, !=2025.5.0", + "gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", "google-cloud-bigquery[bqstorage,pandas] >=3.36.0", From 3cb57b1ccf84eb40141b253c3fb5b76182adbc89 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 6 Feb 2026 15:16:25 -0800 Subject: [PATCH 13/18] tests: Fix global session dependence of astype tests (#2443) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/small/test_series.py | 69 ++++++++++++++++--------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index a95c9623e5..f5408dc323 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3885,9 +3885,9 @@ def test_date_time_astype_int( assert bf_result.dtype == "Int64" -def test_string_astype_int(): - pd_series = pd.Series(["4", "-7", "0", " -03"]) - bf_series = series.Series(pd_series) +def test_string_astype_int(session): + pd_series = pd.Series(["4", "-7", "0", "-03"]) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype("Int64") bf_result = bf_series.astype("Int64").to_pandas() @@ -3895,12 +3895,12 @@ def test_string_astype_int(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_float(): +def test_string_astype_float(session): pd_series = pd.Series( - ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] + ["1", "-1", "-0", "000", "-03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] ) - bf_series = series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype("Float64") bf_result = bf_series.astype("Float64").to_pandas() @@ -3908,7 +3908,7 @@ def test_string_astype_float(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_date(): +def test_string_astype_date(session): if int(pa.__version__.split(".")[0]) < 15: pytest.skip( "Avoid pyarrow.lib.ArrowNotImplementedError: " @@ -3919,7 +3919,7 @@ def test_string_astype_date(): pd.ArrowDtype(pa.string()) ) - bf_series = 
series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) # TODO(b/340885567): fix type error pd_result = pd_series.astype("date32[day][pyarrow]") # type: ignore @@ -3928,12 +3928,12 @@ def test_string_astype_date(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_datetime(): +def test_string_astype_datetime(session): pd_series = pd.Series( ["2014-08-15 08:15:12", "2015-08-15 08:15:12.654754", "2016-02-29 00:00:00"] ).astype(pd.ArrowDtype(pa.string())) - bf_series = series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us"))) bf_result = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas() @@ -3941,7 +3941,7 @@ def test_string_astype_datetime(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_string_astype_timestamp(): +def test_string_astype_timestamp(session): pd_series = pd.Series( [ "2014-08-15 08:15:12+00:00", @@ -3950,7 +3950,7 @@ def test_string_astype_timestamp(): ] ).astype(pd.ArrowDtype(pa.string())) - bf_series = series.Series(pd_series) + bf_series = series.Series(pd_series, session=session) pd_result = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) bf_result = bf_series.astype( @@ -3960,13 +3960,14 @@ def test_string_astype_timestamp(): pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) -def test_timestamp_astype_string(): +def test_timestamp_astype_string(session): bf_series = series.Series( [ "2014-08-15 08:15:12+00:00", "2015-08-15 08:15:12.654754+05:00", "2016-02-29 00:00:00+08:00", - ] + ], + session=session, ).astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))) expected_result = pd.Series( @@ -3985,9 +3986,9 @@ def test_timestamp_astype_string(): @pytest.mark.parametrize("errors", ["raise", "null"]) -def test_float_astype_json(errors): +def test_float_astype_json(errors, session): data = ["1.25", 
"2500000000", None, "-12323.24"] - bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) assert bf_result.dtype == dtypes.JSON_DTYPE @@ -3997,9 +3998,9 @@ def test_float_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), expected_result) -def test_float_astype_json_str(): +def test_float_astype_json_str(session): data = ["1.25", "2500000000", None, "-12323.24"] - bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE) + bf_series = series.Series(data, dtype=dtypes.FLOAT_DTYPE, session=session) bf_result = bf_series.astype("json") assert bf_result.dtype == dtypes.JSON_DTYPE @@ -4010,14 +4011,14 @@ def test_float_astype_json_str(): @pytest.mark.parametrize("errors", ["raise", "null"]) -def test_string_astype_json(errors): +def test_string_astype_json(errors, session): data = [ "1", None, '["1","3","5"]', '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors=errors) assert bf_result.dtype == dtypes.JSON_DTYPE @@ -4026,9 +4027,9 @@ def test_string_astype_json(errors): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) -def test_string_astype_json_in_safe_mode(): +def test_string_astype_json_in_safe_mode(session): data = ["this is not a valid json string"] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE, session=session) bf_result = bf_series.astype(dtypes.JSON_DTYPE, errors="null") assert bf_result.dtype == dtypes.JSON_DTYPE @@ -4037,9 +4038,9 @@ def test_string_astype_json_in_safe_mode(): pd.testing.assert_series_equal(bf_result.to_pandas(), expected) -def test_string_astype_json_raise_error(): +def 
test_string_astype_json_raise_error(session): data = ["this is not a valid json string"] - bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE) + bf_series = series.Series(data, dtype=dtypes.STRING_DTYPE, session=session) with pytest.raises( google.api_core.exceptions.BadRequest, match="syntax error while parsing value", @@ -4063,8 +4064,8 @@ def test_string_astype_json_raise_error(): ), ], ) -def test_json_astype_others(data, to_type, errors): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) +def test_json_astype_others(data, to_type, errors, session): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE, session=session) bf_result = bf_series.astype(to_type, errors=errors) assert bf_result.dtype == to_type @@ -4084,8 +4085,8 @@ def test_json_astype_others(data, to_type, errors): pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), ], ) -def test_json_astype_others_raise_error(data, to_type): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) +def test_json_astype_others_raise_error(data, to_type, session): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE, session=session) with pytest.raises(google.api_core.exceptions.BadRequest): bf_series.astype(to_type, errors="raise").to_pandas() @@ -4099,8 +4100,8 @@ def test_json_astype_others_raise_error(data, to_type): pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), ], ) -def test_json_astype_others_in_safe_mode(data, to_type): - bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE) +def test_json_astype_others_in_safe_mode(data, to_type, session): + bf_series = series.Series(data, dtype=dtypes.JSON_DTYPE, session=session) bf_result = bf_series.astype(to_type, errors="null") assert bf_result.dtype == to_type @@ -4414,8 +4415,8 @@ def test_query_job_setters(scalars_dfs): ([1, 1, 1, 1, 1],), ], ) -def test_is_monotonic_increasing(series_input): - scalars_df = series.Series(series_input, dtype=pd.Int64Dtype()) +def 
test_is_monotonic_increasing(series_input, session): + scalars_df = series.Series(series_input, dtype=pd.Int64Dtype(), session=session) scalars_pandas_df = pd.Series(series_input, dtype=pd.Int64Dtype()) assert ( scalars_df.is_monotonic_increasing == scalars_pandas_df.is_monotonic_increasing @@ -4433,8 +4434,8 @@ def test_is_monotonic_increasing(series_input): ([1, 1, 1, 1, 1],), ], ) -def test_is_monotonic_decreasing(series_input): - scalars_df = series.Series(series_input) +def test_is_monotonic_decreasing(series_input, session): + scalars_df = series.Series(series_input, session=session) scalars_pandas_df = pd.Series(series_input) assert ( scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing From 4e2689a1c975c4cabaf36b7d0817dcbedc926853 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 6 Feb 2026 15:51:08 -0800 Subject: [PATCH 14/18] feat: Disable progress bars in Anywidget mode (#2444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR improves the user experience of the interactive Anywidget display mode by automatically disabling progress bars and job logging during widget operations. Changes: Wrapped the get_anywidget_bundle call in repr_mimebundle with option_context("display.progress_bar", None) to silence the initial widget load. Wrapped TableWidget._initial_load and TableWidget._set_table_html methods to silence subsequent interactions like pagination and sorting. Motivation: When interacting with the TableWidget (paging, sorting), the repeated appearance of progress bars creates visual noise and clutter in the notebook output cell, distracting from the interactive data exploration experience. This change ensures a clean and seamless interface. 
Verified at: vs code notebook: screen/B7hCnXdLeTiFshr colab notebook: screen/4d8Kpo5ZbbfZUBb Fixes #< 482120359 > 🦕 --- bigframes/display/anywidget.py | 42 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index be0d2b45d0..eca435e960 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -133,25 +133,26 @@ def _initial_load(self) -> None: # obtain the row counts # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` # before we get here so that the count might already be cached. - self._reset_batches_for_new_page_size() + with bigframes.option_context("display.progress_bar", None): + self._reset_batches_for_new_page_size() - if self._batches is None: - self._error_message = ( - "Could not retrieve data batches. Data might be unavailable or " - "an error occurred." - ) - self.row_count = None - elif self._batches.total_rows is None: - # Total rows is unknown, this is an expected state. - # TODO(b/461536343): Cheaply discover if we have exactly 1 page. - # There are cases where total rows is not set, but there are no additional - # pages. We could disable the "next" button in these cases. - self.row_count = None - else: - self.row_count = self._batches.total_rows + if self._batches is None: + self._error_message = ( + "Could not retrieve data batches. Data might be unavailable or " + "an error occurred." + ) + self.row_count = None + elif self._batches.total_rows is None: + # Total rows is unknown, this is an expected state. + # TODO(b/461536343): Cheaply discover if we have exactly 1 page. + # There are cases where total rows is not set, but there are no additional + # pages. We could disable the "next" button in these cases. 
+ self.row_count = None + else: + self.row_count = self._batches.total_rows - # get the initial page - self._set_table_html() + # get the initial page + self._set_table_html() @traitlets.observe("_initial_load_complete") def _on_initial_load_complete(self, change: dict[str, Any]): @@ -274,14 +275,17 @@ def _reset_batch_cache(self) -> None: def _reset_batches_for_new_page_size(self) -> None: """Reset the batch iterator when page size changes.""" - self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + with bigframes.option_context("display.progress_bar", None): + self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) self._reset_batch_cache() def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" new_page = None - with self._setting_html_lock: + with self._setting_html_lock, bigframes.option_context( + "display.progress_bar", None + ): if self._error_message: self.table_html = ( f"
" From 12741677c0391efb5d05281fc756445ccbb1387e Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 6 Feb 2026 15:59:12 -0800 Subject: [PATCH 15/18] feat: Add bigframes.pandas.col with basic operators (#2405) --- bigframes/core/agg_expressions.py | 10 +- bigframes/core/col.py | 126 ++++++++++++++ bigframes/core/expression.py | 32 ++-- bigframes/dataframe.py | 16 +- bigframes/pandas/__init__.py | 2 + tests/unit/test_col.py | 160 ++++++++++++++++++ tests/unit/test_dataframe_polars.py | 20 +++ .../bigframes_vendored/pandas/core/col.py | 36 ++++ 8 files changed, 385 insertions(+), 17 deletions(-) create mode 100644 bigframes/core/col.py create mode 100644 tests/unit/test_col.py create mode 100644 third_party/bigframes_vendored/pandas/core/col.py diff --git a/bigframes/core/agg_expressions.py b/bigframes/core/agg_expressions.py index 125e3fef63..a26a9cfe08 100644 --- a/bigframes/core/agg_expressions.py +++ b/bigframes/core/agg_expressions.py @@ -19,7 +19,7 @@ import functools import itertools import typing -from typing import Callable, Mapping, Tuple, TypeVar +from typing import Callable, Hashable, Mapping, Tuple, TypeVar from bigframes import dtypes from bigframes.core import expression, window_spec @@ -68,7 +68,7 @@ def children(self) -> Tuple[expression.Expression, ...]: return self.inputs @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return tuple( itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @@ -92,7 +92,7 @@ def transform_children( def bind_variables( self: TExpression, - bindings: Mapping[str, expression.Expression], + bindings: Mapping[Hashable, expression.Expression], allow_partial_bindings: bool = False, ) -> TExpression: return self.transform_children( @@ -192,7 +192,7 @@ def children(self) -> Tuple[expression.Expression, ...]: return self.inputs @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> 
typing.Tuple[Hashable, ...]: return tuple( itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @@ -216,7 +216,7 @@ def transform_children( def bind_variables( self: WindowExpression, - bindings: Mapping[str, expression.Expression], + bindings: Mapping[Hashable, expression.Expression], allow_partial_bindings: bool = False, ) -> WindowExpression: return self.transform_children( diff --git a/bigframes/core/col.py b/bigframes/core/col.py new file mode 100644 index 0000000000..60b24d5e83 --- /dev/null +++ b/bigframes/core/col.py @@ -0,0 +1,126 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import dataclasses +from typing import Any, Hashable + +import bigframes_vendored.pandas.core.col as pd_col + +import bigframes.core.expression as bf_expression +import bigframes.operations as bf_ops + + +# Not to be confused with the Expression class in `bigframes.core.expressions` +# Name collision unintended +@dataclasses.dataclass(frozen=True) +class Expression: + __doc__ = pd_col.Expression.__doc__ + + _value: bf_expression.Expression + + def _apply_unary(self, op: bf_ops.UnaryOp) -> Expression: + return Expression(op.as_expr(self._value)) + + def _apply_binary(self, other: Any, op: bf_ops.BinaryOp, reverse: bool = False): + if isinstance(other, Expression): + other_value = other._value + else: + other_value = bf_expression.const(other) + if reverse: + return Expression(op.as_expr(other_value, self._value)) + else: + return Expression(op.as_expr(self._value, other_value)) + + def __add__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.add_op) + + def __radd__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.add_op, reverse=True) + + def __sub__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.sub_op) + + def __rsub__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.sub_op, reverse=True) + + def __mul__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mul_op) + + def __rmul__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mul_op, reverse=True) + + def __truediv__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.div_op) + + def __rtruediv__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.div_op, reverse=True) + + def __floordiv__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.floordiv_op) + + def __rfloordiv__(self, other: Any) -> Expression: + return self._apply_binary(other, 
bf_ops.floordiv_op, reverse=True) + + def __ge__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.ge_op) + + def __gt__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.gt_op) + + def __le__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.le_op) + + def __lt__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.lt_op) + + def __eq__(self, other: object) -> Expression: # type: ignore + return self._apply_binary(other, bf_ops.eq_op) + + def __ne__(self, other: object) -> Expression: # type: ignore + return self._apply_binary(other, bf_ops.ne_op) + + def __mod__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mod_op) + + def __rmod__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.mod_op, reverse=True) + + def __and__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.and_op) + + def __rand__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.and_op, reverse=True) + + def __or__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.or_op) + + def __ror__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.or_op, reverse=True) + + def __xor__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.xor_op) + + def __rxor__(self, other: Any) -> Expression: + return self._apply_binary(other, bf_ops.xor_op, reverse=True) + + def __invert__(self) -> Expression: + return self._apply_unary(bf_ops.invert_op) + + +def col(col_name: Hashable) -> Expression: + return Expression(bf_expression.free_var(col_name)) + + +col.__doc__ = pd_col.col.__doc__ diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 89bcb9b920..a1c25bdc73 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -19,7 +19,7 @@ import functools import itertools import typing -from typing import Callable, 
Generator, Mapping, TypeVar, Union +from typing import Callable, Generator, Hashable, Mapping, TypeVar, Union import pandas as pd @@ -39,7 +39,7 @@ def deref(name: str) -> DerefOp: return DerefOp(ids.ColumnId(name)) -def free_var(id: str) -> UnboundVariableExpression: +def free_var(id: Hashable) -> UnboundVariableExpression: return UnboundVariableExpression(id) @@ -52,7 +52,7 @@ class Expression(abc.ABC): """An expression represents a computation taking N scalar inputs and producing a single output scalar.""" @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return () @property @@ -116,7 +116,9 @@ def bind_refs( @abc.abstractmethod def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: """Replace variables with expression given in `bindings`. @@ -191,7 +193,9 @@ def output_type(self) -> dtypes.ExpressionType: return self.dtype def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: return self @@ -226,10 +230,10 @@ def transform_children(self, t: Callable[[Expression], Expression]) -> Expressio class UnboundVariableExpression(Expression): """A variable expression representing an unbound variable.""" - id: str + id: Hashable @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return (self.id,) @property @@ -256,7 +260,9 @@ def bind_refs( return self def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: if self.id in bindings.keys(): return bindings[self.id] @@ -304,7 
+310,9 @@ def output_type(self) -> dtypes.ExpressionType: raise ValueError(f"Type of variable {self.id} has not been fixed.") def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> Expression: return self @@ -373,7 +381,7 @@ def column_references( ) @property - def free_variables(self) -> typing.Tuple[str, ...]: + def free_variables(self) -> typing.Tuple[Hashable, ...]: return tuple( itertools.chain.from_iterable(map(lambda x: x.free_variables, self.inputs)) ) @@ -408,7 +416,9 @@ def output_type(self) -> dtypes.ExpressionType: return self.op.output_type(*input_types) def bind_variables( - self, bindings: Mapping[str, Expression], allow_partial_bindings: bool = False + self, + bindings: Mapping[Hashable, Expression], + allow_partial_bindings: bool = False, ) -> OpExpression: return OpExpression( self.op, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index e1ad4f3e75..c226e5010a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -58,6 +58,7 @@ from bigframes.core import agg_expressions import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.col import bigframes.core.convert import bigframes.core.explode import bigframes.core.expression as ex @@ -94,7 +95,13 @@ import bigframes.session SingleItemValue = Union[ - bigframes.series.Series, int, float, str, pandas.Timedelta, Callable + bigframes.series.Series, + int, + float, + str, + pandas.Timedelta, + Callable, + bigframes.core.col.Expression, ] MultiItemValue = Union[ "DataFrame", Sequence[int | float | str | pandas.Timedelta | Callable] @@ -2236,6 +2243,13 @@ def _assign_single_item( ) -> DataFrame: if isinstance(v, bigframes.series.Series): return self._assign_series_join_on_index(k, v) + elif isinstance(v, bigframes.core.col.Expression): + label_to_col_ref = { + label: ex.deref(id) for 
id, label in self._block.col_id_to_label.items() + } + resolved_expr = v._value.bind_variables(label_to_col_ref) + block = self._block.project_block_exprs([resolved_expr], labels=[k]) + return DataFrame(block) elif isinstance(v, bigframes.dataframe.DataFrame): v_df_col_count = len(v._block.value_columns) if v_df_col_count != 1: diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 9da2204a71..a70e319747 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -27,6 +27,7 @@ import pandas import bigframes._config as config +from bigframes.core.col import col import bigframes.core.global_session as global_session import bigframes.core.indexes from bigframes.core.logging import log_adapter @@ -415,6 +416,7 @@ def reset_session(): "clean_up_by_session_id", "concat", "crosstab", + "col", "cut", "deploy_remote_function", "deploy_udf", diff --git a/tests/unit/test_col.py b/tests/unit/test_col.py new file mode 100644 index 0000000000..e01c25ddd2 --- /dev/null +++ b/tests/unit/test_col.py @@ -0,0 +1,160 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import operator +import pathlib +from typing import Generator + +import pandas as pd +import pytest + +import bigframes +import bigframes.pandas as bpd +from bigframes.testing.utils import assert_frame_equal, convert_pandas_dtypes + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="3.0.0") + + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture(scope="module") +def scalars_pandas_df_index() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + df.index.name = None + return df.set_index("rowindex").sort_index() + + +@pytest.fixture(scope="module") +def scalars_df_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_df_2_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_dfs( + scalars_df_index, + scalars_pandas_df_index, +): + return scalars_df_index, scalars_pandas_df_index + + +@pytest.mark.parametrize( + ("op",), + [ + (operator.invert,), + ], +) +def test_pd_col_unary_operators(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("float64_col")), + } + pd_kwargs = { + "result": op(pd.col("float64_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = 
df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("op",), + [ + (operator.add,), + (operator.sub,), + (operator.mul,), + (operator.truediv,), + (operator.floordiv,), + (operator.gt,), + (operator.lt,), + (operator.ge,), + (operator.le,), + (operator.eq,), + (operator.mod,), + ], +) +def test_pd_col_binary_operators(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("float64_col"), 2.4), + "reverse_result": op(2.4, bpd.col("float64_col")), + } + pd_kwargs = { + "result": op(pd.col("float64_col"), 2.4), # type: ignore + "reverse_result": op(2.4, pd.col("float64_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("op",), + [ + (operator.and_,), + (operator.or_,), + (operator.xor,), + ], +) +def test_pd_col_binary_bool_operators(scalars_dfs, op): + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "result": op(bpd.col("bool_col"), True), + "reverse_result": op(False, bpd.col("bool_col")), + } + pd_kwargs = { + "result": op(pd.col("bool_col"), True), # type: ignore + "reverse_result": op(False, pd.col("bool_col")), # type: ignore + } + df = scalars_df.assign(**bf_kwargs) + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index a7e40469a2..263fc82e3e 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -828,6 +828,26 @@ def test_assign_new_column(scalars_dfs): assert_frame_equal(bf_result, pd_result) +def test_assign_using_pd_col(scalars_dfs): + if pd.__version__.startswith("1.") or pd.__version__.startswith("2."): + pytest.skip("col 
expression interface only supported for pandas 3+") + scalars_df, scalars_pandas_df = scalars_dfs + bf_kwargs = { + "new_col_1": 4 - bpd.col("int64_col"), + "new_col_2": bpd.col("int64_col") / (bpd.col("float64_col") * 0.5), + } + pd_kwargs = { + "new_col_1": 4 - pd.col("int64_col"), # type: ignore + "new_col_2": pd.col("int64_col") / (pd.col("float64_col") * 0.5), # type: ignore + } + + df = scalars_df.assign(**bf_kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**pd_kwargs) + + assert_frame_equal(bf_result, pd_result) + + def test_assign_new_column_w_loc(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() diff --git a/third_party/bigframes_vendored/pandas/core/col.py b/third_party/bigframes_vendored/pandas/core/col.py new file mode 100644 index 0000000000..9b71293a7e --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/col.py @@ -0,0 +1,36 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/col.py +from __future__ import annotations + +from collections.abc import Hashable + +from bigframes import constants + + +class Expression: + """ + Class representing a deferred column. + + This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`. + """ + + +def col(col_name: Hashable) -> Expression: + """ + Generate deferred object representing a column of a DataFrame. + + Any place which accepts ``lambda df: df[col_name]``, such as + :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept + ``pd.col(col_name)``. + + Args: + col_name (Hashable): + Column name. + + Returns: + Expression: + A deferred object representing a column of a DataFrame. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +__all__ = ["Expression", "col"] From 2017cc2f27f0a432af46f60b3286b231caa4a98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 6 Feb 2026 18:26:32 -0600 Subject: [PATCH 16/18] feat: remove redundant "started." messages from progress output (#2440) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to https://github.com/googleapis/python-bigquery-dataframes/pull/2419 Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Context: b/479944983 🦕 --- bigframes/formatting_helpers.py | 81 ++--- notebooks/getting_started/magics.ipynb | 406 +++++++++++++++++++++++++ 2 files changed, 436 insertions(+), 51 deletions(-) create mode 100644 notebooks/getting_started/magics.ipynb diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 1e3cdabdaf..094493818d 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -27,8 +27,6 @@ import humanize if TYPE_CHECKING: - from IPython import display - import bigframes.core.events GenericJob = Union[ @@ -134,16 +132,14 @@ def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): return res -current_display: Optional[display.HTML] = None current_display_id: Optional[str] = None -previous_display_html: str = "" def progress_callback( event: bigframes.core.events.Event, ): """Displays a progress bar while the query is running""" - global 
current_display, current_display_id, previous_display_html + global current_display_id try: import bigframes._config @@ -162,59 +158,44 @@ def progress_callback( if progress_bar == "notebook": import IPython.display as display - if ( - isinstance(event, bigframes.core.events.ExecutionStarted) - or current_display is None - or current_display_id is None - ): - previous_display_html = "" - current_display_id = str(random.random()) - current_display = display.HTML("Starting.") - display.display( - current_display, - display_id=current_display_id, - ) + display_html = None + + if isinstance(event, bigframes.core.events.ExecutionStarted): + # Start a new context for progress output. + current_display_id = None + + elif isinstance(event, bigframes.core.events.BigQuerySentEvent): + display_html = render_bqquery_sent_event_html(event) - if isinstance(event, bigframes.core.events.BigQuerySentEvent): - previous_display_html = render_bqquery_sent_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): - previous_display_html = render_bqquery_retry_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) + display_html = render_bqquery_retry_event_html(event) + elif isinstance(event, bigframes.core.events.BigQueryReceivedEvent): - previous_display_html = render_bqquery_received_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) + display_html = render_bqquery_received_event_html(event) + elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): - previous_display_html = render_bqquery_finished_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) - elif isinstance(event, bigframes.core.events.ExecutionFinished): - if previous_display_html: + 
display_html = render_bqquery_finished_event_html(event) + + elif isinstance(event, bigframes.core.events.SessionClosed): + display_html = f"Session {event.session_id} closed." + + if display_html: + if current_display_id: display.update_display( - display.HTML(f"✅ Completed. {previous_display_html}"), + display.HTML(display_html), + display_id=current_display_id, + ) + else: + current_display_id = str(random.random()) + display.display( + display.HTML(display_html), display_id=current_display_id, ) - elif isinstance(event, bigframes.core.events.SessionClosed): - display.update_display( - display.HTML(f"Session {event.session_id} closed."), - display_id=current_display_id, - ) elif progress_bar == "terminal": - if isinstance(event, bigframes.core.events.ExecutionStarted): - print("Starting execution.") - elif isinstance(event, bigframes.core.events.BigQuerySentEvent): + message = None + + if isinstance(event, bigframes.core.events.BigQuerySentEvent): message = render_bqquery_sent_event_plaintext(event) print(message) elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): @@ -226,8 +207,6 @@ def progress_callback( elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): message = render_bqquery_finished_event_plaintext(event) print(message) - elif isinstance(event, bigframes.core.events.ExecutionFinished): - print("Execution done.") def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): diff --git a/notebooks/getting_started/magics.ipynb b/notebooks/getting_started/magics.ipynb new file mode 100644 index 0000000000..1f2cf7a409 --- /dev/null +++ b/notebooks/getting_started/magics.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "91edcf7b", + "metadata": {}, + "source": [ + "# %%bqsql cell magics\n", + "\n", + "The BigQuery DataFrames (aka BigFrames) package provides a `%%bqsql` cell magics for Jupyter environments.\n", + "\n", + "To use it, first activate the extension:" + ] + }, + { + "cell_type": 
"code", + "execution_count": 1, + "id": "98cd0489", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext bigframes" + ] + }, + { + "cell_type": "markdown", + "id": "f18fdc63", + "metadata": {}, + "source": [ + "Now, use the magics by including SQL in the body." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "269c5862", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes. [Job bigframes-dev:US.job_UVe7FsupxF3CbYuLcLT7fpw9dozg details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1e2fb7b019754d31b11323a054f97f47", + "version_major": 2, + "version_minor": 1 + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stategenderyearnamenumber
0HIF1999Ariana10
1HIF2002Jordyn10
2HIF2006Mya10
3HIF2010Jordyn10
4HIM1921Nobuo10
5HIM1925Ralph10
6HIM1926Hisao10
7HIM1927Moses10
8HIM1933Larry10
9HIM1933Alfredo10
\n", + "

10 rows × 5 columns

\n", + "
[5552452 rows x 5 columns in total]" + ], + "text/plain": [ + "state gender year name number\n", + " HI F 1999 Ariana 10\n", + " HI F 2002 Jordyn 10\n", + " HI F 2006 Mya 10\n", + " HI F 2010 Jordyn 10\n", + " HI M 1921 Nobuo 10\n", + " HI M 1925 Ralph 10\n", + " HI M 1926 Hisao 10\n", + " HI M 1927 Moses 10\n", + " HI M 1933 Larry 10\n", + " HI M 1933 Alfredo 10\n", + "...\n", + "\n", + "[5552452 rows x 5 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%bqsql\n", + "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`" + ] + }, + { + "cell_type": "markdown", + "id": "8771e10f", + "metadata": {}, + "source": [ + "The output DataFrame can be saved to a variable." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "30bb6327", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes. [Job bigframes-dev:US.c142adf3-cd95-42da-bbdc-c176b36b934f details]\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%bqsql mydf\n", + "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`" + ] + }, + { + "cell_type": "markdown", + "id": "533e2e9e", + "metadata": {}, + "source": [ + "You can chain cells together using format strings. DataFrame objects are automatically turned into table expressions." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6a8a8123", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " Query processed 88.1 MB in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c4889de9296440428de90defb5c58070", + "version_major": 2, + "version_minor": 1 + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_countname
0304036Tracy
1293876Travis
2203784Troy
3150127Trevor
496397Tristan
589996Tracey
665546Trinity
750112Traci
849657Trenton
945692Trent
\n", + "

10 rows × 2 columns

\n", + "
[238 rows x 2 columns in total]" + ], + "text/plain": [ + " total_count name\n", + "0 304036 Tracy\n", + "1 293876 Travis\n", + "2 203784 Troy\n", + "3 150127 Trevor\n", + "4 96397 Tristan\n", + "5 89996 Tracey\n", + "6 65546 Trinity\n", + "7 50112 Traci\n", + "8 49657 Trenton\n", + "9 45692 Trent\n", + "...\n", + "\n", + "[238 rows x 2 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%bqsql\n", + "SELECT sum(number) as total_count, name\n", + "FROM {mydf}\n", + "WHERE name LIKE 'Tr%'\n", + "GROUP BY name\n", + "ORDER BY total_count DESC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2a17078", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e0d185ad2c0245b17eac315f71152a46c6da41bb Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 6 Feb 2026 17:14:52 -0800 Subject: [PATCH 17/18] fix: suppress JSONDtypeWarning in Anywidget mode and clean up progress output (#2441) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR improves the user experience when using the interactive anywidget display mode (bpd.options.display.repr_mode = "anywidget") by reducing console noise. 
Verified at: vs code notebook: screen/ACCJRLwyThciMk8 colab notebook: screen/BhNxzpvckYg9Wp8 Fixes #<482120359> 🦕 --- bigframes/__init__.py | 27 ++++++++++++++++++++------- bigframes/dataframe.py | 2 +- bigframes/display/anywidget.py | 30 ++++++++++++++++++++---------- bigframes/display/html.py | 8 +++++++- bigframes/pandas/io/api.py | 17 ++++++++--------- 5 files changed, 56 insertions(+), 28 deletions(-) diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 419899a4ae..a3a9b4e4c7 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -14,13 +14,26 @@ """BigQuery DataFrames provides a DataFrame API scaled by the BigQuery engine.""" -from bigframes._config import option_context, options -from bigframes._config.bigquery_options import BigQueryOptions -from bigframes.core.global_session import close_session, get_global_session -import bigframes.enums as enums -import bigframes.exceptions as exceptions -from bigframes.session import connect, Session -from bigframes.version import __version__ +import warnings + +# Suppress Python version support warnings from google-cloud libraries. +# These are particularly noisy in Colab which still uses Python 3.10. 
+warnings.filterwarnings( + "ignore", + category=FutureWarning, + message=".*Google will stop supporting.*Python.*", +) + +from bigframes._config import option_context, options # noqa: E402 +from bigframes._config.bigquery_options import BigQueryOptions # noqa: E402 +from bigframes.core.global_session import ( # noqa: E402 + close_session, + get_global_session, +) +import bigframes.enums as enums # noqa: E402 +import bigframes.exceptions as exceptions # noqa: E402 +from bigframes.session import connect, Session # noqa: E402 +from bigframes.version import __version__ # noqa: E402 _MAGIC_NAMES = ["bqsql"] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c226e5010a..b195ce9902 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -332,7 +332,7 @@ def dtypes(self) -> pandas.Series: @property def columns(self) -> pandas.Index: - return self.dtypes.index + return self._block.column_labels @columns.setter def columns(self, labels: pandas.Index): diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index eca435e960..40d04a1d71 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -23,6 +23,7 @@ import threading from typing import Any, Iterator, Optional import uuid +import warnings import pandas as pd @@ -111,16 +112,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self.page_size = initial_page_size self.max_columns = initial_max_columns - # TODO(b/469861913): Nested columns from structs (e.g., 'struct_col.name') are not currently sortable. - # TODO(b/463754889): Support non-string column labels for sorting. 
- if all(isinstance(col, str) for col in dataframe.columns): - self.orderable_columns = [ - str(col_name) - for col_name, dtype in dataframe.dtypes.items() - if dtypes.is_orderable(dtype) - ] - else: - self.orderable_columns = [] + self.orderable_columns = self._get_orderable_columns(dataframe) self._initial_load() @@ -128,6 +120,24 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Also used as a guard to prevent observers from firing during initialization. self._initial_load_complete = True + def _get_orderable_columns( + self, dataframe: bigframes.dataframe.DataFrame + ) -> list[str]: + """Determine which columns can be used for client-side sorting.""" + # TODO(b/469861913): Nested columns from structs (e.g., 'struct_col.name') are not currently sortable. + # TODO(b/463754889): Support non-string column labels for sorting. + if not all(isinstance(col, str) for col in dataframe.columns): + return [] + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", bigframes.exceptions.JSONDtypeWarning) + warnings.simplefilter("ignore", category=FutureWarning) + return [ + str(col_name) + for col_name, dtype in dataframe.dtypes.items() + if dtypes.is_orderable(dtype) + ] + def _initial_load(self) -> None: """Get initial data and row count.""" # obtain the row counts diff --git a/bigframes/display/html.py b/bigframes/display/html.py index 6102d1512c..ef34985c8e 100644 --- a/bigframes/display/html.py +++ b/bigframes/display/html.py @@ -363,7 +363,13 @@ def repr_mimebundle( if opts.repr_mode == "anywidget": try: - return get_anywidget_bundle(obj, include=include, exclude=exclude) + with bigframes.option_context("display.progress_bar", None): + with warnings.catch_warnings(): + warnings.simplefilter( + "ignore", category=bigframes.exceptions.JSONDtypeWarning + ) + warnings.simplefilter("ignore", category=FutureWarning) + return get_anywidget_bundle(obj, include=include, exclude=exclude) except ImportError: # Anywidget is an optional dependency, so 
warn rather than fail. # TODO(shuowei): When Anywidget becomes the default for all repr modes, diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index b8ad6cbd0c..fade0558ac 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -354,15 +354,14 @@ def _read_gbq_colab( ) _set_default_session_location_if_possible_deferred_query(create_query) if not config.options.bigquery._session_started: - with warnings.catch_warnings(): - # Don't warning about Polars in SQL cell. - # Related to b/437090788. - try: - bigframes._importing.import_polars() - warnings.simplefilter("ignore", bigframes.exceptions.PreviewWarning) - config.options.bigquery.enable_polars_execution = True - except ImportError: - pass # don't fail if polars isn't available + # Don't warning about Polars in SQL cell. + # Related to b/437090788. + try: + bigframes._importing.import_polars() + warnings.simplefilter("ignore", bigframes.exceptions.PreviewWarning) + config.options.bigquery.enable_polars_execution = True + except ImportError: + pass # don't fail if polars isn't available return global_session.with_default_session( bigframes.session.Session._read_gbq_colab, From 9f1ba1dbdd876f456049b8607a64888a2ffbeaac Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 6 Feb 2026 18:30:42 -0800 Subject: [PATCH 18/18] chore: librarian release pull request: 20260207T015024Z (#2446) PR created by the Librarian CLI to initialize a release. Merging this PR will auto trigger a release. Librarian Version: v0.7.0 Language Image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677
bigframes: 2.35.0 ## [2.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.34.0...v2.35.0) (2026-02-07) ### Features * Add bigframes.pandas.col with basic operators (#2405) ([12741677](https://github.com/googleapis/python-bigquery-dataframes/commit/12741677)) * remove redundant "started." messages from progress output (#2440) ([2017cc2f](https://github.com/googleapis/python-bigquery-dataframes/commit/2017cc2f)) * add bigframe.bigquery.load_data function (#2426) ([4b0f13b2](https://github.com/googleapis/python-bigquery-dataframes/commit/4b0f13b2)) * Disable progress bars in Anywidget mode (#2444) ([4e2689a1](https://github.com/googleapis/python-bigquery-dataframes/commit/4e2689a1)) * add bigquery.ai.generate_text function (#2433) ([5bd0029a](https://github.com/googleapis/python-bigquery-dataframes/commit/5bd0029a)) * Disable progress bars in Anywidget mode to reduce notebook clutter (#2437) ([853240da](https://github.com/googleapis/python-bigquery-dataframes/commit/853240da)) * Add a bigframes cell magic for ipython (#2395) ([e6de52de](https://github.com/googleapis/python-bigquery-dataframes/commit/e6de52de)) * add `bigframes.bigquery.ai.generate_embedding` (#2343) ([e91536c8](https://github.com/googleapis/python-bigquery-dataframes/commit/e91536c8)) ### Bug Fixes * always display the results in the `%%bqsql` cell magics output (#2439) ([2d973b54](https://github.com/googleapis/python-bigquery-dataframes/commit/2d973b54)) * exclude gcsfs 2026.2.0 (#2445) ([311de31e](https://github.com/googleapis/python-bigquery-dataframes/commit/311de31e)) * suppress JSONDtypeWarning in Anywidget mode and clean up progress output (#2441) ([e0d185ad](https://github.com/googleapis/python-bigquery-dataframes/commit/e0d185ad)) ### Documentation * fix cast method shown on public docs (#2436) ([ad0f33c6](https://github.com/googleapis/python-bigquery-dataframes/commit/ad0f33c6))
--- .librarian/state.yaml | 2 +- CHANGELOG.md | 26 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/.librarian/state.yaml b/.librarian/state.yaml index 21903a5124..70eb19713c 100644 --- a/.librarian/state.yaml +++ b/.librarian/state.yaml @@ -1,7 +1,7 @@ image: us-central1-docker.pkg.dev/cloud-sdk-librarian-prod/images-prod/python-librarian-generator@sha256:e7cc6823efb073a8a26e7cefdd869f12ec228abfbd2a44aa9a7eacc284023677 libraries: - id: bigframes - version: 2.34.0 + version: 2.35.0 last_generated_commit: "" apis: [] source_roots: diff --git a/CHANGELOG.md b/CHANGELOG.md index f54231f540..874fcb0d04 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,32 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.34.0...v2.35.0) (2026-02-07) + + +### Documentation + +* fix cast method shown on public docs (#2436) ([ad0f33c65ee01409826c381ae0f70aad65bb6a27](https://github.com/googleapis/python-bigquery-dataframes/commit/ad0f33c65ee01409826c381ae0f70aad65bb6a27)) + + +### Features + +* remove redundant "started." 
messages from progress output (#2440) ([2017cc2f27f0a432af46f60b3286b231caa4a98b](https://github.com/googleapis/python-bigquery-dataframes/commit/2017cc2f27f0a432af46f60b3286b231caa4a98b)) +* Add bigframes.pandas.col with basic operators (#2405) ([12741677c0391efb5d05281fc756445ccbb1387e](https://github.com/googleapis/python-bigquery-dataframes/commit/12741677c0391efb5d05281fc756445ccbb1387e)) +* Disable progress bars in Anywidget mode (#2444) ([4e2689a1c975c4cabaf36b7d0817dcbedc926853](https://github.com/googleapis/python-bigquery-dataframes/commit/4e2689a1c975c4cabaf36b7d0817dcbedc926853)) +* Disable progress bars in Anywidget mode to reduce notebook clutter (#2437) ([853240daf45301ad534c635c8955cb6ce91d23c2](https://github.com/googleapis/python-bigquery-dataframes/commit/853240daf45301ad534c635c8955cb6ce91d23c2)) +* add bigquery.ai.generate_text function (#2433) ([5bd0029a99e7653843de4ac7d57370c9dffeed4d](https://github.com/googleapis/python-bigquery-dataframes/commit/5bd0029a99e7653843de4ac7d57370c9dffeed4d)) +* Add a bigframes cell magic for ipython (#2395) ([e6de52ded6c5091275a936dec36f01a6cf701233](https://github.com/googleapis/python-bigquery-dataframes/commit/e6de52ded6c5091275a936dec36f01a6cf701233)) +* add `bigframes.bigquery.ai.generate_embedding` (#2343) ([e91536c8a5b2d8d896767510ced80c6fd2a68a97](https://github.com/googleapis/python-bigquery-dataframes/commit/e91536c8a5b2d8d896767510ced80c6fd2a68a97)) +* add bigframe.bigquery.load_data function (#2426) ([4b0f13b2fe10fa5b07d3ca3b7cb1ae1cb95030c7](https://github.com/googleapis/python-bigquery-dataframes/commit/4b0f13b2fe10fa5b07d3ca3b7cb1ae1cb95030c7)) + + +### Bug Fixes + +* suppress JSONDtypeWarning in Anywidget mode and clean up progress output (#2441) ([e0d185ad2c0245b17eac315f71152a46c6da41bb](https://github.com/googleapis/python-bigquery-dataframes/commit/e0d185ad2c0245b17eac315f71152a46c6da41bb)) +* exclude gcsfs 2026.2.0 (#2445) 
([311de31e79227408515f087dafbab7edc54ddf1b](https://github.com/googleapis/python-bigquery-dataframes/commit/311de31e79227408515f087dafbab7edc54ddf1b)) +* always display the results in the `%%bqsql` cell magics output (#2439) ([2d973b54550f30429dbd10894f78db7bb0c57345](https://github.com/googleapis/python-bigquery-dataframes/commit/2d973b54550f30429dbd10894f78db7bb0c57345)) + ## [2.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.33.0...v2.34.0) (2026-02-02) diff --git a/bigframes/version.py b/bigframes/version.py index a6862ee201..c5b120dc23 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.34.0" +__version__ = "2.35.0" # {x-release-please-start-date} -__release_date__ = "2026-02-02" +__release_date__ = "2026-02-07" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index a6862ee201..c5b120dc23 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.34.0" +__version__ = "2.35.0" # {x-release-please-start-date} -__release_date__ = "2026-02-02" +__release_date__ = "2026-02-07" # {x-release-please-end}