diff --git a/CHANGELOG.md b/CHANGELOG.md index af87cae3b2..a8ebb7a417 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,26 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.36.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.35.0...v1.36.0) (2025-02-11) + + +### Features + +* Add `bigframes.bigquery.st_area` and suggest it from `GeoSeries.area` ([#1318](https://github.com/googleapis/python-bigquery-dataframes/issues/1318)) ([8b5ffa8](https://github.com/googleapis/python-bigquery-dataframes/commit/8b5ffa8893b51016c51794865c40def74ea6716b)) +* Add `GeoSeries.from_xy()` ([#1364](https://github.com/googleapis/python-bigquery-dataframes/issues/1364)) ([3c3e14c](https://github.com/googleapis/python-bigquery-dataframes/commit/3c3e14c715f476ca44f254c0d53d639ea5988a8d)) + + +### Bug Fixes + +* Dtype parameter ineffective in Series/DataFrame construction ([#1354](https://github.com/googleapis/python-bigquery-dataframes/issues/1354)) ([b9bdca8](https://github.com/googleapis/python-bigquery-dataframes/commit/b9bdca8285ee54fecf3795fbf3cbea6f878ee8ca)) +* Translate labels to col ids when copying dataframes ([#1372](https://github.com/googleapis/python-bigquery-dataframes/issues/1372)) ([0c55b07](https://github.com/googleapis/python-bigquery-dataframes/commit/0c55b07dc001b568875f06d578ca7d59409f2a11)) + + +### Performance Improvements + +* Prune unused operations from sql ([#1365](https://github.com/googleapis/python-bigquery-dataframes/issues/1365)) ([923da03](https://github.com/googleapis/python-bigquery-dataframes/commit/923da037ef6e4e7f8b54924ea5644c2c5ceb2234)) +* Simplify merge join key coalescing ([#1361](https://github.com/googleapis/python-bigquery-dataframes/issues/1361)) ([7ae565d](https://github.com/googleapis/python-bigquery-dataframes/commit/7ae565d9e0e59fdf75c7659c0263562688ccc1e8)) + ## [1.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.34.0...v1.35.0) (2025-02-04) diff --git 
a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 21e61bc4b1..56aee38bfe 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -27,6 +27,7 @@ unix_millis, unix_seconds, ) +from bigframes.bigquery._operations.geo import st_area from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -45,6 +46,8 @@ "array_length", "array_agg", "array_to_string", + # geo ops + "st_area", # json ops "json_set", "json_extract", diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py new file mode 100644 index 0000000000..262ced4fe8 --- /dev/null +++ b/bigframes/bigquery/_operations/geo.py @@ -0,0 +1,93 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from bigframes import operations as ops +import bigframes.geopandas +import bigframes.series + +""" +Search functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions +""" + + +def st_area(self) -> bigframes.series.Series: + """ + Returns the area in square meters covered by the polygons in the input + GEOGRAPHY. + + If geography_expression is a point or a line, returns zero. If + geography_expression is a collection, returns the area of the polygons + in the collection; if the collection doesn't contain polygons, returns zero. 
+ + + .. note:: + BigQuery's Geography functions, like `st_area`, interpret the geometry + data type as a point set on the Earth's surface. A point set is a set + of points, lines, and polygons on the WGS84 reference spheroid, with + geodesic edges. See: https://cloud.google.com/bigquery/docs/geospatial-data + + + **Examples:** + + >>> import bigframes.geopandas + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + >>> series = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), + ... Polygon([(0.10, 0.4), (0.9, 0.5), (0.10, 0.5)]), + ... Polygon([(0.1, 0.1), (0.2, 0.1), (0.2, 0.2)]), + ... LineString([(0, 0), (1, 1), (0, 1)]), + ... Point(0, 1), + ... ] + ... ) + >>> series + 0 POLYGON ((0 0, 0.1 0.1, 0 0.1, 0 0)) + 1 POLYGON ((0.1 0.4, 0.9 0.5, 0.1 0.5, 0.1 0.4)) + 2 POLYGON ((0.1 0.1, 0.2 0.1, 0.2 0.2, 0.1 0.1)) + 3 LINESTRING (0 0, 1 1, 0 1) + 4 POINT (0 1) + dtype: geometry + + >>> bbq.st_area(series) + 0 61821689.855985 + 1 494563347.88721 + 2 61821689.855841 + 3 0.0 + 4 0.0 + dtype: Float64 + + Use `round()` to round the output areas to the nearest ten millions + + >>> bbq.st_area(series).round(-7) + 0 60000000.0 + 1 490000000.0 + 2 60000000.0 + 3 0.0 + 4 0.0 + dtype: Float64 + + Returns: + bigframes.pandas.Series: + Series of float representing the areas.
+ """ + series = self._apply_unary_op(ops.geo_area_op) + series.name = None + return series diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 397a37ee92..a05030140e 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -128,3 +128,77 @@ def image_blur_func( image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) + + +# Extracts all text from a PDF url +def pdf_extract_func(src_obj_ref_rt: str) -> str: + import io + import json + + from pypdf import PdfReader # type: ignore + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url, stream=True) + response.raise_for_status() + pdf_bytes = response.content + + pdf_file = io.BytesIO(pdf_bytes) + reader = PdfReader(pdf_file, strict=False) + + all_text = "" + for page in reader.pages: + page_extract_text = page.extract_text() + if page_extract_text: + all_text += page_extract_text + return all_text + + +pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests"]) + + +# Extracts text from a PDF url and chunks it simultaneously +def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> str: + import io + import json + + from pypdf import PdfReader # type: ignore + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url, stream=True) + response.raise_for_status() + pdf_bytes = response.content + + pdf_file = io.BytesIO(pdf_bytes) + reader = PdfReader(pdf_file, strict=False) + + # extract and chunk text simultaneously + all_text_chunks = [] + curr_chunk = "" + for page in reader.pages: + page_text = page.extract_text() + if page_text: + curr_chunk += page_text + # split the accumulated text into chunks of a specific size with overlaop + # this loop implements a sliding window approach to create 
chunks + while len(curr_chunk) >= chunk_size: + split_idx = curr_chunk.rfind(" ", 0, chunk_size) + if split_idx == -1: + split_idx = chunk_size + actual_chunk = curr_chunk[:split_idx] + all_text_chunks.append(actual_chunk) + overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size] + curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :] + if curr_chunk: + all_text_chunks.append(curr_chunk) + + all_text_json_string = json.dumps(all_text_chunks) + return all_text_json_string + + +pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests"]) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 5f64bf68dd..2f3f15953c 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -11,531 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import annotations -from dataclasses import dataclass -import datetime -import functools -import io -import typing -from typing import Iterable, List, Optional, Sequence, Tuple -import warnings +from bigframes.core.array_value import ArrayValue -import google.cloud.bigquery -import pandas -import pyarrow as pa -import pyarrow.feather as pa_feather - -import bigframes.core.expression as ex -import bigframes.core.guid -import bigframes.core.identifiers as ids -import bigframes.core.join_def as join_def -import bigframes.core.local_data as local_data -import bigframes.core.nodes as nodes -from bigframes.core.ordering import OrderingExpression -import bigframes.core.ordering as orderings -import bigframes.core.schema as schemata -import bigframes.core.tree_properties -import bigframes.core.utils -from bigframes.core.window_spec import WindowSpec -import bigframes.dtypes -import bigframes.exceptions as bfe -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops - -if typing.TYPE_CHECKING: - from 
bigframes.session import Session - -ORDER_ID_COLUMN = "bigframes_ordering_id" -PREDICATE_COLUMN = "bigframes_predicate" - - -@dataclass(frozen=True) -class ArrayValue: - """ - ArrayValue is an immutable type representing a 2D array with per-column types. - """ - - node: nodes.BigFrameNode - - @classmethod - def from_pyarrow(cls, arrow_table: pa.Table, session: Session): - adapted_table = local_data.adapt_pa_table(arrow_table) - schema = local_data.arrow_schema_to_bigframes(adapted_table.schema) - - iobytes = io.BytesIO() - pa_feather.write_feather(adapted_table, iobytes) - # Scan all columns by default, we define this list as it can be pruned while preserving source_def - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) - for item in schema.items - ) - ) - - node = nodes.ReadLocalNode( - iobytes.getvalue(), - data_schema=schema, - session=session, - n_rows=arrow_table.num_rows, - scan_list=scan_list, - ) - return cls(node) - - @classmethod - def from_range(cls, start, end, step): - return cls( - nodes.FromRangeNode( - start=start.node, - end=end.node, - step=step, - ) - ) - - @classmethod - def from_table( - cls, - table: google.cloud.bigquery.Table, - schema: schemata.ArraySchema, - session: Session, - *, - predicate: Optional[str] = None, - at_time: Optional[datetime.datetime] = None, - primary_key: Sequence[str] = (), - offsets_col: Optional[str] = None, - ): - if offsets_col and primary_key: - raise ValueError("must set at most one of 'offests', 'primary_key'") - if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): - msg = ( - "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is" - "in preview; this behavior may change in future versions." 
- ) - warnings.warn(msg, bfe.PreviewWarning) - # define data source only for needed columns, this makes row-hashing cheaper - table_def = nodes.GbqTable.from_table(table, columns=schema.names) - - # create ordering from info - ordering = None - if offsets_col: - ordering = orderings.TotalOrdering.from_offset_col(offsets_col) - elif primary_key: - ordering = orderings.TotalOrdering.from_primary_key( - [ids.ColumnId(key_part) for key_part in primary_key] - ) - - # Scan all columns by default, we define this list as it can be pruned while preserving source_def - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) - for item in schema.items - ) - ) - source_def = nodes.BigqueryDataSource( - table=table_def, at_time=at_time, sql_predicate=predicate, ordering=ordering - ) - node = nodes.ReadTableNode( - source=source_def, - scan_list=scan_list, - table_session=session, - ) - return cls(node) - - @property - def column_ids(self) -> typing.Sequence[str]: - """Returns column ids as strings.""" - return self.schema.names - - @property - def session(self) -> Session: - required_session = self.node.session - from bigframes import get_global_session - - return ( - required_session if (required_session is not None) else get_global_session() - ) - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - return self.node.schema - - @property - def explicitly_ordered(self) -> bool: - # see BigFrameNode.explicitly_ordered - return self.node.explicitly_ordered - - @property - def order_ambiguous(self) -> bool: - # see BigFrameNode.order_ambiguous - return self.node.order_ambiguous - - @property - def supports_fast_peek(self) -> bool: - return bigframes.core.tree_properties.can_fast_peek(self.node) - - def as_cached( - self: ArrayValue, - cache_table: google.cloud.bigquery.Table, - ordering: Optional[orderings.RowOrdering], - ) -> ArrayValue: - """ - Replace the node with an equivalent one that references a table 
where the value has been materialized to. - """ - table = nodes.GbqTable.from_table(cache_table) - source = nodes.BigqueryDataSource(table, ordering=ordering) - # Assumption: GBQ cached table uses field name as bq column name - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(field.id, field.dtype, field.id.name) - for field in self.node.fields - ) - ) - node = nodes.CachedTableNode( - original_node=self.node, - source=source, - table_session=self.session, - scan_list=scan_list, - ) - return ArrayValue(node) - - def _try_evaluate_local(self): - """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" - import bigframes.core.compile - - return bigframes.core.compile.test_only_try_evaluate(self.node) - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - return self.schema.get_type(key) - - def row_count(self) -> ArrayValue: - """Get number of rows in ArrayValue as a single-entry ArrayValue.""" - return ArrayValue(nodes.RowCountNode(child=self.node)) - - # Operations - def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - predicate: ex.Expression = ex.deref(predicate_id) - if keep_null: - predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) - return self.filter(predicate) - - def filter(self, predicate: ex.Expression): - return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate)) - - def order_by( - self, by: Sequence[OrderingExpression], is_total_order: bool = False - ) -> ArrayValue: - return ArrayValue( - nodes.OrderByNode( - child=self.node, by=tuple(by), is_total_order=is_total_order - ) - ) - - def reversed(self) -> ArrayValue: - return ArrayValue(nodes.ReversedNode(child=self.node)) - - def slice( - self, start: Optional[int], stop: Optional[int], step: Optional[int] - ) -> ArrayValue: - if self.node.order_ambiguous and not 
(self.session._strictly_ordered): - msg = "Window ordering may be ambiguous, this can cause unstable results." - warnings.warn(msg, bfe.AmbiguousWindowWarning) - return ArrayValue( - nodes.SliceNode( - self.node, - start=start, - stop=stop, - step=step if (step is not None) else 1, - ) - ) - - def promote_offsets(self) -> Tuple[ArrayValue, str]: - """ - Convenience function to promote copy of column offsets to a value column. Can be used to reset index. - """ - col_id = self._gen_namespaced_uid() - if self.node.order_ambiguous and not (self.session._strictly_ordered): - if not self.session._allows_ambiguity: - raise ValueError( - "Generating offsets not supported in partial ordering mode" - ) - else: - msg = ( - "Window ordering may be ambiguous, this can cause unstable results." - ) - warnings.warn(msg, category=bfe.AmbiguousWindowWarning) - - return ( - ArrayValue( - nodes.PromoteOffsetsNode(child=self.node, col_id=ids.ColumnId(col_id)) - ), - col_id, - ) - - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: - """Append together multiple ArrayValue objects.""" - return ArrayValue( - nodes.ConcatNode( - children=tuple([self.node, *[val.node for val in other]]), - output_ids=tuple( - ids.ColumnId(bigframes.core.guid.generate_guid()) - for id in self.column_ids - ), - ) - ) - - def compute_values(self, assignments: Sequence[ex.Expression]): - col_ids = self._gen_namespaced_uids(len(assignments)) - ex_id_pairs = tuple( - (ex, ids.ColumnId(id)) for ex, id in zip(assignments, col_ids) - ) - return ( - ArrayValue(nodes.ProjectionNode(child=self.node, assignments=ex_id_pairs)), - col_ids, - ) - - def project_to_id(self, expression: ex.Expression): - array_val, ids = self.compute_values( - [expression], - ) - return array_val, ids[0] - - def assign(self, source_id: str, destination_id: str) -> ArrayValue: - if destination_id in self.column_ids: # Mutate case - exprs = [ - ( - ex.deref(source_id if (col_id == destination_id) else col_id), - 
ids.ColumnId(col_id), - ) - for col_id in self.column_ids - ] - else: # append case - self_projection = ( - (ex.deref(col_id), ids.ColumnId(col_id)) for col_id in self.column_ids - ) - exprs = [ - *self_projection, - (ex.deref(source_id), ids.ColumnId(destination_id)), - ] - return ArrayValue( - nodes.SelectionNode( - child=self.node, - input_output_pairs=tuple(exprs), - ) - ) - - def create_constant( - self, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> Tuple[ArrayValue, str]: - if pandas.isna(value): - # Need to assign a data type when value is NaN. - dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE - - return self.project_to_id(ex.const(value, dtype)) - - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ((ex.deref(col_id), ids.ColumnId(col_id)) for col_id in column_ids) - return ArrayValue( - nodes.SelectionNode( - child=self.node, - input_output_pairs=tuple(selections), - ) - ) - - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - return self.select_columns( - [col_id for col_id in self.column_ids if col_id not in columns] - ) - - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], - by_column_ids: typing.Sequence[str] = (), - dropna: bool = True, - ) -> ArrayValue: - """ - Apply aggregations to the expression. 
- Arguments: - aggregations: input_column_id, operation, output_column_id tuples - by_column_id: column id of the aggregation key, this is preserved through the transform - dropna: whether null keys should be dropped - """ - agg_defs = tuple((agg, ids.ColumnId(name)) for agg, name in aggregations) - return ArrayValue( - nodes.AggregateNode( - child=self.node, - aggregations=agg_defs, - by_column_ids=tuple(map(ex.deref, by_column_ids)), - dropna=dropna, - ) - ) - - def project_window_op( - self, - column_name: str, - op: agg_ops.UnaryWindowOp, - window_spec: WindowSpec, - *, - never_skip_nulls=False, - skip_reproject_unsafe: bool = False, - ) -> Tuple[ArrayValue, str]: - """ - Creates a new expression based on this expression with unary operation applied to one column. - column_name: the id of the input column present in the expression - op: the windowable operator to apply to the input column - window_spec: a specification of the window over which to apply the operator - output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided - never_skip_nulls: will disable null skipping for operators that would otherwise do so - skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection - """ - # TODO: Support non-deterministic windowing - if window_spec.row_bounded or not op.order_independent: - if self.node.order_ambiguous and not self.session._strictly_ordered: - if not self.session._allows_ambiguity: - raise ValueError( - "Generating offsets not supported in partial ordering mode" - ) - else: - msg = "Window ordering may be ambiguous, this can cause unstable results." 
- warnings.warn(msg, category=bfe.AmbiguousWindowWarning) - - output_name = self._gen_namespaced_uid() - return ( - ArrayValue( - nodes.WindowOpNode( - child=self.node, - expression=ex.UnaryAggregation(op, ex.deref(column_name)), - window_spec=window_spec, - output_name=ids.ColumnId(output_name), - never_skip_nulls=never_skip_nulls, - skip_reproject_unsafe=skip_reproject_unsafe, - ) - ), - output_name, - ) - - def isin( - self, other: ArrayValue, lcol: str, rcol: str - ) -> typing.Tuple[ArrayValue, str]: - node = nodes.InNode( - self.node, - other.node, - ex.deref(lcol), - ex.deref(rcol), - indicator_col=ids.ColumnId.unique(), - ) - return ArrayValue(node), node.indicator_col.name - - def relational_join( - self, - other: ArrayValue, - conditions: typing.Tuple[typing.Tuple[str, str], ...] = (), - type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", - ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: - l_mapping = { # Identity mapping, only rename right side - lcol.name: lcol.name for lcol in self.node.ids - } - other_node, r_mapping = self.prepare_join_names(other) - join_node = nodes.JoinNode( - left_child=self.node, - right_child=other_node, - conditions=tuple( - (ex.deref(l_mapping[l_col]), ex.deref(r_mapping[r_col])) - for l_col, r_col in conditions - ), - type=type, - ) - return ArrayValue(join_node), (l_mapping, r_mapping) - - def try_row_join( - self, - other: ArrayValue, - conditions: typing.Tuple[typing.Tuple[str, str], ...] 
= (), - ) -> Optional[ - typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]] - ]: - l_mapping = { # Identity mapping, only rename right side - lcol.name: lcol.name for lcol in self.node.ids - } - other_node, r_mapping = self.prepare_join_names(other) - import bigframes.core.rewrite - - result_node = bigframes.core.rewrite.try_row_join( - self.node, other_node, conditions - ) - if result_node is None: - return None - - return ( - ArrayValue(result_node), - (l_mapping, r_mapping), - ) - - def prepare_join_names( - self, other: ArrayValue - ) -> Tuple[bigframes.core.nodes.BigFrameNode, dict[str, str]]: - if set(other.node.ids) & set(self.node.ids): - r_mapping = { # Rename conflicting names - rcol.name: rcol.name - if (rcol.name not in self.column_ids) - else bigframes.core.guid.generate_guid() - for rcol in other.node.ids - } - return ( - nodes.SelectionNode( - other.node, - tuple( - (ex.deref(old_id), ids.ColumnId(new_id)) - for old_id, new_id in r_mapping.items() - ), - ), - r_mapping, - ) - else: - return other.node, {id: id for id in other.column_ids} - - def try_legacy_row_join( - self, - other: ArrayValue, - join_type: join_def.JoinType, - join_keys: typing.Tuple[join_def.CoalescedColumnMapping, ...], - mappings: typing.Tuple[join_def.JoinColumnMapping, ...], - ) -> typing.Optional[ArrayValue]: - import bigframes.core.rewrite - - result = bigframes.core.rewrite.legacy_join_as_projection( - self.node, other.node, join_keys, mappings, join_type - ) - if result is not None: - return ArrayValue(result) - return None - - def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: - assert len(column_ids) > 0 - for column_id in column_ids: - assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) - - offsets = tuple(ex.deref(id) for id in column_ids) - return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets)) - - def _uniform_sampling(self, fraction: float) -> ArrayValue: - """Sampling the table on given 
fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) - - # Deterministically generate namespaced ids for new variables - # These new ids are only unique within the current namespace. - # Many operations, such as joins, create new namespaces. See: BigFrameNode.defines_namespace - # When migrating to integer ids, these will generate the next available integer, in order to densely pack ids - # this will help represent variables sets as compact bitsets - def _gen_namespaced_uid(self) -> str: - return self._gen_namespaced_uids(1)[0] - - def _gen_namespaced_uids(self, n: int) -> List[str]: - return [ids.ColumnId.unique().name for _ in range(n)] +__all__ = ["ArrayValue"] diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py new file mode 100644 index 0000000000..dc9b8e3b9b --- /dev/null +++ b/bigframes/core/array_value.py @@ -0,0 +1,553 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +from dataclasses import dataclass +import datetime +import functools +import io +import typing +from typing import Iterable, List, Optional, Sequence, Tuple +import warnings + +import google.cloud.bigquery +import pandas +import pyarrow as pa +import pyarrow.feather as pa_feather + +import bigframes.core.expression as ex +import bigframes.core.guid +import bigframes.core.identifiers as ids +import bigframes.core.join_def as join_def +import bigframes.core.local_data as local_data +import bigframes.core.nodes as nodes +from bigframes.core.ordering import OrderingExpression +import bigframes.core.ordering as orderings +import bigframes.core.schema as schemata +import bigframes.core.tree_properties +import bigframes.core.utils +from bigframes.core.window_spec import WindowSpec +import bigframes.dtypes +import bigframes.exceptions as bfe +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + from bigframes.session import Session + +ORDER_ID_COLUMN = "bigframes_ordering_id" +PREDICATE_COLUMN = "bigframes_predicate" + + +@dataclass(frozen=True) +class ArrayValue: + """ + ArrayValue is an immutable type representing a 2D array with per-column types. 
+ """ + + node: nodes.BigFrameNode + + @classmethod + def from_pyarrow(cls, arrow_table: pa.Table, session: Session): + adapted_table = local_data.adapt_pa_table(arrow_table) + schema = local_data.arrow_schema_to_bigframes(adapted_table.schema) + + iobytes = io.BytesIO() + pa_feather.write_feather(adapted_table, iobytes) + # Scan all columns by default, we define this list as it can be pruned while preserving source_def + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) + for item in schema.items + ) + ) + + node = nodes.ReadLocalNode( + iobytes.getvalue(), + data_schema=schema, + session=session, + n_rows=arrow_table.num_rows, + scan_list=scan_list, + ) + return cls(node) + + @classmethod + def from_range(cls, start, end, step): + return cls( + nodes.FromRangeNode( + start=start.node, + end=end.node, + step=step, + ) + ) + + @classmethod + def from_table( + cls, + table: google.cloud.bigquery.Table, + schema: schemata.ArraySchema, + session: Session, + *, + predicate: Optional[str] = None, + at_time: Optional[datetime.datetime] = None, + primary_key: Sequence[str] = (), + offsets_col: Optional[str] = None, + ): + if offsets_col and primary_key: + raise ValueError("must set at most one of 'offests', 'primary_key'") + if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): + msg = ( + "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is" + "in preview; this behavior may change in future versions." 
+ ) + warnings.warn(msg, bfe.PreviewWarning) + # define data source only for needed columns, this makes row-hashing cheaper + table_def = nodes.GbqTable.from_table(table, columns=schema.names) + + # create ordering from info + ordering = None + if offsets_col: + ordering = orderings.TotalOrdering.from_offset_col(offsets_col) + elif primary_key: + ordering = orderings.TotalOrdering.from_primary_key( + [ids.ColumnId(key_part) for key_part in primary_key] + ) + + # Scan all columns by default, we define this list as it can be pruned while preserving source_def + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) + for item in schema.items + ) + ) + source_def = nodes.BigqueryDataSource( + table=table_def, at_time=at_time, sql_predicate=predicate, ordering=ordering + ) + node = nodes.ReadTableNode( + source=source_def, + scan_list=scan_list, + table_session=session, + ) + return cls(node) + + @property + def column_ids(self) -> typing.Sequence[str]: + """Returns column ids as strings.""" + return self.schema.names + + @property + def session(self) -> Session: + required_session = self.node.session + from bigframes import get_global_session + + return ( + required_session if (required_session is not None) else get_global_session() + ) + + @functools.cached_property + def schema(self) -> schemata.ArraySchema: + return self.node.schema + + @property + def explicitly_ordered(self) -> bool: + # see BigFrameNode.explicitly_ordered + return self.node.explicitly_ordered + + @property + def order_ambiguous(self) -> bool: + # see BigFrameNode.order_ambiguous + return self.node.order_ambiguous + + @property + def supports_fast_peek(self) -> bool: + return bigframes.core.tree_properties.can_fast_peek(self.node) + + def as_cached( + self: ArrayValue, + cache_table: google.cloud.bigquery.Table, + ordering: Optional[orderings.RowOrdering], + ) -> ArrayValue: + """ + Replace the node with an equivalent one that references a table 
where the value has been materialized to. + """ + table = nodes.GbqTable.from_table(cache_table) + source = nodes.BigqueryDataSource(table, ordering=ordering) + # Assumption: GBQ cached table uses field name as bq column name + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(field.id, field.dtype, field.id.name) + for field in self.node.fields + ) + ) + node = nodes.CachedTableNode( + original_node=self.node, + source=source, + table_session=self.session, + scan_list=scan_list, + ) + return ArrayValue(node) + + def _try_evaluate_local(self): + """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" + import bigframes.core.compile + + return bigframes.core.compile.test_only_try_evaluate(self.node) + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + return self.schema.get_type(key) + + def row_count(self) -> ArrayValue: + """Get number of rows in ArrayValue as a single-entry ArrayValue.""" + return ArrayValue(nodes.RowCountNode(child=self.node)) + + # Operations + def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + predicate: ex.Expression = ex.deref(predicate_id) + if keep_null: + predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) + return self.filter(predicate) + + def filter(self, predicate: ex.Expression): + return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate)) + + def order_by( + self, by: Sequence[OrderingExpression], is_total_order: bool = False + ) -> ArrayValue: + return ArrayValue( + nodes.OrderByNode( + child=self.node, by=tuple(by), is_total_order=is_total_order + ) + ) + + def reversed(self) -> ArrayValue: + return ArrayValue(nodes.ReversedNode(child=self.node)) + + def slice( + self, start: Optional[int], stop: Optional[int], step: Optional[int] + ) -> ArrayValue: + if self.node.order_ambiguous and not 
(self.session._strictly_ordered): + msg = "Window ordering may be ambiguous, this can cause unstable results." + warnings.warn(msg, bfe.AmbiguousWindowWarning) + return ArrayValue( + nodes.SliceNode( + self.node, + start=start, + stop=stop, + step=step if (step is not None) else 1, + ) + ) + + def promote_offsets(self) -> Tuple[ArrayValue, str]: + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ + col_id = self._gen_namespaced_uid() + if self.node.order_ambiguous and not (self.session._strictly_ordered): + if not self.session._allows_ambiguity: + raise ValueError( + "Generating offsets not supported in partial ordering mode" + ) + else: + msg = ( + "Window ordering may be ambiguous, this can cause unstable results." + ) + warnings.warn(msg, category=bfe.AmbiguousWindowWarning) + + return ( + ArrayValue( + nodes.PromoteOffsetsNode(child=self.node, col_id=ids.ColumnId(col_id)) + ), + col_id, + ) + + def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: + """Append together multiple ArrayValue objects.""" + return ArrayValue( + nodes.ConcatNode( + children=tuple([self.node, *[val.node for val in other]]), + output_ids=tuple( + ids.ColumnId(bigframes.core.guid.generate_guid()) + for id in self.column_ids + ), + ) + ) + + def compute_values(self, assignments: Sequence[ex.Expression]): + col_ids = self._gen_namespaced_uids(len(assignments)) + ex_id_pairs = tuple( + (ex, ids.ColumnId(id)) for ex, id in zip(assignments, col_ids) + ) + return ( + ArrayValue(nodes.ProjectionNode(child=self.node, assignments=ex_id_pairs)), + col_ids, + ) + + def project_to_id(self, expression: ex.Expression): + array_val, ids = self.compute_values( + [expression], + ) + return array_val, ids[0] + + def assign(self, source_id: str, destination_id: str) -> ArrayValue: + if destination_id in self.column_ids: # Mutate case + exprs = [ + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id if (col_id == 
destination_id) else col_id), + ids.ColumnId(col_id), + ) + ) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in self.column_ids + ) + exprs = [ + *self_projection, + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id), ids.ColumnId(destination_id) + ) + ), + ] + return ArrayValue( + nodes.SelectionNode( + child=self.node, + input_output_pairs=tuple(exprs), + ) + ) + + def create_constant( + self, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> Tuple[ArrayValue, str]: + if pandas.isna(value): + # Need to assign a data type when value is NaN. + dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE + + return self.project_to_id(ex.const(value, dtype)) + + def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: + # This basically just drops and reorders columns - logically a no-op except as a final step + selections = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in column_ids + ) + return ArrayValue( + nodes.SelectionNode( + child=self.node, + input_output_pairs=tuple(selections), + ) + ) + + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + return self.select_columns( + [col_id for col_id in self.column_ids if col_id not in columns] + ) + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> ArrayValue: + """ + Apply aggregations to the expression. 
+ Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_id: column id of the aggregation key, this is preserved through the transform + dropna: whether null keys should be dropped + """ + agg_defs = tuple((agg, ids.ColumnId(name)) for agg, name in aggregations) + return ArrayValue( + nodes.AggregateNode( + child=self.node, + aggregations=agg_defs, + by_column_ids=tuple(map(ex.deref, by_column_ids)), + dropna=dropna, + ) + ) + + def project_window_op( + self, + column_name: str, + op: agg_ops.UnaryWindowOp, + window_spec: WindowSpec, + *, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ) -> Tuple[ArrayValue, str]: + """ + Creates a new expression based on this expression with unary operation applied to one column. + column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ + # TODO: Support non-deterministic windowing + if window_spec.row_bounded or not op.order_independent: + if self.node.order_ambiguous and not self.session._strictly_ordered: + if not self.session._allows_ambiguity: + raise ValueError( + "Generating offsets not supported in partial ordering mode" + ) + else: + msg = "Window ordering may be ambiguous, this can cause unstable results." 
+ warnings.warn(msg, category=bfe.AmbiguousWindowWarning) + + output_name = self._gen_namespaced_uid() + return ( + ArrayValue( + nodes.WindowOpNode( + child=self.node, + expression=ex.UnaryAggregation(op, ex.deref(column_name)), + window_spec=window_spec, + output_name=ids.ColumnId(output_name), + never_skip_nulls=never_skip_nulls, + skip_reproject_unsafe=skip_reproject_unsafe, + ) + ), + output_name, + ) + + def isin( + self, other: ArrayValue, lcol: str, rcol: str + ) -> typing.Tuple[ArrayValue, str]: + node = nodes.InNode( + self.node, + other.node, + ex.deref(lcol), + ex.deref(rcol), + indicator_col=ids.ColumnId.unique(), + ) + return ArrayValue(node), node.indicator_col.name + + def relational_join( + self, + other: ArrayValue, + conditions: typing.Tuple[typing.Tuple[str, str], ...] = (), + type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", + ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: + l_mapping = { # Identity mapping, only rename right side + lcol.name: lcol.name for lcol in self.node.ids + } + other_node, r_mapping = self.prepare_join_names(other) + join_node = nodes.JoinNode( + left_child=self.node, + right_child=other_node, + conditions=tuple( + (ex.deref(l_mapping[l_col]), ex.deref(r_mapping[r_col])) + for l_col, r_col in conditions + ), + type=type, + ) + return ArrayValue(join_node), (l_mapping, r_mapping) + + def try_row_join( + self, + other: ArrayValue, + conditions: typing.Tuple[typing.Tuple[str, str], ...] 
= (), + ) -> Optional[ + typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]] + ]: + l_mapping = { # Identity mapping, only rename right side + lcol.name: lcol.name for lcol in self.node.ids + } + other_node, r_mapping = self.prepare_join_names(other) + import bigframes.core.rewrite + + result_node = bigframes.core.rewrite.try_row_join( + self.node, other_node, conditions + ) + if result_node is None: + return None + + return ( + ArrayValue(result_node), + (l_mapping, r_mapping), + ) + + def prepare_join_names( + self, other: ArrayValue + ) -> Tuple[bigframes.core.nodes.BigFrameNode, dict[str, str]]: + if set(other.node.ids) & set(self.node.ids): + r_mapping = { # Rename conflicting names + rcol.name: rcol.name + if (rcol.name not in self.column_ids) + else bigframes.core.guid.generate_guid() + for rcol in other.node.ids + } + return ( + nodes.SelectionNode( + other.node, + tuple( + bigframes.core.nodes.AliasedRef( + ex.deref(old_id), ids.ColumnId(new_id) + ) + for old_id, new_id in r_mapping.items() + ), + ), + r_mapping, + ) + else: + return other.node, {id: id for id in other.column_ids} + + def try_legacy_row_join( + self, + other: ArrayValue, + join_type: join_def.JoinType, + join_keys: typing.Tuple[join_def.CoalescedColumnMapping, ...], + mappings: typing.Tuple[join_def.JoinColumnMapping, ...], + ) -> typing.Optional[ArrayValue]: + import bigframes.core.rewrite + + result = bigframes.core.rewrite.legacy_join_as_projection( + self.node, other.node, join_keys, mappings, join_type + ) + if result is not None: + return ArrayValue(result) + return None + + def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: + assert len(column_ids) > 0 + for column_id in column_ids: + assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) + + offsets = tuple(ex.deref(id) for id in column_ids) + return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets)) + + def _uniform_sampling(self, fraction: float) -> ArrayValue: + 
"""Sampling the table on given fraction. + + .. warning:: + The row numbers of result is non-deterministic, avoid to use. + """ + return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) + + # Deterministically generate namespaced ids for new variables + # These new ids are only unique within the current namespace. + # Many operations, such as joins, create new namespaces. See: BigFrameNode.defines_namespace + # When migrating to integer ids, these will generate the next available integer, in order to densely pack ids + # this will help represent variables sets as compact bitsets + def _gen_namespaced_uid(self) -> str: + return self._gen_namespaced_uids(1)[0] + + def _gen_namespaced_uids(self, n: int) -> List[str]: + return [ids.ColumnId.unique().name for _ in range(n)] diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 43f605dc03..8d3732f3fe 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -48,6 +48,7 @@ import pandas as pd import pyarrow as pa +from bigframes import session import bigframes._config.sampling_options as sampling_options import bigframes.constants import bigframes.core as core @@ -257,7 +258,7 @@ def dtypes( return [self.expr.get_column_type(col) for col in self.value_columns] @property - def session(self) -> core.Session: + def session(self) -> session.Session: return self._expr.session @functools.cached_property @@ -276,6 +277,26 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: mapping[label] = (*mapping.get(label, ()), id) return mapping + def resolve_label_exact(self, label: Label) -> Optional[str]: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, returns None.""" + matches = self.label_to_col_id.get(label, []) + if len(matches) > 1: + raise ValueError( + f"Multiple columns matching id {label} were found. 
{constants.FEEDBACK_LINK}" + ) + return matches[0] if len(matches) != 0 else None + + def resolve_label_exact_or_error(self, label: Label) -> str: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, raises an error too.""" + col_id = self.resolve_label_exact(label) + if col_id is None: + raise ValueError(f"Label {label} not found. {constants.FEEDBACK_LINK}") + return col_id + @functools.cached_property def col_id_to_index_name(self) -> typing.Mapping[str, Label]: """Get column label for value columns, or index name for index columns""" @@ -2077,14 +2098,12 @@ def merge( result_columns = [] matching_join_labels = [] - coalesced_ids = [] - for left_id, right_id in zip(left_join_ids, right_join_ids): - joined_expr, coalesced_id = joined_expr.project_to_id( - ops.coalesce_op.as_expr( - get_column_left[left_id], get_column_right[right_id] - ), - ) - coalesced_ids.append(coalesced_id) + left_post_join_ids = tuple(get_column_left[id] for id in left_join_ids) + right_post_join_ids = tuple(get_column_right[id] for id in right_join_ids) + + joined_expr, coalesced_ids = coalesce_columns( + joined_expr, left_post_join_ids, right_post_join_ids, how=how, drop=False + ) for col_id in self.value_columns: if col_id in left_join_ids: @@ -2102,7 +2121,6 @@ def merge( result_columns.append(get_column_left[col_id]) for col_id in other.value_columns: if col_id in right_join_ids: - key_part = right_join_ids.index(col_id) if other.col_id_to_label[matching_right_id] in matching_join_labels: pass else: @@ -2636,7 +2654,7 @@ def dtypes( ] @property - def session(self) -> core.Session: + def session(self) -> session.Session: return self._expr.session @property @@ -2928,26 +2946,31 @@ def resolve_label_id(label: Label) -> str: ) +# TODO: Rewrite just to return expressions def coalesce_columns( expr: core.ArrayValue, left_ids: typing.Sequence[str], right_ids: 
typing.Sequence[str], how: str, + drop: bool = True, ) -> Tuple[core.ArrayValue, Sequence[str]]: result_ids = [] for left_id, right_id in zip(left_ids, right_ids): if how == "left" or how == "inner" or how == "cross": result_ids.append(left_id) - expr = expr.drop_columns([right_id]) + if drop: + expr = expr.drop_columns([right_id]) elif how == "right": result_ids.append(right_id) - expr = expr.drop_columns([left_id]) + if drop: + expr = expr.drop_columns([left_id]) elif how == "outer": coalesced_id = guid.generate_guid() expr, coalesced_id = expr.project_to_id( ops.coalesce_op.as_expr(left_id, right_id) ) - expr = expr.drop_columns([left_id, right_id]) + if drop: + expr = expr.drop_columns([left_id, right_id]) result_ids.append(coalesced_id) else: raise ValueError(f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}") @@ -3149,7 +3172,7 @@ def unpivot( def _pd_index_to_array_value( - session: core.Session, + session: session.Session, index: pd.Index, ) -> core.ArrayValue: """ diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 906bdb1f0d..93be998b5b 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -184,7 +184,7 @@ def _to_ibis_expr( # Special case for empty tables, since we can't create an empty # projection. 
if not self._columns: - return bigframes_vendored.ibis.memtable([]) + return self._table.select([bigframes_vendored.ibis.literal(1)]) table = self._table.select(self._columns) if fraction is not None: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index a72ca47190..64a0ae265f 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -58,12 +58,15 @@ def compile_sql( # TODO: get rid of output_ids arg assert len(output_ids) == len(list(node.fields)) node = set_output_names(node, output_ids) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) if ordered: node, limit = rewrites.pullup_limit_from_slice(node) node = nodes.bottom_up(node, rewrites.rewrite_slice) + # TODO: Extract out CTEs node, ordering = rewrites.pull_up_order( node, order_root=True, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql( order_by=ordering.all_ordering_columns, @@ -75,15 +78,18 @@ def compile_sql( node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql(selections=output_ids) def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: ids = [id.sql for id in node.ids] node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) return self.compile_node(node).to_sql(limit=n_rows, selections=ids) def compile_raw( @@ -93,13 +99,16 @@ def compile_raw( str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering ]: node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) + node = 
rewrites.column_pruning(node) ir = self.compile_node(node) sql = ir.to_sql() return sql, node.schema.to_bigquery(), ordering def _preprocess(self, node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) @@ -188,31 +197,34 @@ def compile_readtable(self, node: nodes.ReadTableNode): return self.compile_read_table_unordered(node.source, node.scan_list) def read_table_as_unordered_ibis( - self, source: nodes.BigqueryDataSource + self, + source: nodes.BigqueryDataSource, + scan_cols: typing.Sequence[str], ) -> ibis_types.Table: full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" - used_columns = tuple(col.name for col in source.table.physical_schema) # Physical schema might include unused columns, unsupported datatypes like JSON physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( - list(i for i in source.table.physical_schema if i.name in used_columns) + list(source.table.physical_schema) ) if source.at_time is not None or source.sql_predicate is not None: import bigframes.session._io.bigquery sql = bigframes.session._io.bigquery.to_query( full_table_name, - columns=used_columns, + columns=scan_cols, sql_predicate=source.sql_predicate, time_travel_timestamp=source.at_time, ) return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) else: - return ibis_api.table(physical_schema, full_table_name) + return ibis_api.table(physical_schema, full_table_name).select(scan_cols) def compile_read_table_unordered( self, source: nodes.BigqueryDataSource, scan: nodes.ScanList ): - ibis_table = self.read_table_as_unordered_ibis(source) + ibis_table = self.read_table_as_unordered_ibis( + source, scan_cols=[col.source_id for col in scan.items] + ) return compiled.UnorderedIR( ibis_table, tuple( @@ -287,7 +299,7 @@ def set_output_names( return 
nodes.SelectionNode( node, tuple( - (ex.DerefOp(old_id), ids.ColumnId(out_id)) + bigframes.core.nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id)) for old_id, out_id in zip(node.ids, output_ids) ), ) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 8a55f6775d..af2b7908ad 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -47,6 +47,8 @@ ibis_dtypes.JSON, ] +IBIS_GEO_TYPE = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) + BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, bigframes.dtypes.Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), @@ -70,7 +72,7 @@ pd.ArrowDtype(pa.decimal256(76, 38)), ), ( - ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), + IBIS_GEO_TYPE, gpd.array.GeometryDtype(), ), (ibis_dtypes.json, db_dtypes.JSONDtype()), @@ -79,7 +81,7 @@ BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } -BIGFRAMES_TO_IBIS.update({bigframes.dtypes.TIMEDETLA_DTYPE: ibis_dtypes.int64}) +BIGFRAMES_TO_IBIS.update({bigframes.dtypes.TIMEDELTA_DTYPE: ibis_dtypes.int64}) IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, bigframes.dtypes.Dtype] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } @@ -177,6 +179,14 @@ def cast_ibis_value( ibis_dtypes.timestamp, ), ibis_dtypes.binary: (ibis_dtypes.string,), + ibis_dtypes.point: (IBIS_GEO_TYPE,), + ibis_dtypes.geometry: (IBIS_GEO_TYPE,), + ibis_dtypes.geography: (IBIS_GEO_TYPE,), + ibis_dtypes.linestring: (IBIS_GEO_TYPE,), + ibis_dtypes.polygon: (IBIS_GEO_TYPE,), + ibis_dtypes.multilinestring: (IBIS_GEO_TYPE,), + ibis_dtypes.multipoint: (IBIS_GEO_TYPE,), + ibis_dtypes.multipolygon: (IBIS_GEO_TYPE,), } value = ibis_value_to_canonical_type(value) @@ -282,6 +292,9 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.JSON): return bigframes.dtypes.JSON_DTYPE + if 
isinstance(ibis_dtype, ibis_dtypes.GeoSpatial): + return gpd.array.GeometryDtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Decimal): diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b42f983619..3e5f10eca4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -737,6 +737,16 @@ def unix_millis_op_impl(x: ibis_types.TimestampValue): return unix_millis(x) +@scalar_op_compiler.register_binary_op(ops.timestamp_diff_op) +def timestamp_diff_op_impl(x: ibis_types.TimestampValue, y: ibis_types.TimestampValue): + return x.delta(y, "microsecond") + + +@scalar_op_compiler.register_binary_op(ops.timestamp_add_op) +def timestamp_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): + return x + y.to_interval("us") + + @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] @@ -993,6 +1003,18 @@ def geo_y_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).y() +@scalar_op_compiler.register_unary_op(ops.geo_area_op) +def geo_area_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).area() + + +@scalar_op_compiler.register_binary_op(ops.geo_st_geogpoint_op, pass_op=False) +def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).point( + typing.cast(ibis_types.NumericValue, y) + ) + + # Parameterized ops @scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 5f9fcb257e..f619cd72c9 100644 --- a/bigframes/core/groupby/__init__.py +++ 
b/bigframes/core/groupby/__init__.py @@ -22,8 +22,8 @@ import jellyfish import pandas as pd +from bigframes import session from bigframes.core import log_adapter -import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression @@ -76,7 +76,7 @@ def __init__( ] @property - def _session(self) -> core.Session: + def _session(self) -> session.Session: return self._block.session def __getitem__( @@ -492,7 +492,7 @@ def _aggregate_all( def _apply_window_op( self, op: agg_ops.WindowOp, - window: typing.Optional[core.WindowSpec] = None, + window: typing.Optional[window_specs.WindowSpec] = None, numeric_only: bool = False, ): """Apply window op to groupby. Defaults to grouped cumulative window.""" @@ -536,7 +536,7 @@ def __init__( self._dropna = dropna # Applies to aggregations but not windowing @property - def _session(self) -> core.Session: + def _session(self) -> session.Session: return self._block.session @validations.requires_ordering() @@ -759,7 +759,7 @@ def _apply_window_op( self, op: agg_ops.WindowOp, discard_name=False, - window: typing.Optional[core.WindowSpec] = None, + window: typing.Optional[window_specs.WindowSpec] = None, never_skip_nulls: bool = False, ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 085d52daa6..d5273e5c0a 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,14 +20,13 @@ import functools import itertools import typing -from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple +from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple, TypeVar import google.cloud.bigquery as bq +from bigframes.core import identifiers import bigframes.core.expression as ex import bigframes.core.guid -import bigframes.core.identifiers -import bigframes.core.identifiers as bfet_ids from bigframes.core.ordering import OrderingExpression import bigframes.core.schema as schemata import bigframes.core.slices as slices @@ -42,12 +41,14 @@ # A fixed number of variable to assume for overhead on some operations OVERHEAD_VARIABLES = 5 -COLUMN_SET = frozenset[bfet_ids.ColumnId] +COLUMN_SET = frozenset[identifiers.ColumnId] + +Self = TypeVar("Self") @dataclasses.dataclass(frozen=True) class Field: - id: bfet_ids.ColumnId + id: identifiers.ColumnId dtype: bigframes.dtypes.Dtype @@ -87,19 +88,30 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def row_count(self) -> typing.Optional[int]: return None + @abc.abstractmethod + def remap_vars( + self: Self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> Self: + """Remap defined (in this node only) variables.""" + ... + @abc.abstractmethod def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self: Self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> Self: """Remap variable references""" ... @property @abc.abstractmethod - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: """The variables defined in this node (as opposed to by child nodes).""" ... 
+ @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @functools.cached_property def session(self): sessions = [] @@ -166,7 +178,7 @@ def fields(self) -> Iterable[Field]: ... @property - def ids(self) -> Iterable[bfet_ids.ColumnId]: + def ids(self) -> Iterable[identifiers.ColumnId]: """All output ids from the node.""" return (field.id for field in self.fields) @@ -248,18 +260,11 @@ def planning_complexity(self) -> int: @abc.abstractmethod def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + self: Self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> Self: """Apply a function to each child node.""" ... - @abc.abstractmethod - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - """Remap defined (in this node only) variables.""" - ... - @property def defines_namespace(self) -> bool: """ @@ -269,26 +274,13 @@ def defines_namespace(self) -> bool: """ return False - @functools.cached_property - def defined_variables(self) -> set[str]: - """Full set of variables defined in the namespace, even if not selected.""" - self_defined_variables = set(self.schema.names) - if self.defines_namespace: - return self_defined_variables - return self_defined_variables.union( - *(child.defined_variables for child in self.child_nodes) - ) - - def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: + def get_type(self, id: identifiers.ColumnId) -> bigframes.dtypes.Dtype: return self._dtype_lookup[id] @functools.cached_property def _dtype_lookup(self): return {field.id: field.dtype for field in self.fields} - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self.transform_children(lambda x: x.prune(used_cols)) - class AdditiveNode: """Definition of additive - if you drop added_fields, you end up with the descendent. 
@@ -336,7 +328,7 @@ def explicitly_ordered(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> UnaryNode: transformed = dataclasses.replace(self, child=t(self.child)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory @@ -403,15 +395,21 @@ def row_count(self) -> typing.Optional[int]: ) @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> SliceNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> SliceNode: return self @@ -427,7 +425,7 @@ class InNode(BigFrameNode, AdditiveNode): right_child: BigFrameNode left_col: ex.DerefOp right_col: ex.DerefOp - indicator_col: bfet_ids.ColumnId + indicator_col: identifiers.ColumnId def _validate(self): assert not ( @@ -480,9 +478,13 @@ def row_count(self) -> Optional[int]: return self.left_child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.indicator_col,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset({self.left_col.id, self.right_col.id}) + @property def additive_base(self) -> BigFrameNode: return self.left_child @@ -490,9 +492,7 @@ def additive_base(self) -> BigFrameNode: def replace_additive_base(self, node: BigFrameNode): return dataclasses.replace(self, left_child=node) - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def 
transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> InNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -501,17 +501,16 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> InNode: return dataclasses.replace( self, indicator_col=mappings.get(self.indicator_col, self.indicator_col) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> InNode: return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True)) # type: ignore @@ -571,12 +570,23 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + (*l_cond.column_references, *r_cond.column_references) + for l_cond, r_cond in self.conditions + ) + ) + + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset((*self.ids, *self.referenced_ids)) + + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> JoinNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -585,21 +595,14 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # If this
is a cross join, make sure to select at least one column from each side - condition_cols = used_cols.union( - map(lambda x: x.id, itertools.chain.from_iterable(self.conditions)) - ) - return self.transform_children( - lambda x: x.prune(frozenset([*condition_cols, *used_cols])) - ) - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> JoinNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> JoinNode: new_conds = tuple( ( l_cond.remap_column_refs(mappings, allow_partial_bindings=True), @@ -614,7 +617,7 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): class ConcatNode(BigFrameNode): # TODO: Explcitly map column ids from each child children: Tuple[BigFrameNode, ...] - output_ids: Tuple[bfet_ids.ColumnId, ...] + output_ids: Tuple[identifiers.ColumnId, ...] 
def _validate(self): if len(self.children) == 0: @@ -660,12 +663,12 @@ def row_count(self) -> Optional[int]: return total @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return self.output_ids def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> ConcatNode: transformed = dataclasses.replace( self, children=tuple(t(child) for child in self.children) ) @@ -674,17 +677,15 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make concat prunable, probably by redefining - return self - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ConcatNode: new_ids = tuple(mappings.get(id, id) for id in self.output_ids) return dataclasses.replace(self, output_ids=new_ids) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ConcatNode: return self @@ -694,7 +695,7 @@ class FromRangeNode(BigFrameNode): start: BigFrameNode end: BigFrameNode step: int - output_id: bfet_ids.ColumnId = bfet_ids.ColumnId("labels") + output_id: identifiers.ColumnId = identifiers.ColumnId("labels") @property def roots(self) -> typing.Set[BigFrameNode]: @@ -726,7 +727,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.output_id,) @property @@ -735,25 +736,23 @@ def defines_namespace(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> FromRangeNode: transformed = dataclasses.replace(self, start=t(self.start), end=t(self.end)) if self == 
transformed: # reusing existing object speeds up eq, and saves a small amount of memory return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make FromRangeNode prunable (or convert to other node types) - return self - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> FromRangeNode: return dataclasses.replace( self, output_id=mappings.get(self.output_id, self.output_id) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> FromRangeNode: return self @@ -774,17 +773,18 @@ def fast_offsets(self) -> bool: def fast_ordered_limit(self) -> bool: return False - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafNode: return self class ScanItem(typing.NamedTuple): - id: bfet_ids.ColumnId + id: identifiers.ColumnId dtype: bigframes.dtypes.Dtype # Might be multiple logical types for a given physical source type source_id: str # Flexible enough for both local data and bq data + def with_id(self, id: identifiers.ColumnId) -> ScanItem: + return ScanItem(id, self.dtype, self.source_id) + @dataclasses.dataclass(frozen=True) class ScanList: @@ -838,28 +838,12 @@ def row_count(self) -> typing.Optional[int]: return self.n_rows @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(item.id for item in self.fields) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Don't preoduce empty scan list no matter what, will result in broken sql syntax - # TODO: Handle more elegantly - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in 
used_cols) - or (self.scan_list.items[0],) - ) - return ReadLocalNode( - self.feather_bytes, - self.data_schema, - self.n_rows, - new_scan_list, - self.offsets_col, - self.session, - ) - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ReadLocalNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -875,7 +859,9 @@ def remap_vars( self, scan_list=new_scan_list, offsets_col=new_offsets_col ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ReadLocalNode: return self @@ -1000,19 +986,12 @@ def row_count(self) -> typing.Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(item.id for item in self.scan_list.items) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or (self.scan_list.items[0],) - ) - return dataclasses.replace(self, scan_list=new_scan_list) - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ReadTableNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -1021,7 +1000,9 @@ def remap_vars( ) return dataclasses.replace(self, scan_list=new_scan_list) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ReadTableNode: return self def with_order_cols(self): @@ -1033,7 +1014,7 @@ def with_order_cols(self): scan_cols = 
{col.source_id for col in self.scan_list.items} new_scan_cols = [ ScanItem( - bigframes.core.ids.ColumnId.unique(), + identifiers.ColumnId.unique(), dtype=bigframes.dtypes.convert_schema_field(field)[1], source_id=field.name, ) @@ -1042,10 +1023,7 @@ def with_order_cols(self): ] new_scan_list = ScanList(items=(*self.scan_list.items, *new_scan_cols)) new_order = self.source.ordering.remap_column_refs( - { - bigframes.core.ids.ColumnId(item.source_id): item.id - for item in new_scan_cols - }, + {identifiers.ColumnId(item.source_id): item.id for item in new_scan_cols}, allow_partial_bindings=True, ) return dataclasses.replace(self, scan_list=new_scan_list), new_order @@ -1086,9 +1064,13 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.col_id,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @property def added_fields(self) -> Tuple[Field, ...]: return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),) @@ -1097,22 +1079,17 @@ def added_fields(self) -> Tuple[Field, ...]: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> PromoteOffsetsNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.col_id not in used_cols: - return self.child.prune(used_cols) - else: - new_used = used_cols.difference([self.col_id]) - return self.transform_children(lambda x: x.prune(new_used)) - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> PromoteOffsetsNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: 
Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> PromoteOffsetsNode: return self @@ -1133,20 +1110,25 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - consumed_ids = used_cols.union(self.predicate.column_references) - pruned_child = self.child.prune(consumed_ids) - return FilterNode(pruned_child, self.predicate) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(self.predicate.column_references) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> FilterNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> FilterNode: return dataclasses.replace( self, predicate=self.predicate.remap_column_refs( @@ -1180,23 +1162,27 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - ordering_cols = itertools.chain.from_iterable( - map(lambda x: x.referenced_columns, self.by) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) - consumed_ids = 
used_cols.union(ordering_cols) - pruned_child = self.child.prune(consumed_ids) - return OrderByNode(pruned_child, self.by) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> OrderByNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> OrderByNode: all_refs = set( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) @@ -1230,23 +1216,46 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ReversedNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ReversedNode: return self +class AliasedRef(typing.NamedTuple): + ref: ex.DerefOp + id: identifiers.ColumnId + + @classmethod + def identity(cls, id: identifiers.ColumnId) -> AliasedRef: + return cls(ex.DerefOp(id), id) + + def remap_vars( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> AliasedRef: + return AliasedRef(self.ref, mappings.get(self.id, self.id)) + + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> AliasedRef: + return AliasedRef(ex.DerefOp(mappings.get(self.ref.id, self.ref.id)), self.id) + + @dataclasses.dataclass(frozen=True, eq=False) class SelectionNode(UnaryNode): - input_output_pairs: 
typing.Tuple[ - typing.Tuple[ex.DerefOp, bigframes.core.identifiers.ColumnId], ... - ] + input_output_pairs: Tuple[AliasedRef, ...] def _validate(self): for ref, _ in self.input_output_pairs: @@ -1277,36 +1286,29 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(id for _, id in self.input_output_pairs) - def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: - return {ref.id: out_id for ref, out_id in self.input_output_pairs} - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_selections = ( - tuple( - select for select in self.input_output_pairs if select[1] in used_cols - ) - or self.input_output_pairs[:1] - ) - consumed_ids = frozenset(i[0].id for i in pruned_selections) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref, id in self.input_output_pairs) - pruned_child = self.child.prune(consumed_ids) - return SelectionNode(pruned_child, pruned_selections) + def get_id_mapping(self) -> dict[identifiers.ColumnId, identifiers.ColumnId]: + return {ref.id: id for ref, id in self.input_output_pairs} def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - new_pairs = tuple( - (ref, mappings.get(id, id)) for ref, id in self.input_output_pairs + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> SelectionNode: + new_fields = tuple( + item.remap_vars(mappings) for item in self.input_output_pairs ) - return dataclasses.replace(self, input_output_pairs=new_pairs) + return dataclasses.replace(self, input_output_pairs=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> SelectionNode: new_fields = tuple( - (ex.remap_column_refs(mappings, 
allow_partial_bindings=True), id) - for ex, id in self.input_output_pairs + item.remap_refs(mappings) for item in self.input_output_pairs ) return dataclasses.replace(self, input_output_pairs=new_fields) # type: ignore @@ -1350,33 +1352,41 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(id for _, id in self.assignments) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + i[0].column_references for i in self.assignments + ) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + ex.column_references for ex, id in self.assignments + ) + ) + @property def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) - if len(pruned_assignments) == 0: - return self.child.prune(used_cols) - consumed_ids = itertools.chain.from_iterable( - i[0].column_references for i in pruned_assignments - ) - pruned_child = self.child.prune(used_cols.union(consumed_ids)) - return ProjectionNode(pruned_child, pruned_assignments) - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ProjectionNode: new_fields = tuple((ex, mappings.get(id, id)) for ex, id in self.assignments) return dataclasses.replace(self, assignments=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, 
identifiers.ColumnId] + ) -> ProjectionNode: new_fields = tuple( (ex.remap_column_refs(mappings, allow_partial_bindings=True), id) for ex, id in self.assignments @@ -1388,7 +1398,7 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): # Row count can be compute from table metadata sometimes, so it is a bit special. @dataclasses.dataclass(frozen=True, eq=False) class RowCountNode(UnaryNode): - col_id: bfet_ids.ColumnId = bfet_ids.ColumnId("count") + col_id: identifiers.ColumnId = identifiers.ColumnId("count") @property def row_preserving(self) -> bool: @@ -1415,19 +1425,21 @@ def row_count(self) -> Optional[int]: return 1 @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.col_id,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> RowCountNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): - return self - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Handle row count pruning + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> RowCountNode: return self @@ -1483,37 +1495,35 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(id for _, id in self.aggregations) @property - def has_ordered_ops(self) -> bool: - return not all( - aggregate.op.order_independent for aggregate, _ in self.aggregations - ) - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + def consumed_ids(self) -> COLUMN_SET: by_ids = (ref.id for 
ref in self.by_column_ids) - pruned_aggs = ( - tuple(agg for agg in self.aggregations if agg[1] in used_cols) - or self.aggregations[:1] - ) agg_inputs = itertools.chain.from_iterable( - agg.column_references for agg, _ in pruned_aggs + agg.column_references for agg, _ in self.aggregations ) - consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs)) - pruned_child = self.child.prune(consumed_ids) - return AggregateNode( - pruned_child, pruned_aggs, self.by_column_ids, dropna=self.dropna + order_ids = itertools.chain.from_iterable( + part.scalar_expression.column_references for part in self.order_by + ) + return frozenset(itertools.chain(by_ids, agg_inputs, order_ids)) + + @property + def has_ordered_ops(self) -> bool: + return not all( + aggregate.op.order_independent for aggregate, _ in self.aggregations ) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> AggregateNode: new_aggs = tuple((agg, mappings.get(id, id)) for agg, id in self.aggregations) return dataclasses.replace(self, aggregations=new_aggs) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> AggregateNode: new_aggs = tuple( (agg.remap_column_refs(mappings, allow_partial_bindings=True), id) for agg, id in self.aggregations @@ -1575,9 +1585,23 @@ def added_field(self) -> Field: ) @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.output_name,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + set(self.ids).difference([self.output_name]).union(self.referenced_ids) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return ( + frozenset() + .union(self.expression.column_references) + 
.union(self.window_spec.all_referenced_columns) + ) + @property def inherits_order(self) -> bool: # does the op both use ordering at all? and if so, can it inherit order? @@ -1590,27 +1614,19 @@ def inherits_order(self) -> bool: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> WindowOpNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.output_name not in used_cols: - return self.child.prune(used_cols) - consumed_ids = ( - used_cols.difference([self.output_name]) - .union(self.expression.column_references) - .union(self.window_spec.all_referenced_columns) - ) - return self.transform_children(lambda x: x.prune(consumed_ids)) - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> WindowOpNode: return dataclasses.replace( self, output_name=mappings.get(self.output_name, self.output_name) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> WindowOpNode: return dataclasses.replace( self, expression=self.expression.remap_column_refs( @@ -1643,17 +1659,21 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> RandomSampleNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, 
mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> RandomSampleNode: return self @@ -1700,24 +1720,23 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.offsets_col,) if (self.offsets_col is not None) else () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Cannot prune explode op - consumed_ids = used_cols.union(ref.id for ref in self.column_ids) - return self.transform_children(lambda x: x.prune(consumed_ids)) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref in self.column_ids) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ExplodeNode: if (self.offsets_col is not None) and self.offsets_col in mappings: return dataclasses.replace(self, offsets_col=mappings[self.offsets_col]) return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] + ) -> ExplodeNode: new_ids = tuple(id.remap_column_refs(mappings) for id in self.column_ids) return dataclasses.replace(self, column_ids=new_ids) # type: ignore diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 9044cb25f9..e5f7578911 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -16,13 +16,17 @@ from bigframes.core.rewrite.implicit_align import try_row_join from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.order import pull_up_order +from bigframes.core.rewrite.pruning import column_pruning from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice +from bigframes.core.rewrite.timedeltas import 
rewrite_timedelta_expressions __all__ = [ "legacy_join_as_projection", "try_row_join", "rewrite_slice", + "rewrite_timedelta_expressions", "pullup_limit_from_slice", "remap_variables", "pull_up_order", + "column_pruning", ] diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 1b864fb919..1989b1a543 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ b/bigframes/core/rewrite/implicit_align.py @@ -113,7 +113,7 @@ def try_row_join( r_node, r_selection = pull_up_selection( r_node, stop=divergent_node, rename_vars=True ) # Rename only right vars to avoid collisions with left vars - combined_selection = (*l_selection, *r_selection) + combined_selection = l_selection + r_selection def _linearize_trees( base_tree: bigframes.core.nodes.BigFrameNode, @@ -139,10 +139,7 @@ def pull_up_selection( rename_vars: bool = False, ) -> Tuple[ bigframes.core.nodes.BigFrameNode, - Tuple[ - Tuple[bigframes.core.expression.DerefOp, bigframes.core.identifiers.ColumnId], - ..., - ], + Tuple[bigframes.core.nodes.AliasedRef, ...], ]: """Remove all selection nodes above the base node. Returns stripped tree. @@ -157,8 +154,7 @@ def pull_up_selection( """ if node == stop: # base case return node, tuple( - (bigframes.core.expression.DerefOp(field.id), field.id) - for field in node.fields + bigframes.core.nodes.AliasedRef.identity(field.id) for field in node.fields ) # InNode needs special handling, as its a binary node, but row identity is from left side only. 
# TODO: Merge code with unary op paths @@ -179,11 +175,15 @@ def pull_up_selection( {node.indicator_col: bigframes.core.identifiers.ColumnId.unique()} ), ) - added_selection = ( - bigframes.core.expression.DerefOp(new_in_node.indicator_col), - node.indicator_col, + added_selection = tuple( + ( + bigframes.core.nodes.AliasedRef( + bigframes.core.expression.DerefOp(new_in_node.indicator_col), + node.indicator_col, + ), + ) ) - new_selection = (*child_selections, added_selection) + new_selection = child_selections + added_selection return new_in_node, new_selection if isinstance(node, bigframes.core.nodes.AdditiveNode): @@ -204,28 +204,20 @@ def pull_up_selection( else: var_renames = {} assert isinstance(new_node, bigframes.core.nodes.AdditiveNode) - added_selections = ( - ( - bigframes.core.expression.DerefOp(var_renames.get(field.id, field.id)), - field.id, - ) + added_selections = tuple( + bigframes.core.nodes.AliasedRef.identity(field.id).remap_refs(var_renames) for field in node.added_fields ) - new_selection = (*child_selections, *added_selections) + new_selection = child_selections + added_selections return new_node, new_selection elif isinstance(node, bigframes.core.nodes.SelectionNode): child_node, child_selections = pull_up_selection( node.child, stop, rename_vars=rename_vars ) mapping = {out: ref.id for ref, out in child_selections} - new_selection = tuple( - ( - bigframes.core.expression.DerefOp(mapping[ref.id]), - out, - ) - for ref, out in node.input_output_pairs + return child_node, tuple( + ref.remap_refs(mapping) for ref in node.input_output_pairs ) - return child_node, new_selection raise ValueError(f"Couldn't pull up select from node: {node}") diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py index 05641130fb..573a7026e4 100644 --- a/bigframes/core/rewrite/legacy_align.py +++ b/bigframes/core/rewrite/legacy_align.py @@ -57,7 +57,7 @@ def from_node_span( if isinstance(node, nodes.SelectionNode): return 
cls.from_node_span(node.child, target).select( - node.input_output_pairs + tuple(node.input_output_pairs) ) elif isinstance(node, nodes.ProjectionNode): return cls.from_node_span(node.child, target).project(node.assignments) @@ -228,7 +228,9 @@ def expand(self) -> nodes.BigFrameNode: root = nodes.FilterNode(child=root, predicate=self.predicate) if self.ordering: root = nodes.OrderByNode(child=root, by=self.ordering) - selection = tuple((scalar_exprs.DerefOp(id), id) for _, id in self.columns) + selection = tuple( + bigframes.core.nodes.AliasedRef.identity(id) for _, id in self.columns + ) return nodes.SelectionNode( child=nodes.ProjectionNode(child=root, assignments=self.columns), input_output_pairs=selection, diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 3f8c409b76..bdb30fbc34 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -15,12 +15,13 @@ import functools from typing import Mapping, Tuple +from bigframes.core import identifiers import bigframes.core.expression -import bigframes.core.identifiers import bigframes.core.nodes import bigframes.core.ordering import bigframes.core.window_spec import bigframes.operations +from bigframes.operations import aggregations as agg_ops # Makes ordering explicit in window definitions @@ -54,12 +55,12 @@ def pull_up_order_inner( new_node, child_order = pull_up_order_inner(node.child) new_by = [] - ids: list[bigframes.core.ids.ColumnId] = [] + ids: list[identifiers.ColumnId] = [] for part in node.by: if not isinstance( part.scalar_expression, bigframes.core.expression.DerefOp ): - id = bigframes.core.ids.ColumnId.unique() + id = identifiers.ColumnId.unique() new_node = bigframes.core.nodes.ProjectionNode( new_node, ((part.scalar_expression, id),) ) @@ -114,7 +115,7 @@ def pull_up_order_inner( ) elif isinstance(node, bigframes.core.nodes.ReadLocalNode): if node.offsets_col is None: - offsets_id = bigframes.core.ids.ColumnId.unique() + offsets_id = 
identifiers.ColumnId.unique() new_root = dataclasses.replace(node, offsets_col=offsets_id) return new_root, bigframes.core.ordering.TotalOrdering.from_offset_col( offsets_id @@ -145,7 +146,7 @@ def pull_up_order_inner( else: # Otherwise we need to generate offsets agg = bigframes.core.expression.NullaryAggregation( - bigframes.core.agg_ops.RowNumberOp() + agg_ops.RowNumberOp() ) window_spec = bigframes.core.window_spec.unbound( ordering=tuple(child_order.all_ordering_columns) @@ -177,17 +178,12 @@ def pull_up_order_inner( ) # Create unique ids just to be safe new_selections = { - col: bigframes.core.ids.ColumnId.unique() - for col in unselected_order_cols + col: identifiers.ColumnId.unique() for col in unselected_order_cols } - all_selections = ( - *node.input_output_pairs, - *( - (bigframes.core.expression.DerefOp(k), v) - for k, v in new_selections.items() - ), + all_selections = node.input_output_pairs + tuple( + bigframes.core.nodes.AliasedRef(bigframes.core.expression.DerefOp(k), v) + for k, v in new_selections.items() ) - new_select_node = dataclasses.replace( node, child=child_result, input_output_pairs=all_selections ) @@ -244,14 +240,14 @@ def pull_up_order_inner( elif isinstance(node, bigframes.core.nodes.ExplodeNode): child_result, child_order = pull_up_order_inner(node.child) if node.offsets_col is None: - offsets_id = bigframes.core.ids.ColumnId.unique() + offsets_id = identifiers.ColumnId.unique() new_explode: bigframes.core.nodes.BigFrameNode = dataclasses.replace( node, child=child_result, offsets_col=offsets_id ) else: offsets_id = node.offsets_col new_explode = node.replace_child(child_result) - inner_order = bigframes.core.orderings.TotalOrdering.from_offset_col( + inner_order = bigframes.core.ordering.TotalOrdering.from_offset_col( offsets_id ) return new_explode, child_order.join(inner_order) @@ -265,8 +261,8 @@ def pull_order_concat( new_sources = [] for i, source in enumerate(node.child_nodes): new_source, order = pull_up_order_inner(source) 
- offsets_id = bigframes.core.ids.ColumnId.unique() - table_id = bigframes.core.ids.ColumnId.unique() + offsets_id = identifiers.ColumnId.unique() + table_id = identifiers.ColumnId.unique() if order.is_total_ordering and order.integer_encoding.is_encoded: order_expression = order.total_order_col assert order_expression is not None @@ -275,7 +271,7 @@ def pull_order_concat( ) else: agg = bigframes.core.expression.NullaryAggregation( - bigframes.core.agg_ops.RowNumberOp() + agg_ops.RowNumberOp() ) window_spec = bigframes.core.window_spec.unbound( ordering=tuple(order.all_ordering_columns) @@ -288,15 +284,15 @@ def pull_order_concat( ) selection = tuple( ( - (bigframes.core.expression.DerefOp(id), id) + bigframes.core.nodes.AliasedRef.identity(id) for id in (*source.ids, table_id, offsets_id) ) ) new_source = bigframes.core.nodes.SelectionNode(new_source, selection) new_sources.append(new_source) - union_offsets_id = bigframes.core.ids.ColumnId.unique() - union_table_id = bigframes.core.ids.ColumnId.unique() + union_offsets_id = identifiers.ColumnId.unique() + union_table_id = identifiers.ColumnId.unique() new_ids = (*node.output_ids, union_table_id, union_offsets_id) new_node = dataclasses.replace( node, children=tuple(new_sources), output_ids=new_ids @@ -321,7 +317,7 @@ def pull_order_join( if node.type in ("right", "outer"): # right side is nullable - left_indicator = bigframes.core.ids.ColumnId.unique() + left_indicator = identifiers.ColumnId.unique() left_child = bigframes.core.nodes.ProjectionNode( left_child, ((bigframes.core.expression.const(True), left_indicator),) ) @@ -330,7 +326,7 @@ def pull_order_join( ) if node.type in ("left", "outer"): # right side is nullable - right_indicator = bigframes.core.ids.ColumnId.unique() + right_indicator = identifiers.ColumnId.unique() right_child = bigframes.core.nodes.ProjectionNode( right_child, ((bigframes.core.expression.const(True), right_indicator),) ) @@ -396,7 +392,7 @@ def remove_order_strict( if result.ids != 
node.ids: return bigframes.core.nodes.SelectionNode( result, - tuple((bigframes.core.expression.DerefOp(id), id) for id in node.ids), + tuple(bigframes.core.nodes.AliasedRef.identity(id) for id in node.ids), ) return result @@ -410,25 +406,23 @@ def remove_order_strict( def rewrite_promote_offsets( node: bigframes.core.nodes.PromoteOffsetsNode, ) -> bigframes.core.nodes.WindowOpNode: - agg = bigframes.core.expression.NullaryAggregation( - bigframes.core.agg_ops.RowNumberOp() - ) + agg = bigframes.core.expression.NullaryAggregation(agg_ops.RowNumberOp()) window_spec = bigframes.core.window_spec.unbound() return bigframes.core.nodes.WindowOpNode(node.child, agg, window_spec, node.col_id) def rename_cols( - node: bigframes.core.nodes.BigFrameNode, cols: set[bigframes.core.ids.ColumnId] + node: bigframes.core.nodes.BigFrameNode, cols: set[identifiers.ColumnId] ) -> Tuple[ bigframes.core.nodes.BigFrameNode, - Mapping[bigframes.core.ids.ColumnId, bigframes.core.ids.ColumnId], + Mapping[identifiers.ColumnId, identifiers.ColumnId], ]: - mappings = dict((id, bigframes.core.ids.ColumnId.unique()) for id in cols) + mappings = dict((id, identifiers.ColumnId.unique()) for id in cols) result_node = bigframes.core.nodes.SelectionNode( node, tuple( - (bigframes.core.expression.DerefOp(id), mappings.get(id, id)) + bigframes.core.nodes.AliasedRef.identity(id).remap_vars(mappings) for id in node.ids ), ) diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py new file mode 100644 index 0000000000..7e40137f3e --- /dev/null +++ b/bigframes/core/rewrite/pruning.py @@ -0,0 +1,195 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import dataclasses +import functools +from typing import AbstractSet + +from bigframes.core import identifiers +import bigframes.core.nodes + + +def column_pruning( + root: bigframes.core.nodes.BigFrameNode, +) -> bigframes.core.nodes.BigFrameNode: + return bigframes.core.nodes.top_down(root, prune_columns) + + +def to_fixed(max_iterations: int = 100): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + previous_result = None + current_result = func(*args, **kwargs) + attempts = 1 + + while attempts < max_iterations: + if current_result == previous_result: + return current_result + previous_result = current_result + current_result = func(current_result) + attempts += 1 + + return current_result + + return wrapper + + return decorator + + +@to_fixed(max_iterations=100) +def prune_columns(node: bigframes.core.nodes.BigFrameNode): + if isinstance(node, bigframes.core.nodes.SelectionNode): + result = prune_selection_child(node) + elif isinstance(node, bigframes.core.nodes.AggregateNode): + result = node.replace_child(prune_node(node.child, node.consumed_ids)) + elif isinstance(node, bigframes.core.nodes.InNode): + result = dataclasses.replace( + node, + right_child=prune_node(node.right_child, frozenset([node.right_col.id])), + ) + else: + result = node + return result + + +def prune_selection_child( + selection: bigframes.core.nodes.SelectionNode, +) -> bigframes.core.nodes.BigFrameNode: + child = selection.child + + # Important to check this first + if list(selection.ids) == list(child.ids): + return child + + if 
isinstance(child, bigframes.core.nodes.SelectionNode): + return selection.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + elif isinstance(child, bigframes.core.nodes.AdditiveNode): + if not set(field.id for field in child.added_fields) & selection.consumed_ids: + return selection.replace_child(child.additive_base) + return selection.replace_child( + child.replace_additive_base( + prune_node( + child.additive_base, selection.consumed_ids | child.referenced_ids + ) + ) + ) + elif isinstance(child, bigframes.core.nodes.ConcatNode): + indices = [ + list(child.ids).index(ref.id) for ref, _ in selection.input_output_pairs + ] + new_children = [] + for concat_node in child.child_nodes: + cc_ids = tuple(concat_node.ids) + sub_selection = tuple( + bigframes.core.nodes.AliasedRef.identity(cc_ids[i]) for i in indices + ) + new_children.append( + bigframes.core.nodes.SelectionNode(concat_node, sub_selection) + ) + return bigframes.core.nodes.ConcatNode( + children=tuple(new_children), output_ids=tuple(selection.ids) + ) + # Nodes that pass through input columns + elif isinstance( + child, + ( + bigframes.core.nodes.RandomSampleNode, + bigframes.core.nodes.ReversedNode, + bigframes.core.nodes.OrderByNode, + bigframes.core.nodes.FilterNode, + bigframes.core.nodes.SliceNode, + bigframes.core.nodes.JoinNode, + bigframes.core.nodes.ExplodeNode, + ), + ): + ids = selection.consumed_ids | child.referenced_ids + return selection.replace_child( + child.transform_children(lambda x: prune_node(x, ids)) + ) + elif isinstance(child, bigframes.core.nodes.AggregateNode): + return selection.replace_child(prune_aggregate(child, selection.consumed_ids)) + elif isinstance(child, bigframes.core.nodes.LeafNode): + return selection.replace_child(prune_leaf(child, selection.consumed_ids)) + return selection + + +def prune_node( + node: bigframes.core.nodes.BigFrameNode, + ids: AbstractSet[identifiers.ColumnId], +): + # This clause is important, 
ensures idempotency, so can reach fixed point + if not (set(node.ids) - ids): + return node + else: + return bigframes.core.nodes.SelectionNode( + node, + tuple( + bigframes.core.nodes.AliasedRef.identity(id) + for id in node.ids + if id in ids + ), + ) + + +def prune_aggregate( + node: bigframes.core.nodes.AggregateNode, + used_cols: AbstractSet[identifiers.ColumnId], +) -> bigframes.core.nodes.AggregateNode: + pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) + return dataclasses.replace(node, aggregations=pruned_aggs) + + +@functools.singledispatch +def prune_leaf( + node: bigframes.core.nodes.BigFrameNode, + used_cols: AbstractSet[identifiers.ColumnId], +): + ... + + +@prune_leaf.register +def prune_readlocal( + node: bigframes.core.nodes.ReadLocalNode, + selection: AbstractSet[identifiers.ColumnId], +) -> bigframes.core.nodes.ReadLocalNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace( + node, + scan_list=new_scan_list, + offsets_col=node.offsets_col if (node.offsets_col in selection) else None, + ) + + +@prune_leaf.register +def prune_readtable( + node: bigframes.core.nodes.ReadTableNode, + selection: AbstractSet[identifiers.ColumnId], +) -> bigframes.core.nodes.ReadTableNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace(node, scan_list=new_scan_list) + + +def filter_scanlist( + scanlist: bigframes.core.nodes.ScanList, + ids: AbstractSet[identifiers.ColumnId], +): + result = bigframes.core.nodes.ScanList( + tuple(item for item in scanlist.items if item.id in ids) + ) + if len(result.items) == 0: + # We need to select something, or stuff breaks + result = bigframes.core.nodes.ScanList(scanlist.items[:1]) + return result diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 102ffcf773..87a7720e2f 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -120,7 +120,9 @@ def drop_cols( 
) -> nodes.SelectionNode: # adding a whole node that redefines the schema is a lot of overhead, should do something more efficient selections = tuple( - (scalar_exprs.DerefOp(id), id) for id in node.ids if id not in drop_cols + nodes.AliasedRef(scalar_exprs.DerefOp(id), id) + for id in node.ids + if id not in drop_cols ) return nodes.SelectionNode(node, selections) diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py new file mode 100644 index 0000000000..d740b28d7d --- /dev/null +++ b/bigframes/core/rewrite/timedeltas.py @@ -0,0 +1,114 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import functools +import typing + +from bigframes import dtypes +from bigframes import operations as ops +from bigframes.core import expression as ex +from bigframes.core import nodes, schema, utils + + +@dataclasses.dataclass +class _TypedExpr: + expr: ex.Expression + dtype: dtypes.Dtype + + +def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNode: + """ + Rewrites expressions to properly handle timedelta values, because this type does not exist + in the SQL world. 
+ """ + if isinstance(root, nodes.ProjectionNode): + updated_assignments = tuple( + (_rewrite_expressions(expr, root.schema).expr, column_id) + for expr, column_id in root.assignments + ) + root = nodes.ProjectionNode(root.child, updated_assignments) + + # TODO(b/394354614): FilterByNode and OrderNode also contain expressions. Need to update them too. + return root + + +@functools.cache +def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _TypedExpr: + if isinstance(expr, ex.DerefOp): + return _TypedExpr(expr, schema.get_type(expr.id.sql)) + + if isinstance(expr, ex.ScalarConstantExpression): + return _rewrite_scalar_constant_expr(expr) + + if isinstance(expr, ex.OpExpression): + updated_inputs = tuple( + map(lambda x: _rewrite_expressions(x, schema), expr.inputs) + ) + return _rewrite_op_expr(expr, updated_inputs) + + raise AssertionError(f"Unexpected expression type: {type(expr)}") + + +def _rewrite_scalar_constant_expr(expr: ex.ScalarConstantExpression) -> _TypedExpr: + if expr.dtype is dtypes.TIMEDELTA_DTYPE: + int_repr = utils.timedelta_to_micros(expr.value) # type: ignore + return _TypedExpr(ex.const(int_repr, expr.dtype), expr.dtype) + + return _TypedExpr(expr, expr.dtype) + + +def _rewrite_op_expr( + expr: ex.OpExpression, inputs: typing.Tuple[_TypedExpr, ...] 
+) -> _TypedExpr: + if isinstance(expr.op, ops.SubOp): + return _rewrite_sub_op(inputs[0], inputs[1]) + + if isinstance(expr.op, ops.AddOp): + return _rewrite_add_op(inputs[0], inputs[1]) + + input_types = tuple(map(lambda x: x.dtype, inputs)) + return _TypedExpr(expr, expr.op.output_type(*input_types)) + + +def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + result_op: ops.BinaryOp = ops.sub_op + if dtypes.is_datetime_like(left.dtype) and dtypes.is_datetime_like(right.dtype): + result_op = ops.timestamp_diff_op + + return _TypedExpr( + result_op.as_expr(left.expr, right.expr), + result_op.output_type(left.dtype, right.dtype), + ) + + +def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr( + ops.timestamp_add_op.as_expr(left.expr, right.expr), + ops.timestamp_add_op.output_type(left.dtype, right.dtype), + ) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right.dtype): + # Re-arrange operands such that timestamp is always on the left and timedelta is + # always on the right. + return _TypedExpr( + ops.timestamp_add_op.as_expr(right.expr, left.expr), + ops.timestamp_add_op.output_type(right.dtype, left.dtype), + ) + + return _TypedExpr( + ops.add_op.as_expr(left.expr, right.expr), + ops.add_op.output_type(left.dtype, right.dtype), + ) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 7cb2ec7535..0198f12537 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime import functools import re import typing @@ -18,6 +19,7 @@ import warnings import bigframes_vendored.pandas.io.common as vendored_pandas_io_common +import numpy as np import pandas as pd import pandas.api.types as pdtypes import typing_extensions @@ -187,9 +189,22 @@ def wrapper(*args, **kwargs): return decorator -def timedelta_to_micros(td: pd.Timedelta) -> int: - # td.value returns total nanoseconds. - return td.value // 1000 +def timedelta_to_micros( + timedelta: typing.Union[pd.Timedelta, datetime.timedelta, np.timedelta64] +) -> int: + if isinstance(timedelta, pd.Timedelta): + # pd.Timedelta.value returns total nanoseconds. + return timedelta.value // 1000 + + if isinstance(timedelta, np.timedelta64): + return timedelta.astype("timedelta64[us]").astype(np.int64) + + if isinstance(timedelta, datetime.timedelta): + return ( + (timedelta.days * 3600 * 24) + timedelta.seconds + ) * 1_000_000 + timedelta.microseconds + + raise TypeError(f"Unrecognized input type: {type(timedelta)}") def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index 2b45560b15..7758145fd4 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -18,8 +18,7 @@ import bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling -from bigframes.core import log_adapter -import bigframes.core as core +from bigframes.core import log_adapter, window_spec import bigframes.core.blocks as blocks import bigframes.operations.aggregations as agg_ops @@ -31,7 +30,7 @@ class Window(vendored_pandas_rolling.Window): def __init__( self, block: blocks.Block, - window_spec: core.WindowSpec, + window_spec: window_spec.WindowSpec, value_column_ids: typing.Sequence[str], drop_null_groups: bool = True, is_series: bool = False, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6308dcc8da..4ffa56c2e5 100644 --- a/bigframes/dataframe.py 
+++ b/bigframes/dataframe.py @@ -84,7 +84,7 @@ import bigframes.session - SingleItemValue = Union[bigframes.series.Series, int, float, Callable] + SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable] LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] @@ -180,7 +180,10 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - block = block.select_columns(list(columns)) # type:ignore + column_ids = [ + block.resolve_label_exact_or_error(label) for label in list(columns) + ] + block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -238,15 +241,7 @@ def _find_indices( return [self._block.value_columns.index(col_id) for col_id in col_ids] def _resolve_label_exact(self, label) -> Optional[str]: - """Returns the column id matching the label if there is exactly - one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, returns None.""" - matches = self._block.label_to_col_id.get(label, []) - if len(matches) > 1: - raise ValueError( - f"Multiple columns matching id {label} were found. 
{constants.FEEDBACK_LINK}" - ) - return matches[0] if len(matches) != 0 else None + return self._block.resolve_label_exact(label) def _sql_names( self, @@ -1953,7 +1948,7 @@ def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: result_block = result_block.drop_columns([src_col]) return DataFrame(result_block) - def _assign_scalar(self, label: str, value: Union[int, float]) -> DataFrame: + def _assign_scalar(self, label: str, value: Union[int, float, str]) -> DataFrame: col_ids = self._block.cols_matching_label(label) block, constant_col_id = self._block.create_constant(value, label) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 8b1ca3b0c8..eed45e1dde 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -56,7 +56,7 @@ TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) -TIMEDETLA_DTYPE = pd.ArrowDtype(pa.duration("us")) +TIMEDELTA_DTYPE = pd.ArrowDtype(pa.duration("us")) NUMERIC_DTYPE = pd.ArrowDtype(pa.decimal128(38, 9)) BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38)) # No arrow equivalent @@ -105,6 +105,9 @@ pd.Timestamp, datetime.date, datetime.time, + pd.Timedelta, + datetime.timedelta, + np.timedelta64, ] LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE) @@ -295,7 +298,10 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool: # See: https://stackoverflow.com/a/40312924/101923 and # https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html # for the way to identify object type. 
- return type_ in ("object", "O") or getattr(type_, "kind", None) == "O" + return type_ in ("object", "O") or ( + getattr(type_, "kind", None) == "O" + and getattr(type_, "storage", None) != "pyarrow" + ) def is_string_like(type_: ExpressionType) -> bool: @@ -417,7 +423,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: return pd.ArrowDtype(arrow_dtype) if pa.types.is_duration(arrow_dtype): - return pd.ArrowDtype(arrow_dtype) + return TIMEDELTA_DTYPE # BigFrames doesn't distinguish between string and large_string because the # largest string (2 GB) is already larger than the largest BigQuery row. @@ -559,6 +565,10 @@ def _is_bigframes_dtype(dtype) -> bool: def _infer_dtype_from_python_type(type: type) -> Dtype: + if type in (datetime.timedelta, pd.Timedelta, np.timedelta64): + # Must check timedelta type first. Otherwise other branches will be evaluated to true + # E.g. np.timedelta64 is a subclass of np.integer + return TIMEDELTA_DTYPE if issubclass(type, (bool, np.bool_)): return BOOL_DTYPE if issubclass(type, (int, np.integer)): diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index 7a5b24f413..b757e2b971 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import bigframes_vendored.constants as constants import bigframes_vendored.geopandas.geoseries as vendored_geoseries import geopandas.array # type: ignore @@ -39,3 +40,38 @@ def y(self) -> bigframes.series.Series: series = self._apply_unary_op(ops.geo_y_op) series.name = None return series + + # GeoSeries.area overrides Series.area with something totally different. + # Ignore this type error, as we are trying to be as close to geopandas as + # we can.
+ @property + def area(self, crs=None) -> bigframes.series.Series: # type: ignore + """Returns a Series containing the area of each geometry in the GeoSeries + expressed in the units of the CRS. + + Args: + crs (optional): + Coordinate Reference System of the geometry objects. Can be + anything accepted by pyproj.CRS.from_user_input(), such as an + authority string (eg “EPSG:4326”) or a WKT string. + + Returns: + bigframes.pandas.Series: + Series of float representing the areas. + + Raises: + NotImplementedError: + GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. + """ + raise NotImplementedError( + f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" + ) + + @classmethod + def from_xy(cls, x, y, index=None, session=None, **kwargs) -> GeoSeries: + # TODO: if either x or y is local and the other is remote. Use the + # session from the remote object. + series_x = bigframes.series.Series(x, index=index, session=session, **kwargs) + series_y = bigframes.series.Series(y, index=index, session=session, **kwargs) + + return cls(series_x._apply_binary_op(series_y, ops.geo_st_geogpoint_op)) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index d2e97a7608..72c49e124b 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,17 +16,19 @@ from __future__ import annotations -from typing import Callable, cast, Literal, Mapping, Optional +from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union import warnings import bigframes_vendored.constants as constants from google.cloud import bigquery import typing_extensions -from bigframes import clients, exceptions +from bigframes import clients, dtypes, exceptions +import bigframes.bigquery as bbq from bigframes.core import blocks, global_session, log_adapter import bigframes.dataframe from bigframes.ml import base, core, globals, utils +import bigframes.series _BQML_PARAMS_MAPPING = { "max_iterations":
"maxIterations", @@ -55,6 +57,8 @@ _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT, ) +_MULTIMODAL_EMBEDDING_001_ENDPOINT = "multimodalembedding@001" + _GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" @@ -83,6 +87,13 @@ _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, ) +_GEMINI_MULTIMODAL_ENDPOINTS = ( + _GEMINI_1P5_PRO_001_ENDPOINT, + _GEMINI_1P5_PRO_002_ENDPOINT, + _GEMINI_1P5_FLASH_001_ENDPOINT, + _GEMINI_1P5_FLASH_002_ENDPOINT, + _GEMINI_2_FLASH_EXP_ENDPOINT, +) _CLAUDE_3_SONNET_ENDPOINT = "claude-3-sonnet" _CLAUDE_3_HAIKU_ENDPOINT = "claude-3-haiku" @@ -753,6 +764,152 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TextEmbeddingGenerat return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger +class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): + """Multimodal embedding generator LLM model. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + model_name (str, Default to "multimodalembedding@001"): + The model for multimodal embedding. Can set to "multimodalembedding@001". Multimodal-embedding models returns model embeddings for text, image and video inputs. + Default to "multimodalembedding@001". + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format ... + If None, use default connection in session context. 
+ """ + + def __init__( + self, + *, + model_name: Literal["multimodalembedding@001"] = "multimodalembedding@001", + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + if not bigframes.options.experiments.blob: + raise NotImplementedError() + self.model_name = model_name + self.session = session or global_session.get_global_session() + self.connection_name = connection_name + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. + self.connection_name = self.session._create_bq_connection( + connection=self.connection_name, iam_role="aiplatform.user" + ) + + if self.model_name != _MULTIMODAL_EMBEDDING_001_ENDPOINT: + msg = _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=_MULTIMODAL_EMBEDDING_001_ENDPOINT, + ) + warnings.warn(msg) + + options = { + "endpoint": self.model_name, + } + return self._bqml_model_factory.create_remote_model( + session=self.session, connection_name=self.connection_name, options=options + ) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> MultimodalEmbeddingGenerator: + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] + model_endpoint = bqml_endpoint.split("/")[-1] + + model = cls( + session=session, + model_name=model_endpoint, # type: ignore + connection_name=model_connection, + ) + + model._bqml_model = core.BqmlModel(session, bq_model) + return model + + @property + def _predict_func( + self, + ) -> Callable[ + 
[bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame + ]: + return self._bqml_model.generate_embedding + + @property + def _status_col(self) -> str: + return _ML_GENERATE_EMBEDDING_STATUS + + def predict( + self, X: utils.ArrayType, *, max_retries: int = 0 + ) -> bigframes.dataframe.DataFrame: + """Predict the result from input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. + The content column must be of string type or BigFrames Blob of image or video. + + max_retries (int, default 0): + Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. + Each retry will append newly succeeded rows. When the max retries are reached, the remaining rows (the ones without successful predictions) will be appended to the end of the result. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + if max_retries < 0: + raise ValueError( + f"max_retries must be larger than or equal to 0, but is {max_retries}." 
+ ) + + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "content"}) + + # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input + if X["content"].dtype == dtypes.OBJ_REF_DTYPE: + X["content"] = X["content"].blob._get_runtime("R", with_metadata=True) + + options = { + "flatten_json_output": True, + } + + return self._predict_and_retry(X, options=options, max_retries=max_retries) + + def to_gbq( + self, model_name: str, replace: bool = False + ) -> MultimodalEmbeddingGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + The name of the model. + replace (bool, default False): + Determine whether to replace if the model already exists. Default to False. + + Returns: + MultimodalEmbeddingGenerator: Saved model.""" + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. @@ -925,12 +1082,13 @@ def predict( top_p: float = 1.0, ground_with_google_search: bool = False, max_retries: int = 0, + prompt: Optional[Iterable[Union[str, bigframes.series.Series]]] = None, ) -> bigframes.dataframe.DataFrame: """Predict the result from input DataFrame. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, the "prompt" column, or created by "prompt" parameter, is used for prediction. 
Prompts can include preamble, questions, suggestions, instructions, or examples. temperature (float, default 0.9): @@ -966,6 +1124,14 @@ def predict( max_retries (int, default 0): Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. Each retry will append newly succeeded rows. When the max retries are reached, the remaining rows (the ones without successful predictions) will be appended to the end of the result. + + prompt (Iterable of str or bigframes.series.Series, or None, default None): + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Construct a prompt struct column for prediction based on the input. The input must be an Iterable that can take string literals, + such as "summarize", string column(s) of X, such as X["str_col"], or blob column(s) of X, such as X["blob_col"]. + It creates a struct column of the items of the iterable, and use the concatenated result as the input prompt. No-op if set to None. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ @@ -990,7 +1156,38 @@ def predict( f"max_retries must be larger than or equal to 0, but is {max_retries}." ) - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + session = self._bqml_model.session + (X,) = utils.batch_convert_to_dataframe(X, session=session) + + if prompt: + if not bigframes.options.experiments.blob: + raise NotImplementedError() + + if self.model_name not in _GEMINI_MULTIMODAL_ENDPOINTS: + raise NotImplementedError( + f"GeminiTextGenerator only supports model_name {', '.join(_GEMINI_MULTIMODAL_ENDPOINTS)} for Multimodal prompt." 
+ ) + + df_prompt = X[[X.columns[0]]].rename( + columns={X.columns[0]: "bigframes_placeholder_col"} + ) + for i, item in enumerate(prompt): + # must be distinct str column labels to construct a struct + if isinstance(item, str): + label = f"input_{i}" + else: # Series + label = f"input_{i}_{item.name}" + + # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input + if ( + isinstance(item, bigframes.series.Series) + and item.dtype == dtypes.OBJ_REF_DTYPE + ): + item = item.blob._get_runtime("R", with_metadata=True) + + df_prompt[label] = item + df_prompt = df_prompt.drop(columns="bigframes_placeholder_col") + X["prompt"] = bbq.struct(df_prompt) if len(X.columns) == 1: # BQML identified the column by name diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 5d52927ded..eef72584bc 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -75,6 +75,7 @@ llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator, + llm._MULTIMODAL_EMBEDDING_001_ENDPOINT: llm.MultimodalEmbeddingGenerator, } ) @@ -98,6 +99,7 @@ def from_bq( llm.PaLM2TextEmbeddingGenerator, llm.Claude3TextGenerator, llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, preprocessing.PreprocessingType, diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index e1620485d5..e034fd00f7 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -100,6 +100,9 @@ def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: model_name = model_endpoint version = None + if model_endpoint.startswith("multimodalembedding"): + return model_name, version + at_idx = model_endpoint.find("@") if at_idx != -1: version = model_endpoint[at_idx + 1 :] diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 
d8b0447686..88406317fe 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -49,6 +49,7 @@ date_op, StrftimeOp, time_op, + timestamp_diff_op, ToDatetimeOp, ToTimestampOp, UnixMicros, @@ -84,7 +85,12 @@ SqlScalarOp, where_op, ) -from bigframes.operations.geo_ops import geo_x_op, geo_y_op +from bigframes.operations.geo_ops import ( + geo_area_op, + geo_st_geogpoint_op, + geo_x_op, + geo_y_op, +) from bigframes.operations.json_ops import ( JSONExtract, JSONExtractArray, @@ -97,6 +103,7 @@ from bigframes.operations.numeric_ops import ( abs_op, add_op, + AddOp, arccos_op, arccosh_op, arcsin_op, @@ -125,6 +132,7 @@ sinh_op, sqrt_op, sub_op, + SubOp, tan_op, tanh_op, unsafe_pow_op, @@ -170,7 +178,7 @@ ) from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op -from bigframes.operations.timedelta_ops import ToTimedeltaOp +from bigframes.operations.timedelta_ops import timestamp_add_op, ToTimedeltaOp __all__ = [ # Base ops @@ -242,10 +250,12 @@ "second_op", "normalize_op", # Timedelta ops + "timestamp_add_op", "ToTimedeltaOp", # Datetime ops "date_op", "time_op", + "timestamp_diff_op", "ToDatetimeOp", "ToTimestampOp", "StrftimeOp", @@ -255,6 +265,7 @@ # Numeric ops "abs_op", "add_op", + "AddOp", "arccos_op", "arccosh_op", "arcsin_op", @@ -283,6 +294,7 @@ "sinh_op", "sqrt_op", "sub_op", + "SubOp", "tan_op", "tanh_op", "unsafe_pow_op", @@ -332,6 +344,8 @@ # Geo ops "geo_x_op", "geo_y_op", + "geo_area_op", + "geo_st_geogpoint_op", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 205a9fcf5c..7fa4dd9633 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -224,6 +224,54 @@ def display_single_url(read_url: str, content_type: str): for _, row in df.iterrows(): display_single_url(row["read_url"], row["content_type"]) + def 
_resolve_connection(self, connection: Optional[str] = None) -> str: + """Resovle the BigQuery connection. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" is + str. If None, uses default connection of the session. + + Returns: + str: the resolved BigQuery connection string in the format: + "project.location.connection_id". + + Raises: + ValueError: If the connection cannot be resolved to a valid string. + """ + connection = connection or self._block.session._bq_connection + return clients.resolve_full_bq_connection_name( + connection, + default_project=self._block.session._project, + default_location=self._block.session._location, + ) + + def _get_runtime_json_str( + self, mode: str = "R", with_metadata: bool = False + ) -> bigframes.series.Series: + """Get the runtime and apply the ToJSONSTring transformation. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + mode(str or str, default "R"): the mode for accessing the runtime. + Default to "R". Possible values are "R" (read-only) and + "RW" (read-write) + with_metadata (bool, default False): whether to include metadata + in the JOSN string. Default to False. + + Returns: + str: the runtime object in the JSON string. 
+ """ + runtime = self._get_runtime(mode=mode, with_metadata=with_metadata) + return runtime._apply_unary_op(ops.ToJSONString()) + def image_blur( self, ksize: tuple[int, int], @@ -246,12 +294,7 @@ def image_blur( """ import bigframes.blob._functions as blob_func - connection = connection or self._block.session._bq_connection - connection = clients.resolve_full_bq_connection_name( - connection, - default_project=self._block.session._project, - default_location=self._block.session._location, - ) + connection = self._resolve_connection(connection) if isinstance(dst, str): dst = os.path.join(dst, "") @@ -268,11 +311,8 @@ def image_blur( connection=connection, ).udf() - src_rt = self._get_runtime(mode="R") - dst_rt = dst.blob._get_runtime(mode="RW") - - src_rt = src_rt._apply_unary_op(ops.ToJSONString()) - dst_rt = dst_rt._apply_unary_op(ops.ToJSONString()) + src_rt = self._get_runtime_json_str(mode="R") + dst_rt = dst.blob._get_runtime_json_str(mode="RW") df = src_rt.to_frame().join(dst_rt.to_frame(), how="outer") df["ksize_x"], df["ksize_y"] = ksize @@ -281,3 +321,93 @@ def image_blur( res.cache() # to execute the udf return dst + + def pdf_extract( + self, *, connection: Optional[str] = None + ) -> bigframes.series.Series: + """Extracts and chunks text from PDF URLs and saves the text as + arrays of string. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. 
+ + Returns: + bigframes.series.Series: conatins all text from a pdf file + """ + + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + + pdf_chunk_udf = blob_func.TransformFunction( + blob_func.pdf_extract_def, + session=self._block.session, + connection=connection, + ).udf() + + src_rt = self._get_runtime_json_str(mode="R") + res = src_rt.apply(pdf_chunk_udf) + return res + + def pdf_chunk( + self, + *, + connection: Optional[str] = None, + chunk_size: int = 1000, + overlap_size: int = 200, + ) -> bigframes.series.Series: + """Extracts and chunks text from PDF URLs and saves the text as + arrays of strings. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. + chunk_size (int, default 1000): the desired size of each text chunk + (number of characters). + overlap_size (int, default 200): the number of overlapping characters + between consective chunks. The helps to ensure context is + perserved across chunk boundaries. + + Returns: + bigframe.series.Series of array[str], where each string is a + chunk of text extracted from PDF. 
+ """ + + import bigframes.bigquery as bbq + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + + if chunk_size <= 0: + raise ValueError("chunk_size must be a positive integer.") + if overlap_size < 0: + raise ValueError("overlap_size must be a non-negative integer.") + if overlap_size >= chunk_size: + raise ValueError("overlap_size must be smaller than chunk_size.") + + pdf_chunk_udf = blob_func.TransformFunction( + blob_func.pdf_chunk_def, + session=self._block.session, + connection=connection, + ).udf() + + src_rt = self._get_runtime_json_str(mode="R") + df = src_rt.to_frame() + df["chunk_size"] = chunk_size + df["overlap_size"] = overlap_size + + res = df.apply(pdf_chunk_udf, axis=1) + + res_array = bbq.json_extract_string_array(res) + return res_array diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 5086de27d3..3ea4c652f1 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -107,3 +107,22 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT if input_types[0] is not dtypes.TIMESTAMP_DTYPE: raise TypeError("expected timestamp input") return dtypes.INT_DTYPE + + +@dataclasses.dataclass(frozen=True) +class TimestampDiff(base_ops.BinaryOp): + name: typing.ClassVar[str] = "timestamp_diff" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not input_types[1]: + raise TypeError( + f"two inputs have different types. 
left: {input_types[0]}, right: {input_types[1]}" + ) + + if not dtypes.is_datetime_like(input_types[0]): + raise TypeError("expected timestamp input") + + return dtypes.TIMEDELTA_DTYPE + + +timestamp_diff_op = TimestampDiff() diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 73e7e89197..0ae8accd56 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -29,3 +29,14 @@ dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) + +geo_area_op = base_ops.create_unary_op( + name="geo_area", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" + ), +) + +geo_st_geogpoint_op = base_ops.create_binary_op( + name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() +) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 939330954d..5183e5c4c5 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -116,12 +116,18 @@ def output_type(self, *input_types): if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: # String addition return input_types[0] + + # Timestamp addition. 
+ if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + return left_type + if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): + return right_type + if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) ): # Numeric addition return dtypes.coerce_to_common(left_type, right_type) - # TODO: Add temporal addition once delta types supported raise TypeError(f"Cannot add dtypes {left_type} and {right_type}") @@ -141,7 +147,10 @@ def output_type(self, *input_types): ): # Numeric subtraction return dtypes.coerce_to_common(left_type, right_type) - # TODO: Add temporal addition once delta types supported + + if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 0bcd6eb08f..69e054fa5c 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -25,7 +25,32 @@ class ToTimedeltaOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "to_timedelta" unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] - def output_type(self, *input_types): - if input_types[0] is not dtypes.INT_DTYPE: - raise TypeError("expected integer input") - return dtypes.TIMEDETLA_DTYPE + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): + return dtypes.TIMEDELTA_DTYPE + raise TypeError("expected integer or float input") + + +@dataclasses.dataclass(frozen=True) +class TimestampAdd(base_ops.BinaryOp): + name: typing.ClassVar[str] = "timestamp_add" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # timestamp + timedelta => timestamp + if ( + dtypes.is_datetime_like(input_types[0]) + and 
input_types[1] is dtypes.TIMEDELTA_DTYPE + ): + return input_types[0] + # timedelta + timestamp => timestamp + if input_types[0] is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like( + input_types[1] + ): + return input_types[1] + + raise TypeError( + f"unsupported types for timestamp_add. left: {input_types[0]} right: {input_types[1]}" + ) + + +timestamp_add_op = TimestampAdd() diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 441134aff5..86bb56fc39 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -121,6 +121,20 @@ def output_type( return bigframes.dtypes.coerce_to_common(left_type, right_type) +@dataclasses.dataclass +class BinaryNumericGeo(BinaryTypeSignature): + """Type signature for geo functions like from_xy that can map ints to ints.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_numeric(left_type): + raise TypeError(f"Type {left_type} is not numeric") + if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): + raise TypeError(f"Type {right_type} is not numeric") + return bigframes.dtypes.GEO_DTYPE + + @dataclasses.dataclass class BinaryRealNumeric(BinaryTypeSignature): """Type signature for real-valued functions like divide, arctan2, pow.""" diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py index 0cedf425fe..070a41d62d 100644 --- a/bigframes/pandas/core/tools/timedeltas.py +++ b/bigframes/pandas/core/tools/timedeltas.py @@ -18,20 +18,26 @@ timedeltas as vendored_pandas_timedeltas, ) import pandas as pd +import pandas.api.types as pdtypes from bigframes import operations as ops -from bigframes import series +from bigframes import series, session def to_timedelta( - arg: typing.Union[series.Series, str, int, float], + arg, unit: typing.Optional[vendored_pandas_timedeltas.UnitChoices] = None, -) -> 
typing.Union[series.Series, pd.Timedelta]: - if not isinstance(arg, series.Series): - return pd.to_timedelta(arg, unit) + *, + session: typing.Optional[session.Session] = None, +): + if isinstance(arg, series.Series): + canonical_unit = "us" if unit is None else _canonicalize_unit(unit) + return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) - canonical_unit = "us" if unit is None else _canonicalize_unit(unit) - return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) + if pdtypes.is_list_like(arg): + return to_timedelta(series.Series(arg), unit, session=session) + + return pd.to_timedelta(arg, unit) to_timedelta.__doc__ = vendored_pandas_timedeltas.to_timedelta.__doc__ diff --git a/bigframes/series.py b/bigframes/series.py index 706c0f4f09..af9fce6e20 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -805,10 +805,10 @@ def __rsub__(self, other: float | int | Series) -> Series: __rsub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rsub__) - def sub(self, other: float | int | Series) -> Series: + def sub(self, other) -> Series: return self._apply_binary_op(other, ops.sub_op) - def rsub(self, other: float | int | Series) -> Series: + def rsub(self, other) -> Series: return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index ba693696c3..b7550583e5 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -177,7 +177,7 @@ def read_pandas_load_job( destination_table = self._bqclient.get_table(load_table_destination) col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - col: bigframes.dtypes.TIMEDETLA_DTYPE + col: bigframes.dtypes.TIMEDELTA_DTYPE for col in df_and_labels.timedelta_cols } array_value = core.ArrayValue.from_table( @@ -236,7 +236,7 @@ def read_pandas_streaming( ) col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - col: bigframes.dtypes.TIMEDETLA_DTYPE + col: 
bigframes.dtypes.TIMEDELTA_DTYPE for col in df_and_labels.timedelta_cols } array_value = ( diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 90c638b82e..2180a66207 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -24,7 +24,7 @@ from google.cloud import bigquery from bigframes import dataframe -from bigframes.core import log_adapter +from bigframes.core import log_adapter, nodes import bigframes.exceptions as bfe import bigframes.session @@ -54,7 +54,7 @@ def _curate_df_doc(doc: Optional[str]): class StreamingBase: - sql: str + _appends_sql: str _session: bigframes.session.Session def to_bigtable( @@ -124,7 +124,7 @@ def to_bigtable( can be examined. """ return _to_bigtable( - self.sql, + self._appends_sql, instance=instance, table=table, service_account_email=service_account_email, @@ -181,7 +181,7 @@ def to_pubsub( can be examined. """ return _to_pubsub( - self.sql, + self._appends_sql, topic=topic, service_account_email=service_account_email, session=self._session, @@ -218,6 +218,19 @@ def __init__(self, df: dataframe.DataFrame, *, create_key=0): def _from_table_df(cls, df: dataframe.DataFrame) -> StreamingDataFrame: return cls(df, create_key=cls._create_key) + @property + def _original_table(self): + def traverse(node: nodes.BigFrameNode): + if isinstance(node, nodes.ReadTableNode): + return f"{node.source.table.project_id}.{node.source.table.dataset_id}.{node.source.table.table_id}" + for child in node.child_nodes: + original_table = traverse(child) + if original_table: + return original_table + return None + + return traverse(self._df._block._expr.node) + def __getitem__(self, *args, **kwargs): return _return_type_wrapper(self._df.__getitem__, StreamingDataFrame)( *args, **kwargs @@ -266,6 +279,17 @@ def sql(self): sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql)) + # Patch for the required APPENDS clause + @property + def _appends_sql(self): + sql_str = self.sql + 
original_table = self._original_table + assert original_table is not None + + appends_clause = f"APPENDS(TABLE `{original_table}`, NULL, NULL)" + sql_str = sql_str.replace(f"`{original_table}`", appends_clause) + return sql_str + @property def _session(self): return self._df._session diff --git a/bigframes/version.py b/bigframes/version.py index d9b9875805..e92072bea8 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.35.0" +__version__ = "1.36.0" diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index c17a1788df..d57ab1c8ac 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -209,7 +209,7 @@ name: bigframes.bigquery - items: - name: GeoSeries - uid: bigframes.geopandas + uid: bigframes.geopandas.GeoSeries name: bigframes.geopandas - items: - name: Overview diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index 160d19ce91..4792c4fe27 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -37,7 +37,6 @@ "import bigframes\n", "import bigframes.geopandas\n", "import bigframes.pandas as bpd\n", - "import shapely\n", "bpd.options.display.progress_bar = None" ] }, @@ -45,7 +44,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Load the Counties table from the Census Bureau US Boundaries dataset" + "### 1. Load the Counties table from the Census Bureau US Boundaries dataset" ] }, { @@ -57,7 +56,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:274: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. 
To avoid this warning, set at least one of: `index_col` or `filters`.\n", + "/usr/local/google/home/arwas/src/bigframes3/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" ] } @@ -70,7 +69,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create a series from the int_point_geom column" + "### 2. Create a series from the int_point_geom column" ] }, { @@ -98,21 +97,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "54 POINT (-93.47523 45.00612)\n", - "256 POINT (-89.60507 42.67552)\n", - "266 POINT (-104.11408 39.31516)\n", - "485 POINT (-91.23193 32.34688)\n", - "765 POINT (-83.42808 38.20427)\n", + "171 POINT (-95.50742 42.39186)\n", + "219 POINT (-105.42894 37.27755)\n", + "402 POINT (-93.34905 32.10121)\n", + "526 POINT (-84.60469 43.29233)\n", + "677 POINT (-89.5681 37.04779)\n", "Name: int_point_geom, dtype: geometry" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -129,13 +128,6 @@ "### Convert the five geo points to `bigframes.gopandas.GeoSeries`" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Note: TypeError is raised if the GEOGRAPHY column contains geometry type other than `Point`." 
- ] - }, { "cell_type": "code", "execution_count": 6, @@ -144,11 +136,11 @@ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-95.50742 42.39186)\n", + "1 POINT (-105.42894 37.27755)\n", + "2 POINT (-93.34905 32.10121)\n", + "3 POINT (-84.60469 43.29233)\n", + "4 POINT (-89.5681 37.04779)\n", "dtype: geometry" ] }, @@ -171,6 +163,13 @@ "### Retrieve the x (longitude) and y (latitude) from the GeoSeries with `.x` and `.y`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Note: TypeError is raised if `.x` and `.y` are used with a geometry type other than `Point`." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -186,11 +185,11 @@ { "data": { "text/plain": [ - "0 -86.873385\n", - "1 -118.48037\n", - "2 -92.5617\n", - "3 -83.461893\n", - "4 -119.467788\n", + "0 -95.507421\n", + "1 -105.42894\n", + "2 -93.34905\n", + "3 -84.60469\n", + "4 -89.568097\n", "dtype: Float64" ] }, @@ -218,11 +217,11 @@ { "data": { "text/plain": [ - "0 38.373344\n", - "1 46.254606\n", - "2 32.30429\n", - "3 39.555246\n", - "4 47.213633\n", + "0 42.39186\n", + "1 37.277547\n", + "2 32.101213\n", + "3 43.292326\n", + "4 37.047793\n", "dtype: Float64" ] }, @@ -251,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -285,7 +284,7 @@ "dtype: Float64" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -303,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -337,7 +336,7 @@ "dtype: Float64" ] }, - "execution_count": 14, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -345,6 +344,199 @@ "source": [ "point_geom_series.geo.y" ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## Retrive the `area` of different geometry shapes. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Create a geometry collection from local data with `Peek`" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "54 POLYGON ((-93.76575 45.06448, -93.76575 45.064...\n", + "256 POLYGON ((-89.83723 42.68318, -89.83732 42.682...\n", + "266 POLYGON ((-104.19381 39.56523, -104.19464 39.5...\n", + "485 MULTIPOLYGON (((-91.05884 32.17233, -91.05891 ...\n", + "765 POLYGON ((-83.61848 38.1557, -83.61861 38.1554...\n", + "Name: county_geom, dtype: geometry" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_series = df[\"county_geom\"].peek(n = 5)\n", + "geom_series" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Convert the geometry collection to `bigframes.gopandas.GeoSeries`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POLYGON ((-93.76575 45.06448, -93.76575 45.064...\n", + "1 POLYGON ((-89.83723 42.68318, -89.83732 42.682...\n", + "2 POLYGON ((-104.19381 39.56523, -104.19464 39.5...\n", + "3 MULTIPOLYGON (((-91.05884 32.17233, -91.05891 ...\n", + "4 POLYGON ((-83.61848 38.1557, -83.61861 38.1554...\n", + "dtype: geometry" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "five_geom = bigframes.geopandas.GeoSeries(\n", + " [point for point in geom_series]\n", + ")\n", + "five_geom" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "source": [ + "## Note: `bigframes.geopandas.GeoSeries.area` raises NotImplementedError. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [ + { + "ename": "NotImplementedError", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.35.0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", + "File \u001b[0;32m~/src/bigframes3/bigframes/geopandas/geoseries.py:66\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. 
Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 68\u001b[0m )\n", + "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.35.0" + ] + } + ], + "source": [ + "five_geom.area" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Use `bigframes.bigquery.st_area` to retirive the `area` in square meters instead. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.bigquery as bbq" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1567505274.453911\n", + "1 1511436852.079554\n", + "2 4789800692.948824\n", + "3 1686877416.586061\n", + "4 740944862.916908\n", + "dtype: Float64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_area = bbq.st_area(five_geom)\n", + "geom_area" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use `bigframes.geopandas.GeoSeries.from_xy()` to create a GeoSeries of `Point` geometries. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Reuse the `geo_points.x` and `geo_points.y` results by passing them to `.from_xy()` " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-95.50742 42.39186)\n", + "1 POINT (-105.42894 37.27755)\n", + "2 POINT (-93.34905 32.10121)\n", + "3 POINT (-84.60469 43.29233)\n", + "4 POINT (-89.5681 37.04779)\n", + "dtype: geometry" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bigframes.geopandas.GeoSeries.from_xy(geo_points.x, geo_points.y)" + ] } ], "metadata": { diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 90186b297d..6b0682bb1a 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -118,7 +118,7 @@ "metadata": {}, "outputs": [], "source": [ - "#%pip install --upgrade" + "#%pip install --upgrade bigframes" ] }, { diff --git a/scripts/dev-utils/tpcds_upload_helper.py b/scripts/dev-utils/tpcds_upload_helper.py new file mode 100644 index 0000000000..52bb553cd8 --- /dev/null +++ b/scripts/dev-utils/tpcds_upload_helper.py @@ -0,0 +1,597 @@ +import argparse +import csv +import os +import sys + +import google.api_core.exceptions +from google.cloud import bigquery + + +def preprocess_csv(input_file_path, output_file_path): + try: + with open( + input_file_path, mode="r", newline="", encoding="utf-8" + ) as infile, open( + output_file_path, mode="w", newline="", encoding="utf-8" + ) as outfile: + reader = csv.reader(infile, delimiter="|") + writer = csv.writer(outfile, delimiter="|") + + for row in reader: + writer.writerow(row[:-1]) + except Exception as e: + print(f"An error occurred: {e}") + + +def get_schema(table_name): + schema = { + "customer_address": [ + 
bigquery.SchemaField("ca_address_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ca_address_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("ca_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ca_location_type", "STRING", mode="NULLABLE"), + ], + "customer_demographics": [ + bigquery.SchemaField("cd_demo_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cd_gender", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_marital_status", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_education_status", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_purchase_estimate", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cd_credit_rating", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_dep_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cd_dep_employed_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cd_dep_college_count", "INTEGER", mode="NULLABLE"), + ], + "date_dim": [ + bigquery.SchemaField("d_date_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("d_date_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("d_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("d_month_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_week_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_quarter_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_year", "INTEGER", 
mode="NULLABLE"), + bigquery.SchemaField("d_dow", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_moy", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_dom", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_qoy", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_fy_year", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_fy_quarter_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_fy_week_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_day_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_quarter_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_holiday", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_weekend", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_following_holiday", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_first_dom", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_last_dom", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_same_day_ly", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_same_day_lq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_current_day", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_week", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_month", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_quarter", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_year", "STRING", mode="NULLABLE"), + ], + "warehouse": [ + bigquery.SchemaField("w_warehouse_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("w_warehouse_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("w_warehouse_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_warehouse_sq_ft", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("w_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_suite_number", "STRING", 
mode="NULLABLE"), + bigquery.SchemaField("w_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_gmt_offset", "FLOAT", mode="NULLABLE"), + ], + "ship_mode": [ + bigquery.SchemaField("sm_ship_mode_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("sm_ship_mode_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("sm_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("sm_code", "STRING", mode="NULLABLE"), + bigquery.SchemaField("sm_carrier", "STRING", mode="NULLABLE"), + bigquery.SchemaField("sm_contract", "STRING", mode="NULLABLE"), + ], + "time_dim": [ + bigquery.SchemaField("t_time_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("t_time_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("t_time", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_hour", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_minute", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_second", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_am_pm", "STRING", mode="NULLABLE"), + bigquery.SchemaField("t_shift", "STRING", mode="NULLABLE"), + bigquery.SchemaField("t_sub_shift", "STRING", mode="NULLABLE"), + bigquery.SchemaField("t_meal_time", "STRING", mode="NULLABLE"), + ], + "reason": [ + bigquery.SchemaField("r_reason_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("r_reason_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("r_reason_desc", "STRING", mode="NULLABLE"), + ], + "income_band": [ + bigquery.SchemaField("ib_income_band_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ib_lower_bound", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ib_upper_bound", "INTEGER", mode="NULLABLE"), + ], + "item": [ + bigquery.SchemaField("i_item_sk", "INTEGER", 
mode="REQUIRED"), + bigquery.SchemaField("i_item_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("i_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("i_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("i_item_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_current_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("i_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("i_brand_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_brand", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_class_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_category_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_category", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_manufact_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_manufact", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_size", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_formulation", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_color", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_units", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_container", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_manager_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_product_name", "STRING", mode="NULLABLE"), + ], + "store": [ + bigquery.SchemaField("s_store_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("s_store_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("s_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("s_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("s_closed_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_store_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_number_employees", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_floor_space", "INTEGER", mode="NULLABLE"), + 
bigquery.SchemaField("s_hours", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_market_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_geography_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_market_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_market_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_division_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_division_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_company_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_company_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("s_tax_precentage", "FLOAT", mode="NULLABLE"), + ], + "call_center": [ + bigquery.SchemaField("cc_call_center_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cc_call_center_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("cc_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("cc_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("cc_closed_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_open_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_class", "STRING", mode="NULLABLE"), + 
bigquery.SchemaField("cc_employees", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_sq_ft", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_hours", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_mkt_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_mkt_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_mkt_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_market_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_division", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_division_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_company", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_company_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cc_tax_percentage", "FLOAT", mode="NULLABLE"), + ], + "customer": [ + bigquery.SchemaField("c_customer_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("c_customer_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("c_current_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_current_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_current_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_first_shipto_date_sk", "INTEGER", mode="NULLABLE"), + 
bigquery.SchemaField("c_first_sales_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_salutation", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_first_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_last_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_preferred_cust_flag", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_birth_day", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_birth_month", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_birth_year", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_birth_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_login", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_email_address", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_last_review_date_sk", "STRING", mode="NULLABLE"), + ], + "web_site": [ + bigquery.SchemaField("web_site_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("web_site_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("web_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("web_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("web_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_open_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_close_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_mkt_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_mkt_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_mkt_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_market_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_company_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_company_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_street_name", 
"STRING", mode="NULLABLE"), + bigquery.SchemaField("web_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("web_tax_percentage", "FLOAT", mode="NULLABLE"), + ], + "store_returns": [ + bigquery.SchemaField("sr_returned_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_return_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("sr_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_store_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_reason_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_ticket_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("sr_return_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_return_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_return_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_return_amt_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_fee", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_return_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_refunded_cash", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_reversed_charge", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_store_credit", "FLOAT", mode="NULLABLE"), + 
bigquery.SchemaField("sr_net_loss", "FLOAT", mode="NULLABLE"), + ], + "household_demographics": [ + bigquery.SchemaField("hd_demo_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("hd_income_band_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("hd_buy_potential", "STRING", mode="NULLABLE"), + bigquery.SchemaField("hd_dep_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("hd_vehicle_count", "INTEGER", mode="NULLABLE"), + ], + "web_page": [ + bigquery.SchemaField("wp_web_page_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("wp_web_page_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("wp_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("wp_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("wp_creation_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_access_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_autogen_flag", "STRING", mode="NULLABLE"), + bigquery.SchemaField("wp_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_url", "STRING", mode="NULLABLE"), + bigquery.SchemaField("wp_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("wp_char_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_link_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_image_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_max_ad_count", "INTEGER", mode="NULLABLE"), + ], + "promotion": [ + bigquery.SchemaField("p_promo_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("p_promo_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("p_start_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_end_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_item_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("p_response_target", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_promo_name", "STRING", mode="NULLABLE"), + 
bigquery.SchemaField("p_channel_dmail", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_email", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_catalog", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_tv", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_radio", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_press", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_event", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_demo", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_details", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_purpose", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_discount_active", "STRING", mode="NULLABLE"), + ], + "catalog_page": [ + bigquery.SchemaField("cp_catalog_page_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cp_catalog_page_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("cp_start_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_end_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_department", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cp_catalog_number", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_catalog_page_number", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_description", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cp_type", "STRING", mode="NULLABLE"), + ], + "inventory": [ + bigquery.SchemaField("inv_date_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("inv_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("inv_warehouse_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("inv_quantity_on_hand", "INTEGER", mode="NULLABLE"), + ], + "catalog_returns": [ + bigquery.SchemaField("cr_returned_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_returned_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_item_sk", "INTEGER", mode="REQUIRED"), + 
bigquery.SchemaField("cr_refunded_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField( + "cr_returning_customer_sk", "INTEGER", mode="NULLABLE" + ), + bigquery.SchemaField("cr_returning_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_returning_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_returning_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_call_center_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_catalog_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_ship_mode_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_warehouse_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_reason_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cr_return_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_return_amount", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_return_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_return_amt_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_fee", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_return_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_cash", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_reversed_charge", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_store_credit", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_net_loss", "FLOAT", mode="NULLABLE"), + ], + "web_returns": [ + bigquery.SchemaField("wr_returned_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_returned_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_item_sk", "INTEGER", mode="REQUIRED"), + 
bigquery.SchemaField("wr_refunded_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField( + "wr_returning_customer_sk", "INTEGER", mode="NULLABLE" + ), + bigquery.SchemaField("wr_returning_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_returning_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_returning_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_web_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_reason_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("wr_return_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_return_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_return_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_return_amt_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_fee", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_return_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_cash", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_reversed_charge", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_account_credit", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_net_loss", "FLOAT", mode="NULLABLE"), + ], + "web_sales": [ + bigquery.SchemaField("ws_sold_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_sold_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ws_bill_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_bill_cdemo_sk", "INTEGER", mode="NULLABLE"), + 
bigquery.SchemaField("ws_bill_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_bill_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_web_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_web_site_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_mode_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_warehouse_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_promo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ws_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_discount_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_coupon_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid_inc_ship", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid_inc_ship_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_profit", "FLOAT", mode="NULLABLE"), + ], + "catalog_sales": [ 
+ bigquery.SchemaField("cs_sold_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_sold_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_call_center_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_catalog_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_mode_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_warehouse_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cs_promo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cs_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_discount_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_coupon_amt", "FLOAT", mode="NULLABLE"), + 
bigquery.SchemaField("cs_ext_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid_inc_ship", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid_inc_ship_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_profit", "FLOAT", mode="NULLABLE"), + ], + "store_sales": [ + bigquery.SchemaField("ss_sold_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_sold_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ss_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_store_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_promo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_ticket_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ss_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_discount_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_coupon_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_net_paid", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_net_paid_inc_tax", "FLOAT", mode="NULLABLE"), + 
bigquery.SchemaField("ss_net_profit", "FLOAT", mode="NULLABLE"), + ], + } + + return schema[table_name] + + +def load_data_to_bigquery(table_name, file_paths, client, dataset_ref, temp_file): + """Loads data from a list of files into a BigQuery table.""" + job_config = bigquery.LoadJobConfig( + source_format=bigquery.SourceFormat.CSV, + skip_leading_rows=0, # No header in .dat files + field_delimiter="|", + schema=get_schema(table_name), + ) + + table_ref = dataset_ref.table(table_name) + table = bigquery.Table(table_ref) + client.create_table(table) + + # Load data from each file + for file_path in sorted(file_paths): + preprocess_csv(file_path, temp_file) + with open(temp_file, "rb") as source_file: + job = client.load_table_from_file( + source_file, table_ref, job_config=job_config + ) + job.result() + print( + f"Loaded data from {file_path} into table {project_id}:{dataset_id}.{table_name}" + ) + + +if __name__ == "__main__": + """ + Loads TPC-DS data to BigQuery. + + This script loads TPC-DS data generated with source code from + https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp + into BigQuery. + + Note: If the dataset already exists, the script will exit without uploading data. 
+ + Usage: + python tpcds_upload_helper.py --project_id --dataset_id --ds_path + python tpcds_upload_helper.py -d -p -s + """ + parser = argparse.ArgumentParser(description="Load TPC-DS data to BigQuery") + parser.add_argument( + "--project_id", "-p", required=True, help="Google Cloud project ID" + ) + parser.add_argument("--dataset_id", "-d", required=True, help="BigQuery dataset ID") + parser.add_argument( + "--ds_path", "-s", required=True, help="Path to the TPC-DS data directory" + ) + args = parser.parse_args() + + project_id = args.project_id + dataset_id = args.dataset_id + ds_path = args.ds_path + temp_file = "temp.csv" + + # Initialize BigQuery client + client = bigquery.Client(project=project_id) + dataset_ref = client.dataset(dataset_id) + try: + # Quit if dataset exists + client.get_dataset(dataset_ref) + print(f"Dataset {project_id}:{dataset_id} already exists. Skipping.") + sys.exit(1) + except google.api_core.exceptions.NotFound: + # Create the dataset if it doesn't exist + dataset = bigquery.Dataset(dataset_ref) + client.create_dataset(dataset) + print(f"Created dataset {project_id}:{dataset_id}") + + # Iterate through the folders + for table_name in sorted(os.listdir(ds_path)): + table_path = os.path.join(ds_path, table_name) + table_name = table_name.split(".")[0] + if os.path.isdir(table_path): + file_paths = [ + os.path.join(table_path, f) + for f in os.listdir(table_path) + if f.endswith(".dat") + ] + load_data_to_bigquery( + table_name, file_paths, client, dataset_ref, temp_file + ) + + try: + os.remove(temp_file) + print("Removed temporary file: temp.csv") + except FileNotFoundError: + print("Temporary file not found.") diff --git a/tests/benchmark/tpch/config.jsonl b/tests/benchmark/tpch/config.jsonl index 779b0fe2d7..e6f7a444f6 100644 --- a/tests/benchmark/tpch/config.jsonl +++ b/tests/benchmark/tpch/config.jsonl @@ -6,3 +6,5 @@ {"benchmark_suffix": "100g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0100g", "ordered": 
false} {"benchmark_suffix": "1t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": true} {"benchmark_suffix": "1t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": false} +{"benchmark_suffix": "10t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": true} +{"benchmark_suffix": "10t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": false} diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 172a55ec11..03755c94b7 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -6,4 +6,4 @@ {"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} {"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} {"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": 
"6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file diff --git a/tests/system/large/functions/__init__.py b/tests/system/large/functions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/large/functions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py new file mode 100644 index 0000000000..7d38cd7d91 --- /dev/null +++ b/tests/system/small/bigquery/test_geo.py @@ -0,0 +1,53 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import geopandas # type: ignore +import pandas as pd +from shapely.geometry import LineString, Point, Polygon # type: ignore + +import bigframes.bigquery as bbq +import bigframes.geopandas +import bigframes.series + + +def test_geo_st_area(): + data = [ + Polygon([(0.000, 0.0), (0.001, 0.001), (0.000, 0.001)]), + Polygon([(0.0010, 0.004), (0.009, 0.005), (0.0010, 0.005)]), + Polygon([(0.001, 0.001), (0.002, 0.001), (0.002, 0.002)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + + geopd_s = geopandas.GeoSeries(data=data, crs="EPSG:4326") + geobf_s = bigframes.geopandas.GeoSeries(data=data) + + # For `geopd_s`, the data was further projected with `geopandas.GeoSeries.to_crs` + # to `to_crs(26393)` to get the area in square meter. See: https://geopandas.org/en/stable/docs/user_guide/projections.html + # and https://spatialreference.org/ref/epsg/26393/. We then rounded both results + # to get them as close to each other as possible. Initially, the area results + # were +ten-millions. We added more zeros after the decimal point to round the + # area results to the nearest thousands. 
+ geopd_s_result = geopd_s.to_crs(26393).area.round(-3) + geobf_s_result = bbq.st_area(geobf_s).to_pandas().round(-3) + assert geobf_s_result.iloc[0] >= 1000 + + pd.testing.assert_series_equal( + geobf_s_result, + geopd_s_result, + check_dtype=False, + check_index_type=False, + check_exact=False, + rtol=1, + ) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index aa490749ae..8f97856eea 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -16,6 +16,7 @@ import geopandas as gpd # type: ignore import pandas as pd +import pyarrow as pa import pytest import bigframes.bigquery as bbq @@ -174,7 +175,7 @@ def test_json_extract_array_from_json_strings(): actual = bbq.json_extract_array(s, "$.a") expected = bpd.Series( [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], - dtype=pd.StringDtype(storage="pyarrow"), + dtype=pd.ArrowDtype(pa.list_(pa.string())), ) pd.testing.assert_series_equal( actual.to_pandas(), @@ -190,7 +191,7 @@ def test_json_extract_array_from_json_array_strings(): actual = bbq.json_extract_array(s) expected = bpd.Series( [["1", "2", "3"], [], ["4", "5"]], - dtype=pd.StringDtype(storage="pyarrow"), + dtype=pd.ArrowDtype(pa.list_(pa.string())), ) pd.testing.assert_series_equal( actual.to_pandas(), diff --git a/tests/system/small/functions/__init__.py b/tests/system/small/functions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/functions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index a30460d461..5951d0b12c 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re + +import bigframes_vendored.constants as constants import geopandas # type: ignore +from geopandas.array import GeometryDtype # type:ignore import google.api_core.exceptions import pandas as pd import pytest +from shapely.geometry import LineString, Point, Polygon # type: ignore import bigframes.geopandas import bigframes.series @@ -61,3 +66,44 @@ def test_geo_y(urban_areas_dfs): pd_result.astype(pd.Float64Dtype()), bf_result, ) + + +def test_geo_area_not_supported(): + s = bigframes.pandas.Series( + [ + Polygon([(0, 0), (1, 1), (0, 1)]), + Polygon([(10, 0), (10, 5), (0, 0)]), + Polygon([(0, 0), (2, 2), (2, 0)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ], + dtype=GeometryDtype(), + ) + bf_series: bigframes.geopandas.GeoSeries = s.geo + with pytest.raises( + NotImplementedError, + match=re.escape( + f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
{constants.FEEDBACK_LINK}" + ), + ): + bf_series.area + + +def test_geo_from_xy(): + x = [2.5, 5, -3.0] + y = [0.5, 1, 1.5] + bf_result = ( + bigframes.geopandas.GeoSeries.from_xy(x, y) + .astype(geopandas.array.GeometryDtype()) + .to_pandas() + ) + pd_result = geopandas.GeoSeries.from_xy(x, y, crs="EPSG:4326").astype( + geopandas.array.GeometryDtype() + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_series_type=False, + check_index=False, + ) diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index c5c649c638..936becff76 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -14,6 +14,8 @@ import datetime +import numpy +from pandas import testing import pandas as pd import pytest @@ -367,3 +369,82 @@ def test_dt_clip_coerce_str_timestamp(scalars_dfs): pd_result, bf_result, ) + + +@pytest.mark.parametrize("column", ["timestamp_col", "datetime_col"]) +def test_timestamp_diff_two_series(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = (bf_series - bf_series).to_pandas() + + expected_result = pd_series - pd_series + assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize("column", ["timestamp_col", "datetime_col"]) +def test_timestamp_diff_two_series_with_numpy_ops(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = numpy.subtract(bf_series, bf_series).to_pandas() + + expected_result = numpy.subtract(pd_series, pd_series) + assert_series_equal(actual_result, expected_result) + + +def test_timestamp_diff_two_dataframes(scalars_dfs): + columns = ["timestamp_col", "datetime_col"] + bf_df, pd_df = scalars_dfs + bf_df = bf_df[columns] + pd_df = pd_df[columns] + + actual_result = (bf_df - bf_df).to_pandas() + + expected_result = pd_df - pd_df + 
testing.assert_frame_equal(actual_result, expected_result) + + +def test_timestamp_diff_two_series_with_different_types_raise_error(scalars_dfs): + df, _ = scalars_dfs + + with pytest.raises(TypeError): + (df["timestamp_col"] - df["datetime_col"]).to_pandas() + + +@pytest.mark.parametrize( + ("column", "value"), + [ + ("timestamp_col", pd.Timestamp("2025-01-01 00:00:01", tz="America/New_York")), + ("datetime_col", datetime.datetime(2025, 1, 1, 0, 0, 1)), + ], +) +def test_timestamp_diff_series_sub_literal(scalars_dfs, column, value): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = (bf_series - value).to_pandas() + + expected_result = pd_series - value + assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + ("column", "value"), + [ + ("timestamp_col", pd.Timestamp("2025-01-01 00:00:01", tz="America/New_York")), + ("datetime_col", datetime.datetime(2025, 1, 1, 0, 0, 1)), + ], +) +def test_timestamp_diff_literal_sub_series(scalars_dfs, column, value): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = (value - bf_series).to_pandas() + + expected_result = value - pd_series + assert_series_equal(actual_result, expected_result) diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py new file mode 100644 index 0000000000..6c44a62686 --- /dev/null +++ b/tests/system/small/operations/test_timedeltas.py @@ -0,0 +1,166 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import datetime + +import numpy as np +import pandas as pd +import pandas.testing +import pytest + + +@pytest.fixture(scope="module") +def temporal_dfs(session): + pandas_df = pd.DataFrame( + { + "datetime_col": [ + pd.Timestamp("2025-02-01 01:00:01"), + pd.Timestamp("2019-01-02 02:00:00"), + ], + "timestamp_col": [ + pd.Timestamp("2023-01-01 01:00:01", tz="UTC"), + pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), + ], + "timedelta_col": [pd.Timedelta(3, "s"), pd.Timedelta(-4, "d")], + } + ) + + bigframes_df = session.read_pandas(pandas_df) + + return bigframes_df, pandas_df + + +@pytest.mark.parametrize( + ("column", "pd_dtype"), + [ + ("datetime_col", ">> import bigframes.geopandas >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None + >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s 0 POINT (1 1) @@ -43,9 +44,9 @@ def x(self) -> bigframes.series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import geopandas.array >>> import shapely + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.Point(1, 2), shapely.Point(2, 3), shapely.Point(3, 4)], @@ -58,7 +59,7 @@ def x(self) -> bigframes.series.Series: dtype: Float64 Returns: - bigframes.series.Series: + bigframes.pandas.Series: Return the x location (longitude) of point geometries. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -70,9 +71,9 @@ def y(self) -> bigframes.series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import geopandas.array >>> import shapely + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.Point(1, 2), shapely.Point(2, 3), shapely.Point(3, 4)], @@ -85,7 +86,49 @@ def y(self) -> bigframes.series.Series: dtype: Float64 Returns: - bigframes.series.Series: + bigframes.pandas.Series: Return the y location (latitude) of point geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: + """ + Alternate constructor to create a GeoSeries of Point geometries from + lists or arrays of x, y coordinates. + + In case of geographic coordinates, it is assumed that longitude is + captured by x coordinates and latitude by y. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None + + >>> x = [2.5, 5, -3.0] + >>> y = [0.5, 1, 1.5] + + >>> s = bigframes.geopandas.GeoSeries.from_xy(x, y) + >>> s + 0 POINT (2.5 0.5) + 1 POINT (5 1) + 2 POINT (-3 1.5) + dtype: geometry + + Args: + x, y (array-like): + longitude is x coordinates and latitude y coordinates. + + index (array-like or Index, optional): + The index for the GeoSeries. If not given and all coordinate + inputs are Series with an equal index, that index is used.. + + **kwargs: + Additional arguments passed to the Series constructor, e.g. `name`. + + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of Point geometries. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py index 2fd0e9186e..fce0643783 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py @@ -63,7 +63,6 @@ typecode.VARBINARY: dt.Binary, typecode.VARCHAR: dt.String, typecode.VARIANT: dt.JSON, - typecode.UNIQUEIDENTIFIER: dt.UUID, typecode.SET: partial(dt.Array, dt.string), ############################# # Unsupported sqlglot types # diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py index 652f04757b..a252f116dd 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py @@ -359,7 +359,7 @@ def wrap(node, _, **kwargs): return CTE(new) if node in ctes else new result = simplified.replace(wrap) - ctes = reversed([cte.parent for cte in result.find(CTE)]) + ctes = [cte.parent for cte in result.find(CTE, ordered=True)] return result, ctes diff --git a/third_party/bigframes_vendored/ibis/common/graph.py b/third_party/bigframes_vendored/ibis/common/graph.py index 1a3fc6c543..6e7995ec03 100644 --- a/third_party/bigframes_vendored/ibis/common/graph.py +++ b/third_party/bigframes_vendored/ibis/common/graph.py @@ -343,6 +343,7 @@ def find( finder: FinderLike, filter: Optional[FinderLike] = None, context: Optional[dict] = None, + ordered: bool = False, ) -> list[Node]: """Find all nodes matching a given pattern or type in the graph. @@ -360,6 +361,8 @@ def find( the given filter and stop otherwise. context Optional context to use if `finder` or `filter` is a pattern. + ordered + Emit nodes in topological order if `True`. 
Returns ------- @@ -369,6 +372,8 @@ def find( """ graph = Graph.from_bfs(self, filter=filter, context=context) finder = _coerce_finder(finder, context) + if ordered: + graph, _ = graph.toposort() return [node for node in graph.nodes() if finder(node)] @experimental diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 771146250a..fe15e7b40d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -87,12 +87,12 @@ def field(self, name_or_index: str | int): >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -106,7 +106,7 @@ def field(self, name_or_index: str | int): Extract by field index. - >>> s.struct.field(1) + >>> s.struct.field(0) 0 1 1 2 2 1 @@ -133,22 +133,22 @@ def explode(self): >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) Extract all child fields. 
>>> s.struct.explode() - project version - 0 pandas 1 - 1 pandas 2 - 2 numpy 1 + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy [3 rows x 2 columns] @@ -178,8 +178,8 @@ def dtypes(self): ... )) ... ) >>> s.struct.dtypes() - project string[pyarrow] version Int64 + project string[pyarrow] dtype: object Returns: @@ -205,21 +205,21 @@ def explode(self, column, *, separator: str = "."): >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) >>> downloads = bpd.Series([100, 200, 300]) >>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads}) >>> df.struct.explode("file") - country file.project file.version download_count - 0 cn pandas 1 100 - 1 es pandas 2 200 - 2 us numpy 1 300 + country file.version file.project download_count + 0 cn 1 pandas 100 + 1 es 2 pandas 200 + 2 us 1 numpy 300 [3 rows x 4 columns] diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index d9b9875805..e92072bea8 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.35.0" +__version__ = "1.36.0"