From 381a1e9ac8b23ce51300616ff073aa0a1f788a0e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 15 Apr 2026 20:53:33 +0000 Subject: [PATCH] feat(bigframes): Support unstable sort_values, sort_index --- .../bigframes/bigframes/core/array_value.py | 10 ++++- packages/bigframes/bigframes/core/blocks.py | 3 +- .../bigframes/bigframes/core/indexes/base.py | 19 +++++---- packages/bigframes/bigframes/core/nodes.py | 3 +- .../bigframes/bigframes/core/rewrite/order.py | 7 +++- packages/bigframes/bigframes/dataframe.py | 15 ++++--- packages/bigframes/bigframes/series.py | 41 +++++++++++++++---- .../bigframes_vendored/constants.py | 2 + .../bigframes_vendored/pandas/core/frame.py | 9 +++- .../pandas/core/indexes/base.py | 10 ++++- .../bigframes_vendored/pandas/core/series.py | 9 +++- 11 files changed, 96 insertions(+), 32 deletions(-) diff --git a/packages/bigframes/bigframes/core/array_value.py b/packages/bigframes/bigframes/core/array_value.py index 5a238e39b3d6..d7fb186ae912 100644 --- a/packages/bigframes/bigframes/core/array_value.py +++ b/packages/bigframes/bigframes/core/array_value.py @@ -212,11 +212,17 @@ def filter(self, predicate: ex.Expression): return arr.drop_columns(filter_ids) def order_by( - self, by: Sequence[OrderingExpression], is_total_order: bool = False + self, + by: Sequence[OrderingExpression], + is_total_order: bool = False, + stable: bool = True, ) -> ArrayValue: return ArrayValue( nodes.OrderByNode( - child=self.node, by=tuple(by), is_total_order=is_total_order + child=self.node, + by=tuple(by), + is_total_order=is_total_order, + stable=stable, ) ) diff --git a/packages/bigframes/bigframes/core/blocks.py b/packages/bigframes/bigframes/core/blocks.py index a23965dd1bef..a4c9e71bfdfa 100644 --- a/packages/bigframes/bigframes/core/blocks.py +++ b/packages/bigframes/bigframes/core/blocks.py @@ -395,9 +395,10 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]: def order_by( self, by: 
typing.Sequence[ordering.OrderingExpression], + stable: bool = True, ) -> Block: return Block( - self._expr.order_by(by), + self._expr.order_by(by, stable=stable), index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index.names, diff --git a/packages/bigframes/bigframes/core/indexes/base.py b/packages/bigframes/bigframes/core/indexes/base.py index bd957e96d3c7..1571962a0f03 100644 --- a/packages/bigframes/bigframes/core/indexes/base.py +++ b/packages/bigframes/bigframes/core/indexes/base.py @@ -255,12 +255,6 @@ def query_job(self) -> bigquery.QueryJob: self._query_job = query_job return self._query_job - @property - def str(self) -> bigframes.operations.strings.StringMethods: - import bigframes.operations.strings - - return bigframes.operations.strings.StringMethods(self) - def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]: """Get integer location, slice or boolean mask for requested label. @@ -436,7 +430,8 @@ def sort_values( *, inplace: bool = False, ascending: bool = True, - na_position: __builtins__.str = "last", + kind: str | None = None, + na_position: str = "last", ) -> Index: if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") @@ -448,7 +443,8 @@ def sort_values( else order.descending_over(column, na_last) for column in index_columns ] - return Index(self._block.order_by(ordering)) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + return Index(self._block.order_by(ordering, stable=is_stable)) def astype( self, @@ -840,6 +836,13 @@ def _apply_binary_op( else: return NotImplemented + # last so as to not shadow __builtins__.str + @property + def str(self) -> bigframes.operations.strings.StringMethods: + import bigframes.operations.strings + + return bigframes.operations.strings.StringMethods(self) + def _should_create_datetime_index(block: blocks.Block) -> bool: if len(block.index.dtypes) != 1: diff --git 
a/packages/bigframes/bigframes/core/nodes.py b/packages/bigframes/bigframes/core/nodes.py index a7e20a910e4d..f30df7f30840 100644 --- a/packages/bigframes/bigframes/core/nodes.py +++ b/packages/bigframes/bigframes/core/nodes.py @@ -991,7 +991,8 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class OrderByNode(UnaryNode): by: Tuple[OrderingExpression, ...] - # This is an optimization, if true, can discard previous orderings. + stable: bool = True + # This is an optimization, if true, can discard previous orderings, even if doing a stable sort # might be a total ordering even if false is_total_order: bool = False diff --git a/packages/bigframes/bigframes/core/rewrite/order.py b/packages/bigframes/bigframes/core/rewrite/order.py index 7beb510ac6b8..4d6612e6798d 100644 --- a/packages/bigframes/bigframes/core/rewrite/order.py +++ b/packages/bigframes/bigframes/core/rewrite/order.py @@ -71,7 +71,8 @@ def pull_up_order_inner( child_result, child_order = pull_up_order_inner(node.child) return child_result, child_order.with_reverse() elif isinstance(node, bigframes.core.nodes.OrderByNode): - if node.is_total_order: + # unstable sorts don't care about previous order, total orders override previous order + if (not node.stable) or node.is_total_order: new_node = remove_order(node.child) else: new_node, child_order = pull_up_order_inner(node.child) @@ -106,6 +107,10 @@ def pull_up_order_inner( ), ) ) + elif not node.stable: + new_order = bigframes.core.ordering.RowOrdering( + ordering_value_columns=tuple(new_by), + ) else: assert child_order new_order = child_order.with_ordering_columns(new_by) diff --git a/packages/bigframes/bigframes/dataframe.py b/packages/bigframes/bigframes/dataframe.py index 98340e5e377f..5ff46e30e667 100644 --- a/packages/bigframes/bigframes/dataframe.py +++ b/packages/bigframes/bigframes/dataframe.py @@ -2418,6 +2418,7 @@ def sort_index( *, ascending: bool = ..., inplace: Literal[False] = ..., + kind: str = ..., na_position: 
Literal["first", "last"] = ..., ) -> DataFrame: ... @@ -2427,6 +2428,7 @@ def sort_index( *, ascending: bool = ..., inplace: Literal[True] = ..., + kind: str = ..., na_position: Literal["first", "last"] = ..., ) -> None: ... @@ -2436,6 +2438,7 @@ def sort_index( axis: Union[int, str] = 0, ascending: bool = True, inplace: bool = False, + kind: str | None = None, na_position: Literal["first", "last"] = "last", ) -> Optional[DataFrame]: if utils.get_axis_number(axis) == 0: @@ -2449,7 +2452,8 @@ def sort_index( else order.descending_over(column, na_last) for column in index_columns ] - block = self._block.order_by(ordering) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + block = self._block.order_by(ordering, stable=is_stable) else: # axis=1 _, indexer = self.columns.sort_values( return_indexer=True, @@ -2472,7 +2476,7 @@ def sort_values( *, inplace: Literal[False] = ..., ascending: bool | typing.Sequence[bool] = ..., - kind: str = ..., + kind: str | None = None, na_position: typing.Literal["first", "last"] = ..., ) -> DataFrame: ... @@ -2483,7 +2487,7 @@ def sort_values( *, inplace: Literal[True] = ..., ascending: bool | typing.Sequence[bool] = ..., - kind: str = ..., + kind: str | None = None, na_position: typing.Literal["first", "last"] = ..., ) -> None: ... 
@@ -2493,7 +2497,7 @@ def sort_values( *, inplace: bool = False, ascending: bool | typing.Sequence[bool] = True, - kind: str = "quicksort", + kind: str | None = None, na_position: typing.Literal["first", "last"] = "last", ) -> Optional[DataFrame]: if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)): @@ -2525,7 +2529,8 @@ def sort_values( if is_ascending else order.descending_over(column_id, na_last) ) - block = self._block.order_by(ordering) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + block = self._block.order_by(ordering, stable=is_stable) if inplace: self._set_block(block) return None diff --git a/packages/bigframes/bigframes/series.py b/packages/bigframes/bigframes/series.py index f0648117144a..769733f8f797 100644 --- a/packages/bigframes/bigframes/series.py +++ b/packages/bigframes/bigframes/series.py @@ -1769,7 +1769,7 @@ def sort_values( axis=..., inplace: Literal[True] = ..., ascending: bool | typing.Sequence[bool] = ..., - kind: str = ..., + kind: str | None = ..., na_position: typing.Literal["first", "last"] = ..., ) -> None: ... @@ -1780,7 +1780,7 @@ def sort_values( axis=..., inplace: Literal[False] = ..., ascending: bool | typing.Sequence[bool] = ..., - kind: str = ..., + kind: str | None = ..., na_position: typing.Literal["first", "last"] = ..., ) -> Series: ... 
@@ -1790,19 +1790,21 @@ def sort_values( axis=0, inplace: bool = False, ascending=True, - kind: str = "quicksort", + kind: str | None = None, na_position: typing.Literal["first", "last"] = "last", ) -> Optional[Series]: if axis != 0 and axis != "index": raise ValueError(f"No axis named {axis} for object type Series") if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] block = self._block.order_by( [ order.ascending_over(self._value_column, (na_position == "last")) if ascending else order.descending_over(self._value_column, (na_position == "last")) ], + stable=is_stable, ) if inplace: self._set_block(block) @@ -1812,17 +1814,37 @@ def sort_values( @typing.overload # type: ignore[override] def sort_index( - self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=... - ) -> Series: ... + self, + *, + axis=..., + inplace: Literal[False] = ..., + ascending=..., + kind: str | None = ..., + na_position=..., + ) -> Series: + ... @typing.overload def sort_index( - self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=... - ) -> None: ... + self, + *, + axis=0, + inplace: Literal[True] = ..., + ascending=..., + kind: str | None = ..., + na_position=..., + ) -> None: + ... @validations.requires_index def sort_index( - self, *, axis=0, inplace: bool = False, ascending=True, na_position="last" + self, + *, + axis=0, + inplace: bool = False, + ascending=True, + kind: str | None = None, + na_position="last", ) -> Optional[Series]: # TODO(tbergeron): Support level parameter once multi-index introduced. 
if axis != 0 and axis != "index": @@ -1837,7 +1859,8 @@ def sort_index( else order.descending_over(column, na_last) for column in block.index_columns ] - block = block.order_by(ordering) + is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"] + block = block.order_by(ordering, stable=is_stable) if inplace: self._set_block(block) return None diff --git a/packages/bigframes/third_party/bigframes_vendored/constants.py b/packages/bigframes/third_party/bigframes_vendored/constants.py index 9705b19c9045..fc698ea188d4 100644 --- a/packages/bigframes/third_party/bigframes_vendored/constants.py +++ b/packages/bigframes/third_party/bigframes_vendored/constants.py @@ -55,3 +55,5 @@ "_deferred", ] VALID_WRITE_ENGINES = typing.get_args(WriteEngineType) + +DEFAULT_SORT_KIND = "stable" diff --git a/packages/bigframes/third_party/bigframes_vendored/pandas/core/frame.py b/packages/bigframes/third_party/bigframes_vendored/pandas/core/frame.py index c38aad7dfe77..678fb5f65177 100644 --- a/packages/bigframes/third_party/bigframes_vendored/pandas/core/frame.py +++ b/packages/bigframes/third_party/bigframes_vendored/pandas/core/frame.py @@ -2253,7 +2253,7 @@ def sort_values( *, inplace: bool = False, ascending: bool | Sequence[bool] = True, - kind: str = "quicksort", + kind: str | None = None, na_position: Literal["first", "last"] = "last", ): """Sort by the values along row axis. @@ -2339,7 +2339,7 @@ def sort_values( the by. inplace (bool, default False): If True, perform operation in-place. - kind (str, default 'quicksort'): + kind (str, default None): Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder. 
@@ -2363,6 +2363,7 @@ def sort_index( axis: str | int = 0, ascending: bool = True, inplace: bool = False, + kind: str | None = None, na_position: Literal["first", "last"] = "last", ): """Sort object by labels (along an axis). @@ -2375,6 +2376,10 @@ def sort_index( Sort ascending vs. descending. inplace (bool, default False): Whether to modify the DataFrame rather than creating a new one. + kind (str, default None): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default 'last'): Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. Not implemented for MultiIndex. diff --git a/packages/bigframes/third_party/bigframes_vendored/pandas/core/indexes/base.py b/packages/bigframes/third_party/bigframes_vendored/pandas/core/indexes/base.py index 29a3d2109881..632026a33110 100644 --- a/packages/bigframes/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/packages/bigframes/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -828,7 +828,11 @@ def nunique(self) -> int: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_values( - self, *, ascending: bool = True, na_position: str = "last" + self, + *, + ascending: bool = True, + kind: str | None = None, + na_position: str = "last", ) -> Index: """ Return a sorted copy of the index. @@ -851,6 +855,10 @@ def sort_values( Args: ascending (bool, default True): Should the index values be sorted in an ascending order. + kind (str, default None): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first' or 'last'}, default 'last'): Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. 
diff --git a/packages/bigframes/third_party/bigframes_vendored/pandas/core/series.py b/packages/bigframes/third_party/bigframes_vendored/pandas/core/series.py index 007b8b8e735e..b9cacf3855a2 100644 --- a/packages/bigframes/third_party/bigframes_vendored/pandas/core/series.py +++ b/packages/bigframes/third_party/bigframes_vendored/pandas/core/series.py @@ -1502,7 +1502,7 @@ def sort_values( axis: Axis = 0, inplace: bool = False, ascending: bool | int | Sequence[bool] | Sequence[int] = True, - kind: str = "quicksort", + kind: str | None = None, na_position: str = "last", ): """ @@ -1579,7 +1579,7 @@ def sort_values( Whether to modify the Series rather than creating a new one. ascending (bool or list of bools, default True): If True, sort values in ascending order, otherwise descending. - kind (str, default to 'quicksort'): + kind (str, default None): Choice of sorting algorithm. Accepts quicksort', 'mergesort', 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder @@ -1599,6 +1599,7 @@ def sort_index( axis: Axis = 0, inplace: bool = False, ascending: bool | Sequence[bool] = True, + kind: str | None = None, na_position: NaPosition = "last", ): """ @@ -1646,6 +1647,10 @@ def sort_index( ascending (bool or list-like of bools, default True): Sort ascending vs. descending. When the index is a MultiIndex the sort direction can be controlled for each level individually. + kind (str, default None): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to + sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default 'last'): If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. Not implemented for MultiIndex.