Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions packages/bigframes/bigframes/core/array_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,11 +212,17 @@ def filter(self, predicate: ex.Expression):
return arr.drop_columns(filter_ids)

def order_by(
self, by: Sequence[OrderingExpression], is_total_order: bool = False
self,
by: Sequence[OrderingExpression],
is_total_order: bool = False,
stable: bool = True,
) -> ArrayValue:
return ArrayValue(
nodes.OrderByNode(
child=self.node, by=tuple(by), is_total_order=is_total_order
child=self.node,
by=tuple(by),
is_total_order=is_total_order,
stable=stable,
)
)

Expand Down
3 changes: 2 additions & 1 deletion packages/bigframes/bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,9 +395,10 @@ def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
def order_by(
self,
by: typing.Sequence[ordering.OrderingExpression],
stable: bool = True,
) -> Block:
return Block(
self._expr.order_by(by),
self._expr.order_by(by, stable=stable),
index_columns=self.index_columns,
column_labels=self.column_labels,
index_labels=self.index.names,
Expand Down
19 changes: 11 additions & 8 deletions packages/bigframes/bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,12 +255,6 @@ def query_job(self) -> bigquery.QueryJob:
self._query_job = query_job
return self._query_job

@property
def str(self) -> bigframes.operations.strings.StringMethods:
import bigframes.operations.strings

return bigframes.operations.strings.StringMethods(self)

def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
"""Get integer location, slice or boolean mask for requested label.

Expand Down Expand Up @@ -436,7 +430,8 @@ def sort_values(
*,
inplace: bool = False,
ascending: bool = True,
na_position: __builtins__.str = "last",
kind: str | None = None,
na_position: str = "last",
) -> Index:
if na_position not in ["first", "last"]:
raise ValueError("Param na_position must be one of 'first' or 'last'")
Expand All @@ -448,7 +443,8 @@ def sort_values(
else order.descending_over(column, na_last)
for column in index_columns
]
return Index(self._block.order_by(ordering))
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use the STABLE_SORT_KINDS constant for consistency and easier maintenance.

Suggested change
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS

return Index(self._block.order_by(ordering, stable=is_stable))

def astype(
self,
Expand Down Expand Up @@ -840,6 +836,13 @@ def _apply_binary_op(
else:
return NotImplemented

# last so as to not shadow __builtins__.str
@property
def str(self) -> bigframes.operations.strings.StringMethods:
import bigframes.operations.strings

return bigframes.operations.strings.StringMethods(self)


def _should_create_datetime_index(block: blocks.Block) -> bool:
if len(block.index.dtypes) != 1:
Expand Down
3 changes: 2 additions & 1 deletion packages/bigframes/bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,8 @@ def remap_refs(
@dataclasses.dataclass(frozen=True, eq=False)
class OrderByNode(UnaryNode):
by: Tuple[OrderingExpression, ...]
# This is an optimization, if true, can discard previous orderings.
stable: bool = True
# This is an optimization, if true, can discard previous orderings, even if doing a stable sort
# might be a total ordering even if false
is_total_order: bool = False

Expand Down
7 changes: 6 additions & 1 deletion packages/bigframes/bigframes/core/rewrite/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def pull_up_order_inner(
child_result, child_order = pull_up_order_inner(node.child)
return child_result, child_order.with_reverse()
elif isinstance(node, bigframes.core.nodes.OrderByNode):
if node.is_total_order:
# unstable sorts don't care about previous order, total orders override previous order
if (not node.stable) or node.is_total_order:
new_node = remove_order(node.child)
else:
new_node, child_order = pull_up_order_inner(node.child)
Expand Down Expand Up @@ -106,6 +107,10 @@ def pull_up_order_inner(
),
)
)
elif not node.stable:
new_order = bigframes.core.ordering.RowOrdering(
ordering_value_columns=tuple(new_by),
)
else:
assert child_order
new_order = child_order.with_ordering_columns(new_by)
Expand Down
15 changes: 10 additions & 5 deletions packages/bigframes/bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2418,6 +2418,7 @@ def sort_index(
*,
ascending: bool = ...,
inplace: Literal[False] = ...,
kind: str = ...,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The kind parameter should allow None to match the implementation and other overloads.

Suggested change
kind: str = ...,
kind: str | None = ...,

na_position: Literal["first", "last"] = ...,
) -> DataFrame: ...

Expand All @@ -2427,6 +2428,7 @@ def sort_index(
*,
ascending: bool = ...,
inplace: Literal[True] = ...,
kind: str = ...,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The kind parameter should allow None to match the implementation and other overloads.

Suggested change
kind: str = ...,
kind: str | None = ...,

na_position: Literal["first", "last"] = ...,
) -> None: ...

Expand All @@ -2436,6 +2438,7 @@ def sort_index(
axis: Union[int, str] = 0,
ascending: bool = True,
inplace: bool = False,
kind: str | None = None,
na_position: Literal["first", "last"] = "last",
) -> Optional[DataFrame]:
if utils.get_axis_number(axis) == 0:
Expand All @@ -2449,7 +2452,8 @@ def sort_index(
else order.descending_over(column, na_last)
for column in index_columns
]
block = self._block.order_by(ordering)
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
block = self._block.order_by(ordering, stable=is_stable)
Comment on lines +2455 to +2456
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The kind parameter is correctly used for axis=0, but it should also be propagated to self.columns.sort_values when axis=1 (the else block) to ensure consistent sorting behavior for columns. Also, consider using the STABLE_SORT_KINDS constant.

Suggested change
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
block = self._block.order_by(ordering, stable=is_stable)
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS
block = self._block.order_by(ordering, stable=is_stable)

else: # axis=1
_, indexer = self.columns.sort_values(
return_indexer=True,
Expand All @@ -2472,7 +2476,7 @@ def sort_values(
*,
inplace: Literal[False] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In overloads, it is conventional to use ... as the default value instead of None.

Suggested change
kind: str | None = None,
kind: str | None = ...,

na_position: typing.Literal["first", "last"] = ...,
) -> DataFrame: ...

Expand All @@ -2483,7 +2487,7 @@ def sort_values(
*,
inplace: Literal[True] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In overloads, it is conventional to use ... as the default value instead of None.

Suggested change
kind: str | None = None,
kind: str | None = ...,

na_position: typing.Literal["first", "last"] = ...,
) -> None: ...

Expand All @@ -2493,7 +2497,7 @@ def sort_values(
*,
inplace: bool = False,
ascending: bool | typing.Sequence[bool] = True,
kind: str = "quicksort",
kind: str | None = None,
na_position: typing.Literal["first", "last"] = "last",
) -> Optional[DataFrame]:
if isinstance(by, (bigframes.series.Series, indexes.Index, DataFrame)):
Expand Down Expand Up @@ -2525,7 +2529,8 @@ def sort_values(
if is_ascending
else order.descending_over(column_id, na_last)
)
block = self._block.order_by(ordering)
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use the STABLE_SORT_KINDS constant for consistency.

Suggested change
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS

block = self._block.order_by(ordering, stable=is_stable)
if inplace:
self._set_block(block)
return None
Expand Down
41 changes: 32 additions & 9 deletions packages/bigframes/bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1769,7 +1769,7 @@ def sort_values(
axis=...,
inplace: Literal[True] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> None: ...

Expand All @@ -1780,7 +1780,7 @@ def sort_values(
axis=...,
inplace: Literal[False] = ...,
ascending: bool | typing.Sequence[bool] = ...,
kind: str = ...,
kind: str | None = ...,
na_position: typing.Literal["first", "last"] = ...,
) -> Series: ...

Expand All @@ -1790,19 +1790,21 @@ def sort_values(
axis=0,
inplace: bool = False,
ascending=True,
kind: str = "quicksort",
kind: str | None = None,
na_position: typing.Literal["first", "last"] = "last",
) -> Optional[Series]:
if axis != 0 and axis != "index":
raise ValueError(f"No axis named {axis} for object type Series")
if na_position not in ["first", "last"]:
raise ValueError("Param na_position must be one of 'first' or 'last'")
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use the STABLE_SORT_KINDS constant for consistency.

Suggested change
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS

block = self._block.order_by(
[
order.ascending_over(self._value_column, (na_position == "last"))
if ascending
else order.descending_over(self._value_column, (na_position == "last"))
],
stable=is_stable,
)
if inplace:
self._set_block(block)
Expand All @@ -1812,17 +1814,37 @@ def sort_values(

@typing.overload # type: ignore[override]
def sort_index(
self, *, axis=..., inplace: Literal[False] = ..., ascending=..., na_position=...
) -> Series: ...
self,
*,
axis=...,
inplace: Literal[False] = ...,
ascending=...,
kind: str | None = ...,
na_position=...,
) -> Series:
...

@typing.overload
def sort_index(
self, *, axis=0, inplace: Literal[True] = ..., ascending=..., na_position=...
) -> None: ...
self,
*,
axis=0,
inplace: Literal[True] = ...,
ascending=...,
kind: str | None = ...,
na_position=...,
) -> None:
...

@validations.requires_index
def sort_index(
self, *, axis=0, inplace: bool = False, ascending=True, na_position="last"
self,
*,
axis=0,
inplace: bool = False,
ascending=True,
kind: str | None = None,
na_position="last",
) -> Optional[Series]:
# TODO(tbergeron): Support level parameter once multi-index introduced.
if axis != 0 and axis != "index":
Expand All @@ -1837,7 +1859,8 @@ def sort_index(
else order.descending_over(column, na_last)
for column in block.index_columns
]
block = block.order_by(ordering)
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use the STABLE_SORT_KINDS constant for consistency.

Suggested change
is_stable = (kind or constants.DEFAULT_SORT_KIND) in ["stable", "mergesort"]
is_stable = (kind or constants.DEFAULT_SORT_KIND) in constants.STABLE_SORT_KINDS

block = block.order_by(ordering, stable=is_stable)
if inplace:
self._set_block(block)
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,5 @@
"_deferred",
]
VALID_WRITE_ENGINES = typing.get_args(WriteEngineType)

DEFAULT_SORT_KIND = "stable"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To avoid repeating the list of stable sort algorithms across multiple modules, consider defining a constant here.

Suggested change
DEFAULT_SORT_KIND = "stable"
DEFAULT_SORT_KIND = "stable"
STABLE_SORT_KINDS = ("stable", "mergesort")

Original file line number Diff line number Diff line change
Expand Up @@ -2253,7 +2253,7 @@ def sort_values(
*,
inplace: bool = False,
ascending: bool | Sequence[bool] = True,
kind: str = "quicksort",
kind: str | None = None,
na_position: Literal["first", "last"] = "last",
):
"""Sort by the values along row axis.
Expand Down Expand Up @@ -2339,7 +2339,7 @@ def sort_values(
the by.
inplace (bool, default False):
If True, perform operation in-place.
kind (str, default 'quicksort'):
kind (str, default None):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in stable reorder.
Expand All @@ -2363,6 +2363,7 @@ def sort_index(
axis: str | int = 0,
ascending: bool = True,
inplace: bool = False,
kind: str | None = None,
na_position: Literal["first", "last"] = "last",
):
"""Sort object by labels (along an axis).
Expand All @@ -2375,6 +2376,10 @@ def sort_index(
Sort ascending vs. descending.
inplace (bool, default False):
Whether to modify the DataFrame rather than creating a new one.
kind (str, default None):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in stable reorder.
na_position ({'first', 'last'}, default 'last'):
Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
Not implemented for MultiIndex.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -828,7 +828,11 @@ def nunique(self) -> int:
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def sort_values(
self, *, ascending: bool = True, na_position: str = "last"
self,
*,
ascending: bool = True,
kind: str | None = None,
na_position: str = "last",
) -> Index:
"""
Return a sorted copy of the index.
Expand All @@ -851,6 +855,10 @@ def sort_values(
Args:
ascending (bool, default True):
Should the index values be sorted in an ascending order.
kind (str, default None):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in stable reorder.
na_position ({'first' or 'last'}, default 'last'):
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
the end.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1502,7 +1502,7 @@ def sort_values(
axis: Axis = 0,
inplace: bool = False,
ascending: bool | int | Sequence[bool] | Sequence[int] = True,
kind: str = "quicksort",
kind: str | None = None,
na_position: str = "last",
):
"""
Expand Down Expand Up @@ -1579,7 +1579,7 @@ def sort_values(
Whether to modify the Series rather than creating a new one.
ascending (bool or list of bools, default True):
If True, sort values in ascending order, otherwise descending.
kind (str, default to 'quicksort'):
kind (str, default to None):
Choice of sorting algorithm. Accepts quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in stable reorder
Expand All @@ -1599,6 +1599,7 @@ def sort_index(
axis: Axis = 0,
inplace: bool = False,
ascending: bool | Sequence[bool] = True,
kind: str | None = None,
na_position: NaPosition = "last",
):
"""
Expand Down Expand Up @@ -1646,6 +1647,10 @@ def sort_index(
ascending (bool or list-like of bools, default True):
Sort ascending vs. descending. When the index is a MultiIndex the
sort direction can be controlled for each level individually.
kind (str, default None):
Choice of sorting algorithm. Accepts 'quicksort', 'mergesort',
'heapsort', 'stable'. Ignored except when determining whether to
sort stably. 'mergesort' or 'stable' will result in stable reorder.
na_position ({'first', 'last'}, default 'last'):
If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
Not implemented for MultiIndex.
Expand Down
Loading