diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e419c61e8..142edaa9d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,32 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.12.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.11.0...v2.12.0) (2025-07-23) + + +### Features + +* Add code samples for dbt bigframes integration ([#1898](https://github.com/googleapis/python-bigquery-dataframes/issues/1898)) ([7e03252](https://github.com/googleapis/python-bigquery-dataframes/commit/7e03252d31e505731db113eb38af77842bf29b9b)) +* Add isin local execution to hybrid engine ([#1915](https://github.com/googleapis/python-bigquery-dataframes/issues/1915)) ([c0cefd3](https://github.com/googleapis/python-bigquery-dataframes/commit/c0cefd36cfd55962b86178d2a612d625ed17f79c)) +* Add ml.metrics.mean_absolute_error method ([#1910](https://github.com/googleapis/python-bigquery-dataframes/issues/1910)) ([15b8449](https://github.com/googleapis/python-bigquery-dataframes/commit/15b8449dc5ad0c8190a5cbf47894436de18c8e88)) +* Allow local arithmetic execution in hybrid engine ([#1906](https://github.com/googleapis/python-bigquery-dataframes/issues/1906)) ([ebdcd02](https://github.com/googleapis/python-bigquery-dataframes/commit/ebdcd0240f0d8edaef3094b3a4e664b4a84d4a25)) +* Provide day_of_year and day_of_week for dt accessor ([#1911](https://github.com/googleapis/python-bigquery-dataframes/issues/1911)) ([40e7638](https://github.com/googleapis/python-bigquery-dataframes/commit/40e76383948a79bde48108f6180fd6ae2b3d0875)) +* Support params `max_batching_rows`, `container_cpu`, and `container_memory` for `udf` ([#1897](https://github.com/googleapis/python-bigquery-dataframes/issues/1897)) ([8baa912](https://github.com/googleapis/python-bigquery-dataframes/commit/8baa9126e595ae682469a6bb462244240699f57f)) +* Support typed pyarrow.Scalar in assignment ([#1930](https://github.com/googleapis/python-bigquery-dataframes/issues/1930)) 
([cd28e12](https://github.com/googleapis/python-bigquery-dataframes/commit/cd28e12b3f70a6934a68963a7f25dbd5e3c67335)) + + +### Bug Fixes + +* Correct min field from max() to min() in remote function tests ([#1917](https://github.com/googleapis/python-bigquery-dataframes/issues/1917)) ([d5c54fc](https://github.com/googleapis/python-bigquery-dataframes/commit/d5c54fca32ed75c1aef52c99781db7f8ac7426e1)) +* Resolve location reset issue in bigquery options ([#1914](https://github.com/googleapis/python-bigquery-dataframes/issues/1914)) ([c15cb8a](https://github.com/googleapis/python-bigquery-dataframes/commit/c15cb8a1a9c834c2c1c2984930415b246f3f948b)) +* Series.str.isdigit in unicode superscripts and fractions ([#1924](https://github.com/googleapis/python-bigquery-dataframes/issues/1924)) ([8d46c36](https://github.com/googleapis/python-bigquery-dataframes/commit/8d46c36da7881a99861166c03a0831beff8ee0dd)) + + +### Documentation + +* Add code snippets for session and IO public docs ([#1919](https://github.com/googleapis/python-bigquery-dataframes/issues/1919)) ([6e01cbe](https://github.com/googleapis/python-bigquery-dataframes/commit/6e01cbec0dcf40e528b4a96e944681df18773c11)) +* Add snippets for performance optimization doc ([#1923](https://github.com/googleapis/python-bigquery-dataframes/issues/1923)) ([4da309e](https://github.com/googleapis/python-bigquery-dataframes/commit/4da309e27bd58a685e8aca953717da75d4ba5305)) + ## [2.11.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.10.0...v2.11.0) (2025-07-15) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 09ffee95d4..648b69dea7 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -171,7 +171,7 @@ def location(self) -> Optional[str]: @location.setter def location(self, value: Optional[str]): - if self._session_started and self._location != value: + if self._session_started and self._location != 
_get_validated_location(value): raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="location")) self._location = _get_validated_location(value) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index c31c122078..a70ea49752 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -35,6 +35,7 @@ import bigframes.operations.comparison_ops as comp_ops import bigframes.operations.generic_ops as gen_ops import bigframes.operations.numeric_ops as num_ops +import bigframes.operations.string_ops as string_ops polars_installed = True if TYPE_CHECKING: @@ -146,6 +147,14 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: return input.abs() + @compile_op.register(num_ops.FloorOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.floor() + + @compile_op.register(num_ops.CeilOp) + def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: + return input.ceil() + @compile_op.register(num_ops.PosOp) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: return input.__pos__() @@ -182,10 +191,6 @@ def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: return l_input // r_input - @compile_op.register(num_ops.FloorDivOp) - def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: - return l_input // r_input - @compile_op.register(num_ops.ModOp) def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: return l_input % r_input @@ -270,6 +275,11 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: # eg. 
We want "True" instead of "true" for bool to strin return input.cast(_DTYPE_MAPPING[op.to_type], strict=not op.safe) + @compile_op.register(string_ops.StrConcatOp) + def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: + assert isinstance(op, string_ops.StrConcatOp) + return pl.concat_str(l_input, r_input) + @dataclasses.dataclass(frozen=True) class PolarsAggregateCompiler: scalar_compiler = PolarsExpressionCompiler() @@ -503,6 +513,30 @@ def compile_join(self, node: nodes.JoinNode): left, right, node.type, left_on, right_on, node.joins_nulls ) + @compile_node.register + def compile_isin(self, node: nodes.InNode): + left = self.compile_node(node.left_child) + right = self.compile_node(node.right_child).unique(node.right_col.id.sql) + right = right.with_columns(pl.lit(True).alias(node.indicator_col.sql)) + + left_ex, right_ex = lowering._coerce_comparables(node.left_col, node.right_col) + + left_pl_ex = self.expr_compiler.compile_expression(left_ex) + right_pl_ex = self.expr_compiler.compile_expression(right_ex) + + joined = left.join( + right, + how="left", + left_on=left_pl_ex, + right_on=right_pl_ex, + # Note: join_nulls renamed to nulls_equal for polars 1.24 + join_nulls=node.joins_nulls, # type: ignore + coalesce=False, + ) + passthrough = [pl.col(id) for id in left.columns] + indicator = pl.col(node.indicator_col.sql).fill_null(False) + return joined.select((*passthrough, indicator)) + def _ordered_join( self, left_frame: pl.LazyFrame, diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index 48d63e9ed9..ee0933b450 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -37,26 +37,259 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: return expr.op.as_expr(larg, rarg) +class LowerAddRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.AddOp + + def lower(self, 
expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.AddOp) + larg, rarg = expr.children[0], expr.children[1] + + if ( + larg.output_type == dtypes.BOOL_DTYPE + and rarg.output_type == dtypes.BOOL_DTYPE + ): + int_result = expr.op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg), + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg), + ) + return ops.AsTypeOp(to_type=dtypes.BOOL_DTYPE).as_expr(int_result) + + if dtypes.is_string_like(larg.output_type) and dtypes.is_string_like( + rarg.output_type + ): + return ops.strconcat_op.as_expr(larg, rarg) + + if larg.output_type == dtypes.BOOL_DTYPE: + larg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg) + if rarg.output_type == dtypes.BOOL_DTYPE: + rarg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg) + + if ( + larg.output_type == dtypes.DATE_DTYPE + and rarg.output_type == dtypes.TIMEDELTA_DTYPE + ): + larg = ops.AsTypeOp(to_type=dtypes.DATETIME_DTYPE).as_expr(larg) + + if ( + larg.output_type == dtypes.TIMEDELTA_DTYPE + and rarg.output_type == dtypes.DATE_DTYPE + ): + rarg = ops.AsTypeOp(to_type=dtypes.DATETIME_DTYPE).as_expr(rarg) + + return expr.op.as_expr(larg, rarg) + + +class LowerSubRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.SubOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.SubOp) + larg, rarg = expr.children[0], expr.children[1] + + if ( + larg.output_type == dtypes.BOOL_DTYPE + and rarg.output_type == dtypes.BOOL_DTYPE + ): + int_result = expr.op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg), + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg), + ) + return ops.AsTypeOp(to_type=dtypes.BOOL_DTYPE).as_expr(int_result) + + if larg.output_type == dtypes.BOOL_DTYPE: + larg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg) + if rarg.output_type == dtypes.BOOL_DTYPE: + rarg = 
ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg) + + if ( + larg.output_type == dtypes.DATE_DTYPE + and rarg.output_type == dtypes.TIMEDELTA_DTYPE + ): + larg = ops.AsTypeOp(to_type=dtypes.DATETIME_DTYPE).as_expr(larg) + + return expr.op.as_expr(larg, rarg) + + +@dataclasses.dataclass +class LowerMulRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.MulOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.MulOp) + larg, rarg = expr.children[0], expr.children[1] + + if ( + larg.output_type == dtypes.BOOL_DTYPE + and rarg.output_type == dtypes.BOOL_DTYPE + ): + int_result = expr.op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg), + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg), + ) + return ops.AsTypeOp(to_type=dtypes.BOOL_DTYPE).as_expr(int_result) + + if ( + larg.output_type == dtypes.BOOL_DTYPE + and rarg.output_type != dtypes.BOOL_DTYPE + ): + larg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg) + if ( + rarg.output_type == dtypes.BOOL_DTYPE + and larg.output_type != dtypes.BOOL_DTYPE + ): + rarg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg) + + return expr.op.as_expr(larg, rarg) + + +class LowerDivRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.DivOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.DivOp) + + dividend = expr.children[0] + divisor = expr.children[1] + + if dividend.output_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric( + divisor.output_type + ): + # exact same as floordiv impl for timedelta + numeric_result = ops.floordiv_op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend), divisor + ) + int_result = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(numeric_result) + return ops.AsTypeOp(to_type=dtypes.TIMEDELTA_DTYPE).as_expr(int_result) + 
+ if ( + dividend.output_type == dtypes.BOOL_DTYPE + and divisor.output_type == dtypes.BOOL_DTYPE + ): + int_result = expr.op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend), + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(divisor), + ) + return ops.AsTypeOp(to_type=dtypes.BOOL_DTYPE).as_expr(int_result) + + # polars divide doesn't like bools, convert to int always + # convert numerics to float always + if dividend.output_type == dtypes.BOOL_DTYPE: + dividend = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend) + elif dividend.output_type in (dtypes.BIGNUMERIC_DTYPE, dtypes.NUMERIC_DTYPE): + dividend = ops.AsTypeOp(to_type=dtypes.FLOAT_DTYPE).as_expr(dividend) + if divisor.output_type == dtypes.BOOL_DTYPE: + divisor = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(divisor) + + return numeric_ops.div_op.as_expr(dividend, divisor) + + class LowerFloorDivRule(op_lowering.OpLoweringRule): @property def op(self) -> type[ops.ScalarOp]: return numeric_ops.FloorDivOp def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, numeric_ops.FloorDivOp) + dividend = expr.children[0] divisor = expr.children[1] - using_floats = (dividend.output_type == dtypes.FLOAT_DTYPE) or ( - divisor.output_type == dtypes.FLOAT_DTYPE - ) - inf_or_zero = ( - expression.const(float("INF")) if using_floats else expression.const(0) - ) - zero_result = ops.mul_op.as_expr(inf_or_zero, dividend) - divisor_is_zero = ops.eq_op.as_expr(divisor, expression.const(0)) - return ops.where_op.as_expr(zero_result, divisor_is_zero, expr) + + if ( + dividend.output_type == dtypes.TIMEDELTA_DTYPE + and divisor.output_type == dtypes.TIMEDELTA_DTYPE + ): + int_result = expr.op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend), + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(divisor), + ) + return int_result + if dividend.output_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric( + divisor.output_type + ): + # this is pretty 
fragile as zero will break it, and must fit back into int + numeric_result = expr.op.as_expr( + ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend), divisor + ) + int_result = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(numeric_result) + return ops.AsTypeOp(to_type=dtypes.TIMEDELTA_DTYPE).as_expr(int_result) + + if dividend.output_type == dtypes.BOOL_DTYPE: + dividend = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(dividend) + if divisor.output_type == dtypes.BOOL_DTYPE: + divisor = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(divisor) + + if expr.output_type != dtypes.FLOAT_DTYPE: + # need to guard against zero divisor + # multiply dividend in this case to propagate nulls + return ops.where_op.as_expr( + ops.mul_op.as_expr(dividend, expression.const(0)), + ops.eq_op.as_expr(divisor, expression.const(0)), + numeric_ops.floordiv_op.as_expr(dividend, divisor), + ) + else: + return expr.op.as_expr(dividend, divisor) + + +class LowerModRule(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return numeric_ops.ModOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + og_expr = expr + assert isinstance(expr.op, numeric_ops.ModOp) + larg, rarg = expr.children[0], expr.children[1] + + if ( + larg.output_type == dtypes.TIMEDELTA_DTYPE + and rarg.output_type == dtypes.TIMEDELTA_DTYPE + ): + larg_int = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg) + rarg_int = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg) + int_result = expr.op.as_expr(larg_int, rarg_int) + w_zero_handling = ops.where_op.as_expr( + int_result, + ops.ne_op.as_expr(rarg_int, expression.const(0)), + ops.mul_op.as_expr(rarg_int, expression.const(0)), + ) + return ops.AsTypeOp(to_type=dtypes.TIMEDELTA_DTYPE).as_expr(w_zero_handling) + + if larg.output_type == dtypes.BOOL_DTYPE: + larg = ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(larg) + if rarg.output_type == dtypes.BOOL_DTYPE: + rarg = 
ops.AsTypeOp(to_type=dtypes.INT_DTYPE).as_expr(rarg) + + wo_bools = expr.op.as_expr(larg, rarg) + + if og_expr.output_type == dtypes.INT_DTYPE: + return ops.where_op.as_expr( + wo_bools, + ops.ne_op.as_expr(rarg, expression.const(0)), + ops.mul_op.as_expr(rarg, expression.const(0)), + ) + return wo_bools -def _coerce_comparables(expr1: expression.Expression, expr2: expression.Expression): +def _coerce_comparables( + expr1: expression.Expression, + expr2: expression.Expression, + *, + bools_only: bool = False +): + if bools_only: + if ( + expr1.output_type != dtypes.BOOL_DTYPE + and expr2.output_type != dtypes.BOOL_DTYPE + ): + return expr1, expr2 target_type = dtypes.coerce_to_common(expr1.output_type, expr2.output_type) if expr1.output_type != target_type: @@ -90,7 +323,12 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): POLARS_LOWERING_RULES = ( *LOWER_COMPARISONS, + LowerAddRule(), + LowerSubRule(), + LowerMulRule(), + LowerDivRule(), LowerFloorDivRule(), + LowerModRule(), ) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 30da6b2cb2..95517ead35 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -487,9 +487,9 @@ def isalpha_op_impl(x: ibis_types.Value): @scalar_op_compiler.register_unary_op(ops.isdigit_op) def isdigit_op_impl(x: ibis_types.Value): - # Based on docs, should include superscript/subscript-ed numbers - # Tests however pass only when set to Nd unicode class - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + return typing.cast(ibis_types.StringValue, x).re_search( + r"^[\p{Nd}\x{00B9}\x{00B2}\x{00B3}\x{2070}\x{2074}-\x{2079}\x{2080}-\x{2089}]+$" + ) @scalar_op_compiler.register_unary_op(ops.isdecimal_op) @@ -1498,7 +1498,7 @@ def eq_op( x: ibis_types.Value, y: ibis_types.Value, ): - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) return x == y @@ -1508,7 +1508,7 @@ def 
eq_nulls_match_op( y: ibis_types.Value, ): """Variant of eq_op where nulls match each other. Only use where dtypes are known to be same.""" - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) literal = ibis_types.literal("$NULL_SENTINEL$") if hasattr(x, "fill_null"): left = x.cast(ibis_dtypes.str).fill_null(literal) @@ -1525,7 +1525,7 @@ def ne_op( x: ibis_types.Value, y: ibis_types.Value, ): - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) return x != y @@ -1537,13 +1537,10 @@ def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue ) -def _coerce_comparables( - x: ibis_types.Value, - y: ibis_types.Value, -): - if x.type().is_boolean() and not y.type().is_boolean(): +def _coerce_bools(x: ibis_types.Value, y: ibis_types.Value, *, always: bool = False): + if x.type().is_boolean() and (always or not y.type().is_boolean()): x = x.cast(ibis_dtypes.int64) - elif y.type().is_boolean() and not x.type().is_boolean(): + if y.type().is_boolean() and (always or not x.type().is_boolean()): y = y.cast(ibis_dtypes.int64) return x, y @@ -1604,8 +1601,18 @@ def add_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_bools(x, y) if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): return ibis_types.null() + + if x.type().is_boolean() and y.type().is_boolean(): + x, y = _coerce_bools(x, y, always=True) + return ( + typing.cast(ibis_types.NumericValue, x) + + typing.cast(ibis_types.NumericValue, y) + ).cast(ibis_dtypes.Boolean) + + x, y = _coerce_bools(x, y) return x + y # type: ignore @@ -1615,6 +1622,7 @@ def sub_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_bools(x, y) return typing.cast(ibis_types.NumericValue, x) - typing.cast( ibis_types.NumericValue, y ) @@ -1626,6 +1634,13 @@ def mul_op( x: ibis_types.Value, y: ibis_types.Value, ): + if x.type().is_boolean() and y.type().is_boolean(): + x, y = _coerce_bools(x, y, always=True) + return ( + 
typing.cast(ibis_types.NumericValue, x) + * typing.cast(ibis_types.NumericValue, y) + ).cast(ibis_dtypes.Boolean) + x, y = _coerce_bools(x, y) return typing.cast(ibis_types.NumericValue, x) * typing.cast( ibis_types.NumericValue, y ) @@ -1637,6 +1652,7 @@ def div_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_bools(x, y) return typing.cast(ibis_types.NumericValue, x) / typing.cast( ibis_types.NumericValue, y ) @@ -1648,6 +1664,7 @@ def pow_op( x: ibis_types.Value, y: ibis_types.Value, ): + x, y = _coerce_bools(x, y) if x.type().is_integer() and y.type().is_integer(): return _int_pow_op(x, y) else: @@ -1661,6 +1678,7 @@ def unsafe_pow_op( y: ibis_types.Value, ): """For internal use only - where domain and overflow checks are not needed.""" + x, y = _coerce_bools(x, y) return typing.cast(ibis_types.NumericValue, x) ** typing.cast( ibis_types.NumericValue, y ) @@ -1749,7 +1767,7 @@ def lt_op( x: ibis_types.Value, y: ibis_types.Value, ): - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) return x < y @@ -1759,7 +1777,7 @@ def le_op( x: ibis_types.Value, y: ibis_types.Value, ): - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) return x <= y @@ -1769,7 +1787,7 @@ def gt_op( x: ibis_types.Value, y: ibis_types.Value, ): - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) return x > y @@ -1779,7 +1797,7 @@ def ge_op( x: ibis_types.Value, y: ibis_types.Value, ): - x, y = _coerce_comparables(x, y) + x, y = _coerce_bools(x, y) return x >= y @@ -1789,6 +1807,10 @@ def floordiv_op( x: ibis_types.Value, y: ibis_types.Value, ): + if x.type().is_boolean(): + x = x.cast(ibis_dtypes.int64) + if y.type().is_boolean(): + y = y.cast(ibis_dtypes.int64) x_numeric = typing.cast(ibis_types.NumericValue, x) y_numeric = typing.cast(ibis_types.NumericValue, y) floordiv_expr = x_numeric // y_numeric @@ -1827,6 +1849,7 @@ def mod_op( if isinstance(op, ibis_generic.Literal) and op.value == 0: return ibis_types.null().cast(x.type()) + x, 
y = _coerce_bools(x, y) if x.type().is_integer() and y.type().is_integer(): # both are ints, no casting necessary return _int_mod( diff --git a/bigframes/core/compile/sqlglot/aggregate_compiler.py b/bigframes/core/compile/sqlglot/aggregate_compiler.py index 888b3756b5..f7abd7dc7a 100644 --- a/bigframes/core/compile/sqlglot/aggregate_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregate_compiler.py @@ -13,16 +13,17 @@ # limitations under the License. from __future__ import annotations -import functools -import typing - import sqlglot.expressions as sge -from bigframes.core import expression, window_spec +from bigframes.core import expression +from bigframes.core.compile.sqlglot.aggregations import ( + binary_compiler, + nullary_compiler, + ordered_unary_compiler, + unary_compiler, +) from bigframes.core.compile.sqlglot.expressions import typed_expr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler -import bigframes.core.compile.sqlglot.sqlglot_ir as ir -import bigframes.operations as ops def compile_aggregate( @@ -31,16 +32,18 @@ def compile_aggregate( ) -> sge.Expression: """Compiles BigFrames aggregation expression into SQLGlot expression.""" if isinstance(aggregate, expression.NullaryAggregation): - return compile_nullary_agg(aggregate.op) + return nullary_compiler.compile(aggregate.op) if isinstance(aggregate, expression.UnaryAggregation): column = typed_expr.TypedExpr( scalar_compiler.compile_scalar_expression(aggregate.arg), aggregate.arg.output_type, ) if not aggregate.op.order_independent: - return compile_ordered_unary_agg(aggregate.op, column, order_by=order_by) + return ordered_unary_compiler.compile( + aggregate.op, column, order_by=order_by + ) else: - return compile_unary_agg(aggregate.op, column) + return unary_compiler.compile(aggregate.op, column) elif isinstance(aggregate, expression.BinaryAggregation): left = typed_expr.TypedExpr( scalar_compiler.compile_scalar_expression(aggregate.left), @@ -50,63 +53,6 @@ def 
compile_aggregate( scalar_compiler.compile_scalar_expression(aggregate.right), aggregate.right.output_type, ) - return compile_binary_agg(aggregate.op, left, right) + return binary_compiler.compile(aggregate.op, left, right) else: raise ValueError(f"Unexpected aggregation: {aggregate}") - - -@functools.singledispatch -def compile_nullary_agg( - op: ops.aggregations.WindowOp, - window: typing.Optional[window_spec.WindowSpec] = None, -) -> sge.Expression: - raise ValueError(f"Can't compile unrecognized operation: {op}") - - -@functools.singledispatch -def compile_binary_agg( - op: ops.aggregations.WindowOp, - left: typed_expr.TypedExpr, - right: typed_expr.TypedExpr, - window: typing.Optional[window_spec.WindowSpec] = None, -) -> sge.Expression: - raise ValueError(f"Can't compile unrecognized operation: {op}") - - -@functools.singledispatch -def compile_unary_agg( - op: ops.aggregations.WindowOp, - column: typed_expr.TypedExpr, - window: typing.Optional[window_spec.WindowSpec] = None, -) -> sge.Expression: - raise ValueError(f"Can't compile unrecognized operation: {op}") - - -@functools.singledispatch -def compile_ordered_unary_agg( - op: ops.aggregations.WindowOp, - column: typed_expr.TypedExpr, - window: typing.Optional[window_spec.WindowSpec] = None, - order_by: typing.Sequence[sge.Expression] = [], -) -> sge.Expression: - raise ValueError(f"Can't compile unrecognized operation: {op}") - - -@compile_unary_agg.register -def _( - op: ops.aggregations.SumOp, - column: typed_expr.TypedExpr, - window: typing.Optional[window_spec.WindowSpec] = None, -) -> sge.Expression: - # Will be null if all inputs are null. Pandas defaults to zero sum though. 
- expr = _apply_window_if_present(sge.func("SUM", column.expr), window) - return sge.func("IFNULL", expr, ir._literal(0, column.dtype)) - - -def _apply_window_if_present( - value: sge.Expression, - window: typing.Optional[window_spec.WindowSpec] = None, -) -> sge.Expression: - if window is not None: - raise NotImplementedError("Can't apply window to the expression.") - return value diff --git a/bigframes/core/compile/sqlglot/aggregations/__init__.py b/bigframes/core/compile/sqlglot/aggregations/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/core/compile/sqlglot/aggregations/binary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/binary_compiler.py new file mode 100644 index 0000000000..a162a9c18a --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/binary_compiler.py @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes.core import window_spec +import bigframes.core.compile.sqlglot.aggregations.op_registration as reg +import bigframes.core.compile.sqlglot.expressions.typed_expr as typed_expr +from bigframes.operations import aggregations as agg_ops + +BINARY_OP_REGISTRATION = reg.OpRegistration() + + +def compile( + op: agg_ops.WindowOp, + left: typed_expr.TypedExpr, + right: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return BINARY_OP_REGISTRATION[op](op, left, right, window=window) diff --git a/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py new file mode 100644 index 0000000000..720ce743a6 --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py @@ -0,0 +1,41 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes.core import window_spec +import bigframes.core.compile.sqlglot.aggregations.op_registration as reg +from bigframes.core.compile.sqlglot.aggregations.utils import apply_window_if_present +from bigframes.operations import aggregations as agg_ops + +NULLARY_OP_REGISTRATION = reg.OpRegistration() + + +def compile( + op: agg_ops.WindowOp, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return NULLARY_OP_REGISTRATION[op](op, window=window) + + +@NULLARY_OP_REGISTRATION.register(agg_ops.SizeOp) +def _( + op: agg_ops.SizeOp, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window) diff --git a/bigframes/core/compile/sqlglot/aggregations/op_registration.py b/bigframes/core/compile/sqlglot/aggregations/op_registration.py new file mode 100644 index 0000000000..996bf5b362 --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/op_registration.py @@ -0,0 +1,62 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +from sqlglot import expressions as sge + +from bigframes.operations import aggregations as agg_ops + +# We should've been more specific about input types. Unfortunately, +# MyPy doesn't support more rigorous checks. 
+CompilationFunc = typing.Callable[..., sge.Expression] + + +class OpRegistration: + def __init__(self) -> None: + self._registered_ops: dict[str, CompilationFunc] = {} + + def register( + self, op: agg_ops.WindowOp | type[agg_ops.WindowOp] + ) -> typing.Callable[[CompilationFunc], CompilationFunc]: + def decorator(item: CompilationFunc): + def arg_checker(*args, **kwargs): + if not isinstance(args[0], agg_ops.WindowOp): + raise ValueError( + "The first parameter must be a window operator. " + f"Got {type(args[0])}" + ) + return item(*args, **kwargs) + + if hasattr(op, "name"): + key = typing.cast(str, op.name) + if key in self._registered_ops: + raise ValueError(f"{key} is already registered") + else: + raise ValueError(f"The operator must have a 'name' attribute. Got {op}") + self._registered_ops[key] = item + return arg_checker + + return decorator + + def __getitem__(self, op: str | agg_ops.WindowOp) -> CompilationFunc: + if isinstance(op, agg_ops.WindowOp): + if not hasattr(op, "name"): + raise ValueError(f"The operator must have a 'name' attribute. Got {op}") + else: + key = typing.cast(str, op.name) + return self._registered_ops[key] + return self._registered_ops[op] diff --git a/bigframes/core/compile/sqlglot/aggregations/ordered_unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/ordered_unary_compiler.py new file mode 100644 index 0000000000..dea30ec206 --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/ordered_unary_compiler.py @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes.core import window_spec +import bigframes.core.compile.sqlglot.aggregations.op_registration as reg +import bigframes.core.compile.sqlglot.expressions.typed_expr as typed_expr +from bigframes.operations import aggregations as agg_ops + +ORDERED_UNARY_OP_REGISTRATION = reg.OpRegistration() + + +def compile( + op: agg_ops.WindowOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, + order_by: typing.Sequence[sge.Expression] = [], +) -> sge.Expression: + return ORDERED_UNARY_OP_REGISTRATION[op]( + op, column, window=window, order_by=order_by + ) diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py new file mode 100644 index 0000000000..75ba090bc4 --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -0,0 +1,56 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes.core import window_spec +import bigframes.core.compile.sqlglot.aggregations.op_registration as reg +from bigframes.core.compile.sqlglot.aggregations.utils import apply_window_if_present +import bigframes.core.compile.sqlglot.expressions.typed_expr as typed_expr +import bigframes.core.compile.sqlglot.sqlglot_ir as ir +from bigframes.operations import aggregations as agg_ops + +UNARY_OP_REGISTRATION = reg.OpRegistration() + + +def compile( + op: agg_ops.WindowOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return UNARY_OP_REGISTRATION[op](op, column, window=window) + + +@UNARY_OP_REGISTRATION.register(agg_ops.SumOp) +def _( + op: agg_ops.SumOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + # Will be null if all inputs are null. Pandas defaults to zero sum though. + expr = apply_window_if_present(sge.func("SUM", column.expr), window) + return sge.func("IFNULL", expr, ir._literal(0, column.dtype)) + + +@UNARY_OP_REGISTRATION.register(agg_ops.SizeUnaryOp) +def _( + op: agg_ops.SizeUnaryOp, + _, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window) diff --git a/bigframes/core/compile/sqlglot/aggregations/utils.py b/bigframes/core/compile/sqlglot/aggregations/utils.py new file mode 100644 index 0000000000..57470cde5b --- /dev/null +++ b/bigframes/core/compile/sqlglot/aggregations/utils.py @@ -0,0 +1,29 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import typing + +import sqlglot.expressions as sge + +from bigframes.core import window_spec + + +def apply_window_if_present( + value: sge.Expression, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + if window is not None: + raise NotImplementedError("Can't apply window to the expression.") + return value diff --git a/bigframes/core/compile/sqlglot/expressions/op_registration.py b/bigframes/core/compile/sqlglot/expressions/op_registration.py index e30b58a6d2..d5e4853a45 100644 --- a/bigframes/core/compile/sqlglot/expressions/op_registration.py +++ b/bigframes/core/compile/sqlglot/expressions/op_registration.py @@ -48,7 +48,7 @@ def arg_checker(*args, **kwargs): return decorator - def __getitem__(self, key: str | ops.ScalarOp) -> CompilationFunc: - if isinstance(key, ops.ScalarOp): - return self._registered_ops[key.name] - return self._registered_ops[key] + def __getitem__(self, op: str | ops.ScalarOp) -> CompilationFunc: + if isinstance(op, ops.ScalarOp): + return self._registered_ops[op.name] + return self._registered_ops[op] diff --git a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py index 9cca15f352..22079a9a6d 100644 --- a/bigframes/core/compile/sqlglot/expressions/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/unary_compiler.py @@ -23,6 +23,14 @@ from bigframes.core.compile.sqlglot.expressions.op_registration import OpRegistration from 
bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr +_NAN = sge.Cast(this=sge.convert("NaN"), to="FLOAT64") +_INF = sge.Cast(this=sge.convert("Infinity"), to="FLOAT64") + +# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result +# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) +# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. +_FLOAT64_EXP_BOUND = sge.convert(709.78) + UNARY_OP_REGISTRATION = OpRegistration() @@ -30,6 +38,73 @@ def compile(op: ops.UnaryOp, expr: TypedExpr) -> sge.Expression: return UNARY_OP_REGISTRATION[op](op, expr) +@UNARY_OP_REGISTRATION.register(ops.abs_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Abs(this=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.arccosh_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=expr.expr < sge.convert(1), + true=_NAN, + ) + ], + default=sge.func("ACOSH", expr.expr), + ) + + +@UNARY_OP_REGISTRATION.register(ops.arccos_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=sge.func("ABS", expr.expr) > sge.convert(1), + true=_NAN, + ) + ], + default=sge.func("ACOS", expr.expr), + ) + + +@UNARY_OP_REGISTRATION.register(ops.arcsin_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=sge.func("ABS", expr.expr) > sge.convert(1), + true=_NAN, + ) + ], + default=sge.func("ASIN", expr.expr), + ) + + +@UNARY_OP_REGISTRATION.register(ops.arcsinh_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("ASINH", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.arctan_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("ATAN", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.arctanh_op) +def _(op: 
ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=sge.func("ABS", expr.expr) > sge.convert(1), + true=_NAN, + ) + ], + default=sge.func("ATANH", expr.expr), + ) + + @UNARY_OP_REGISTRATION.register(ops.ArrayToStringOp) def _(op: ops.ArrayToStringOp, expr: TypedExpr) -> sge.Expression: return sge.ArrayToString(this=expr.expr, expression=f"'{op.delimiter}'") @@ -72,6 +147,209 @@ def _(op: ops.ArraySliceOp, expr: TypedExpr) -> sge.Expression: return sge.array(selected_elements) +@UNARY_OP_REGISTRATION.register(ops.capitalize_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Initcap(this=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.ceil_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Ceil(this=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.cos_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("COS", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.cosh_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=sge.func("ABS", expr.expr) > sge.convert(709.78), + true=_INF, + ) + ], + default=sge.func("COSH", expr.expr), + ) + + +@UNARY_OP_REGISTRATION.register(ops.date_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Date(this=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.day_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Extract(this=sge.Identifier(this="DAY"), expression=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.dayofweek_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + # Adjust the 1-based day-of-week index (from SQL) to a 0-based index. 
+ return sge.Extract( + this=sge.Identifier(this="DAYOFWEEK"), expression=expr.expr + ) - sge.convert(1) + + +@UNARY_OP_REGISTRATION.register(ops.dayofyear_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Extract(this=sge.Identifier(this="DAYOFYEAR"), expression=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.exp_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=expr.expr > _FLOAT64_EXP_BOUND, + true=_INF, + ) + ], + default=sge.func("EXP", expr.expr), + ) + + +@UNARY_OP_REGISTRATION.register(ops.expm1_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=expr.expr > _FLOAT64_EXP_BOUND, + true=_INF, + ) + ], + default=sge.func("EXP", expr.expr), + ) - sge.convert(1) + + +@UNARY_OP_REGISTRATION.register(ops.floor_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Floor(this=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.hash_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("FARM_FINGERPRINT", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.hour_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Extract(this=sge.Identifier(this="HOUR"), expression=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.invert_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.BitwiseNot(this=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.isalnum_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^(\p{N}|\p{L})+$")) + + +@UNARY_OP_REGISTRATION.register(ops.isalpha_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^\p{L}+$")) + + +@UNARY_OP_REGISTRATION.register(ops.isdecimal_op) +def _(op: 
ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^\d+$")) + + +@UNARY_OP_REGISTRATION.register(ops.isdigit_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^\p{Nd}+$")) + + +@UNARY_OP_REGISTRATION.register(ops.islower_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.And( + this=sge.EQ( + this=sge.Lower(this=expr.expr), + expression=expr.expr, + ), + expression=sge.NEQ( + this=sge.Upper(this=expr.expr), + expression=expr.expr, + ), + ) + + +@UNARY_OP_REGISTRATION.register(ops.isnumeric_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^\pN+$")) + + +@UNARY_OP_REGISTRATION.register(ops.isspace_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.RegexpLike(this=expr.expr, expression=sge.convert(r"^\s+$")) + + +@UNARY_OP_REGISTRATION.register(ops.isupper_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.And( + this=sge.EQ( + this=sge.Upper(this=expr.expr), + expression=expr.expr, + ), + expression=sge.NEQ( + this=sge.Lower(this=expr.expr), + expression=expr.expr, + ), + ) + + +@UNARY_OP_REGISTRATION.register(ops.iso_day_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Extract(this=sge.Identifier(this="DAYOFWEEK"), expression=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.iso_week_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Extract(this=sge.Identifier(this="ISOWEEK"), expression=expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.isnull_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Is(this=expr.expr, expression=sge.Null()) + + +@UNARY_OP_REGISTRATION.register(ops.notnull_op) +def _(op: 
ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Not(this=sge.Is(this=expr.expr, expression=sge.Null())) + + +@UNARY_OP_REGISTRATION.register(ops.sin_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("SIN", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.sinh_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.Case( + ifs=[ + sge.If( + this=sge.func("ABS", expr.expr) > _FLOAT64_EXP_BOUND, + true=sge.func("SIGN", expr.expr) * _INF, + ) + ], + default=sge.func("SINH", expr.expr), + ) + + +@UNARY_OP_REGISTRATION.register(ops.tan_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("TAN", expr.expr) + + +@UNARY_OP_REGISTRATION.register(ops.tanh_op) +def _(op: ops.base_ops.UnaryOp, expr: TypedExpr) -> sge.Expression: + return sge.func("TANH", expr.expr) + + # JSON Ops @UNARY_OP_REGISTRATION.register(ops.JSONExtract) def _(op: ops.JSONExtract, expr: TypedExpr) -> sge.Expression: diff --git a/bigframes/core/rewrite/schema_binding.py b/bigframes/core/rewrite/schema_binding.py index 40a00ff8f6..f7f2ca8c59 100644 --- a/bigframes/core/rewrite/schema_binding.py +++ b/bigframes/core/rewrite/schema_binding.py @@ -65,6 +65,16 @@ def bind_schema_to_node( node, conditions=conditions, ) + if isinstance(node, nodes.InNode): + return dataclasses.replace( + node, + left_col=ex.ResolvedDerefOp.from_field( + node.left_child.field_by_id[node.left_col.id] + ), + right_col=ex.ResolvedDerefOp.from_field( + node.right_child.field_by_id[node.right_col.id] + ), + ) if isinstance(node, nodes.AggregateNode): aggregations = [] diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 0be31505df..a58619dc21 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -659,6 +659,8 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: def infer_literal_type(literal) -> typing.Optional[Dtype]: # Maybe also normalize literal to canonical 
python representation to remove this burden from compilers? + if isinstance(literal, pa.Scalar): + return arrow_dtype_to_bigframes_dtype(literal.type) if pd.api.types.is_list_like(literal): element_types = [infer_literal_type(i) for i in literal] common_type = lcd_type(*element_types) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 1833ac489c..2c9dd0cb31 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -202,6 +202,9 @@ def provision_bq_managed_function( output_type: str, name: Optional[str], packages: Optional[Sequence[str]], + max_batching_rows: Optional[int], + container_cpu: Optional[float], + container_memory: Optional[str], is_row_processor: bool, bq_connection_id, *, @@ -234,6 +237,12 @@ def provision_bq_managed_function( "runtime_version": _MANAGED_FUNC_PYTHON_VERSION, "entry_point": "bigframes_handler", } + if max_batching_rows: + managed_function_options["max_batching_rows"] = max_batching_rows + if container_cpu: + managed_function_options["container_cpu"] = container_cpu + if container_memory: + managed_function_options["container_memory"] = container_memory # Augment user package requirements with any internal package # requirements. diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a7910127e4..22e6981c38 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -702,6 +702,9 @@ def udf( bigquery_connection: Optional[str] = None, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, + max_batching_rows: Optional[int] = None, + container_cpu: Optional[float] = None, + container_memory: Optional[str] = None, ): """Decorator to turn a Python user defined function (udf) into a BigQuery managed function. 
@@ -769,6 +772,21 @@ def udf( dependency is added to the `requirements.txt` as is, and can be of the form supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. + max_batching_rows (int, Optional): + The maximum number of rows in each batch. If you specify + max_batching_rows, BigQuery determines the number of rows in a + batch, up to the max_batching_rows limit. If max_batching_rows + is not specified, the number of rows to batch is determined + automatically. + container_cpu (float, Optional): + The CPU limits for containers that run Python UDFs. By default, + the CPU allocated is 0.33 vCPU. See details at + https://cloud.google.com/bigquery/docs/user-defined-functions-python#configure-container-limits. + container_memory (str, Optional): + The memory limits for containers that run Python UDFs. By + default, the memory allocated to each container instance is + 512 MiB. See details at + https://cloud.google.com/bigquery/docs/user-defined-functions-python#configure-container-limits. 
""" warnings.warn("udf is in preview.", category=bfe.PreviewWarning, stacklevel=5) @@ -854,6 +872,9 @@ def wrapper(func): output_type=udf_sig.sql_output_type, name=name, packages=packages, + max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, is_row_processor=is_row_processor, bq_connection_id=bq_connection_id, ) diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py index e79b46877b..f6c7d5e52f 100644 --- a/bigframes/ml/metrics/__init__.py +++ b/bigframes/ml/metrics/__init__.py @@ -18,6 +18,7 @@ auc, confusion_matrix, f1_score, + mean_absolute_error, mean_squared_error, precision_score, r2_score, @@ -36,6 +37,7 @@ "confusion_matrix", "precision_score", "f1_score", + "mean_absolute_error", "mean_squared_error", "pairwise", ] diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index d7591ef011..c9639f4b16 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -344,3 +344,17 @@ def mean_squared_error( mean_squared_error.__doc__ = inspect.getdoc( vendored_metrics_regression.mean_squared_error ) + + +def mean_absolute_error( + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], +) -> float: + y_true_series, y_pred_series = utils.batch_convert_to_series(y_true, y_pred) + + return (y_pred_series - y_true_series).abs().sum() / len(y_true_series) + + +mean_absolute_error.__doc__ = inspect.getdoc( + vendored_metrics_regression.mean_absolute_error +) diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 56320e7cc6..14bf10f463 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py @@ -49,10 +49,18 @@ def day(self) -> series.Series: def dayofweek(self) -> series.Series: return self._apply_unary_op(ops.dayofweek_op) + @property + def day_of_week(self) -> series.Series: + return self.dayofweek + @property def dayofyear(self) -> 
series.Series: return self._apply_unary_op(ops.dayofyear_op) + @property + def day_of_year(self) -> series.Series: + return self.dayofyear + @property def date(self) -> series.Series: return self._apply_unary_op(ops.date_op) diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 64eec9d8a1..afdc924c0b 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -140,7 +140,8 @@ class AddOp(base_ops.BinaryOp): def output_type(self, *input_types): left_type = input_types[0] right_type = input_types[1] - if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: + # TODO: Binary/bytes addition requires impl + if all(map(lambda t: t == dtypes.STRING_DTYPE, input_types)): # String addition return input_types[0] @@ -179,7 +180,10 @@ def output_type(self, *input_types): left_type = input_types[0] right_type = input_types[1] - if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): + if left_type == dtypes.DATETIME_DTYPE and right_type == dtypes.DATETIME_DTYPE: + return dtypes.TIMEDELTA_DTYPE + + if left_type == dtypes.TIMESTAMP_DTYPE and right_type == dtypes.TIMESTAMP_DTYPE: return dtypes.TIMEDELTA_DTYPE if left_type == dtypes.DATE_DTYPE and right_type == dtypes.DATE_DTYPE: @@ -194,6 +198,9 @@ def output_type(self, *input_types): if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE + if left_type == dtypes.BOOL_DTYPE and right_type == dtypes.BOOL_DTYPE: + raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") + if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) ): @@ -214,9 +221,15 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT left_type = input_types[0] right_type = input_types[1] - if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + if left_type == 
dtypes.TIMEDELTA_DTYPE and right_type in ( + dtypes.INT_DTYPE, + dtypes.FLOAT_DTYPE, + ): return dtypes.TIMEDELTA_DTYPE - if dtypes.is_numeric(left_type) and right_type == dtypes.TIMEDELTA_DTYPE: + if ( + left_type in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE) + and right_type == dtypes.TIMEDELTA_DTYPE + ): return dtypes.TIMEDELTA_DTYPE if (left_type is None or dtypes.is_numeric(left_type)) and ( @@ -239,11 +252,15 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT right_type = input_types[1] if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): + # will fail outright if result undefined or otherwise can't be coerced back into an int return dtypes.TIMEDELTA_DTYPE if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: return dtypes.FLOAT_DTYPE + if left_type == dtypes.BOOL_DTYPE and right_type == dtypes.BOOL_DTYPE: + raise TypeError(f"Cannot divide dtypes {left_type} and {right_type}") + if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) ): @@ -265,11 +282,14 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT left_type = input_types[0] right_type = input_types[1] + if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: + return dtypes.INT_DTYPE + if left_type == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right_type): return dtypes.TIMEDELTA_DTYPE - if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE: - return dtypes.INT_DTYPE + if left_type == dtypes.BOOL_DTYPE and right_type == dtypes.BOOL_DTYPE: + raise TypeError(f"Cannot floor divide dtypes {left_type} and {right_type}") if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) @@ -292,6 +312,14 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT if left_type == dtypes.TIMEDELTA_DTYPE and right_type == 
dtypes.TIMEDELTA_DTYPE: return dtypes.TIMEDELTA_DTYPE + if left_type in ( + dtypes.NUMERIC_DTYPE, + dtypes.BIGNUMERIC_DTYPE, + ) or right_type in (dtypes.NUMERIC_DTYPE, dtypes.BIGNUMERIC_DTYPE): + raise TypeError(f"Cannot mod dtypes {left_type} and {right_type}") + + if left_type == dtypes.BOOL_DTYPE and right_type == dtypes.BOOL_DTYPE: + raise TypeError(f"Cannot mod dtypes {left_type} and {right_type}") if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index f163d25757..76e0f8719b 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -142,6 +142,9 @@ def udf( bigquery_connection: Optional[str] = None, name: str, packages: Optional[Sequence[str]] = None, + max_batching_rows: Optional[int] = None, + container_cpu: Optional[float] = None, + container_memory: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.udf, @@ -151,6 +154,9 @@ def udf( bigquery_connection=bigquery_connection, name=name, packages=packages, + max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 2c9dea2d19..d27cd48cdd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1686,6 +1686,9 @@ def udf( bigquery_connection: Optional[str] = None, name: str, packages: Optional[Sequence[str]] = None, + max_batching_rows: Optional[int] = None, + container_cpu: Optional[float] = None, + container_memory: Optional[str] = None, ): """Decorator to turn a Python user defined function (udf) into a [BigQuery managed user-defined function](https://cloud.google.com/bigquery/docs/user-defined-functions-python). 
@@ -1807,6 +1810,21 @@ def udf( dependency is added to the `requirements.txt` as is, and can be of the form supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. + max_batching_rows (int, Optional): + The maximum number of rows in each batch. If you specify + max_batching_rows, BigQuery determines the number of rows in a + batch, up to the max_batching_rows limit. If max_batching_rows + is not specified, the number of rows to batch is determined + automatically. + container_cpu (float, Optional): + The CPU limits for containers that run Python UDFs. By default, + the CPU allocated is 0.33 vCPU. See details at + https://cloud.google.com/bigquery/docs/user-defined-functions-python#configure-container-limits. + container_memory (str, Optional): + The memory limits for containers that run Python UDFs. By + default, the memory allocated to each container instance is + 512 MiB. See details at + https://cloud.google.com/bigquery/docs/user-defined-functions-python#configure-container-limits. 
Returns: collections.abc.Callable: A managed function object pointing to the cloud assets created @@ -1828,6 +1846,9 @@ def udf( bigquery_connection=bigquery_connection, name=name, packages=packages, + max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, ) def read_gbq_function( diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index add4efb6ab..c264abd860 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -983,6 +983,8 @@ def read_gbq_query( ) job_config.dry_run = True query_job = self._bqclient.query(query, job_config=job_config) + if self._metrics is not None: + self._metrics.count_job_stats(query_job=query_job) return dry_runs.get_query_stats_with_inferred_dtypes( query_job, list(columns), index_cols ) diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 48cb92a8b4..75f247b028 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -51,6 +51,9 @@ def count_job_stats( write_stats_to_disk(len(query), total_bytes_processed) return + if query_job.configuration.dry_run: + write_stats_to_disk(len(query_job.query), 0, 0, 0) + stats = get_performance_stats(query_job) if stats is not None: query_char_count, bytes_processed, slot_millis, execution_secs = stats diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 3c23e4c200..9b2346a7ed 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -21,6 +21,7 @@ from bigframes.core import array_value, bigframe_node, expression, local_data, nodes import bigframes.operations from bigframes.operations import aggregations as agg_ops +from bigframes.operations import comparison_ops, numeric_ops from bigframes.session import executor, semi_executor if TYPE_CHECKING: @@ -38,16 +39,23 @@ nodes.FilterNode, nodes.ConcatNode, nodes.JoinNode, + nodes.InNode, ) _COMPATIBLE_SCALAR_OPS = ( - 
bigframes.operations.eq_op, - bigframes.operations.eq_null_match_op, - bigframes.operations.ne_op, - bigframes.operations.gt_op, - bigframes.operations.lt_op, - bigframes.operations.ge_op, - bigframes.operations.le_op, + comparison_ops.EqOp, + comparison_ops.EqNullsMatchOp, + comparison_ops.NeOp, + comparison_ops.LtOp, + comparison_ops.GtOp, + comparison_ops.LeOp, + comparison_ops.GeOp, + numeric_ops.AddOp, + numeric_ops.SubOp, + numeric_ops.MulOp, + numeric_ops.DivOp, + numeric_ops.FloorDivOp, + numeric_ops.ModOp, ) _COMPATIBLE_AGG_OPS = ( agg_ops.SizeOp, @@ -74,7 +82,7 @@ def _is_node_polars_executable(node: nodes.BigFrameNode): if not type(expr.op) in _COMPATIBLE_AGG_OPS: return False if isinstance(expr, expression.Expression): - if not _get_expr_ops(expr).issubset(_COMPATIBLE_SCALAR_OPS): + if not set(map(type, _get_expr_ops(expr))).issubset(_COMPATIBLE_SCALAR_OPS): return False return True @@ -117,7 +125,8 @@ def _can_execute(self, plan: bigframe_node.BigFrameNode): def _adapt_array(self, array: pa.Array) -> pa.Array: target_type = local_data.logical_type_replacements(array.type) if target_type != array.type: - return array.cast(target_type) + # Safe is false to handle weird polars decimal scaling + return array.cast(target_type, safe=False) return array def _adapt_batch(self, batch: pa.RecordBatch) -> pa.RecordBatch: diff --git a/bigframes/testing/engine_utils.py b/bigframes/testing/engine_utils.py index 8aa52cf51a..625d1727ee 100644 --- a/bigframes/testing/engine_utils.py +++ b/bigframes/testing/engine_utils.py @@ -31,4 +31,4 @@ def assert_equivalence_execution( assert e1_result.schema == e2_result.schema e1_table = e1_result.to_pandas() e2_table = e2_result.to_pandas() - pandas.testing.assert_frame_equal(e1_table, e2_table, rtol=1e-10) + pandas.testing.assert_frame_equal(e1_table, e2_table, rtol=1e-5) diff --git a/bigframes/version.py b/bigframes/version.py index 9e7a386601..4eec2e8af7 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 
+12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.11.0" +__version__ = "2.12.0" # {x-release-please-start-date} -__release_date__ = "2025-07-15" +__release_date__ = "2025-07-23" # {x-release-please-end} diff --git a/samples/dbt/.dbt.yml b/samples/dbt/.dbt.yml new file mode 100644 index 0000000000..98053bfc37 --- /dev/null +++ b/samples/dbt/.dbt.yml @@ -0,0 +1,13 @@ +dbt_sample_project: + outputs: + dev: # The target environment name (e.g., dev, prod) + compute_region: us-central1 # Region used for compute operations + dataset: dbt_sample_dateset # BigQuery dataset where dbt will create models + gcs_bucket: dbt_sample_bucket # GCS bucket to store output files + location: US # BigQuery dataset location + method: oauth # Authentication method + priority: interactive # Job priority: "interactive" or "batch" + project: bigframes-dev # GCP project ID + threads: 1 # Number of threads dbt can use for running models in parallel + type: bigquery # Specifies the dbt adapter + target: dev # The default target environment diff --git a/samples/dbt/README.md b/samples/dbt/README.md new file mode 100644 index 0000000000..c52b633116 --- /dev/null +++ b/samples/dbt/README.md @@ -0,0 +1,62 @@ +# dbt BigFrames Integration + +This repository provides simple examples of using **dbt Python models** with **BigQuery** in **BigFrames** mode. + +It includes basic configurations and sample models to help you get started quickly in a typical dbt project. + +## Highlights + +- `profiles.yml`: configures your connection to BigQuery. +- `dbt_project.yml`: configures your dbt project - **dbt_sample_project**. +- `dbt_bigframes_code_sample_1.py`: An example to read BigQuery data and perform basic transformation. +- `dbt_bigframes_code_sample_2.py`: An example to build an incremental model that leverages BigFrames UDF capabilities. 
+ +## Requirements + +Before using this project, ensure you have: + +- A [Google Cloud account](https://cloud.google.com/free?hl=en) +- A [dbt Cloud account](https://www.getdbt.com/signup) (if using dbt Cloud) +- Python and SQL basics +- Familiarity with dbt concepts and structure + +For more, see: +- https://docs.getdbt.com/guides/dbt-python-bigframes +- https://cloud.google.com/bigquery/docs/dataframes-dbt + +## Run Locally + +Follow these steps to run the Python models using dbt Core. + +1. **Install the dbt BigQuery adapter:** + + ```bash + pip install dbt-bigquery + ``` + +2. **Initialize a dbt project (if not already done):** + + ```bash + dbt init + ``` + + Follow the prompts to complete setup. + +3. **Finish the configuration and add sample code:** + + - Edit `~/.dbt/profiles.yml` to finish the configuration. + - Replace or add code samples in `.../models/example`. + +4. **Run your dbt models:** + + To run all models: + + ```bash + dbt run + ``` + + Or run a specific model: + + ```bash + dbt run --select your_model_name + ``` \ No newline at end of file diff --git a/samples/dbt/dbt_sample_project/dbt_project.yml b/samples/dbt/dbt_sample_project/dbt_project.yml new file mode 100644 index 0000000000..d12098a18a --- /dev/null +++ b/samples/dbt/dbt_sample_project/dbt_project.yml @@ -0,0 +1,39 @@ + +# Name your project! Project names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: 'dbt_sample_project' +version: '1.0.0' + +# This setting configures which "profile" dbt uses for this project. +profile: 'dbt_sample_project' + +# These configurations specify where dbt should look for different types of files. +# The `model-paths` config, for example, states that models in this project can be +# found in the "models/" directory. You probably won't need to change these! 
+model-paths: ["models"] +analysis-paths: ["analyses"] +test-paths: ["tests"] +seed-paths: ["seeds"] +macro-paths: ["macros"] +snapshot-paths: ["snapshots"] + +clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + + +# Configuring models +# Full documentation: https://docs.getdbt.com/docs/configuring-models + +# In this example config, we tell dbt to build all models in the example/ +# directory as views. These settings can be overridden in the individual model +# files using the `{{ config(...) }}` macro. +models: + dbt_sample_project: + # Optional: These settings (e.g., submission_method, notebook_template_id, + # etc.) can also be defined directly in the Python model using dbt.config. + submission_method: bigframes + # Config indicated by + and applies to all files under models/example/ + example: + +materialized: view diff --git a/samples/dbt/dbt_sample_project/models/example/dbt_bigframes_code_sample_1.py b/samples/dbt/dbt_sample_project/models/example/dbt_bigframes_code_sample_1.py new file mode 100644 index 0000000000..4c8ddf8f6c --- /dev/null +++ b/samples/dbt/dbt_sample_project/models/example/dbt_bigframes_code_sample_1.py @@ -0,0 +1,58 @@ +# This example demonstrates one of the most general usages of transforming raw +# BigQuery data into a processed table using a dbt Python model with BigFrames. +# See more from: https://cloud.google.com/bigquery/docs/dataframes-dbt. +# +# Key defaults when using BigFrames in a dbt Python model for BigQuery: +# - The default materialization is 'table' unless specified otherwise. This +# means dbt will create a new BigQuery table from the result of this model. +# - The default timeout for the job is 3600 seconds (60 minutes). This can be +# adjusted if your processing requires more time. +# - If no runtime template is provided, dbt will automatically create and reuse +# a default one for executing the Python code in BigQuery. 
+# +# BigFrames provides a pandas-like API for BigQuery data, enabling familiar +# data manipulation directly within your dbt project. This code sample +# illustrates a basic pattern for: +# 1. Reading data from an existing BigQuery dataset. +# 2. Processing it using pandas-like DataFrame operations powered by BigFrames. +# 3. Outputting a cleaned and transformed table, managed by dbt. + + +def model(dbt, session): + # Optional: Override settings from your dbt_project.yml file. + # When both are set, dbt.config takes precedence over dbt_project.yml. + # + # Use `dbt.config(submission_method="bigframes")` to tell dbt to execute + # this Python model using BigQuery DataFrames (BigFrames). This allows you + # to write pandas-like code that operates directly on BigQuery data + # without needing to pull all data into memory. + dbt.config(submission_method="bigframes") + + # Define the BigQuery table path from which to read data. + table = "bigquery-public-data.epa_historical_air_quality.temperature_hourly_summary" + + # Define the specific columns to select from the BigQuery table. + columns = ["state_name", "county_name", "date_local", "time_local", "sample_measurement"] + + # Read data from the specified BigQuery table into a BigFrames DataFrame. + df = session.read_gbq(table, columns=columns) + + # Sort the DataFrame by the specified columns. This prepares the data for + # `drop_duplicates` to ensure consistent duplicate removal. + df = df.sort_values(columns).drop_duplicates(columns) + + # Group the DataFrame by 'state_name', 'county_name', and 'date_local'. For + # each group, calculate the minimum and maximum of the 'sample_measurement' + # column. The result will be a BigFrames DataFrame with a MultiIndex. + result = df.groupby(["state_name", "county_name", "date_local"])["sample_measurement"]\ + .agg(["min", "max"]) + + # Rename some columns and convert the MultiIndex of the 'result' DataFrame + # into regular columns. 
This flattens the DataFrame so 'state_name', + # 'county_name', and 'date_local' become regular columns again. + result = result.rename(columns={'min': 'min_temperature', 'max': 'max_temperature'})\ + .reset_index() + + # Return the processed BigFrames DataFrame. + # In a dbt Python model, this DataFrame will be materialized as a table + return result diff --git a/samples/dbt/dbt_sample_project/models/example/dbt_bigframes_code_sample_2.py b/samples/dbt/dbt_sample_project/models/example/dbt_bigframes_code_sample_2.py new file mode 100644 index 0000000000..019e503393 --- /dev/null +++ b/samples/dbt/dbt_sample_project/models/example/dbt_bigframes_code_sample_2.py @@ -0,0 +1,67 @@ +# This example demonstrates how to build an **incremental dbt Python model** +# using BigFrames. +# +# Incremental models are essential for efficiently processing large datasets by +# only transforming new or changed data, rather than reprocessing the entire +# dataset every time. If the target table already exists, dbt will perform a +# merge based on the specified unique keys; otherwise, it will create a new +# table automatically. +# +# This model also showcases the definition and application of a **BigFrames +# User-Defined Function (UDF)** to add a descriptive summary column based on +# temperature data. BigFrames UDFs allow you to execute custom Python logic +# directly within BigQuery, leveraging BigQuery's scalability. + + +import bigframes.pandas as bpd + +def model(dbt, session): + # Optional: override settings from dbt_project.yml. + # When both are set, dbt.config takes precedence over dbt_project.yml. + dbt.config( + # Use BigFrames mode to execute this Python model. This enables + # pandas-like operations directly on BigQuery data. + submission_method="bigframes", + # Materialize this model as an 'incremental' table. This tells dbt to + # only process new or updated data on subsequent runs. 
+ materialized='incremental', + # Use MERGE strategy to update rows during incremental runs. + incremental_strategy='merge', + # Define the composite key that uniquely identifies a row in the + # target table. This key is used by the 'merge' strategy to match + # existing rows for updates during incremental runs. + unique_key=["state_name", "county_name", "date_local"], + ) + + # Reference an upstream dbt model or an existing BigQuery table as a + # BigFrames DataFrame. It allows you to seamlessly use the output of another + # dbt model as input to this one. + df = dbt.ref("dbt_bigframes_code_sample_1") + + # Define a BigFrames UDF to generate a temperature description. + # BigFrames UDFs allow you to define custom Python logic that executes + # directly within BigQuery. This is powerful for complex transformations. + @bpd.udf(dataset='dbt_sample_dataset', name='describe_udf') + def describe( + max_temperature: float, + min_temperature: float, + ) -> str: + is_hot = max_temperature > 85.0 + is_cold = min_temperature < 50.0 + + if is_hot and is_cold: + return "Expect both hot and cold conditions today." + if is_hot: + return "Overall, it's a hot day." + if is_cold: + return "Overall, it's a cold day." + return "Comfortable throughout the day." + + # Apply the UDF using combine and store the result in a column "describe". + df["describe"] = df["max_temperature"].combine(df["min_temperature"], describe) + + # Return the transformed BigFrames DataFrame. + # This DataFrame will be the final output of your incremental dbt model. + # On subsequent runs, only new or changed rows will be processed and merged + # into the target BigQuery table based on the `unique_key`. 
+ return df diff --git a/samples/snippets/performance_optimizations_test.py b/samples/snippets/performance_optimizations_test.py new file mode 100644 index 0000000000..43e14e31cc --- /dev/null +++ b/samples/snippets/performance_optimizations_test.py @@ -0,0 +1,52 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_performance_optimizations() -> None: + # [START bigquery_bigframes_use_peek_to_preview_data] + import bigframes.pandas as bpd + + # Read the "Penguins" table into a dataframe + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + + # Preview 3 random rows + df.peek(3) + # [END bigquery_bigframes_use_peek_to_preview_data] + assert df.peek(3) is not None + + import bigframes.pandas as bpd + + users = bpd.DataFrame({"user_name": ["John"]}) + groups = bpd.DataFrame({"group_id": ["group_1"]}) + transactions = bpd.DataFrame({"amount": [3], "completed": [True]}) + + # [START bigquery_bigframes_use_cache_after_expensive_operations] + # Assume you have 3 large dataframes "users", "group" and "transactions" + + # Expensive join operations + final_df = users.join(groups).join(transactions) + final_df.cache() + # Subsequent derived results will reuse the cached join + print(final_df.peek()) + print(len(final_df[final_df["completed"]])) + print(final_df.groupby("group_id")["amount"].mean().peek(30)) + # [END bigquery_bigframes_use_cache_after_expensive_operations] + assert final_df is not None + + # [START 
bigquery_bigframes_enable_deferred_repr_for_debugging] + import bigframes.pandas as bpd + + bpd.options.display.repr_mode = "deferred" + # [END bigquery_bigframes_enable_deferred_repr_for_debugging] + assert bpd.options.display.repr_mode == "deferred" diff --git a/samples/snippets/sessions_and_io_test.py b/samples/snippets/sessions_and_io_test.py new file mode 100644 index 0000000000..98c2c71424 --- /dev/null +++ b/samples/snippets/sessions_and_io_test.py @@ -0,0 +1,169 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_sessions_and_io(project_id: str, dataset_id: str) -> None: + YOUR_PROJECT_ID = project_id + YOUR_LOCATION = "us" + + # [START bigquery_dataframes_create_and_use_session_instance] + import bigframes + import bigframes.pandas as bpd + + # Create session object + context = bigframes.BigQueryOptions( + project=YOUR_PROJECT_ID, + location=YOUR_LOCATION, + ) + session = bigframes.Session(context) + + # Load a BigQuery table into a dataframe + df1 = session.read_gbq("bigquery-public-data.ml_datasets.penguins") + + # Create a dataframe with local data: + df2 = bpd.DataFrame({"my_col": [1, 2, 3]}, session=session) + # [END bigquery_dataframes_create_and_use_session_instance] + assert df1 is not None + assert df2 is not None + + # [START bigquery_dataframes_combine_data_from_multiple_sessions_raise_error] + import bigframes + import bigframes.pandas as bpd + + context = bigframes.BigQueryOptions(location=YOUR_LOCATION, project=YOUR_PROJECT_ID) + + session1 = bigframes.Session(context) + session2 = bigframes.Session(context) + + series1 = bpd.Series([1, 2, 3, 4, 5], session=session1) + series2 = bpd.Series([1, 2, 3, 4, 5], session=session2) + + try: + series1 + series2 + except ValueError as e: + print(e) # Error message: Cannot use combine sources from multiple sessions + # [END bigquery_dataframes_combine_data_from_multiple_sessions_raise_error] + + # [START bigquery_dataframes_set_options_for_global_session] + import bigframes.pandas as bpd + + # Set project ID for the global session + bpd.options.bigquery.project = YOUR_PROJECT_ID + # Update the global default session location + bpd.options.bigquery.location = YOUR_LOCATION + # [END bigquery_dataframes_set_options_for_global_session] + + # [START bigquery_dataframes_global_session_is_the_default_session] + # The following two statements are essentiall the same + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + df = bpd.get_global_session().read_gbq("bigquery-public-data.ml_datasets.penguins") 
+ # [END bigquery_dataframes_global_session_is_the_default_session] + assert df is not None + + # [START bigquery_dataframes_create_dataframe_from_py_and_np] + import numpy as np + + import bigframes.pandas as bpd + + s = bpd.Series([1, 2, 3]) + + # Create a dataframe with Python dict + df = bpd.DataFrame( + { + "col_1": [1, 2, 3], + "col_2": [4, 5, 6], + } + ) + + # Create a series with Numpy + s = bpd.Series(np.arange(10)) + # [END bigquery_dataframes_create_dataframe_from_py_and_np] + assert s is not None + + # [START bigquery_dataframes_create_dataframe_from_pandas] + import numpy as np + import pandas as pd + + import bigframes.pandas as bpd + + pd_df = pd.DataFrame(np.random.randn(4, 2)) + + # Convert Pandas dataframe to BigQuery DataFrame with read_pandas() + df_1 = bpd.read_pandas(pd_df) + # Convert Pandas dataframe to BigQuery DataFrame with the dataframe constructor + df_2 = bpd.DataFrame(pd_df) + # [END bigquery_dataframes_create_dataframe_from_pandas] + assert df_1 is not None + assert df_2 is not None + + # [START bigquery_dataframes_convert_bq_dataframe_to_pandas] + import bigframes.pandas as bpd + + bf_df = bpd.DataFrame({"my_col": [1, 2, 3]}) + # Returns a Pandas Dataframe + bf_df.to_pandas() + + bf_s = bpd.Series([1, 2, 3]) + # Returns a Pandas Series + bf_s.to_pandas() + # [END bigquery_dataframes_convert_bq_dataframe_to_pandas] + assert bf_s.to_pandas() is not None + + # [START bigquery_dataframes_to_pandas_dry_run] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + + # Returns a Pandas series with dry run stats + df.to_pandas(dry_run=True) + # [END bigquery_dataframes_to_pandas_dry_run] + assert df.to_pandas(dry_run=True) is not None + + # [START bigquery_dataframes_read_data_from_csv] + import bigframes.pandas as bpd + + # Read a CSV file from GCS + df = bpd.read_csv("gs://cloud-samples-data/bigquery/us-states/us-states.csv") + # [END bigquery_dataframes_read_data_from_csv] + assert df is not 
None + + # [START bigquery_dataframes_read_data_from_bigquery_table] + import bigframes.pandas as bpd + + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + # [END bigquery_dataframes_read_data_from_bigquery_table] + assert df is not None + + # [START bigquery_dataframes_read_from_sql_query] + import bigframes.pandas as bpd + + sql = """ + SELECT species, island, body_mass_g + FROM bigquery-public-data.ml_datasets.penguins + WHERE sex = 'MALE' + """ + + df = bpd.read_gbq(sql) + # [END bigquery_dataframes_read_from_sql_query] + assert df is not None + + table_name = "snippets-session-and-io-test" + + # [START bigquery_dataframes_dataframe_to_bigquery_table] + import bigframes.pandas as bpd + + df = bpd.DataFrame({"my_col": [1, 2, 3]}) + + df.to_gbq(f"{project_id}.{dataset_id}.{table_name}") + # [END bigquery_dataframes_dataframe_to_bigquery_table] diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py index 0ea3a5e162..248322f619 100644 --- a/scripts/run_and_publish_benchmark.py +++ b/scripts/run_and_publish_benchmark.py @@ -100,7 +100,12 @@ def collect_benchmark_result( == len(local_seconds_files) ): raise ValueError( - "Mismatch in the number of report files for bytes, millis, seconds and query char count." 
+ "Mismatch in the number of report files for bytes, millis, seconds and query char count: \n" + f"millis_files: {len(millis_files)}\n" + f"bq_seconds_files: {len(bq_seconds_files)}\n" + f"bytes_files: {len(bytes_files)}\n" + f"query_char_count_files: {len(query_char_count_files)}\n" + f"local_seconds_files: {len(local_seconds_files)}\n" ) has_full_metrics = len(bq_seconds_files) == len(local_seconds_files) diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index ad5849eb2f..c58610d1ff 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -549,3 +549,101 @@ def foo(x: int) -> int: finally: # Clean up the gcp assets created for the managed function. cleanup_function_assets(foo, session.bqclient, ignore_failures=False) + + +def test_managed_function_options(session, dataset_id, scalars_dfs): + try: + + def multiply_five(x: int) -> int: + return x * 5 + + mf_multiply_five = session.udf( + dataset=dataset_id, + name=prefixer.create_prefix(), + max_batching_rows=100, + container_cpu=2, + container_memory="2Gi", + )(multiply_five) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_df = scalars_df["int64_col"] + bf_int64_df_filtered = bf_int64_df.dropna() + bf_result = bf_int64_df_filtered.apply(mf_multiply_five).to_pandas() + + pd_int64_df = scalars_pandas_df["int64_col"] + pd_int64_df_filtered = pd_int64_df.dropna() + pd_result = pd_int64_df_filtered.apply(multiply_five) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + # Make sure the read_gbq_function path works for this function. 
+ multiply_five_ref = session.read_gbq_function( + function_name=mf_multiply_five.bigframes_bigquery_function, # type: ignore + ) + assert mf_multiply_five.bigframes_bigquery_function == multiply_five_ref.bigframes_bigquery_function # type: ignore + + bf_result = bf_int64_df_filtered.apply(multiply_five_ref).to_pandas() + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + # Retrieve the routine and validate its runtime configuration. + routine = session.bqclient.get_routine( + mf_multiply_five.bigframes_bigquery_function + ) + + # TODO(jialuo): Use the newly exposed class properties instead of + # accessing the hidden _properties after resolve of this issue: + # https://github.com/googleapis/python-bigquery/issues/2240. + assert routine._properties["externalRuntimeOptions"]["maxBatchingRows"] == "100" + assert routine._properties["externalRuntimeOptions"]["containerCpu"] == 2 + assert routine._properties["externalRuntimeOptions"]["containerMemory"] == "2Gi" + + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets( + mf_multiply_five, session.bqclient, ignore_failures=False + ) + + +def test_managed_function_options_errors(session, dataset_id): + def foo(x: int) -> int: + return 0 + + with pytest.raises( + google.api_core.exceptions.BadRequest, + # For CPU Value >= 1.0, the value must be one of [1, 2, ...]. + match="Invalid container_cpu function OPTIONS value", + ): + session.udf( + dataset=dataset_id, + name=prefixer.create_prefix(), + max_batching_rows=100, + container_cpu=2.5, + container_memory="2Gi", + )(foo) + + with pytest.raises( + google.api_core.exceptions.BadRequest, + # For less than 1.0 CPU, the value must be no less than 0.33. 
+ match="Invalid container_cpu function OPTIONS value", + ): + session.udf( + dataset=dataset_id, + name=prefixer.create_prefix(), + max_batching_rows=100, + container_cpu=0.10, + container_memory="512Mi", + )(foo) + + with pytest.raises( + google.api_core.exceptions.BadRequest, + # For 2.00 CPU, the memory must be in the range of [256Mi, 8Gi]. + match="Invalid container_memory function OPTIONS value", + ): + session.udf( + dataset=dataset_id, + name=prefixer.create_prefix(), + max_batching_rows=100, + container_cpu=2, + container_memory="64Mi", + )(foo) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 5e60f3ed9f..f3e97aeb85 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1707,7 +1707,7 @@ def analyze(row): { "dtype": row.dtype, "count": row.count(), - "min": row.max(), + "min": row.min(), "max": row.max(), "mean": row.mean(), "std": row.std(), diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py index 8530a6fefa..c2fc9ad706 100644 --- a/tests/system/small/engines/test_aggregation.py +++ b/tests/system/small/engines/test_aggregation.py @@ -47,7 +47,7 @@ def apply_agg_to_all_valid( return new_arr -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_aggregate_size( scalars_array_value: array_value.ArrayValue, engine, @@ -84,7 +84,7 @@ def test_engines_unary_aggregates( assert_equivalence_execution(node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( "grouping_cols", [ diff --git a/tests/system/small/engines/test_join.py b/tests/system/small/engines/test_join.py index 
402a41134b..91c199a437 100644 --- a/tests/system/small/engines/test_join.py +++ b/tests/system/small/engines/test_join.py @@ -88,3 +88,22 @@ def test_engines_cross_join( result, _ = scalars_array_value.relational_join(scalars_array_value, type="cross") assert_equivalence_execution(result.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize( + ("left_key", "right_key"), + [ + ("int64_col", "float64_col"), + ("float64_col", "int64_col"), + ("int64_too", "int64_col"), + ], +) +def test_engines_isin( + scalars_array_value: array_value.ArrayValue, engine, left_key, right_key +): + result, _ = scalars_array_value.isin( + scalars_array_value, lcol=left_key, rcol=right_key + ) + + assert_equivalence_execution(result.node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/engines/test_numeric_ops.py b/tests/system/small/engines/test_numeric_ops.py new file mode 100644 index 0000000000..b53da977f5 --- /dev/null +++ b/tests/system/small/engines/test_numeric_ops.py @@ -0,0 +1,170 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +import itertools + +import pytest + +from bigframes.core import array_value, expression +import bigframes.operations as ops +from bigframes.session import polars_executor +from bigframes.testing.engine_utils import assert_equivalence_execution + +pytest.importorskip("polars") + +# Polars used as reference as its fast and local. Generally though, prefer gbq engine where they disagree. +REFERENCE_ENGINE = polars_executor.PolarsExecutor() + + +def apply_op_pairwise( + array: array_value.ArrayValue, op: ops.BinaryOp, excluded_cols=[] +) -> array_value.ArrayValue: + exprs = [] + labels = [] + for l_arg, r_arg in itertools.product(array.column_ids, array.column_ids): + if (l_arg in excluded_cols) or (r_arg in excluded_cols): + continue + try: + _ = op.output_type( + array.get_column_type(l_arg), array.get_column_type(r_arg) + ) + expr = op.as_expr(l_arg, r_arg) + exprs.append(expr) + labels.append(f"{l_arg}_{r_arg}") + except TypeError: + continue + assert len(exprs) > 0 + new_arr, ids = array.compute_values(exprs) + new_arr = new_arr.rename_columns( + {new_col: label for new_col, label in zip(ids, labels)} + ) + return new_arr + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_add( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op_pairwise(scalars_array_value, ops.add_op) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_sub( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op_pairwise(scalars_array_value, ops.sub_op) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_mul( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op_pairwise(scalars_array_value, ops.mul_op) + 
assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_div(scalars_array_value: array_value.ArrayValue, engine): + # TODO: Duration div is sensitive to zeroes + # TODO: Numeric col is sensitive to scale shifts + arr = apply_op_pairwise( + scalars_array_value, ops.div_op, excluded_cols=["duration_col", "numeric_col"] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_div_durations( + scalars_array_value: array_value.ArrayValue, engine +): + arr, _ = scalars_array_value.compute_values( + [ + ops.div_op.as_expr( + expression.deref("duration_col"), + expression.const(datetime.timedelta(seconds=3)), + ), + ops.div_op.as_expr( + expression.deref("duration_col"), + expression.const(datetime.timedelta(seconds=-3)), + ), + ops.div_op.as_expr(expression.deref("duration_col"), expression.const(4)), + ops.div_op.as_expr(expression.deref("duration_col"), expression.const(-4)), + ops.div_op.as_expr( + expression.deref("duration_col"), expression.const(55.55) + ), + ops.div_op.as_expr( + expression.deref("duration_col"), expression.const(-55.55) + ), + ] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_floordiv( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op_pairwise( + scalars_array_value, + ops.floordiv_op, + excluded_cols=["duration_col", "numeric_col"], + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_floordiv_durations( + scalars_array_value: array_value.ArrayValue, engine +): + arr, _ = scalars_array_value.compute_values( + [ + ops.floordiv_op.as_expr( + 
expression.deref("duration_col"), + expression.const(datetime.timedelta(seconds=3)), + ), + ops.floordiv_op.as_expr( + expression.deref("duration_col"), + expression.const(datetime.timedelta(seconds=-3)), + ), + ops.floordiv_op.as_expr( + expression.deref("duration_col"), expression.const(4) + ), + ops.floordiv_op.as_expr( + expression.deref("duration_col"), expression.const(-4) + ), + ops.floordiv_op.as_expr( + expression.deref("duration_col"), expression.const(55.55) + ), + ops.floordiv_op.as_expr( + expression.deref("duration_col"), expression.const(-55.55) + ), + ] + ) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_project_mod( + scalars_array_value: array_value.ArrayValue, + engine, +): + arr = apply_op_pairwise(scalars_array_value, ops.mod_op) + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index b80202bdbe..fd5dbef2e3 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -818,3 +818,10 @@ def test_mean_squared_error(session: bigframes.Session): df = session.read_pandas(pd_df) mse = metrics.mean_squared_error(df["y_true"], df["y_pred"]) assert mse == 0.375 + + +def test_mean_absolute_error(session: bigframes.Session): + pd_df = pd.DataFrame({"y_true": [3, -0.5, 2, 7], "y_pred": [2.5, 0.0, 2, 8]}) + df = session.read_pandas(pd_df) + mse = metrics.mean_absolute_error(df["y_true"], df["y_pred"]) + assert mse == 0.5 diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index 8ce0cb9beb..1462a68b49 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -86,12 +86,28 @@ def test_dt_dayofweek(scalars_dfs, col_name): pytest.importorskip("pandas", minversion="2.0.0") scalars_df, 
scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] + bf_result = bf_series.dt.dayofweek.to_pandas() pd_result = scalars_pandas_df[col_name].dt.dayofweek assert_series_equal(pd_result, bf_result, check_dtype=False) +@pytest.mark.parametrize( + ("col_name",), + DATE_COLUMNS, +) +def test_dt_day_of_week(scalars_dfs, col_name): + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df[col_name] + + bf_result = bf_series.dt.day_of_week.to_pandas() + pd_result = scalars_pandas_df[col_name].dt.day_of_week + + assert_series_equal(pd_result, bf_result, check_dtype=False) + + @pytest.mark.parametrize( ("col_name",), DATE_COLUMNS, @@ -100,12 +116,28 @@ def test_dt_dayofyear(scalars_dfs, col_name): pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs bf_series: bigframes.series.Series = scalars_df[col_name] + bf_result = bf_series.dt.dayofyear.to_pandas() pd_result = scalars_pandas_df[col_name].dt.dayofyear assert_series_equal(pd_result, bf_result, check_dtype=False) +@pytest.mark.parametrize( + ("col_name",), + DATE_COLUMNS, +) +def test_dt_day_of_year(scalars_dfs, col_name): + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df[col_name] + + bf_result = bf_series.dt.day_of_year.to_pandas() + pd_result = scalars_pandas_df[col_name].dt.day_of_year + + assert_series_equal(pd_result, bf_result, check_dtype=False) + + @pytest.mark.parametrize( ("col_name",), DATETIME_COL_NAMES, diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 209bc87f9b..a720614892 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -324,13 +324,10 @@ def test_isalpha(weird_strings, weird_strings_pd): ) -@pytest.mark.skipif( - 
"dev" in pa.__version__, - # b/333484335 pyarrow is inconsistent on the behavior - reason="pyarrow dev version is inconsistent on isdigit behavior.", -) def test_isdigit(weird_strings, weird_strings_pd): - pd_result = weird_strings_pd.str.isdigit() + # check the behavior against normal pandas str, since pyarrow has a bug with superscripts/fractions b/333484335 + # astype object instead of str to support pd.NA + pd_result = weird_strings_pd.astype(object).str.isdigit() bf_result = weird_strings.str.isdigit().to_pandas() pd.testing.assert_series_equal( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index caf39bd9e9..bc773d05b2 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -906,15 +906,53 @@ def test_df_to_pandas_batches(scalars_dfs): assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) -def test_assign_new_column(scalars_dfs): +@pytest.mark.parametrize( + ("literal", "expected_dtype"), + ( + pytest.param( + 2, + dtypes.INT_DTYPE, + id="INT64", + ), + # ==================================================================== + # NULL values + # + # These are regression tests for b/428999884. It needs to be possible to + # set a column to NULL with a desired type (not just the pandas default + # of float64). 
+ # ==================================================================== + pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"), + pytest.param( + pa.scalar(None, type=pa.int64()), + dtypes.INT_DTYPE, + id="NULL-pyarrow-TIMESTAMP", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us", tz="UTC")), + dtypes.TIMESTAMP_DTYPE, + id="NULL-pyarrow-TIMESTAMP", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us")), + dtypes.DATETIME_DTYPE, + id="NULL-pyarrow-DATETIME", + ), + ), +) +def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"new_col": 2} - df = scalars_df.assign(**kwargs) + df = scalars_df.assign(new_col=literal) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") + new_col_pd = literal + if isinstance(literal, pa.Scalar): + # PyArrow integer scalars aren't yet supported in pandas Int64Dtype. + new_col_pd = literal.as_py() + + # Pandas might not pick the same dtype as BigFrames, but it should at least + # be castable to it. 
+ pd_result = scalars_pandas_df.assign(new_col=new_col_pd) + pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) assert_pandas_df_equal(bf_result, pd_result) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 4bb1c6589a..a04da64af0 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -606,7 +606,7 @@ def test_read_gbq_wildcard( "query": { "useQueryCache": True, "maximumBytesBilled": "1000000000", - "timeoutMs": 10000, + "timeoutMs": 120_000, } }, pytest.param( diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 686499aa75..3c80f00a37 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -58,6 +58,18 @@ def test_setter_raises_if_session_started(attribute, original_value, new_value): assert getattr(options, attribute) is not new_value +def test_location_set_us_twice(): + """This test ensures the fix for b/423220936 is working as expected.""" + options = bigquery_options.BigQueryOptions() + setattr(options, "location", "us") + assert getattr(options, "location") == "US" + + options._session_started = True + + setattr(options, "location", "us") + assert getattr(options, "location") == "US" + + @pytest.mark.parametrize( [ "attribute", diff --git a/tests/unit/core/compile/sqlglot/aggregations/__init__.py b/tests/unit/core/compile/sqlglot/aggregations/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size/out.sql new file mode 100644 index 0000000000..78104eb578 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_size/out.sql @@ -0,0 +1,12 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + COUNT(1) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col_agg` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql new file mode 100644 index 0000000000..e748f71278 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_sum/out.sql @@ -0,0 +1,12 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + COALESCE(SUM(`bfcol_0`), 0) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col_agg` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py b/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py new file mode 100644 index 0000000000..e3688f19df --- /dev/null +++ 
b/tests/unit/core/compile/sqlglot/aggregations/test_op_registration.py @@ -0,0 +1,45 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sqlglot import expressions as sge + +from bigframes.core.compile.sqlglot.aggregations import op_registration +from bigframes.operations import aggregations as agg_ops + + +def test_register_then_get(): + reg = op_registration.OpRegistration() + input = sge.to_identifier("A") + op = agg_ops.SizeOp() + + @reg.register(agg_ops.SizeOp) + def test_func(op: agg_ops.SizeOp, input: sge.Expression) -> sge.Expression: + return input + + assert reg[agg_ops.SizeOp()](op, input) == test_func(op, input) + assert reg[agg_ops.SizeOp.name](op, input) == test_func(op, input) + + +def test_register_function_first_argument_is_not_agg_op_raise_error(): + reg = op_registration.OpRegistration() + + @reg.register(agg_ops.SizeOp) + def test_func(input: sge.Expression) -> sge.Expression: + return input + + with pytest.raises( + ValueError, match=r".*first parameter must be a window operator.*" + ): + test_func(sge.to_identifier("A")) diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py new file mode 100644 index 0000000000..96cdceb3c6 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -0,0 +1,51 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 
2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.core import array_value, expression, identifiers, nodes +from bigframes.operations import aggregations as agg_ops +import bigframes.pandas as bpd + +pytest.importorskip("pytest_snapshot") + + +def _apply_unary_op(obj: bpd.DataFrame, op: agg_ops.UnaryWindowOp, arg: str) -> str: + agg_node = nodes.AggregateNode( + obj._block.expr.node, + aggregations=( + ( + expression.UnaryAggregation(op, expression.deref(arg)), + identifiers.ColumnId(arg + "_agg"), + ), + ), + ) + result = array_value.ArrayValue(agg_node) + + sql = result.session._executor.to_sql(result, enable_cache=False) + return sql + + +def test_size(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, agg_ops.SizeUnaryOp(), "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_sum(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + sql = _apply_unary_op(bf_df, agg_ops.SumOp(), "int64_col") + + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql index 1496f89f28..e8dc2edb80 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric/out.sql @@ -1,16 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `int64_col` AS `bfcol_0`, - `rowindex` AS `bfcol_1` + `int64_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_1` AS `bfcol_4`, - `bfcol_0` + `bfcol_0` AS `bfcol_5` + `bfcol_0` + `bfcol_0` AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_4` AS `rowindex`, - `bfcol_5` AS `int64_col` + `bfcol_1` AS `int64_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql index 9c4b01a6df..7c4cc2c770 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_numeric_w_scalar/out.sql @@ -1,16 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `int64_col` AS `bfcol_0`, - `rowindex` AS `bfcol_1` + `int64_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_1` AS `bfcol_4`, - `bfcol_0` + 1 AS `bfcol_5` + `bfcol_0` + 1 AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_4` AS `rowindex`, - `bfcol_5` AS `int64_col` + `bfcol_1` AS `int64_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql index 7a8ab83df1..de5129a6a3 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_add_string/out.sql @@ -1,16 +1,13 @@ WITH `bfcte_0` AS ( SELECT - 
`rowindex` AS `bfcol_0`, - `string_col` AS `bfcol_1` + `string_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_0` AS `bfcol_4`, - CONCAT(`bfcol_1`, 'a') AS `bfcol_5` + CONCAT(`bfcol_0`, 'a') AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_4` AS `rowindex`, - `bfcol_5` AS `string_col` + `bfcol_1` AS `string_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_json_set/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_json_set/out.sql index f501dd3b86..b226066b16 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_json_set/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_json_set/out.sql @@ -1,20 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `json_col` AS `bfcol_1` + `json_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`json_types` ), `bfcte_1` AS ( SELECT *, - JSON_SET(`bfcol_1`, '$.a', 100) AS `bfcol_4` + JSON_SET(`bfcol_0`, '$.a', 100) AS `bfcol_1` FROM `bfcte_0` -), `bfcte_2` AS ( - SELECT - *, - JSON_SET(`bfcol_4`, '$.b', 'hi') AS `bfcol_7` - FROM `bfcte_1` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_7` AS `json_col` -FROM `bfcte_2` \ No newline at end of file + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_abs/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_abs/out.sql new file mode 100644 index 0000000000..6f315f8113 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_abs/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ABS(`bfcol_0`) AS `bfcol_1` + 
FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arccos/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arccos/out.sql new file mode 100644 index 0000000000..df695b7fbc --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arccos/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN ABS(`bfcol_0`) > 1 THEN CAST('NaN' AS FLOAT64) ELSE ACOS(`bfcol_0`) END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arccosh/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arccosh/out.sql new file mode 100644 index 0000000000..5272e4a6a8 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arccosh/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN `bfcol_0` < 1 THEN CAST('NaN' AS FLOAT64) ELSE ACOSH(`bfcol_0`) END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arcsin/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arcsin/out.sql new file mode 100644 index 0000000000..3afc7c64b8 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arcsin/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + 
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN ABS(`bfcol_0`) > 1 THEN CAST('NaN' AS FLOAT64) ELSE ASIN(`bfcol_0`) END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arcsinh/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arcsinh/out.sql new file mode 100644 index 0000000000..6313e80e5f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arcsinh/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ASINH(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arctan/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arctan/out.sql new file mode 100644 index 0000000000..ec6a22e653 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arctan/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ATAN(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arctanh/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arctanh/out.sql new file mode 100644 index 0000000000..39b5f565fe --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_arctanh/out.sql @@ -0,0 +1,13 @@ 
+WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE WHEN ABS(`bfcol_0`) > 1 THEN CAST('NaN' AS FLOAT64) ELSE ATANH(`bfcol_0`) END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql index 33a8bded13..4398084227 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_index/out.sql @@ -1,15 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `string_list_col` AS `bfcol_1` + `string_list_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` ), `bfcte_1` AS ( SELECT *, - `bfcol_1`[SAFE_OFFSET(1)] AS `bfcol_4` + `bfcol_0`[SAFE_OFFSET(1)] AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_4` AS `string_list_col` + `bfcol_1` AS `string_list_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql index 34d2225931..1ffc3ee8f9 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_only_start/out.sql @@ -1,7 +1,6 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `string_list_col` AS `bfcol_1` + `string_list_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` ), `bfcte_1` AS ( 
SELECT @@ -9,13 +8,12 @@ WITH `bfcte_0` AS ( ARRAY( SELECT el - FROM UNNEST(`bfcol_1`) AS el WITH OFFSET AS slice_idx + FROM UNNEST(`bfcol_0`) AS el WITH OFFSET AS slice_idx WHERE slice_idx >= 1 - ) AS `bfcol_4` + ) AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_4` AS `string_list_col` + `bfcol_1` AS `string_list_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql index d46803ce7c..878b60e5e2 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_slice_with_start_and_stop/out.sql @@ -1,7 +1,6 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `string_list_col` AS `bfcol_1` + `string_list_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` ), `bfcte_1` AS ( SELECT @@ -9,13 +8,12 @@ WITH `bfcte_0` AS ( ARRAY( SELECT el - FROM UNNEST(`bfcol_1`) AS el WITH OFFSET AS slice_idx + FROM UNNEST(`bfcol_0`) AS el WITH OFFSET AS slice_idx WHERE slice_idx >= 1 AND slice_idx < 5 - ) AS `bfcol_4` + ) AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_4` AS `string_list_col` + `bfcol_1` AS `string_list_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql index e0db21f972..4dbd602bea 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_array_to_string/out.sql @@ -1,15 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `string_list_col` AS `bfcol_1` + `string_list_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`repeated_types` ), `bfcte_1` AS ( SELECT *, - ARRAY_TO_STRING(`bfcol_1`, '.') AS `bfcol_4` + ARRAY_TO_STRING(`bfcol_0`, '.') AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_4` AS `string_list_col` + `bfcol_1` AS `string_list_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_capitalize/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_capitalize/out.sql new file mode 100644 index 0000000000..7af1708347 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_capitalize/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + INITCAP(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_ceil/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_ceil/out.sql new file mode 100644 index 0000000000..0959f3a0ad --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_ceil/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CEIL(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_cos/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_cos/out.sql new file mode 100644 index 0000000000..126d2a63f2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_cos/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + COS(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_cosh/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_cosh/out.sql new file mode 100644 index 0000000000..f44dfaac41 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_cosh/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE + WHEN ABS(`bfcol_0`) > 709.78 + THEN CAST('Infinity' AS FLOAT64) + ELSE COSH(`bfcol_0`) + END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_date/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_date/out.sql new file mode 100644 index 0000000000..615a4a92bb --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_date/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + DATE(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM 
`bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_day/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_day/out.sql new file mode 100644 index 0000000000..460823fa20 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_day/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + EXTRACT(DAY FROM `bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_dayofweek/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_dayofweek/out.sql new file mode 100644 index 0000000000..e6c17587d0 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_dayofweek/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + EXTRACT(DAYOFWEEK FROM `bfcol_0`) - 1 AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_dayofyear/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_dayofyear/out.sql new file mode 100644 index 0000000000..4b60bcc4ca --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_dayofyear/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + EXTRACT(DAYOFYEAR FROM `bfcol_0`) AS `bfcol_1` + FROM 
`bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_exp/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_exp/out.sql new file mode 100644 index 0000000000..6afa3f85a5 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_exp/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE + WHEN `bfcol_0` > 709.78 + THEN CAST('Infinity' AS FLOAT64) + ELSE EXP(`bfcol_0`) + END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_expm1/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_expm1/out.sql new file mode 100644 index 0000000000..f3768deb4a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_expm1/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE + WHEN `bfcol_0` > 709.78 + THEN CAST('Infinity' AS FLOAT64) + ELSE EXP(`bfcol_0`) + END - 1 AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor/out.sql new file mode 100644 index 0000000000..56be1019e5 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_floor/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS 
`bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + FLOOR(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_hash/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_hash/out.sql new file mode 100644 index 0000000000..14d6df6d22 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_hash/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + FARM_FINGERPRINT(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_hour/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_hour/out.sql new file mode 100644 index 0000000000..8cc9b9081f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_hour/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + EXTRACT(HOUR FROM `bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_invert/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_invert/out.sql new file mode 100644 index 0000000000..28f2aa6e06 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_invert/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS 
`bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + ~`bfcol_0` AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isalnum/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isalnum/out.sql new file mode 100644 index 0000000000..02e0094742 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isalnum/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + REGEXP_CONTAINS(`bfcol_0`, '^(\\p{N}|\\p{L})+$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isalpha/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isalpha/out.sql new file mode 100644 index 0000000000..2615d0452f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isalpha/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + REGEXP_CONTAINS(`bfcol_0`, '^\\p{L}+$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isdecimal/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isdecimal/out.sql new file mode 100644 index 0000000000..bc1fce3dbc --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isdecimal/out.sql @@ -0,0 +1,13 @@ 
+WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + REGEXP_CONTAINS(`bfcol_0`, '^\\d+$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isdigit/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isdigit/out.sql new file mode 100644 index 0000000000..1cb3a883ab --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isdigit/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + REGEXP_CONTAINS(`bfcol_0`, '^\\p{Nd}+$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_islower/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_islower/out.sql new file mode 100644 index 0000000000..a621b71a3b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_islower/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + LOWER(`bfcol_0`) = `bfcol_0` AND UPPER(`bfcol_0`) <> `bfcol_0` AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isnull/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isnull/out.sql new file mode 100644 index 0000000000..55a2ebb970 --- /dev/null +++ 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isnull/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` IS NULL AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isnumeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isnumeric/out.sql new file mode 100644 index 0000000000..6566c1dd4c --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isnumeric/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + REGEXP_CONTAINS(`bfcol_0`, '^\\pN+$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_iso_day/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_iso_day/out.sql new file mode 100644 index 0000000000..d389172fda --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_iso_day/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + EXTRACT(DAYOFWEEK FROM `bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_iso_week/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_iso_week/out.sql new file mode 100644 
index 0000000000..f22e963bc3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_iso_week/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + EXTRACT(ISOWEEK FROM `bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `timestamp_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isspace/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isspace/out.sql new file mode 100644 index 0000000000..aff12102be --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isspace/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + REGEXP_CONTAINS(`bfcol_0`, '^\\s+$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isupper/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isupper/out.sql new file mode 100644 index 0000000000..03fe005910 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_isupper/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + UPPER(`bfcol_0`) = `bfcol_0` AND LOWER(`bfcol_0`) <> `bfcol_0` AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract/out.sql index 2ffb0174a8..3d23bd1e3e 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract/out.sql @@ -1,15 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `json_col` AS `bfcol_1` + `json_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`json_types` ), `bfcte_1` AS ( SELECT *, - JSON_EXTRACT(`bfcol_1`, '$') AS `bfcol_4` + JSON_EXTRACT(`bfcol_0`, '$') AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_4` AS `json_col` + `bfcol_1` AS `json_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract_array/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract_array/out.sql new file mode 100644 index 0000000000..1ddb3999b3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract_array/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + JSON_EXTRACT_ARRAY(`bfcol_0`, '$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract_string_array/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract_string_array/out.sql new file mode 100644 index 0000000000..cbc3df74c0 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_extract_string_array/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM 
`bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + JSON_EXTRACT_STRING_ARRAY(`bfcol_0`, '$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_query/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_query/out.sql new file mode 100644 index 0000000000..b5d98b80d2 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_query/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + JSON_QUERY(`bfcol_0`, '$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_query_array/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_query_array/out.sql new file mode 100644 index 0000000000..1b7a5908eb --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_query_array/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + JSON_QUERY_ARRAY(`bfcol_0`, '$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_value/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_value/out.sql new file mode 100644 index 0000000000..3a84a1a92a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_json_value/out.sql @@ -0,0 
+1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + JSON_VALUE(`bfcol_0`, '$') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_notnull/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_notnull/out.sql new file mode 100644 index 0000000000..c1961f9d62 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_notnull/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + NOT `bfcol_0` IS NULL AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_parse_json/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_parse_json/out.sql index d965ea8f1b..cdb091ae39 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_parse_json/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_parse_json/out.sql @@ -1,15 +1,13 @@ WITH `bfcte_0` AS ( SELECT - `rowindex` AS `bfcol_0`, - `string_col` AS `bfcol_1` + `string_col` AS `bfcol_0` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - JSON_VALUE(`bfcol_1`, '$') AS `bfcol_4` + PARSE_JSON(`bfcol_0`) AS `bfcol_1` FROM `bfcte_0` ) SELECT - `bfcol_0` AS `rowindex`, - `bfcol_4` AS `string_col` + `bfcol_1` AS `string_col` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_sin/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_sin/out.sql new file mode 100644 index 0000000000..62a5cff0b5 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_sin/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + SIN(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_sinh/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_sinh/out.sql new file mode 100644 index 0000000000..711dba94a9 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_sinh/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE + WHEN ABS(`bfcol_0`) > 709.78 + THEN SIGN(`bfcol_0`) * CAST('Infinity' AS FLOAT64) + ELSE SINH(`bfcol_0`) + END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_tan/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_tan/out.sql new file mode 100644 index 0000000000..5fac274b6b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_tan/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + TAN(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_tanh/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_tanh/out.sql new file mode 100644 index 0000000000..5d1a5a5320 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_tanh/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `float64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + TANH(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `float64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_json_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_json_string/out.sql new file mode 100644 index 0000000000..2786973933 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_unary_compiler/test_to_json_string/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `json_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`json_types` +), `bfcte_1` AS ( + SELECT + *, + TO_JSON_STRING(`bfcol_0`) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `json_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py index 9daff51c9f..a78a41fdbf 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py @@ -12,38 +12,60 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import typing + import pytest -import bigframes.bigquery as bbq +from bigframes import operations as ops +import bigframes.core.expression as ex import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") +def _apply_binary_op( + obj: bpd.DataFrame, + op: ops.BinaryOp, + l_arg: str, + r_arg: typing.Union[str, ex.Expression], +) -> str: + array_value = obj._block.expr + op_expr = op.as_expr(l_arg, r_arg) + result, col_ids = array_value.compute_values([op_expr]) + + # Rename columns for deterministic golden SQL results. + assert len(col_ids) == 1 + result = result.rename_columns({col_ids[0]: l_arg}).select_columns([l_arg]) + + sql = result.session._executor.to_sql(result, enable_cache=False) + return sql + + def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col"]] + sql = _apply_binary_op(bf_df, ops.add_op, "int64_col", "int64_col") - bf_df["int64_col"] = bf_df["int64_col"] + bf_df["int64_col"] - - snapshot.assert_match(bf_df.sql, "out.sql") + snapshot.assert_match(sql, "out.sql") def test_add_numeric_w_scalar(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col"]] + sql = _apply_binary_op(bf_df, ops.add_op, "int64_col", ex.const(1)) - bf_df["int64_col"] = bf_df["int64_col"] + 1 - - snapshot.assert_match(bf_df.sql, "out.sql") + snapshot.assert_match(sql, "out.sql") def test_add_string(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["string_col"]] + sql = _apply_binary_op(bf_df, ops.add_op, "string_col", ex.const("a")) - bf_df["string_col"] = bf_df["string_col"] + "a" - - snapshot.assert_match(bf_df.sql, "out.sql") + snapshot.assert_match(sql, "out.sql") def test_json_set(json_types_df: bpd.DataFrame, snapshot): - result = bbq.json_set(json_types_df["json_col"], [("$.a", 100), ("$.b", "hi")]) - snapshot.assert_match(result.to_frame().sql, "out.sql") + bf_df = json_types_df[["json_col"]] + sql = _apply_binary_op( + bf_df, ops.JSONSet(json_path="$.a"), "json_col", 
ex.const(100) + ) + + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py index 6d9101aff0..9f04450d38 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_unary_compiler.py @@ -14,74 +14,364 @@ import pytest -import bigframes.bigquery as bbq +from bigframes import operations as ops +from bigframes.operations._op_converters import convert_index, convert_slice import bigframes.pandas as bpd pytest.importorskip("pytest_snapshot") +def _apply_unary_op(obj: bpd.DataFrame, op: ops.UnaryOp, arg: str) -> str: + array_value = obj._block.expr + op_expr = op.as_expr(arg) + result, col_ids = array_value.compute_values([op_expr]) + + # Rename columns for deterministic golden SQL results. + assert len(col_ids) == 1 + result = result.rename_columns({col_ids[0]: arg}).select_columns([arg]) + + sql = result.session._executor.to_sql(result, enable_cache=False) + return sql + + +def test_arccosh(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.arccosh_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_arccos(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.arccos_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_arcsin(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.arcsin_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_arcsinh(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.arcsinh_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_arctan(scalar_types_df: bpd.DataFrame, snapshot): + 
bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.arctan_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_arctanh(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.arctanh_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_abs(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.abs_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_capitalize(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.capitalize_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_ceil(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.ceil_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_date(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.date_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_day(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.day_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_dayofweek(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.dayofweek_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_dayofyear(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.dayofyear_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_exp(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.exp_op, 
"float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_expm1(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.expm1_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_floor(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.floor_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + def test_array_to_string(repeated_types_df: bpd.DataFrame, snapshot): - result = bbq.array_to_string(repeated_types_df["string_list_col"], ".") + bf_df = repeated_types_df[["string_list_col"]] + sql = _apply_unary_op(bf_df, ops.ArrayToStringOp(delimiter="."), "string_list_col") - snapshot.assert_match(result.to_frame().sql, "out.sql") + snapshot.assert_match(sql, "out.sql") def test_array_index(repeated_types_df: bpd.DataFrame, snapshot): - result = repeated_types_df["string_list_col"].list[1] + bf_df = repeated_types_df[["string_list_col"]] + sql = _apply_unary_op(bf_df, convert_index(1), "string_list_col") - snapshot.assert_match(result.to_frame().sql, "out.sql") + snapshot.assert_match(sql, "out.sql") def test_array_slice_with_only_start(repeated_types_df: bpd.DataFrame, snapshot): - result = repeated_types_df["string_list_col"].list[1:] + bf_df = repeated_types_df[["string_list_col"]] + sql = _apply_unary_op(bf_df, convert_slice(slice(1, None)), "string_list_col") - snapshot.assert_match(result.to_frame().sql, "out.sql") + snapshot.assert_match(sql, "out.sql") def test_array_slice_with_start_and_stop(repeated_types_df: bpd.DataFrame, snapshot): - result = repeated_types_df["string_list_col"].list[1:5] + bf_df = repeated_types_df[["string_list_col"]] + sql = _apply_unary_op(bf_df, convert_slice(slice(1, 5)), "string_list_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_cos(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = 
_apply_unary_op(bf_df, ops.cos_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_cosh(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.cosh_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_hash(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.hash_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + - snapshot.assert_match(result.to_frame().sql, "out.sql") +def test_hour(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.hour_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_invert(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col"]] + sql = _apply_unary_op(bf_df, ops.invert_op, "int64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isalnum(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isalnum_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isalpha(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isalpha_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isdecimal(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isdecimal_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isdigit(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isdigit_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_islower(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.islower_op, "string_col") + + 
snapshot.assert_match(sql, "out.sql") + + +def test_isnumeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isnumeric_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isspace(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isspace_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isupper(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.isupper_op, "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_iso_day(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.iso_day_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_iso_week(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col"]] + sql = _apply_unary_op(bf_df, ops.iso_week_op, "timestamp_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_isnull(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.isnull_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_notnull(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.notnull_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_sin(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.sin_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_sinh(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.sinh_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_tan(scalar_types_df: bpd.DataFrame, snapshot): 
+ bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.tan_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_tanh(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["float64_col"]] + sql = _apply_unary_op(bf_df, ops.tanh_op, "float64_col") + + snapshot.assert_match(sql, "out.sql") -# JSON Ops def test_json_extract(json_types_df: bpd.DataFrame, snapshot): - result = bbq.json_extract(json_types_df["json_col"], "$") - expected_sql = "JSON_EXTRACT(`bfcol_1`, '$') AS `bfcol_4`" - assert expected_sql in result.to_frame().sql - snapshot.assert_match(result.to_frame().sql, "out.sql") + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.JSONExtract(json_path="$"), "json_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_json_extract_array(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.JSONExtractArray(json_path="$"), "json_col") + snapshot.assert_match(sql, "out.sql") -def test_json_extract_array(json_types_df: bpd.DataFrame): - result = bbq.json_extract_array(json_types_df["json_col"], "$") - expected_sql = "JSON_EXTRACT_ARRAY(`bfcol_1`, '$') AS `bfcol_4`" - assert expected_sql in result.to_frame().sql +def test_json_extract_string_array(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.JSONExtractStringArray(json_path="$"), "json_col") -def test_json_extract_string_array(json_types_df: bpd.DataFrame): - result = bbq.json_extract_string_array(json_types_df["json_col"], "$") - expected_sql = "JSON_EXTRACT_STRING_ARRAY(`bfcol_1`, '$') AS `bfcol_4`" - assert expected_sql in result.to_frame().sql + snapshot.assert_match(sql, "out.sql") -def test_json_query(json_types_df: bpd.DataFrame): - result = bbq.json_query(json_types_df["json_col"], "$") - expected_sql = "JSON_QUERY(`bfcol_1`, '$') AS `bfcol_4`" - assert expected_sql in result.to_frame().sql +def 
test_json_query(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.JSONQuery(json_path="$"), "json_col") + snapshot.assert_match(sql, "out.sql") -def test_json_query_array(json_types_df: bpd.DataFrame): - result = bbq.json_query_array(json_types_df["json_col"], "$") - expected_sql = "JSON_QUERY_ARRAY(`bfcol_1`, '$') AS `bfcol_4`" - assert expected_sql in result.to_frame().sql +def test_json_query_array(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.JSONQueryArray(json_path="$"), "json_col") -def test_json_value(json_types_df: bpd.DataFrame): - result = bbq.json_value(json_types_df["json_col"], "$") - expected_sql = "JSON_VALUE(`bfcol_1`, '$') AS `bfcol_4`" - assert expected_sql in result.to_frame().sql + snapshot.assert_match(sql, "out.sql") + + +def test_json_value(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.JSONValue(json_path="$"), "json_col") + + snapshot.assert_match(sql, "out.sql") def test_parse_json(scalar_types_df: bpd.DataFrame, snapshot): - result = bbq.json_value(scalar_types_df["string_col"], "$") - snapshot.assert_match(result.to_frame().sql, "out.sql") + bf_df = scalar_types_df[["string_col"]] + sql = _apply_unary_op(bf_df, ops.ParseJSON(), "string_col") + + snapshot.assert_match(sql, "out.sql") + + +def test_to_json_string(json_types_df: bpd.DataFrame, snapshot): + bf_df = json_types_df[["json_col"]] + sql = _apply_unary_op(bf_df, ops.ToJSONString(), "json_col") + + snapshot.assert_match(sql, "out.sql") diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 77392bea2f..cd23614bbf 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -272,3 +272,19 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): ValueError, ): bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": 
"myval"}) + + +@pytest.mark.parametrize( + ["scalar", "expected_dtype"], + [ + (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE), + (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), + (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE), + # Support NULL scalars. + (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE), + (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), + (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE), + ], +) +def test_infer_literal_type_arrow_scalar(scalar, expected_dtype): + assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index cfee5ea98d..c451d74d0f 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -109,14 +109,15 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): labels = io_bq.create_job_configs_labels( job_configs_labels=cur_labels, api_methods=api_methods ) - expected_dict = { + expected_labels = { "source": "bigquery-dataframes-temp", "bigframes-api": "dataframe-columns", "recent-bigframes-api-0": "dataframe-max", "recent-bigframes-api-1": "dataframe-head", "recent-bigframes-api-2": "dataframe-__init__", } - assert labels == expected_dict + # Asserts that all items in expected_labels are present in labels + assert labels.items() >= expected_labels.items() def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): diff --git a/third_party/bigframes_vendored/ibis/common/temporal.py b/third_party/bigframes_vendored/ibis/common/temporal.py index 1b0e4fa985..8d84caf5a1 100644 --- a/third_party/bigframes_vendored/ibis/common/temporal.py +++ b/third_party/bigframes_vendored/ibis/common/temporal.py @@ -260,3 +260,8 @@ def _from_numpy_datetime64(value): raise TypeError("Unable to convert np.datetime64 without pandas") else: return 
pd.Timestamp(value).to_pydatetime() + + +@normalize_datetime.register("pyarrow.Scalar") +def _from_pyarrow_scalar(value): + return value.as_py() diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py index e390cea02c..85be0ac749 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py @@ -27,6 +27,7 @@ import bigframes_vendored.ibis.expr.datatypes as dt from bigframes_vendored.ibis.expr.datatypes.cast import highest_precedence from public import public +import pyarrow as pa import toolz @@ -71,6 +72,14 @@ def infer_list(values: Sequence[Any]) -> dt.Array: return dt.Array(highest_precedence(map(infer, values))) +@infer.register("pyarrow.Scalar") +def infer_pyarrow_scalar(value: "pa.Scalar"): + """Infer the type of a PyArrow Scalar value.""" + import bigframes_vendored.ibis.formats.pyarrow + + return bigframes_vendored.ibis.formats.pyarrow.PyArrowType.to_ibis(value.type) + + @infer.register(datetime.time) def infer_time(value: datetime.time) -> dt.Time: return dt.time @@ -253,6 +262,9 @@ def infer_shapely_multipolygon(value) -> dt.MultiPolygon: def normalize(typ, value): """Ensure that the Python type underlying a literal resolves to a single type.""" + if pa is not None and isinstance(value, pa.Scalar): + value = value.as_py() + dtype = dt.dtype(typ) if value is None: if not dtype.nullable: diff --git a/third_party/bigframes_vendored/ibis/formats/pyarrow.py b/third_party/bigframes_vendored/ibis/formats/pyarrow.py index a6861b52e1..491e551ec1 100644 --- a/third_party/bigframes_vendored/ibis/formats/pyarrow.py +++ b/third_party/bigframes_vendored/ibis/formats/pyarrow.py @@ -24,7 +24,6 @@ @functools.cache def _from_pyarrow_types(): import pyarrow as pa - import pyarrow_hotfix # noqa: F401 return { pa.int8(): dt.Int8, @@ -87,7 +86,6 @@ class PyArrowType(TypeMapper): def to_ibis(cls, typ: pa.DataType, 
nullable=True) -> dt.DataType: """Convert a pyarrow type to an ibis type.""" import pyarrow as pa - import pyarrow_hotfix # noqa: F401 if pa.types.is_null(typ): return dt.null diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index dfb1cf9efc..0dd487d056 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -66,6 +66,40 @@ def dayofweek(self): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def day_of_week(self): + """The day of the week with Monday=0, Sunday=6. + + Return the day of the week. It is assumed the week starts on + Monday, which is denoted by 0 and ends on Sunday, which is denoted + by 6. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series() + ... ) + >>> s.dt.day_of_week + 2016-12-31 00:00:00 5 + 2017-01-01 00:00:00 6 + 2017-01-02 00:00:00 0 + 2017-01-03 00:00:00 1 + 2017-01-04 00:00:00 2 + 2017-01-05 00:00:00 3 + 2017-01-06 00:00:00 4 + 2017-01-07 00:00:00 5 + 2017-01-08 00:00:00 6 + dtype: Int64 + + Returns: + Series: Containing integers indicating the day number. + """ + + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def dayofyear(self): """The ordinal day of the year. @@ -94,6 +128,34 @@ def dayofyear(self): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def day_of_year(self): + """The ordinal day of the year. + + **Examples:** + + >>> import pandas as pd + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... pd.date_range('2016-12-28', '2017-01-03', freq='D').to_series() + ... 
) + >>> s.dt.day_of_year + 2016-12-28 00:00:00 363 + 2016-12-29 00:00:00 364 + 2016-12-30 00:00:00 365 + 2016-12-31 00:00:00 366 + 2017-01-01 00:00:00 1 + 2017-01-02 00:00:00 2 + 2017-01-03 00:00:00 3 + dtype: Int64 + + Returns: + Series: Containing integers indicating the day number. + """ + + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def date(self): """Returns a Series with the date part of Timestamps without time and diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index 56f78c6d0b..1c14e8068b 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -91,3 +91,30 @@ def mean_squared_error(y_true, y_pred) -> float: float: Mean squared error. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def mean_absolute_error(y_true, y_pred) -> float: + """Mean absolute error regression loss. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) + >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) + >>> mae = bigframes.ml.metrics.mean_absolute_error(y_true, y_pred) + >>> mae + np.float64(0.5) + + Args: + y_true (Series or DataFrame of shape (n_samples,)): + Ground truth (correct) target values. + y_pred (Series or DataFrame of shape (n_samples,)): + Estimated target values. + + Returns: + float: Mean absolute error. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 9e7a386601..4eec2e8af7 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.11.0" +__version__ = "2.12.0" # {x-release-please-start-date} -__release_date__ = "2025-07-15" +__release_date__ = "2025-07-23" # {x-release-please-end}