# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

from datetime import datetime, timezone

import pyarrow as pa
import pyarrow.compute as pc
import pytest
from datafusion import Accumulator, column, udaf


class Summarize(Accumulator):
    """Interface of a user-defined accumulation."""

    def __init__(self, initial_value: float = 0.0, as_scalar: bool = False):
        self._sum = initial_value
        self.as_scalar = as_scalar

    def state(self) -> list[pa.Scalar]:
        if self.as_scalar:
            return [pa.scalar(self._sum)]
        return [self._sum]

    def update(self, values: pa.Array) -> None:
        # Not nice since pyarrow scalars can't be summed yet.
        # This breaks on `None`
        self._sum = self._sum + pc.sum(values).as_py()

    def merge(self, states: list[pa.Array]) -> None:
        # Not nice since pyarrow scalars can't be summed yet.
        # This breaks on `None`
        self._sum = self._sum + pc.sum(states[0]).as_py()

    def evaluate(self) -> pa.Scalar:
        if self.as_scalar:
            return pa.scalar(self._sum)
        return self._sum


class NotSubclassOfAccumulator:
    pass


class MissingMethods(Accumulator):
    def __init__(self):
        self._sum = pa.scalar(0)

    def state(self) -> list[pa.Scalar]:
        return [self._sum]


class CollectTimestamps(Accumulator):
    def __init__(self, wrap_in_scalar: bool):
        self._values: list[datetime] = []
        self.wrap_in_scalar = wrap_in_scalar

    def state(self) -> list[pa.Scalar]:
        if self.wrap_in_scalar:
            return [pa.scalar(self._values, type=pa.list_(pa.timestamp("ns")))]
        return [pa.array(self._values, type=pa.timestamp("ns"))]

    def update(self, values: pa.Array) -> None:
        self._values.extend(values.to_pylist())

    def merge(self, states: list[pa.Array]) -> None:
        for state in states[0].to_pylist():
            if state is not None:
                self._values.extend(state)

    def evaluate(self) -> pa.Scalar:
        if self.wrap_in_scalar:
            return pa.scalar(self._values, type=pa.list_(pa.timestamp("ns")))
        return pa.array(self._values, type=pa.timestamp("ns"))


@pytest.fixture
def df(ctx):
    # create a RecordBatch and a new DataFrame from it
    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 4, 6])],
        names=["a", "b"],
    )
    return ctx.create_dataframe([[batch]], name="test_table")


def test_errors(df):
    with pytest.raises(TypeError):
        udaf(
            NotSubclassOfAccumulator,
            pa.float64(),
            pa.float64(),
            [pa.float64()],
            volatility="immutable",
        )

    msg = (
        "Can't instantiate abstract class MissingMethods (without an implementation "
        "for abstract methods 'evaluate', 'merge', 'update'|with abstract methods "
        "evaluate, merge, update)"
    )
    with pytest.raises(Exception, match=msg):
        accum = udaf(  # noqa: F841
            MissingMethods,
            pa.int64(),
            pa.int64(),
            [pa.int64()],
            volatility="immutable",
        )


def test_udaf_aggregate(df):
    summarize = udaf(
        Summarize,
        pa.float64(),
        pa.float64(),
        [pa.float64()],
        volatility="immutable",
    )

    df1 = df.aggregate([], [summarize(column("a"))])

    # execute and collect the first (and only) batch
    result = df1.collect()[0]

    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])

    df2 = df.aggregate([], [summarize(column("a"))])

    # Run a second time to ensure the state is properly reset
    result = df2.collect()[0]

    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])


def test_udaf_decorator_aggregate(df):
    @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable")
    def summarize():
        return Summarize()

    df1 = df.aggregate([], [summarize(column("a"))])

    # execute and collect the first (and only) batch
    result = df1.collect()[0]

    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])

    df2 = df.aggregate([], [summarize(column("a"))])

    # Run a second time to ensure the state is properly reset
    result = df2.collect()[0]

    assert result.column(0) == pa.array([1.0 + 2.0 + 3.0])


@pytest.mark.parametrize("as_scalar", [True, False])
def test_udaf_aggregate_with_arguments(df, as_scalar):
    bias = 10.0

    summarize = udaf(
        lambda: Summarize(initial_value=bias, as_scalar=as_scalar),
        pa.float64(),
        pa.float64(),
        [pa.float64()],
        volatility="immutable",
    )

    df1 = df.aggregate([], [summarize(column("a"))])

    # execute and collect the first (and only) batch
    result = df1.collect()[0]

    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])

    df2 = df.aggregate([], [summarize(column("a"))])

    # Run a second time to ensure the state is properly reset
    result = df2.collect()[0]

    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])


def test_udaf_decorator_aggregate_with_arguments(df):
    bias = 10.0

    @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable")
    def summarize():
        return Summarize(bias)

    df1 = df.aggregate([], [summarize(column("a"))])

    # execute and collect the first (and only) batch
    result = df1.collect()[0]

    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])

    df2 = df.aggregate([], [summarize(column("a"))])

    # Run a second time to ensure the state is properly reset
    result = df2.collect()[0]

    assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0])


def test_group_by(df):
    summarize = udaf(
        Summarize,
        pa.float64(),
        pa.float64(),
        [pa.float64()],
        volatility="immutable",
    )

    df = df.aggregate([column("b")], [summarize(column("a"))])

    batches = df.collect()

    arrays = [batch.column(1) for batch in batches]
    joined = pa.concat_arrays(arrays)
    assert joined == pa.array([1.0 + 2.0, 3.0])


def test_register_udaf(ctx, df) -> None:
    summarize = udaf(
        Summarize,
        pa.float64(),
        pa.float64(),
        [pa.float64()],
        volatility="immutable",
    )

    ctx.register_udaf(summarize)

    df_result = ctx.sql("select summarize(b) from test_table")

    assert df_result.collect()[0][0][0].as_py() == 14.0


@pytest.mark.parametrize("wrap_in_scalar", [True, False])
def test_udaf_list_timestamp_return(ctx, wrap_in_scalar) -> None:
    timestamps1 = [
        datetime(2024, 1, 1, tzinfo=timezone.utc),
        datetime(2024, 1, 2, tzinfo=timezone.utc),
    ]
    timestamps2 = [
        datetime(2024, 1, 3, tzinfo=timezone.utc),
        datetime(2024, 1, 4, tzinfo=timezone.utc),
    ]
    batch1 = pa.RecordBatch.from_arrays(
        [pa.array(timestamps1, type=pa.timestamp("ns"))],
        names=["ts"],
    )
    batch2 = pa.RecordBatch.from_arrays(
        [pa.array(timestamps2, type=pa.timestamp("ns"))],
        names=["ts"],
    )
    df = ctx.create_dataframe([[batch1], [batch2]], name="timestamp_table")

    list_type = pa.list_(
        pa.field("item", type=pa.timestamp("ns"), nullable=wrap_in_scalar)
    )

    collect = udaf(
        lambda: CollectTimestamps(wrap_in_scalar),
        pa.timestamp("ns"),
        list_type,
        [list_type],
        volatility="immutable",
    )

    result = df.aggregate([], [collect(column("ts"))]).collect()[0]

    # There is no guarantee about the ordering of the batches, so perform a sort
    # to get consistent results. Alternatively we could sort on evaluate().
    assert (
        result.column(0).values.sort()
        == pa.array(
            [[*timestamps1, *timestamps2]],
            type=list_type,
        ).values
    )