Skip to content

Commit ed0cdf4

Browse files
committed
add integration test
Signed-off-by: HaoXuAI <sduxuhao@gmail.com>
1 parent 25af94e commit ed0cdf4

File tree

6 files changed

+212
-7
lines changed

6 files changed

+212
-7
lines changed

sdk/python/feast/batch_feature_view.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class BatchFeatureView(FeatureView):
6464
udf: Optional[Callable[[Any], Any]]
6565
udf_string: Optional[str]
6666
feature_transformation: Transformation
67+
batch_engine: Optional[Field]
6768

6869
def __init__(
6970
self,
@@ -82,6 +83,7 @@ def __init__(
8283
udf: Optional[Callable[[Any], Any]],
8384
udf_string: Optional[str] = "",
8485
feature_transformation: Optional[Transformation] = None,
86+
batch_engine: Optional[Field] = None,
8587
):
8688
if not flags_helper.is_test():
8789
warnings.warn(
@@ -105,6 +107,7 @@ def __init__(
105107
self.feature_transformation = (
106108
feature_transformation or self.get_feature_transformation()
107109
)
110+
self.batch_engine = batch_engine
108111

109112
super().__init__(
110113
name=name,
@@ -147,18 +150,21 @@ def batch_feature_view(
147150
source: Optional[DataSource] = None,
148151
tags: Optional[Dict[str, str]] = None,
149152
online: bool = True,
153+
offline: bool = True,
150154
description: str = "",
151155
owner: str = "",
152156
schema: Optional[List[Field]] = None,
153157
):
154158
"""
155159
Args:
156160
name:
161+
mode:
157162
entities:
158163
ttl:
159164
source:
160165
tags:
161166
online:
167+
offline:
162168
description:
163169
owner:
164170
schema:
@@ -184,6 +190,7 @@ def decorator(user_function):
184190
source=source,
185191
tags=tags,
186192
online=online,
193+
offline=offline,
187194
description=description,
188195
owner=owner,
189196
schema=schema,

sdk/python/feast/infra/compute_engines/dag/builder.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,14 @@ def build(self) -> ExecutionPlan:
6969
return ExecutionPlan(self.nodes)
7070

7171
def _should_join(self):
72-
return (
73-
self.feature_view.compute_config.join_strategy == "engine"
74-
or self.task.config.compute_engine.get("point_in_time_join") == "engine"
75-
)
72+
if hasattr(self.feature_view, "batch_engine"):
73+
return hasattr(self.feature_view.batch_engine, "join_strategy") and (
74+
self.feature_view.batch_engine.join_strategy == "engine"
75+
or self.task.config.batch_engine.get("point_in_time_join") == "engine"
76+
)
77+
if hasattr(self.feature_view, "batch_engine_config"):
78+
return hasattr(self.feature_view.stream_engine, "join_strategy") and (
79+
self.feature_view.stream_engine.join_strategy == "engine"
80+
or self.task.config.stream_engine.get("point_in_time_join") == "engine"
81+
)
82+
return False

sdk/python/feast/infra/compute_engines/spark/spark_dag_builder.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,11 @@ def __init__(
2727
self.spark_session = spark_session
2828

2929
def build_source_node(self):
    """Create the read node for the current task, register it, and return it.

    Materialization tasks read through ``SparkMaterializationReadNode``;
    any other task (historical retrieval) reads through
    ``SparkHistoricalRetrievalReadNode``, which also needs the Spark session.
    """
    is_materialization = isinstance(self.task, MaterializationTask)
    source_node = (
        SparkMaterializationReadNode("source", self.task)
        if is_materialization
        else SparkHistoricalRetrievalReadNode(
            "source", self.task, self.spark_session
        )
    )
    self.nodes.append(source_node)
    return source_node

sdk/python/feast/stream_feature_view.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class StreamFeatureView(FeatureView):
8383
udf: Optional[FunctionType]
8484
udf_string: Optional[str]
8585
feature_transformation: Optional[Transformation]
86+
stream_engine: Optional[Field]
8687

8788
def __init__(
8889
self,
@@ -103,6 +104,7 @@ def __init__(
103104
udf: Optional[FunctionType] = None,
104105
udf_string: Optional[str] = "",
105106
feature_transformation: Optional[Transformation] = None,
107+
stream_engine: Optional[Field] = None,
106108
):
107109
if not flags_helper.is_test():
108110
warnings.warn(
@@ -133,6 +135,7 @@ def __init__(
133135
self.feature_transformation = (
134136
feature_transformation or self.get_feature_transformation()
135137
)
138+
self.stream_engine = stream_engine
136139

137140
super().__init__(
138141
name=name,
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from datetime import timedelta
2+
3+
from pyspark.sql import DataFrame
4+
5+
from feast import BatchFeatureView, Entity, Field, FileSource
6+
from feast.types import Float32, Int32, Int64
7+
8+
# File-based offline source for driver stats; the path is a placeholder that
# the integration test substitutes with a concrete parquet file path.
driver_hourly_stats = FileSource(
    path="%PARQUET_PATH%",  # placeholder to be replaced by the test
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# Entity keyed by driver_id, used by the driver_hourly_stats view below.
driver = Entity(
    name="driver_id",
    description="driver id",
)
18+
19+
20+
def transform_feature(df: DataFrame) -> DataFrame:
    """Return *df* with the conv_rate and acc_rate columns doubled."""
    for column in ("conv_rate", "acc_rate"):
        df = df.withColumn(column, df[column] * 2)
    return df
24+
25+
26+
# Batch feature view over the driver stats source. The udf (transform_feature)
# doubles conv_rate and acc_rate. online=True/offline=True presumably flag the
# view for both online and offline stores — confirm against BatchFeatureView.
driver_hourly_stats_view = BatchFeatureView(
    name="driver_hourly_stats",
    entities=[driver],
    mode="python",
    udf=transform_feature,
    udf_string="transform_feature",
    ttl=timedelta(days=1),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="acc_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
        Field(name="driver_id", dtype=Int32),
    ],
    online=True,
    offline=True,
    source=driver_hourly_stats,
    tags={},
)
44+
45+
46+
# Second offline source for entity-less "global" stats; the path placeholder
# is substituted by the integration test.
global_daily_stats = FileSource(
    path="%PARQUET_PATH_GLOBAL%",  # placeholder to be replaced by the test
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)


# Entity-less batch feature view (entities=None) whose udf is the identity —
# rows pass through unchanged.
global_stats_feature_view = BatchFeatureView(
    name="global_daily_stats",
    entities=None,
    mode="python",
    udf=lambda x: x,
    ttl=timedelta(days=1),
    schema=[
        Field(name="num_rides", dtype=Int32),
        Field(name="avg_ride_length", dtype=Float32),
    ],
    online=True,
    offline=True,
    source=global_daily_stats,
    tags={},
)
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from datetime import datetime, timedelta
2+
from typing import cast
3+
from unittest.mock import MagicMock
4+
5+
import pandas as pd
6+
import pytest
7+
8+
from feast.infra.compute_engines.base import HistoricalRetrievalTask
9+
from feast.infra.compute_engines.spark.compute import SparkComputeEngine
10+
from feast.infra.compute_engines.spark.job import SparkDAGRetrievalJob
11+
from feast.infra.offline_stores.contrib.spark_offline_store.spark import (
12+
SparkOfflineStore,
13+
)
14+
from feast.infra.offline_stores.contrib.spark_offline_store.tests.data_source import (
15+
SparkDataSourceCreator,
16+
)
17+
from tests.example_repos.example_feature_repo_with_bfvs_compute import (
18+
global_stats_feature_view,
19+
)
20+
from tests.integration.feature_repos.integration_test_repo_config import (
21+
IntegrationTestRepoConfig,
22+
)
23+
from tests.integration.feature_repos.repo_configuration import (
24+
construct_test_environment,
25+
)
26+
from tests.integration.feature_repos.universal.online_store.redis import (
27+
RedisOnlineStoreCreator,
28+
)
29+
30+
31+
@pytest.mark.integration
def test_spark_compute_engine_get_historical_features():
    """End-to-end check of SparkComputeEngine.get_historical_features:
    build a parquet-backed source, run a point-in-time join against an
    entity DataFrame, and verify the latest feature row per driver wins.
    """
    now = datetime.utcnow()

    # Spark offline store + Redis online store, driven by the Spark engine.
    spark_config = IntegrationTestRepoConfig(
        provider="local",
        online_store_creator=RedisOnlineStoreCreator,
        offline_store_creator=SparkDataSourceCreator,
        batch_engine={"type": "spark.engine", "partitions": 10},
    )
    spark_environment = construct_test_environment(
        spark_config, None, entity_key_serialization_version=2
    )

    spark_environment.setup()
    # TODO(review): wrap the remainder in try/finally and tear the environment
    # down so Spark/Redis resources are not leaked on assertion failure.

    # Prepare the test parquet feature rows: two drivers, driver 1001 with two
    # event times so the point-in-time join must pick the most recent row.
    # NOTE(review): these columns (conv_rate/acc_rate/avg_daily_trips) do not
    # match global_stats_feature_view's declared schema (num_rides,
    # avg_ride_length); the registry is mocked so this passes — confirm it is
    # intentional.
    df = pd.DataFrame(
        [
            {
                "driver_id": 1001,
                "event_timestamp": now - timedelta(days=1),
                "created": now - timedelta(hours=2),
                "conv_rate": 0.8,
                "acc_rate": 0.95,
                "avg_daily_trips": 15,
            },
            {
                "driver_id": 1001,
                "event_timestamp": now - timedelta(days=2),
                "created": now - timedelta(hours=3),
                "conv_rate": 0.75,
                "acc_rate": 0.9,
                "avg_daily_trips": 14,
            },
            {
                "driver_id": 1002,
                "event_timestamp": now - timedelta(days=1),
                "created": now - timedelta(hours=2),
                "conv_rate": 0.7,
                "acc_rate": 0.88,
                "avg_daily_trips": 12,
            },
        ]
    )

    ds = spark_environment.data_source_creator.create_data_source(
        df,
        spark_environment.feature_store.project,
        field_mapping={"ts_1": "ts"},
    )
    # Re-point the module-level feature view at the freshly created source.
    global_stats_feature_view.source = ds

    # Entity DataFrame to join with.
    entity_df = pd.DataFrame(
        [
            {"driver_id": 1001, "event_timestamp": now},
            {"driver_id": 1002, "event_timestamp": now},
        ]
    )

    # Build the retrieval task; registry is mocked because only the view
    # object passed in here is needed.
    task = HistoricalRetrievalTask(
        entity_df=entity_df,
        feature_view=global_stats_feature_view,
        full_feature_name=False,
        registry=MagicMock(),
        config=spark_environment.config,
        start_time=now - timedelta(days=1),
        end_time=now,
    )

    # Run SparkComputeEngine.
    engine = SparkComputeEngine(
        repo_config=task.config,
        offline_store=SparkOfflineStore(),
        online_store=MagicMock(),
        registry=MagicMock(),
    )

    spark_dag_retrieval_job = engine.get_historical_features(task)
    spark_df = cast(SparkDAGRetrievalJob, spark_dag_retrieval_job).to_spark_df()
    # BUGFIX: pyspark.sql.DataFrame exposes toPandas(), not to_pandas()
    # (to_pandas exists only on pandas-on-Spark frames), so the original
    # call would raise AttributeError.
    df_out = spark_df.toPandas().sort_values("driver_id").reset_index(drop=True)

    # Assert: one row per driver, carrying the most recent conv_rate.
    assert list(df_out.driver_id) == [1001, 1002]
    assert abs(df_out.loc[0]["conv_rate"] - 0.8) < 1e-6
    assert abs(df_out.loc[1]["conv_rate"] - 0.7) < 1e-6


if __name__ == "__main__":
    test_spark_compute_engine_get_historical_features()

0 commit comments

Comments
 (0)