commit 1aac1e95622e87003793a716bfc89d15419a3621
revert kafkaStreamProcessor changes, change base type instead

Signed-off-by: Chester Ong <chester.ong.ch@gmail.com>
bushwhackr committed Feb 16, 2024
sdk/python/feast/infra/contrib/spark_kafka_processor.py (8 additions & 3 deletions)
@@ -5,6 +5,7 @@
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.avro.functions import from_avro
 from pyspark.sql.functions import col, from_json
+from pyspark.sql.streaming import StreamingQuery

 from feast.data_format import AvroFormat, JsonFormat
 from feast.data_source import KafkaSource, PushMode
@@ -67,10 +68,13 @@ def __init__(
         # data_source type has been checked to be an instance of KafkaSource.
         self.data_source: KafkaSource = self.data_source  # type: ignore

-    def ingest_stream_feature_view(self, to: PushMode = PushMode.ONLINE) -> None:
+    def ingest_stream_feature_view(
+        self, to: PushMode = PushMode.ONLINE
+    ) -> StreamingQuery:
         ingested_stream_df = self._ingest_stream_data()
         transformed_df = self._construct_transformation_plan(ingested_stream_df)
-        self._write_stream_data(transformed_df, to)
+        online_store_query = self._write_stream_data(transformed_df, to)
+        return online_store_query

     # In the line 64 of __init__(), the "data_source" is assigned a stream_source (and has to be KafkaSource as in line 40).
     @no_type_check
@@ -127,7 +131,7 @@ def _ingest_stream_data(self) -> StreamTable:
     def _construct_transformation_plan(self, df: StreamTable) -> StreamTable:
         return self.sfv.udf.__call__(df) if self.sfv.udf else df

-    def _write_stream_data(self, df: StreamTable, to: PushMode) -> None:
+    def _write_stream_data(self, df: StreamTable, to: PushMode) -> StreamingQuery:
         # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema.
         def batch_write(row: DataFrame, batch_id: int):
             rows: pd.DataFrame = row.toPandas()
@@ -166,3 +170,4 @@ def batch_write(row: DataFrame, batch_id: int):
         )

         query.awaitTermination(timeout=self.query_timeout)
+        return query
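
With this change, ingest_stream_feature_view returns the Spark StreamingQuery that drives the online-store write, so callers get a handle to the streaming job. Note that _write_stream_data still calls query.awaitTermination(timeout=self.query_timeout) before returning, so the handle comes back only after that timeout elapses or the stream terminates. A minimal sketch of how a caller might use the returned handle; the processor setup is elided and hypothetical, not part of this diff:

    from feast.data_source import PushMode

    processor = ...  # assume a fully configured SparkKafkaProcessor

    # The returned pyspark.sql.streaming.StreamingQuery lets the caller
    # inspect and manage the stream instead of losing the handle.
    query = processor.ingest_stream_feature_view(to=PushMode.ONLINE)
    print(query.isActive)      # is the stream still running?
    print(query.lastProgress)  # metrics from the most recent micro-batch
    query.stop()               # shut the stream down cleanly
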
sdk/python/feast/infra/contrib/stream_processor.py (5 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from types import MethodType
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional

 from pyspark.sql import DataFrame
 from typing_extensions import TypeAlias
@@ -51,7 +51,9 @@ def __init__(
         self.data_source = data_source

     @abstractmethod
-    def ingest_stream_feature_view(self, to: PushMode = PushMode.ONLINE) -> None:
+    def ingest_stream_feature_view(
+        self, to: PushMode = PushMode.ONLINE
+    ) -> Optional[Any]:
         """
         Ingests data from the stream source attached to the stream feature view; transforms the data
         and then persists it to the online store and/or offline store, depending on the 'to' parameter.
@@ -75,7 +77,7 @@ def _construct_transformation_plan(self, table: StreamTable) -> StreamTable:
         raise NotImplementedError

     @abstractmethod
-    def _write_stream_data(self, table: StreamTable, to: PushMode) -> None:
+    def _write_stream_data(self, table: StreamTable, to: PushMode) -> Optional[Any]:
         """
         Launches a job to persist stream data to the online store and/or offline store, depending
         on the 'to' parameter, and returns a handle for the job.
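
The abstract base class now returns Optional[Any] rather than a Spark-specific type, keeping the StreamProcessor interface engine-agnostic while still letting subclasses hand back a job handle. A hedged sketch (not from the Feast codebase; MySparkProcessor is hypothetical) of how a concrete processor might narrow the base signature to its own handle type:

    from pyspark.sql.streaming import StreamingQuery

    from feast.data_source import PushMode
    from feast.infra.contrib.stream_processor import StreamProcessor


    class MySparkProcessor(StreamProcessor):
        """Hypothetical subclass; shown only to illustrate the signatures."""

        def ingest_stream_feature_view(
            self, to: PushMode = PushMode.ONLINE
        ) -> StreamingQuery:
            # Narrowing the base class's Optional[Any] to a concrete
            # handle type is a covariant, type-safe override.
            ...
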