commit 1aac1e95622e87003793a716bfc89d15419a3621
revert kafkaStreamProcessor changes, change base type instead

Signed-off-by: Chester Ong <chester.ong.ch@gmail.com>
bushwhackr committed Feb 16, 2024
sdk/python/feast/infra/contrib/spark_kafka_processor.py (8 additions & 3 deletions)
@@ -5,6 +5,7 @@
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.avro.functions import from_avro
 from pyspark.sql.functions import col, from_json
+from pyspark.sql.streaming import StreamingQuery

 from feast.data_format import AvroFormat, JsonFormat
 from feast.data_source import KafkaSource, PushMode
@@ -67,10 +68,13 @@ def __init__(
         # data_source type has been checked to be an instance of KafkaSource.
         self.data_source: KafkaSource = self.data_source  # type: ignore

-    def ingest_stream_feature_view(self, to: PushMode = PushMode.ONLINE) -> None:
+    def ingest_stream_feature_view(
+        self, to: PushMode = PushMode.ONLINE
+    ) -> StreamingQuery:
         ingested_stream_df = self._ingest_stream_data()
         transformed_df = self._construct_transformation_plan(ingested_stream_df)
-        self._write_stream_data(transformed_df, to)
+        online_store_query = self._write_stream_data(transformed_df, to)
+        return online_store_query

     # In the line 64 of __init__(), the "data_source" is assigned a stream_source (and has to be KafkaSource as in line 40).
     @no_type_check
@@ -127,7 +131,7 @@ def _ingest_stream_data(self) -> StreamTable:
     def _construct_transformation_plan(self, df: StreamTable) -> StreamTable:
         return self.sfv.udf.__call__(df) if self.sfv.udf else df

-    def _write_stream_data(self, df: StreamTable, to: PushMode) -> None:
+    def _write_stream_data(self, df: StreamTable, to: PushMode) -> StreamingQuery:
         # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema.
         def batch_write(row: DataFrame, batch_id: int):
             rows: pd.DataFrame = row.toPandas()
@@ -166,3 +170,4 @@ def batch_write(row: DataFrame, batch_id: int):
         )

         query.awaitTermination(timeout=self.query_timeout)
+        return query
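
With this change, ingest_stream_feature_view returns the Spark StreamingQuery that drives the online-store write, so callers get a handle to the streaming job. Note that _write_stream_data still calls query.awaitTermination(timeout=self.query_timeout) before returning, so the handle comes back only after that timeout elapses or the stream terminates. A minimal sketch of how a caller might use the returned handle; the processor setup is elided and hypothetical, not part of this diff:

    from feast.data_source import PushMode

    processor = ...  # assume a fully configured SparkKafkaProcessor

    # The returned pyspark.sql.streaming.StreamingQuery lets the caller
    # inspect and manage the stream instead of losing the handle.
    query = processor.ingest_stream_feature_view(to=PushMode.ONLINE)
    print(query.isActive)      # is the stream still running?
    print(query.lastProgress)  # metrics from the most recent micro-batch
    query.stop()               # shut the stream down cleanly
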
sdk/python/feast/infra/contrib/stream_processor.py (5 additions & 3 deletions)
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from types import MethodType
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional

 from pyspark.sql import DataFrame
 from typing_extensions import TypeAlias
@@ -51,7 +51,9 @@ def __init__(
         self.data_source = data_source

     @abstractmethod
-    def ingest_stream_feature_view(self, to: PushMode = PushMode.ONLINE) -> None:
+    def ingest_stream_feature_view(
+        self, to: PushMode = PushMode.ONLINE
+    ) -> Optional[Any]:
         """
         Ingests data from the stream source attached to the stream feature view; transforms the data
         and then persists it to the online store and/or offline store, depending on the 'to' parameter.
@@ -75,7 +77,7 @@ def _construct_transformation_plan(self, table: StreamTable) -> StreamTable:
         raise NotImplementedError

     @abstractmethod
-    def _write_stream_data(self, table: StreamTable, to: PushMode) -> None:
+    def _write_stream_data(self, table: StreamTable, to: PushMode) -> Optional[Any]:
         """
         Launches a job to persist stream data to the online store and/or offline store, depending
         on the 'to' parameter, and returns a handle for the job.
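
The abstract base class now returns Optional[Any] rather than a Spark-specific type, keeping the StreamProcessor interface engine-agnostic while still letting subclasses hand back a job handle. A hedged sketch (not from the Feast codebase; MySparkProcessor is hypothetical) of how a concrete processor might narrow the base signature to its own handle type:

    from pyspark.sql.streaming import StreamingQuery

    from feast.data_source import PushMode
    from feast.infra.contrib.stream_processor import StreamProcessor


    class MySparkProcessor(StreamProcessor):
        """Hypothetical subclass; shown only to illustrate the signatures."""

        def ingest_stream_feature_view(
            self, to: PushMode = PushMode.ONLINE
        ) -> StreamingQuery:
            # Narrowing the base class's Optional[Any] to a concrete
            # handle type is a covariant, type-safe override.
            ...
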