Skip to content

Commit 1d225ac

Browse files
abhizer and Leonid Ryzhyk authored
py: properly serialize DataFrames with Timestamp columns (#1846)
* py: properly serialize DataFrames with Timestamp columns Fixes: #1840 Also does the following things: * chunk dataframes into smaller groups of 1000 rows per request while ingesting data * avoids adding empty dataframes to output buffer * ignores the index while concatenating output dataframes Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> * py: Testing instructions. Signed-off-by: Leonid Ryzhyk <leonid@feldera.com> * py: Encode Pandas timestamps as epoch. Introduces a new JSON dialect that matches how Pandas encodes timestamp types as millis since epoch. Signed-off-by: Leonid Ryzhyk <leonid@feldera.com> * py: rename dont_serialize to serialize in push_to_pipeline Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> --------- Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> Signed-off-by: Leonid Ryzhyk <leonid@feldera.com> Co-authored-by: Leonid Ryzhyk <leonid@feldera.com>
1 parent 768e47b commit 1d225ac

File tree

11 files changed

+112
-16
lines changed

11 files changed

+112
-16
lines changed

crates/pipeline-types/src/format/json.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ pub enum JsonFlavor {
140140
/// JSON format accepted by the Kafka Connect `JsonConverter` class.
141141
#[serde(rename = "kafka_connect_json_converter")]
142142
KafkaConnectJsonConverter,
143+
#[serde(rename = "pandas")]
144+
Pandas,
143145
/// Parquet to-json format.
144146
/// (For internal use only)
145147
#[serde(skip)]

crates/pipeline-types/src/serde_with_context/serde_config.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,12 @@ impl From<JsonFlavor> for SqlSerdeConfig {
140140
timestamp_format: TimestampFormat::String("%Y-%m-%dT%H:%M:%S%.f%:z"),
141141
decimal_format: DecimalFormat::String,
142142
},
143+
JsonFlavor::Pandas => Self {
144+
time_format: TimeFormat::String("%H:%M:%S%.f"),
145+
date_format: DateFormat::String("%Y-%m-%d"),
146+
timestamp_format: TimestampFormat::MillisSinceEpoch,
147+
decimal_format: DecimalFormat::String,
148+
},
143149
JsonFlavor::ParquetConverter => Self {
144150
time_format: TimeFormat::Nanos,
145151
date_format: DateFormat::String("%Y-%m-%d"),

openapi.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3790,7 +3790,8 @@
37903790
"default",
37913791
"debezium_mysql",
37923792
"snowflake",
3793-
"kafka_connect_json_converter"
3793+
"kafka_connect_json_converter",
3794+
"pandas"
37943795
]
37953796
},
37963797
"JsonParserConfig": {

python/README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,12 @@ sphinx-apidoc -o . ../feldera
3232
make html
3333
```
3434

35-
To clean the build, run `make clean`.
35+
To clean the build, run `make clean`.
36+
37+
## Testing
38+
39+
40+
```bash
41+
cd python
42+
python3 -m unittest
43+
```

python/feldera/_helpers.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,12 @@ def validate_connector_input_format(fmt: Format):
3232

3333
if isinstance(fmt, JSONFormat) and fmt.config.get("update_format") is None:
3434
raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
35+
36+
37+
def chunk_dataframe(df, chunk_size=1000):
38+
"""
39+
Yield successive n-sized chunks from the given dataframe.
40+
"""
41+
42+
for i in range(0, len(df), chunk_size):
43+
yield df.iloc[i:i + chunk_size]

python/feldera/output_handler.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, qu
2020

2121
# the callback that is passed to the `CallbackRunner`
2222
def callback(df: pd.DataFrame, _: int):
23-
self.buffer.append(df)
23+
if not df.empty:
24+
self.buffer.append(df)
2425

2526
# sets up the callback runner
2627
self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, callback, queue)
@@ -38,4 +39,7 @@ def to_pandas(self):
3839
"""
3940

4041
self.handler.join()
41-
return pd.concat(self.buffer)
42+
43+
if len(self.buffer) == 0:
44+
return pd.DataFrame()
45+
return pd.concat(self.buffer, ignore_index=True)

python/feldera/rest/_httprequests.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
from typing import Callable, Optional, Any, Union, Mapping, Sequence, List
1010

1111

12+
def json_serialize(body: Any) -> str:
13+
return json.dumps(body) if body else "" if body == "" else "null"
14+
15+
1216
class HttpRequests:
1317
def __init__(self, config: Config) -> None:
1418
self.config = config
@@ -28,6 +32,7 @@ def send_request(
2832
content_type: str = "application/json",
2933
params: Optional[Mapping[str, Any]] = None,
3034
stream: bool = False,
35+
serialize: bool = True,
3136
) -> Any:
3237
"""
3338
:param http_method: The HTTP method to use. Takes the equivalent `requests.*` module. (Example: `requests.get`)
@@ -36,6 +41,7 @@ def send_request(
3641
:param content_type: The value for `Content-Type` HTTP header. "application/json" by default.
3742
:param params: The query parameters part of this request.
3843
:param stream: True if the response is expected to be a HTTP stream.
44+
:param serialize: True if the body needs to be serialized to JSON.
3945
"""
4046
self.headers["Content-Type"] = content_type
4147

@@ -71,7 +77,7 @@ def send_request(
7177
request_path,
7278
timeout=timeout,
7379
headers=headers,
74-
data=json.dumps(body) if body else "" if body == "" else "null",
80+
data=json_serialize(body) if serialize else body,
7581
params=params,
7682
stream=stream,
7783
)
@@ -102,8 +108,16 @@ def post(
102108
content_type: Optional[str] = "application/json",
103109
params: Optional[Mapping[str, Any]] = None,
104110
stream: bool = False,
111+
serialize: bool = True,
105112
) -> Any:
106-
return self.send_request(requests.post, path, body, content_type, params, stream=stream)
113+
return self.send_request(
114+
requests.post,
115+
path,
116+
body,
117+
content_type,
118+
params, stream=stream,
119+
serialize=serialize
120+
)
107121

108122
def patch(
109123
self,

python/feldera/rest/feldera_client.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,19 @@ def __init__(
3232
"""
3333
:param url: The url to Feldera API (ex: https://try.feldera.com)
3434
:param api_key: The optional API key for Feldera
35-
:param timeout: (optional) The amount of time in seconds that the cient will wait for a response beforing timing
35+
:param timeout: (optional) The amount of time in seconds that the client will wait for a response before timing
3636
out.
3737
"""
3838

3939
self.config = Config(url, api_key, timeout)
4040
self.http = HttpRequests(self.config)
4141

42+
try:
43+
self.programs()
44+
except Exception as e:
45+
logging.error(f"Failed to connect to Feldera API: {e}")
46+
raise e
47+
4248
def programs(self) -> list[Program]:
4349
"""
4450
Get all programs
@@ -381,6 +387,8 @@ def push_to_pipeline(
381387
array: bool = False,
382388
force: bool = False,
383389
update_format: str = "raw",
390+
json_flavor: str = None,
391+
serialize: bool = True,
384392
):
385393
"""
386394
Insert data into a pipeline
@@ -394,8 +402,10 @@ def push_to_pipeline(
394402
:param force: If True, the data will be inserted even if the pipeline is paused
395403
:param update_format: JSON data change event format, used in conjunction with the "json" format,
396404
the default value is "insert_delete", other supported formats: "weighted", "debezium", "snowflake", "raw"
397-
405+
:param json_flavor: JSON encoding used for individual table records, the default value is "default", other supported encodings:
406+
"debezium_mysql", "snowflake", "kafka_connect_json_converter", "pandas"
398407
:param data: The data to insert
408+
:param serialize: If True, the data will be serialized to JSON. True by default
399409
"""
400410

401411
if format not in ["json", "csv"]:
@@ -404,6 +414,9 @@ def push_to_pipeline(
404414
if update_format not in ["insert_delete", "weighted", "debezium", "snowflake", "raw"]:
405415
raise ValueError("update_format must be one of 'insert_delete', 'weighted', 'debezium', 'snowflake', 'raw'")
406416

417+
if json_flavor is not None and json_flavor not in ["default", "debezium_mysql", "snowflake", "kafka_connect_json_converter", "pandas"]:
418+
raise ValueError("json_flavor must be one of 'default', 'debezium_mysql', 'snowflake', 'kafka_connect_json_converter', 'pandas'")
419+
407420
# python sends `True` which isn't accepted by the backend
408421
array = _prepare_boolean_input(array)
409422
force = _prepare_boolean_input(force)
@@ -417,6 +430,9 @@ def push_to_pipeline(
417430
params["array"] = array
418431
params["update_format"] = update_format
419432

433+
if json_flavor is not None:
434+
params["json_flavor"] = json_flavor
435+
420436
content_type = "application/json"
421437

422438
if format == "csv":
@@ -428,6 +444,7 @@ def push_to_pipeline(
428444
params=params,
429445
content_type=content_type,
430446
body=data,
447+
serialize=serialize,
431448
)
432449

433450
def listen_to_pipeline(
@@ -493,4 +510,4 @@ def listen_to_pipeline(
493510
if end and time.time() > end:
494511
break
495512
if chunk:
496-
yield json.loads(chunk)
513+
yield json.loads(chunk)

python/feldera/sql_context.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from typing import Optional, Dict, Callable
66

7+
import pandas as pd
78
from typing_extensions import Self
89
from queue import Queue
910

@@ -18,9 +19,9 @@
1819
from feldera._callback_runner import CallbackRunner, _CallbackRunnerInstruction
1920
from feldera._helpers import ensure_dataframe_has_columns
2021
from feldera.formats import JSONFormat, CSVFormat, AvroFormat
21-
from feldera._helpers import validate_connector_input_format
2222
from feldera.resources import Resources
2323
from feldera.enums import BuildMode, CompilationProfile
24+
from feldera._helpers import validate_connector_input_format, chunk_dataframe
2425

2526

2627
def _table_name_from_sql(ddl: str) -> str:
@@ -72,7 +73,7 @@ def __init__(
7273
# TODO: to be used for schema inference
7374
self.todo_tables: Dict[str, Optional[SQLTable]] = {}
7475

75-
self.http_input_buffer: list[Dict[str, dict | list[dict] | str]] = []
76+
self.http_input_buffer: list[Dict[str, pd.DataFrame]] = []
7677

7778
# buffer that stores all input connectors to be created
7879
# this is a Mapping[table_name -> list[Connector]]
@@ -173,7 +174,16 @@ def __push_http_inputs(self):
173174

174175
for input_buffer in self.http_input_buffer:
175176
for tbl_name, data in input_buffer.items():
176-
self.client.push_to_pipeline(self.pipeline_name, tbl_name, "json", data, array=True)
177+
for datum in chunk_dataframe(data):
178+
self.client.push_to_pipeline(
179+
self.pipeline_name,
180+
tbl_name,
181+
"json",
182+
datum.to_json(orient='records', date_format='epoch'),
183+
json_flavor='pandas',
184+
array=True,
185+
serialize=False
186+
)
177187

178188
self.http_input_buffer.clear()
179189

@@ -273,7 +283,7 @@ def connect_source_pandas(self, table_name: str, df: pandas.DataFrame):
273283

274284
if tbl:
275285
# tbl.validate_schema(df) TODO: something like this would be nice
276-
self.http_input_buffer.append({tbl.name: df.to_dict('records')})
286+
self.http_input_buffer.append({tbl.name: df})
277287
return
278288

279289
tbl = self.todo_tables.get(table_name)

python/tests/test_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,18 +135,18 @@ def test_listen_to_pipeline(self):
135135
name = str(uuid.uuid4())
136136
self.test_create_pipeline(name, False)
137137

138-
TEST_CLIENT.start_pipeline(name)
138+
TEST_CLIENT.pause_pipeline(name)
139139

140140
t1 = threading.Thread(target=self.__listener, args=(name,))
141141
t1.start()
142142

143+
TEST_CLIENT.start_pipeline(name)
143144
TEST_CLIENT.push_to_pipeline(name, "tbl", "csv", data)
144145

145146
t1.join()
146147

147148
assert self.result
148149

149-
TEST_CLIENT.pause_pipeline(name)
150150
TEST_CLIENT.shutdown_pipeline(name)
151151
TEST_CLIENT.delete_pipeline(name)
152152

0 commit comments

Comments
 (0)