
Commit 334c2ad

abhizer and ryzhyk authored
py: convert SQL types to proper pandas type (#2305)
py: convert SQL types to proper pandas type (#2305)

* py: convert SQL types to proper pandas type

  When receiving data from Feldera and creating a Pandas DataFrame, this
  commit considers the SQL schema of the data, and then uses the
  appropriate pandas types for the data columns.

  Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>

* docs: document pandas types compatibility

  Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>

* Update python/feldera/_callback_runner.py

  Co-authored-by: Leonid Ryzhyk <ryzhyk@gmail.com>
  Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>

---------

Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>
Co-authored-by: Leonid Ryzhyk <ryzhyk@gmail.com>
1 parent 46f7f83 commit 334c2ad

File tree

6 files changed: +359 insertions, −66 deletions


python/docs/examples.rst

Lines changed: 3 additions & 61 deletions
@@ -1,66 +1,8 @@
 Examples
 ========
 
-Using Pandas DataFrames as Input / Output
-*******************************************
+Specifying Data Sources / Sinks
+*******************************
 
-
-You can use :meth:`.Pipeline.input_pandas` to insert records from a
-DataFrame to a Feldera table.
-
-Use :meth:`.Pipeline.listen` to subscribe to updates to a view in the form of a stream of DataFrames.
-To ensure all data is received start listening before calling
-:meth:`.Pipeline.start`.
-
-.. highlight:: python
-.. code-block:: python
-
-    from feldera import FelderaClient, PipelineBuilder
-    import pandas as pd
-
-    sql = f"""
-    CREATE TABLE students (
-        name STRING,
-        id INT
-    );
-
-    CREATE TABLE grades (
-        student_id INT,
-        science INT,
-        maths INT,
-        art INT
-    );
-
-    CREATE VIEW average_scores AS SELECT name, ((science + maths + art) / 3) as average FROM {TBL_NAMES[0]} JOIN {TBL_NAMES[1]} on id = student_id ORDER BY average DESC;
-    """
-
-    # Create a client
-    client = FelderaClient("https://try.feldera.com", api_key="YOUR_API_KEY")
-    pipeline = PipelineBuilder(client, name="notebook", sql=sql).create_or_replace()
-
-    df_students = pd.read_csv('students.csv')
-    df_grades = pd.read_csv('grades.csv')
-
-    # listen for the output of the view here in the notebook
-    # you do not need to call this if you are forwarding the data to a sink
-    out = pipeline.listen("average_scores")
-
-    pipeline.start()
-    pipeline.input_pandas("students", df_students)
-    pipeline.input_pandas("grades", df_grades)
-
-    # wait for the pipeline to complete
-    # note that if the source is a stream, this will run indefinitely
-    pipeline.wait_for_completion(True)
-    df = out.to_pandas()
-
-    # see the result
-    print(df)
-
-    pipeline.delete()
-
-Using Other Data Sources / Sinks
-**********************************
-
-To connect Feldera to other data sources or sinks, you can specify them in the SQL code.
+To connect Feldera to data sources or sinks, you can specify them in the SQL code.
 Refer to the connector documentation at: https://github.com/feldera/feldera/tree/main/docs/connectors

python/docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ Welcome to feldera's documentation!
    :caption: Contents:
 
    introduction
+   pandas
    examples
 
 .. toctree::

python/docs/pandas.rst

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+Pandas Compatibility
+====================
+
+Feldera tries to be as compatible with Pandas as possible.
+However, some SQL types have limited support in Pandas.
+
+Columns with the following SQL types will be converted to the corresponding Pandas types:
+
+.. csv-table::
+   :header: "SQL Type", "Pandas Type"
+
+   "BOOLEAN", "boolean"
+   "TINYINT", "Int8"
+   "SMALLINT", "Int16"
+   "INTEGER", "Int32"
+   "BIGINT", "Int64"
+   "REAL", "Float32"
+   "DOUBLE", "Float64"
+   "DECIMAL", "decimal.Decimal"
+   "CHAR", "str"
+   "VARCHAR", "str"
+   "DATE", "datetime64[ns]"
+   "TIMESTAMP", "datetime64[ns]"
+   "TIME", "timedelta64[ns]"
+   "INTERVAL", "timedelta64[ns]"
+   "ARRAY", "object"
+   "BINARY", "object"
+   "VARBINARY", "object"
+   "STRUCT", "object"
+   "MAP", "object"
+
+.. note::
+   The "object" type in Pandas is dynamic and can hold any type of data.
+   Therefore, the representation of primitive types inside array, binary, struct,
+   and map columns may differ from their representation as standalone columns.
+
+Using Pandas DataFrames as Input / Output
+*****************************************
+
+You can use :meth:`.Pipeline.input_pandas` to insert records from a
+DataFrame into a Feldera table.
+
+Use :meth:`.Pipeline.listen` to subscribe to updates to a view in the form of a stream of DataFrames.
+To ensure all data is received, start listening before calling
+:meth:`.Pipeline.start`.
+
+.. highlight:: python
+.. code-block:: python
+
+    from feldera import FelderaClient, PipelineBuilder
+    import pandas as pd
+
+    sql = """
+    CREATE TABLE students (
+        name STRING,
+        id INT
+    );
+
+    CREATE TABLE grades (
+        student_id INT,
+        science INT,
+        maths INT,
+        art INT
+    );
+
+    CREATE VIEW average_scores AS
+        SELECT name, ((science + maths + art) / 3) AS average
+        FROM students JOIN grades ON id = student_id
+        ORDER BY average DESC;
+    """
+
+    # Create a client
+    client = FelderaClient("https://try.feldera.com", api_key="YOUR_API_KEY")
+    pipeline = PipelineBuilder(client, name="notebook", sql=sql).create_or_replace()
+
+    df_students = pd.read_csv('students.csv')
+    df_grades = pd.read_csv('grades.csv')
+
+    # listen for the output of the view here in the notebook;
+    # you do not need to call this if you are forwarding the data to a sink
+    out = pipeline.listen("average_scores")
+
+    pipeline.start()
+    pipeline.input_pandas("students", df_students)
+    pipeline.input_pandas("grades", df_grades)
+
+    # wait for the pipeline to complete;
+    # note that if the source is a stream, this will run indefinitely
+    pipeline.wait_for_completion(True)
+    df = out.to_pandas()
+
+    # see the result
+    print(df)
+
+    pipeline.delete()
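A quick illustration of the type table above (a minimal sketch, independent of Feldera itself): the capitalized pandas dtypes such as ``Int32`` are nullable extension dtypes, so a SQL ``NULL`` survives as ``pd.NA`` instead of silently promoting the column to ``float64`` as the plain numpy ``int32`` would.

```python
import pandas as pd

# A SQL INTEGER column containing a NULL, typed the way the table above
# prescribes. "Int32" is pandas' nullable extension dtype, so the missing
# value is kept as pd.NA rather than coercing the column to float64.
ids = pd.array([1, 2, None], dtype="Int32")
df = pd.DataFrame({"id": ids})

print(df["id"].dtype)            # Int32
print(df["id"].isna().tolist())  # [False, False, True]
```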

python/feldera/_callback_runner.py

Lines changed: 15 additions & 1 deletion
@@ -29,6 +29,7 @@ def __init__(
         self.view_name: str = view_name
         self.callback: Callable[[pd.DataFrame, int], None] = callback
         self.queue: Optional[Queue] = queue
+        self.schema: Optional[dict] = None
 
     def run(self):
         """
@@ -37,6 +38,19 @@ def run(self):
         :meta private:
         """
 
+        pipeline = self.client.get_pipeline(self.pipeline_name)
+        schema = pipeline.program_info["schema"]
+
+        if schema:
+            schemas = [relation for relation in schema["inputs"] + schema["outputs"]]
+            for schema in schemas:
+                if schema["name"] == self.view_name:
+                    self.schema = schema
+                    break
+
+        if self.schema is None:
+            raise ValueError(f"Table or View {self.view_name} not found in the pipeline schema.")
+
         # by default, we assume that the pipeline has been started
         ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted
 
@@ -65,7 +79,7 @@ def run(self):
             seq_no: int = chunk.get("sequence_number")
 
             if data is not None:
-                self.callback(dataframe_from_response([data]), seq_no)
+                self.callback(dataframe_from_response([data], schema), seq_no)
 
             if self.queue:
                 try:
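The schema lookup added to `run()` can be sketched in isolation as follows. The `inputs`/`outputs` shape and the sample relation names are assumptions modeled on the diff, not a verified Feldera payload.

```python
def find_relation(program_schema: dict, name: str) -> dict:
    # Scan input tables and output views for the relation the listener targets,
    # mirroring the loop added to CallbackRunner.run() above.
    for relation in program_schema["inputs"] + program_schema["outputs"]:
        if relation["name"] == name:
            return relation
    raise ValueError(f"Table or View {name} not found in the pipeline schema.")

# Hypothetical program schema for the students/grades example.
schema = {
    "inputs": [{"name": "students"}, {"name": "grades"}],
    "outputs": [{"name": "average_scores"}],
}
print(find_relation(schema, "average_scores")["name"])  # average_scores
```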

python/feldera/_helpers.py

Lines changed: 69 additions & 3 deletions
@@ -1,4 +1,45 @@
 import pandas as pd
+from decimal import Decimal
+
+
+def sql_type_to_pandas_type(sql_type: str):
+    """
+    Converts a SQL type to a pandas type.
+    """
+
+    match sql_type.upper():
+        case 'BOOLEAN':
+            return 'boolean'
+        case 'TINYINT':
+            return 'Int8'
+        case 'SMALLINT':
+            return 'Int16'
+        case 'INTEGER':
+            return 'Int32'
+        case 'BIGINT':
+            return 'Int64'
+        case 'REAL':
+            return 'Float32'
+        case 'DOUBLE':
+            return 'Float64'
+        case 'DECIMAL':
+            return None
+        case 'CHAR':
+            return 'str'
+        case 'VARCHAR':
+            return 'str'
+        case 'DATE' | 'TIMESTAMP':
+            return 'datetime64[ns]'
+        case 'TIME' | 'INTERVAL':
+            return 'timedelta64[ns]'
+        case 'ARRAY':
+            return None
+        case 'NULL':
+            return None
+        case 'BINARY' | 'VARBINARY':
+            return None
+        case 'STRUCT' | 'MAP':
+            return None
 
 
 def ensure_dataframe_has_columns(df: pd.DataFrame):
@@ -15,14 +56,39 @@ def ensure_dataframe_has_columns(df: pd.DataFrame):
         )
 
 
-def dataframe_from_response(buffer: list[list[dict]]):
+def dataframe_from_response(buffer: list[list[dict]], schema: dict):
     """
     Converts the response from Feldera to a pandas DataFrame.
     """
-    return pd.DataFrame([
+
+    pd_schema = {}
+
+    decimal_col = []
+
+    for column in schema['fields']:
+        column_name = column['name']
+        column_type = column['columntype']['type']
+        if column_type == 'DECIMAL':
+            decimal_col.append(column_name)
+
+        pd_schema[column_name] = sql_type_to_pandas_type(column_type)
+
+    data = [
         {**item['insert'], 'insert_delete': 1} if 'insert' in item else {**item['delete'], 'insert_delete': -1}
         for sublist in buffer for item in sublist
-    ])
+    ]
+
+    if len(decimal_col) != 0:
+        for datum in data:
+            for col in decimal_col:
+                if datum[col] is not None:
+                    datum[col] = Decimal(datum[col])
+
+    df = pd.DataFrame(data)
+    df = df.astype(pd_schema)
+
+    return df
 
 
 def chunk_dataframe(df, chunk_size=1000):
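To see what the reworked `dataframe_from_response` does end to end, here is a self-contained sketch of its two new steps: flattening change records into rows carrying an `insert_delete` weight, then converting DECIMAL strings to `decimal.Decimal` before typing the frame. The sample buffer and column names are invented for illustration.

```python
import pandas as pd
from decimal import Decimal

# Invented sample: one chunk with an insert and a delete, in the shape the
# diff above consumes ({"insert": row} / {"delete": row} records).
buffer = [[
    {"insert": {"id": 1, "price": "9.99"}},
    {"delete": {"id": 2, "price": "1.50"}},
]]

# Flatten change records into rows, tagging each with an insert_delete weight.
data = [
    {**item["insert"], "insert_delete": 1} if "insert" in item
    else {**item["delete"], "insert_delete": -1}
    for sublist in buffer for item in sublist
]

# DECIMAL values arrive as strings; convert them so arithmetic stays exact.
for datum in data:
    if datum["price"] is not None:
        datum["price"] = Decimal(datum["price"])

df = pd.DataFrame(data).astype({"id": "Int32"})  # SQL INTEGER -> nullable Int32
print(df["insert_delete"].tolist())  # [1, -1]
```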
