py: kafka connector (#1807)

abhizer · web-flow · commit 4fee55f5677d · 2024-06-04T08:20:10.000+05:45
* py: kafka connector * also rename `Client` to `FelderaClient` Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> py: update the kafka test Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> py: update documentation * include an example for Kafka connector in the python docs * refactor `Client` to `FelderaClient` * add an entry in the changelog Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> py: update documentation Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> * py: HTTP GET input connector (#1816) * py: HTTP GET input connector Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> * py: refactor: common function to check format to create connector Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> * py: update docs, rename `UpdateFormat` to `JSONUpdateFormat` Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> --------- Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com> --------- Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [0.17.0] - 2024-05-28
 
+- [Python] Added support for Kafka connector via Python SDK
+  ([#1807](https://github.com/feldera/feldera/pull/1807))
+- [Python] Added support for HTTP GET connector via Python SDK
+  ([#1816](https://github.com/feldera/feldera/pull/1816))
+
 ### Added
 
 - Added backpressure mode to the `/egress` endpoint, which applies
diff --git a/python/.gitignore b/python/.gitignore
@@ -10,3 +10,4 @@ docs/feldera.rst
 docs/feldera.rest.rst
 build
 feldera.egg-info
+UNKNOWN.egg-info
diff --git a/python/docs/examples.rst b/python/docs/examples.rst
@@ -64,3 +64,135 @@ to to call :meth:`.SQLContext.listen` before you call
 
     # see the result
     print(df)
+
+
+Kafka
+******
+
+To set up Kafka as a source, use :meth:`.SQLContext.connect_source_kafka`; to set it up as a sink, use
+:meth:`.SQLContext.connect_sink_kafka`.
+
+Both of these methods require a ``config`` which is a dictionary, and ``fmt`` which is a
+`data format configuration <https://www.feldera.com/docs/api/json>`_ that is either a
+:class:`.JSONFormat` or :class:`.CSVFormat`.
+
+The input config looks like the following:
+
+.. highlight:: python
+.. code-block:: python
+
+    source_config = {
+        "topics": [INPUT_TOPIC],
+        "bootstrap.servers": KAFKA_SERVER_URL,
+        "auto.offset.reset": "earliest",
+    }
+
+Here,
+
+- ``topics`` is a list of Kafka topics to subscribe to for input data.
+- ``bootstrap.servers`` is the ``host:port`` of the Kafka server.
+- Similarly, other
+  `relevant options supported by librdkafka <https://github.com/confluentinc/librdkafka/blob/master/CONFIGURATION.md>`_
+  can also be set here, such as ``auto.offset.reset``.
+
+More on Kafka as an input connector at: https://www.feldera.com/docs/connectors/sources/kafka
+
+Similarly, the output config looks like the following:
+
+.. highlight:: python
+.. code-block:: python
+
+    sink_config = {
+        "topic": OUTPUT_TOPIC,
+        "bootstrap.servers": PIPELINE_TO_KAFKA_SERVER,
+        "auto.offset.reset": "earliest",
+    }
+
+Here the only notable difference is:
+
+- ``topic`` is the name of the topic to write the output data to.
+
+More on Kafka as the output connector at: https://www.feldera.com/docs/connectors/sinks/kafka
+
+.. warning::
+    Kafka is a streaming data source; therefore, :meth:`.SQLContext.run_to_completion` will run forever.
+
+.. highlight:: python
+.. code-block:: python
+
+    import time
+
+    from feldera import SQLContext, SQLSchema
+    from feldera.formats import JSONFormat, JSONUpdateFormat
+
+    TABLE_NAME = "example"
+    VIEW_NAME = "example_count"
+    KAFKA_SERVER = "localhost:9092"
+
+    sql = SQLContext('kafka', 'http://localhost:8080').get_or_create()
+    sql.register_table(TABLE_NAME, SQLSchema({"id": "INT NOT NULL PRIMARY KEY"}))
+    sql.register_view(VIEW_NAME, f"SELECT COUNT(*) as num_rows FROM {TABLE_NAME}")
+
+    source_config = {
+        "topics": ["example_topic"],
+        "bootstrap.servers": KAFKA_SERVER,
+        "auto.offset.reset": "earliest",
+    }
+
+    sink_config = {
+        "topic": "example_topic_out",
+        "bootstrap.servers": KAFKA_SERVER,
+        "auto.offset.reset": "earliest",
+    }
+
+    # Data format configuration
+    fmt = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)
+
+    sql.connect_source_kafka(TABLE_NAME, "kafka_conn_in", source_config, fmt)
+    sql.connect_sink_kafka(VIEW_NAME, "kafka_conn_out", sink_config, fmt)
+
+    out = sql.listen(VIEW_NAME)
+    sql.start()
+    time.sleep(10)
+    sql.shutdown()
+    df = out.to_pandas()
+
+
+HTTP GET
+*********
+
+
+Feldera can ingest data from a user-provided URL into a SQL table.
+The file is fetched using HTTP with the GET method.
+
+More on the HTTP GET connector at: https://www.feldera.com/docs/connectors/sources/http-get
+
+.. note::
+    The JSON used as input for Feldera should be in
+    `newline-delimited JSON (NDJSON) format <https://www.feldera.com/docs/api/json/#encoding-multiple-changes>`_.
+
+
+.. highlight:: python
+.. code-block:: python
+
+    from feldera import SQLContext, SQLSchema
+    from feldera.formats import JSONFormat, JSONUpdateFormat
+
+    # TEST_CLIENT is a FelderaClient instance created beforehand
+    sql = SQLContext("test_http_get", TEST_CLIENT).get_or_create()
+
+    TBL_NAME = "items"
+    VIEW_NAME = "s"
+
+    sql.register_table(TBL_NAME, SQLSchema({"id": "INT", "name": "STRING"}))
+
+    sql.register_view(VIEW_NAME, f"SELECT * FROM {TBL_NAME}")
+
+    path = "https://feldera-basics-tutorial.s3.amazonaws.com/part.json"
+
+    fmt = JSONFormat().with_update_format(JSONUpdateFormat.InsertDelete).with_array(False)
+    sql.connect_source_url(TBL_NAME, "part", path, fmt)
+
+    out = sql.listen(VIEW_NAME)
+
+    sql.run_to_completion()
+
+    df = out.to_pandas()
+
diff --git a/python/docs/introduction.rst b/python/docs/introduction.rst
@@ -29,10 +29,10 @@ Replace ``{BRANCH_NAME}`` with the name of the branch you want to install from.
 Key Concepts
 ************
 
-* :class:`feldera.FelderaClient` or :class:`.Client`
+* :class:`.FelderaClient`
    - This is the actual HTTP client used to make requests to your Feldera 
      instance.
-   - creating an instance of :class:`.Client` is usually the first thing you 
+   - creating an instance of :class:`.FelderaClient` is usually the first thing you
      will do while working with Feldera.
 
    - Example:
@@ -62,7 +62,7 @@ Key Concepts
 
       - The first parameter is the name of this SQL context. By default, this is
         the name used in both Feldera Program and Pipeline.
-      - The second parameter here is :class:`.Client` that we created above.
+      - The second parameter here is :class:`.FelderaClient` that we created above.
 
 * :meth:`.SQLContext.run_to_completion`
    - Runs this Feldera pipeline to completion. Normally this means until the EoF
diff --git a/python/feldera/__init__.py b/python/feldera/__init__.py
@@ -1,3 +1,3 @@
-from feldera.rest.client import Client as FelderaClient
+from feldera.rest.feldera_client import FelderaClient
 from feldera.sql_context import SQLContext
 from feldera.sql_schema import SQLSchema
diff --git a/python/feldera/_helpers.py b/python/feldera/_helpers.py
@@ -1,4 +1,5 @@
 import pandas as pd
+from feldera.formats import JSONFormat, CSVFormat
 
 
 def ensure_dataframe_has_columns(df: pd.DataFrame):
@@ -23,3 +24,11 @@ def dataframe_from_response(buffer: list[list[dict]]):
         {**item['insert'], 'insert_delete': 1} if 'insert' in item else {**item['delete'], 'insert_delete': -1}
         for sublist in buffer for item in sublist
     ])
+
+
+def validate_connector_input_format(fmt: JSONFormat | CSVFormat):
+    """
+    Check that ``fmt`` can be used as the data format of a connector.
+
+    :param fmt: The data format configuration to validate.
+    :raises ValueError: If ``fmt`` is neither a :class:`.JSONFormat` nor a
+        :class:`.CSVFormat`, or if it is a :class:`.JSONFormat` whose config
+        has no ``update_format`` entry (or has it set to ``None``).
+    """
+
+    if not isinstance(fmt, (JSONFormat, CSVFormat)):
+        raise ValueError("format must be JSONFormat or CSVFormat")
+
+    # Only the JSON format carries an update format; CSV needs no further checks.
+    if isinstance(fmt, JSONFormat):
+        update_format = fmt.config.get("update_format")
+        if update_format is None:
+            raise ValueError("update_format not set in the format config; consider using: .with_update_format()")
diff --git a/python/feldera/formats.py b/python/feldera/formats.py
@@ -0,0 +1,109 @@
+from typing import Optional
+from typing_extensions import Self
+from enum import Enum
+
+
+class JSONUpdateFormat(Enum):
+    """
+    The JSON data change event formats that are supported.
+
+    Every element of a JSON-formatted input stream describes an update to
+    one or more records of an input table. The members of this enum name
+    the different ways such an update can be represented.
+
+    https://www.feldera.com/docs/api/json/#the-insertdelete-format
+    """
+
+    InsertDelete = 1
+    """
+    Insert/delete format.
+
+    Each element in the input stream consists of an "insert" or "delete"
+    command and a record to be inserted to or deleted from the input table.
+
+    Example: `{"insert": {"id": 1, "name": "Alice"}, "delete": {"id": 2, "name": "Bob"}}`
+    Here, `id` and `name` are the columns in the table.
+    """
+
+    Raw = 2
+    """
+    Raw input format.
+
+    This format is suitable for insert-only streams (no deletions).
+    Each element in the input stream contains a record without any
+    additional envelope that gets inserted in the input table.
+
+    Example: `{"id": 1, "name": "Alice"}`
+    Here, `id` and `name` are the columns in the table.
+    """
+
+    def __str__(self):
+        """
+        Return the lowercase name of this format: ``"insert_delete"`` or ``"raw"``.
+        """
+        if self is JSONUpdateFormat.InsertDelete:
+            return "insert_delete"
+        if self is JSONUpdateFormat.Raw:
+            return "raw"
+
+
+class JSONFormat:
+    """
+    Used to represent data ingested and output from Feldera in the JSON format.
+
+    The ``with_*`` methods update this instance in place and return it, so
+    that calls can be chained.
+    """
+
+    def __init__(self, config: Optional[dict] = None):
+        """
+        Creates a new JSONFormat instance.
+
+        :param config: Optional. Configuration for the JSON format. When it is
+            omitted or empty, ``{"array": False}`` is used. The dictionary is
+            copied, so later ``with_*`` calls never modify the caller's object.
+        """
+
+        # Shallow copy: the builder methods below assign keys on self.config and
+        # must not leak those writes into a dictionary owned by the caller.
+        self.config: dict = dict(config) if config else {"array": False}
+
+    def with_update_format(self, update_format: JSONUpdateFormat) -> Self:
+        """
+        Specifies the format of the data change events in the JSON data stream.
+
+        :param update_format: The update format; its string form is stored
+            under the ``update_format`` key of the config.
+        :return: This instance, to allow chaining.
+        """
+
+        self.config["update_format"] = str(update_format)
+        return self
+
+    def with_array(self, array: bool) -> Self:
+        """
+        Set to `True` if updates in this stream are packaged into JSON arrays.
+
+        Example: `[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]`
+
+        :param array: The value stored under the ``array`` key of the config.
+        :return: This instance, to allow chaining.
+        """
+
+        self.config["array"] = array
+        return self
+
+    def to_dict(self) -> dict:
+        """
+        Serialize to a dict to be used in the API request.
+
+        :meta private:
+        """
+        return {
+            "name": "json",
+            "config": self.config
+        }
+
+
+class CSVFormat:
+    """
+    Data format descriptor for CSV data ingested into and output from Feldera.
+
+    https://www.feldera.com/docs/api/csv
+    """
+
+    def to_dict(self) -> dict:
+        """
+        Build the dictionary representation used in the API request.
+
+        :meta private:
+        """
+        return dict(name="csv")
diff --git a/python/feldera/rest/__init__.py b/python/feldera/rest/__init__.py
@@ -8,4 +8,4 @@
 
 """
 
-from feldera.rest.client import Client
+from feldera.rest.feldera_client import FelderaClient
diff --git a/python/feldera/rest/config.py b/python/feldera/rest/config.py
@@ -3,7 +3,7 @@
 
 class Config:
     """
-    Client's credentials and configuration parameters
+    :class:`.FelderaClient`'s credentials and configuration parameters
     """
 
     def __init__(
diff --git a/python/feldera/rest/feldera_client.py b/python/feldera/rest/feldera_client.py
@@ -15,7 +15,7 @@ def _prepare_boolean_input(value: bool) -> str:
     return "true" if value else "false"
 
 
-class Client:
+class FelderaClient:
     """
     A client for the Feldera HTTP API
 
diff --git a/python/feldera/rest/pipeline.py b/python/feldera/rest/pipeline.py
@@ -50,3 +50,10 @@ def default_config() -> Mapping[str, Any]:
         return {
             "workers": 8
         }
+
+    def current_state(self) -> Optional[str]:
+        """
+        Look up the current status of this pipeline.
+
+        :return: The value stored under ``"current_status"`` in ``self.state``,
+            or ``None`` when that key is absent.
+        """
+
+        # NOTE(review): assumes self.state is a dict-like object - confirm.
+        status = self.state.get("current_status")
+        return status
diff --git a/python/feldera/sql_context.py b/python/feldera/sql_context.py
diff --git a/python/tests/__init__.py b/python/tests/__init__.py
diff --git a/python/tests/test_wireframes.py b/python/tests/test_wireframes.py

Original file line number	Diff line number	Diff line change
`@@ -8,4 +8,4 @@`
`8`	`8`
`9`	`9`	`"""`
`10`	`10`
`11`		`-from feldera.rest.client import Client`
	`11`	`+from feldera.rest.feldera_client import FelderaClient`