Skip to content

Commit dd7de66

Browse files
committed
[py] add Pipeline.query() and Pipeline.query_tabular() for adhoc queries
Adds methods to run ad-hoc queries against running or paused pipelines. In FelderaClient: adds query(), query_as_text(), query_as_json() and query_as_parquet() methods. In Pipeline: adds query() and query_tabular() methods. A new enum, `QueryResponseFormat`, is also added to represent the different response formats for ad-hoc queries. Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>
1 parent 193b770 commit dd7de66

File tree

6 files changed

+245
-19
lines changed

6 files changed

+245
-19
lines changed

python/feldera/enums.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,18 @@ def from_str(value):
184184

185185
def __eq__(self, other):
186186
return self.value == other.value
187+
188+
189+
class QueryResponseFormat(Enum):
    """Output format of the response produced by an ad-hoc query."""

    JSON = 1
    """Rows are serialized as JSON; each response row is deserialized into a
    Python dictionary."""

    PARQUET = 2
    """The result is returned as a binary blob in Parquet format."""

python/feldera/pipeline.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import time
22
import pandas
33

4-
from typing import List, Dict, Callable, Optional
4+
from typing import List, Dict, Callable, Optional, Generator, Mapping, Any
55
from queue import Queue
66

77
from feldera.rest.errors import FelderaAPIError
8-
from feldera.enums import PipelineStatus
8+
from feldera.enums import PipelineStatus, QueryResponseFormat
99
from feldera.rest.pipeline import Pipeline as InnerPipeline
1010
from feldera.rest.feldera_client import FelderaClient
1111
from feldera._callback_runner import _CallbackRunnerInstruction, CallbackRunner
@@ -344,3 +344,39 @@ def get(name: str, client: FelderaClient) -> 'Pipeline':
344344
except FelderaAPIError as err:
345345
if err.status_code == 404:
346346
raise RuntimeError(f"Pipeline with name {name} not found")
347+
348+
def query(self, query: str, fmt: QueryResponseFormat) -> Generator[Mapping[str, Any], None, None]:
    """
    Run an ad-hoc SQL query against this pipeline and return the output in
    the requested format.

    :param query: The SQL text to execute.
    :param fmt: A :class:`.QueryResponseFormat` member selecting the output:

        - `QueryResponseFormat.JSON`: a generator yielding each row as a
          Python dictionary.
        - `QueryResponseFormat.PARQUET`: the result as a binary blob in
          Parquet format.

    :return: A generator of row dictionaries for JSON, or binary data for
        Parquet.
    :raises RuntimeError: If the pipeline is neither running nor paused.
    """

    current = self.status()
    if current != PipelineStatus.RUNNING and current != PipelineStatus.PAUSED:
        raise RuntimeError("Pipeline must be running or paused to run a query")
    # The client dispatches on the format name ("JSON" / "PARQUET").
    return self.client.query(self.name, query, fmt.name)
369+
def query_tabular(self, query: str) -> str:
    """
    Run a SQL query against this pipeline and return the result as a
    human-readable table.

    :param query: The SQL text to execute.
    :return: The query result rendered as a tabular string.
    :raises RuntimeError: If the pipeline is neither running nor paused.
    """

    queryable = (PipelineStatus.RUNNING, PipelineStatus.PAUSED)
    if self.status() not in queryable:
        raise RuntimeError("Pipeline must be running or paused to run a query")
    return self.client.query_as_text(self.name, query)

python/feldera/rest/_httprequests.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def send_request(
6262
timeout=timeout,
6363
headers=headers,
6464
params=params,
65+
stream=stream,
6566
)
6667
elif isinstance(body, bytes):
6768
request = http_method(
@@ -81,8 +82,14 @@ def send_request(
8182
params=params,
8283
stream=stream,
8384
)
84-
if stream:
85-
return request
85+
86+
if stream:
87+
return request
88+
if request.headers.get("content-type") == "text/plain":
89+
return request.text
90+
elif request.headers.get("content-type") == "application/octet-stream":
91+
return request.content
92+
8693
resp = self.__validate(request)
8794
logging.debug("got response: %s", str(resp))
8895
return resp
@@ -95,9 +102,10 @@ def send_request(
95102
def get(self, path: str, params: Optional[Mapping[str, Any]] = None, stream: bool = False) -> Any:
    """
    Issue an HTTP GET request against the Feldera API.

    :param path: Request path handed to :meth:`send_request`.
    :param params: Optional query-string parameters.
    :param stream: When True, the response is streamed rather than read
        eagerly (the raw response object is returned by ``send_request``).
    """
    return self.send_request(requests.get, path, params=params, stream=stream)
101109

102110
def post(
103111
self,

python/feldera/rest/feldera_client.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import time
44
import json
55
from decimal import Decimal
6+
from typing import Generator
67

78
from feldera.rest.config import Config
89
from feldera.rest.pipeline import Pipeline
@@ -375,3 +376,94 @@ def listen_to_pipeline(
375376
break
376377
if chunk:
377378
yield json.loads(chunk, parse_float=Decimal)
379+
380+
def query(self, pipeline_name: str, query: str, fmt: str) -> str | bytes | Generator[dict, None, None]:
381+
"""
382+
Executes an ad-hoc query on the specified data pipeline.
383+
384+
:param pipeline_name: The name of the pipeline to query.
385+
:param query: The SQL query to be executed.
386+
:param fmt: The format in which to return the query result:
387+
388+
- "text": Returns a string in tabular format representing the query result.
389+
- "parquet": Returns a binary blob of data in Parquet format, which can be saved as a file.
390+
- "json": Returns a generator that yields each row of the result as a Python dictionary.
391+
392+
:return: Depending on the format (`fmt`) provided:
393+
394+
- For "text": A string representing the query result in tabular form.
395+
- For "parquet": A binary blob representing the query result in Parquet format.
396+
- For "json": A generator that produces dictionaries for each row in the query result.
397+
"""
398+
399+
match fmt.lower():
400+
case "text":
401+
return self.query_as_text(pipeline_name, query)
402+
case "parquet":
403+
return self.query_as_parquet(pipeline_name, query)
404+
case _:
405+
return self.query_as_json(pipeline_name, query)
406+
407+
def query_as_text(self, pipeline_name: str, query: str) -> str:
    """
    Execute an ad-hoc query on the specified pipeline and return the result
    as a formatted text table.

    :param pipeline_name: The name of the pipeline to query.
    :param query: The SQL text to execute.
    :return: A string containing the query result in tabular format.
    """

    return self.http.get(
        path=f"/pipelines/{pipeline_name}/query",
        params={
            # NOTE(review): the name is also encoded in the path; presumably
            # the server ignores the duplicate query parameter — confirm.
            "pipeline_name": pipeline_name,
            "sql": query,
            "format": "text",
        },
    )
426+
def query_as_parquet(self, pipeline_name: str, query: str) -> bytes:
    """
    Execute an ad-hoc query on the specified pipeline and return the result
    as a Parquet binary blob.

    :param pipeline_name: The name of the pipeline to query.
    :param query: The SQL text to execute.
    :return: The query result in Parquet format, suitable for saving to a
        file.
    """

    return self.http.get(
        path=f"/pipelines/{pipeline_name}/query",
        params={
            "pipeline_name": pipeline_name,
            "sql": query,
            "format": "parquet",
        },
    )
445+
def query_as_json(self, pipeline_name: str, query: str) -> Generator[dict, None, None]:
    """
    Execute an ad-hoc query on the specified pipeline, streaming the result
    back as newline-delimited JSON.

    :param pipeline_name: The name of the pipeline to query.
    :param query: The SQL text to execute.
    :return: A generator yielding one dictionary per result row; floats are
        parsed as :class:`~decimal.Decimal` to avoid precision loss.
    """

    resp = self.http.get(
        path=f"/pipelines/{pipeline_name}/query",
        params={
            "pipeline_name": pipeline_name,
            "sql": query,
            "format": "json",
        },
        stream=True,
    )

    # Read the stream in large chunks; empty keep-alive lines are skipped.
    for line in resp.iter_lines(chunk_size=50000000):
        if not line:
            continue
        yield json.loads(line, parse_float=Decimal)

python/tests/test_pipeline.py

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def test_get_pipeline_stats(self):
106106
TEST_CLIENT.shutdown_pipeline(name)
107107
TEST_CLIENT.delete_pipeline(name)
108108

109-
def __listener(self, name: str) -> bool:
109+
def __listener(self, name: str):
110110

111111
gen_obj = TEST_CLIENT.listen_to_pipeline(
112112
pipeline_name=name,
@@ -145,6 +145,79 @@ def test_listen_to_pipeline(self):
145145
TEST_CLIENT.shutdown_pipeline(name)
146146
TEST_CLIENT.delete_pipeline(name)
147147

148+
def test_adhoc_query_text(self):
    """An ad-hoc query in "text" format returns the result as an ASCII table."""
    data = "1\n2\n"
    name = str(uuid.uuid4())

    # Plain string: the original f-string had no placeholders (ruff F541).
    sql = """
    CREATE TABLE tbl(id INT) with ('materialized' = 'true');
    """

    pipeline = Pipeline(name, sql, {}, {})
    pipeline = TEST_CLIENT.create_pipeline(pipeline)

    # try/finally so the pipeline is cleaned up even if the query fails;
    # the original leaked the pipeline on any exception before shutdown.
    try:
        TEST_CLIENT.start_pipeline(name)
        TEST_CLIENT.push_to_pipeline(name, "tbl", "csv", data)
        tbl = TEST_CLIENT.query(pipeline.name, "SELECT * FROM tbl", "text")
    finally:
        TEST_CLIENT.shutdown_pipeline(name)
        TEST_CLIENT.delete_pipeline(name)

    # NOTE(review): row order (2 before 1) and column padding are assumed to
    # match the server's rendering — confirm against the running service.
    expected = """+----+
| id |
+----+
| 2 |
| 1 |
+----+"""

    assert tbl == expected
175+
def test_adhoc_query_parquet(self):
    """An ad-hoc query in "parquet" format returns a Parquet binary blob."""
    data = "1\n2\n"
    name = str(uuid.uuid4())

    # Plain string: the original f-string had no placeholders (ruff F541).
    sql = """
    CREATE TABLE tbl(id INT) with ('materialized' = 'true');
    """

    pipeline = Pipeline(name, sql, {}, {})
    pipeline = TEST_CLIENT.create_pipeline(pipeline)

    # try/finally so the pipeline is cleaned up even if the query fails.
    try:
        TEST_CLIENT.start_pipeline(name)
        TEST_CLIENT.push_to_pipeline(name, "tbl", "csv", data)
        got: bytes = TEST_CLIENT.query(pipeline.name, "SELECT * FROM tbl", "parquet")
    finally:
        TEST_CLIENT.shutdown_pipeline(name)
        TEST_CLIENT.delete_pipeline(name)

    # "PAR1" magic number plus the first page-header bytes of the expected
    # encoding; startswith() is the idiomatic form of `find(...) == 0`.
    expected_prefix = b'PAR1\x15\x04\x15\x10\x15\x14L\x15\x04\x15\x00\x12'
    assert got.startswith(expected_prefix)
197+
def test_adhoc_query_json(self):
    """An ad-hoc query in "json" format yields one dictionary per row."""
    data = "1\n2\n"
    name = str(uuid.uuid4())

    # Plain string: the original f-string had no placeholders (ruff F541).
    sql = """
    CREATE TABLE tbl(id INT) with ('materialized' = 'true');
    """

    pipeline = Pipeline(name, sql, {}, {})
    pipeline = TEST_CLIENT.create_pipeline(pipeline)

    # try/finally so the pipeline is cleaned up even if the query fails.
    try:
        TEST_CLIENT.start_pipeline(name)
        TEST_CLIENT.push_to_pipeline(name, "tbl", "csv", data)
        # Materialize the generator before shutting the pipeline down, and
        # compare whole lists: the original's zip(got, expected) truncated to
        # the shorter sequence, so an empty result passed vacuously.
        got = list(TEST_CLIENT.query(pipeline.name, "SELECT * FROM tbl", "json"))
    finally:
        TEST_CLIENT.shutdown_pipeline(name)
        TEST_CLIENT.delete_pipeline(name)

    # NOTE(review): row order (2 before 1) is assumed to match the server.
    expected = [{"id": 2}, {"id": 1}]
    assert got == expected
148221

149222
if __name__ == '__main__':
150223
unittest.main()

python/tests/test_variant.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from tests import TEST_CLIENT
88
from decimal import Decimal
99

10+
1011
class TestVariant(unittest.TestCase):
1112
def test_local(self):
1213
sql = f"""
@@ -33,9 +34,9 @@ def test_local(self):
3334
pipeline = PipelineBuilder(TEST_CLIENT, name="test_variant", sql=sql).create_or_replace()
3435

3536
input_strings = [
36-
{"json":"{\"name\":\"Bob\",\"scores\":[8,10]}"},
37-
{"json":"{\"name\":\"Dunce\",\"scores\":[3,4]}"},
38-
{"json":"{\"name\":\"John\",\"scores\":[9,10]}"}
37+
{"json": "{\"name\":\"Bob\",\"scores\":[8,10]}"},
38+
{"json": "{\"name\":\"Dunce\",\"scores\":[3,4]}"},
39+
{"json": "{\"name\":\"John\",\"scores\":[9,10]}"}
3940
]
4041

4142
input_json = [
@@ -47,25 +48,25 @@ def test_local(self):
4748
expected_strings = [j | {"insert_delete": 1} for j in input_strings]
4849

4950
expected_average = [
50-
{ "name": "Bob", "average": Decimal(9) },
51-
{ "name": "Dunce", "average": Decimal(3.5) },
52-
{ "name": "John", "average": Decimal(9.5) }
51+
{"name": "Bob", "average": Decimal(9)},
52+
{"name": "Dunce", "average": Decimal(3.5)},
53+
{"name": "John", "average": Decimal(9.5)}
5354
]
5455
for datum in expected_average:
5556
datum.update({"insert_delete": 1})
5657

5758
expected_typed = [
58-
{ "name": "Bob", "scores": [8, 10] },
59-
{ "name": "Dunce", "scores": [3, 4] },
60-
{ "name": "John", "scores": [9, 10] }
59+
{"name": "Bob", "scores": [8, 10]},
60+
{"name": "Dunce", "scores": [3, 4]},
61+
{"name": "John", "scores": [9, 10]}
6162
]
6263
for datum in expected_typed:
6364
datum.update({"insert_delete": 1})
6465

6566
expected_variant = [
66-
{"json": { "name": "Bob", "scores": [8, 10] }},
67-
{"json": { "name": "Dunce", "scores": [3, 4] }},
68-
{"json": { "name": "John", "scores": [9, 10] }}
67+
{"json": {"name": "Bob", "scores": [8, 10]}},
68+
{"json": {"name": "Dunce", "scores": [3, 4]}},
69+
{"json": {"name": "John", "scores": [9, 10]}}
6970
]
7071
for datum in expected_variant:
7172
datum.update({"insert_delete": 1})
@@ -98,5 +99,6 @@ def test_local(self):
9899

99100
pipeline.delete()
100101

102+
101103
if __name__ == '__main__':
102104
unittest.main()

0 commit comments

Comments
 (0)