
Commit 9823f23

abhizer authored and ryzhyk committed
py: refactor(connect_input_pandas) -> input_pandas
* `input_pandas` must now be called after starting a pipeline

Signed-off-by: Abhinav Gyawali <22275402+abhizer@users.noreply.github.com>
1 parent 3dd6e11 commit 9823f23

9 files changed: 85 additions & 84 deletions
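The behavioral core of the commit is an ordering change in the client API: DataFrames are no longer buffered locally and flushed on `start()`, they are pushed to a live pipeline. A minimal before/after sketch; the table name, DataFrame contents, server URL, and `SQLContext` constructor arguments below are illustrative assumptions, not taken from this commit:

```python
import pandas as pd

from feldera import SQLContext  # import path assumed

sql = SQLContext("demo", "http://localhost:8080")  # constructor args assumed
sql.register_table_from_sql("CREATE TABLE students (id INT, name VARCHAR);")

df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})

# Before this commit: data was buffered, then flushed when the pipeline started.
# sql.connect_source_pandas("students", df)
# sql.start()

# After this commit: the pipeline must already be running before pushing.
sql.start()
sql.input_pandas("students", df)
sql.wait_for_completion(shutdown=True)
```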


demo/project_demo10-FraudDetectionDeltaLake/notebook.ipynb

Lines changed: 1 addition & 0 deletions
```diff
@@ -333,6 +333,7 @@
 "hfeature = sql.listen(\"feature\")\n",
 "\n",
 "# Process full snapshot of the input tables and compute a dataset with feature vectors.\n",
+"sql.start()\n",
 "sql.wait_for_completion(shutdown=True)\n",
 "\n",
 "# Read computed feature vectors into a Pandas dataframe.\n",
```

demo/project_demo10-FraudDetectionDeltaLake/run.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -121,6 +121,7 @@ def main():

     # Process full snapshot of the input tables and compute a dataset
     # with feature vectors for use in model training and testing.
+    sql.start()
     sql.wait_for_completion(shutdown=True)

     features_pd = hfeature.to_pandas()
```

docs/use_cases/fraud_detection/fraud_detection.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -298,6 +298,7 @@ hfeature = sql.listen("feature")

 # Process full snapshot of the input tables and compute a dataset
 # with feature vectors for use in model training and testing.
+sql.start()
 sql.wait_for_completion(shutdown=True)

 features_pd = hfeature.to_pandas()
```

python/docs/examples.rst

Lines changed: 16 additions & 12 deletions
```diff
@@ -1,12 +1,12 @@
 Examples
 ========

-Pandas
-*******
+Using Pandas DataFrames as Input / Output
+*******************************************


 Working wth pandas DataFrames in Feldera is fairly straight forward.
-You can use :meth:`.SQLContext.connect_source_pandas` to connect a
+You can use :meth:`.SQLContext.input_pandas` to connect a
 DataFrame to a feldera table as the data source.

 To listen for response from feldera, in the form of DataFrames
@@ -48,15 +48,18 @@ To ensure all data is received start listening before calling
     query = f"SELECT name, ((science + maths + art) / 3) as average FROM {TBL_NAMES[0]} JOIN {TBL_NAMES[1]} on id = student_id ORDER BY average DESC"
     sql.register_output_view(view_name, query)

-    # connect the source (a pandas Dataframe in this case) to the tables
-    sql.connect_source_pandas(TBL_NAMES[0], df_students)
-    sql.connect_source_pandas(TBL_NAMES[1], df_grades)
-
     # listen for the output of the view here in the notebook
     # you do not need to call this if you are forwarding the data to a sink
     out = sql.listen(view_name)

-    # run this to completion
+    # start the pipeline
+    sql.start()
+
+    # connect the source (a pandas Dataframe in this case) to the tables
+    sql.input_pandas(TBL_NAMES[0], df_students)
+    sql.input_pandas(TBL_NAMES[1], df_grades)
+
+    # wait for the pipeline to complete
     # note that if the source is a stream, this will run indefinitely
     sql.wait_for_completion(shutdown=True)

@@ -67,8 +70,8 @@ To ensure all data is received start listening before calling
    print(df)


-Kafka
-******
+Using Kafka as Data Source / Sink
+***********************************

 To setup Kafka as the source use :meth:`.SQLContext.connect_source_kafka` and as the sink use
 :meth:`.SQLContext.connect_sink_kafka`.
@@ -157,8 +160,8 @@ More on Kafka as the output connector at: https://www.feldera.com/docs/connector
    df = out.to_pandas()


-HTTP GET
-*********
+Ingesting data from a URL
+**************************


 Feldera can ingest data from a user-provided URL into a SQL table.
@@ -193,6 +196,7 @@ More on the HTTP GET connector at: https://www.feldera.com/docs/connectors/sourc

 out = sql.listen(VIEW_NAME)

+sql.start()
 sql.wait_for_completion(shutdown=True)

 df = out.to_pandas()
```
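The ordering in the rewritten example is deliberate: `listen` is attached before `start()` so no output rows are missed, and `input_pandas` follows `start()` because the pipeline must exist before it can accept data. A condensed sketch of that flow with illustrative data; the `SQLContext` constructor arguments are assumptions, and table/view registration is elided as in the hunk above:

```python
import pandas as pd

from feldera import SQLContext  # import path assumed

TBL_NAMES = ["students", "grades"]
view_name = "average_scores"

df_students = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
df_grades = pd.DataFrame(
    {"student_id": [1, 2], "science": [90, 70], "maths": [80, 60], "art": [70, 50]}
)

sql = SQLContext("example", "http://localhost:8080")  # constructor args assumed
# ... register the tables and the output view as shown in the diff above ...

out = sql.listen(view_name)   # subscribe before start so no rows are dropped
sql.start()                   # the pipeline must be up before pushing
sql.input_pandas(TBL_NAMES[0], df_students)
sql.input_pandas(TBL_NAMES[1], df_grades)
sql.wait_for_completion(shutdown=True)  # runs indefinitely for streaming sources
df = out.to_pandas()
print(df)
```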

python/feldera/_helpers.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -40,4 +40,4 @@ def chunk_dataframe(df, chunk_size=1000):
     """

     for i in range(0, len(df), chunk_size):
-        yield df.iloc[i:i + chunk_size]
\ No newline at end of file
+        yield df.iloc[i:i + chunk_size]
```
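For context, `chunk_dataframe` (whose trailing newline is fixed above) is the helper `input_pandas` now calls directly to split a DataFrame into fixed-size batches for the push endpoint. A standalone sketch of the same logic, runnable without Feldera:

```python
import pandas as pd

def chunk_dataframe(df, chunk_size=1000):
    # Mirrors python/feldera/_helpers.py: yield successive row slices.
    for i in range(0, len(df), chunk_size):
        yield df.iloc[i:i + chunk_size]

df = pd.DataFrame({"x": range(2500)})
print([len(chunk) for chunk in chunk_dataframe(df)])  # [1000, 1000, 500]
```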

python/feldera/formats.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -169,7 +169,7 @@ def __init__(
         self,
         config: Optional[dict] = None,
         schema: Optional[str] = None,
-        skip_schema_id: Optional[bool] = None,
+        skip_schema_id: Optional[bool] = False,
         registry_urls: Optional[list[str]] = None,
         registry_headers: Optional[Mapping[str, str]] = None,
         registry_proxy: Optional[str] = None,
```

python/feldera/sql_context.py

Lines changed: 32 additions & 45 deletions
```diff
@@ -71,11 +71,6 @@ def __init__(
         self.tables: Dict[str, SQLTable] = {}
         self.types: Dict[str, str] = {}

-        # TODO: to be used for schema inference
-        self.todo_tables: Dict[str, Optional[SQLTable]] = {}
-
-        self.http_input_buffer: list[Dict[str, pd.DataFrame]] = []
-
         # buffer that stores all input connectors to be created
         # this is a Mapping[table_name -> list[Connector]]
         self.input_connectors_buffer: Dict[str, list[Connector]] = {}
@@ -167,28 +162,6 @@ def __setup_output_listeners(self):
         # block until the callback runner is ready
         queue.join()

-    def __push_http_inputs(self):
-        """
-        Internal function used to push the input data to the pipeline.
-
-        :meta private:
-        """
-
-        for input_buffer in self.http_input_buffer:
-            for tbl_name, data in input_buffer.items():
-                for datum in chunk_dataframe(data):
-                    self.client.push_to_pipeline(
-                        self.pipeline_name,
-                        tbl_name,
-                        "json",
-                        datum.to_json(orient='records', date_format='epoch'),
-                        json_flavor='pandas',
-                        array=True,
-                        serialize=False
-                    )
-
-        self.http_input_buffer.clear()
-
     def create(self) -> Self:
         """
         Sets the build mode to CREATE, meaning that the pipeline will be created from scratch.
@@ -253,8 +226,6 @@ def register_table(self, table_name: str, schema: Optional[SQLSchema] = None, dd

         if schema:
             self.tables[table_name] = SQLTable(table_name, schema=schema)
-        else:
-            self.todo_tables[table_name] = None

     def register_table_from_sql(self, ddl: str):
         """
@@ -272,34 +243,47 @@ def register_table_from_sql(self, ddl: str):

         self.tables[name] = SQLTable(name, ddl)

-    def connect_source_pandas(self, table_name: str, df: pandas.DataFrame, flush: bool = False):
+    def input_pandas(self, table_name: str, df: pandas.DataFrame, force: bool = False):
         """
         Adds a pandas DataFrame to the input buffer of the SQLContext, to be pushed to the pipeline.
         Note that if the pipeline is running, the data will not be pushed if `flush` is False.

         :param table_name: The name of the table.
         :param df: The pandas DataFrame to be pushed to the pipeline.
-        :param flush: If True, the data will be pushed to the pipeline immediately. Defaults to False.
+        :param force: `True` to push data even if the pipeline is paused. `False` by default.
         """

-        if flush and self.pipeline_status() != PipelineStatus.RUNNING:
-            raise RuntimeError("Pipeline must be running to flush the data")
+        status = self.pipeline_status()
+        if status not in [
+            PipelineStatus.RUNNING,
+            PipelineStatus.PAUSED,
+        ]:
+            raise RuntimeError("Pipeline must be running or paused to push data")
+
+        if not force and status == PipelineStatus.PAUSED:
+            raise RuntimeError("Pipeline is paused, set force=True to push data")

         ensure_dataframe_has_columns(df)

         tbl = self.tables.get(table_name)

-        if tbl:
+        if tbl is None:
+            raise ValueError(f"Cannot push to table '{table_name}' as it is not registered yet")
+        else:
             # tbl.validate_schema(df) TODO: something like this would be nice
-            self.http_input_buffer.append({tbl.name: df})
-            if flush:
-                self.__push_http_inputs()
+            for datum in chunk_dataframe(df):
+                self.client.push_to_pipeline(
+                    self.pipeline_name,
+                    table_name,
+                    "json",
+                    datum.to_json(orient='records', date_format='epoch'),
+                    json_flavor='pandas',
+                    array=True,
+                    serialize=False,
+                    force=force,
+                )
             return

-        # TODO: handle schema inference
-        if tbl is None:
-            raise ValueError(f"Cannot push to table {table_name} as it is not registered yet")
-
     def register_local_view(self, name: str, query: str):
         """
         Registers a local view with the SQLContext.
@@ -639,8 +623,6 @@ def start(self):

         self.resume()

-        self.__push_http_inputs()
-
     def wait_for_idle(
         self,
         idle_interval_s: float = 5.0,
@@ -740,7 +722,12 @@ def delete(self, delete_program: bool = True, delete_connectors: bool = False):
         :param delete_connectors: If True, also deletes the connectors associated with the pipeline. False by default.
         """

-        if self.pipeline_status() != PipelineStatus.SHUTDOWN:
+        current_status = self.pipeline_status()
+
+        if current_status == PipelineStatus.NOT_FOUND:
+            raise RuntimeError("Attempting to delete a pipeline that hasn't been created yet")
+
+        if current_status not in [PipelineStatus.SHUTDOWN, PipelineStatus.FAILED]:
             raise RuntimeError("Pipeline must be shutdown before deletion")

         self.client.delete_pipeline(self.pipeline_name)
@@ -754,4 +741,4 @@ def delete(self, delete_program: bool = True, delete_connectors: bool = False):
                 self.client.delete_connector(conn.name)
         for connector in self.output_connectors_buffer.values():
             for conn in connector:
-                self.client.delete_connector(conn.name)
\ No newline at end of file
+                self.client.delete_connector(conn.name)
```
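With the HTTP input buffer gone, the failure modes of `input_pandas` surface at the call site instead of being deferred to `start()`. A hedged sketch of the caller-facing behavior implied by the new checks, continuing with an `sql` SQLContext like the one above; the `pause()` call is an assumption, inferred from the `resume()` used by `start()`:

```python
import pandas as pd

df = pd.DataFrame({"id": [1], "name": ["Alice"]})

# Pipeline never started:
sql.input_pandas("students", df)
# -> RuntimeError: Pipeline must be running or paused to push data

# Pipeline paused (pause() is assumed to exist):
sql.pause()
sql.input_pandas("students", df)
# -> RuntimeError: Pipeline is paused, set force=True to push data

sql.input_pandas("students", df, force=True)  # accepted while paused
```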

python/requirements-dev.txt

Lines changed: 1 addition & 2 deletions
```diff
@@ -1,3 +1,2 @@
 sphinx==7.3.7
-sphinx_rtd_theme==2.0.0
-enum-tools[sphinx]
+sphinx_rtd_theme==2.0.0
```
