[`create_model_bundle_from_triton_enhanced_runnable_image_v2`](/api/client/#launch.client.LaunchClient.create_model_bundle_from_triton_enhanced_runnable_image_v2), and [`create_model_bundle_from_streaming_enhanced_runnable_image_v2`](/api/client/#launch.client.LaunchClient.create_model_bundle_from_streaming_enhanced_runnable_image_v2).
The first directly pickles a user-specified `load_predict_fn`, a function which
loads the model and returns a `predict_fn`, a function which takes in a request.
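As a minimal sketch of that load-once / predict-many shape (the model, request, and response below are stand-ins, not Launch's exact signature):

```python
# Illustrative sketch only: a `load_predict_fn` loads the model a single
# time, then returns a `predict_fn` closure that handles each request.
def load_predict_fn():
    model = {"bias": 1}  # stand-in for loading real model weights

    def predict_fn(request: dict) -> dict:
        # called once per request, reusing the already-loaded model
        return {"result": request["x"] + model["bias"]}

    return predict_fn

predict = load_predict_fn()
predict({"x": 2})  # -> {'result': 3}
```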
requests at port 5005 using HTTP and exposes `POST /predict` and
`GET /readyz` endpoints.
The fourth is a variant of the third that also starts an instance of the NVidia
Triton framework for efficient model serving.
The fifth is a variant of the third that responds with a stream of SSEs at `POST /stream` (the user
can decide whether `POST /predict` is also exposed).
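For context, each server-sent event on such a stream is a `data:` line followed by a blank line. A sketch of what a token stream looks like on the wire (the token payloads here are made up):

```python
# Illustrative only: formatting tokens as server-sent events (SSEs),
# the wire format a streaming response body carries.
def to_sse(tokens):
    for token in tokens:
        yield f"data: {token}\n\n"  # one event per token

body = "".join(to_sse(["Hel", "lo"]))
# body is "data: Hel\n\ndata: lo\n\n"
```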
Each of these modes of creating a model bundle is called a "Flavor".
* You want to use a `RunnableImageFlavor`
* You also want to support token streaming while the model is generating
=== "Creating From Callables"
```py
import os
# ...
```

`docs/concepts/model_endpoints.md`
of CPUs, amount of memory, GPU count, and type of GPU.
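For illustration, those resource settings can be pictured as a config fragment like the following (the key names and values here are assumptions, not verbatim API parameters; consult the `LaunchClient` API reference for the exact names):

```python
# Hypothetical config fragment; keys are illustrative, not verbatim API names.
endpoint_resources = {
    "cpus": 3,
    "memory": "12Gi",
    "gpus": 1,
    "gpu_type": "nvidia-ampere-a10",  # example GPU type string
}
```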
Endpoints can be asynchronous, synchronous, or streaming. Asynchronous endpoints return
a future immediately after receiving a request, and the future can be used to
retrieve the prediction once it is ready. Synchronous endpoints return the
prediction directly after receiving a request. Streaming endpoints are variants of synchronous
endpoints that return a stream of SSEs instead of a single HTTP response.
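The asynchronous pattern can be sketched with Python's own futures (a real async endpoint hands back a task handle you poll over HTTP, not a local `Future` object, so this is an analogy rather than the actual API):

```python
from concurrent.futures import ThreadPoolExecutor
import time

def slow_predict(x):
    time.sleep(0.05)  # stand-in for model inference latency
    return x * 2

with ThreadPoolExecutor(max_workers=1) as pool:
    future = pool.submit(slow_predict, 21)  # returns immediately
    # ... the caller is free to do other work here ...
    result = future.result()  # blocks until the prediction is ready

result  # -> 42
```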
Streaming model endpoints are variants of sync model endpoints that are useful for tasks with strict
requirements on perceived latency. Streaming endpoints are more expensive than async endpoints.
!!! Note
    Streaming model endpoints require at least 1 `min_worker`.