From 4ec02616ebb8ffe6247d7af90e26af42966ac3c3 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Mon, 7 Aug 2023 14:41:53 -0700 Subject: [PATCH 1/3] Vertex SDK - LLM - Added tuning samples for the code generation models - `code-bison` --- .../list_tuned_code_generation_models.py | 35 ++++++ .../list_tuned_code_generation_models_test.py | 42 +++++++ generative_ai/tune_code_generation_model.py | 70 ++++++++++++ .../tune_code_generation_model_test.py | 107 ++++++++++++++++++ 4 files changed, 254 insertions(+) create mode 100644 generative_ai/list_tuned_code_generation_models.py create mode 100644 generative_ai/list_tuned_code_generation_models_test.py create mode 100644 generative_ai/tune_code_generation_model.py create mode 100644 generative_ai/tune_code_generation_model_test.py diff --git a/generative_ai/list_tuned_code_generation_models.py b/generative_ai/list_tuned_code_generation_models.py new file mode 100644 index 00000000000..da8450863d4 --- /dev/null +++ b/generative_ai/list_tuned_code_generation_models.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START aiplatform_sdk_list_tuned_code_generation_models] + +import vertexai +from vertexai.preview.language_models import CodeGenerationModel + + +def list_tuned_code_generation_models( + project_id: str, + location: str, +) -> None: + """List tuned models.""" + vertexai.init(project=project_id, location=location) + model = CodeGenerationModel.from_pretrained("code-bison@001") + tuned_model_names = model.list_tuned_model_names() + print(tuned_model_names) + # [END aiplatform_sdk_list_tuned_code_generation_models] + return tuned_model_names + + +if __name__ == "__main__": + list_tuned_models() diff --git a/generative_ai/list_tuned_code_generation_models_test.py b/generative_ai/list_tuned_code_generation_models_test.py new file mode 100644 index 00000000000..624cb8ade3a --- /dev/null +++ b/generative_ai/list_tuned_code_generation_models_test.py @@ -0,0 +1,42 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import backoff +from google.api_core.exceptions import ResourceExhausted +from google.cloud import aiplatform + +import list_tuned_code_generation_models + + +_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") +_LOCATION = "us-central1" + + +@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10) +def test_list_tuned_code_generation_models() -> None: + tuned_model_names = list_tuned_code_generation_models.list_tuned_code_generation_models( + _PROJECT_ID, + _LOCATION, + ) + filtered_models_counter = 0 + for tuned_model_name in tuned_model_names: + model_registry = aiplatform.models.ModelRegistry(model=tuned_model_name) + if ( + "Vertex LLM Test Fixture " + "(list_tuned_models_test.py::test_list_tuned_models)" + ) in model_registry.get_version_info("1").model_display_name: + filtered_models_counter += 1 + assert filtered_models_counter == 0 diff --git a/generative_ai/tune_code_generation_model.py b/generative_ai/tune_code_generation_model.py new file mode 100644 index 00000000000..f4db042546b --- /dev/null +++ b/generative_ai/tune_code_generation_model.py @@ -0,0 +1,70 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START aiplatform_sdk_tune_code_generation_model] +from __future__ import annotations + + +from google.auth import default +import pandas as pd +import vertexai +from vertexai.preview.language_models import CodeGenerationModel + +credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) + + +def tune_code_generation_model( + project_id: str, + location: str, + training_data: pd.DataFrame | str, + train_steps: int = 300, +) -> None: + """Tune a new model, based on prompt-response data. + + "training_data" can be either the GCS URI of a file formatted in JSONL format + (for example: training_data=f'gs://{bucket}/{filename}.jsonl'), or a pandas + DataFrame. Each training example should be a JSONL record with two keys, for + example: + { + "input_text": <input prompt>, + "output_text": <associated output> + }, + or the pandas DataFrame should contain two columns: + ['input_text', 'output_text'] + with rows for each training example. + + Args: + project_id: GCP Project ID, used to initialize vertexai + location: GCP Region, used to initialize vertexai + training_data: GCS URI of jsonl file or pandas dataframe of training data + train_steps: Number of training steps to use when tuning the model. 
+ """ + vertexai.init(project=project_id, location=location, credentials=credentials) + model = CodeGenerationModel.from_pretrained("code-bison@001") + + model.tune_model( + training_data=training_data, + # Optional: + train_steps=train_steps, + tuning_job_location="europe-west4", + tuned_model_location=location, + ) + + print(model._job.status) + # [END aiplatform_sdk_tune_code_generation_model] + return model + + +if __name__ == "__main__": + tuning() diff --git a/generative_ai/tune_code_generation_model_test.py b/generative_ai/tune_code_generation_model_test.py new file mode 100644 index 00000000000..ccd4888f974 --- /dev/null +++ b/generative_ai/tune_code_generation_model_test.py @@ -0,0 +1,107 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import uuid + +from google.cloud import aiplatform +from google.cloud import storage +from google.cloud.aiplatform.compat.types import pipeline_state +import pytest +from vertexai.preview.language_models import TextGenerationModel + +import tune_code_generation_model + +_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") +_LOCATION = "us-central1" +_BUCKET = os.environ["CLOUD_STORAGE_BUCKET"] + + +def get_model_display_name(tuned_model: TextGenerationModel) -> str: + language_model_tuning_job = tuned_model._job + pipeline_job = language_model_tuning_job._job + return dict(pipeline_job._gca_resource.runtime_config.parameter_values)[ + "model_display_name" + ] + + +def upload_to_gcs(bucket: str, name: str, data: str) -> None: + client = storage.Client() + bucket = client.get_bucket(bucket) + blob = bucket.blob(name) + blob.upload_from_string(data) + + +def download_from_gcs(bucket: str, name: str) -> str: + client = storage.Client() + bucket = client.get_bucket(bucket) + blob = bucket.blob(name) + data = blob.download_as_bytes() + return "\n".join(data.decode().splitlines()[:10]) + + +def delete_from_gcs(bucket: str, name: str) -> None: + client = storage.Client() + bucket = client.get_bucket(bucket) + blob = bucket.blob(name) + blob.delete() + + +@pytest.fixture(scope="function") +def training_data_filename() -> str: + temp_filename = f"{uuid.uuid4()}.jsonl" + data = download_from_gcs( + "cloud-samples-data", "ai-platform/generative_ai/headline_classification.jsonl" + ) + upload_to_gcs(_BUCKET, temp_filename, data) + try: + yield f"gs://{_BUCKET}/{temp_filename}" + finally: + delete_from_gcs(_BUCKET, temp_filename) + + +def teardown_model( + tuned_model: TextGenerationModel, training_data_filename: str +) -> None: + for tuned_model_name in tuned_model.list_tuned_model_names(): + model_registry = aiplatform.models.ModelRegistry(model=tuned_model_name) + if ( + training_data_filename + in model_registry.get_version_info("1").model_display_name + ): + 
display_name = model_registry.get_version_info("1").model_display_name + for endpoint in aiplatform.Endpoint.list(): + for _ in endpoint.list_models(): + if endpoint.display_name == display_name: + endpoint.undeploy_all() + endpoint.delete() + aiplatform.Model(model_registry.model_resource_name).delete() + + +@pytest.mark.skip("Blocked on b/277959219") +def test_tuning_code_generation_model(training_data_filename: str) -> None: + """Takes approx. 20 minutes.""" + tuned_model = tune_code_generation_model.tune_code_generation_model( + training_data=training_data_filename, + project_id=_PROJECT_ID, + location=_LOCATION, + train_steps=1, + ) + try: + assert ( + tuned_model._job.status + == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED + ) + finally: + teardown_model(tuned_model, training_data_filename) From b1bb728ce8cc8a7809a772de21e738458aab2104 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Mon, 7 Aug 2023 15:15:00 -0700 Subject: [PATCH 2/3] Bumping the SDK version --- generative_ai/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generative_ai/requirements.txt b/generative_ai/requirements.txt index 3f2377d0986..f1d40be161b 100644 --- a/generative_ai/requirements.txt +++ b/generative_ai/requirements.txt @@ -1,4 +1,4 @@ pandas==1.3.5; python_version == '3.7' pandas==2.0.1; python_version > '3.7' -google-cloud-aiplatform[pipelines]==1.28.1 +google-cloud-aiplatform[pipelines]==1.29.0 google-auth==2.17.3 From 56b409c97f1de827a830e7983a76b1d443b1e2f0 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Mon, 7 Aug 2023 19:05:18 -0700 Subject: [PATCH 3/3] Fixed the called function names --- generative_ai/list_tuned_code_generation_models.py | 2 +- generative_ai/tune_code_generation_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/generative_ai/list_tuned_code_generation_models.py b/generative_ai/list_tuned_code_generation_models.py index da8450863d4..c9e2ea85f8e 100644 --- 
a/generative_ai/list_tuned_code_generation_models.py +++ b/generative_ai/list_tuned_code_generation_models.py @@ -32,4 +32,4 @@ def list_tuned_code_generation_models( if __name__ == "__main__": - list_tuned_models() + list_tuned_code_generation_models() diff --git a/generative_ai/tune_code_generation_model.py b/generative_ai/tune_code_generation_model.py index f4db042546b..4474f7ea8fa 100644 --- a/generative_ai/tune_code_generation_model.py +++ b/generative_ai/tune_code_generation_model.py @@ -67,4 +67,4 @@ def tune_code_generation_model( if __name__ == "__main__": - tuning() + tune_code_generation_model()