feat(generativeai): Add sample for Prompt Optimiser (GoogleCloudPlatform#12624)

Sita04 · gcf-owl-bot[bot] · web-flow · commit 765b109f3c02 · 2024-09-24T15:07:36.000+02:00
* feat(genai): add prompt optimizer sample * add config files and tests * update comments * lint fix * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * lint fix and update region tag * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * lint and quota fix * refactor and headercheck exclusion * test dir path * test moving config files and update bucket * update headercheck * update acc to review * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * update location * update --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/.github/header-checker-lint.yml b/.github/header-checker-lint.yml
@@ -25,6 +25,8 @@ ignoreFiles:
   - "dlp/snippets/resources/harmless.txt"
   - "dlp/snippets/resources/test.txt"
   - "dlp/snippets/resources/term_list.txt"
+  - "generative_ai/prompts/test_resources/sample_prompt_template.txt"
+  - "generative_ai/prompts/test_resources/sample_system_instruction.txt"
   - "service_extensions/callouts/add_header/service_pb2.py"
   - "service_extensions/callouts/add_header/service_pb2_grpc.py"
 
diff --git a/generative_ai/prompts/prompt_optimizer.py b/generative_ai/prompts/prompt_optimizer.py
@@ -0,0 +1,73 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def optimize_prompts(
+    project: str,
+    location: str,
+    staging_bucket: str,
+    configuration_path: str,
+) -> str:
+    """Submit a Vertex AI custom job that runs the prompt optimizer.
+
+    The optimizer improves prompts by evaluating the model's response to
+    sample prompts against the specified evaluation metric(s).
+
+    Args:
+        project: Google Cloud Project ID.
+        location: Location where you want to run the Vertex AI prompt optimizer.
+        staging_bucket: Google Cloud Storage bucket used to store outputs and
+            metadata. For example, gs://bucket-name
+        configuration_path: URI of the configuration file in your Google Cloud
+            Storage bucket. For example, gs://bucket-name/configuration.json.
+
+    Returns:
+        The resource name of the created job, of the form
+        projects/project-id/locations/location/customJobs/job-id
+    """
+    #  [START generativeaionvertexai_prompt_optimizer]
+    from google.cloud import aiplatform
+
+    # TODO(developer): Update & uncomment below line
+    # project = "your-gcp-project-id"
+    # location = "location"
+    # staging_bucket = "output-bucket-gcs-uri"
+    # configuration_path = "configuration-file-gcs-uri"
+    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)
+
+    # Container image the custom job runs; it receives the configuration
+    # file location through the --config argument.
+    optimizer_image = (
+        "us-docker.pkg.dev/vertex-ai-restricted/builtin-algorithm/apd:preview_v1_0"
+    )
+    optimizer_pool = {
+        "replica_count": 1,
+        "container_spec": {
+            "image_uri": optimizer_image,
+            "args": [f"--config={configuration_path}"],
+        },
+        "machine_spec": {"machine_type": "n1-standard-4"},
+    }
+
+    job = aiplatform.CustomJob(
+        display_name="Prompt Optimizer example",
+        worker_pool_specs=[optimizer_pool],
+    )
+    job.submit()
+    print(f"Job resource name: {job.resource_name}")
+
+    #  [END generativeaionvertexai_prompt_optimizer]
+    return job.resource_name
+
+
+if __name__ == "__main__":
+    # Script entry point: project, bucket and configuration path are read
+    # from the environment; the location is fixed to us-central1.
+    optimize_prompts(
+        project=os.environ["PROJECT_ID"],
+        location="us-central1",
+        staging_bucket=os.environ["PROMPT_OPTIMIZER_BUCKET_NAME"],
+        configuration_path=os.environ["JSON_CONFIG_PATH"],
+    )
diff --git a/generative_ai/prompts/test_prompt_optimizer.py b/generative_ai/prompts/test_prompt_optimizer.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import time
+from typing import Callable
+
+from google.cloud import aiplatform, storage
+from google.cloud.aiplatform import CustomJob
+from google.cloud.aiplatform_v1 import JobState
+from google.cloud.exceptions import NotFound
+from google.cloud.storage import transfer_manager
+
+from prompt_optimizer import optimize_prompts
+
+import pytest
+
+# Project the test runs against, read from the GOOGLE_CLOUD_PROJECT env variable.
+PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+# Bucket created and deleted by the bucket_name fixture. The gs:// paths in
+# test_resources/sample_configuration.json use this same bucket name.
+STAGING_BUCKET_NAME = "prompt_optimizer_bucket"
+# Directory (relative to this file) holding the sample configuration files.
+CONFIGURATION_DIRECTORY = "test_resources"
+CONFIGURATION_FILENAME = "sample_configuration.json"
+LOCATION = "us-central1"
+# Blob-name prefix the test looks for in the bucket after the job has run.
+OUTPUT_PATH = "instruction"
+
+# Shared Cloud Storage client; created at import time.
+STORAGE_CLIENT = storage.Client()
+
+
+def _clean_resources(bucket_resource_name: str) -> None:
+    """Delete the named bucket and every blob in it, if the bucket exists.
+
+    Args:
+        bucket_resource_name: Name of the Cloud Storage bucket to remove.
+    """
+    try:
+        stale_bucket = STORAGE_CLIENT.get_bucket(bucket_resource_name)
+    except NotFound:
+        # Nothing to clean up.
+        print(f"Bucket {bucket_resource_name} cannot be accessed")
+    else:
+        # Blobs are removed first, then the bucket itself.
+        for stale_blob in stale_bucket.list_blobs():
+            stale_blob.delete()
+        stale_bucket.delete()
+
+
+def substitute_env_variable(data: dict, target_key: str, env_var_name: str) -> dict:
+    """Recursively replace the value of every ``target_key`` entry in ``data``.
+
+    Dicts and lists are walked and updated in place; any other value is
+    returned unchanged.
+
+    Args:
+        data: Parsed configuration (dict, list, or leaf value).
+        target_key: Dictionary key whose value should be replaced.
+        env_var_name: Environment variable supplying the replacement value;
+            the value becomes None when the variable is not set.
+
+    Returns:
+        The same ``data`` object after substitution.
+    """
+    if isinstance(data, list):
+        data[:] = [
+            substitute_env_variable(item, target_key, env_var_name) for item in data
+        ]
+        return data
+    if not isinstance(data, dict):
+        return data
+    for name in list(data):
+        if name == target_key:
+            data[name] = os.environ.get(env_var_name)
+        else:
+            data[name] = substitute_env_variable(
+                data[name], target_key, env_var_name
+            )
+    return data
+
+
+def update_json() -> dict:
+    """Load the sample configuration and fill in the runtime project.
+
+    Returns:
+        The parsed configuration with every "project" entry replaced by the
+        value of the PROJECT_ID environment variable.
+    """
+    resources_dir = os.path.join(os.path.dirname(__file__), CONFIGURATION_DIRECTORY)
+    with open(os.path.join(resources_dir, CONFIGURATION_FILENAME), "r") as config_file:
+        config = json.load(config_file)
+    # Only the "project" key is substituted; all other entries are kept as-is.
+    return substitute_env_variable(config, "project", "PROJECT_ID")
+
+
+@pytest.fixture(scope="session")
+def bucket_name() -> str:
+    """Create and populate the staging bucket, yield its name, then delete it.
+
+    The bucket receives the configuration JSON (with the project substituted
+    at runtime) and the sample prompt files from the test resources directory.
+
+    Yields:
+        The name of the newly created bucket.
+
+    Raises:
+        RuntimeError: If any of the sample files fails to upload.
+    """
+    filenames = [
+        "sample_prompt_template.txt",
+        "sample_prompts.jsonl",
+        "sample_system_instruction.txt",
+    ]
+    # cleanup existing stale resources
+    _clean_resources(STAGING_BUCKET_NAME)
+    # create bucket
+    bucket = STORAGE_CLIENT.bucket(STAGING_BUCKET_NAME)
+    bucket.storage_class = "STANDARD"
+    new_bucket = STORAGE_CLIENT.create_bucket(bucket, location="us")
+    # From here on the bucket exists, so it is removed even when the
+    # remaining setup steps fail.
+    try:
+        # update JSON to substitute env variables
+        substituted_data = update_json()
+        # convert the JSON data to a byte string
+        json_bytes = json.dumps(substituted_data, indent=2).encode("utf-8")
+        # upload substituted JSON file to the bucket
+        new_bucket.blob(CONFIGURATION_FILENAME).upload_from_string(json_bytes)
+        # upload config files to the bucket
+        results = transfer_manager.upload_many_from_filenames(
+            new_bucket,
+            filenames,
+            source_directory=os.path.join(
+                os.path.dirname(__file__), CONFIGURATION_DIRECTORY
+            ),
+        )
+        # upload_many_from_filenames returns per-file exceptions instead of
+        # raising them, so a failed upload has to be detected explicitly.
+        for filename, result in zip(filenames, results):
+            if isinstance(result, Exception):
+                raise RuntimeError(f"Failed to upload {filename}") from result
+        yield new_bucket.name
+    finally:
+        _clean_resources(new_bucket.name)
+
+
+def _main_test(test_func: Callable) -> None:
+    """Submit a job through ``test_func``, wait for it to end, then delete it.
+
+    The job is polled every 10 seconds until it reports a succeeded or failed
+    state, or until the timeout elapses.
+
+    Args:
+        test_func: Callable that submits the job and returns its resource name.
+    """
+    job_resource_name: str = ""
+    timeout = 900  # seconds
+    # wait for the job to complete
+    try:
+        job_resource_name = test_func()
+        start_time = time.time()
+        while (
+            get_job(job_resource_name).state
+            not in [JobState.JOB_STATE_SUCCEEDED, JobState.JOB_STATE_FAILED]
+            and time.time() - start_time < timeout
+        ):
+            time.sleep(10)
+    finally:
+        # delete job; skipped when submission itself failed, because there is
+        # then no job to look up and the lookup error would hide the real one
+        if job_resource_name:
+            get_job(job_resource_name).delete()
+
+
+def test_prompt_optimizer(bucket_name: str) -> None:
+    """Run the prompt optimizer sample and check that it produced output.
+
+    Args:
+        bucket_name: Name of the staging bucket provided by the fixture.
+    """
+    _main_test(
+        test_func=lambda: optimize_prompts(
+            PROJECT_ID,
+            LOCATION,
+            f"gs://{bucket_name}",
+            f"gs://{bucket_name}/{CONFIGURATION_FILENAME}",
+        )
+    )
+    # list_blobs() always returns an iterator object, so it has to be consumed
+    # to find out whether any output blob actually exists.
+    output_blobs = list(
+        STORAGE_CLIENT.get_bucket(bucket_name).list_blobs(
+            prefix=OUTPUT_PATH, max_results=1
+        )
+    )
+    assert output_blobs, f"No output found under gs://{bucket_name}/{OUTPUT_PATH}"
+
+
+def get_job(job_resource_name: str) -> CustomJob:
+    """Fetch the custom job with the given resource name.
+
+    The lookup uses the module-level PROJECT_ID and LOCATION.
+    """
+    job = aiplatform.CustomJob.get(
+        resource_name=job_resource_name,
+        project=PROJECT_ID,
+        location=LOCATION,
+    )
+    return job
diff --git a/generative_ai/prompts/test_resources/sample_configuration.json b/generative_ai/prompts/test_resources/sample_configuration.json
@@ -0,0 +1,11 @@
+{
+"project": "$PROJECT_ID",
+"system_instruction_path": "gs://prompt_optimizer_bucket/sample_system_instruction.txt",
+"prompt_template_path": "gs://prompt_optimizer_bucket/sample_prompt_template.txt",
+"target_model": "gemini-1.5-flash-001",
+"eval_metrics_types": ["safety"],
+"optimization_mode": "instruction",
+"input_data_path": "gs://prompt_optimizer_bucket/sample_prompts.jsonl",
+"output_path": "gs://prompt_optimizer_bucket",
+"eval_metrics_weights": [1]
+}
diff --git a/generative_ai/prompts/test_resources/sample_prompt_template.txt b/generative_ai/prompts/test_resources/sample_prompt_template.txt
@@ -0,0 +1 @@
+Question: Do {{animal_name}} {{animal_activity}}?
diff --git a/generative_ai/prompts/test_resources/sample_prompts.jsonl b/generative_ai/prompts/test_resources/sample_prompts.jsonl
@@ -0,0 +1,5 @@
+{"animal_name": "Bears", "animal_activity": "Eat grapes"}
+{"animal_name": "Cows", "animal_activity": "swim in the ocean"}
+{"animal_name": "Bees", "animal_activity": "Ride donkeys"}
+{"animal_name": "Cats", "animal_activity": "go to school"}
+{"animal_name": "Lions", "animal_activity": "hunt"}
diff --git a/generative_ai/prompts/test_resources/sample_system_instruction.txt b/generative_ai/prompts/test_resources/sample_system_instruction.txt
@@ -0,0 +1 @@
+Based on the following text respond to the questions.'\n' Be concise, and answer \"I don't know\" if the response cannot be found in the provided text.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Question: Do {{animal_name}} {{animal_activity}}?`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Based on the following text respond to the questions.'\n' Be concise, and answer \"I don't know\" if the response cannot be found in the provided text.`