Commit e00904e
chore: add internal benchmarking script (#760)
* chore: add internal benchmarking script and readme
* archive benchwrapper to subdirectory
* blacken lint
* fix typo
* update script with preconditions and upload from disk
* download to file
* update multiprocessing and readme
* clean up
* update benchmarking script
* update checksumming options and default num processes
* replace tempfile package usage
1 parent bf13a62 commit e00904e

File tree

7 files changed: +331 −12 lines changed
Lines changed: 36 additions & 12 deletions

# python-storage benchmarking

**This is not an officially supported Google product**

This benchmarking script is used by Storage client library maintainers to benchmark various workloads and collect metrics in order to improve the performance of the library.

Currently the benchmark runs a Write-1-Read-3 (W1R3) workload and measures the two usual QoS performance attributes: latency and throughput.

## Run example

This runs 10,000 iterations of Write-1-Read-3 on files of 5 KiB to 16 KiB, and writes output to a CSV file (default `benchmarking<TIMESTAMP>.csv`):

```bash
$ cd python-storage
$ pip install -e .  # install google.cloud.storage locally
$ cd tests/perf
$ python3 benchmarking.py --num_samples 10000 --max_size 16384
```

## CLI parameters

| Parameter | Description | Possible values | Default |
| --------- | ----------- | --------------- |:-------:|
| `--min_size` | minimum object size in bytes | any positive integer | `5120` (5 KiB) |
| `--max_size` | maximum object size in bytes | any positive integer | `2147483648` (2 GiB) |
| `--num_samples` | number of W1R3 iterations | any positive integer | `1000` |
| `--r` | bucket region for benchmarks | any GCS region | `US` |
| `--p` | number of processes (multiprocessing enabled) | any positive integer | `16` (recommended not to exceed 16) |
| `--o` | file to output results to | any file path | `benchmarking<TIMESTAMP>.csv` |
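The flags in the table correspond to a plain `argparse` parser. A minimal sketch for illustration only, with defaults taken from the table above (the real definitions live in `benchmarking.py`):

```python
import argparse
import time

TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")


def build_parser():
    """Build a parser mirroring the CLI parameters documented above."""
    parser = argparse.ArgumentParser(description="W1R3 benchmarking sketch")
    parser.add_argument("--min_size", type=int, default=5120)
    parser.add_argument("--max_size", type=int, default=2147483648)
    parser.add_argument("--num_samples", type=int, default=1000)
    parser.add_argument("--r", type=str, default="US")
    parser.add_argument("--p", type=int, default=16)
    parser.add_argument("--o", type=str, default=f"benchmarking{TIMESTAMP}.csv")
    return parser


# Same invocation as the run example above
args = build_parser().parse_args(["--num_samples", "10000", "--max_size", "16384"])
```

Unspecified flags fall back to their defaults, so the run example benchmarks 5 KiB (`--min_size` default) to 16 KiB objects in the `US` region.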
## Workload definition and CSV headers

For each invocation of the benchmark, write a new object of a random size between `min_size` and `max_size`. After the successful write, download the object in full three times. For each of the four operations, record the following fields:

| Field | Description |
| ----- | ----------- |
| Op | the name of the operation (WRITE, READ[{0,1,2}]) |
| ObjectSize | the size of the object in bytes |
| LibBufferSize | configured to use the [library default of 100 MiB](https://github.com/googleapis/python-storage/blob/main/google/cloud/storage/blob.py#L135) |
| Crc32cEnabled | bool: whether crc32c was computed for the operation |
| MD5Enabled | bool: whether MD5 was computed for the operation |
| ApiName | defaults to JSON |
| ElapsedTimeUs | the elapsed time in microseconds the operation took |
| Status | completion state of the operation [OK, FAIL] |
| RunID | timestamp from the benchmarking run |
| AppBufferSize | N/A |
| CpuTimeUs | N/A |
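Each of the four operations is timed individually and reported in microseconds. A minimal, self-contained sketch of the per-operation timing and row shape described above (`fake_op` is a stand-in for a real upload or download, and only a few of the CSV fields are shown):

```python
import time


def timed_us(op):
    """Run op() and return elapsed wall time in microseconds."""
    start = time.monotonic_ns()
    op()
    end = time.monotonic_ns()
    return round((end - start) / 1000)  # nanoseconds -> microseconds


def fake_op():
    time.sleep(0.01)  # stand-in for a real WRITE or READ against GCS


elapsed = timed_us(fake_op)
row = {
    "Op": "WRITE",
    "ObjectSize": 5120,
    "ElapsedTimeUs": elapsed,  # a 10 ms sleep yields at least ~10,000 µs
    "Status": "OK",
}
```

A monotonic clock is used rather than wall-clock time so the measurement is immune to system clock adjustments during a long run.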
Lines changed: 274 additions & 0 deletions

```python
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Performance benchmarking script. This is not an officially supported Google product."""

import argparse
import csv
import logging
import multiprocessing
import os
import random
import time
import uuid

from functools import partial, update_wrapper

from google.cloud import storage


##### DEFAULTS & CONSTANTS #####
HEADER = [
    "Op",
    "ObjectSize",
    "AppBufferSize",
    "LibBufferSize",
    "Crc32cEnabled",
    "MD5Enabled",
    "ApiName",
    "ElapsedTimeUs",
    "CpuTimeUs",
    "Status",
    "RunID",
]
CHECKSUM = ["md5", "crc32c", None]
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
DEFAULT_API = "JSON"
DEFAULT_BUCKET_LOCATION = "US"
DEFAULT_MIN_SIZE = 5120  # 5 KiB
DEFAULT_MAX_SIZE = 2147483648  # 2 GiB
DEFAULT_NUM_SAMPLES = 1000
DEFAULT_NUM_PROCESSES = 16
DEFAULT_LIB_BUFFER_SIZE = 104857600  # https://github.com/googleapis/python-storage/blob/main/google/cloud/storage/blob.py#L135
NOT_SUPPORTED = -1


def log_performance(func):
    """Log latency and throughput output per operation call."""
    # Holds benchmarking results for each operation
    res = {
        "ApiName": DEFAULT_API,
        "RunID": TIMESTAMP,
        "CpuTimeUs": NOT_SUPPORTED,
        "AppBufferSize": NOT_SUPPORTED,
        "LibBufferSize": DEFAULT_LIB_BUFFER_SIZE,
    }

    try:
        elapsed_time = func()
    except Exception as e:
        logging.exception(
            f"Caught an exception while running operation {func.__name__}\n {e}"
        )
        res["Status"] = "FAIL"
        elapsed_time = NOT_SUPPORTED
    else:
        res["Status"] = "OK"

    checksum = func.keywords.get("checksum")
    num = func.keywords.get("num", None)
    res["ElapsedTimeUs"] = elapsed_time
    res["ObjectSize"] = func.keywords.get("size")
    res["Crc32cEnabled"] = checksum == "crc32c"
    res["MD5Enabled"] = checksum == "md5"
    res["Op"] = func.__name__
    if res["Op"] == "READ":
        res["Op"] += f"[{num}]"

    return [
        res["Op"],
        res["ObjectSize"],
        res["AppBufferSize"],
        res["LibBufferSize"],
        res["Crc32cEnabled"],
        res["MD5Enabled"],
        res["ApiName"],
        res["ElapsedTimeUs"],
        res["CpuTimeUs"],
        res["Status"],
        res["RunID"],
    ]


def WRITE(bucket, blob_name, checksum, size, **kwargs):
    """Perform an upload and return latency."""
    blob = bucket.blob(blob_name)
    file_path = f"{os.getcwd()}/{uuid.uuid4().hex}"
    # Create a random file locally on disk
    with open(file_path, "wb") as file_obj:
        file_obj.write(os.urandom(size))

    start_time = time.monotonic_ns()
    blob.upload_from_filename(file_path, checksum=checksum, if_generation_match=0)
    end_time = time.monotonic_ns()

    elapsed_time = round(
        (end_time - start_time) / 1000
    )  # convert nanoseconds to microseconds

    # Clean up the local file
    cleanup_file(file_path)

    return elapsed_time


def READ(bucket, blob_name, checksum, **kwargs):
    """Perform a download and return latency."""
    blob = bucket.blob(blob_name)
    if not blob.exists():
        raise Exception("Blob does not exist. Previous WRITE failed.")

    file_path = f"{os.getcwd()}/{blob_name}"
    with open(file_path, "wb") as file_obj:
        start_time = time.monotonic_ns()
        blob.download_to_file(file_obj, checksum=checksum)
        end_time = time.monotonic_ns()

    elapsed_time = round(
        (end_time - start_time) / 1000
    )  # convert nanoseconds to microseconds

    # Clean up the local file
    cleanup_file(file_path)

    return elapsed_time


def cleanup_file(file_path):
    """Clean up local file on disk."""
    try:
        os.remove(file_path)
    except Exception as e:
        logging.exception(f"Caught an exception while deleting local file\n {e}")


def _wrapped_partial(func, *args, **kwargs):
    """Helper method to create a partial and propagate the name and doc of the original function."""
    partial_func = partial(func, *args, **kwargs)
    update_wrapper(partial_func, func)
    return partial_func


def _generate_func_list(bucket_name, min_size, max_size):
    """Generate Write-1-Read-3 workload."""
    # generate random size in bytes using a uniform distribution
    size = random.randrange(min_size, max_size)
    blob_name = f"{TIMESTAMP}-{uuid.uuid4().hex}"

    # generate random checksumming type: md5, crc32c or None
    idx_checksum = random.choice([0, 1, 2])
    checksum = CHECKSUM[idx_checksum]

    func_list = [
        _wrapped_partial(
            WRITE,
            storage.Client().bucket(bucket_name),
            blob_name,
            size=size,
            checksum=checksum,
        ),
        *[
            _wrapped_partial(
                READ,
                storage.Client().bucket(bucket_name),
                blob_name,
                size=size,
                checksum=checksum,
                num=i,
            )
            for i in range(3)
        ],
    ]
    return func_list


def benchmark_runner(args):
    """Run benchmarking iterations."""
    results = []
    for func in _generate_func_list(args.b, args.min_size, args.max_size):
        results.append(log_performance(func))

    return results


def main(args):
    # Create a storage bucket to run benchmarking, if it does not already exist
    client = storage.Client()
    bucket = client.bucket(args.b)
    if not bucket.exists():
        bucket = client.create_bucket(args.b, location=args.r)

    # Launch benchmark_runner using multiprocessing
    p = multiprocessing.Pool(args.p)
    pool_output = p.map(benchmark_runner, [args for _ in range(args.num_samples)])
    p.close()
    p.join()

    # Output to CSV file
    with open(args.o, "w") as file:
        writer = csv.writer(file)
        writer.writerow(HEADER)
        for result in pool_output:
            for row in result:
                writer.writerow(row)
    print(f"Successfully ran benchmarking. Please find your output log at {args.o}")

    # Clean up and delete the bucket
    try:
        bucket.delete(force=True)
    except Exception as e:
        logging.exception(f"Caught an exception while deleting bucket\n {e}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--min_size",
        type=int,
        default=DEFAULT_MIN_SIZE,
        help="Minimum object size in bytes",
    )
    parser.add_argument(
        "--max_size",
        type=int,
        default=DEFAULT_MAX_SIZE,
        help="Maximum object size in bytes",
    )
    parser.add_argument(
        "--num_samples",
        type=int,
        default=DEFAULT_NUM_SAMPLES,
        help="Number of iterations",
    )
    parser.add_argument(
        "--p",
        type=int,
        default=DEFAULT_NUM_PROCESSES,
        help="Number of processes (multiprocessing enabled)",
    )
    parser.add_argument(
        "--r", type=str, default=DEFAULT_BUCKET_LOCATION, help="Bucket location"
    )
    parser.add_argument(
        "--o",
        type=str,
        default=f"benchmarking{TIMESTAMP}.csv",
        help="File to output results to",
    )
    parser.add_argument(
        "--b",
        type=str,
        default=f"benchmarking{TIMESTAMP}",
        help="Storage bucket name",
    )
    args = parser.parse_args()

    main(args)
```
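The CSV the script writes can be post-processed with the standard library alone. A sketch, using a hypothetical two-row output in the script's format, that derives throughput in MiB/s from the `ObjectSize` and `ElapsedTimeUs` columns:

```python
import csv
import io

# Hypothetical benchmarking output rows in the script's CSV format.
sample = """Op,ObjectSize,AppBufferSize,LibBufferSize,Crc32cEnabled,MD5Enabled,ApiName,ElapsedTimeUs,CpuTimeUs,Status,RunID
WRITE,1048576,-1,104857600,True,False,JSON,500000,-1,OK,20220101-000000
READ[0],1048576,-1,104857600,True,False,JSON,250000,-1,OK,20220101-000000
"""


def throughput_mib_s(row):
    """ObjectSize bytes over ElapsedTimeUs microseconds, in MiB/s."""
    size_mib = int(row["ObjectSize"]) / (1024 * 1024)
    elapsed_s = int(row["ElapsedTimeUs"]) / 1_000_000
    return size_mib / elapsed_s


rows = list(csv.DictReader(io.StringIO(sample)))
speeds = {row["Op"]: throughput_mib_s(row) for row in rows}
# 1 MiB in 0.5 s -> 2.0 MiB/s; 1 MiB in 0.25 s -> 4.0 MiB/s
```

Rows with `Status` of `FAIL` carry an `ElapsedTimeUs` of `-1` and should be filtered out before aggregating.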
Lines changed: 21 additions & 0 deletions

# storage benchwrapper

main.py is a gRPC wrapper around the storage library for benchmarking purposes.

## Running

```bash
$ export STORAGE_EMULATOR_HOST=http://localhost:8080
$ pip install grpcio
$ cd storage
$ pip install -e .  # install google.cloud.storage locally
$ cd tests/perf
$ python3 benchwrapper.py --port 8081
```

## Re-generating protos

```bash
$ pip install grpcio-tools
$ python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. *.proto
```

packages/google-cloud-storage/tests/perf/benchwrapper.py renamed to packages/google-cloud-storage/tests/perf/benchwrapper/benchwrapper.py

File renamed without changes.

packages/google-cloud-storage/tests/perf/storage.proto renamed to packages/google-cloud-storage/tests/perf/benchwrapper/storage.proto

File renamed without changes.

packages/google-cloud-storage/tests/perf/storage_pb2.py renamed to packages/google-cloud-storage/tests/perf/benchwrapper/storage_pb2.py

File renamed without changes.

packages/google-cloud-storage/tests/perf/storage_pb2_grpc.py renamed to packages/google-cloud-storage/tests/perf/benchwrapper/storage_pb2_grpc.py

File renamed without changes.
