Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions .github/scripts/gha_matrix_balancer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import json
import argparse

# Complex, handwritten libraries with long-running test suites.
# The load balancer isolates these onto dedicated VMs to prevent bottlenecks.
# NOTE: membership is tested against os.path.basename() of each package path,
# so entries must exactly match the package's directory name.
HEAVY_LIFTERS = {
    "google-cloud-spanner",
    "google-cloud-compute",
    "google-cloud-compute-v1beta",
    "google-cloud-discoveryengine"
}

def get_valid_packages(directories):
    """Return only the directories that contain a ``noxfile.py``.

    Args:
        directories: Candidate package directory paths.

    Returns:
        The subset of *directories* that hold a noxfile.py, in input order.
    """
    valid = []
    for directory in directories:
        noxfile = os.path.join(directory, "noxfile.py")
        if os.path.isfile(noxfile):
            valid.append(directory)
    return valid

def distribute_packages(packages, max_buckets):
    """Distribute packages into load-balanced buckets, isolating heavy lifters.

    Greedy longest-processing-time scheme: packages are ordered with heavy
    lifters first, then each package is placed into the currently lightest
    bucket. A heavy lifter contributes an effectively infinite weight (9999),
    so it stays alone in its bucket whenever enough buckets exist.

    Args:
        packages: Package directory paths to schedule. Not mutated.
        max_buckets: Upper bound on the number of buckets. Values below 1
            are clamped to 1 so the greedy loop always has a target bucket.

    Returns:
        A list of buckets, each a list of package paths; empty input yields
        an empty list.
    """
    if not packages:
        return []

    # Sort a copy so the caller's list is left untouched; heavy lifters
    # jump to the front of the line and are therefore placed first.
    ordered = sorted(
        packages, key=lambda p: os.path.basename(p) not in HEAVY_LIFTERS
    )

    # Never create more buckets than packages, and always at least one
    # (guards against max_buckets <= 0, which would crash min() below).
    actual_buckets = min(len(ordered), max(1, max_buckets))
    buckets = [{"weight": 0, "pkgs": []} for _ in range(actual_buckets)]

    for pkg in ordered:
        # Greedy step: the lightest bucket takes the next package.
        lightest = min(buckets, key=lambda b: b["weight"])
        lightest["pkgs"].append(pkg)
        lightest["weight"] += 9999 if os.path.basename(pkg) in HEAVY_LIFTERS else 1

    return [b["pkgs"] for b in buckets]

def build_github_actions_jobs(buckets):
    """Format buckets into the JSON schema required by GitHub Actions.

    Each job's ``id`` is derived from its first package's directory name with
    the "google-cloud-" prefix stripped, plus a "+ N" suffix counting the
    additional packages; ``packages`` is the space-joined path list.

    Args:
        buckets: Lists of package directory paths (as produced by
            ``distribute_packages``). Empty buckets are skipped.

    Returns:
        A list of ``{"id": ..., "packages": ...}`` dicts.
    """
    jobs = []
    for bucket in buckets:
        if not bucket:
            # Defensive: an empty bucket has no label source and no work.
            continue
        # normpath guards against trailing slashes, which would make
        # basename() return "" and produce broken labels like "+ 5".
        base_name = os.path.basename(os.path.normpath(bucket[0])).replace(
            "google-cloud-", ""
        )
        job_label = f"{base_name} + {len(bucket) - 1}" if len(bucket) > 1 else base_name
        jobs.append({"id": job_label, "packages": " ".join(bucket)})
    return jobs

def main():
    """Entry point: compute the job matrix and publish it for GitHub Actions.

    Reads the space-separated CHANGED_DIRS environment variable, balances the
    valid packages into buckets, and writes the resulting JSON to the file
    named by GITHUB_OUTPUT (or stdout when run locally). Always emits a valid
    JSON array -- an empty one when nothing changed -- so downstream
    fromJson() calls in the workflow never fail.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--matrix-multiplier", type=int, required=True)
    parser.add_argument("--max-vms", type=int, default=20)
    args = parser.parse_args()

    # Normalize paths: trailing slashes (e.g. "packages/my-pkg/") would make
    # os.path.basename() return "" downstream, breaking heavy-lifter
    # detection and producing labels like "+ 5".
    changed_dirs = [
        os.path.normpath(d) for d in os.environ.get("CHANGED_DIRS", "").split()
    ]
    packages = get_valid_packages(changed_dirs)

    jobs = []
    if packages:
        # Protect against GitHub's 256-job hard limit, guard against a zero
        # multiplier (ZeroDivisionError), and ensure at least one bucket.
        multiplier = max(1, args.matrix_multiplier)
        max_buckets = max(1, min(250 // multiplier, args.max_vms))
        buckets = distribute_packages(packages, max_buckets)
        jobs = build_github_actions_jobs(buckets)

    jobs_json = json.dumps(jobs)

    if "GITHUB_OUTPUT" in os.environ:
        # Append (not overwrite) so outputs from earlier steps are preserved.
        with open(os.environ["GITHUB_OUTPUT"], "a") as f:
            f.write(f"buckets={jobs_json}\n")
    else:
        print(jobs_json)

if __name__ == "__main__":
    main()
78 changes: 78 additions & 0 deletions .github/scripts/test_gha_matrix_balancer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest
from unittest.mock import patch
from gha_matrix_balancer import (
get_valid_packages,
distribute_packages,
build_github_actions_jobs,
HEAVY_LIFTERS
)

@patch("os.path.isfile")
def test_get_valid_packages(mock_isfile):
    """Only directories whose noxfile.py exists survive the filter."""
    existing_noxfile = "packages/valid-pkg/noxfile.py"
    # Pretend exactly one noxfile exists on disk.
    mock_isfile.side_effect = lambda path: path == existing_noxfile

    candidates = ["packages/valid-pkg", "packages/invalid-pkg"]

    assert get_valid_packages(candidates) == ["packages/valid-pkg"]

def test_distribute_packages_isolates_heavy_lifters():
    """A heavy lifter must end up alone in its own bucket."""
    # Grab one of the configured heavy lifters so the test tracks the real set.
    heavy_lifter = list(HEAVY_LIFTERS)[0]
    heavy_path = f"packages/{heavy_lifter}"

    packages = [
        "packages/google-cloud-vision",
        "packages/google-cloud-storage",
        heavy_path,
        "packages/google-cloud-logging",
        "packages/google-cloud-pubsub",
        "packages/google-cloud-kms",
    ]

    buckets = distribute_packages(packages, max_buckets=3)

    assert len(buckets) == 3

    # The 9999 weight should keep the heavy lifter isolated: find its
    # bucket and verify it contains nothing else.
    heavy_bucket = next(b for b in buckets if heavy_path in b)
    assert heavy_bucket == [heavy_path]

def test_distribute_packages_max_bucket_limit():
    """With more packages than buckets, everything is still scheduled."""
    packages = [f"pkg-{i}" for i in range(5)]

    buckets = distribute_packages(packages, max_buckets=2)

    assert len(buckets) == 2
    # All five packages land somewhere (a 3/2 split across the two buckets).
    assert len(buckets[0]) + len(buckets[1]) == 5

def test_distribute_packages_more_buckets_than_packages():
    """Bucket count is capped at the number of packages, not the max."""
    buckets = distribute_packages(["pkg-1", "pkg-2"], max_buckets=10)

    # Two packages can fill at most two buckets, even with room for ten.
    assert len(buckets) == 2

def test_build_github_actions_jobs():
    """Labels: single bucket -> bare name; multi bucket -> 'name + N'."""
    solo_bucket = ["packages/google-cloud-spanner"]
    trio_bucket = [
        "packages/google-cloud-vision",
        "packages/google-cloud-storage",
        "packages/google-cloud-pubsub",
    ]

    jobs = build_github_actions_jobs([solo_bucket, trio_bucket])

    assert len(jobs) == 2

    # Single-item bucket: "google-cloud-" prefix stripped, no suffix.
    assert jobs[0]["id"] == "spanner"
    assert jobs[0]["packages"] == "packages/google-cloud-spanner"

    # Multi-item bucket: "+ N" counts the extra packages beyond the first.
    assert jobs[1]["id"] == "vision + 2"
    assert jobs[1]["packages"] == "packages/google-cloud-vision packages/google-cloud-storage packages/google-cloud-pubsub"
Loading