-
Notifications
You must be signed in to change notification settings - Fork 1.7k
[DRAFT] chore: add test sharding #17438
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
01d4753
a7502d4
9ad0f0c
02c7653
3c8d057
73ab091
74f1ef7
a450254
af4d10c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,9 @@ run_package_test() { | |
| local package_name=$1 | ||
| local package_path="packages/${package_name}" | ||
|
|
||
| # Isolate gcloud config for parallel execution | ||
| export CLOUDSDK_CONFIG=$(mktemp -d) | ||
|
|
||
| # Declare local overrides to prevent bleeding into the next loop iteration | ||
| local PROJECT_ID | ||
| local GOOGLE_APPLICATION_CREDENTIALS | ||
|
|
@@ -87,12 +90,59 @@ run_package_test() { | |
| set -e | ||
| popd > /dev/null | ||
|
|
||
| # Clean up isolated gcloud config | ||
| rm -rf "${CLOUDSDK_CONFIG}" | ||
|
Comment on lines
+93
to
+94
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| return $res | ||
| } | ||
|
|
||
| # A file for running system tests | ||
| system_test_script="${PROJECT_ROOT}/.kokoro/system-single.sh" | ||
|
|
||
| # Parallel execution settings | ||
| MAX_PARALLEL=4 | ||
| running_pids=() | ||
| declare -A pid_to_pkg | ||
| declare -A pid_to_log | ||
| declare -A pid_to_resfile | ||
|
|
||
| # Array to keep track of results for the final summary | ||
| results=() | ||
|
|
||
| handle_finished_job() { | ||
| local pid=$1 | ||
| local pkg=${pid_to_pkg[$pid]} | ||
| local log=${pid_to_log[$pid]} | ||
| local resfile=${pid_to_resfile[$pid]} | ||
|
|
||
| # wait $pid might fail if it was already reaped by wait -n, | ||
| # so we ignore its exit code and use the resfile. | ||
| wait "$pid" 2>/dev/null || true | ||
|
|
||
| local res=1 | ||
| if [ -f "$resfile" ]; then | ||
| res=$(cat "$resfile") | ||
| rm "$resfile" | ||
| fi | ||
|
|
||
| echo "------------------------------------------------------------" | ||
| echo "System tests for ${pkg} finished (Exit code: ${res})" | ||
| echo "------------------------------------------------------------" | ||
| if [ -f "$log" ]; then | ||
| cat "$log" | ||
| rm "$log" | ||
| else | ||
| echo "Log file missing for ${pkg}" | ||
| fi | ||
|
|
||
| if [ "${res}" -ne 0 ]; then | ||
| RETVAL=${res} | ||
| results+=("${pkg}: FAILED") | ||
| else | ||
| results+=("${pkg}: PASSED") | ||
| fi | ||
| } | ||
|
|
||
| # Run system tests for each package with directory packages/*/tests/system | ||
| for path in `find 'packages' \ | ||
| \( -type d -wholename 'packages/*/tests/system' \) -o \ | ||
|
|
@@ -140,10 +190,53 @@ for path in `find 'packages' \ | |
| set -e | ||
|
|
||
| if [[ "${package_modified}" -gt 0 || "$KOKORO_BUILD_ARTIFACTS_SUBDIR" == *"continuous"* ]]; then | ||
| # Call the function - its internal exports won't affect the next loop | ||
| run_package_test "$package_name" || RETVAL=$? | ||
| # Wait if we have reached MAX_PARALLEL | ||
| while [[ ${#running_pids[@]} -ge $MAX_PARALLEL ]]; do | ||
| set +e | ||
| wait -n | ||
| set -e | ||
| # Find which job finished | ||
| new_pids=() | ||
| for pid in "${running_pids[@]}"; do | ||
| if kill -0 "$pid" 2>/dev/null; then | ||
| new_pids+=("$pid") | ||
| else | ||
| handle_finished_job "$pid" | ||
| fi | ||
| done | ||
| running_pids=("${new_pids[@]}") | ||
| done | ||
|
|
||
| # Start the next test in the background | ||
| log_file=$(mktemp) | ||
| res_file=$(mktemp) | ||
| ( | ||
| run_package_test "$package_name" > "$log_file" 2>&1 | ||
| echo $? > "$res_file" | ||
| ) & | ||
| pid=$! | ||
| running_pids+=($pid) | ||
| pid_to_pkg[$pid]=$package_name | ||
| pid_to_log[$pid]=$log_file | ||
| pid_to_resfile[$pid]=$res_file | ||
| echo "Started system tests for ${package_name} (PID: ${pid})" | ||
| else | ||
| echo "No changes in ${package_name} and not a continuous build, skipping." | ||
| fi | ||
| done | ||
|
|
||
| # Wait for all remaining jobs | ||
| for pid in "${running_pids[@]}"; do | ||
| handle_finished_job "$pid" | ||
| done | ||
|
|
||
| echo "------------------------------------------------------------" | ||
| echo "System Test Summary" | ||
| echo "------------------------------------------------------------" | ||
| for res in "${results[@]}"; do | ||
| echo "$res" | ||
| done | ||
| echo "------------------------------------------------------------" | ||
|
|
||
| exit ${RETVAL} | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| import os | ||
| import subprocess | ||
| import json | ||
| import math | ||
| import sys | ||
|
|
||
| def get_packages(): | ||
| subdirs = ['packages'] | ||
| packages = [] | ||
| for subdir in subdirs: | ||
| if not os.path.exists(subdir): | ||
| continue | ||
| # Use the same sorting as the shell script | ||
| pkg_dirs = [os.path.join(subdir, d) + '/' for d in os.listdir(subdir) if os.path.isdir(os.path.join(subdir, d))] | ||
| packages.extend(sorted(pkg_dirs)) | ||
| return packages | ||
|
|
||
| def get_packages_to_test(): | ||
| build_type = os.environ.get('BUILD_TYPE', 'presubmit') | ||
| target_branch = os.environ.get('TARGET_BRANCH', 'main') | ||
|
|
||
| all_packages = get_packages() | ||
|
|
||
| if build_type == 'presubmit': | ||
| git_diff_arg = f"origin/{target_branch}..." | ||
| elif build_type == 'continuous': | ||
| git_diff_arg = "HEAD~.." | ||
| else: | ||
| return all_packages | ||
|
|
||
| # Check if ci/ changed | ||
| try: | ||
| subprocess.check_call(['git', 'diff', '--quiet', git_diff_arg, 'ci']) | ||
| ci_changed = False | ||
| except subprocess.CalledProcessError: | ||
| ci_changed = True | ||
|
|
||
| if ci_changed: | ||
| return all_packages | ||
|
|
||
| try: | ||
| res = subprocess.check_output(['git', 'diff', '--name-only', git_diff_arg]).decode('utf-8') | ||
| changed_files = res.splitlines() | ||
| except subprocess.CalledProcessError: | ||
| return all_packages | ||
|
|
||
| to_test = [] | ||
| for pkg in all_packages: | ||
| # Check if any changed file starts with the package path | ||
| if any(f.startswith(pkg) for f in changed_files): | ||
| to_test.append(pkg) | ||
|
|
||
| return to_test | ||
|
|
||
| def group_packages(packages, max_packages_per_shard=50, max_total_shards=10): | ||
| if not packages: | ||
| return [] | ||
|
|
||
| num_packages = len(packages) | ||
|
|
||
| # Calculate number of shards based on packages per shard | ||
| num_shards = math.ceil(num_packages / max_packages_per_shard) | ||
|
|
||
| # Cap the total number of shards | ||
| num_shards = min(num_shards, max_total_shards) | ||
|
|
||
| # Recalculate shard size to be as even as possible given the capped shards | ||
| shard_size = math.ceil(num_packages / num_shards) | ||
|
|
||
| shards = [] | ||
| for i in range(num_shards): | ||
| start = i * shard_size | ||
| end = min((i + 1) * shard_size, num_packages) | ||
| if start >= num_packages: | ||
| break | ||
|
|
||
| shard_packages = packages[start:end] | ||
| index = i + 1 | ||
| name = f"Shard {index}" | ||
|
|
||
| shards.append({ | ||
| "name": name, | ||
| "index": index, | ||
| "packages": " ".join(shard_packages) | ||
| }) | ||
| return shards | ||
|
|
||
| if __name__ == "__main__": | ||
| packages = get_packages_to_test() | ||
| # Shard into groups of ~50 libraries, up to 10 parallel jobs | ||
| shards = group_packages(packages, max_packages_per_shard=50, max_total_shards=10) | ||
| print(json.dumps(shards)) |
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -82,10 +82,42 @@ subdirs=( | |||||||||||||
| packages | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| if [ -n "${PACKAGE_LIST}" ]; then | ||||||||||||||
| echo "Using provided PACKAGE_LIST" | ||||||||||||||
| to_test=(${PACKAGE_LIST}) | ||||||||||||||
| RETVAL=0 | ||||||||||||||
| for d in ${to_test[@]}; do | ||||||||||||||
| echo "running test in ${d}" | ||||||||||||||
| pushd ${d} | ||||||||||||||
| set +e | ||||||||||||||
| ${test_script} | ||||||||||||||
| ret=$? | ||||||||||||||
| set -e | ||||||||||||||
|
Comment on lines
+92
to
+95
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Toggling
Suggested change
|
||||||||||||||
| if [ ${ret} -ne 0 ]; then | ||||||||||||||
| RETVAL=${ret} | ||||||||||||||
| fi | ||||||||||||||
| popd | ||||||||||||||
| done | ||||||||||||||
| exit ${RETVAL} | ||||||||||||||
| fi | ||||||||||||||
|
|
||||||||||||||
| # Sharding logic (fallback for manual runs) | ||||||||||||||
| TOTAL_SHARDS="${TOTAL_SHARDS:-1}" | ||||||||||||||
| SHARD_INDEX="${SHARD_INDEX:-1}" | ||||||||||||||
| count=0 | ||||||||||||||
|
|
||||||||||||||
| RETVAL=0 | ||||||||||||||
|
|
||||||||||||||
| for subdir in ${subdirs[@]}; do | ||||||||||||||
| for d in `ls -d ${subdir}/*/`; do | ||||||||||||||
| # Sort the directories to ensure consistent sharding across jobs | ||||||||||||||
| for d in `ls -d ${subdir}/*/ | sort`; do | ||||||||||||||
| # Sharding logic: only process directories that belong to this shard | ||||||||||||||
| if (( count % TOTAL_SHARDS != SHARD_INDEX - 1 )); then | ||||||||||||||
| ((count++)) | ||||||||||||||
| continue | ||||||||||||||
| fi | ||||||||||||||
| ((count++)) | ||||||||||||||
|
|
||||||||||||||
| should_test=false | ||||||||||||||
| if [ -n "${GIT_DIFF_ARG}" ]; then | ||||||||||||||
| echo "checking changes with 'git diff --quiet ${GIT_DIFF_ARG} ${d}'" | ||||||||||||||
|
|
@@ -108,6 +140,13 @@ for subdir in ${subdirs[@]}; do | |||||||||||||
| pushd ${d} | ||||||||||||||
| # Temporarily allow failure. | ||||||||||||||
| set +e | ||||||||||||||
|
|
||||||||||||||
| # Ensure unique coverage file per package to avoid DataError | ||||||||||||||
| # when combining statement and branch coverage. | ||||||||||||||
| # Strip trailing slash from directory name for the filename. | ||||||||||||||
| pkg_name_clean=$(echo ${d} | sed 's|/$||' | sed 's|/|_|g') | ||||||||||||||
| export COVERAGE_FILE="${PROJECT_ROOT}/.coverage.${PY_VERSION}.${pkg_name_clean}" | ||||||||||||||
|
|
||||||||||||||
| ${test_script} | ||||||||||||||
| ret=$? | ||||||||||||||
| set -e | ||||||||||||||
|
|
||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Test change for sharding |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since
run_package_testruns withset -eenabled, any test failure or error will cause the subshell to exit immediately. This prevents the manual cleanup at the end of the function from running, leading to leaked temporary directories in/tmp.Using an
EXITtrap ensures that the temporary directory is reliably cleaned up when the subshell exits, regardless of whether the tests succeeded or failed.