chore[ci]: tpch-10 on ci action (#4498)

joseph-isaacs · web-flow · commit 4caa9171fd48 · 2025-09-03T18:56:08.000Z
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>

---------

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml
@@ -109,13 +109,10 @@ jobs:
             | grep $base_commit_sha \
             > base.json
 
-          echo '# Benchmarks: ${{ matrix.benchmark.id }}' > comment.md
-          echo '<details>' >> comment.md
-          echo '<summary>Table of Results</summary>' >> comment.md
+          echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
           echo '' >> comment.md
-          uv run --no-project scripts/compare-benchmark-jsons.py base.json ${{ matrix.benchmark.id }}.json \
+          uv run --no-project scripts/compare-benchmark-jsons.py base.json ${{ matrix.benchmark.id }}.json "${{ matrix.benchmark.name }}" \
             >> comment.md
-          echo '</details>' >> comment.md
 
       - name: Comment PR
         uses: thollander/actions-comment-pull-request@v3
diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
@@ -38,6 +38,22 @@ on:
               "targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex",
               "scale_factor": "--scale-factor 1.0"
             },
+            {
+              "id": "tpch-nvme-10",
+              "subcommand": "tpch",
+              "name": "TPC-H SF=10 on NVME",
+              "targets": "datafusion:arrow,datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
+              "scale_factor": "--scale-factor 10.0"
+            },
+            {
+              "id": "tpch-s3-10",
+              "subcommand": "tpch",
+              "name": "TPC-H SF=10 on S3",
+              "local_dir": "bench-vortex/data/tpch/10.0",
+              "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/tpch/10.0/",
+              "targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex",
+              "scale_factor": "--scale-factor 10.0"
+            },
             {
               "id": "tpcds-nvme",
               "subcommand": "tpcds",
@@ -195,12 +211,9 @@ jobs:
             > base.json
 
           echo '# Benchmarks: ${{ matrix.name }}' > comment.md
-          echo '<details>' >> comment.md
-          echo '<summary>Table of Results</summary>' >> comment.md
           echo '' >> comment.md
-          uv run --no-project scripts/compare-benchmark-jsons.py base.json results.json \
+          uv run --no-project scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.name }}" \
             >> comment.md
-          echo '</details>' >> comment.md
 
       - name: Comment PR
         if: inputs.mode == 'pr'
diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
@@ -9,10 +9,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+import math
 import sys
 
 import pandas as pd
 
+# Check if benchmark name argument is provided (will be added from workflow)
+benchmark_name = sys.argv[3] if len(sys.argv) > 3 else ""
+
 base = pd.read_json(sys.argv[1], lines=True)
 pr = pd.read_json(sys.argv[2], lines=True)
 
@@ -24,30 +28,165 @@
 assert len(pr_commit_id) == 1, pr_commit_id
 pr_commit_id = next(iter(pr_commit_id))
 
+# Handle missing storage field
 if "storage" not in base:
-    # For whatever reason, the base lacks storage. Might be an old database of results. Might be a
-    # database of results without any storage fields.
     base["storage"] = pd.NA
-
 if "storage" not in pr:
-    # Not all benchmarks have a "storage" key. If none of the JSON objects in the PR results file
-    # had a "storage" key, then the PR DataFrame will lack that key and the join will fail.
     pr["storage"] = pd.NA
 
-# NB: `pd.merge` considers two null key values to be equal, so benchmarks without storage keys will
-# match.
-df3 = pd.merge(base, pr, on=["name", "storage"], how="right", suffixes=("_base", "_pr"))
+
+# Handle missing dataset field and create a dataset key for joining
+def extract_dataset_key(df):
+    """Add a "dataset_key" join column to df in place and return df (pd.NA where there is no dict dataset)."""
+    if "dataset" not in df.columns:
+        df["dataset_key"] = pd.NA
+    else:
+        # Dict -> str of key-sorted items, so key order is irrelevant. isinstance alone guards this:
+        # pd.notna on a list-valued cell returns an array, whose truth value raises inside `and`.
+        df["dataset_key"] = df["dataset"].apply(lambda x: str(sorted(x.items())) if isinstance(x, dict) else pd.NA)
+    return df
+
+
+base = extract_dataset_key(base)
+pr = extract_dataset_key(pr)
+
+# Join on name, storage, and dataset_key
+# NB: `pd.merge` considers two null key values to be equal, so benchmarks without these keys will match.
+df3 = pd.merge(base, pr, on=["name", "storage", "dataset_key"], how="right", suffixes=("_base", "_pr"))
 
 # assert df3["unit_base"].equals(df3["unit_pr"]), (df3["unit_base"], df3["unit_pr"])
 
-print(
-    pd.DataFrame(
-        {
-            "name": df3["name"],
-            f"PR {pr_commit_id[:8]}": df3["value_pr"],
-            f"base {base_commit_id[:8]}": df3["value_base"],
-            "ratio (PR/base)": df3["value_pr"] / df3["value_base"],
-            "unit": df3["unit_base"],
-        }
-    ).to_markdown(index=False)
+# Generate summary statistics
+df3["ratio"] = df3["value_pr"] / df3["value_base"]
+
+# Filter for different target combinations for summary statistics
+vortex_df = df3[df3["name"].str.contains("vortex", case=False, na=False)]
+duckdb_vortex_df = df3[df3["name"].str.contains("duckdb.*vortex", case=False, na=False, regex=True)]
+datafusion_vortex_df = df3[df3["name"].str.contains("datafusion.*vortex", case=False, na=False, regex=True)]
+
+
+# Geometric mean of PR/base ratios, shared by the overall and per-target summaries
+def calculate_geo_mean(df):
+    """Return the geometric mean of the usable values in df["ratio"], or NaN if there are none.
+
+    A ratio is usable when it is present, positive and finite: log() is undefined for values <= 0,
+    and a zero base value yields an infinite ratio that would otherwise turn the whole mean into inf.
+    """
+    valid_ratios = [r for r in df["ratio"] if pd.notna(r) and 0 < r < math.inf]
+    if not valid_ratios:
+        return float("nan")
+    return math.exp(sum(math.log(r) for r in valid_ratios) / len(valid_ratios))
+
+
+# Overall performance (all results)
+geo_mean_ratio = calculate_geo_mean(df3)
+
+# Performance for different target combinations
+vortex_geo_mean_ratio = calculate_geo_mean(vortex_df)
+duckdb_vortex_geo_mean_ratio = calculate_geo_mean(duckdb_vortex_df)
+datafusion_vortex_geo_mean_ratio = calculate_geo_mean(datafusion_vortex_df)
+
+# Find best and worst changes for vortex-only results
+vortex_valid_ratios = vortex_df["ratio"].dropna()
+if len(vortex_valid_ratios) > 0:
+    # Best improvement: smallest ratio (< 1.0, fastest performance)
+    improvements = vortex_valid_ratios[vortex_valid_ratios < 1.0]
+    if len(improvements) > 0:
+        best_idx = improvements.idxmin()
+        best_improvement = f"{vortex_df.loc[best_idx, 'name']} ({vortex_df.loc[best_idx, 'ratio']:.3f}x)"
+    else:
+        best_improvement = "no improvements"
+
+    # Worst regression: largest ratio (> 1.0, slowest performance)
+    regressions = vortex_valid_ratios[vortex_valid_ratios > 1.0]
+    if len(regressions) > 0:
+        worst_idx = regressions.idxmax()
+        worst_regression = f"{vortex_df.loc[worst_idx, 'name']} ({vortex_df.loc[worst_idx, 'ratio']:.3f}x)"
+    else:
+        worst_regression = "no regressions"
+else:
+    best_improvement = "no valid vortex comparisons"
+    worst_regression = "no valid vortex comparisons"
+
+# Determine threshold based on benchmark name
+# Use 30% threshold for S3 benchmarks, 10% for others
+is_s3_benchmark = "s3" in benchmark_name.lower()
+threshold_pct = 30 if is_s3_benchmark else 10
+improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
+regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
+
+# Count significant changes for vortex-only results
+significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum()
+significant_regressions = (vortex_df["ratio"] > regression_threshold).sum()
+
+
+# Build summary
+def format_performance(ratio, target_name):
+    """Describe a PR/base ratio for the summary; ratios below 1 read "better", all others "worse"."""
+    if pd.isna(ratio):
+        return f"no valid {target_name.lower()} comparisons available"
+    return f"{ratio:.3f}x ({'better' if ratio < 1 else 'worse'} than base)"
+
+
+overall_performance = (
+    "no valid comparisons available"
+    if pd.isna(geo_mean_ratio)
+    else f"{geo_mean_ratio:.3f}x ({'better' if geo_mean_ratio < 1 else 'worse'} than base)"
+)
+vortex_performance = format_performance(vortex_geo_mean_ratio, "vortex")
+duckdb_vortex_performance = format_performance(duckdb_vortex_geo_mean_ratio, "duckdb:vortex")
+datafusion_vortex_performance = format_performance(datafusion_vortex_geo_mean_ratio, "datafusion:vortex")
+
+summary_lines = [
+    "## Summary",
+    "",
+    f"- **overall performance (all targets)**: {overall_performance}",
+]
+
+# Only add vortex-specific sections if we have vortex data
+if len(vortex_df) > 0:
+    summary_lines.extend(
+        [
+            f"- **vortex performance**: {vortex_performance}",
+        ]
+    )
+
+# Only add duckdb:vortex section if we have that data
+if len(duckdb_vortex_df) > 0:
+    summary_lines.append(f"- **duckdb:vortex performance**: {duckdb_vortex_performance}")
+
+# Only add datafusion:vortex section if we have that data
+if len(datafusion_vortex_df) > 0:
+    summary_lines.append(f"- **datafusion:vortex performance**: {datafusion_vortex_performance}")
+
+# Only add best/worst if we have vortex data
+if len(vortex_df) > 0:
+    summary_lines.extend(
+        [
+            f"- **best vortex improvement**: {best_improvement}",
+            f"- **worst vortex regression**: {worst_regression}",
+            f"- **significant vortex changes (>{threshold_pct}%)**:",
+            f"  - improvements: {significant_improvements} queries",
+            f"  - regressions: {significant_regressions} queries",
+        ]
+    )
+
+# Build table
+table_df = pd.DataFrame(
+    {
+        "name": df3["name"],
+        f"PR {pr_commit_id[:8]}": df3["value_pr"],
+        f"base {base_commit_id[:8]}": df3["value_base"],
+        "ratio (PR/base)": df3["ratio"],
+        "unit": df3["unit_base"],
+    }
 )
+
+# Output complete formatted markdown
+print("\n".join(summary_lines))
+print("")
+print("<details>")
+print("<summary>Detailed Results Table</summary>")
+print("")
+print(table_df.to_markdown(index=False))
+print("</details>")