From 024c34c7d244d7b752207ffdace02bf86ecac46b Mon Sep 17 00:00:00 2001
From: dondonz <13839920+dondonz@users.noreply.github.com>
Date: Sun, 22 Mar 2026 14:52:31 +1100
Subject: [PATCH 1/3] Add autoresearch setup for execution engine (large
 in-memory query) optimization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 autoresearch-execution-large/autoresearch.sh  | 155 ++++++++++++++++++
 autoresearch-execution-large/program.md       |  96 +++++++++++
 autoresearch-execution-large/run_benchmark.sh |  34 ++++
 3 files changed, 285 insertions(+)
 create mode 100755 autoresearch-execution-large/autoresearch.sh
 create mode 100644 autoresearch-execution-large/program.md
 create mode 100755 autoresearch-execution-large/run_benchmark.sh

diff --git a/autoresearch-execution-large/autoresearch.sh b/autoresearch-execution-large/autoresearch.sh
new file mode 100755
index 000000000..d9175e90b
--- /dev/null
+++ b/autoresearch-execution-large/autoresearch.sh
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+# Autoresearch loop driver for graphql-java execution engine optimization.
+# Each iteration: ask the claude CLI for one change, run tests, benchmark, then commit or revert.
+# Usage:
+#   ./autoresearch-execution-large/autoresearch.sh [max_iterations]
+#
+# Default: 200 iterations (designed for overnight runs)
+
+set -euo pipefail
+
+MAX_ITERATIONS="${1:-200}"  # first CLI argument; 200 when omitted
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute path of the directory holding this script
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"  # parent directory of SCRIPT_DIR
+LOG_FILE="$SCRIPT_DIR/results.tsv"  # tab-separated log, one row per iteration
+BEST_SCORE_FILE="$SCRIPT_DIR/.best_score"  # file that receives the current best score
+TEST_FILTER='--tests "graphql.execution.*" --tests "graphql.GraphQLTest"'  # NOTE: never referenced below; the test step repeats these filters inline
+
+cd "$PROJECT_DIR"  # the relative src/ and ./gradlew paths used below resolve from here
+
+# Verify claude CLI is available; abort early otherwise
+if ! command -v claude &>/dev/null; then
+    echo "ERROR: 'claude' CLI not found on PATH. Install Claude Code first."
+    exit 1
+fi
+
+# Initialize log: write the TSV header row only when the file does not exist yet
+if [ ! -f "$LOG_FILE" ]; then
+    printf "iteration\tcommit\tscore\tdelta\tstatus\tdescription\n" > "$LOG_FILE"
+fi
+
+# Get baseline score; the || fallback keeps set -e from aborting before the FAILED check runs
+echo "=== Getting baseline score ==="
+BASELINE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || BASELINE="FAILED"
+if [ "$BASELINE" = "FAILED" ]; then
+    echo "ERROR: Baseline benchmark failed. Fix issues before starting autoresearch."
+    exit 1
+fi
+echo "Baseline: $BASELINE ops/s"
+echo "$BASELINE" > "$BEST_SCORE_FILE"  # seed the best-score file with the baseline
+
+BEST_SCORE="$BASELINE"  # running best; only replaced when an iteration scores higher
+# Main loop: one agent-proposed change per iteration, kept only if tests pass and the score improves
+for i in $(seq 1 "$MAX_ITERATIONS"); do
+    echo ""
+    echo "========================================"
+    echo "=== Iteration $i / $MAX_ITERATIONS ==="
+    echo "=== Best score: $BEST_SCORE ops/s ==="
+    echo "========================================"
+
+    # Build the prompt for this iteration; the log tail shows the agent what was already tried
+    RECENT_LOG=$(tail -10 "$LOG_FILE" 2>/dev/null || echo "No previous iterations")
+
+    PROMPT="You are running iteration $i of an autoresearch optimization loop for graphql-java.
+
+Read autoresearch-execution-large/program.md for full context and strategy.
+
+Current best benchmark score: $BEST_SCORE ops/s (baseline was: $BASELINE ops/s)
+
+Previous optimization log (last 10 entries):
+$RECENT_LOG
+
+YOUR TASK: Make exactly ONE focused optimization to the execution engine code.
+- Read the code files first. If this is iteration 1 or you haven't profiled yet, run the
+  benchmark with async-profiler first to identify hotspots.
+- Pick the most promising strategy from program.md that has NOT already been tried (check the log above)
+- Make a minimal, targeted change to ONE or TWO files
+- Do NOT run tests or benchmarks — the outer harness handles that
+- Do NOT commit — the outer harness handles that
+- After editing, output a single-line summary of what you changed and why
+
+SCOPE: Only modify files under src/main/java/graphql/execution/, src/main/java/graphql/GraphQL.java,
+or the utility files listed in program.md (ImmutableKit.java, FpKit.java).
+
+Make the change now."
+
+    echo "--- Asking Claude to make an optimization ---"
+    CLAUDE_OUTPUT=$(claude \
+        --model sonnet \
+        --dangerously-skip-permissions \
+        --max-turns 25 \
+        --verbose \
+        -p "$PROMPT" \
+        2>&1) || true  # tolerate a non-zero claude exit; the diff check below decides what happens next
+
+    echo "$CLAUDE_OUTPUT" | tail -5
+
+    # Check if anything changed (git diff only reports tracked files, so brand-new files alone do not count)
+    if git diff --quiet src/main/java/; then
+        echo "No source changes in iteration $i, skipping"
+        printf "%s\t-\t-\t-\tskipped\tno changes\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Show what changed
+    echo "--- Changes made ---"
+    git diff --stat src/main/java/
+
+    # Run targeted tests locally; with pipefail the pipeline fails when gradle fails, even though tail succeeds
+    echo "--- Running tests ---"
+    if ! ./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q 2>&1 | tail -10; then
+        echo "Tests FAILED — reverting changes"
+        git checkout -- src/
+        printf "%s\t-\t-\t-\treverted\ttests failed\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Run benchmark; run_benchmark.sh exits non-zero on failure, so the || fallback stops set -e killing the loop before the revert
+    echo "--- Running benchmark ---"
+    SCORE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || SCORE="FAILED"
+    if [ "$SCORE" = "FAILED" ]; then
+        echo "Benchmark FAILED — reverting changes"
+        git checkout -- src/
+        printf "%s\t-\t-\t-\treverted\tbenchmark failed\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Compare (using awk for floating point); only a strictly higher score counts as an improvement
+    IMPROVED=$(echo "$SCORE $BEST_SCORE" | awk '{print ($1 > $2) ? "yes" : "no"}')
+    DELTA=$(echo "$SCORE $BEST_SCORE" | awk '{printf "%.3f", $1 - $2}')
+
+    if [ "$IMPROVED" = "yes" ]; then
+        echo ""
+        echo "*** IMPROVED! $BEST_SCORE -> $SCORE ops/s (+$DELTA) ***"
+        echo ""
+        BEST_SCORE="$SCORE"
+        echo "$BEST_SCORE" > "$BEST_SCORE_FILE"
+
+        DESCRIPTION=$(git diff --stat src/main/java/ | tail -1 | xargs)  # diffstat summary line; xargs trims the whitespace
+
+        git add src/main/java/
+        git commit -m "autoresearch: iteration $i [+$DELTA ops/s]
+
+$(git diff --cached --stat | head -5)"
+
+        COMMIT=$(git rev-parse --short HEAD)
+        printf "%s\t%s\t%s\t+%s\tkept\t%s\n" "$i" "$COMMIT" "$SCORE" "$DELTA" "$DESCRIPTION" >> "$LOG_FILE"
+    else
+        echo "No improvement: $SCORE vs $BEST_SCORE ops/s ($DELTA) — reverting"
+        git checkout -- src/
+        printf "%s\t-\t%s\t%s\treverted\tno improvement\n" "$i" "$SCORE" "$DELTA" >> "$LOG_FILE"
+    fi
+done
+
+# Closing report: baseline versus best score, plus the absolute and relative gain.
+printf '\n%s\n' "========================================"
+printf '%s\n' "=== Autoresearch complete ==="
+printf '=== Baseline:    %s ops/s ===\n' "$BASELINE"
+printf '=== Final best:  %s ops/s ===\n' "$BEST_SCORE"
+# awk does the floating-point arithmetic that shell arithmetic cannot
+TOTAL_DELTA=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN { printf "%.3f", best - base }')
+TOTAL_PCT=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN { printf "%.1f", ((best - base) / base) * 100 }')
+printf '=== Improvement: +%s ops/s (%s%%) ===\n' "$TOTAL_DELTA" "$TOTAL_PCT"
+printf '%s\n\n' "========================================"
+printf 'Results log: %s\n' "$LOG_FILE"
+printf '%s\n' "Review kept commits: git log --oneline --grep='autoresearch'"
diff --git a/autoresearch-execution-large/program.md b/autoresearch-execution-large/program.md
new file mode 100644
index 000000000..b44aed66d
--- /dev/null
+++ b/autoresearch-execution-large/program.md
@@ -0,0 +1,96 @@
+# Autoresearch: Optimize Execution Engine Performance (Large In-Memory Query)
+
+## Goal
+
+Improve the throughput (ops/sec) of `LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput` by making
+targeted optimizations to the core execution engine. This benchmark executes a sync query returning 10M scalar
+values — it cleanly isolates the execution engine (field resolution, result assembly, ResultNodesInfo).
+
+Every improvement must pass the relevant test suite locally. Final full-suite verification happens on a clean EC2 instance.
+
+## Metric
+
+- **Primary**: `LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput` — higher is better (ops/sec)
+- Run with: `./gradlew jmh -PjmhInclude="performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput" -PjmhFork=1 -PjmhIterations=3 -PjmhWarmupIterations=2`
+- A run takes ~3-5 minutes. Parse the score from JMH's output line containing `benchMarkSimpleQueriesThroughput`.
+- **Use async-profiler** to identify hotspots before optimizing: add `-PjmhProfilers=async` to the JMH command. Output goes to `performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput-Throughput/summary-cpu.txt`.
+
+## Scope — Files You May Modify
+
+Primary targets under `src/main/java/graphql/execution/`:
+
+- `ExecutionStrategy.java` (1141 lines) — the main execution strategy, field resolution
+- `AsyncExecutionStrategy.java` (97 lines) — async field execution
+- `Execution.java` (328 lines) — top-level execution orchestration
+- `FieldCollector.java` (182 lines) — collects fields from selection sets
+- `ResultNodesInfo.java` (55 lines) — tracks result node info during execution
+- `ExecutionStepInfoFactory.java` (92 lines) — creates step info per field
+- `FetchedValue.java` (82 lines) — wraps fetched values
+- `FieldValueInfo.java` (101 lines) — field value tracking
+- `MergedSelectionSet.java` (73 lines) — merged selections
+- `MergedField.java` — merged field representation
+
+Also consider:
+- `graphql/GraphQL.java` (624 lines) — top-level entry point
+- `graphql/collect/ImmutableKit.java` — collection utilities
+- `graphql/util/FpKit.java` — functional programming utilities
+- `graphql/execution/instrumentation/` — instrumentation overhead
+
+**Do NOT modify**: test files, benchmark files, schema files, build files.
+
+## Constraints
+
+1. **Relevant tests must pass locally**: Run `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` for fast iteration (~30 sec). Full suite runs on EC2.
+2. **No new dependencies**: This is a firm project policy.
+3. **No wildcard imports, no inner classes, no Optional**: Project coding standards.
+4. **Preserve public API**: All `@PublicApi` method signatures must remain unchanged.
+5. **Thread safety**: The execution engine is called concurrently. Don't introduce shared mutable state.
+6. **Use `graphql.Assert`** not `Objects.requireNonNull`.
+
+## Optimization Strategies to Explore (ordered by expected impact)
+
+### High Impact
+1. **Profile first**: Run async-profiler to identify actual CPU hotspots before making changes. The previous ENF autoresearch found that Guava ImmutableMap/ImmutableListMultimap builders were the dominant hotspot due to Object.hashCode() overhead — similar patterns may exist here.
+2. **Reduce object allocation in the execution hot loop**: The execution strategy creates many intermediate objects per field (ExecutionStepInfo, FetchedValue, FieldValueInfo). Consider whether allocations can be reduced.
+3. **Optimize ResultNodesInfo**: This is called for every field resolution. Any overhead here multiplies by the number of fields (10M in this benchmark).
+4. **Replace Guava immutable builders with mutable collections**: If ImmutableMap.Builder or ImmutableList.Builder are used in hot paths, replacing with LinkedHashMap/ArrayList (as was done in the ENF optimization) can yield 20%+ improvements.
+5. **Reduce instrumentation overhead**: Even "no-op" instrumentation has method call overhead per field.
+
+### Medium Impact
+6. **Optimize FieldCollector**: Field collection happens at each level. Caching or pre-computing merged selection sets could help.
+7. **Reduce ExecutionStepInfo creation overhead**: ExecutionStepInfo is created per-field. Consider lazy computation of expensive fields.
+8. **Avoid unnecessary wrapping/unwrapping**: FetchedValue wrapping, DataFetcherResult handling.
+9. **Replace stream operations with loops**: In hot paths, `.stream().collect()` has overhead.
+
+### Lower Impact (but easy wins)
+10. **Pre-size collections**: When field count is known, pre-size ArrayList/HashMap.
+11. **Cache repeated lookups**: Schema type lookups, field definition lookups.
+12. **Reduce string operations**: String concatenation in hot paths.
+
+## How to Iterate
+
+1. **Profile first** with async-profiler to identify actual hotspots
+2. Pick ONE strategy targeting the top hotspot
+3. Make a focused, minimal change
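+4. Run tests locally: `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` (when driven by `autoresearch.sh`, the outer harness runs this — do not run it yourself)
+5. Run the benchmark — compare to previous best (done by the harness under `autoresearch.sh`)
+6. If improved: commit with message "autoresearch: <description> [+X.XX ops/s]" (done by the harness under `autoresearch.sh`)
+7. If not improved: revert with `git checkout -- src/` (done by the harness under `autoresearch.sh`)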
+8. Re-profile to see updated hotspots, then pick next strategy
+
+## Lessons from Previous Autoresearch (ENF Optimization)
+
+These patterns delivered the biggest wins in the ENF autoresearch:
+
+- **ImmutableMap.Builder → LinkedHashMap**: Gained 20k ops/s. The `.build()` call hashes all keys, and Object.hashCode() on Apple Silicon triggers expensive `pthread_jit_write_protect_np`.
+- **ImmutableListMultimap → parallel ArrayList**: Gained 22k ops/s. Same hashCode issue. Replaced keyed multimap with index-aligned parallel lists.
+- **Avoid groupingBy when only checking group count**: Gained 13k ops/s. Replaced full map creation with a boolean flag.
+- **Short-circuit for empty/single-element cases**: Multiple small wins from fast-pathing the common case.
+- **Cache lambda captures**: Reusing a Supplier field instead of creating `() -> value` per call.
+
+## Important Notes
+
+- The benchmark queries 10M scalar fields — execution engine overhead per field is the bottleneck.
+- `GraphQL.execute()` is the entry point; it calls `Execution.execute()` → `ExecutionStrategy.execute()`.
+- The execution engine is inherently recursive (fields within fields).
+- Guava is an existing dependency — you can use Guava utilities but nothing else new.
diff --git a/autoresearch-execution-large/run_benchmark.sh b/autoresearch-execution-large/run_benchmark.sh
new file mode 100755
index 000000000..2baeb914f
--- /dev/null
+++ b/autoresearch-execution-large/run_benchmark.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Runs the LargeInMemoryQuery throughput benchmark and extracts the score.
+# Usage: ./autoresearch-execution-large/run_benchmark.sh
+# Output: prints the benchmark score (ops/sec) to stdout, or "FAILED" (exit status 1) on error.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+cd "$PROJECT_DIR"
+GRADLE_STATUS=0  # recorded explicitly so set -e cannot abort before "FAILED" is printed
+echo "=== Running LargeInMemoryQuery throughput benchmark ===" >&2
+BENCHMARK_OUTPUT=$(./gradlew jmh \
+    -PjmhInclude="performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput" \
+    -PjmhFork=1 \
+    -PjmhIterations=3 \
+    -PjmhWarmupIterations=2 \
+    2>&1) || GRADLE_STATUS=$?
+
+# Extract the score (4th-from-last field) from the last JMH result line, which looks like:
+# LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput  thrpt    3  XX.XXX ± Y.YYY  ops/s
+SCORE=$(printf '%s\n' "$BENCHMARK_OUTPUT" | awk '/benchMarkSimpleQueriesThroughput[ \t]+thrpt/ { s = $(NF-3) } END { print s }')
+
+if [ "$GRADLE_STATUS" -ne 0 ] || [ -z "$SCORE" ]; then
+    echo "FAILED: gradlew exit status $GRADLE_STATUS, extracted score '${SCORE:-<none>}'" >&2
+    echo "Last 20 lines of output:" >&2
+    printf '%s\n' "$BENCHMARK_OUTPUT" | tail -20 >&2
+    echo "FAILED"
+    exit 1
+fi
+
+echo "Score: $SCORE ops/s" >&2
+echo "$SCORE"

From 5eca0dea6162b17ad73f066b4dd24a36bf34cf88 Mon Sep 17 00:00:00 2001
From: dondonz <13839920+dondonz@users.noreply.github.com>
Date: Sun, 22 Mar 2026 17:35:38 +1100
Subject: [PATCH 2/3] Replace --dangerously-skip-permissions with scoped
 --allowedTools

Use --permission-mode acceptEdits with an explicit --allowedTools whitelist
instead of bypassing all permissions (plan mode is a read-only planning mode,
so the agent could not edit code under it). The agent can read files, edit
source code, and run gradle (for profiling). The outer harness handles the rest.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 autoresearch-execution-large/autoresearch.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/autoresearch-execution-large/autoresearch.sh b/autoresearch-execution-large/autoresearch.sh
index d9175e90b..8358900a5 100755
--- a/autoresearch-execution-large/autoresearch.sh
+++ b/autoresearch-execution-large/autoresearch.sh
@@ -5,6 +5,11 @@
 #   ./autoresearch-execution-large/autoresearch.sh [max_iterations]
 #
 # Default: 200 iterations (designed for overnight runs)
+#
+# Safety:
+#   The agent runs with --permission-mode acceptEdits (plan mode would block edits) and explicit --allowedTools.
+#   It can read files, edit source code, and run gradle for profiling.
+#   Tests, benchmarks, git commits, and reverts are handled by the outer harness.
 
 set -euo pipefail
 
@@ -73,10 +78,16 @@ or the utility files listed in program.md (ImmutableKit.java, FpKit.java).
 
 Make the change now."
 
+    # Allowed tools: read-only exploration + code edits + safe bash commands
+    # The agent can profile (gradlew jmh), inspect files, and edit source code.
+    # Tests, benchmarks, git commits, and reverts are handled by this outer harness.
+    ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*)'
+
     echo "--- Asking Claude to make an optimization ---"
     CLAUDE_OUTPUT=$(claude \
         --model sonnet \
-        --dangerously-skip-permissions \
+        --permission-mode acceptEdits \
+        --allowedTools "$ALLOWED_TOOLS" \
         --max-turns 25 \
         --verbose \
         -p "$PROMPT" \

From 2db2c4c174ef343201a6d82654dff845ebfcdc27 Mon Sep 17 00:00:00 2001
From: dondonz <13839920+dondonz@users.noreply.github.com>
Date: Sun, 22 Mar 2026 17:37:14 +1100
Subject: [PATCH 3/3] Allow git checkout in agent allowedTools

---
 autoresearch-execution-large/autoresearch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoresearch-execution-large/autoresearch.sh b/autoresearch-execution-large/autoresearch.sh
index 8358900a5..d4e4e5a37 100755
--- a/autoresearch-execution-large/autoresearch.sh
+++ b/autoresearch-execution-large/autoresearch.sh
@@ -81,7 +81,7 @@ Make the change now."
     # Allowed tools: read-only exploration + code edits + safe bash commands
     # The agent can profile (gradlew jmh), inspect files, and edit source code.
     # Tests, benchmarks, git commits, and reverts are handled by this outer harness.
-    ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*)'
+    ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*),Bash(git checkout:*)'  # NOTE(review): git checkout lets the agent revert files or switch branches itself, although the comment above leaves reverts to the harness — confirm this is intended
 
     echo "--- Asking Claude to make an optimization ---"
     CLAUDE_OUTPUT=$(claude \