diff --git a/autoresearch-execution-large/autoresearch.sh b/autoresearch-execution-large/autoresearch.sh
new file mode 100755
index 000000000..d4e4e5a37
--- /dev/null
+++ b/autoresearch-execution-large/autoresearch.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+# Autoresearch loop driver for graphql-java execution engine optimization.
+#
+# Usage:
+#   ./autoresearch-execution-large/autoresearch.sh [max_iterations]
+#
+# Default: 200 iterations (designed for overnight runs)
+#
+# Safety:
+#   The agent runs with --permission-mode plan and explicit --allowedTools.
+#   It can read files, edit source code, and run gradle for profiling.
+#   Tests, benchmarks, git commits, and reverts are handled by the outer harness.
+#   The harness refuses to start while src/ has uncommitted changes, because every
+#   rejected iteration resets src/ to HEAD (tracked edits and untracked files alike).
+
+set -euo pipefail
+
+MAX_ITERATIONS="${1:-200}"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+LOG_FILE="$SCRIPT_DIR/results.tsv"
+BEST_SCORE_FILE="$SCRIPT_DIR/.best_score"
+# An array keeps every filter a separate argument when it is handed to gradle.
+TEST_FILTER=(--tests "graphql.execution.*" --tests "graphql.GraphQLTest")
+
+# Discards all uncommitted work under src/: tracked edits and new untracked files.
+revert_changes() {
+    git checkout -- src/
+    git clean -fdq -- src/
+}
+
+if ! [[ "$MAX_ITERATIONS" =~ ^[0-9]+$ ]]; then
+    echo "ERROR: max_iterations must be a non-negative integer, got '$MAX_ITERATIONS'." >&2
+    exit 1
+fi
+# Force base 10 so a value such as 08 is not rejected as invalid octal.
+MAX_ITERATIONS=$((10#$MAX_ITERATIONS))
+
+cd "$PROJECT_DIR"
+
+# Verify claude CLI is available
+if ! command -v claude &>/dev/null; then
+    echo "ERROR: 'claude' CLI not found on PATH. Install Claude Code first."
+    exit 1
+fi
+
+# Refuse to run on a dirty tree: reverts would destroy the user's own work and
+# kept commits would silently include it.
+if [ -n "$(git status --porcelain -- src/)" ]; then
+    echo "ERROR: uncommitted changes under src/. Commit or stash them first." >&2
+    exit 1
+fi
+
+# Initialize log
+if [ ! -f "$LOG_FILE" ]; then
+    printf "iteration\tcommit\tscore\tdelta\tstatus\tdescription\n" > "$LOG_FILE"
+fi
+
+# Get baseline score
+echo "=== Getting baseline score ==="
+# run_benchmark.sh exits non-zero on failure; without the fallback, `set -e` would
+# abort right here and the error message below would never be shown.
+BASELINE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || BASELINE="FAILED"
+if [ "$BASELINE" = "FAILED" ]; then
+    echo "ERROR: Baseline benchmark failed. Fix issues before starting autoresearch."
+    exit 1
+fi
+echo "Baseline: $BASELINE ops/s"
+echo "$BASELINE" > "$BEST_SCORE_FILE"
+
+BEST_SCORE="$BASELINE"
+
+for ((i = 1; i <= MAX_ITERATIONS; i++)); do
+    echo ""
+    echo "========================================"
+    echo "=== Iteration $i / $MAX_ITERATIONS ==="
+    echo "=== Best score: $BEST_SCORE ops/s ==="
+    echo "========================================"
+
+    # Build the prompt for this iteration
+    RECENT_LOG=$(tail -10 "$LOG_FILE" 2>/dev/null || echo "No previous iterations")
+
+    PROMPT="You are running iteration $i of an autoresearch optimization loop for graphql-java.
+
+Read autoresearch-execution-large/program.md for full context and strategy.
+
+Current best benchmark score: $BEST_SCORE ops/s (baseline was: $BASELINE ops/s)
+
+Previous optimization log (last 10 entries):
+$RECENT_LOG
+
+YOUR TASK: Make exactly ONE focused optimization to the execution engine code.
+- Read the code files first. If this is iteration 1 or you haven't profiled yet, run the
+  benchmark with async-profiler first to identify hotspots.
+- Pick the most promising strategy from program.md that has NOT already been tried (check the log above)
+- Make a minimal, targeted change to ONE or TWO files
+- Do NOT run tests or benchmarks — the outer harness handles that
+- Do NOT commit — the outer harness handles that
+- After editing, output a single-line summary of what you changed and why
+
+SCOPE: Only modify files under src/main/java/graphql/execution/, src/main/java/graphql/GraphQL.java,
+or the utility files listed in program.md (ImmutableKit.java, FpKit.java).
+
+Make the change now."
+
+    # Allowed tools: read-only exploration + code edits + safe bash commands
+    # The agent can profile (gradlew jmh), inspect files, and edit source code.
+    # Tests, benchmarks, git commits, and reverts are handled by this outer harness.
+    ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*),Bash(git checkout:*)'
+
+    echo "--- Asking Claude to make an optimization ---"
+    # NOTE(review): Claude Code documents "plan" as a mode that analyzes without
+    # modifying files — confirm it still permits the Edit/Write tools listed above.
+    # `|| true`: a failed or truncated agent run is handled as "no changes" below.
+    CLAUDE_OUTPUT=$(claude \
+        --model sonnet \
+        --permission-mode plan \
+        --allowedTools "$ALLOWED_TOOLS" \
+        --max-turns 25 \
+        --verbose \
+        -p "$PROMPT" \
+        2>&1) || true
+
+    echo "$CLAUDE_OUTPUT" | tail -5
+
+    # Check if anything changed; porcelain status also reports new, untracked files
+    if [ -z "$(git status --porcelain -- src/main/java/)" ]; then
+        echo "No source changes in iteration $i, skipping"
+        # Drop stray edits elsewhere under src/ so they cannot leak into later iterations.
+        revert_changes
+        printf "%s\t-\t-\t-\tskipped\tno changes\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Show what changed
+    echo "--- Changes made ---"
+    git diff --stat src/main/java/
+    git ls-files --others --exclude-standard -- src/main/java/
+
+    # Run targeted tests locally
+    echo "--- Running tests ---"
+    if ! ./gradlew test "${TEST_FILTER[@]}" -q 2>&1 | tail -10; then
+        echo "Tests FAILED — reverting changes"
+        revert_changes
+        printf "%s\t-\t-\t-\treverted\ttests failed\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Run benchmark
+    echo "--- Running benchmark ---"
+    # The fallback turns a non-zero exit into the FAILED sentinel so the change is
+    # reverted and the loop continues instead of `set -e` ending the whole run.
+    SCORE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || SCORE="FAILED"
+    if [ "$SCORE" = "FAILED" ]; then
+        echo "Benchmark FAILED — reverting changes"
+        revert_changes
+        printf "%s\t-\t-\t-\treverted\tbenchmark failed\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Compare (using awk for floating point)
+    IMPROVED=$(awk -v score="$SCORE" -v best="$BEST_SCORE" 'BEGIN { print ((score + 0 > best + 0) ? "yes" : "no") }')
+    DELTA=$(awk -v score="$SCORE" -v best="$BEST_SCORE" 'BEGIN { printf "%.3f", score - best }')
+
+    if [ "$IMPROVED" = "yes" ]; then
+        echo ""
+        echo "*** IMPROVED! $BEST_SCORE -> $SCORE ops/s (+$DELTA) ***"
+        echo ""
+        BEST_SCORE="$SCORE"
+        echo "$BEST_SCORE" > "$BEST_SCORE_FILE"
+
+        git add src/main/java/
+        # Summarize the staged change so newly created files are described too.
+        DESCRIPTION=$(git diff --cached --stat -- src/main/java/ | tail -1 | xargs)
+
+        git commit -m "autoresearch: iteration $i [+$DELTA ops/s]
+
+$(git diff --cached --stat | head -5)"
+
+        COMMIT=$(git rev-parse --short HEAD)
+        printf "%s\t%s\t%s\t+%s\tkept\t%s\n" "$i" "$COMMIT" "$SCORE" "$DELTA" "$DESCRIPTION" >> "$LOG_FILE"
+        # Discard whatever the agent left outside src/main/java/.
+        revert_changes
+    else
+        echo "No improvement: $SCORE vs $BEST_SCORE ops/s ($DELTA) — reverting"
+        revert_changes
+        printf "%s\t-\t%s\t%s\treverted\tno improvement\n" "$i" "$SCORE" "$DELTA" >> "$LOG_FILE"
+    fi
+done
+
+echo ""
+echo "========================================"
+echo "=== Autoresearch complete ==="
+echo "=== Baseline: $BASELINE ops/s ==="
+echo "=== Final best: $BEST_SCORE ops/s ==="
+TOTAL_DELTA=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN { printf "%.3f", best - base }')
+# Guard the percentage against a zero baseline, which would make awk abort.
+TOTAL_PCT=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN { printf "%.1f", (base + 0 == 0) ? 0 : ((best - base) / base) * 100 }')
+echo "=== Improvement: +$TOTAL_DELTA ops/s ($TOTAL_PCT%) ==="
+echo "========================================"
+echo ""
+echo "Results log: $LOG_FILE"
+echo "Review kept commits: git log --oneline --grep='autoresearch'"
diff --git a/autoresearch-execution-large/program.md b/autoresearch-execution-large/program.md
new file mode 100644
index 000000000..b44aed66d
--- /dev/null
+++ b/autoresearch-execution-large/program.md
@@ -0,0 +1,98 @@
+# Autoresearch: Optimize Execution Engine Performance (Large In-Memory Query)
+
+## Goal
+
+Improve the throughput (ops/sec) of `LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput` by making
+targeted optimizations to the core execution engine. This benchmark executes a sync query returning 10M scalar
+values — it cleanly isolates the execution engine (field resolution, result assembly, ResultNodesInfo).
+
+Every improvement must pass the relevant test suite locally. Final full-suite verification happens on a clean EC2 instance.
+
+## Metric
+
+- **Primary**: `LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput` — higher is better (ops/sec)
+- Run with: `./gradlew jmh -PjmhInclude="performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput" -PjmhFork=1 -PjmhIterations=3 -PjmhWarmupIterations=2`
+- A run takes ~3-5 minutes. Parse the score from JMH's output line containing `benchMarkSimpleQueriesThroughput`.
+- **Use async-profiler** to identify hotspots before optimizing: add `-PjmhProfilers=async` to the JMH command. Output goes to `performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput-Throughput/summary-cpu.txt`.
+
+## Scope — Files You May Modify
+
+Primary targets under `src/main/java/graphql/execution/`:
+
+- `ExecutionStrategy.java` (1141 lines) — the main execution strategy, field resolution
+- `AsyncExecutionStrategy.java` (97 lines) — async field execution
+- `Execution.java` (328 lines) — top-level execution orchestration
+- `FieldCollector.java` (182 lines) — collects fields from selection sets
+- `ResultNodesInfo.java` (55 lines) — tracks result node info during execution
+- `ExecutionStepInfoFactory.java` (92 lines) — creates step info per field
+- `FetchedValue.java` (82 lines) — wraps fetched values
+- `FieldValueInfo.java` (101 lines) — field value tracking
+- `MergedSelectionSet.java` (73 lines) — merged selections
+- `MergedField.java` — merged field representation
+
+Also consider:
+- `graphql/GraphQL.java` (624 lines) — top-level entry point
+- `graphql/collect/ImmutableKit.java` — collection utilities
+- `graphql/util/FpKit.java` — functional programming utilities
+- `graphql/execution/instrumentation/` — instrumentation overhead
+
+**Do NOT modify**: test files, benchmark files, schema files, build files.
+
+## Constraints
+
+1. **Relevant tests must pass locally**: Run `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` for fast iteration (~30 sec). Full suite runs on EC2.
+2. **No new dependencies**: This is a firm project policy.
+3. **No wildcard imports, no inner classes, no Optional**: Project coding standards.
+4. **Preserve public API**: All `@PublicApi` method signatures must remain unchanged.
+5. **Thread safety**: The execution engine is called concurrently. Don't introduce shared mutable state.
+6. **Use `graphql.Assert`** not `Objects.requireNonNull`.
+
+## Optimization Strategies to Explore (ordered by expected impact)
+
+### High Impact
+1. **Profile first**: Run async-profiler to identify actual CPU hotspots before making changes. The previous ENF autoresearch found that Guava ImmutableMap/ImmutableListMultimap builders were the dominant hotspot due to Object.hashCode() overhead — similar patterns may exist here.
+2. **Reduce object allocation in the execution hot loop**: The execution strategy creates many intermediate objects per field (ExecutionStepInfo, FetchedValue, FieldValueInfo). Consider whether allocations can be reduced.
+3. **Optimize ResultNodesInfo**: This is called for every field resolution. Any overhead here multiplies by the number of fields (10M in this benchmark).
+4. **Replace Guava immutable builders with mutable collections**: If ImmutableMap.Builder or ImmutableList.Builder are used in hot paths, replacing with LinkedHashMap/ArrayList (as was done in the ENF optimization) can yield 20%+ improvements.
+5. **Reduce instrumentation overhead**: Even "no-op" instrumentation has method call overhead per field.
+
+### Medium Impact
+6. **Optimize FieldCollector**: Field collection happens at each level. Caching or pre-computing merged selection sets could help.
+7. **Reduce ExecutionStepInfo creation overhead**: ExecutionStepInfo is created per-field. Consider lazy computation of expensive fields.
+8. **Avoid unnecessary wrapping/unwrapping**: FetchedValue wrapping, DataFetcherResult handling.
+9. **Replace stream operations with loops**: In hot paths, `.stream().collect()` has overhead.
+
+### Lower Impact (but easy wins)
+10. **Pre-size collections**: When field count is known, pre-size ArrayList/HashMap.
+11. **Cache repeated lookups**: Schema type lookups, field definition lookups.
+12. **Reduce string operations**: String concatenation in hot paths.
+
+## How to Iterate
+
+1. **Profile first** with async-profiler to identify actual hotspots
+2. Pick ONE strategy targeting the top hotspot
+3. Make a focused, minimal change
+4. Run tests locally: `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q`
+5. Run the benchmark — compare to previous best
+6. If improved: commit with message "autoresearch: [+X.XX ops/s]"
+7. If not improved: revert with `git checkout -- src/`
+8. Re-profile to see updated hotspots, then pick next strategy
+
+**When driven by `autoresearch.sh`**: the outer harness performs steps 4–7 (tests, benchmark, commit, revert). Do steps 1–3 only; do not run the tests, commit, or revert yourself, and run the benchmark only for profiling.
+
+## Lessons from Previous Autoresearch (ENF Optimization)
+
+These patterns delivered the biggest wins in the ENF autoresearch:
+
+- **ImmutableMap.Builder → LinkedHashMap**: Saved 20k ops/s. The `.build()` call hashes all keys, and Object.hashCode() on Apple Silicon triggers expensive `pthread_jit_write_protect_np`.
+- **ImmutableListMultimap → parallel ArrayList**: Saved 22k ops/s. Same hashCode issue. Replaced keyed multimap with index-aligned parallel lists.
+- **Avoid groupingBy when only checking group count**: Saved 13k ops/s. Replaced full map creation with a boolean flag.
+- **Short-circuit for empty/single-element cases**: Multiple small wins from fast-pathing the common case.
+- **Cache lambda captures**: Reusing a Supplier field instead of creating `() -> value` per call.
+
+## Important Notes
+
+- The benchmark queries 10M scalar fields — execution engine overhead per field is the bottleneck.
+- `GraphQL.execute()` is the entry point; it calls `Execution.execute()` → `ExecutionStrategy.execute()`.
+- The execution engine is inherently recursive (fields within fields).
+- Guava is an existing dependency — you can use Guava utilities but nothing else new.
diff --git a/autoresearch-execution-large/run_benchmark.sh b/autoresearch-execution-large/run_benchmark.sh
new file mode 100755
index 000000000..2baeb914f
--- /dev/null
+++ b/autoresearch-execution-large/run_benchmark.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Runs the LargeInMemoryQuery throughput benchmark and extracts the score.
+# Usage: ./autoresearch-execution-large/run_benchmark.sh
+# Output: prints the benchmark score (ops/sec) to stdout, or "FAILED" on error.
+# Exit status: 0 when a score was extracted, 1 otherwise. Diagnostics go to stderr.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+cd "$PROJECT_DIR"
+
+echo "=== Running LargeInMemoryQuery throughput benchmark ===" >&2
+# Record the gradle status instead of letting `set -e` end the script: callers rely
+# on the "FAILED" sentinel on stdout, which must be printed on every failure path.
+GRADLE_STATUS=0
+BENCHMARK_OUTPUT=$(./gradlew jmh \
+    -PjmhInclude="performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput" \
+    -PjmhFork=1 \
+    -PjmhIterations=3 \
+    -PjmhWarmupIterations=2 \
+    2>&1) || GRADLE_STATUS=$?
+
+# Extract score from JMH output line like:
+#   LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput  thrpt  3  XX.XXX ± Y.YYY  ops/s
+# A single awk (rather than grep | awk) yields an empty result when nothing matches
+# instead of a failing pipeline under pipefail; the last matching line wins.
+SCORE=""
+if [ "$GRADLE_STATUS" -eq 0 ]; then
+    SCORE=$(printf '%s\n' "$BENCHMARK_OUTPUT" \
+        | awk '/benchMarkSimpleQueriesThroughput[[:space:]]+thrpt/ && NF > 3 { score = $(NF-3) } END { if (score != "") print score }')
+fi
+
+if [ -z "$SCORE" ]; then
+    if [ "$GRADLE_STATUS" -ne 0 ]; then
+        echo "FAILED: gradle exited with status $GRADLE_STATUS" >&2
+    else
+        echo "FAILED: could not extract benchmark score" >&2
+    fi
+    echo "Last 20 lines of output:" >&2
+    echo "$BENCHMARK_OUTPUT" | tail -20 >&2
+    echo "FAILED"
+    exit 1
+fi
+
+echo "Score: $SCORE ops/s" >&2
+echo "$SCORE"