From adb81f265a7b890764cc07d89d5380c71ac7c045 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Wed, 27 May 2026 20:25:51 +0000 Subject: [PATCH 01/24] chore: rationalize SessionImpl field visibility Document lock ownership before later steps remove the lock. Add @GuardedBy to fields under the lock, move heartbeatInterval read into the synchronized block in startRpc(), and comment fields intentionally read outside the lock. No runtime change. --- .../data/v2/internal/session/SessionImpl.java | 28 +++++++++++++++++-- .../v2/internal/session/SessionPoolImpl.java | 10 ++++++- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 9ee86bffbdbd..8dcebbfc22d9 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -94,12 +94,24 @@ public class SessionImpl implements Session, VRpcSessionApi { @GuardedBy("lock") private Instant lastStateChangedAt; + // Set once under lock in start(), then read freely from gRPC callbacks without the lock. + // Safe because start() is always called before any callback fires, so the write is + // visible to all subsequent readers through the happens-before chain from stream.start(). private Listener sessionListener; + // volatile: written under lock in handleSessionRefreshConfigResponse(); read without lock in + // getOpenParams() and isOpenParamsUpdated() so callers get a consistent (if possibly stale) + // snapshot without contending on the lock. Stale reads are acceptable for these accessors. private volatile OpenParams openParams; private volatile boolean openParamsUpdated; + // closeReason is written under lock in close(), forceClose(), handleGoAwayResponse(), and + // dispatchStreamClosed(). The one read that occurs outside the lock — in dispatchStreamClosed + // after the synchronized block — runs on the same gRPC callback thread that just released the + // lock, so the lock's release-acquire edge provides the necessary visibility. A stale read is + // structurally impossible given the control flow (closeReason is always set before the lock + // is released on every path that reaches that read site). @Nullable private CloseSessionRequest closeReason = null; @GuardedBy("lock") @@ -112,10 +124,18 @@ public class SessionImpl implements Session, VRpcSessionApi { @GuardedBy("lock") private VRpcResult currentCancel = null; + @GuardedBy("lock") private SessionParametersResponse sessionParameters = DEFAULT_SESSION_PARAMS; + + // volatile: written under lock in handleSessionParamsResponse(); read without lock in + // handleHeartBeatResponse() where a stale read is acceptable — the heartbeat deadline is a + // soft scheduling hint, not a correctness invariant. startRpc() reads this inside the lock. private volatile Duration heartbeatInterval = Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); + // volatile: written from multiple sites without holding the lock (startRpc, handleVRpc*, + // handleHeartBeatResponse). Stale reads are acceptable — nextHeartbeat is used only as a + // scheduling hint by the pool's heartbeat monitor. private volatile Instant nextHeartbeat; public SessionImpl( @@ -330,9 +350,6 @@ VRpc newCall(VRpcDescriptor descriptor) { @Override public Status startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { - // start monitoring for heartbeat when the vrpc is started - this.nextHeartbeat = clock.instant().plus(heartbeatInterval); - synchronized (lock) { if (currentRpc != null) { return Status.INTERNAL.withDescription( @@ -345,6 +362,11 @@ public Status startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { this.currentRpc = rpc; stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); + // Start monitoring for heartbeat when the vRPC is started. heartbeatInterval is read + // inside the lock to avoid a race with handleSessionParamsResponse(). nextHeartbeat is + // volatile and written here without an atomicity guarantee — that is intentional; it is + // only a scheduling hint (see field comment). + this.nextHeartbeat = clock.instant().plus(heartbeatInterval); return Status.OK; } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index 35884cb74319..c175509d5f33 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -140,6 +140,10 @@ private enum PoolState { private final DebugTagTracer debugTagTracer; + // @SuppressWarnings("GuardedBy"): error-prone flags writes to @GuardedBy("this") fields + // (sessions, picker, poolSizer, pendingRpcs, budget, retryCreateSessionFuture) inside the + // constructor without holding the monitor. This is safe because the object is not yet published + // to other threads — no external reference exists until the constructor returns. @SuppressWarnings("GuardedBy") public SessionPoolImpl( Metrics metrics, @@ -164,6 +168,7 @@ public SessionPoolImpl( createInitialBudget(configManager.getClientConfiguration())); } + // @SuppressWarnings("GuardedBy"): same rationale as the public constructor above. @SuppressWarnings("GuardedBy") @VisibleForTesting SessionPoolImpl( @@ -751,7 +756,10 @@ static class Watchdog implements Runnable { private final Clock clock; private final DebugTagTracer debugTagTracer; - // TODO: fix lock sharing + // The `lock` parameter is the pool-wide monitor (SessionPoolImpl.this). It is typed as Object + // because Watchdog is a static nested class and cannot reference the outer instance type in its + // constructor signature without creating a circular dependency. Phase 5 will replace this with + // a properly typed lock once the per-AFE sharding model is established. public Watchdog( Object lock, ScheduledExecutorService executor, From 0854204b15667f2d04433d2e9fc4c4b4ac8955a2 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Fri, 12 Jun 2026 22:28:34 +0000 Subject: [PATCH 02/24] chore: migrate pool scheduling to a hashed-wheel timer Replace ScheduledExecutorService with a BigtableTimer (Netty hashed wheel, will move in-tree later) for heartbeat, deadline, watchdog, AFE-prune, retry-create-session, and retry-delay. Owned by Client and shared across pools. --- .../v2/internal/api/AuthorizedViewAsync.java | 6 +- .../bigtable/data/v2/internal/api/Client.java | 17 +- .../internal/api/MaterializedViewAsync.java | 6 +- .../data/v2/internal/api/TableAsync.java | 6 +- .../data/v2/internal/api/TableBase.java | 19 +- .../v2/internal/middleware/RetryingVRpc.java | 38 ++-- .../v2/internal/session/BigtableTimer.java | 60 ++++++ .../v2/internal/session/NettyWheelTimer.java | 78 ++++++++ .../data/v2/internal/session/SessionImpl.java | 76 +++++++- .../data/v2/internal/session/SessionList.java | 24 --- .../v2/internal/session/SessionPoolImpl.java | 182 ++++++++++++------ .../data/v2/internal/api/TableBaseTest.java | 4 +- .../internal/csm/tracers/VRpcTracerTest.java | 15 +- .../internal/middleware/RetryingVRpcTest.java | 25 ++- .../v2/internal/session/SessionImplTest.java | 20 +- .../internal/session/SessionPoolImplTest.java | 50 +++-- .../v2/internal/session/WatchdogTest.java | 35 +++- 17 files changed, 487 insertions(+), 174 deletions(-) create mode 100644 java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java create mode 100644 java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java index 3edacf7766e0..fdb79871324f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java @@ -26,13 +26,13 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ScheduledExecutorService; public class AuthorizedViewAsync implements AutoCloseable, Closeable { @@ -48,7 +48,7 @@ static AuthorizedViewAsync createAndStart( String viewId, Permission permission, Metrics metrics, - ScheduledExecutorService executorService) { + BigtableTimer timer) { AuthorizedViewName viewName = AuthorizedViewName.builder() @@ -78,7 +78,7 @@ static AuthorizedViewAsync createAndStart( callOptions, viewName.toString(), metrics, - executorService); + timer); return new AuthorizedViewAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index 82ddff08c3cb..5214c2480131 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -33,7 +33,9 @@ import com.google.cloud.bigtable.data.v2.internal.csm.MetricsImpl; import com.google.cloud.bigtable.data.v2.internal.csm.NoopMetrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; +import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.opencensus.stats.Stats; @@ -71,6 +73,10 @@ public class Client implements AutoCloseable { private final FeatureFlags featureFlags; private final ClientInfo clientInfo; private final Resource backgroundExecutor; + // Hashed-wheel timer for heartbeat / deadline / watchdog / retry scheduling. Built over + // backgroundExecutor (the timer's tick thread dispatches bodies onto it). Single tick thread per + // Client, shared across every SessionPoolImpl. + private final BigtableTimer sessionTimer; private final CallOptions defaultCallOptions; private final ChannelPool channelPool; @@ -166,6 +172,9 @@ public Client( this.metrics = metrics; this.configManager = configManager; this.backgroundExecutor = bgExecutor; + // Timer's tick thread dispatches bodies onto backgroundExecutor — tick-thread-blocking work + // (anything that takes a pool lock) gets handed off there instead of stalling the wheel. + this.sessionTimer = new NettyWheelTimer("bigtable-session-timer", bgExecutor.get()); defaultCallOptions = CallOptions.DEFAULT; @@ -202,6 +211,8 @@ public void close() { metrics.close(); channelPool.close(); configManager.close(); + // Stop the timer before tearing down backgroundExecutor (the timer's dispatcher). + sessionTimer.stop(); backgroundExecutor.close(); } @@ -216,7 +227,7 @@ public TableAsync openTableAsync(String tableId, Permission permission) { tableId, permission, metrics.get(), - backgroundExecutor.get()); + sessionTimer); sessionPools.add(tableAsync.getSessionPool()); return tableAsync; } @@ -234,7 +245,7 @@ public AuthorizedViewAsync openAuthorizedViewAsync( viewId, permission, metrics.get(), - backgroundExecutor.get()); + sessionTimer); sessionPools.add(viewAsync.getSessionPool()); return viewAsync; } @@ -251,7 +262,7 @@ public MaterializedViewAsync openMaterializedViewAsync( viewId, permission, metrics.get(), - backgroundExecutor.get()); + sessionTimer); sessionPools.add(viewAsync.getSessionPool()); return viewAsync; } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java index 70023645efc2..81a76f876fe6 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java @@ -23,13 +23,13 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ScheduledExecutorService; public class MaterializedViewAsync implements AutoCloseable, Closeable { @@ -44,7 +44,7 @@ public static MaterializedViewAsync createAndStart( String viewId, OpenMaterializedViewRequest.Permission permission, Metrics metrics, - ScheduledExecutorService executorService) { + BigtableTimer timer) { MaterializedViewName viewName = MaterializedViewName.builder() @@ -73,7 +73,7 @@ public static MaterializedViewAsync createAndStart( callOptions, viewId, metrics, - executorService); + timer); return new MaterializedViewAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java index 0bc699d4f146..8fb0ac27a2d3 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java @@ -27,13 +27,13 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ScheduledExecutorService; public class TableAsync implements AutoCloseable, Closeable { private final TableBase base; @@ -47,7 +47,7 @@ public static TableAsync createAndStart( String tableId, Permission permission, Metrics metrics, - ScheduledExecutorService executorService) { + BigtableTimer timer) { TableName tableName = TableName.builder() @@ -76,7 +76,7 @@ public static TableAsync createAndStart( callOptions, tableId, metrics, - executorService); + timer); return new TableAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java index 8feef399d625..af3bf306856f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java @@ -31,6 +31,7 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolImpl; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import com.google.common.annotations.VisibleForTesting; @@ -39,11 +40,10 @@ import io.grpc.Context; import io.grpc.Deadline; import io.grpc.Metadata; -import java.util.concurrent.ScheduledExecutorService; class TableBase implements AutoCloseable { private final SessionPool sessionPool; - private final ScheduledExecutorService backgroundExecutor; + private final BigtableTimer timer; private final Metrics metrics; private final VRpcDescriptor readRowDescriptor; private final VRpcDescriptor @@ -61,7 +61,7 @@ static TableBase createAndStart( CallOptions callOptions, String sessionPoolName, Metrics metrics, - ScheduledExecutorService executor) { + BigtableTimer timer) { SessionPool sessionPool = new SessionPoolImpl<>( @@ -73,11 +73,12 @@ static TableBase createAndStart( callOptions, sessionDescriptor, sessionPoolName, - executor); + timer); sessionPool.start(openReq, new Metadata()); - return new TableBase(sessionPool, readRowDescriptor, mutateRowDescriptor, metrics, executor); + return new TableBase( + sessionPool, readRowDescriptor, mutateRowDescriptor, metrics, timer); } @VisibleForTesting @@ -86,12 +87,12 @@ static TableBase createAndStart( VRpcDescriptor readRowDescriptor, VRpcDescriptor mutateRowDescriptor, Metrics metrics, - ScheduledExecutorService executor) { + BigtableTimer timer) { this.sessionPool = sessionPool; this.readRowDescriptor = readRowDescriptor; this.mutateRowDescriptor = mutateRowDescriptor; this.metrics = metrics; - this.backgroundExecutor = executor; + this.timer = timer; } @Override @@ -110,7 +111,7 @@ public SessionPool getSessionPool() { public void readRow( SessionReadRowRequest req, VRpcListener listener, Deadline deadline) { RetryingVRpc retry = - new RetryingVRpc<>(() -> sessionPool.newCall(readRowDescriptor), backgroundExecutor); + new RetryingVRpc<>(() -> sessionPool.newCall(readRowDescriptor), timer); VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), readRowDescriptor, deadline); VRpcCallContext ctx = VRpcCallContext.create(deadline, true, tracer); @@ -126,7 +127,7 @@ public void mutateRow( VRpcListener listener, Deadline deadline) { RetryingVRpc retry = - new RetryingVRpc<>(() -> sessionPool.newCall(mutateRowDescriptor), backgroundExecutor); + new RetryingVRpc<>(() -> sessionPool.newCall(mutateRowDescriptor), timer); boolean idempotent = Util.isIdempotent(req.getMutationsList()); diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java index d6048bfb9140..0e19fe076d63 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java @@ -17,6 +17,7 @@ package com.google.cloud.bigtable.data.v2.internal.middleware; import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.common.base.Stopwatch; import com.google.protobuf.Duration; import com.google.protobuf.util.Durations; @@ -26,7 +27,6 @@ import io.grpc.SynchronizationContext; import java.util.Optional; import java.util.concurrent.RejectedExecutionException; -import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import java.util.logging.Level; @@ -46,7 +46,7 @@ public class RetryingVRpc implements VRpc { private VRpcCallContext context; private VRpcTracer tracer; - private final ScheduledExecutorService executor; + private final BigtableTimer timer; private final SynchronizationContext syncContext; // current state and all the flags don't need to be volatile because they're only updated within @@ -56,13 +56,13 @@ public class RetryingVRpc implements VRpc { // Breaks the loop if uncaught exception happens during sync context execution. private boolean isCancelling; - public RetryingVRpc(Supplier> supplier, ScheduledExecutorService executor) { + public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { this.attemptFactory = supplier; grpcContext = Context.current(); otelContext = io.opentelemetry.context.Context.current(); - this.executor = otelContext.wrap(executor); + this.timer = timer; this.syncContext = new SynchronizationContext( (t, e) -> { @@ -271,7 +271,7 @@ boolean shouldRetry(VRpcResult result) { class Scheduled extends State { private final Duration retryDelay; - private SynchronizationContext.ScheduledHandle future; + private BigtableTimer.Timeout future; Scheduled(Duration retryDelay) { this.retryDelay = retryDelay; @@ -280,12 +280,23 @@ class Scheduled extends State { @Override public void onStart() { try { + // Wraps go innermost so the captured gRPC + OpenTelemetry contexts are re-established at + // the moment the body runs, not just while the dispatcher is invoking the outer task. + // syncContext.execute may queue the inner runnable for a later drain on a different + // thread; an outer wrap's scope would already be closed by then. future = - syncContext.schedule( - () -> grpcContext.wrap(() -> onStateChange(new Idle())).run(), + timer.newTimeout( + () -> + syncContext.execute( + () -> + grpcContext.wrap( + () -> + otelContext + .wrap(() -> onStateChange(new Idle())) + .run()) + .run()), Durations.toMillis(retryDelay), - TimeUnit.MILLISECONDS, - executor); + TimeUnit.MILLISECONDS); } catch (RejectedExecutionException e) { onStateChange( new Done( @@ -299,11 +310,10 @@ public void onStart() { @Override public void onCancel(String reason, Throwable throwable) { - // future can be null if schedule throws an exception that's not RejectedExecutionException. - // In which case sync context uncaught exception handler will be called, which calls cancel on - // the current - // state before transition into done state. - if (future != null && future.isPending()) { + // future can be null if newTimeout throws an exception. In which case sync context uncaught + // exception handler will be called, which calls cancel on the current state before + // transition into done state. + if (future != null && !future.isCancelled()) { future.cancel(); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java new file mode 100644 index 000000000000..48930ece77e1 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java @@ -0,0 +1,60 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.data.v2.internal.session; + +import java.util.concurrent.TimeUnit; + +/** + * Schedules short-lived callbacks (heartbeat ticks, deadline monitors, watchdog ticks) at + * approximate, low-resolution times. Backed by a hashed wheel: O(1) insert and O(1) cancel, + * regardless of how many pending timeouts the wheel holds. + * + *

{@link #newTimeout} runs the callback on the timer's bundled dispatch executor — callers do + * not have to wrap their bodies in {@code executor.execute(...)} to stay off the tick thread. This + * is the default and is correct for any callback that takes a lock or does real work. + * + *

TODO: once later refactor steps introduce per-op / per-session dispatchers (e.g. {@code + * SerializingExecutor} or {@code SynchronizationContext}), add a {@code newTimeoutOnTickThread} + * variant so callers with their own dispatcher can skip the wasted hop through the bundled + * executor. + * + *

This is a thin abstraction over a single concrete implementation today (see {@code + * NettyWheelTimer}). It exists so the implementation can be swapped after benchmarking establishes + * a baseline. + */ +public interface BigtableTimer { + /** + * Schedules {@code task} to run after {@code delay}. The task body runs on the timer's bundled + * dispatch executor, not on the tick thread, so it is safe to take locks or do bounded work. + * + *

The returned handle can be used to cancel the task; cancel is O(1) and does not leave the + * entry in any heap. + */ + Timeout newTimeout(Runnable task, long delay, TimeUnit unit); + + /** + * Releases the tick thread and discards any pending timeouts. Idempotent. After {@code stop()}, + * subsequent calls to {@link #newTimeout} throw {@link IllegalStateException}. + */ + void stop(); + + interface Timeout { + /** Cancels the scheduled task. Returns true if the task had not yet fired. */ + boolean cancel(); + + boolean isCancelled(); + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java new file mode 100644 index 000000000000..825065055110 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java @@ -0,0 +1,78 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.data.v2.internal.session; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import io.grpc.netty.shaded.io.netty.util.HashedWheelTimer; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; + +/** + * {@link BigtableTimer} backed by Netty's {@code HashedWheelTimer}, accessed via the shaded copy + * inside {@code grpc-netty-shaded}. We depend on the shaded class so the library does not pull in + * an additional {@code io.netty:netty-common} artifact. + * + *

Temporary: once threading-refactor benchmarks establish a baseline, this should be replaced + * with an in-tree implementation that does not reach into gRPC's shaded internals. + */ +public final class NettyWheelTimer implements BigtableTimer { + // 10 ms tick × 512 buckets ≈ 5 s per rotation. Heartbeat (100 ms), deadlines (sub-second to + // seconds), and watchdog (5 min) all sit comfortably inside this resolution. + private static final long TICK_DURATION_MS = 10; + private static final int TICKS_PER_WHEEL = 512; + + private final HashedWheelTimer delegate; + private final Executor dispatcher; + + public NettyWheelTimer(String name, Executor dispatcher) { + this.dispatcher = dispatcher; + this.delegate = + new HashedWheelTimer( + new ThreadFactoryBuilder().setNameFormat(name + "-%d").setDaemon(true).build(), + TICK_DURATION_MS, + TimeUnit.MILLISECONDS, + TICKS_PER_WHEEL); + } + + @Override + public Timeout newTimeout(Runnable task, long delay, TimeUnit unit) { + return new TimeoutHandle( + delegate.newTimeout(ignored -> dispatcher.execute(task), delay, unit)); + } + + @Override + public void stop() { + delegate.stop(); + } + + private static final class TimeoutHandle implements Timeout { + private final io.grpc.netty.shaded.io.netty.util.Timeout nettyTimeout; + + TimeoutHandle(io.grpc.netty.shaded.io.netty.util.Timeout nettyTimeout) { + this.nettyTimeout = nettyTimeout; + } + + @Override + public boolean cancel() { + return nettyTimeout.cancel(); + } + + @Override + public boolean isCancelled() { + return nettyTimeout.isCancelled(); + } + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 8dcebbfc22d9..832c08943913 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -49,6 +49,7 @@ import java.time.Instant; import java.util.Locale; import java.util.Optional; +import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; @@ -70,6 +71,12 @@ public class SessionImpl implements Session, VRpcSessionApi { // A time in the future to skip heartbeat checks when there's no active vRPCs on the session static final Duration FUTURE_TIME = Duration.ofMinutes(30); + private static final CloseSessionRequest MISSED_HEARTBEAT_CLOSE_REQUEST = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_MISSED_HEARTBEAT) + .setDescription("missed heartbeat") + .build(); + /* * This lock should be mostly uncontended - all access should be naturally interleaved. Contention * can only really happen when an unsolicited gRPC control message (ie GOAWAY) arrives at the same @@ -79,6 +86,7 @@ public class SessionImpl implements Session, VRpcSessionApi { private final Object lock = new Object(); private final Clock clock; + private final BigtableTimer timer; private final SessionTracer tracer; private final DebugTagTracer debugTagTracer; @@ -135,12 +143,23 @@ public class SessionImpl implements Session, VRpcSessionApi { // volatile: written from multiple sites without holding the lock (startRpc, handleVRpc*, // handleHeartBeatResponse). Stale reads are acceptable — nextHeartbeat is used only as a - // scheduling hint by the pool's heartbeat monitor. + // scheduling hint by the per-session heartbeat tick. private volatile Instant nextHeartbeat; + // Handle for the in-flight heartbeat tick (one outstanding at a time). Set under lock when the + // session enters READY (handleOpenSessionResponse) and again from checkHeartbeat to chain the + // next tick. Cancelled under lock from updateState when the session transitions past READY. + @GuardedBy("lock") + @Nullable + private BigtableTimer.Timeout heartbeatTimeout; + public SessionImpl( - Metrics metrics, SessionPoolInfo poolInfo, long sessionNum, SessionStream stream) { - this(metrics, Clock.systemUTC(), poolInfo, sessionNum, stream); + Metrics metrics, + SessionPoolInfo poolInfo, + long sessionNum, + SessionStream stream, + BigtableTimer timer) { + this(metrics, Clock.systemUTC(), poolInfo, sessionNum, stream, timer); } SessionImpl( @@ -148,8 +167,10 @@ public SessionImpl( Clock clock, SessionPoolInfo poolInfo, long sessionNum, - SessionStream stream) { + SessionStream stream, + BigtableTimer timer) { this.clock = clock; + this.timer = timer; this.info = SessionInfo.create(poolInfo, sessionNum); this.stream = stream; this.tracer = metrics.newSessionTracer(poolInfo); @@ -383,6 +404,46 @@ public void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable } } + @GuardedBy("lock") + private void scheduleHeartbeatCheck() { + heartbeatTimeout = + timer.newTimeout( + this::checkHeartbeat, + HEARTBEAT_CHECK_INTERVAL.toMillis(), + TimeUnit.MILLISECONDS); + } + + @GuardedBy("lock") + private void cancelHeartbeatTimeout() { + if (heartbeatTimeout != null) { + heartbeatTimeout.cancel(); + heartbeatTimeout = null; + } + } + + // Runs on the wheel-timer tick thread. Takes the per-session lock to read state/nextHeartbeat + // and force-close on miss, then chains the next tick by re-scheduling. If the session is past + // WAIT_SERVER_CLOSE we drop the chain — no further checks are useful. + private void checkHeartbeat() { + CloseSessionRequest missed = null; + synchronized (lock) { + if (state.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { + return; + } + if (clock.instant().isAfter(nextHeartbeat)) { + missed = MISSED_HEARTBEAT_CLOSE_REQUEST; + } else { + scheduleHeartbeatCheck(); + } + } + if (missed != null) { + logger.warning( + String.format("Missed heartbeat for %s, forcing session close", info.getLogName())); + // forceClose acquires the lock again and performs its own state checks. + forceClose(missed); + } + } + // region SessionStream event handlers private void dispatchResponseMessage(SessionResponse message) { switch (message.getPayloadCase()) { @@ -434,6 +495,7 @@ private void handleOpenSessionResponse(OpenSessionResponse openSession) { } localPeerInfo = stream.getPeerInfo(); updateState(SessionState.READY); + scheduleHeartbeatCheck(); } tracer.onOpen(localPeerInfo); sessionListener.onReady(openSession); @@ -703,6 +765,12 @@ private void dispatchStreamClosed(Status status, Metadata trailers) { private void updateState(SessionState newState) { this.state = newState; this.lastStateChangedAt = clock.instant(); + // Once we're past READY, no further heartbeat checks are useful: checkHeartbeat short-circuits + // on state.phase >= WAIT_SERVER_CLOSE. Cancel any pending tick to keep the wheel clean during + // session churn. + if (newState.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { + cancelHeartbeatTimeout(); + } } private static String formatPeerInfo(PeerInfo peerInfo) { diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java index cba048fe0ce0..9beccf40fd85 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java @@ -16,15 +16,12 @@ package com.google.cloud.bigtable.data.v2.internal.session; -import static com.google.bigtable.v2.CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_MISSED_HEARTBEAT; - import com.google.auto.value.AutoValue; import com.google.bigtable.v2.CloseSessionRequest; import com.google.bigtable.v2.PeerInfo; import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcResult; import com.google.cloud.bigtable.data.v2.internal.session.Session.SessionState; import com.google.common.annotations.VisibleForTesting; -import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.time.temporal.ChronoUnit; @@ -41,7 +38,6 @@ import java.util.Queue; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.concurrent.NotThreadSafe; @@ -69,12 +65,6 @@ class SessionList { private final Set allSessions = new HashSet<>(); private final Set inUseSessions = new HashSet<>(); - private final CloseSessionRequest missedHeartbeatCloseRequest = - CloseSessionRequest.newBuilder() - .setReason(CLOSE_SESSION_REASON_MISSED_HEARTBEAT) - .setDescription("missed heartbeat") - .build(); - // pool level statistics across all the afes private final PoolStats poolStats = new PoolStats(); @@ -145,20 +135,6 @@ void prune() { } } - void checkHeartbeat(Clock clock) { - Instant now = clock.instant(); - inUseSessions.forEach( - handle -> { - if (now.isAfter(handle.getSession().getNextHeartbeat())) { - LOG.log( - Level.WARNING, - "Missed heartbeat for {0}, forcing session close", - handle.getSession().getLogName()); - handle.getSession().forceClose(missedHeartbeatCloseRequest); - } - }); - } - @NotThreadSafe class SessionHandle { private final Session session; diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index c175509d5f33..eaf2289097d7 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -59,12 +59,11 @@ import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Deque; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.Set; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; @@ -112,6 +111,10 @@ private enum PoolState { private final Watchdog watchdog; + // Shared by every SessionImpl, PendingVRpc, Watchdog, AFE pruner, and retry-create-session in + // this pool. One tick thread per Client (owned by Client); O(1) insert / O(1) cancel. + private final BigtableTimer timer; + @GuardedBy("this") private int consecutiveFailures = 0; @@ -122,14 +125,16 @@ private enum PoolState { */ private volatile int consecutiveUnimplementedFailures = 0; - private final ScheduledFuture afeListPruneTask; - - private final ScheduledFuture heartbeatMonitor; + // Self-rescheduling AFE-prune chain. Set by scheduleNextAfePrune; cancelled by close. + @GuardedBy("this") + @Nullable + private BigtableTimer.Timeout afeListPruneTimeout; - private final ScheduledExecutorService executorService; + @GuardedBy("this") + private boolean closed = false; @GuardedBy("this") - private ScheduledFuture retryCreateSessionFuture = null; + private BigtableTimer.Timeout retryCreateSessionFuture = null; // TODO: get the max from ClientConfiguration @GuardedBy("this") @@ -154,7 +159,7 @@ public SessionPoolImpl( CallOptions callOptions, SessionDescriptor sessionDescriptor, String name, - ScheduledExecutorService executorService) { + BigtableTimer timer) { this( metrics, featureFlags, @@ -164,7 +169,7 @@ public SessionPoolImpl( callOptions, sessionDescriptor, name, - executorService, + timer, createInitialBudget(configManager.getClientConfiguration())); } @@ -180,7 +185,7 @@ public SessionPoolImpl( CallOptions callOptions, SessionDescriptor sessionDescriptor, String name, - ScheduledExecutorService executorService, + BigtableTimer timer, SessionCreationBudget budget) { this.metrics = metrics; this.featureFlags = featureFlags; @@ -188,7 +193,9 @@ public SessionPoolImpl( this.factory = new SessionFactory(channelPool, sessionDescriptor.getMethodDescriptor(), callOptions); this.descriptor = sessionDescriptor; - this.executorService = executorService; + // Timer is owned by the caller (typically Client) and shared across pools — do NOT stop it + // in close(). + this.timer = timer; sessions = new SessionList(); LoadBalancingOptions lbOptions = @@ -210,29 +217,9 @@ public SessionPoolImpl( debugTagTracer = metrics.getDebugTagTracer(); // Watchdog checks for sessions in WAIT_SERVER_CLOSE state and runs every 5 minutes - watchdog = new Watchdog(this, executorService, Duration.ofMinutes(5), sessions, debugTagTracer); - // Heartbeat monitor checks for sessions in READY state with active vRPCs and runs more - // frequently - heartbeatMonitor = - executorService.scheduleAtFixedRate( - () -> { - synchronized (SessionPoolImpl.this) { - sessions.checkHeartbeat(Clock.systemUTC()); - } - }, - SessionImpl.HEARTBEAT_CHECK_INTERVAL.toMillis(), - SessionImpl.HEARTBEAT_CHECK_INTERVAL.toMillis(), - TimeUnit.MILLISECONDS); - afeListPruneTask = - executorService.scheduleAtFixedRate( - () -> { - synchronized (SessionPoolImpl.this) { - sessions.prune(); - } - }, - SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), - SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), - TimeUnit.MILLISECONDS); + watchdog = new Watchdog(this, timer, Duration.ofMinutes(5), sessions, debugTagTracer); + // Heartbeat monitoring is now done per-session via SessionImpl.scheduleHeartbeatCheck. + scheduleNextAfePrune(); this.budget = budget; @@ -266,19 +253,54 @@ public void close(CloseSessionRequest req) { logger.fine(String.format("Closing session pool %s for reason %s", info.getLogName(), req)); poolState = PoolState.CLOSED; + closed = true; for (PendingVRpc pendingRpc : pendingRpcs) { pendingRpc.cancel("SessionPool closed: " + req, null); } - afeListPruneTask.cancel(false); - heartbeatMonitor.cancel(false); + if (afeListPruneTimeout != null) { + afeListPruneTimeout.cancel(); + afeListPruneTimeout = null; + } if (retryCreateSessionFuture != null) { - retryCreateSessionFuture.cancel(false); + retryCreateSessionFuture.cancel(); retryCreateSessionFuture = null; } watchdog.close(); sessions.close(req); } + + // Timer is owned by the Client and shared across pools; do not stop it here. + } + + // Self-rescheduling AFE prune. Pattern matches Watchdog: tick dispatches to the pool executor, + // executor takes the lock, prunes, schedules the next tick. Tolerates body exceptions so a + // transient fault does not permanently disable pruning. + @GuardedBy("this") + private void scheduleNextAfePrune() { + if (closed) { + return; + } + afeListPruneTimeout = + timer.newTimeout( + this::runAfePruneAndReschedule, + SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), + TimeUnit.MILLISECONDS); + } + + private void runAfePruneAndReschedule() { + synchronized (SessionPoolImpl.this) { + try { + if (closed) { + return; + } + sessions.prune(); + } catch (Throwable t) { + logger.log(Level.WARNING, "AFE prune tick threw; continuing", t); + } finally { + scheduleNextAfePrune(); + } + } } @Override @@ -350,7 +372,7 @@ private synchronized void createSession(OpenParams openParams) { try (Scope ignored = io.opentelemetry.context.Context.root().makeCurrent()) { SessionStream stream = factory.createNew(); - Session session = new SessionImpl(metrics, info, sessionNum++, stream); + Session session = new SessionImpl(metrics, info, sessionNum++, stream, timer); SessionHandle handle = sessions.newHandle(session); Metadata localMd = new Metadata(); @@ -393,9 +415,9 @@ private void maybeScheduleCreateSessionRetry() { retryIntervalMs = 1; } retryCreateSessionFuture = - executorService.schedule( + timer.newTimeout( () -> { - synchronized (this) { + synchronized (SessionPoolImpl.this) { retryCreateSessionFuture = null; if (poolState != PoolState.CLOSED && poolSizer.getScaleDelta() > 0) { createSession(openParams); @@ -620,7 +642,7 @@ class PendingVRpc implements VRpc realCall; - private ScheduledFuture deadlineMonitor; + private BigtableTimer.Timeout deadlineMonitor; public PendingVRpc(VRpcDescriptor desc) { this.desc = desc; @@ -636,7 +658,7 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { this.req = req; this.ctx = ctx; this.listener = listener; - this.deadlineMonitor = monitorDeadline(executorService, ctx.getOperationInfo().getDeadline()); + this.deadlineMonitor = monitorDeadline(ctx.getOperationInfo().getDeadline()); synchronized (SessionPoolImpl.this) { if (SessionPoolImpl.this.poolState != PoolState.STARTED) { @@ -723,7 +745,7 @@ private void drainTo(SessionHandle handle) { } this.realCall.start(req, ctx, listener); if (deadlineMonitor != null) { - deadlineMonitor.cancel(false); + deadlineMonitor.cancel(); } } @@ -731,14 +753,15 @@ private VRpcListener getListener() { return listener; } - private ScheduledFuture monitorDeadline( - ScheduledExecutorService executorService, Deadline deadline) { - return executorService.schedule( + private BigtableTimer.Timeout monitorDeadline(Deadline deadline) { + // Body runs on the timer's bundled dispatcher (off the tick thread). + // onlyCancelPendingCall=true avoids racing with a user cancel that already attached a real + // call. + return timer.newTimeout( () -> - // This could race with user cancel. Setting onlyCancelPendingCall to true - // so that if the real call is set, this cancellation is going to be a noop. cancel( - Status.DEADLINE_EXCEEDED.withDescription("Deadline exceeded waiting for session"), + Status.DEADLINE_EXCEEDED.withDescription( + "Deadline exceeded waiting for session"), true), deadline.timeRemaining(TimeUnit.NANOSECONDS), TimeUnit.NANOSECONDS); @@ -749,36 +772,45 @@ static class Watchdog implements Runnable { private static final Logger LOG = Logger.getLogger(Watchdog.class.getName()); private final Object lock; - private final ScheduledExecutorService executor; + private final BigtableTimer timer; private final Duration interval; private final SessionList sessions; - private ScheduledFuture future; private final Clock clock; private final DebugTagTracer debugTagTracer; + // Guards currentTimeout and watchdogClosed. Self-contained — never composed with any external + // lock. + private final Object scheduleLock = new Object(); + + @GuardedBy("scheduleLock") + private BigtableTimer.Timeout currentTimeout; + + @GuardedBy("scheduleLock") + private boolean watchdogClosed = false; + // The `lock` parameter is the pool-wide monitor (SessionPoolImpl.this). It is typed as Object // because Watchdog is a static nested class and cannot reference the outer instance type in its // constructor signature without creating a circular dependency. Phase 5 will replace this with // a properly typed lock once the per-AFE sharding model is established. public Watchdog( Object lock, - ScheduledExecutorService executor, + BigtableTimer timer, Duration interval, SessionList sessionList, DebugTagTracer debugTagTracer) { - this(lock, executor, interval, sessionList, debugTagTracer, Clock.systemUTC()); + this(lock, timer, interval, sessionList, debugTagTracer, Clock.systemUTC()); } @VisibleForTesting Watchdog( Object lock, - ScheduledExecutorService executor, + BigtableTimer timer, Duration interval, SessionList sessionList, DebugTagTracer debugTagTracer, Clock clock) { this.lock = lock; - this.executor = executor; + this.timer = timer; this.interval = interval; this.sessions = sessionList; this.debugTagTracer = debugTagTracer; @@ -786,16 +818,40 @@ public Watchdog( } public void start() { - future = - executor.scheduleAtFixedRate( - this, interval.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); + scheduleNext(); + } + + // Self-reschedule. Called once from start() and again at the end of each tick. + private void scheduleNext() { + synchronized (scheduleLock) { + if (watchdogClosed) return; + currentTimeout = + timer.newTimeout(this::runAndReschedule, interval.toMillis(), TimeUnit.MILLISECONDS); + } + } + + private void runAndReschedule() { + try { + run(); + } catch (Throwable t) { + // Preserve the watchdog across body exceptions — unlike scheduleAtFixedRate, which silently + // stops the schedule on the first exception, we keep going so a transient fault doesn't + // permanently disable session leak detection. + LOG.log(Level.WARNING, "Watchdog tick threw; continuing", t); + } finally { + scheduleNext(); + } } @Override public void run() { + // Snapshot under the pool lock: getAllSessions() returns the live HashSet, and pool state + // mutation (session create/close on another thread) during iteration would throw + // ConcurrentModificationException. Most common trigger is a heartbeat-miss cascade that + // churns sessions while the watchdog is walking the list. Set allSessions; synchronized (lock) { - allSessions = sessions.getAllSessions(); + allSessions = new HashSet<>(sessions.getAllSessions()); } for (SessionHandle handle : allSessions) { @@ -833,8 +889,12 @@ public void run() { } public void close() { - if (future != null) { - future.cancel(false); + synchronized (scheduleLock) { + watchdogClosed = true; + if (currentTimeout != null) { + currentTimeout.cancel(); + currentTimeout = null; + } } } } diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java index 54406abe51a8..b2b6e819e959 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java @@ -29,6 +29,7 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolInfo; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.protobuf.Message; import io.grpc.Deadline; @@ -50,6 +51,7 @@ public class TableBaseTest { private final Metrics noopMetrics = new NoopMetrics(); private final ScheduledExecutorService mockExecutor = Mockito.mock(ScheduledExecutorService.class); + private final BigtableTimer mockTimer = Mockito.mock(BigtableTimer.class); private static final ClientInfo clientInfo = ClientInfo.builder() .setInstanceName( @@ -70,7 +72,7 @@ public void setup() { VRpcDescriptor.READ_ROW, VRpcDescriptor.MUTATE_ROW, noopMetrics, - mockExecutor); + mockTimer); deadline = Deadline.after(1, TimeUnit.MINUTES); f = new UnaryResponseFuture<>(); } diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java index 62a802dfeb0d..ec7dc6a01245 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java @@ -45,8 +45,10 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.RetryingVRpc; import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc; import com.google.cloud.bigtable.data.v2.internal.session.FakeDescriptor; +import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.Session; import com.google.cloud.bigtable.data.v2.internal.session.SessionFactory; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionImpl; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolInfo; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeServiceBuilder; @@ -97,6 +99,7 @@ public class VRpcTracerTest { Correspondence.transforming(MetricData::getName, "MetricData name"); private ScheduledExecutorService executor; + private BigtableTimer timer; private Server server; private ChannelPool channelPool; @@ -115,6 +118,7 @@ public class VRpcTracerTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + timer = new NettyWheelTimer("vrpc-tracer-test", com.google.common.util.concurrent.MoreExecutors.directExecutor()); server = FakeServiceBuilder.create(new FakeSessionService(executor)) .intercept(new PeerInfoInterceptor()) @@ -157,7 +161,7 @@ void setUp() throws IOException { SessionFactory sessionFactory = new SessionFactory( channelPool, FakeDescriptor.FAKE_SESSION.getMethodDescriptor(), CallOptions.DEFAULT); - session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); } @AfterEach @@ -173,6 +177,7 @@ void tearDown() { metrics.close(); server.shutdownNow(); executor.shutdownNow(); + timer.stop(); } @Test @@ -199,7 +204,7 @@ public void operationLatencyTest() throws Exception { CompletableFuture opFinished = new CompletableFuture<>(); Stopwatch stopwatch = Stopwatch.createStarted(); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture userFuture = new UnaryResponseFuture<>(); MethodInfo methodInfo = MethodInfo.builder().setName("Bigtable.ReadRow").setStreaming(false).build(); @@ -258,7 +263,7 @@ public void attemptLatencyTest() throws Exception { AtomicLong maxAttemptLatency = new AtomicLong(); DelayedVRpc delayedVRpc = new DelayedVRpc<>( - () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor)); + () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer)); UnaryResponseFuture userFuture = new UnaryResponseFuture<>(); MethodInfo methodInfo = MethodInfo.builder().setName("Bigtable.ReadRow").setStreaming(false).build(); @@ -316,7 +321,7 @@ public void retryCountTest() throws Exception { // Test RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); CompletableFuture opFinished = new CompletableFuture<>(); MethodInfo methodInfo = @@ -367,7 +372,7 @@ public void clientBlockingLatencySessionDelayTest() throws Exception { // Test DelayedVRpc delayedVRpc = new DelayedVRpc<>( - () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor)); + () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer)); UnaryResponseFuture f = new UnaryResponseFuture<>(); CompletableFuture attemptFinished = new CompletableFuture<>(); MethodInfo methodInfo = diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java index b9af925d4871..e823db5b3aea 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java @@ -38,9 +38,11 @@ import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; import com.google.cloud.bigtable.data.v2.internal.session.FakeDescriptor; +import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionFactory; import com.google.cloud.bigtable.data.v2.internal.session.SessionImpl; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolInfo; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeServiceBuilder; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeSessionListener; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeSessionService; @@ -74,6 +76,7 @@ @ExtendWith(MockitoExtension.class) public class RetryingVRpcTest { private ScheduledExecutorService executor; + private BigtableTimer timer; private Server server; private ChannelPool channelPool; @@ -88,6 +91,7 @@ public class RetryingVRpcTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + timer = new NettyWheelTimer("retrying-vrpc-test", com.google.common.util.concurrent.MoreExecutors.directExecutor()); server = FakeServiceBuilder.create(new FakeSessionService(executor)) .intercept(new PeerInfoInterceptor()) @@ -117,11 +121,12 @@ void tearDown() { channelPool.close(); server.shutdownNow(); executor.shutdownNow(); + timer.stop(); } @Test void noRetryTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); OpenSessionRequest openSessionRequest = @@ -133,7 +138,7 @@ void noRetryTest() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(0).build(), @@ -152,7 +157,7 @@ void noRetryTest() throws Exception { @Test public void retryServerError() throws Exception { int requestTag = 1; - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -199,7 +204,7 @@ public void retryServerError() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(requestTag).build(), @@ -218,7 +223,7 @@ public void retryServerError() throws Exception { @Test public void retryDeadlineRespectedTest() throws Exception { int requestTag = 1; - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -274,7 +279,7 @@ public void retryDeadlineRespectedTest() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(requestTag).build(), @@ -295,7 +300,7 @@ public void retryDeadlineRespectedTest() throws Exception { @Test public void vRpcFailureTest() throws Exception { // vrpc error on the session should not close the stream - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -333,7 +338,7 @@ public void vRpcFailureTest() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(0).build(), @@ -353,7 +358,7 @@ public void vRpcFailureTest() throws Exception { @Test void cancelInScheduledState() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -392,7 +397,7 @@ void cancelInScheduledState() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(1).build(), diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java index 838cea0e2ffe..c50e4f55d5c2 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java @@ -85,6 +85,7 @@ @ExtendWith(MockitoExtension.class) public class SessionImplTest { private ScheduledExecutorService executor; + private BigtableTimer timer; @Mock(answer = Answers.RETURNS_DEEP_STUBS) private Metrics metrics; @@ -98,6 +99,7 @@ public class SessionImplTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + timer = new NettyWheelTimer("session-impl-test", com.google.common.util.concurrent.MoreExecutors.directExecutor()); server = FakeServiceBuilder.create(new FakeSessionService(executor)) .intercept(new PeerInfoInterceptor()) @@ -128,12 +130,13 @@ void setUp() throws IOException { void tearDown() { channelPool.close(); server.shutdownNow(); + timer.stop(); executor.shutdownNow(); } @Test void sessionSendAndCloseTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); OpenSessionRequest openSessionRequest = @@ -163,7 +166,7 @@ void sessionSendAndCloseTest() throws Exception { @Test void sessionCloseBeforeInit() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); OpenSessionRequest openSessionRequest = @@ -180,7 +183,7 @@ void sessionCloseBeforeInit() throws Exception { @Test void sessionGoAwayTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); Duration goAwayDelay = Duration.ofMillis(500); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -268,7 +271,7 @@ void sessionGoAwayTest() throws Exception { @Test void streamErrorDuringRpcTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); Status.Code actualCode = Status.Code.INTERNAL; @@ -337,7 +340,7 @@ void streamErrorDuringRpcTest() throws Exception { @Test void rpcErrorDuringRpcTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); com.google.rpc.Status expectedRpcStatus = com.google.rpc.Status.newBuilder() @@ -404,7 +407,7 @@ void rpcErrorDuringRpcTest() throws Exception { @Test void localErrorTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); session.start( @@ -451,7 +454,8 @@ void testHeartbeat() throws Exception { Instant time = clock.instant(); - SessionImpl session = new SessionImpl(metrics, clock, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = + new SessionImpl(metrics, clock, poolInfo, 0, sessionFactory.createNew(), timer); int keepAliveDurationMs = 150; @@ -507,7 +511,7 @@ void testHeartbeat() throws Exception { @Test void testCancel() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); int responseDelayMs = 200; // Configure the fake service to delay the response, giving us time to cancel it diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java index ea142d8a7449..ceb84d4b0e59 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java @@ -21,6 +21,8 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyLong; import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.longThat; +import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; @@ -110,6 +112,7 @@ public class SessionPoolImplTest { Correspondence.transforming(SessionRequest::getOpenSession, "open session"); private ScheduledExecutorService executor; + private NettyWheelTimer testTimer; @Mock(answer = Answers.RETURNS_DEEP_STUBS) private Metrics metrics; @@ -126,6 +129,7 @@ public class SessionPoolImplTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + testTimer = new NettyWheelTimer("test-timer", com.google.common.util.concurrent.MoreExecutors.directExecutor()); fakeService = new FakeSessionService(executor); headerInterceptor = new HeaderInterceptor(); server = @@ -159,7 +163,7 @@ void setUp() throws IOException { CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - executor); + testTimer); } @AfterEach @@ -173,6 +177,7 @@ void tearDown() { // channel gets shutdown in channelPool.close() server.shutdownNow(); executor.shutdownNow(); + testTimer.stop(); } @Test @@ -239,7 +244,7 @@ void pendingVRpcDrainTest() throws ExecutionException, InterruptedException, Tim CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - executor); + testTimer); // session ack should be delayed by at least 10ms testSessionPool.start(OpenFakeSessionRequest.getDefaultInstance(), new Metadata()); @@ -295,7 +300,7 @@ void testCreateSessionDoesntPropagateDeadline() { CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - executor); + testTimer); Context.current() .withDeadlineAfter(1, TimeUnit.MINUTES, executor) @@ -314,7 +319,7 @@ void testCreateSessionDoesntPropagateDeadline() { class RetrySessionCreation { private FakeClock fakeClock; - private ScheduledExecutorService mockExecutor; + private BigtableTimer mockTimer; private FakeSessionService fakeService; private ChannelPool channelPool; private SessionPoolImpl sessionPool; @@ -324,7 +329,9 @@ class RetrySessionCreation { @BeforeEach void setUp() throws Exception { fakeClock = new FakeClock(Instant.now()); - mockExecutor = mock(ScheduledExecutorService.class, Mockito.RETURNS_DEEP_STUBS); + // The retry-create-session site uses timer.newTimeout(); we capture the scheduled body on a + // mock timer and run it inline below. + mockTimer = mock(BigtableTimer.class, Mockito.RETURNS_DEEP_STUBS); Duration penalty = Duration.ofMinutes(1); SessionCreationBudget budget = new SessionCreationBudget(10, penalty, fakeClock); @@ -353,7 +360,7 @@ void setUp() throws Exception { CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - mockExecutor, + mockTimer, budget); } @@ -371,7 +378,14 @@ void tearDown() { @Test public void test() throws Exception { ArgumentCaptor runnableCaptor = ArgumentCaptor.forClass(Runnable.class); - ArgumentCaptor delayCaptor = ArgumentCaptor.forClass(Long.class); + // Filter out watchdog (5 min, exact) and AFE prune (10 min, exact) ticks. The + // retry-create-session site computes its delay against the real wall clock and the fake + // budget clock, so it can land anywhere from sub-second to a couple of penalty intervals. + // Match anything that isn't one of the two fixed cadences. + long watchdogMs = java.time.Duration.ofMinutes(5).toMillis(); + long afePruneMs = SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(); + java.util.function.LongPredicate isRetrySchedule = + d -> d > 0 && d != watchdogMs && d != afePruneMs; // start the pool sessionPool.start( @@ -384,12 +398,11 @@ public void test() throws Exception { // The delay should be around budget creation failure penalty. It'll take some time for the // job to exhaust all the creation budget so set a delay before verifying. int waitForReadyMs = 1000; - verify(mockExecutor, Mockito.timeout(waitForReadyMs)) - .schedule(runnableCaptor.capture(), delayCaptor.capture(), eq(TimeUnit.MILLISECONDS)); - assertThat(delayCaptor.getValue()) - .isIn( - Range.openClosed( - penalty.minus(Duration.ofMillis(waitForReadyMs)).toMillis(), penalty.toMillis())); + verify(mockTimer, Mockito.timeout(waitForReadyMs)) + .newTimeout( + runnableCaptor.capture(), + longThat(isRetrySchedule::test), + eq(TimeUnit.MILLISECONDS)); // we should have received some open requests int requestsBefore = fakeService.getOpenRequestCount().get(); @@ -407,13 +420,16 @@ public void test() throws Exception { // Advance the clock so there's more budget to create sessions fakeClock.increment(penalty.plusMillis(1)); - // Run the scheduled task, pool sizer will return a positive scale factor because there's a - // pending vrpc + // Run the scheduled timer body inline. The body executes the retry which schedules another + // timer tick. runnableCaptor.getValue().run(); // The retry task will try to open new sessions. This will fail and schedule another retry. - verify(mockExecutor, Mockito.timeout(1000).times(2)) - .schedule(any(Runnable.class), anyLong(), eq(TimeUnit.MILLISECONDS)); + verify(mockTimer, Mockito.timeout(5000).times(2)) + .newTimeout( + any(Runnable.class), + longThat(isRetrySchedule::test), + eq(TimeUnit.MILLISECONDS)); // the retry will exhaust the budget again. we should see double the request compared to // before diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java index 94bd6fcc5049..3582e66e1971 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java @@ -30,9 +30,7 @@ import io.grpc.Metadata; import java.time.Duration; import java.time.Instant; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import org.junit.jupiter.api.AfterEach; +import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -41,7 +39,7 @@ @ExtendWith(MockitoExtension.class) public class WatchdogTest { private final Duration interval = Duration.ofMinutes(5); - private ScheduledExecutorService executor; + private BigtableTimer timer; private SessionPoolImpl.Watchdog watchdog; private SessionList sessions; private FakeSession fakeSession = new FakeSession(); @@ -51,7 +49,9 @@ public class WatchdogTest { @BeforeEach void setUp() { - executor = Executors.newScheduledThreadPool(4); + // run() is invoked synchronously in tests; the timer is wired in only so the constructor + // signature is satisfied. start() / close() are not exercised here. + timer = new NoOpBigtableTimer(); now = Instant.now(); fakeClock = new FakeClock(now); @@ -59,16 +59,33 @@ void setUp() { watchdog = new Watchdog( new Object(), - executor, + timer, interval, sessions, NoopMetrics.NoopDebugTracer.INSTANCE, fakeClock); } - @AfterEach - void tearDown() { - executor.shutdownNow(); + // A BigtableTimer that drops every newTimeout(). Used because awaitCloseTest drives the watchdog + // by calling run() directly; the scheduling layer is not under test. + private static final class NoOpBigtableTimer implements BigtableTimer { + @Override + public Timeout newTimeout(Runnable task, long delay, TimeUnit unit) { + return new Timeout() { + @Override + public boolean cancel() { + return false; + } + + @Override + public boolean isCancelled() { + return true; + } + }; + } + + @Override + public void stop() {} } @Test From 54c287ea80978f2d75f46f394107862d75bfdfc4 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Fri, 12 Jun 2026 22:45:19 +0000 Subject: [PATCH 03/24] chore: introduce VOperation as the head of the middleware chain Replaces CancellableVRpc with a VOperation layer that sits above the VRpc chain rather than inside it. VOperationImpl owns the gRPC Context cancellation listener and constructs the per-op VRpcCallContext; downstream middleware just sees the chain. --- .../data/v2/internal/api/TableBase.java | 19 +--- .../internal/middleware/CancellableVRpc.java | 74 ------------- .../v2/internal/middleware/VOperation.java | 37 +++++++ .../internal/middleware/VOperationImpl.java | 100 ++++++++++++++++++ 4 files changed, 141 insertions(+), 89 deletions(-) delete mode 100644 java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java create mode 100644 java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java create mode 100644 java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java index af3bf306856f..a11a05c9d4b9 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java @@ -25,9 +25,8 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; -import com.google.cloud.bigtable.data.v2.internal.middleware.CancellableVRpc; +import com.google.cloud.bigtable.data.v2.internal.middleware.VOperationImpl; import com.google.cloud.bigtable.data.v2.internal.middleware.RetryingVRpc; -import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcCallContext; import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolImpl; @@ -112,14 +111,9 @@ public void readRow( SessionReadRowRequest req, VRpcListener listener, Deadline deadline) { RetryingVRpc retry = new RetryingVRpc<>(() -> sessionPool.newCall(readRowDescriptor), timer); - VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), readRowDescriptor, deadline); - VRpcCallContext ctx = VRpcCallContext.create(deadline, true, tracer); - - CancellableVRpc cancellableVRpc = - new CancellableVRpc<>(retry, Context.current()); - cancellableVRpc.start(req, ctx, listener); + new VOperationImpl<>(retry, Context.current(), tracer, deadline, true).start(req, listener); } public void mutateRow( @@ -128,16 +122,11 @@ public void mutateRow( Deadline deadline) { RetryingVRpc retry = new RetryingVRpc<>(() -> sessionPool.newCall(mutateRowDescriptor), timer); - boolean idempotent = Util.isIdempotent(req.getMutationsList()); - VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), mutateRowDescriptor, deadline); - VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer); - - CancellableVRpc cancellable = - new CancellableVRpc<>(retry, Context.current()); - cancellable.start(req, ctx, listener); + new VOperationImpl<>(retry, Context.current(), tracer, deadline, idempotent) + .start(req, listener); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java deleted file mode 100644 index ded284e1979c..000000000000 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2026 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.google.cloud.bigtable.data.v2.internal.middleware; - -import com.google.common.util.concurrent.MoreExecutors; -import io.grpc.Context; -import io.grpc.Deadline; -import java.util.Optional; -import java.util.concurrent.TimeoutException; - -/** - * A {@link VRpc} decorator that propagates gRPC {@link Context} cancellation to the underlying - * VRpc. - */ -public class CancellableVRpc extends ForwardingVRpc { - private final Context context; - private final Context.CancellationListener cancellationListener; - - public CancellableVRpc(VRpc delegate, Context context) { - super(delegate); - this.context = context; - this.cancellationListener = - (c) -> { - boolean deadlineExceeded = - Optional.ofNullable(c.getDeadline()).map(Deadline::isExpired).orElse(false); - deadlineExceeded = deadlineExceeded && c.cancellationCause() instanceof TimeoutException; - // Let VRpc machinery handle deadline exceeded - if (!deadlineExceeded) { - delegate.cancel("gRPC context cancelled", c.cancellationCause()); - } - }; - } - - @Override - public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { - context.addListener(cancellationListener, MoreExecutors.directExecutor()); - super.start( - req, ctx, new CancellationCleanupListener<>(listener, context, cancellationListener)); - } - - private static class CancellationCleanupListener extends ForwardListener { - private final Context context; - private final Context.CancellationListener cancellationListener; - - private CancellationCleanupListener( - VRpcListener delegate, - Context context, - Context.CancellationListener cancellationListener) { - super(delegate); - this.context = context; - this.cancellationListener = cancellationListener; - } - - @Override - public void onClose(VRpcResult result) { - context.removeListener(cancellationListener); - super.onClose(result); - } - } -} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java new file mode 100644 index 000000000000..fcc50904fedd --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java @@ -0,0 +1,37 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; +import javax.annotation.Nullable; + +/** + * Entry point for a single user-facing operation that may translate into one or more {@link VRpc} + * attempts. + * + *

{@code VOperation} sits above the {@code VRpc} chain. It owns the per-op {@link + * VRpc.VRpcCallContext} and the gRPC {@link io.grpc.Context} cancellation listener, so downstream + * middleware only deals with the chain itself. + */ +public interface VOperation { + + /** Start the operation. Results are delivered to {@code listener}. */ + void start(ReqT req, VRpcListener listener); + + /** Cancel a started operation. Best effort. */ + void cancel(@Nullable String message, @Nullable Throwable cause); +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java new file mode 100644 index 000000000000..f00c4376350e --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -0,0 +1,100 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; +import com.google.cloud.bigtable.data.v2.internal.middleware.ForwardingVRpc.ForwardListener; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcCallContext; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcResult; +import com.google.common.util.concurrent.MoreExecutors; +import io.grpc.Context; +import io.grpc.Deadline; +import java.util.Optional; +import java.util.concurrent.TimeoutException; +import javax.annotation.Nullable; + +/** + * The single edge between the user and the VRpc middleware chain. Constructs the per-op {@link + * VRpcCallContext} and owns the gRPC {@link Context} cancellation listener. + * + *

Precondition: {@link #cancel} must not be called before {@link #start}. + */ +public class VOperationImpl implements VOperation { + + private final VRpc chain; + private final Context grpcContext; + private final VRpcTracer tracer; + private final Deadline deadline; + private final boolean idempotent; + private final Context.CancellationListener cancellationListener; + + public VOperationImpl( + VRpc chain, + Context grpcContext, + VRpcTracer tracer, + Deadline deadline, + boolean idempotent) { + this.chain = chain; + this.grpcContext = grpcContext; + this.tracer = tracer; + this.deadline = deadline; + this.idempotent = idempotent; + this.cancellationListener = + (c) -> { + boolean deadlineExceeded = + Optional.ofNullable(c.getDeadline()).map(Deadline::isExpired).orElse(false); + deadlineExceeded = deadlineExceeded && c.cancellationCause() instanceof TimeoutException; + // Let VRpc machinery handle deadline exceeded. + if (!deadlineExceeded) { + cancel("gRPC context cancelled", c.cancellationCause()); + } + }; + } + + @Override + public void start(ReqT req, VRpcListener listener) { + VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer); + grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); + chain.start(req, ctx, new CleanupListener<>(listener, grpcContext, cancellationListener)); + } + + @Override + public void cancel(@Nullable String message, @Nullable Throwable cause) { + chain.cancel(message, cause); + } + + private static class CleanupListener extends ForwardListener { + private final Context grpcContext; + private final Context.CancellationListener cancellationListener; + + CleanupListener( + VRpcListener delegate, + Context grpcContext, + Context.CancellationListener cancellationListener) { + super(delegate); + this.grpcContext = grpcContext; + this.cancellationListener = cancellationListener; + } + + @Override + public void onClose(VRpcResult result) { + grpcContext.removeListener(cancellationListener); + super.onClose(result); + } + } +} From c88862af1c0014db911cd099e1a2fb675f1efb4a Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Fri, 12 Jun 2026 23:02:34 +0000 Subject: [PATCH 04/24] chore: introduce OpExecutor; plumb VRpcCallContext.getExecutor VRpcCallContext.getExecutor() returns OpExecutor (thin wrapper with runningThread affinity tracking). VOperationImpl constructs the per-call SynchronizationContext + OpExecutor; RetryingVRpc drops its own SyncContext and dispatches via ctx.getExecutor(). The uncaught-handler safety net moves from RetryingVRpc up to VOperationImpl. --- .../v2/internal/middleware/OpExecutor.java | 59 +++++++++ .../v2/internal/middleware/RetryingVRpc.java | 112 +++++++++--------- .../internal/middleware/VOperationImpl.java | 9 +- .../data/v2/internal/middleware/VRpc.java | 22 +++- 4 files changed, 140 insertions(+), 62 deletions(-) create mode 100644 java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java new file mode 100644 index 000000000000..91669284549c --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java @@ -0,0 +1,59 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import java.util.concurrent.Executor; + +/** + * Per-op serializing executor. Wraps a delegate {@link Executor} and tracks which thread is + * currently running a task, so callers can assert they are on the executor (analogous to {@code + * SynchronizationContext#throwIfNotInThisSynchronizationContext}). + * + *

Backing executor evolves over the refactor — for now it is the per-call {@link + * io.grpc.SynchronizationContext} that {@link VOperationImpl} constructs. Later commits swap it + * for a {@code SerializingExecutor} over the user-callback pool, and eventually a tailored inline + * queue. + */ +public final class OpExecutor implements Executor { + + private final Executor backing; + private volatile Thread runningThread; + + public OpExecutor(Executor backing) { + this.backing = backing; + } + + @Override + public void execute(Runnable r) { + backing.execute( + () -> { + Thread prev = runningThread; + runningThread = Thread.currentThread(); + try { + r.run(); + } finally { + runningThread = prev; + } + }); + } + + public void throwIfNotInThisExecutor() { + if (Thread.currentThread() != runningThread) { + throw new IllegalStateException("Not running on this op executor"); + } + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java index 0e19fe076d63..6625407dee38 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java @@ -24,7 +24,6 @@ import com.google.rpc.RetryInfo; import io.grpc.Context; import io.grpc.Status; -import io.grpc.SynchronizationContext; import java.util.Optional; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.TimeUnit; @@ -47,13 +46,12 @@ public class RetryingVRpc implements VRpc { private VRpcTracer tracer; private final BigtableTimer timer; - private final SynchronizationContext syncContext; // current state and all the flags don't need to be volatile because they're only updated within - // the sync context. + // the op executor. private State currentState; private boolean started; - // Breaks the loop if uncaught exception happens during sync context execution. + // Breaks the loop if uncaught exception happens during op-executor execution. private boolean isCancelling; public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { @@ -63,14 +61,6 @@ public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { otelContext = io.opentelemetry.context.Context.current(); this.timer = timer; - this.syncContext = - new SynchronizationContext( - (t, e) -> { - this.cancel( - "Unexpected error while notifying the caller of RetryingVRpc. Trying to cancel" - + " vRpc to ensure consistent state", - e); - }); started = false; isCancelling = false; @@ -79,51 +69,55 @@ public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { @Override public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { - syncContext.execute( - () -> { - if (started) { - listener.onClose( - VRpcResult.createRejectedError( - Status.FAILED_PRECONDITION.withDescription("operation is already started"))); - return; - } - started = true; - - this.request = req; - this.listener = listener; - this.context = ctx; - this.tracer = context.getTracer(); - - tracer.onOperationStart(); - currentState.onStart(); - }); + ctx.getExecutor() + .execute( + () -> { + if (started) { + listener.onClose( + VRpcResult.createRejectedError( + Status.FAILED_PRECONDITION.withDescription( + "operation is already started"))); + return; + } + started = true; + + this.request = req; + this.listener = listener; + this.context = ctx; + this.tracer = context.getTracer(); + + tracer.onOperationStart(); + currentState.onStart(); + }); } @Override public void cancel(@Nullable String message, @Nullable Throwable cause) { - syncContext.execute( - () -> { - if (currentState.isDone() || isCancelling) { - LOG.fine("Ignoring cancel because the vRPC is already cancelled or done."); - return; - } - // Prevents infinite loop if there's any error thrown during this phase. - isCancelling = true; - Throwable finalCause = cause; - try { - currentState.onCancel(message, cause); - } catch (Throwable t) { - if (finalCause != null) { - finalCause.addSuppressed(t); - } else { - finalCause = t; - } - } - onStateChange( - new Done( - VRpcResult.createRejectedError( - Status.CANCELLED.withDescription(message).withCause(finalCause)))); - }); + context + .getExecutor() + .execute( + () -> { + if (currentState.isDone() || isCancelling) { + LOG.fine("Ignoring cancel because the vRPC is already cancelled or done."); + return; + } + // Prevents infinite loop if there's any error thrown during this phase. + isCancelling = true; + Throwable finalCause = cause; + try { + currentState.onCancel(message, cause); + } catch (Throwable t) { + if (finalCause != null) { + finalCause.addSuppressed(t); + } else { + finalCause = t; + } + } + onStateChange( + new Done( + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription(message).withCause(finalCause)))); + }); } @Override @@ -132,7 +126,7 @@ public void requestNext() { } void onStateChange(State state) { - syncContext.throwIfNotInThisSynchronizationContext(); + context.getExecutor().throwIfNotInThisExecutor(); if (currentState.isDone()) { return; } @@ -177,7 +171,7 @@ public void onStart() { new VRpcListener() { @Override public void onMessage(RespT msg) { - syncContext.execute( + context.getExecutor().execute( () -> { if (currentState != Active.this) { LOG.log( @@ -198,7 +192,7 @@ public void onMessage(RespT msg) { @Override public void onClose(VRpcResult result) { - syncContext.execute( + context.getExecutor().execute( () -> { tracer.onAttemptFinish(result); if (currentState != Active.this) { @@ -282,12 +276,12 @@ public void onStart() { try { // Wraps go innermost so the captured gRPC + OpenTelemetry contexts are re-established at // the moment the body runs, not just while the dispatcher is invoking the outer task. - // syncContext.execute may queue the inner runnable for a later drain on a different - // thread; an outer wrap's scope would already be closed by then. + // The executor may queue the inner runnable for a later drain on a different thread; an + // outer wrap's scope would already be closed by then. future = timer.newTimeout( () -> - syncContext.execute( + context.getExecutor().execute( () -> grpcContext.wrap( () -> diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java index f00c4376350e..4d8b7427d144 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -24,6 +24,7 @@ import com.google.common.util.concurrent.MoreExecutors; import io.grpc.Context; import io.grpc.Deadline; +import io.grpc.SynchronizationContext; import java.util.Optional; import java.util.concurrent.TimeoutException; import javax.annotation.Nullable; @@ -68,7 +69,13 @@ public VOperationImpl( @Override public void start(ReqT req, VRpcListener listener) { - VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer); + // Per-call SynchronizationContext serializes all middleware below this layer. Uncaught task + // failures drive the chain to a terminal state so the caller's listener still gets onClose; + // RetryingVRpc.cancel is idempotent so the resulting cascade collapses safely. + SynchronizationContext syncContext = + new SynchronizationContext((t, e) -> chain.cancel("Uncaught exception in op executor", e)); + OpExecutor exec = new OpExecutor(syncContext); + VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer, exec); grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); chain.start(req, ctx, new CleanupListener<>(listener, grpcContext, cancellationListener)); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java index a29a51fd72c6..d17f9b6eee7f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java @@ -24,6 +24,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Ticker; import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.MoreExecutors; import com.google.protobuf.Any; import com.google.rpc.RetryInfo; import io.grpc.Context; @@ -92,12 +93,29 @@ abstract class VRpcCallContext { public abstract VRpcTracer getTracer(); + /** + * The per-op executor that serializes all middleware below {@link VOperationImpl}. Owned by + * {@code VOperationImpl}; downstream layers use it via {@code ctx.getExecutor()}. + */ + public abstract OpExecutor getExecutor(); + // TODO: csm // Clientside metrics instrument // public abstract BigtableTracer getTracer(); + /** + * Defaults the executor to one over {@link MoreExecutors#directExecutor()}. Suitable for tests + * that do not exercise the per-op serialization; production callers go through {@link + * VOperationImpl} which supplies a real serializing executor. + */ public static VRpcCallContext create( Deadline deadline, boolean isIdempotent, VRpcTracer tracer) { + return create( + deadline, isIdempotent, tracer, new OpExecutor(MoreExecutors.directExecutor())); + } + + public static VRpcCallContext create( + Deadline deadline, boolean isIdempotent, VRpcTracer tracer, OpExecutor executor) { Deadline grpcContextDeadline = Context.current().getDeadline(); @@ -114,12 +132,12 @@ public static VRpcCallContext create( } return new AutoValue_VRpc_VRpcCallContext( - OperationInfo.create(operationTimeout, isIdempotent), "TODO", tracer); + OperationInfo.create(operationTimeout, isIdempotent), "TODO", tracer, executor); } public VRpcCallContext createForNextAttempt() { return new AutoValue_VRpc_VRpcCallContext( - getOperationInfo().createForNextAttempt(), getTraceParent(), getTracer()); + getOperationInfo().createForNextAttempt(), getTraceParent(), getTracer(), getExecutor()); } } From 8f997781354bb3b82d3ff48be1715ece629b1a8b Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sat, 13 Jun 2026 14:32:02 +0000 Subject: [PATCH 05/24] chore: move callback dispatch from RetryingVRpc to VRpcImpl VRpcImpl.handle*() methods now dispatch listener callbacks via ctx.getExecutor(), with CAS STARTED->CLOSED in all three (handleError no longer proceeds from NEW) and decode moved into the executor task. RetryingVRpc.Active drops its own wrap since callbacks already arrive on the op executor. start() publishes ctx/listener only after winning the CAS so a racing duplicate can't corrupt the winner's fields. SessionPoolImpl's three direct listener.onClose paths also dispatch via ctx.getExecutor(). --- .../v2/internal/middleware/RetryingVRpc.java | 85 ++++++----- .../v2/internal/session/SessionPoolImpl.java | 21 ++- .../data/v2/internal/session/VRpcImpl.java | 135 +++++++++--------- 3 files changed, 125 insertions(+), 116 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java index 6625407dee38..ffaafc4e180f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java @@ -169,57 +169,52 @@ public void onStart() { request, context, new VRpcListener() { + // VRpcImpl dispatches its callbacks via ctx.getExecutor() already, so these methods + // run inside the op-executor task — no need to re-dispatch here. @Override public void onMessage(RespT msg) { - context.getExecutor().execute( - () -> { - if (currentState != Active.this) { - LOG.log( - Level.FINE, - "Discarding response {0} because the attempt is no longer active.", - msg); - return; - } - tracer.onResponseReceived(); - Stopwatch appTimer = Stopwatch.createStarted(); - try { - listener.onMessage(msg); - } finally { - tracer.recordApplicationBlockingLatencies(appTimer.elapsed()); - } - }); + if (currentState != Active.this) { + LOG.log( + Level.FINE, + "Discarding response {0} because the attempt is no longer active.", + msg); + return; + } + tracer.onResponseReceived(); + Stopwatch appTimer = Stopwatch.createStarted(); + try { + listener.onMessage(msg); + } finally { + tracer.recordApplicationBlockingLatencies(appTimer.elapsed()); + } } @Override public void onClose(VRpcResult result) { - context.getExecutor().execute( - () -> { - tracer.onAttemptFinish(result); - if (currentState != Active.this) { - LOG.log( - Level.FINE, - "Discarding server close with result {0} because the the attempt is no" - + " longer active.", - result); - return; - } - if (shouldRetry(result)) { - context = context.createForNextAttempt(); - Duration retryDelay = - Optional.ofNullable(result.getRetryInfo()) - .map(RetryInfo::getRetryDelay) - .orElse(Durations.ZERO); - if (Durations.compare(retryDelay, Durations.ZERO) > 0) { - Scheduled scheduled = new Scheduled(retryDelay); - onStateChange(scheduled); - } else { - onStateChange(new Idle()); - } - return; - } - - onStateChange(new Done(result)); - }); + tracer.onAttemptFinish(result); + if (currentState != Active.this) { + LOG.log( + Level.FINE, + "Discarding server close with result {0} because the the attempt is no" + + " longer active.", + result); + return; + } + if (shouldRetry(result)) { + context = context.createForNextAttempt(); + Duration retryDelay = + Optional.ofNullable(result.getRetryInfo()) + .map(RetryInfo::getRetryDelay) + .orElse(Durations.ZERO); + if (Durations.compare(retryDelay, Durations.ZERO) > 0) { + Scheduled scheduled = new Scheduled(retryDelay); + onStateChange(scheduled); + } else { + onStateChange(new Idle()); + } + return; + } + onStateChange(new Done(result)); } }); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index eaf2289097d7..67c2ac505358 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -542,9 +542,18 @@ private void onSessionClose( status, trailers))); for (PendingVRpc vrpc : toBeClosed) { try { - vrpc.getListener().onClose(result); + vrpc.ctx + .getExecutor() + .execute( + () -> { + try { + vrpc.getListener().onClose(result); + } catch (Throwable t) { + logger.log(Level.WARNING, "Exception when closing request", t); + } + }); } catch (Throwable t) { - logger.log(Level.WARNING, "Exception when closing request", t); + logger.log(Level.WARNING, "Exception dispatching close to op executor", t); } } } @@ -662,10 +671,11 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { synchronized (SessionPoolImpl.this) { if (SessionPoolImpl.this.poolState != PoolState.STARTED) { - listener.onClose( + VRpcResult result = VRpcResult.createUncommitedError( Status.UNAVAILABLE.withCause( - new IllegalStateException("SessionPool is closed")))); + new IllegalStateException("SessionPool is closed"))); + ctx.getExecutor().execute(() -> listener.onClose(result)); return; } pendingRpcs.add(this); @@ -724,7 +734,8 @@ private void cancel(Status status, boolean onlyCancelPendingCall) { if (delegateToRealCall) { realCall.cancel(status.getDescription(), status.getCause()); } else { - listener.onClose(VRpcResult.createRejectedError(status)); + VRpcResult result = VRpcResult.createRejectedError(status); + ctx.getExecutor().execute(() -> listener.onClose(result)); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java index a2d841bd1a68..95b83b8b4cb6 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java @@ -69,6 +69,7 @@ private enum State { private final VRpcDescriptor desc; final long rpcId; private VRpcListener listener; + private VRpcCallContext ctx; private PeerInfo peerInfo; private AtomicReference state; @@ -91,54 +92,55 @@ public VRpcImpl( @Override public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { + if (!state.compareAndSet(State.NEW, State.STARTED)) { + // Lost the CAS — a duplicate start. Dispatch to the local listener/ctx without touching + // the shared fields, otherwise we'd corrupt the in-flight call owned by the CAS winner. + VRpcResult result = + VRpcResult.createRejectedError( + Status.INTERNAL.withDescription("VRpc already started in state: " + state.get())); + ctx.getExecutor().execute(() -> listener.onClose(result)); + return; + } + // Won the CAS — publish the fields. this.listener = listener; + this.ctx = ctx; - Status status; - boolean retryable = true; - - if (!state.compareAndSet(State.NEW, State.STARTED)) { - status = Status.INTERNAL.withDescription("VRpc already started in state: " + state.get()); - retryable = false; - } else if (ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.MICROSECONDS) + if (ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.MICROSECONDS) < TimeUnit.MILLISECONDS.toMicros(1)) { - // Don't send RPCs that don't have any hope of succeeding - status = - Status.DEADLINE_EXCEEDED.withDescription("Remaining deadline is too short to send RPC"); - retryable = false; - } else { - Metadata vRpcMetadata = - Metadata.newBuilder() - .setAttemptNumber(ctx.getOperationInfo().getAttemptNumber()) - .setTraceparent(ctx.getTraceParent()) - .build(); - ctx.getTracer().onRequestSent(peerInfo); - status = - session.startRpc( - this, - VirtualRpcRequest.newBuilder() - .setRpcId(rpcId) - .setMetadata(vRpcMetadata) - .setDeadline( - Durations.fromNanos( - ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.NANOSECONDS))) - .setPayload(desc.encode(req)) - .build()); - // if status is not OK, the session might not be ready and the vRPC can be retried on a - // different session + state.set(State.CLOSED); + VRpcResult result = + VRpcResult.createRejectedError( + Status.DEADLINE_EXCEEDED.withDescription( + "Remaining deadline is too short to send RPC")); + ctx.getExecutor().execute(() -> listener.onClose(result)); + return; } + Metadata vRpcMetadata = + Metadata.newBuilder() + .setAttemptNumber(ctx.getOperationInfo().getAttemptNumber()) + .setTraceparent(ctx.getTraceParent()) + .build(); + ctx.getTracer().onRequestSent(peerInfo); + Status status = + session.startRpc( + this, + VirtualRpcRequest.newBuilder() + .setRpcId(rpcId) + .setMetadata(vRpcMetadata) + .setDeadline( + Durations.fromNanos( + ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.NANOSECONDS))) + .setPayload(desc.encode(req)) + .build()); if (!status.isOk()) { debugTagTracer.checkPrecondition( state.compareAndSet(State.STARTED, State.CLOSED), "vrpc_incorrect_start_state", "VRpc has incorrect state. Expected to be started but was %s", state); - // TODO: loop through the session executor - if (retryable) { - listener.onClose(VRpcResult.createUncommitedError(status)); - } else { - listener.onClose(VRpcResult.createRejectedError(status)); - } + VRpcResult result = VRpcResult.createUncommitedError(status); + ctx.getExecutor().execute(() -> listener.onClose(result)); } } @@ -147,8 +149,7 @@ void handleSessionClose(VRpcResult result) { logger.warning("tried to close a vRPC after it was already closed state: " + state.get()); return; } - - listener.onClose(result); + ctx.getExecutor().execute(() -> listener.onClose(result)); } void handleResponse(VirtualRpcResponse response) { @@ -159,38 +160,40 @@ void handleResponse(VirtualRpcResponse response) { } // TODO: handle streaming - RespT resp; - try { - resp = desc.decode(response.getPayload()); - } catch (Throwable e) { - // TODO: notify Session to cancel the vRPC - // Right now, vrpc streaming & cancellation is not supported, so notifying SessionImpl is - // unnecessary. In the future handleResponse will need to notify that Session that the user - // was already notified of the error and no further notifications should be delivered - VRpcResult result = - VRpcResult.createLocalTransportError( - Status.INTERNAL.withDescription("Failed to decode VRpc payload").withCause(e)); - listener.onClose(result); - return; - } - - try { - listener.onMessage(resp); - } catch (Throwable e) { - VRpcResult result = VRpcResult.createUserError(e); - listener.onClose(result); - return; - } - - listener.onClose(VRpcResult.createServerOk(response)); + // Decode + callback fan-out all run on the op executor: keeps the (potentially heavy) decode + // off the session sync context, and gives every callback a single dispatcher. + ctx.getExecutor() + .execute( + () -> { + RespT resp; + try { + resp = desc.decode(response.getPayload()); + } catch (Throwable e) { + // TODO: notify Session to cancel the vRPC + listener.onClose( + VRpcResult.createLocalTransportError( + Status.INTERNAL + .withDescription("Failed to decode VRpc payload") + .withCause(e))); + return; + } + try { + listener.onMessage(resp); + } catch (Throwable e) { + listener.onClose(VRpcResult.createUserError(e)); + return; + } + listener.onClose(VRpcResult.createServerOk(response)); + }); } void handleError(VRpcResult result) { - if (state.getAndSet(State.CLOSED) == State.CLOSED) { + // CAS STARTED -> CLOSED, matching handleResponse / handleSessionClose. The previous + // getAndSet(CLOSED) would proceed from NEW and dereference null ctx/listener fields. + if (!state.compareAndSet(State.STARTED, State.CLOSED)) { return; } - - listener.onClose(result); + ctx.getExecutor().execute(() -> listener.onClose(result)); } @Override From 14dde0b7b793b5996b8c4d8abcadf491546b3c6c Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sun, 14 Jun 2026 03:00:11 +0000 Subject: [PATCH 06/24] chore: add session SynchronizationContext alongside the existing lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SessionImpl gains sessionSyncContext that serializes stream callbacks. onMessage/onClose dispatch onto it, and the per-session heartbeat tick trampolines through it. synchronized(lock) blocks remain inside the handlers — the two coexist for now. Affinity asserts added at boundary methods and every handle*. --- .../data/v2/internal/session/SessionImpl.java | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 832c08943913..ca855bf06a7b 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -44,6 +44,7 @@ import com.google.protobuf.util.Durations; import io.grpc.Metadata; import io.grpc.Status; +import io.grpc.SynchronizationContext; import java.time.Clock; import java.time.Duration; import java.time.Instant; @@ -87,6 +88,9 @@ public class SessionImpl implements Session, VRpcSessionApi { private final Clock clock; private final BigtableTimer timer; + // Serializes all session-stream callbacks. Coexists with `lock` for now; `lock` will be removed + // in a follow-up commit once every handler is on the syncContext. + private final SynchronizationContext sessionSyncContext; private final SessionTracer tracer; private final DebugTagTracer debugTagTracer; @@ -177,6 +181,14 @@ public SessionImpl( this.debugTagTracer = metrics.getDebugTagTracer(); this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); this.openParamsUpdated = false; + this.sessionSyncContext = + new SynchronizationContext( + (thread, e) -> + logger.log( + Level.WARNING, + String.format( + "Uncaught exception in session SyncContext for %s", info.getLogName()), + e)); } @Override @@ -271,12 +283,12 @@ public void onBeforeSessionStart(PeerInfo peerInfo) {} @Override public void onMessage(SessionResponse message) { - dispatchResponseMessage(message); + sessionSyncContext.execute(() -> dispatchResponseMessage(message)); } @Override public void onClose(Status status, Metadata trailers) { - dispatchStreamClosed(status, trailers); + sessionSyncContext.execute(() -> dispatchStreamClosed(status, trailers)); } }, headers); @@ -408,7 +420,7 @@ public void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable private void scheduleHeartbeatCheck() { heartbeatTimeout = timer.newTimeout( - this::checkHeartbeat, + () -> sessionSyncContext.execute(this::checkHeartbeat), HEARTBEAT_CHECK_INTERVAL.toMillis(), TimeUnit.MILLISECONDS); } @@ -421,31 +433,28 @@ private void cancelHeartbeatTimeout() { } } - // Runs on the wheel-timer tick thread. Takes the per-session lock to read state/nextHeartbeat - // and force-close on miss, then chains the next tick by re-scheduling. If the session is past - // WAIT_SERVER_CLOSE we drop the chain — no further checks are useful. + // Runs on sessionSyncContext (dispatched from the wheel-timer tick body). Checks if the + // heartbeat deadline has passed and force-closes on miss; otherwise re-schedules. private void checkHeartbeat() { - CloseSessionRequest missed = null; + sessionSyncContext.throwIfNotInThisSynchronizationContext(); synchronized (lock) { if (state.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { return; } if (clock.instant().isAfter(nextHeartbeat)) { - missed = MISSED_HEARTBEAT_CLOSE_REQUEST; - } else { - scheduleHeartbeatCheck(); + logger.warning( + String.format("Missed heartbeat for %s, forcing session close", info.getLogName())); + // forceClose acquires the lock again; safe because Java monitors are re-entrant. + forceClose(MISSED_HEARTBEAT_CLOSE_REQUEST); + return; } - } - if (missed != null) { - logger.warning( - String.format("Missed heartbeat for %s, forcing session close", info.getLogName())); - // forceClose acquires the lock again and performs its own state checks. - forceClose(missed); + scheduleHeartbeatCheck(); } } // region SessionStream event handlers private void dispatchResponseMessage(SessionResponse message) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); switch (message.getPayloadCase()) { case OPEN_SESSION: handleOpenSessionResponse(message.getOpenSession()); @@ -475,6 +484,7 @@ private void dispatchResponseMessage(SessionResponse message) { } private void handleOpenSessionResponse(OpenSessionResponse openSession) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); logger.fine(String.format("%s Session is ready", info.getLogName())); PeerInfo localPeerInfo; @@ -502,6 +512,7 @@ private void handleOpenSessionResponse(OpenSessionResponse openSession) { } private void handleSessionParamsResponse(SessionParametersResponse resp) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); synchronized (lock) { if (state.phase >= SessionState.CLOSING.phase) { logger.fine( @@ -525,6 +536,7 @@ private void handleSessionParamsResponse(SessionParametersResponse resp) { } private void handleVRpcResponse(VirtualRpcResponse vrpc) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); // TODO: when stream is supported this should be updated to the next expected time instead of // session life time this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); @@ -585,10 +597,12 @@ private void handleVRpcResponse(VirtualRpcResponse vrpc) { } private void handleHeartBeatResponse(HeartbeatResponse ignored) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); this.nextHeartbeat = clock.instant().plus(heartbeatInterval); } private void handleSessionRefreshConfigResponse(SessionRefreshConfig config) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); synchronized (lock) { Metadata grpcMetadata = new Metadata(); config @@ -604,6 +618,7 @@ private void handleSessionRefreshConfigResponse(SessionRefreshConfig config) { } private void handleVRpcErrorResponse(ErrorResponse error) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); // Skips the heartbeat check when there's no active vrpc on the session this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); @@ -663,6 +678,7 @@ private void handleVRpcErrorResponse(ErrorResponse error) { } private void handleGoAwayResponse(GoAwayResponse goAwayResponse) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); synchronized (lock) { if (state.phase >= SessionState.CLOSING.phase) { debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_go_away_ignored"); @@ -699,6 +715,7 @@ private void handleUnknownResponseMessage(SessionResponse message) { } private void dispatchStreamClosed(Status status, Metadata trailers) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); SessionState prevState; VRpcImpl localVRpc; From dd0ab5897c8528588fffff38a55e203d9b8a3ab3 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sun, 14 Jun 2026 03:17:23 +0000 Subject: [PATCH 07/24] chore: make startRpc and cancelRpc async via sessionSyncContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SessionImpl.startRpc and cancelRpc now submit to sessionSyncContext rather than running synchronously on the caller. VRpcSessionApi.startRpc is void — errors flow through rpc.handleError() onto ctx.getExecutor(). VRpcImpl drops its synchronous post-startRpc error branch. --- .../data/v2/internal/session/SessionImpl.java | 98 ++++++++++--------- .../data/v2/internal/session/VRpcImpl.java | 37 +++---- 2 files changed, 68 insertions(+), 67 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index ca855bf06a7b..04259fa6ac4b 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -382,38 +382,45 @@ VRpc newCall(VRpcDescriptor descriptor) { } @Override - public Status startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { - synchronized (lock) { - if (currentRpc != null) { - return Status.INTERNAL.withDescription( - "Session error: RPC multiplexing is not yet supported"); - } - if (state != SessionState.READY) { - return Status.INTERNAL.withDescription( - "Session error: Session was not ready, state = " + state); - } + public void startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { + sessionSyncContext.execute( + () -> { + synchronized (lock) { + if (currentRpc != null) { + rpc.handleError( + VRpcResult.createUncommitedError( + Status.INTERNAL.withDescription( + "Session error: RPC multiplexing is not yet supported"))); + return; + } + if (state != SessionState.READY) { + rpc.handleError( + VRpcResult.createUncommitedError( + Status.INTERNAL.withDescription( + "Session error: Session was not ready, state = " + state))); + return; + } - this.currentRpc = rpc; - stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); - // Start monitoring for heartbeat when the vRPC is started. heartbeatInterval is read - // inside the lock to avoid a race with handleSessionParamsResponse(). nextHeartbeat is - // volatile and written here without an atomicity guarantee — that is intentional; it is - // only a scheduling hint (see field comment). - this.nextHeartbeat = clock.instant().plus(heartbeatInterval); - return Status.OK; - } + this.currentRpc = rpc; + stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); + this.nextHeartbeat = clock.instant().plus(heartbeatInterval); + } + }); } @Override public void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable cause) { - synchronized (lock) { - if (currentRpc != null && rpcId == currentRpc.rpcId) { - currentCancel = - VRpcResult.createRejectedError( - Status.CANCELLED.withDescription(message).withCause(cause)); - } - // do nothing if the rpc is already finished - } + sessionSyncContext.execute( + () -> { + synchronized (lock) { + if (currentRpc != null && rpcId == currentRpc.rpcId) { + currentCancel = + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription(message).withCause(cause)); + } + // do nothing if the rpc is already finished + } + }); } @GuardedBy("lock") @@ -540,9 +547,8 @@ private void handleVRpcResponse(VirtualRpcResponse vrpc) { // TODO: when stream is supported this should be updated to the next expected time instead of // session life time this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); - VRpcImpl localRpc; - VRpcResult localCancel; - + VRpcImpl rpc; + VRpcResult cancel; boolean needsClose; synchronized (lock) { @@ -572,20 +578,20 @@ private void handleVRpcResponse(VirtualRpcResponse vrpc) { vrpc.getRpcId()); // reset state of the current rpc - localCancel = currentCancel; - currentCancel = null; - localRpc = currentRpc; + rpc = currentRpc; + cancel = currentCancel; // TODO: handle multiplexing currentRpc = null; + currentCancel = null; needsClose = (state == SessionState.CLOSING); } - if (localCancel != null) { - tracer.onVRpcClose(localCancel.getStatus().getCode()); - localRpc.handleError(localCancel); + if (cancel != null) { + tracer.onVRpcClose(cancel.getStatus().getCode()); + rpc.handleError(cancel); } else { tracer.onVRpcClose(Status.OK.getCode()); - localRpc.handleResponse(vrpc); + rpc.handleResponse(vrpc); } if (needsClose) { synchronized (lock) { @@ -622,9 +628,9 @@ private void handleVRpcErrorResponse(ErrorResponse error) { // Skips the heartbeat check when there's no active vrpc on the session this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); - VRpcImpl localRpc; + VRpcImpl rpc; + VRpcResult cancel; boolean needsClose; - VRpcResult localCancel; synchronized (lock) { if (state.phase > SessionState.CLOSING.phase) { @@ -654,19 +660,19 @@ private void handleVRpcErrorResponse(ErrorResponse error) { error.getRpcId()); // reset the state of the current rpc - localCancel = currentCancel; - currentCancel = null; - localRpc = currentRpc; + rpc = currentRpc; + cancel = currentCancel; currentRpc = null; + currentCancel = null; needsClose = (state == SessionState.CLOSING); } - if (localCancel != null) { - tracer.onVRpcClose(localCancel.getStatus().getCode()); - localRpc.handleError(localCancel); + if (cancel != null) { + tracer.onVRpcClose(cancel.getStatus().getCode()); + rpc.handleError(cancel); } else { tracer.onVRpcClose(Status.fromCodeValue(error.getStatus().getCode()).getCode()); - localRpc.handleError(VRpcResult.createServerError(error)); + rpc.handleError(VRpcResult.createServerError(error)); } if (needsClose) { synchronized (lock) { diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java index 95b83b8b4cb6..83228fd43456 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java @@ -54,7 +54,11 @@ class VRpcImpl rpc, VirtualRpcRequest payload); + /** + * Submit the vRPC for sending. Async: errors are delivered via {@link + * VRpcImpl#handleError(VRpcResult)}, which dispatches onto {@code ctx.getExecutor()}. + */ + void startRpc(VRpcImpl rpc, VirtualRpcRequest payload); void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable cause); } @@ -122,26 +126,17 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { .setTraceparent(ctx.getTraceParent()) .build(); ctx.getTracer().onRequestSent(peerInfo); - Status status = - session.startRpc( - this, - VirtualRpcRequest.newBuilder() - .setRpcId(rpcId) - .setMetadata(vRpcMetadata) - .setDeadline( - Durations.fromNanos( - ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.NANOSECONDS))) - .setPayload(desc.encode(req)) - .build()); - if (!status.isOk()) { - debugTagTracer.checkPrecondition( - state.compareAndSet(State.STARTED, State.CLOSED), - "vrpc_incorrect_start_state", - "VRpc has incorrect state. Expected to be started but was %s", - state); - VRpcResult result = VRpcResult.createUncommitedError(status); - ctx.getExecutor().execute(() -> listener.onClose(result)); - } + session.startRpc( + this, + VirtualRpcRequest.newBuilder() + .setRpcId(rpcId) + .setMetadata(vRpcMetadata) + .setDeadline( + Durations.fromNanos( + ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.NANOSECONDS))) + .setPayload(desc.encode(req)) + .build()); + // Session delivers startRpc errors asynchronously via handleError() on ctx.getExecutor(). } void handleSessionClose(VRpcResult result) { From f5a0130e8953127bbca43f2d3bb26a4f6c248bc4 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Thu, 28 May 2026 01:37:54 +0000 Subject: [PATCH 08/24] chore: remove synchronized(lock) from SessionImpl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All session state mutations now run on sessionSyncContext, so the per-session Object lock is no longer needed. Public methods (start, close, forceClose, startRpc, cancelRpc) submit onto sessionSyncContext; nextRpcId becomes AtomicLong for the cross-thread newCall() caller. handleVRpcResponse and handleVRpcErrorResponse drop the localCancel/localRpc capture-and-recheck dance — sessionSyncContext serializes them now. Stale lock-era comments on the fields are replaced with a sessionSyncContext-ownership note. SessionImplTest.testHeartbeat polls for the now-async nextHeartbeat update. --- .../data/v2/internal/session/SessionImpl.java | 668 ++++++++---------- .../v2/internal/session/SessionImplTest.java | 10 +- 2 files changed, 298 insertions(+), 380 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 04259fa6ac4b..45868583274b 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -51,10 +51,10 @@ import java.util.Locale; import java.util.Optional; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; -import javax.annotation.concurrent.GuardedBy; /** Wraps a Bidi ClientCall and layers session semantics on top. */ @VisibleForTesting @@ -78,18 +78,10 @@ public class SessionImpl implements Session, VRpcSessionApi { .setDescription("missed heartbeat") .build(); - /* - * This lock should be mostly uncontended - all access should be naturally interleaved. Contention - * can only really happen when an unsolicited gRPC control message (ie GOAWAY) arrives at the same - * time as newCall or cancel. - * TODO: Contention will increase when multiplexing is implemented. - */ - private final Object lock = new Object(); - private final Clock clock; private final BigtableTimer timer; - // Serializes all session-stream callbacks. Coexists with `lock` for now; `lock` will be removed - // in a follow-up commit once every handler is on the syncContext. + // Serializes all session state mutations. Stream callbacks and the heartbeat tick dispatch + // onto it; every handler below runs inside a syncContext task. private final SynchronizationContext sessionSyncContext; private final SessionTracer tracer; @@ -97,63 +89,46 @@ public class SessionImpl implements Session, VRpcSessionApi { private final SessionInfo info; - @GuardedBy("lock") private final SessionStream stream; - @GuardedBy("lock") - private SessionState state = SessionState.NEW; + private volatile SessionState state = SessionState.NEW; - @GuardedBy("lock") - private Instant lastStateChangedAt; + private volatile Instant lastStateChangedAt; - // Set once under lock in start(), then read freely from gRPC callbacks without the lock. - // Safe because start() is always called before any callback fires, so the write is - // visible to all subsequent readers through the happens-before chain from stream.start(). + // All fields below are owned by sessionSyncContext: every writer runs inside a + // sessionSyncContext task, and the in-class readers do too (handlers, scheduled heartbeat + // tick). They lost their volatile / lock guards when synchronized(lock) was removed; + // SyncContext serialization is what makes plain reads/writes safe. + // + // External callers of getOpenParams / isOpenParamsUpdated / getNextHeartbeat must therefore + // either run on sessionSyncContext themselves (e.g. via a Session.Listener callback, which is + // dispatched from inside the context) or accept a possibly-stale snapshot. There are no + // off-context production readers today. private Listener sessionListener; - // volatile: written under lock in handleSessionRefreshConfigResponse(); read without lock in - // getOpenParams() and isOpenParamsUpdated() so callers get a consistent (if possibly stale) - // snapshot without contending on the lock. Stale reads are acceptable for these accessors. - private volatile OpenParams openParams; + private OpenParams openParams; - private volatile boolean openParamsUpdated; + private boolean openParamsUpdated; - // closeReason is written under lock in close(), forceClose(), handleGoAwayResponse(), and - // dispatchStreamClosed(). The one read that occurs outside the lock — in dispatchStreamClosed - // after the synchronized block — runs on the same gRPC callback thread that just released the - // lock, so the lock's release-acquire edge provides the necessary visibility. A stale read is - // structurally impossible given the control flow (closeReason is always set before the lock - // is released on every path that reaches that read site). @Nullable private CloseSessionRequest closeReason = null; - @GuardedBy("lock") - private long nextRpcId = 1; + private final AtomicLong nextRpcId = new AtomicLong(1); // TODO: replace with a map when implementing multiplexing - @GuardedBy("lock") private VRpcImpl currentRpc = null; - @GuardedBy("lock") private VRpcResult currentCancel = null; - @GuardedBy("lock") private SessionParametersResponse sessionParameters = DEFAULT_SESSION_PARAMS; - // volatile: written under lock in handleSessionParamsResponse(); read without lock in - // handleHeartBeatResponse() where a stale read is acceptable — the heartbeat deadline is a - // soft scheduling hint, not a correctness invariant. startRpc() reads this inside the lock. - private volatile Duration heartbeatInterval = + private Duration heartbeatInterval = Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); - // volatile: written from multiple sites without holding the lock (startRpc, handleVRpc*, - // handleHeartBeatResponse). Stale reads are acceptable — nextHeartbeat is used only as a - // scheduling hint by the per-session heartbeat tick. - private volatile Instant nextHeartbeat; + private Instant nextHeartbeat; // Handle for the in-flight heartbeat tick (one outstanding at a time). Set under lock when the // session enters READY (handleOpenSessionResponse) and again from checkHeartbeat to chain the // next tick. Cancelled under lock from updateState when the session transitions past READY. - @GuardedBy("lock") @Nullable private BigtableTimer.Timeout heartbeatTimeout; @@ -193,16 +168,12 @@ public SessionImpl( @Override public SessionState getState() { - synchronized (lock) { - return state; - } + return state; } @Override public Instant getLastStateChange() { - synchronized (lock) { - return lastStateChangedAt; - } + return lastStateChangedAt; } @Override @@ -222,13 +193,7 @@ public Instant getNextHeartbeat() { @Override public PeerInfo getPeerInfo() { - // This lock might not be necessary, its populated once on a gRPC callback which should - // establish a happens before relationship. However access to the underlying stream is guarded - // with errorprone, so sync block is required to get around the lint. - // TODO: consider removing the sync block - synchronized (lock) { - return stream.getPeerInfo(); - } + return stream.getPeerInfo(); } @Override @@ -238,98 +203,101 @@ public String getLogName() { @Override public void forceClose(CloseSessionRequest closeReason) { - synchronized (lock) { - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_force_close_wrong_state", - "Tried to forceClose an unstarted session %s in state %s", - info.getLogName(), - state); - - if (state == SessionState.CLOSED) { - return; - } + sessionSyncContext.execute( + () -> { + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_force_close_wrong_state", + "Tried to forceClose an unstarted session %s in state %s", + info.getLogName(), + state); + + if (state == SessionState.CLOSED) { + return; + } - updateState(SessionState.WAIT_SERVER_CLOSE); - this.closeReason = closeReason; + updateState(SessionState.WAIT_SERVER_CLOSE); + this.closeReason = closeReason; - // Not sending the CloseSessionRequest because cancel() will just drop it - stream.forceClose(closeReason.getDescription(), null); - // Listeners will be notified by dispatchStreamClosed - } + // Not sending the CloseSessionRequest because cancel() will just drop it + stream.forceClose(closeReason.getDescription(), null); + // Listeners will be notified by dispatchStreamClosed + }); } @Override public void start(OpenSessionRequest req, Metadata headers, Listener sessionListener) { - synchronized (lock) { - debugTagTracer.checkPrecondition( - state == SessionState.NEW, - "session_start_wrong_state", - "Tried to start a started session, current state: %s", - state); - - logger.fine(String.format("Starting session %s", info.getLogName())); - tracer.onStart(); - - updateState(SessionState.STARTING); - openParams = OpenParams.create(headers, req); - this.sessionListener = sessionListener; - - SessionRequest wrappedReq = SessionRequest.newBuilder().setOpenSession(req).build(); - stream.start( - new SessionStream.Listener() { - @Override - public void onBeforeSessionStart(PeerInfo peerInfo) {} - - @Override - public void onMessage(SessionResponse message) { - sessionSyncContext.execute(() -> dispatchResponseMessage(message)); - } - - @Override - public void onClose(Status status, Metadata trailers) { - sessionSyncContext.execute(() -> dispatchStreamClosed(status, trailers)); - } - }, - headers); - - stream.sendMessage(wrappedReq); - } + sessionSyncContext.execute( + () -> { + debugTagTracer.checkPrecondition( + state == SessionState.NEW, + "session_start_wrong_state", + "Tried to start a started session, current state: %s", + state); + + logger.fine(String.format("Starting session %s", info.getLogName())); + tracer.onStart(); + + updateState(SessionState.STARTING); + openParams = OpenParams.create(headers, req); + this.sessionListener = sessionListener; + + SessionRequest wrappedReq = SessionRequest.newBuilder().setOpenSession(req).build(); + stream.start( + new SessionStream.Listener() { + @Override + public void onBeforeSessionStart(PeerInfo peerInfo) {} + + @Override + public void onMessage(SessionResponse message) { + sessionSyncContext.execute(() -> dispatchResponseMessage(message)); + } + + @Override + public void onClose(Status status, Metadata trailers) { + sessionSyncContext.execute(() -> dispatchStreamClosed(status, trailers)); + } + }, + headers); + + stream.sendMessage(wrappedReq); + }); } @Override public void close(CloseSessionRequest req) { logger.fine(String.format("Closing session %s for reason: %s", info.getLogName(), req)); - synchronized (lock) { - // Throw an exception because this is a bug and we dont have a listener - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_close_wrong_state", - "Session error: Caller tried to close session %s before starting it with the reason: %s", - info.getLogName(), - req); - - // Multiple close is a no-op - if (state.phase >= SessionState.CLOSING.phase) { - logger.fine( - String.format( - "Session error: Caller tried to close a session %s that is %s for reason: %s", - info.getLogName(), state, req)); - return; - } + sessionSyncContext.execute( + () -> { + // Throw an exception because this is a bug and we dont have a listener + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_close_wrong_state", + "Session error: Caller tried to close session %s before starting it with the reason:" + + " %s", + info.getLogName(), + req); + + // Multiple close is a no-op + if (state.phase >= SessionState.CLOSING.phase) { + logger.fine( + String.format( + "Session error: Caller tried to close a session %s that is %s for reason: %s", + info.getLogName(), state, req)); + return; + } - closeReason = req; - updateState(SessionState.CLOSING); + closeReason = req; + updateState(SessionState.CLOSING); - if (currentRpc == null) { - startGracefulClose(); - } - } + if (currentRpc == null) { + startGracefulClose(); + } + }); } /** Wraps the flow of closing a session. */ - @GuardedBy("lock") private void startGracefulClose() { debugTagTracer.checkPrecondition( state == SessionState.CLOSING, @@ -369,42 +337,37 @@ VRpc newCall(VRpcDescriptor descriptor) { "session_new_call_wrong_type", "wrong VRpc descriptor type"); - synchronized (lock) { - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_new_call_wrong_state", - "Session error: newCall called before start"); + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_new_call_wrong_state", + "Session error: newCall called before start"); - long rpcId = nextRpcId; - nextRpcId = Math.incrementExact(nextRpcId); - return new VRpcImpl<>(this, descriptor, rpcId, stream.getPeerInfo(), debugTagTracer); - } + long rpcId = nextRpcId.getAndIncrement(); + return new VRpcImpl<>(this, descriptor, rpcId, stream.getPeerInfo(), debugTagTracer); } @Override public void startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { sessionSyncContext.execute( () -> { - synchronized (lock) { - if (currentRpc != null) { - rpc.handleError( - VRpcResult.createUncommitedError( - Status.INTERNAL.withDescription( - "Session error: RPC multiplexing is not yet supported"))); - return; - } - if (state != SessionState.READY) { - rpc.handleError( - VRpcResult.createUncommitedError( - Status.INTERNAL.withDescription( - "Session error: Session was not ready, state = " + state))); - return; - } - - this.currentRpc = rpc; - stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); - this.nextHeartbeat = clock.instant().plus(heartbeatInterval); + if (currentRpc != null) { + rpc.handleError( + VRpcResult.createUncommitedError( + Status.INTERNAL.withDescription( + "Session error: RPC multiplexing is not yet supported"))); + return; } + if (state != SessionState.READY) { + rpc.handleError( + VRpcResult.createUncommitedError( + Status.INTERNAL.withDescription( + "Session error: Session was not ready, state = " + state))); + return; + } + + this.currentRpc = rpc; + stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); + this.nextHeartbeat = clock.instant().plus(heartbeatInterval); }); } @@ -412,18 +375,15 @@ public void startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { public void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable cause) { sessionSyncContext.execute( () -> { - synchronized (lock) { - if (currentRpc != null && rpcId == currentRpc.rpcId) { - currentCancel = - VRpcResult.createRejectedError( - Status.CANCELLED.withDescription(message).withCause(cause)); - } - // do nothing if the rpc is already finished + if (currentRpc != null && rpcId == currentRpc.rpcId) { + currentCancel = + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription(message).withCause(cause)); } + // do nothing if the rpc is already finished }); } - @GuardedBy("lock") private void scheduleHeartbeatCheck() { heartbeatTimeout = timer.newTimeout( @@ -432,7 +392,6 @@ private void scheduleHeartbeatCheck() { TimeUnit.MILLISECONDS); } - @GuardedBy("lock") private void cancelHeartbeatTimeout() { if (heartbeatTimeout != null) { heartbeatTimeout.cancel(); @@ -444,19 +403,16 @@ private void cancelHeartbeatTimeout() { // heartbeat deadline has passed and force-closes on miss; otherwise re-schedules. private void checkHeartbeat() { sessionSyncContext.throwIfNotInThisSynchronizationContext(); - synchronized (lock) { - if (state.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { - return; - } - if (clock.instant().isAfter(nextHeartbeat)) { - logger.warning( - String.format("Missed heartbeat for %s, forcing session close", info.getLogName())); - // forceClose acquires the lock again; safe because Java monitors are re-entrant. - forceClose(MISSED_HEARTBEAT_CLOSE_REQUEST); - return; - } - scheduleHeartbeatCheck(); + if (state.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { + return; } + if (clock.instant().isAfter(nextHeartbeat)) { + logger.warning( + String.format("Missed heartbeat for %s, forcing session close", info.getLogName())); + forceClose(MISSED_HEARTBEAT_CLOSE_REQUEST); + return; + } + scheduleHeartbeatCheck(); } // region SessionStream event handlers @@ -494,51 +450,43 @@ private void handleOpenSessionResponse(OpenSessionResponse openSession) { sessionSyncContext.throwIfNotInThisSynchronizationContext(); logger.fine(String.format("%s Session is ready", info.getLogName())); - PeerInfo localPeerInfo; - - synchronized (lock) { - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_open_wrong_state", - "Got session open response before session started"); - debugTagTracer.checkPrecondition( - state != SessionState.CLOSED, - "session_open_wrong_state", - "Got session open response after session was closed"); + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_open_wrong_state", + "Got session open response before session started"); + debugTagTracer.checkPrecondition( + state != SessionState.CLOSED, + "session_open_wrong_state", + "Got session open response after session was closed"); - if (state != SessionState.STARTING) { - logger.fine(String.format("Stream was already %s when session open was received", state)); - return; - } - localPeerInfo = stream.getPeerInfo(); - updateState(SessionState.READY); - scheduleHeartbeatCheck(); + if (state != SessionState.STARTING) { + logger.fine(String.format("Stream was already %s when session open was received", state)); + return; } + PeerInfo localPeerInfo = stream.getPeerInfo(); + updateState(SessionState.READY); tracer.onOpen(localPeerInfo); sessionListener.onReady(openSession); + scheduleHeartbeatCheck(); } private void handleSessionParamsResponse(SessionParametersResponse resp) { - sessionSyncContext.throwIfNotInThisSynchronizationContext(); - synchronized (lock) { - if (state.phase >= SessionState.CLOSING.phase) { - logger.fine( - String.format("Stream was already %s when session params were received", state)); - return; - } + if (state.phase >= SessionState.CLOSING.phase) { + logger.fine(String.format("Stream was already %s when session params were received", state)); + return; + } - if (!sessionParameters.equals(resp)) { - this.sessionParameters = resp; - this.heartbeatInterval = - Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); - logger.log( - Level.CONFIG, - () -> - String.format( - "%s session params changed: %s", - info.getLogName(), - TextFormat.printer().emittingSingleLine(true).printToString(resp))); - } + if (!sessionParameters.equals(resp)) { + this.sessionParameters = resp; + this.heartbeatInterval = + Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); + logger.log( + Level.CONFIG, + () -> + String.format( + "%s session params changed: %s", + info.getLogName(), + TextFormat.printer().emittingSingleLine(true).printToString(resp))); } } @@ -547,45 +495,39 @@ private void handleVRpcResponse(VirtualRpcResponse vrpc) { // TODO: when stream is supported this should be updated to the next expected time instead of // session life time this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); - VRpcImpl rpc; - VRpcResult cancel; - boolean needsClose; - - synchronized (lock) { - if (state.phase > SessionState.CLOSING.phase) { - debugTagTracer.record( - TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); - logger.warning( - String.format( - "%s Discarding vRPC error because session is past the CLOSING phase with the" - + " reason: %s", - info.getLogName(), closeReason)); - return; - } - debugTagTracer.checkPrecondition( - state == SessionState.READY || state == SessionState.CLOSING, - "session_vrpc_response_wrong_state", - "Unexpected vRPC response when session is %s", - state); - debugTagTracer.checkPrecondition( - currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); - debugTagTracer.checkPrecondition( - currentRpc.rpcId == vrpc.getRpcId(), - "session_vrpc_id_mismatch", - "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", - currentRpc.rpcId, - vrpc.getRpcId()); - - // reset state of the current rpc - rpc = currentRpc; - cancel = currentCancel; - // TODO: handle multiplexing - currentRpc = null; - currentCancel = null; - needsClose = (state == SessionState.CLOSING); + if (state.phase > SessionState.CLOSING.phase) { + debugTagTracer.record( + TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); + logger.warning( + String.format( + "%s Discarding vRPC error because session is past the CLOSING phase with the" + + " reason: %s", + info.getLogName(), closeReason)); + return; } + debugTagTracer.checkPrecondition( + state == SessionState.READY || state == SessionState.CLOSING, + "session_vrpc_response_wrong_state", + "Unexpected vRPC response when session is %s", + state); + debugTagTracer.checkPrecondition( + currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); + debugTagTracer.checkPrecondition( + currentRpc.rpcId == vrpc.getRpcId(), + "session_vrpc_id_mismatch", + "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", + currentRpc.rpcId, + vrpc.getRpcId()); + + // reset state of the current rpc + VRpcImpl rpc = currentRpc; + VRpcResult cancel = currentCancel; + // TODO: handle multiplexing + currentRpc = null; + currentCancel = null; + if (cancel != null) { tracer.onVRpcClose(cancel.getStatus().getCode()); rpc.handleError(cancel); @@ -593,12 +535,8 @@ private void handleVRpcResponse(VirtualRpcResponse vrpc) { tracer.onVRpcClose(Status.OK.getCode()); rpc.handleResponse(vrpc); } - if (needsClose) { - synchronized (lock) { - if (state == SessionState.CLOSING) { - startGracefulClose(); - } - } + if (state == SessionState.CLOSING) { + startGracefulClose(); } } @@ -608,19 +546,16 @@ private void handleHeartBeatResponse(HeartbeatResponse ignored) { } private void handleSessionRefreshConfigResponse(SessionRefreshConfig config) { - sessionSyncContext.throwIfNotInThisSynchronizationContext(); - synchronized (lock) { - Metadata grpcMetadata = new Metadata(); - config - .getMetadataList() - .forEach( - entry -> - grpcMetadata.put( - Metadata.Key.of(entry.getKey(), Metadata.ASCII_STRING_MARSHALLER), - entry.getValue().toStringUtf8())); - openParams = OpenParams.create(grpcMetadata, config.getOptimizedOpenRequest()); - openParamsUpdated = true; - } + Metadata grpcMetadata = new Metadata(); + config + .getMetadataList() + .forEach( + entry -> + grpcMetadata.put( + Metadata.Key.of(entry.getKey(), Metadata.ASCII_STRING_MARSHALLER), + entry.getValue().toStringUtf8())); + openParams = OpenParams.create(grpcMetadata, config.getOptimizedOpenRequest()); + openParamsUpdated = true; } private void handleVRpcErrorResponse(ErrorResponse error) { @@ -628,44 +563,37 @@ private void handleVRpcErrorResponse(ErrorResponse error) { // Skips the heartbeat check when there's no active vrpc on the session this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); - VRpcImpl rpc; - VRpcResult cancel; - boolean needsClose; + if (state.phase > SessionState.CLOSING.phase) { + debugTagTracer.record( + TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); + logger.warning( + String.format( + "%s Discarding vRPC error because session is past the CLOSING phase with the" + + " reason: %s, error was: %s", + info.getLogName(), closeReason, error)); + return; + } - synchronized (lock) { - if (state.phase > SessionState.CLOSING.phase) { - debugTagTracer.record( - TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); - logger.warning( - String.format( - "%s Discarding vRPC error because session is past the CLOSING phase with the" - + " reason: %s, error was: %s", - info.getLogName(), closeReason, error)); - return; - } + debugTagTracer.checkPrecondition( + state == SessionState.READY || state == SessionState.CLOSING, + "session_vrpc_response_wrong_state", + "Unexpected vRPC response when session is %s", + state); - debugTagTracer.checkPrecondition( - state == SessionState.READY || state == SessionState.CLOSING, - "session_vrpc_response_wrong_state", - "Unexpected vRPC response when session is %s", - state); - - debugTagTracer.checkPrecondition( - currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); - debugTagTracer.checkPrecondition( - currentRpc.rpcId == error.getRpcId(), - "session_vrpc_id_mismatch", - "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", - currentRpc.rpcId, - error.getRpcId()); - - // reset the state of the current rpc - rpc = currentRpc; - cancel = currentCancel; - currentRpc = null; - currentCancel = null; - needsClose = (state == SessionState.CLOSING); - } + debugTagTracer.checkPrecondition( + currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); + debugTagTracer.checkPrecondition( + currentRpc.rpcId == error.getRpcId(), + "session_vrpc_id_mismatch", + "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", + currentRpc.rpcId, + error.getRpcId()); + + // reset the state of the current rpc + VRpcImpl rpc = currentRpc; + VRpcResult cancel = currentCancel; + currentRpc = null; + currentCancel = null; if (cancel != null) { tracer.onVRpcClose(cancel.getStatus().getCode()); @@ -674,43 +602,35 @@ private void handleVRpcErrorResponse(ErrorResponse error) { tracer.onVRpcClose(Status.fromCodeValue(error.getStatus().getCode()).getCode()); rpc.handleError(VRpcResult.createServerError(error)); } - if (needsClose) { - synchronized (lock) { - if (state == SessionState.CLOSING) { - startGracefulClose(); - } - } + if (state == SessionState.CLOSING) { + startGracefulClose(); } } private void handleGoAwayResponse(GoAwayResponse goAwayResponse) { - sessionSyncContext.throwIfNotInThisSynchronizationContext(); - synchronized (lock) { - if (state.phase >= SessionState.CLOSING.phase) { - debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_go_away_ignored"); - logger.warning( - String.format( - "Session error: %s Ignoring goaway because session is %s", - info.getLogName(), state)); - return; - } + if (state.phase >= SessionState.CLOSING.phase) { + debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_go_away_ignored"); + logger.warning( + String.format( + "Session error: %s Ignoring goaway because session is %s", info.getLogName(), state)); + return; + } - debugTagTracer.checkPrecondition( - state.phase >= SessionState.STARTING.phase, - "session_go_away_wrong_state", - "Unexpected goaway when session is %s", - state); + debugTagTracer.checkPrecondition( + state.phase >= SessionState.STARTING.phase, + "session_go_away_wrong_state", + "Unexpected goaway when session is %s", + state); - updateState(SessionState.CLOSING); - closeReason = - CloseSessionRequest.newBuilder() - .setReason(CloseSessionReason.CLOSE_SESSION_REASON_GOAWAY) - .setDescription( - "Server sent GO_AWAY_" + goAwayResponse.getReason().toUpperCase(Locale.ENGLISH)) - .build(); - if (currentRpc == null) { - startGracefulClose(); - } + updateState(SessionState.CLOSING); + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_GOAWAY) + .setDescription( + "Server sent GO_AWAY_" + goAwayResponse.getReason().toUpperCase(Locale.ENGLISH)) + .build(); + if (currentRpc == null) { + startGracefulClose(); } sessionListener.onGoAway(goAwayResponse); } @@ -721,51 +641,44 @@ private void handleUnknownResponseMessage(SessionResponse message) { } private void dispatchStreamClosed(Status status, Metadata trailers) { - sessionSyncContext.throwIfNotInThisSynchronizationContext(); - SessionState prevState; - VRpcImpl localVRpc; + SessionState prevState = state; - PeerInfo localPeerInfo; - synchronized (lock) { - prevState = state; + if (!status.isOk()) { + String augmentedDescription = + Optional.ofNullable(status.getDescription()).map(d -> d + ". ").orElse("") + + "PeerInfo: " + + formatPeerInfo(getPeerInfo()); - if (!status.isOk()) { - String augmentedDescription = - Optional.ofNullable(status.getDescription()).map(d -> d + ". ").orElse("") - + "PeerInfo: " - + formatPeerInfo(getPeerInfo()); + status = status.withDescription(augmentedDescription); + } - status = status.withDescription(augmentedDescription); - } + if (state == SessionState.WAIT_SERVER_CLOSE) { + logger.fine(String.format("%s closed normally with status %s", info.getLogName(), status)); + } else { + debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_abnormal_close"); + // Unexpected path + String msg = + String.format( + "Session error: %s session closed unexpectedly in state %s. Status: %s", + info.getLogName(), state, status); + logger.warning(msg); - if (state == SessionState.WAIT_SERVER_CLOSE) { - logger.fine(String.format("%s closed normally with status %s", info.getLogName(), status)); - } else { - debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_abnormal_close"); - // Unexpected path - String msg = - String.format( - "Session error: %s session closed unexpectedly in state %s. Status: %s", - info.getLogName(), state, status); - logger.warning(msg); - - if (state == SessionState.CLOSED) { - return; - } - - closeReason = - CloseSessionRequest.newBuilder() - .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) - .setDescription("Unexpected session close with status: " + status.getCode()) - .build(); + if (state == SessionState.CLOSED) { + return; } - localVRpc = currentRpc; - localPeerInfo = stream.getPeerInfo(); - currentRpc = null; - updateState(SessionState.CLOSED); + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) + .setDescription("Unexpected session close with status: " + status.getCode()) + .build(); } + VRpcImpl localVRpc = currentRpc; + PeerInfo localPeerInfo = stream.getPeerInfo(); + currentRpc = null; + updateState(SessionState.CLOSED); + if (localVRpc != null) { try { localVRpc.handleSessionClose(VRpcResult.createRemoteTransportError(status, trailers)); @@ -784,7 +697,6 @@ private void dispatchStreamClosed(Status status, Metadata trailers) { sessionListener.onClose(prevState, status, trailers); } - @GuardedBy("lock") private void updateState(SessionState newState) { this.state = newState; this.lastStateChangedAt = clock.instant(); diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java index c50e4f55d5c2..90a7d3a20066 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java @@ -494,8 +494,14 @@ void testHeartbeat() throws Exception { VRpcCallContext.create(Deadline.after(1, TimeUnit.MINUTES), true, tracer), f); - assertThat(session.getNextHeartbeat()) - .isEqualTo(time.plus(Duration.ofMillis(keepAliveDurationMs))); + // startRpc() is now async; poll until sessionSyncContext processes it. + Instant expectedHeartbeat = time.plus(Duration.ofMillis(keepAliveDurationMs)); + Stopwatch sw = Stopwatch.createStarted(); + while (!session.getNextHeartbeat().equals(expectedHeartbeat) + && sw.elapsed(TimeUnit.SECONDS) < 5) { + Thread.sleep(10); + } + assertThat(session.getNextHeartbeat()).isEqualTo(expectedHeartbeat); assertThat(f.get()).isEqualTo(SessionFakeScriptedResponse.getDefaultInstance()); From e03690f9d22c6d00de1ce717cbb30db893e09060 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Thu, 4 Jun 2026 20:57:57 +0000 Subject: [PATCH 09/24] chore: abort session on uncaught exception in sessionSyncContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split terminal close into notifyTerminalClose (per-target try/catch fan-out) and abortFromUncaughtException (global handler). Uncaught syncContext exceptions always set closeReason to ERROR — the prior reason is folded into the description so tracer/metrics correctly attribute aborts. Adds three regression tests (listener.onReady throws, onClose throws, both throw). --- .../data/v2/internal/session/SessionImpl.java | 149 +++++++++++++++--- .../v2/internal/session/SessionImplTest.java | 143 +++++++++++++++++ 2 files changed, 274 insertions(+), 18 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 45868583274b..9771bf4ed749 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -126,11 +126,13 @@ public class SessionImpl implements Session, VRpcSessionApi { private Instant nextHeartbeat; - // Handle for the in-flight heartbeat tick (one outstanding at a time). Set under lock when the - // session enters READY (handleOpenSessionResponse) and again from checkHeartbeat to chain the - // next tick. Cancelled under lock from updateState when the session transitions past READY. - @Nullable - private BigtableTimer.Timeout heartbeatTimeout; + // Handle for the in-flight heartbeat tick (one outstanding at a time). Cancelled on terminal + // transitions so the wheel doesn't carry a no-op entry until the next fire. + @Nullable private BigtableTimer.Timeout heartbeatTimeout; + + // Set by the global SyncContext handler when an uncaught exception triggers an abort. Read on + // re-entry to break out instead of looping. Only accessed inside sessionSyncContext. + private boolean isAborting = false; public SessionImpl( Metrics metrics, @@ -156,14 +158,77 @@ public SessionImpl( this.debugTagTracer = metrics.getDebugTagTracer(); this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); this.openParamsUpdated = false; + // On uncaught exception, drive the session through a clean terminal-close path so the pool + // and the in-flight vRpc are always notified. notifyTerminalClose has local guards, and + // isAborting prevents recursion if the abort path itself throws. this.sessionSyncContext = - new SynchronizationContext( - (thread, e) -> - logger.log( - Level.WARNING, - String.format( - "Uncaught exception in session SyncContext for %s", info.getLogName()), - e)); + new SynchronizationContext((thread, e) -> abortFromUncaughtException(e)); + } + + private void abortFromUncaughtException(Throwable e) { + if (isAborting) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Secondary uncaught exception during abort, ignoring", + info.getLogName()), + e); + return; + } + isAborting = true; + + logger.log( + Level.SEVERE, + String.format( + "Session error: %s Uncaught exception in session SyncContext in state %s, PeerInfo:" + + " %s — aborting session", + info.getLogName(), state, formatPeerInfo(safeGetPeerInfo())), + e); + + if (state == SessionState.CLOSED) { + return; + } + + // Always overwrite closeReason: the abort is what actually happened, not whatever clean close + // we may have been attempting. Fold the prior reason into the description for forensics so + // downstream metrics (which bucket by reason) attribute this to ERROR rather than the + // interrupted close. + String prevDesc = + (closeReason != null) + ? " (was closing for: " + + closeReason.getReason() + + " — " + + closeReason.getDescription() + + ")" + : ""; + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) + .setDescription("Uncaught exception in session SyncContext: " + e + prevDesc) + .build(); + + VRpcImpl localRpc = currentRpc; + currentRpc = null; + SessionState prevState = state; + updateState(SessionState.CLOSED); + + // Defensively tell the transport we're done. Safe on un-started streams via the try/catch. + try { + stream.forceClose("Session aborted due to uncaught exception", e); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Exception while force-closing stream during abort", + info.getLogName()), + t); + } + + notifyTerminalClose( + Status.INTERNAL.withDescription("Session aborted").withCause(e), + new Metadata(), + localRpc, + prevState); } @Override @@ -675,13 +740,27 @@ private void dispatchStreamClosed(Status status, Metadata trailers) { } VRpcImpl localVRpc = currentRpc; - PeerInfo localPeerInfo = stream.getPeerInfo(); currentRpc = null; updateState(SessionState.CLOSED); - if (localVRpc != null) { + notifyTerminalClose(status, trailers, localVRpc, prevState); + } + + /** + * Fan out terminal notifications to the in-flight vRpc, tracer, and session listener with local + * guards so a throw in one notification does not suppress the others. + * + *

Caller contract: must have already transitioned to {@link SessionState#CLOSED}, captured and + * cleared {@code currentRpc}, and ensured {@code closeReason} is set (synthesizing if absent). + */ + private void notifyTerminalClose( + Status status, + Metadata trailers, + @Nullable VRpcImpl localRpc, + SessionState prevState) { + if (localRpc != null) { try { - localVRpc.handleSessionClose(VRpcResult.createRemoteTransportError(status, trailers)); + localRpc.handleSessionClose(VRpcResult.createRemoteTransportError(status, trailers)); } catch (Throwable t) { logger.log( Level.WARNING, @@ -691,10 +770,44 @@ private void dispatchStreamClosed(Status status, Metadata trailers) { info.getLogName(), status), t); } - tracer.onVRpcClose(Status.UNAVAILABLE.getCode()); + try { + tracer.onVRpcClose(Status.UNAVAILABLE.getCode()); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Unhandled exception in tracer.onVRpcClose", info.getLogName()), + t); + } + } + try { + tracer.onClose(safeGetPeerInfo(), closeReason.getReason(), status); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format("Session error: %s Unhandled exception in tracer.onClose", info.getLogName()), + t); + } + if (sessionListener != null) { + try { + sessionListener.onClose(prevState, status, trailers); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Unhandled exception in sessionListener.onClose", + info.getLogName()), + t); + } + } + } + + private PeerInfo safeGetPeerInfo() { + try { + return stream.getPeerInfo(); + } catch (Throwable t) { + return SessionStream.DISCONNECTED_PEER_INFO; } - tracer.onClose(localPeerInfo, closeReason.getReason(), status); - sessionListener.onClose(prevState, status, trailers); } private void updateState(SessionState newState) { diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java index 90a7d3a20066..b7e12e057818 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java @@ -569,4 +569,147 @@ void testCancel() throws Exception { session.close(CloseSessionRequest.getDefaultInstance()); sessionListener.popUntil(Status.class); } + + // region uncaught-exception abort behaviors + + @Test + void abortFiresWhenListenerOnReadyThrows() throws Exception { + SessionImpl session = + new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); + + java.util.concurrent.CountDownLatch onCloseLatch = new java.util.concurrent.CountDownLatch(1); + java.util.concurrent.atomic.AtomicReference capturedStatus = + new java.util.concurrent.atomic.AtomicReference<>(); + + Session.Listener throwingListener = + new Session.Listener() { + @Override + public void onReady(OpenSessionResponse msg) { + throw new RuntimeException("simulated onReady failure"); + } + + @Override + public void onGoAway(GoAwayResponse msg) {} + + @Override + public void onClose(Session.SessionState prevState, Status status, Metadata trailers) { + capturedStatus.set(status); + onCloseLatch.countDown(); + } + }; + + session.start( + OpenSessionRequest.newBuilder() + .setPayload(OpenFakeSessionRequest.getDefaultInstance().toByteString()) + .build(), + new Metadata(), + throwingListener); + + // The abort path must drive the session to CLOSED and notify the listener via onClose, even + // though the original onReady threw. + assertWithMessage("listener.onClose must be invoked after onReady throws") + .that(onCloseLatch.await(5, TimeUnit.SECONDS)) + .isTrue(); + assertThat(session.getState()).isEqualTo(Session.SessionState.CLOSED); + assertThat(capturedStatus.get().getCode()).isEqualTo(Status.Code.INTERNAL); + } + + @Test + void abortDoesNotHangWhenListenerOnCloseThrows() throws Exception { + SessionImpl session = + new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); + + java.util.concurrent.CountDownLatch onReadyLatch = new java.util.concurrent.CountDownLatch(1); + java.util.concurrent.CountDownLatch onCloseLatch = new java.util.concurrent.CountDownLatch(1); + + Session.Listener throwingListener = + new Session.Listener() { + @Override + public void onReady(OpenSessionResponse msg) { + onReadyLatch.countDown(); + } + + @Override + public void onGoAway(GoAwayResponse msg) {} + + @Override + public void onClose(Session.SessionState prevState, Status status, Metadata trailers) { + onCloseLatch.countDown(); + throw new RuntimeException("simulated onClose failure"); + } + }; + + session.start( + OpenSessionRequest.newBuilder() + .setPayload(OpenFakeSessionRequest.getDefaultInstance().toByteString()) + .build(), + new Metadata(), + throwingListener); + + assertThat(onReadyLatch.await(5, TimeUnit.SECONDS)).isTrue(); + + // Close normally. The listener's onClose throws — the local guard inside notifyTerminalClose + // must swallow it so the SyncContext drain doesn't recurse infinitely or hang. + session.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("test") + .build()); + + assertWithMessage("listener.onClose should be invoked exactly once during normal close") + .that(onCloseLatch.await(5, TimeUnit.SECONDS)) + .isTrue(); + + // The session should reach CLOSED state cleanly within the test timeout. + Stopwatch sw = Stopwatch.createStarted(); + while (session.getState() != Session.SessionState.CLOSED && sw.elapsed().getSeconds() < 5) { + Thread.sleep(10); + } + assertThat(session.getState()).isEqualTo(Session.SessionState.CLOSED); + } + + @Test + void abortDoesNotInfiniteLoopWhenRecoveryListenerAlsoThrows() throws Exception { + SessionImpl session = + new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); + + java.util.concurrent.CountDownLatch onCloseInvoked = + new java.util.concurrent.CountDownLatch(1); + + Session.Listener doublyThrowingListener = + new Session.Listener() { + @Override + public void onReady(OpenSessionResponse msg) { + throw new RuntimeException("simulated onReady failure"); + } + + @Override + public void onGoAway(GoAwayResponse msg) {} + + @Override + public void onClose(Session.SessionState prevState, Status status, Metadata trailers) { + onCloseInvoked.countDown(); + throw new RuntimeException("simulated onClose failure during abort"); + } + }; + + session.start( + OpenSessionRequest.newBuilder() + .setPayload(OpenFakeSessionRequest.getDefaultInstance().toByteString()) + .build(), + new Metadata(), + doublyThrowingListener); + + // onReady throws → abort fires → abort calls onClose, which also throws → Guard 4 swallows + // and isAborting prevents the handler from re-driving abort. The session must reach CLOSED + // without hanging (the @Timeout(30) on the class is the safety net for infinite loops). + assertThat(onCloseInvoked.await(5, TimeUnit.SECONDS)).isTrue(); + Stopwatch sw = Stopwatch.createStarted(); + while (session.getState() != Session.SessionState.CLOSED && sw.elapsed().getSeconds() < 5) { + Thread.sleep(10); + } + assertThat(session.getState()).isEqualTo(Session.SessionState.CLOSED); + } + + // endregion } From 91740172f833ad3bd3287c8b479864aba864a068 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sun, 14 Jun 2026 16:38:29 +0000 Subject: [PATCH 10/24] chore: isolate user-callback executor on a cached thread pool Client (and ShimImpl) own a dedicated bigtable-callback-%d cached pool, plumbed through *Async / TableBase. A blocked user callback can no longer starve heartbeats, retry delays, or pool bookkeeping (all of which run on backgroundExecutor). The op-level SerializingExecutor in a later commit will dispatch onto this pool. --- .../v2/internal/api/AuthorizedViewAsync.java | 6 ++-- .../bigtable/data/v2/internal/api/Client.java | 33 +++++++++++++++---- .../internal/api/MaterializedViewAsync.java | 6 ++-- .../data/v2/internal/api/TableAsync.java | 6 ++-- .../data/v2/internal/api/TableBase.java | 11 +++++-- .../data/v2/internal/compat/ShimImpl.java | 10 +++++- .../data/v2/internal/api/TableBaseTest.java | 3 +- 7 files changed, 58 insertions(+), 17 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java index fdb79871324f..13dc6d7a5d72 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java @@ -48,7 +48,8 @@ static AuthorizedViewAsync createAndStart( String viewId, Permission permission, Metrics metrics, - BigtableTimer timer) { + BigtableTimer timer, + java.util.concurrent.Executor userCallbackExecutor) { AuthorizedViewName viewName = AuthorizedViewName.builder() @@ -78,7 +79,8 @@ static AuthorizedViewAsync createAndStart( callOptions, viewName.toString(), metrics, - timer); + timer, + userCallbackExecutor); return new AuthorizedViewAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index 5214c2480131..0d63a07e866e 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -33,10 +33,11 @@ import com.google.cloud.bigtable.data.v2.internal.csm.MetricsImpl; import com.google.cloud.bigtable.data.v2.internal.csm.NoopMetrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; -import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.grpc.CallOptions; import io.opencensus.stats.Stats; import io.opencensus.tags.Tags; @@ -45,6 +46,7 @@ import java.util.Collections; import java.util.Set; import java.util.WeakHashMap; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -73,6 +75,11 @@ public class Client implements AutoCloseable { private final FeatureFlags featureFlags; private final ClientInfo clientInfo; private final Resource backgroundExecutor; + // Drains the per-op SerializingExecutor. Cached pool so a blocked user callback does not starve + // heartbeats, retry delays, or other vRPCs (which all run on backgroundExecutor). + // TODO: source from the gax TransportChannelProvider so transport and user-callback dispatch + // share the same pool. Blocked on missing APIs to extract the configured executor from gax. + private final Resource userCallbackExecutor; // Hashed-wheel timer for heartbeat / deadline / watchdog / retry scheduling. Built over // backgroundExecutor (the timer's tick thread dispatches bodies onto it). Single tick thread per // Client, shared across every SessionPoolImpl. @@ -96,6 +103,12 @@ public static Client create(ClientSettings settings) throws IOException { .build(); ScheduledExecutorService backgroundExecutor = Executors.newScheduledThreadPool(4); + ExecutorService userCallbackExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder() + .setNameFormat("bigtable-callback-%d") + .setDaemon(true) + .build()); // TODO: compat layer: get this from settings String universeDomain = "googleapis.com"; @@ -143,6 +156,7 @@ public static Client create(ClientSettings settings) throws IOException { } metrics.close(); backgroundExecutor.shutdown(); + userCallbackExecutor.shutdown(); throw new RuntimeException("Failed to fetch initial config", e); } @@ -156,7 +170,8 @@ public static Client create(ClientSettings settings) throws IOException { settings.getChannelProvider(), Resource.createOwned(metrics, metrics::close), Resource.createOwned(configManager, configManager::close), - Resource.createOwned(backgroundExecutor, backgroundExecutor::shutdown)); + Resource.createOwned(backgroundExecutor, backgroundExecutor::shutdown), + Resource.createOwned(userCallbackExecutor, userCallbackExecutor::shutdown)); } public Client( @@ -165,13 +180,15 @@ public Client( ChannelProvider channelProvider, Resource metrics, Resource configManager, - Resource bgExecutor) + Resource bgExecutor, + Resource userCallbackExecutor) throws IOException { this.featureFlags = featureFlags; this.clientInfo = clientInfo; this.metrics = metrics; this.configManager = configManager; this.backgroundExecutor = bgExecutor; + this.userCallbackExecutor = userCallbackExecutor; // Timer's tick thread dispatches bodies onto backgroundExecutor — tick-thread-blocking work // (anything that takes a pool lock) gets handed off there instead of stalling the wheel. this.sessionTimer = new NettyWheelTimer("bigtable-session-timer", bgExecutor.get()); @@ -214,6 +231,7 @@ public void close() { // Stop the timer before tearing down backgroundExecutor (the timer's dispatcher). sessionTimer.stop(); backgroundExecutor.close(); + userCallbackExecutor.close(); } public TableAsync openTableAsync(String tableId, Permission permission) { @@ -227,7 +245,8 @@ public TableAsync openTableAsync(String tableId, Permission permission) { tableId, permission, metrics.get(), - sessionTimer); + sessionTimer, + userCallbackExecutor.get()); sessionPools.add(tableAsync.getSessionPool()); return tableAsync; } @@ -245,7 +264,8 @@ public AuthorizedViewAsync openAuthorizedViewAsync( viewId, permission, metrics.get(), - sessionTimer); + sessionTimer, + userCallbackExecutor.get()); sessionPools.add(viewAsync.getSessionPool()); return viewAsync; } @@ -262,7 +282,8 @@ public MaterializedViewAsync openMaterializedViewAsync( viewId, permission, metrics.get(), - sessionTimer); + sessionTimer, + userCallbackExecutor.get()); sessionPools.add(viewAsync.getSessionPool()); return viewAsync; } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java index 81a76f876fe6..7b955f3d0718 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java @@ -44,7 +44,8 @@ public static MaterializedViewAsync createAndStart( String viewId, OpenMaterializedViewRequest.Permission permission, Metrics metrics, - BigtableTimer timer) { + BigtableTimer timer, + java.util.concurrent.Executor userCallbackExecutor) { MaterializedViewName viewName = MaterializedViewName.builder() @@ -73,7 +74,8 @@ public static MaterializedViewAsync createAndStart( callOptions, viewId, metrics, - timer); + timer, + userCallbackExecutor); return new MaterializedViewAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java index 8fb0ac27a2d3..86478e3f0f1f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java @@ -47,7 +47,8 @@ public static TableAsync createAndStart( String tableId, Permission permission, Metrics metrics, - BigtableTimer timer) { + BigtableTimer timer, + java.util.concurrent.Executor userCallbackExecutor) { TableName tableName = TableName.builder() @@ -76,7 +77,8 @@ public static TableAsync createAndStart( callOptions, tableId, metrics, - timer); + timer, + userCallbackExecutor); return new TableAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java index a11a05c9d4b9..ae8980af1aab 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java @@ -39,10 +39,12 @@ import io.grpc.Context; import io.grpc.Deadline; import io.grpc.Metadata; +import java.util.concurrent.Executor; class TableBase implements AutoCloseable { private final SessionPool sessionPool; private final BigtableTimer timer; + private final Executor userCallbackExecutor; private final Metrics metrics; private final VRpcDescriptor readRowDescriptor; private final VRpcDescriptor @@ -60,7 +62,8 @@ static TableBase createAndStart( CallOptions callOptions, String sessionPoolName, Metrics metrics, - BigtableTimer timer) { + BigtableTimer timer, + Executor userCallbackExecutor) { SessionPool sessionPool = new SessionPoolImpl<>( @@ -77,7 +80,7 @@ static TableBase createAndStart( sessionPool.start(openReq, new Metadata()); return new TableBase( - sessionPool, readRowDescriptor, mutateRowDescriptor, metrics, timer); + sessionPool, readRowDescriptor, mutateRowDescriptor, metrics, timer, userCallbackExecutor); } @VisibleForTesting @@ -86,12 +89,14 @@ static TableBase createAndStart( VRpcDescriptor readRowDescriptor, VRpcDescriptor mutateRowDescriptor, Metrics metrics, - BigtableTimer timer) { + BigtableTimer timer, + Executor userCallbackExecutor) { this.sessionPool = sessionPool; this.readRowDescriptor = readRowDescriptor; this.mutateRowDescriptor = mutateRowDescriptor; this.metrics = metrics; this.timer = timer; + this.userCallbackExecutor = userCallbackExecutor; } @Override diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java index 353973dc8e58..fcddf47f03b2 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java @@ -160,6 +160,13 @@ public static Shim create( featureFlags = featureFlags.toBuilder().setSessionsRequired(true).build(); } + java.util.concurrent.ExecutorService userCallbackExecutor = + java.util.concurrent.Executors.newCachedThreadPool( + new com.google.common.util.concurrent.ThreadFactoryBuilder() + .setNameFormat("bigtable-callback-shim-%d") + .setDaemon(true) + .build()); + Client client = new Client( clientChannelProvider.updateFeatureFlags(featureFlags), @@ -167,7 +174,8 @@ public static Shim create( clientChannelProvider, Resource.createShared(metrics), Resource.createShared(configManager), - Resource.createShared(bgExecutor)); + Resource.createShared(bgExecutor), + Resource.createOwned(userCallbackExecutor, userCallbackExecutor::shutdown)); return new ShimImpl(configManager, client); } diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java index b2b6e819e959..19a7bbed93bb 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java @@ -72,7 +72,8 @@ public void setup() { VRpcDescriptor.READ_ROW, VRpcDescriptor.MUTATE_ROW, noopMetrics, - mockTimer); + mockTimer, + com.google.common.util.concurrent.MoreExecutors.directExecutor()); deadline = Deadline.after(1, TimeUnit.MINUTES); f = new UnaryResponseFuture<>(); } From 9f741b089e6de1c09531d81784d1bb690d1799d3 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sun, 14 Jun 2026 16:45:28 +0000 Subject: [PATCH 11/24] chore: back OpExecutor with SerializingExecutor(userCallbackExecutor) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VOperationImpl now constructs OpExecutor over a per-call SequentialExecutor on the shared userCallbackExecutor, replacing the per-call SynchronizationContext. OpExecutor gains an UncaughtExceptionHandler ctor arg — the safety net that the removed RetryingVRpc-owned SyncContext provided. The 3-arg VRpcCallContext.create defaults to a no-op handler for tests; production callers go through VOperationImpl. --- .../data/v2/internal/api/TableBase.java | 6 +++-- .../v2/internal/middleware/OpExecutor.java | 22 +++++++++++++------ .../internal/middleware/VOperationImpl.java | 19 ++++++++++------ .../data/v2/internal/middleware/VRpc.java | 5 ++++- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java index ae8980af1aab..4082c306dae8 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java @@ -118,7 +118,8 @@ public void readRow( new RetryingVRpc<>(() -> sessionPool.newCall(readRowDescriptor), timer); VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), readRowDescriptor, deadline); - new VOperationImpl<>(retry, Context.current(), tracer, deadline, true).start(req, listener); + new VOperationImpl<>(retry, Context.current(), userCallbackExecutor, tracer, deadline, true) + .start(req, listener); } public void mutateRow( @@ -131,7 +132,8 @@ public void mutateRow( VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), mutateRowDescriptor, deadline); - new VOperationImpl<>(retry, Context.current(), tracer, deadline, idempotent) + new VOperationImpl<>( + retry, Context.current(), userCallbackExecutor, tracer, deadline, idempotent) .start(req, listener); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java index 91669284549c..7622647bcc38 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java @@ -19,22 +19,28 @@ import java.util.concurrent.Executor; /** - * Per-op serializing executor. Wraps a delegate {@link Executor} and tracks which thread is - * currently running a task, so callers can assert they are on the executor (analogous to {@code + * Per-op serializing executor. Wraps a delegate {@link Executor} (typically a per-call {@code + * SerializingExecutor} over the user-callback pool) and tracks which thread is currently running a + * task, so callers can assert they are on the executor (analogous to {@code * SynchronizationContext#throwIfNotInThisSynchronizationContext}). * - *

Backing executor evolves over the refactor — for now it is the per-call {@link - * io.grpc.SynchronizationContext} that {@link VOperationImpl} constructs. Later commits swap it - * for a {@code SerializingExecutor} over the user-callback pool, and eventually a tailored inline - * queue. + *

If a task throws, the registered {@link UncaughtExceptionHandler} is invoked — this is the + * last-resort recovery point for the chain. Without it, a throw from a user-installed tracer or a + * listener callback would silently drop and the caller's future would never complete. */ public final class OpExecutor implements Executor { + public interface UncaughtExceptionHandler { + void uncaught(Throwable t); + } + private final Executor backing; + private final UncaughtExceptionHandler handler; private volatile Thread runningThread; - public OpExecutor(Executor backing) { + public OpExecutor(Executor backing, UncaughtExceptionHandler handler) { this.backing = backing; + this.handler = handler; } @Override @@ -45,6 +51,8 @@ public void execute(Runnable r) { runningThread = Thread.currentThread(); try { r.run(); + } catch (Throwable t) { + handler.uncaught(t); } finally { runningThread = prev; } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java index 4d8b7427d144..8e8a1554bea8 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -24,8 +24,8 @@ import com.google.common.util.concurrent.MoreExecutors; import io.grpc.Context; import io.grpc.Deadline; -import io.grpc.SynchronizationContext; import java.util.Optional; +import java.util.concurrent.Executor; import java.util.concurrent.TimeoutException; import javax.annotation.Nullable; @@ -39,6 +39,7 @@ public class VOperationImpl implements VOperation { private final VRpc chain; private final Context grpcContext; + private final Executor userCallbackExecutor; private final VRpcTracer tracer; private final Deadline deadline; private final boolean idempotent; @@ -47,11 +48,13 @@ public class VOperationImpl implements VOperation { public VOperationImpl( VRpc chain, Context grpcContext, + Executor userCallbackExecutor, VRpcTracer tracer, Deadline deadline, boolean idempotent) { this.chain = chain; this.grpcContext = grpcContext; + this.userCallbackExecutor = userCallbackExecutor; this.tracer = tracer; this.deadline = deadline; this.idempotent = idempotent; @@ -69,12 +72,14 @@ public VOperationImpl( @Override public void start(ReqT req, VRpcListener listener) { - // Per-call SynchronizationContext serializes all middleware below this layer. Uncaught task - // failures drive the chain to a terminal state so the caller's listener still gets onClose; - // RetryingVRpc.cancel is idempotent so the resulting cascade collapses safely. - SynchronizationContext syncContext = - new SynchronizationContext((t, e) -> chain.cancel("Uncaught exception in op executor", e)); - OpExecutor exec = new OpExecutor(syncContext); + // Per-call SerializingExecutor over the shared user-callback pool. The handler is the + // last-resort recovery: if any op-executor task throws (typically a user-installed tracer or + // a listener callback escape), drive the chain to a terminal state so the caller's listener + // still receives an onClose. RetryingVRpc.cancel is idempotent so cascades collapse safely. + OpExecutor exec = + new OpExecutor( + MoreExecutors.newSequentialExecutor(userCallbackExecutor), + t -> chain.cancel("Uncaught exception in op executor task", t)); VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer, exec); grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); chain.start(req, ctx, new CleanupListener<>(listener, grpcContext, cancellationListener)); diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java index d17f9b6eee7f..d5a605461d37 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java @@ -111,7 +111,10 @@ abstract class VRpcCallContext { public static VRpcCallContext create( Deadline deadline, boolean isIdempotent, VRpcTracer tracer) { return create( - deadline, isIdempotent, tracer, new OpExecutor(MoreExecutors.directExecutor())); + deadline, + isIdempotent, + tracer, + new OpExecutor(MoreExecutors.directExecutor(), t -> {})); } public static VRpcCallContext create( From ac7b8901a47989e7d62d7a0c79b1dafd8908b56a Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Thu, 28 May 2026 02:22:22 +0000 Subject: [PATCH 12/24] chore: configure gRPC session streams with DirectExecutor CallOptions.withExecutor(MoreExecutors.directExecutor()) on the session stream so Netty I/O threads deliver SessionStream.Listener callbacks directly; sessionSyncContext immediately trampolines off them. --- .../data/v2/internal/channels/ChannelPoolDpImpl.java | 6 +++++- .../bigtable/data/v2/internal/channels/SessionStream.java | 8 ++++++++ .../data/v2/internal/channels/SingleChannelPool.java | 6 +++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java index 7913b28ef14c..9a60a271ea7f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java @@ -26,6 +26,7 @@ import com.google.common.base.Ticker; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; +import com.google.common.util.concurrent.MoreExecutors; import io.grpc.CallOptions; import io.grpc.ClientCall; import io.grpc.ManagedChannel; @@ -246,8 +247,11 @@ public synchronized SessionStream newStream( channelWrapper.group.numStreams++; totalStreams++; + // DirectExecutor: gRPC/Netty delivers SessionStream.Listener callbacks directly on the + // I/O thread. All work must be fast and non-blocking; blocking work goes to sessionSyncContext. ClientCall innerCall = - channelWrapper.channel.newCall(desc, callOptions); + channelWrapper.channel.newCall( + desc, callOptions.withExecutor(MoreExecutors.directExecutor())); return new SessionStreamImpl(innerCall) { // mark as null so that onClose can tell if onBeforeSessionStart was never called diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java index ebbc39af7f2c..eb6dbfa339eb 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java @@ -37,6 +37,14 @@ public interface SessionStream { public void forceClose(@Nullable String message, @Nullable Throwable cause); + /** + * Callbacks for session stream events. + * + *

Invariant: callbacks are delivered on Netty I/O threads via {@code DirectExecutor}. + * All work must be fast and non-blocking — any user-facing or potentially blocking work must be + * dispatched onto the session {@code SynchronizationContext} (which then forwards to the op + * executor) before returning. Violating this stalls the channel. + */ public interface Listener { void onBeforeSessionStart(PeerInfo peerInfo); diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java index 6d40b58d53d4..bfd099f60d51 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java @@ -19,6 +19,7 @@ import com.google.bigtable.v2.SessionClientConfiguration.ChannelPoolConfiguration; import com.google.bigtable.v2.SessionRequest; import com.google.bigtable.v2.SessionResponse; +import com.google.common.util.concurrent.MoreExecutors; import io.grpc.CallOptions; import io.grpc.ManagedChannel; import io.grpc.MethodDescriptor; @@ -45,7 +46,10 @@ public void close() { @Override public SessionStream newStream( MethodDescriptor desc, CallOptions callOptions) { - return new SessionStreamImpl(channel.newCall(desc, callOptions)); + // DirectExecutor: gRPC/Netty delivers SessionStream.Listener callbacks directly on the + // I/O thread. All work must be fast and non-blocking; blocking work goes to sessionSyncContext. + return new SessionStreamImpl( + channel.newCall(desc, callOptions.withExecutor(MoreExecutors.directExecutor()))); } @Override From ea69618b2b6762df4f9cc04370583b5987a862b6 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Thu, 28 May 2026 02:45:48 +0000 Subject: [PATCH 13/24] chore: route PendingVRpc per-op state through the op executor PendingVRpc.cancel and drainTo move isCancelled/realCall onto ctx.getExecutor(); the pool lock now covers only queue / poolState / session list. close() switches to cancelWithResult to honor the new op-executor contract. NOOP_CALL sentinel removed. --- .../v2/internal/session/SessionPoolImpl.java | 165 ++++++++---------- 1 file changed, 71 insertions(+), 94 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index 67c2ac505358..bd1fac364b5e 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -236,6 +236,33 @@ public SessionPoolImpl( }); } + @GuardedBy("this") + private void scheduleNextAfePrune() { + if (closed) { + return; + } + afeListPruneTimeout = + timer.newTimeout( + this::runAfePruneAndReschedule, + SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), + TimeUnit.MILLISECONDS); + } + + private void runAfePruneAndReschedule() { + synchronized (SessionPoolImpl.this) { + try { + if (closed) { + return; + } + sessions.prune(); + } catch (Throwable t) { + logger.log(Level.WARNING, "AFE prune tick threw; continuing", t); + } finally { + scheduleNextAfePrune(); + } + } + } + @Override public SessionPoolInfo getInfo() { return info; @@ -245,6 +272,7 @@ public SessionPoolInfo getInfo() { public void close(CloseSessionRequest req) { configListenerHandle.close(); + List> toCancel; synchronized (this) { if (poolState == PoolState.CLOSED) { logger.fine(String.format("Tried to close a closed SessionPool %s", info.getLogName())); @@ -255,9 +283,8 @@ public void close(CloseSessionRequest req) { poolState = PoolState.CLOSED; closed = true; - for (PendingVRpc pendingRpc : pendingRpcs) { - pendingRpc.cancel("SessionPool closed: " + req, null); - } + toCancel = new ArrayList<>(pendingRpcs); + pendingRpcs.clear(); if (afeListPruneTimeout != null) { afeListPruneTimeout.cancel(); afeListPruneTimeout = null; @@ -270,36 +297,14 @@ public void close(CloseSessionRequest req) { sessions.close(req); } - // Timer is owned by the Client and shared across pools; do not stop it here. - } - - // Self-rescheduling AFE prune. Pattern matches Watchdog: tick dispatches to the pool executor, - // executor takes the lock, prunes, schedules the next tick. Tolerates body exceptions so a - // transient fault does not permanently disable pruning. - @GuardedBy("this") - private void scheduleNextAfePrune() { - if (closed) { - return; - } - afeListPruneTimeout = - timer.newTimeout( - this::runAfePruneAndReschedule, - SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), - TimeUnit.MILLISECONDS); - } - - private void runAfePruneAndReschedule() { - synchronized (SessionPoolImpl.this) { - try { - if (closed) { - return; - } - sessions.prune(); - } catch (Throwable t) { - logger.log(Level.WARNING, "AFE prune tick threw; continuing", t); - } finally { - scheduleNextAfePrune(); - } + // cancelWithResult trampolines through ctx.getExecutor() — required because the public + // cancel(String, Throwable) path asserts opExecutor affinity, but close() runs on the + // caller thread. + VRpcResult closeResult = + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription("SessionPool closed: " + req)); + for (PendingVRpc pendingRpc : toCancel) { + pendingRpc.cancelWithResult(closeResult); } } @@ -542,16 +547,7 @@ private void onSessionClose( status, trailers))); for (PendingVRpc vrpc : toBeClosed) { try { - vrpc.ctx - .getExecutor() - .execute( - () -> { - try { - vrpc.getListener().onClose(result); - } catch (Throwable t) { - logger.log(Level.WARNING, "Exception when closing request", t); - } - }); + vrpc.cancelWithResult(result); } catch (Throwable t) { logger.log(Level.WARNING, "Exception dispatching close to op executor", t); } @@ -562,10 +558,6 @@ private void onSessionClose( @GuardedBy("this") private void tryDrainPendingRpcs() { while (!pendingRpcs.isEmpty()) { - if (pendingRpcs.peek().isCancelled) { - pendingRpcs.pop(); - continue; - } Optional handle = picker.pickSession(); if (!handle.isPresent()) { break; @@ -581,11 +573,8 @@ private void tryDrainPendingRpcs() { Iterator> iter = pendingRpcs.iterator(); while (iter.hasNext()) { PendingVRpc vrpc = iter.next(); - // vrpcs that have started on a session gets closed in SessionImpl. Do not double close. - if (!vrpc.isCancelled && vrpc.realCall == null) { - iter.remove(); - toBeClosed.add(vrpc); - } + iter.remove(); + toBeClosed.add(vrpc); } return toBeClosed; } @@ -606,7 +595,6 @@ public synchronized VRpc(desc); } - @GuardedBy("this") private VRpc newRealCall( VRpcDescriptor desc, SessionHandle handle) { @@ -697,9 +685,6 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { } } - // It's safe to call cancel on a vrpc more than once. It'll be a noop after the initial - // call. Cancelled vrpcs are removed from the pending vrpc queue the next time we - // drain the queue. @Override public void cancel(@Nullable String message, @Nullable Throwable cause) { Status status = Status.CANCELLED; @@ -712,31 +697,31 @@ public void cancel(@Nullable String message, @Nullable Throwable cause) { cancel(status, false); } - // Cancel could race with drainTo which sets the real call. Assign realCall to a NOOP_CALL - // so if drainTo gets called at the same time, it'll just get swallowed and we're only calling - // onClose once on the listener. The cancel could also come from deadline monitor when - // the deadline expires. In this case if the real call is already set, we want to real call - // to handle the deadline and return early. + // cancel() and drainTo() are sequenced via ctx.getExecutor() (a per-op SerializingExecutor), + // so isCancelled and realCall are owned exclusively by that executor — no pool lock needed. private void cancel(Status status, boolean onlyCancelPendingCall) { - boolean delegateToRealCall = true; synchronized (SessionPoolImpl.this) { - if (isCancelled) { - return; - } + pendingRpcs.remove(this); // eager removal; no-op if already drained + } + ctx.getExecutor().execute(() -> { + if (isCancelled) return; isCancelled = true; - if (realCall == null) { - this.realCall = NOOP_CALL; - delegateToRealCall = false; - } else if (onlyCancelPendingCall) { - return; + if (realCall != null) { + if (!onlyCancelPendingCall) { + realCall.cancel(status.getDescription(), status.getCause()); + } + } else { + listener.onClose(VRpcResult.createRejectedError(status)); } - } - if (delegateToRealCall) { - realCall.cancel(status.getDescription(), status.getCause()); - } else { - VRpcResult result = VRpcResult.createRejectedError(status); - ctx.getExecutor().execute(() -> listener.onClose(result)); - } + }); + } + + void cancelWithResult(VRpcResult result) { + ctx.getExecutor().execute(() -> { + if (isCancelled) return; + isCancelled = true; + listener.onClose(result); + }); } @Override @@ -749,15 +734,18 @@ public void requestNext() { } private void drainTo(SessionHandle handle) { - synchronized (SessionPoolImpl.this) { - if (realCall == null) { - this.realCall = newRealCall(desc, handle); - } - } - this.realCall.start(req, ctx, listener); if (deadlineMonitor != null) { deadlineMonitor.cancel(); } + ctx.getExecutor().execute(() -> { + if (isCancelled) { + SessionPoolImpl.this.onVRpcComplete( + handle, Duration.ZERO, VRpcResult.createRejectedError(Status.CANCELLED)); + return; + } + realCall = newRealCall(desc, handle); + realCall.start(req, ctx, listener); + }); } private VRpcListener getListener() { @@ -910,15 +898,4 @@ public void close() { } } - private static final VRpc NOOP_CALL = - new VRpc() { - @Override - public void start(Object req, VRpcCallContext ctx, VRpcListener listener) {} - - @Override - public void cancel(@Nullable String message, @Nullable Throwable cause) {} - - @Override - public void requestNext() {} - }; } From a17cc1e56b87f1b0c0e19cf3133971d75eb9c3d5 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sun, 14 Jun 2026 17:12:08 +0000 Subject: [PATCH 14/24] chore: consolidate cancel trampolines at VOperationImpl VOperationImpl captures opExecutor in start() and trampolines start/cancel via it. RetryingVRpc.start runs synchronously on the op-executor task RetryingVRpc.cancel no longer wraps in execute. Tracer.onOperationStart reordered before started=true (a throwing tracer short-circuits to direct listener.onClose). listener.onMessage failures classify as USER_FAILURE. CleanupListener tracks a closed flag to prevent gRPC-context listener leaks on synchronous chain close. --- .../v2/internal/middleware/RetryingVRpc.java | 137 +++++++++++------- .../internal/middleware/VOperationImpl.java | 38 +++-- 2 files changed, 112 insertions(+), 63 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java index ffaafc4e180f..5c2f389198cd 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java @@ -47,11 +47,11 @@ public class RetryingVRpc implements VRpc { private final BigtableTimer timer; - // current state and all the flags don't need to be volatile because they're only updated within - // the op executor. + // All mutable state is owned by the op executor; VOperationImpl trampolines every inbound call + // onto it, so no synchronization is needed here. private State currentState; private boolean started; - // Breaks the loop if uncaught exception happens during op-executor execution. + // Breaks the loop on uncaught exception during cancel. private boolean isCancelling; public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { @@ -69,64 +69,80 @@ public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { @Override public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { - ctx.getExecutor() - .execute( - () -> { - if (started) { - listener.onClose( - VRpcResult.createRejectedError( - Status.FAILED_PRECONDITION.withDescription( - "operation is already started"))); - return; - } - started = true; + if (started) { + listener.onClose( + VRpcResult.createRejectedError( + Status.FAILED_PRECONDITION.withDescription("operation is already started"))); + return; + } - this.request = req; - this.listener = listener; - this.context = ctx; - this.tracer = context.getTracer(); + // Publish the fields BEFORE the try block. If anything below throws and we recover via + // cancel(), cancel() reads this.context / this.listener — they must be set already, or + // we trade the original failure for an NPE inside the recovery path. + this.request = req; + this.listener = listener; + this.context = ctx; + this.tracer = context.getTracer(); + + // tracer.onOperationStart runs BEFORE started=true so a tracer failure short-circuits to a + // direct listener.onClose without entering the state machine. If started=true were set first, + // a tracer throw would route through cancel→Done.onStart, which would then NPE in its own + // finally on tracer.onOperationFinish/recordApplicationBlockingLatencies, swallowing the + // original cause and surprising the caller with the secondary NPE. + try { + tracer.onOperationStart(); + } catch (Throwable t) { + listener.onClose( + VRpcResult.createRejectedError( + Status.INTERNAL.withDescription("tracer.onOperationStart failed").withCause(t))); + return; + } + started = true; - tracer.onOperationStart(); - currentState.onStart(); - }); + try { + currentState.onStart(); + } catch (Throwable t) { + cancel("Unexpected error in start", t); + } } @Override public void cancel(@Nullable String message, @Nullable Throwable cause) { - context - .getExecutor() - .execute( - () -> { - if (currentState.isDone() || isCancelling) { - LOG.fine("Ignoring cancel because the vRPC is already cancelled or done."); - return; - } - // Prevents infinite loop if there's any error thrown during this phase. - isCancelling = true; - Throwable finalCause = cause; - try { - currentState.onCancel(message, cause); - } catch (Throwable t) { - if (finalCause != null) { - finalCause.addSuppressed(t); - } else { - finalCause = t; - } - } - onStateChange( - new Done( - VRpcResult.createRejectedError( - Status.CANCELLED.withDescription(message).withCause(finalCause)))); - }); + if (currentState.isDone() || isCancelling) { + LOG.fine("Ignoring cancel because the vRPC is already cancelled or done."); + return; + } + // Prevents infinite loop if there's any error thrown during this phase. + isCancelling = true; + Throwable finalCause = cause; + try { + currentState.onCancel(message, cause); + } catch (Throwable t) { + if (finalCause != null) { + finalCause.addSuppressed(t); + } else { + finalCause = t; + } + } + onStateChange( + new Done( + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription(message).withCause(finalCause)))); } @Override public void requestNext() { + // Assert the op-executor affinity even though the body is dead today — when streaming lands + // and this becomes real, the missing assertion would silently allow off-thread access. + // Guarded on context being set so a misuse before start() still throws UnsupportedOperationException + // rather than NPE on the assertion. + if (context != null) { + context.getExecutor().throwIfNotInThisExecutor(); + } throw new UnsupportedOperationException("request next is not supported in unary"); } void onStateChange(State state) { - context.getExecutor().throwIfNotInThisExecutor(); if (currentState.isDone()) { return; } @@ -169,10 +185,9 @@ public void onStart() { request, context, new VRpcListener() { - // VRpcImpl dispatches its callbacks via ctx.getExecutor() already, so these methods - // run inside the op-executor task — no need to re-dispatch here. @Override public void onMessage(RespT msg) { + context.getExecutor().throwIfNotInThisExecutor(); if (currentState != Active.this) { LOG.log( Level.FINE, @@ -182,15 +197,30 @@ public void onMessage(RespT msg) { } tracer.onResponseReceived(); Stopwatch appTimer = Stopwatch.createStarted(); + Throwable userThrow = null; try { listener.onMessage(msg); + } catch (Throwable t) { + userThrow = t; } finally { tracer.recordApplicationBlockingLatencies(appTimer.elapsed()); } + if (userThrow != null) { + // Classify as USER_FAILURE (not CANCELLED, which is what the OpExecutor uncaught + // handler would produce via chain.cancel). Finish tracing for the in-flight + // attempt, cancel the underlying gRPC call so no further events arrive (its later + // onClose is dropped by the currentState != Active.this guard), and transition + // directly to Done with the user-error result. + VRpcResult userResult = VRpcResult.createUserError(userThrow); + tracer.onAttemptFinish(userResult); + attempt.cancel("User callback threw", userThrow); + onStateChange(new Done(userResult)); + } } @Override public void onClose(VRpcResult result) { + context.getExecutor().throwIfNotInThisExecutor(); tracer.onAttemptFinish(result); if (currentState != Active.this) { LOG.log( @@ -214,6 +244,7 @@ public void onClose(VRpcResult result) { } return; } + onStateChange(new Done(result)); } }); @@ -271,8 +302,6 @@ public void onStart() { try { // Wraps go innermost so the captured gRPC + OpenTelemetry contexts are re-established at // the moment the body runs, not just while the dispatcher is invoking the outer task. - // The executor may queue the inner runnable for a later drain on a different thread; an - // outer wrap's scope would already be closed by then. future = timer.newTimeout( () -> @@ -299,9 +328,9 @@ public void onStart() { @Override public void onCancel(String reason, Throwable throwable) { - // future can be null if newTimeout throws an exception. In which case sync context uncaught - // exception handler will be called, which calls cancel on the current state before - // transition into done state. + // future can be null if schedule throws an exception that's not RejectedExecutionException. + // In which case sync context uncaught exception handler will be called, which calls cancel on + // the current state before transition into done state. if (future != null && !future.isCancelled()) { future.cancel(); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java index 8e8a1554bea8..b7a89fa65917 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -30,8 +30,9 @@ import javax.annotation.Nullable; /** - * The single edge between the user and the VRpc middleware chain. Constructs the per-op {@link - * VRpcCallContext} and owns the gRPC {@link Context} cancellation listener. + * The single edge between the user and the VRpc middleware chain. Trampolines all inbound user + * calls onto opExecutor and owns the gRPC {@link Context} cancellation listener so that every + * layer below is single-threaded on opExecutor. * *

Precondition: {@link #cancel} must not be called before {@link #start}. */ @@ -45,6 +46,10 @@ public class VOperationImpl implements VOperation { private final boolean idempotent; private final Context.CancellationListener cancellationListener; + // Written in start() on the caller thread before the listener is registered and before cancel() + // is reachable from any external thread. Volatile for safe publication to those threads. + private volatile OpExecutor opExecutor; + public VOperationImpl( VRpc chain, Context grpcContext, @@ -63,7 +68,7 @@ public VOperationImpl( boolean deadlineExceeded = Optional.ofNullable(c.getDeadline()).map(Deadline::isExpired).orElse(false); deadlineExceeded = deadlineExceeded && c.cancellationCause() instanceof TimeoutException; - // Let VRpc machinery handle deadline exceeded. + // Let VRpc machinery handle deadline exceeded if (!deadlineExceeded) { cancel("gRPC context cancelled", c.cancellationCause()); } @@ -72,27 +77,41 @@ public VOperationImpl( @Override public void start(ReqT req, VRpcListener listener) { - // Per-call SerializingExecutor over the shared user-callback pool. The handler is the - // last-resort recovery: if any op-executor task throws (typically a user-installed tracer or - // a listener callback escape), drive the chain to a terminal state so the caller's listener - // still receives an onClose. RetryingVRpc.cancel is idempotent so cascades collapse safely. + // Last-resort recovery: if any op-executor task throws (typically a user-installed tracer, + // or a listener callback that escapes RetryingVRpc's existing per-state try/catches), drive + // the chain to a terminal state so the caller's listener still receives an onClose. The + // handler runs on the failed task's wrapper, so chain.cancel() — which calls + // OpExecutor#throwIfNotInThisExecutor — passes. RetryingVRpc.cancel is idempotent + // (isCancelling / currentState.isDone() guards), so a cascade of failures collapses to a + // single Done. OpExecutor exec = new OpExecutor( MoreExecutors.newSequentialExecutor(userCallbackExecutor), t -> chain.cancel("Uncaught exception in op executor task", t)); + this.opExecutor = exec; VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer, exec); + CleanupListener wrapped = + new CleanupListener<>(listener, grpcContext, cancellationListener); + // Register the gRPC context listener BEFORE submitting chain.start. The submit queues the + // task on the op executor; chain.cancel from this listener also queues. SequentialExecutor + // preserves submission order, so a context-cancel fired during/before chain.start will be + // processed after it. grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); - chain.start(req, ctx, new CleanupListener<>(listener, grpcContext, cancellationListener)); + exec.execute(() -> chain.start(req, ctx, wrapped)); } @Override public void cancel(@Nullable String message, @Nullable Throwable cause) { - chain.cancel(message, cause); + opExecutor.execute(() -> chain.cancel(message, cause)); } private static class CleanupListener extends ForwardListener { private final Context grpcContext; private final Context.CancellationListener cancellationListener; + // Read by VOperationImpl.start on the caller thread after runInline returns. runInline runs + // chain.start synchronously, so any sync onClose has completed (and this flag been set) by + // the time start() reads it on the same thread — no synchronization needed. + volatile boolean closed = false; CleanupListener( VRpcListener delegate, @@ -105,6 +124,7 @@ private static class CleanupListener extends ForwardListener { @Override public void onClose(VRpcResult result) { + closed = true; grpcContext.removeListener(cancellationListener); super.onClose(result); } From fc26c64220aeed8d65cecedd94008f108c862f26 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Sun, 14 Jun 2026 17:17:51 +0000 Subject: [PATCH 15/24] chore: replace OpExecutor backing with an inline-capable queue; tighten Client.close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpExecutor switches from a SequentialExecutor backing to an internal ArrayDeque + drain loop, and gains runInline() — runs the task synchronously on the caller thread when the executor is idle, otherwise queues it. VOperationImpl uses runInline for chain.start so the start dispatch skips the queue+drain round-trip. Drain rejections (e.g. during shutdown) reset drainScheduled so subsequent submissions can retry. Client.close drains userCallbackExecutor first via a 5s-bounded shutdownAndAwait so pool.close's cancelWithResult onClose notifications complete before the executor is torn down. Resource.close becomes idempotent. --- .../bigtable/data/v2/internal/api/Client.java | 33 ++++- .../v2/internal/middleware/OpExecutor.java | 114 +++++++++++++++--- .../internal/middleware/VOperationImpl.java | 22 ++-- 3 files changed, 138 insertions(+), 31 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index 0d63a07e866e..56d517f021a7 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -171,7 +171,7 @@ public static Client create(ClientSettings settings) throws IOException { Resource.createOwned(metrics, metrics::close), Resource.createOwned(configManager, configManager::close), Resource.createOwned(backgroundExecutor, backgroundExecutor::shutdown), - Resource.createOwned(userCallbackExecutor, userCallbackExecutor::shutdown)); + Resource.createOwned(userCallbackExecutor, () -> shutdownAndAwait(userCallbackExecutor))); } public Client( @@ -225,13 +225,17 @@ public void close() { .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) .setDescription("Client closing") .build())); + // Drain user-callback first so pool.close's cancelWithResult listener notifications complete + // before we tear down the surrounding executors and timer. Without this, the late onClose + // submissions race the shutdown and get RejectedExecutionException, silently dropping the + // user's terminal onClose. + userCallbackExecutor.close(); metrics.close(); channelPool.close(); configManager.close(); // Stop the timer before tearing down backgroundExecutor (the timer's dispatcher). sessionTimer.stop(); backgroundExecutor.close(); - userCallbackExecutor.close(); } public TableAsync openTableAsync(String tableId, Permission permission) { @@ -289,8 +293,10 @@ public MaterializedViewAsync openMaterializedViewAsync( } public static class Resource { - private T value; - private Runnable closer; + private final T value; + private final Runnable closer; + private final java.util.concurrent.atomic.AtomicBoolean closed = + new java.util.concurrent.atomic.AtomicBoolean(false); public static Resource createOwned(T value, Runnable closer) { return new Resource<>(value, closer); @@ -305,12 +311,29 @@ private Resource(T value, Runnable closer) { this.closer = closer; } + /** Idempotent. Repeat calls are no-ops. */ public void close() { - this.closer.run(); + if (closed.compareAndSet(false, true)) { + this.closer.run(); + } } public T get() { return value; } } + + // Drain in-flight listener.onClose tasks before the executor is shut down; bound the wait at 5s + // so close() doesn't hang the caller on a pathological listener. + private static void shutdownAndAwait(ExecutorService exec) { + exec.shutdown(); + try { + if (!exec.awaitTermination(5, TimeUnit.SECONDS)) { + exec.shutdownNow(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + exec.shutdownNow(); + } + } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java index 7622647bcc38..5a3b3e0b9750 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java @@ -16,17 +16,26 @@ package com.google.cloud.bigtable.data.v2.internal.middleware; +import com.google.common.util.concurrent.MoreExecutors; +import java.util.ArrayDeque; import java.util.concurrent.Executor; /** - * Per-op serializing executor. Wraps a delegate {@link Executor} (typically a per-call {@code - * SerializingExecutor} over the user-callback pool) and tracks which thread is currently running a - * task, so callers can assert they are on the executor (analogous to {@code + * Per-op serializing executor that tracks which thread is currently draining it, so callers can + * assert they are running inside the executor (analogous to {@code * SynchronizationContext#throwIfNotInThisSynchronizationContext}). * - *

If a task throws, the registered {@link UncaughtExceptionHandler} is invoked — this is the - * last-resort recovery point for the chain. Without it, a throw from a user-installed tracer or a - * listener callback would silently drop and the caller's future would never complete. + *

{@link #runInline} executes synchronously on the caller thread when the executor is idle, or + * falls back to a queued submission when busy. Use it to avoid the queue+drain round-trip for the + * first task of a fresh op executor (e.g. the start() dispatch). + * + *

If a task throws, the registered {@link UncaughtExceptionHandler} is invoked on the same task + * thread (still inside the drain wrapper, so {@link #throwIfNotInThisExecutor} still passes). This + * is the last-resort recovery point for the op chain — without it, an exception inside a callback + * is silently dropped and the caller's listener never sees a terminal close. The handler's own + * throws propagate (caught by the backing executor in production; surfaced to the calling thread + * when the backing is {@link MoreExecutors#directExecutor()}, which is what makes fail-fast test + * handlers like {@code t -> throw new AssertionError(t)} work). */ public final class OpExecutor implements Executor { @@ -36,6 +45,11 @@ public interface UncaughtExceptionHandler { private final Executor backing; private final UncaughtExceptionHandler handler; + + // Guards queue and drainScheduled. runningThread is volatile so throwIfNotInThisExecutor() can + // read it lock-free; writes happen inside the lock to piggy-back the memory barrier. + private final ArrayDeque queue = new ArrayDeque<>(); + private boolean drainScheduled = false; private volatile Thread runningThread; public OpExecutor(Executor backing, UncaughtExceptionHandler handler) { @@ -45,18 +59,82 @@ public OpExecutor(Executor backing, UncaughtExceptionHandler handler) { @Override public void execute(Runnable r) { - backing.execute( - () -> { - Thread prev = runningThread; - runningThread = Thread.currentThread(); - try { - r.run(); - } catch (Throwable t) { - handler.uncaught(t); - } finally { - runningThread = prev; - } - }); + synchronized (queue) { + queue.add(r); + if (!drainScheduled && runningThread == null) { + scheduleDrainLocked(); + } + } + } + + /** + * Runs {@code r} synchronously on the caller thread if this executor is idle, otherwise queues + * it for later drain on the backing executor. Either way, FIFO ordering with other tasks is + * preserved. + */ + public void runInline(Runnable r) { + synchronized (queue) { + if (drainScheduled || runningThread != null || !queue.isEmpty()) { + queue.add(r); + if (!drainScheduled) { + scheduleDrainLocked(); + } + return; + } + runningThread = Thread.currentThread(); + } + try { + try { + r.run(); + } catch (Throwable t) { + handler.uncaught(t); + } + } finally { + synchronized (queue) { + runningThread = null; + if (!queue.isEmpty() && !drainScheduled) { + scheduleDrainLocked(); + } + } + } + } + + // Schedule a drain on the backing executor. If the backing throws (e.g. RejectedExecutionException + // during shutdown), reset drainScheduled before propagating so the next execute() can retry + // instead of wedging the executor with no drainer. + private void scheduleDrainLocked() { + drainScheduled = true; + try { + backing.execute(this::drain); + } catch (Throwable t) { + drainScheduled = false; + throw t; + } + } + + private void drain() { + while (true) { + Runnable r; + synchronized (queue) { + r = queue.poll(); + if (r == null) { + drainScheduled = false; + return; + } + runningThread = Thread.currentThread(); + } + try { + try { + r.run(); + } catch (Throwable t) { + handler.uncaught(t); + } + } finally { + synchronized (queue) { + runningThread = null; + } + } + } } public void throwIfNotInThisExecutor() { diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java index b7a89fa65917..86556958f4da 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -86,18 +86,24 @@ public void start(ReqT req, VRpcListener listener) { // single Done. OpExecutor exec = new OpExecutor( - MoreExecutors.newSequentialExecutor(userCallbackExecutor), - t -> chain.cancel("Uncaught exception in op executor task", t)); + userCallbackExecutor, t -> chain.cancel("Uncaught exception in op executor task", t)); this.opExecutor = exec; VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer, exec); CleanupListener wrapped = new CleanupListener<>(listener, grpcContext, cancellationListener); - // Register the gRPC context listener BEFORE submitting chain.start. The submit queues the - // task on the op executor; chain.cancel from this listener also queues. SequentialExecutor - // preserves submission order, so a context-cancel fired during/before chain.start will be - // processed after it. - grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); - exec.execute(() -> chain.start(req, ctx, wrapped)); + exec.runInline(() -> chain.start(req, ctx, wrapped)); + // Register AFTER chain.start so a context-cancel that fires immediately is sequenced behind + // start. runInline runs chain.start synchronously, so it has fully completed by the time the + // listener is registered. Matches ClientCallImpl's ordering (grpc-java issue #1343). + // + // If the chain reached a terminal onClose synchronously inside runInline (uncaught-handler + // recovery, immediate failure), CleanupListener already tried to remove a listener that was + // never registered (no-op). Skip the addListener in that case — otherwise we'd register a + // listener on grpcContext that nothing will ever remove, pinning the entire chain for the + // lifetime of the (potentially long-lived) gRPC Context. + if (!wrapped.closed) { + grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); + } } @Override From 520a2be94b5028f9acad4cc065724a8a7e980789 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Wed, 27 May 2026 23:51:04 +0000 Subject: [PATCH 16/24] test: add 30-second @Timeout to all session/pool integration tests Tests that call Future.get() on vRpc chains can hang indefinitely if an exception breaks the callback dispatch chain and orphans the future (see ISSUE-006). A class-level JUnit 5 @Timeout(30) converts a silent hang into a clear test failure within a bounded time. Applied to: RetryingVRpcTest, VRpcTracerTest, ClientTest, TableBaseTest, SessionImplTest, SessionPoolImplTest. --- .../google/cloud/bigtable/data/v2/internal/api/ClientTest.java | 2 ++ .../cloud/bigtable/data/v2/internal/api/TableBaseTest.java | 2 ++ .../bigtable/data/v2/internal/session/SessionImplTest.java | 2 ++ .../bigtable/data/v2/internal/session/SessionPoolImplTest.java | 2 ++ 4 files changed, 8 insertions(+) diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java index 641a63f0a573..e27f20c809d1 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java @@ -55,7 +55,9 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +@Timeout(30) public class ClientTest { private ClientConfiguration defaultConfig; diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java index 19a7bbed93bb..1f24d3d156f4 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java @@ -39,10 +39,12 @@ import javax.annotation.Nullable; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +@Timeout(30) @ExtendWith(MockitoExtension.class) public class TableBaseTest { diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java index b7e12e057818..f78e210ea35c 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java @@ -77,11 +77,13 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Answers; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +@Timeout(30) @ExtendWith(MockitoExtension.class) public class SessionImplTest { private ScheduledExecutorService executor; diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java index ceb84d4b0e59..04c6de88d216 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java @@ -91,6 +91,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Answers; import org.mockito.ArgumentCaptor; @@ -98,6 +99,7 @@ import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +@Timeout(30) @Nested @ExtendWith(MockitoExtension.class) public class SessionPoolImplTest { From 2f2a111fd22bd2666f74f0db8a553f154fcb5463 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 17:23:13 +0000 Subject: [PATCH 17/24] fix: arm PendingVRpc deadline monitor only after committing to queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PendingVRpc.start scheduled the deadline timer before the synchronized pool-state check, so the pool-closed early-return path returned without cancelling it. The timer fired later and called listener.onClose a second time with DEADLINE_EXCEEDED. RetryingVRpc.Active suppressed the duplicate at the user-facing layer via its currentState guard, but tracer.onAttemptFinish still ran a second time for the already-finished attempt — corrupting per- attempt metrics. Move the monitorDeadline call inside the synchronized block, after the pool-state check, so closed pools take the fast-fail path without scheduling. Adds SessionPoolImplTest#pendingVRpcOnClosedPoolDoesNotLeakDeadlineMonitor; verified the test fails against the pre-fix code. --- .../v2/internal/session/SessionPoolImpl.java | 5 ++- .../internal/session/SessionPoolImplTest.java | 43 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index bd1fac364b5e..103abe6b2ec5 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -655,7 +655,6 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { this.req = req; this.ctx = ctx; this.listener = listener; - this.deadlineMonitor = monitorDeadline(ctx.getOperationInfo().getDeadline()); synchronized (SessionPoolImpl.this) { if (SessionPoolImpl.this.poolState != PoolState.STARTED) { @@ -666,6 +665,10 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { ctx.getExecutor().execute(() -> listener.onClose(result)); return; } + // Only arm the deadline monitor after we've committed to queueing; otherwise the + // fast-fail early return above would leak a timer that fires later and emits a phantom + // tracer.onAttemptFinish on the Active state's stale listener. + this.deadlineMonitor = monitorDeadline(ctx.getOperationInfo().getDeadline()); pendingRpcs.add(this); if (logger.isLoggable(Level.FINE)) { diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java index 04c6de88d216..82cbc979dce4 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java @@ -82,6 +82,7 @@ import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; @@ -279,6 +280,48 @@ public void onClose(VRpcResult result) { } } + @Test + void pendingVRpcOnClosedPoolDoesNotLeakDeadlineMonitor() throws InterruptedException { + // Regression: PendingVRpc.start used to arm the deadline timer before the pool-state + // check, so the fast-fail "pool closed" branch leaked an armed timer that fired later + // and called listener.onClose a second time with DEADLINE_EXCEEDED. + sessionPool.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("close before issuing rpc") + .build()); + + CopyOnWriteArrayList closes = new CopyOnWriteArrayList<>(); + CountDownLatch firstClose = new CountDownLatch(1); + Duration deadline = Duration.ofMillis(100); + + VRpc vrpc = + sessionPool.newCall(FakeDescriptor.SCRIPTED); + vrpc.start( + SessionFakeScriptedRequest.getDefaultInstance(), + VRpcCallContext.create( + Deadline.after(deadline.toMillis(), TimeUnit.MILLISECONDS), true, vrpcTracer), + new VRpc.VRpcListener() { + @Override + public void onMessage(SessionFakeScriptedResponse msg) {} + + @Override + public void onClose(VRpcResult result) { + closes.add(result); + firstClose.countDown(); + } + }); + + // The fast-fail UNAVAILABLE onClose should arrive immediately. + assertThat(firstClose.await(1, TimeUnit.SECONDS)).isTrue(); + assertThat(closes).hasSize(1); + + // Wait past the deadline. With the bug (leaked deadlineMonitor), a phantom + // onClose(DEADLINE_EXCEEDED) would arrive in this window. With the fix, no second close. + Thread.sleep(deadline.toMillis() * 5); + assertThat(closes).hasSize(1); + } + @Test void testCreateSessionDoesntPropagateDeadline() { DeadlineInterceptor deadlineInterceptor = new DeadlineInterceptor(); From bc7b4c355678d895707434b1c4d62dea837dca40 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 17:34:55 +0000 Subject: [PATCH 18/24] fix: synchronize Client.sessionPools access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sessionPools is Collections.newSetFromMap(new WeakHashMap<>()) — neither thread-safe nor synchronized. close() iterated it via forEach while openTableAsync / openAuthorizedViewAsync / openMaterializedViewAsync added to it concurrently, risking ConcurrentModificationException or a silently skipped just-added pool that close() believed it had closed. Wrap all four sites in synchronized(sessionPools) blocks. close() snapshots into an ArrayList under the monitor and iterates the copy outside, so the slow pool.close() calls don't block concurrent opens. --- .../bigtable/data/v2/internal/api/Client.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index 56d517f021a7..ac9703974c20 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -43,7 +43,9 @@ import io.opencensus.tags.Tags; import io.opentelemetry.sdk.OpenTelemetrySdk; import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Set; import java.util.WeakHashMap; import java.util.concurrent.ExecutorService; @@ -218,7 +220,11 @@ public Client( @Override public void close() { - sessionPools.forEach( + List> toClose; + synchronized (sessionPools) { + toClose = new ArrayList<>(sessionPools); + } + toClose.forEach( pool -> pool.close( CloseSessionRequest.newBuilder() @@ -251,7 +257,9 @@ public TableAsync openTableAsync(String tableId, Permission permission) { metrics.get(), sessionTimer, userCallbackExecutor.get()); - sessionPools.add(tableAsync.getSessionPool()); + synchronized (sessionPools) { + sessionPools.add(tableAsync.getSessionPool()); + } return tableAsync; } @@ -270,7 +278,9 @@ public AuthorizedViewAsync openAuthorizedViewAsync( metrics.get(), sessionTimer, userCallbackExecutor.get()); - sessionPools.add(viewAsync.getSessionPool()); + synchronized (sessionPools) { + sessionPools.add(viewAsync.getSessionPool()); + } return viewAsync; } @@ -288,7 +298,9 @@ public MaterializedViewAsync openMaterializedViewAsync( metrics.get(), sessionTimer, userCallbackExecutor.get()); - sessionPools.add(viewAsync.getSessionPool()); + synchronized (sessionPools) { + sessionPools.add(viewAsync.getSessionPool()); + } return viewAsync; } From bfa99cd0d9eb1bda653d4b77c340dda20bea9147 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 17:45:11 +0000 Subject: [PATCH 19/24] fix: synthesize fallback closeReason in notifyTerminalClose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit notifyTerminalClose dereferenced closeReason without a null guard. Today every caller sets it first (forceClose, startGracefulClose's null-synth, dispatchStreamClosed's else branch, abortFromUncaughtException), so the invariant holds — but it is fragile: a future writer of WAIT_SERVER_CLOSE that forgets to set closeReason would NPE here. A precondition checkState would be wrong: the function's whole purpose is to fan out terminal notifications behind per-notification try/catch so one failure does not suppress the rest. A throw before any notification runs escapes to the sessionSyncContext uncaught handler, which finds the state already CLOSED and early-returns — all cleanup silently skipped. Mirror the synthesizer pattern from startGracefulClose: log a warning with an IllegalStateException for stack-trace observability, then build a fallback CloseSessionRequest so the rest of the fan-out runs unchanged. --- .../data/v2/internal/session/SessionImpl.java | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 9771bf4ed749..b976eef652a2 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -750,14 +750,32 @@ private void dispatchStreamClosed(Status status, Metadata trailers) { * Fan out terminal notifications to the in-flight vRpc, tracer, and session listener with local * guards so a throw in one notification does not suppress the others. * - *

Caller contract: must have already transitioned to {@link SessionState#CLOSED}, captured and - * cleared {@code currentRpc}, and ensured {@code closeReason} is set (synthesizing if absent). + *

Caller contract: must have already transitioned to {@link SessionState#CLOSED} and captured + * and cleared {@code currentRpc}. Callers should also set {@code closeReason}; if missing we + * synthesize a fallback here rather than throw, since throwing from this fan-out aborts the + * remaining notifications and (because the state is already CLOSED) defeats the + * sessionSyncContext uncaught handler's cleanup. */ private void notifyTerminalClose( Status status, Metadata trailers, @Nullable VRpcImpl localRpc, SessionState prevState) { + // Should never happen — matches the synthesizer in startGracefulClose. + if (closeReason == null) { + debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_close_no_reason"); + logger.log( + Level.WARNING, + String.format( + "%s notifyTerminalClose reached without a closeReason; status=%s", + info.getLogName(), status), + new IllegalStateException("notifyTerminalClose without closeReason")); + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) + .setDescription("notifyTerminalClose reached without closeReason; status=" + status) + .build(); + } if (localRpc != null) { try { localRpc.handleSessionClose(VRpcResult.createRemoteTransportError(status, trailers)); From 305bd1e08ae5554f9745d13c3f241a28d51fbede Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 19:19:40 +0000 Subject: [PATCH 20/24] fix: drain SessionPools before tearing down userCallbackExecutor on Client.close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Client.close shut down userCallbackExecutor before draining the SessionPools that depend on it, so late listener.onClose tasks from in-flight RPCs arrived after backing was dead and got RejectedExecutionException — silently stranding the user's terminal callbacks. The earlier fix sprinkled inline-drain fallbacks inside OpExecutor; restructure shutdown instead so the race can't happen. SessionPool gains awaitTerminated(Duration), backed by a CompletableFuture SessionPoolImpl completes from onSessionClose once the pool is CLOSED and the last session has drained. close() no longer kills the watchdog — awaitTerminated takes ownership of that, so the watchdog stays alive during shutdown and can escalate any session stuck in WAIT_SERVER_CLOSE longer than its tick interval (5 min) via forceClose. Client.close becomes three explicit phases: (1) initiate graceful close on each pool, (2) awaitTerminated on each with a 6-minute per-pool budget (one full watchdog tick plus buffer), (3) tear down userCallbackExecutor / channelPool / timers in the existing order, now safely because all listener.onClose tasks are queued or drained before backing dies. Add a `closed` AtomicBoolean + checkNotClosed() guard on the three openers so concurrent opens during shutdown can't create pools the close path won't see. close() is now idempotent via CAS on that flag. Tests: - ClientTest#openAfterCloseThrows: openTable/View/MaterializedView all throw IllegalStateException after close - ClientTest#closeIsIdempotent: double close is a no-op - SessionPoolImplTest#awaitTerminatedReturnsTrueWhenPoolIsEmpty - SessionPoolImplTest#awaitTerminatedReturnsTrueAfterSessionsDrain - SessionPoolImplTest tearDown now calls awaitTerminated so the watchdog is closed before testTimer.stop races its self-reschedule - FakeSessionPool in TableBaseTest gains a no-op awaitTerminated stub Drive-by: remove the spurious @Nested annotation from SessionPoolImplTest's top-level class. @Nested is meaningful only on non-static inner classes; on the outer class it caused Surefire to mis-attribute test counts (outer tests reported under $RetrySessionCreation). Tests were executed correctly either way, just the reporting was wrong. Full module suite (2375 tests) passes in 3:15, unchanged from before. --- .../bigtable/data/v2/internal/api/Client.java | 74 ++++++++++++++++--- .../data/v2/internal/session/SessionPool.java | 7 ++ .../v2/internal/session/SessionPoolImpl.java | 36 ++++++++- .../data/v2/internal/api/ClientTest.java | 35 +++++++++ .../data/v2/internal/api/TableBaseTest.java | 5 ++ .../internal/session/SessionPoolImplTest.java | 43 ++++++++++- 6 files changed, 186 insertions(+), 14 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index ac9703974c20..310af0e66744 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -43,6 +43,7 @@ import io.opencensus.tags.Tags; import io.opentelemetry.sdk.OpenTelemetrySdk; import java.io.IOException; +import java.time.Duration; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -52,8 +53,18 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.logging.Level; +import java.util.logging.Logger; public class Client implements AutoCloseable { + private static final Logger logger = Logger.getLogger(Client.class.getName()); + + // Per-pool drain budget during close. One full watchdog tick (5 min) plus 1 min buffer; if a + // pool can't drain in that window, something is genuinely wrong on the server side and we give + // up on it so close() returns. The watchdog interval is what makes the worst case finite. + private static final Duration POOL_DRAIN_TIMEOUT = Duration.ofMinutes(6); + public static final FeatureFlags BASE_FEATURE_FLAGS = FeatureFlags.newBuilder() .setReverseScans(false) @@ -93,6 +104,10 @@ public class Client implements AutoCloseable { private final Resource configManager; private final Set> sessionPools = Collections.newSetFromMap(new WeakHashMap<>()); + // Set true at the start of close(); guards openTableAsync / openAuthorizedViewAsync / + // openMaterializedViewAsync so concurrent opens during shutdown don't create pools the close + // path won't see. + private final AtomicBoolean closed = new AtomicBoolean(false); public static Client create(ClientSettings settings) throws IOException { FeatureFlags featureFlags = @@ -220,21 +235,49 @@ public Client( @Override public void close() { + if (!closed.compareAndSet(false, true)) { + return; // idempotent + } + List> toClose; synchronized (sessionPools) { toClose = new ArrayList<>(sessionPools); } - toClose.forEach( - pool -> - pool.close( - CloseSessionRequest.newBuilder() - .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) - .setDescription("Client closing") - .build())); - // Drain user-callback first so pool.close's cancelWithResult listener notifications complete - // before we tear down the surrounding executors and timer. Without this, the late onClose - // submissions race the shutdown and get RejectedExecutionException, silently dropping the - // user's terminal onClose. + + CloseSessionRequest closeReq = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("Client closing") + .build(); + + // Phase 1: initiate graceful close on each pool. Returns immediately; sessions transition + // CLOSING → graceful CloseSessionRequest → WAIT_SERVER_CLOSE → CLOSED asynchronously. + toClose.forEach(p -> p.close(closeReq)); + + // Phase 2: wait for sessions to drain. The pool's watchdog stays alive during this wait and + // escalates anything stuck in WAIT_SERVER_CLOSE longer than its tick interval (5 min). Once + // a pool's last session reaches CLOSED, drainedFuture completes and awaitTerminated returns. + // Sequential: worst case is POOL_DRAIN_TIMEOUT * N pools, but the happy path drains in << 1s. + for (SessionPool pool : toClose) { + try { + if (!pool.awaitTerminated(POOL_DRAIN_TIMEOUT)) { + logger.warning( + "SessionPool did not drain within " + + POOL_DRAIN_TIMEOUT + + "; abandoning and continuing shutdown"); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.log(Level.WARNING, "Interrupted while draining SessionPool", e); + break; + } + } + + // Phase 3: tear down infrastructure. By this point all listener.onClose tasks for in-flight + // RPCs are queued on their op executors (which run on userCallbackExecutor), and no new + // session responses are coming since every session is CLOSED. The 5s await inside + // userCallbackExecutor.close() is therefore just a guard for tasks in flight — it should + // return immediately in the typical case. userCallbackExecutor.close(); metrics.close(); channelPool.close(); @@ -244,7 +287,14 @@ public void close() { backgroundExecutor.close(); } + private void checkNotClosed() { + if (closed.get()) { + throw new IllegalStateException("Client is closed"); + } + } + public TableAsync openTableAsync(String tableId, Permission permission) { + checkNotClosed(); TableAsync tableAsync = TableAsync.createAndStart( featureFlags, @@ -265,6 +315,7 @@ public TableAsync openTableAsync(String tableId, Permission permission) { public AuthorizedViewAsync openAuthorizedViewAsync( String tableId, String viewId, OpenAuthorizedViewRequest.Permission permission) { + checkNotClosed(); AuthorizedViewAsync viewAsync = AuthorizedViewAsync.createAndStart( featureFlags, @@ -286,6 +337,7 @@ public AuthorizedViewAsync openAuthorizedViewAsync( public MaterializedViewAsync openMaterializedViewAsync( String viewId, OpenMaterializedViewRequest.Permission permission) { + checkNotClosed(); MaterializedViewAsync viewAsync = MaterializedViewAsync.createAndStart( featureFlags, diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java index 0b8cd1cdaea6..9ff1d6ffe9f8 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java @@ -20,6 +20,7 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc; import com.google.protobuf.Message; import io.grpc.Metadata; +import java.time.Duration; public interface SessionPool { void start(OpenReqT openReq, Metadata md); @@ -29,6 +30,12 @@ VRpc newCall( void close(CloseSessionRequest req); + /** + * Blocks until all sessions in this pool have terminated, or the timeout elapses. Must be called + * after {@link #close} to be meaningful. Returns true if drained, false on timeout. + */ + boolean awaitTerminated(Duration timeout) throws InterruptedException; + SessionPoolInfo getInfo(); int getConsecutiveUnimplementedFailures(); diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index 103abe6b2ec5..cfab32770dd7 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -64,7 +64,10 @@ import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; @@ -133,6 +136,11 @@ private enum PoolState { @GuardedBy("this") private boolean closed = false; + // Completed when this pool has been close()d AND every session has reached the CLOSED terminal + // state. Drives Client.close()'s drain barrier so that listener.onClose tasks finish queueing + // onto userCallbackExecutor before that executor is shut down. + private final CompletableFuture drainedFuture = new CompletableFuture<>(); + @GuardedBy("this") private BigtableTimer.Timeout retryCreateSessionFuture = null; @@ -293,8 +301,13 @@ public void close(CloseSessionRequest req) { retryCreateSessionFuture.cancel(); retryCreateSessionFuture = null; } - watchdog.close(); + // Watchdog stays alive past close() so it can escalate any session that lingers in + // WAIT_SERVER_CLOSE during shutdown. awaitTerminated() takes ownership of closing it. sessions.close(req); + // If the pool had no sessions, drainedFuture would never be completed by onSessionClose. + if (sessions.getAllSessions().isEmpty()) { + drainedFuture.complete(null); + } } // cancelWithResult trampolines through ctx.getExecutor() — required because the public @@ -308,6 +321,23 @@ public void close(CloseSessionRequest req) { } } + @Override + public boolean awaitTerminated(Duration timeout) throws InterruptedException { + try { + drainedFuture.get(timeout.toNanos(), TimeUnit.NANOSECONDS); + return true; + } catch (TimeoutException e) { + return false; + } catch (ExecutionException e) { + // drainedFuture is only completed via .complete(null), never .completeExceptionally — + // a CancellationException would still be wrapped here. Treat as a bug. + throw new IllegalStateException("drainedFuture failed unexpectedly", e); + } finally { + // Close the watchdog on the way out — drained or timed out, its job is done. + watchdog.close(); + } + } + @Override public synchronized void start(OpenReqT openReq, Metadata md) { Preconditions.checkState(poolState == PoolState.NEW); @@ -504,6 +534,10 @@ private void onSessionClose( // If the pool is closed then there is nothing else to do // dont need to create a replacement session and pending vRpcs get cleaned up in close() if (poolState == PoolState.CLOSED) { + // Signal awaitTerminated() once the last session has drained. + if (sessions.getAllSessions().isEmpty()) { + drainedFuture.complete(null); + } return; } diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java index e27f20c809d1..eb2e0f78c8a0 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java @@ -107,6 +107,41 @@ void tearDown() { executor.shutdownNow(); } + @Test + public void openAfterCloseThrows() { + client.close(); + + IllegalStateException tableEx = + assertThrows( + IllegalStateException.class, + () -> client.openTableAsync("fake-table", OpenTableRequest.Permission.PERMISSION_READ)); + assertThat(tableEx).hasMessageThat().contains("closed"); + + IllegalStateException viewEx = + assertThrows( + IllegalStateException.class, + () -> + client.openAuthorizedViewAsync( + "fake-table", + "fake-view", + OpenAuthorizedViewRequest.Permission.PERMISSION_READ)); + assertThat(viewEx).hasMessageThat().contains("closed"); + + IllegalStateException mvEx = + assertThrows( + IllegalStateException.class, + () -> + client.openMaterializedViewAsync( + "fake-view", OpenMaterializedViewRequest.Permission.PERMISSION_READ)); + assertThat(mvEx).hasMessageThat().contains("closed"); + } + + @Test + public void closeIsIdempotent() { + client.close(); + client.close(); // must not throw or hang + } + @Test public void testRequestFails() { TableAsync table = diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java index 1f24d3d156f4..257b63549f4a 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java @@ -177,6 +177,11 @@ public void start(OpenTableRequest openReq, Metadata md) {} @Override public void close(CloseSessionRequest req) {} + @Override + public boolean awaitTerminated(java.time.Duration timeout) { + return true; + } + @Override public SessionPoolInfo getInfo() { return SessionPoolInfo.create(clientInfo, VRpcDescriptor.TABLE_SESSION, "fake-pool"); diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java index 82cbc979dce4..6dc2ead454f8 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java @@ -101,7 +101,6 @@ import org.mockito.junit.jupiter.MockitoExtension; @Timeout(30) -@Nested @ExtendWith(MockitoExtension.class) public class SessionPoolImplTest { private static final ClientInfo CLIENT_INFO = @@ -170,12 +169,15 @@ void setUp() throws IOException { } @AfterEach - void tearDown() { + void tearDown() throws InterruptedException { sessionPool.close( CloseSessionRequest.newBuilder() .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) .setDescription("close session") .build()); + // Wait for sessions to drain so the watchdog can be closed before testTimer.stop() races + // with its self-reschedule loop. + sessionPool.awaitTerminated(Duration.ofSeconds(10)); channelPool.close(); // channel gets shutdown in channelPool.close() server.shutdownNow(); @@ -280,6 +282,43 @@ public void onClose(VRpcResult result) { } } + @Test + void awaitTerminatedReturnsTrueWhenPoolIsEmpty() throws InterruptedException { + // A pool that was never started has no sessions; close() should complete drainedFuture + // immediately and awaitTerminated should return true without blocking. + sessionPool.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("empty pool") + .build()); + assertThat(sessionPool.awaitTerminated(Duration.ofMillis(100))).isTrue(); + } + + @Test + void awaitTerminatedReturnsTrueAfterSessionsDrain() + throws InterruptedException, ExecutionException, TimeoutException { + // Start a real session, issue + complete a vRPC so the session is fully open, then close + // the pool and verify awaitTerminated returns true (sessions cleanly drained). + sessionPool.start(OpenFakeSessionRequest.getDefaultInstance(), new Metadata()); + + VRpc vrpc = + sessionPool.newCall(FakeDescriptor.SCRIPTED); + UnaryResponseFuture resultFuture = new UnaryResponseFuture<>(); + vrpc.start( + SessionFakeScriptedRequest.getDefaultInstance(), + VRpcCallContext.create(Deadline.after(10, TimeUnit.SECONDS), true, vrpcTracer), + resultFuture); + resultFuture.get(10, TimeUnit.SECONDS); + + sessionPool.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("after drain") + .build()); + + assertThat(sessionPool.awaitTerminated(Duration.ofSeconds(10))).isTrue(); + } + @Test void pendingVRpcOnClosedPoolDoesNotLeakDeadlineMonitor() throws InterruptedException { // Regression: PendingVRpc.start used to arm the deadline timer before the pool-state From e1e6f6c3127da02663b4f9bfb112bcc052e80e4e Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 20:03:18 +0000 Subject: [PATCH 21/24] fix: serialize VOperationImpl addListener through op executor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VOperationImpl.start read wrapped.closed on the caller thread and then called grpcContext.addListener if false. If chain.start had asynchronously queued an onClose (PendingVRpc pool-closed fast-fail, VRpcImpl deadline-exceeded short-circuit), the backing-thread drain could land between the read and the addListener call — CleanupListener.onClose would set closed=true and call removeListener as a no-op pre-registration, and the caller would then register a listener with nothing to remove it. The leak is per-RPC, permanent until grpcContext cancels. For long-lived application contexts that pin chain → opExecutor → userCallbackExecutor, this accumulates indefinitely. Queue the addListener through exec.runInline so FIFO ordering with the op executor guarantees soundness: any onClose chain.start enqueued drains first, so wrapped.closed is accurate by the time we evaluate it. If grpcContext is cancelled between start() returning and the queued task running, addListener+directExecutor fires the cancellationListener immediately at registration time, so cancel still propagates correctly. runInline (not execute) is the right verb here: when chain.start enqueued nothing (common path) the executor is idle and the registration body runs inline on the caller thread — no extra context switch. When chain.start did enqueue, runInline takes the queue branch and FIFO drains both. Adds VOperationImplTest with two tests: - grpcContextCancelPropagatesToChain: normal-path sanity that addListener still wires cancellation through to chain.cancel - asyncOnCloseFromChainDoesNotPropagateLaterContextCancel: regression — with an async-queued onClose, the cancellationListener must not be registered, so a later grpcContext.cancel does not reach chain.cancel --- .../internal/middleware/VOperationImpl.java | 29 ++-- .../middleware/VOperationImplTest.java | 155 ++++++++++++++++++ 2 files changed, 174 insertions(+), 10 deletions(-) create mode 100644 java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java index 86556958f4da..880b65a6f56f 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -93,17 +93,26 @@ public void start(ReqT req, VRpcListener listener) { new CleanupListener<>(listener, grpcContext, cancellationListener); exec.runInline(() -> chain.start(req, ctx, wrapped)); // Register AFTER chain.start so a context-cancel that fires immediately is sequenced behind - // start. runInline runs chain.start synchronously, so it has fully completed by the time the - // listener is registered. Matches ClientCallImpl's ordering (grpc-java issue #1343). + // start. Matches ClientCallImpl's ordering (grpc-java issue #1343). // - // If the chain reached a terminal onClose synchronously inside runInline (uncaught-handler - // recovery, immediate failure), CleanupListener already tried to remove a listener that was - // never registered (no-op). Skip the addListener in that case — otherwise we'd register a - // listener on grpcContext that nothing will ever remove, pinning the entire chain for the - // lifetime of the (potentially long-lived) gRPC Context. - if (!wrapped.closed) { - grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); - } + // Queueing the registration onto the op executor is what makes the closed-check sound: any + // onClose that chain.start enqueued during runInline drains FIRST (FIFO), so by the time this + // task runs wrapped.closed reflects whether onClose has already fired. If it has, we skip + // addListener — otherwise the listener would pin the chain on grpcContext for its lifetime + // (CleanupListener.onClose already called removeListener as a no-op pre-registration). If + // grpcContext gets cancelled between start() returning and this task running, the + // directExecutor below fires the listener immediately on addListener, so cancel still + // propagates correctly. + // + // runInline is the right verb here: when chain.start enqueued nothing (common path), the + // executor is idle and the body runs inline on this thread — no extra context switch. When + // chain.start did enqueue an onClose, runInline takes the queue branch and FIFO drains both. + exec.runInline( + () -> { + if (!wrapped.closed) { + grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); + } + }); } @Override diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java new file mode 100644 index 000000000000..a69bed4c0462 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java @@ -0,0 +1,155 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import static com.google.common.truth.Truth.assertThat; + +import com.google.cloud.bigtable.data.v2.internal.csm.NoopMetrics; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcCallContext; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcResult; +import com.google.common.util.concurrent.MoreExecutors; +import io.grpc.Context; +import io.grpc.Deadline; +import io.grpc.Status; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; +import org.junit.jupiter.api.Test; + +class VOperationImplTest { + + @Test + void grpcContextCancelPropagatesToChain() throws InterruptedException { + // Normal-path sanity: addListener fires (chain.start doesn't queue an onClose), then a + // later grpcContext.cancel routes through cancellationListener -> opExecutor -> chain.cancel. + // This exercises the new runInline-based addListener path introduced to close the TOCTOU + // between wrapped.closed and addListener. + Context.CancellableContext grpcContext = Context.current().withCancellation(); + AtomicInteger chainCancelCount = new AtomicInteger(); + CountDownLatch cancelLatch = new CountDownLatch(1); + + VRpc chain = + new VRpc() { + @Override + public void start(String req, VRpcCallContext ctx, VRpcListener listener) { + // Hold open — do not call listener.onClose. Cancellation must drive termination. + } + + @Override + public void cancel(@Nullable String msg, @Nullable Throwable cause) { + chainCancelCount.incrementAndGet(); + cancelLatch.countDown(); + } + + @Override + public void requestNext() {} + }; + + VOperationImpl op = + new VOperationImpl<>( + chain, + grpcContext, + MoreExecutors.directExecutor(), + NoopMetrics.NoopVrpcTracer.INSTANCE, + Deadline.after(10, TimeUnit.SECONDS), + true); + + op.start( + "req", + new VRpcListener() { + @Override + public void onMessage(String msg) {} + + @Override + public void onClose(VRpcResult result) {} + }); + + grpcContext.cancel(Status.CANCELLED.withDescription("test").asException()); + + assertThat(cancelLatch.await(2, TimeUnit.SECONDS)).isTrue(); + assertThat(chainCancelCount.get()).isEqualTo(1); + } + + @Test + void asyncOnCloseFromChainDoesNotPropagateLaterContextCancel() throws InterruptedException { + // Regression for the wrapped.closed TOCTOU. When chain.start asynchronously queues an + // onClose via the op executor, the addListener task (also queued through the op executor + // by VOperationImpl.start) drains FIFO after the onClose and observes wrapped.closed=true, + // so the cancellationListener is NOT registered. A later grpcContext.cancel therefore has + // no path to reach chain.cancel — which is correct because the chain has already terminated. + Context.CancellableContext grpcContext = Context.current().withCancellation(); + AtomicInteger chainCancelCount = new AtomicInteger(); + AtomicReference userClose = new AtomicReference<>(); + CountDownLatch onCloseLatch = new CountDownLatch(1); + + VRpc chain = + new VRpc() { + @Override + public void start(String req, VRpcCallContext ctx, VRpcListener listener) { + // Simulate PendingVRpc pool-closed branch / VRpcImpl deadline short-circuit. + ctx.getExecutor() + .execute( + () -> + listener.onClose( + VRpcResult.createUncommitedError( + Status.UNAVAILABLE.withDescription("fast-fail")))); + } + + @Override + public void cancel(@Nullable String msg, @Nullable Throwable cause) { + chainCancelCount.incrementAndGet(); + } + + @Override + public void requestNext() {} + }; + + VOperationImpl op = + new VOperationImpl<>( + chain, + grpcContext, + MoreExecutors.directExecutor(), + NoopMetrics.NoopVrpcTracer.INSTANCE, + Deadline.after(10, TimeUnit.SECONDS), + true); + + op.start( + "req", + new VRpcListener() { + @Override + public void onMessage(String msg) {} + + @Override + public void onClose(VRpcResult result) { + userClose.set(result); + onCloseLatch.countDown(); + } + }); + + assertThat(onCloseLatch.await(2, TimeUnit.SECONDS)).isTrue(); + assertThat(userClose.get().getStatus().getCode()).isEqualTo(Status.UNAVAILABLE.getCode()); + + grpcContext.cancel(Status.CANCELLED.withDescription("test").asException()); + Thread.sleep(50); // give any leaked listener a chance to fire + + // No chain.cancel — the cancellationListener was correctly skipped because the chain had + // already reached its terminal state via the queued onClose. + assertThat(chainCancelCount.get()).isEqualTo(0); + } +} From a52b63c0024560743419df3c192e20eb98b9ef18 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 20:16:55 +0000 Subject: [PATCH 22/24] fix: gate tracer.onAttemptFinish on the Active stale-state check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Active.onClose called tracer.onAttemptFinish before the currentState != Active.this guard, so a discarded onClose still counted as an attempt finish. Today nothing reaches this path with a stale Active (VRpcImpl's CAS blocks late deliveries upstream), but the ordering is a latent hazard — any future code change that lets a late onClose through would double-fire tracer.onAttemptFinish for the same attempt and skew per- attempt metrics. Swap the two lines so the guard runs first, matching onMessage above. No behavior change today since the guard doesn't fire in current code paths; this is structural defense. --- .../bigtable/data/v2/internal/middleware/RetryingVRpc.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java index 5c2f389198cd..3edfc2390b03 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java @@ -221,7 +221,9 @@ public void onMessage(RespT msg) { @Override public void onClose(VRpcResult result) { context.getExecutor().throwIfNotInThisExecutor(); - tracer.onAttemptFinish(result); + // Stale-state guard first (matches onMessage above): a discarded onClose must not + // count as an attempt finish, otherwise any future path that delivers a late + // onClose would double-fire tracer.onAttemptFinish for the same attempt. if (currentState != Active.this) { LOG.log( Level.FINE, @@ -230,6 +232,7 @@ public void onClose(VRpcResult result) { result); return; } + tracer.onAttemptFinish(result); if (shouldRetry(result)) { context = context.createForNextAttempt(); Duration retryDelay = From d6f82444b70c2a2a950df4b638f822029900a335 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 20:23:59 +0000 Subject: [PATCH 23/24] fix: ShimImpl uses shutdownAndAwait for userCallbackExecutor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ShimImpl registered its userCallbackExecutor with a bare shutdown() closer, while Client.create's path uses shutdownAndAwait (which gives in-flight listener.onClose tasks a 5-second drain window before shutdownNow). On ShimImpl close, queued callbacks were abandoned mid-flight — fine for quiescent shutdowns but a regression for fast-close patterns (test boundaries, dynamic config reloads) where in-flight callbacks have not yet drained. Promote Client.shutdownAndAwait from private-static to public-static so ShimImpl (different package) can reuse the same shutdown semantics, and update ShimImpl to call it. --- .../google/cloud/bigtable/data/v2/internal/api/Client.java | 6 ++++-- .../cloud/bigtable/data/v2/internal/compat/ShimImpl.java | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index 310af0e66744..44aeb4ca18c2 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -388,8 +388,10 @@ public T get() { } // Drain in-flight listener.onClose tasks before the executor is shut down; bound the wait at 5s - // so close() doesn't hang the caller on a pathological listener. - private static void shutdownAndAwait(ExecutorService exec) { + // so close() doesn't hang the caller on a pathological listener. Public so the compat + // ShimImpl (different package) can reuse the same shutdown semantics for the user-callback + // executor it owns. + public static void shutdownAndAwait(ExecutorService exec) { exec.shutdown(); try { if (!exec.awaitTermination(5, TimeUnit.SECONDS)) { diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java index fcddf47f03b2..1dd38b1af36e 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java @@ -175,7 +175,8 @@ public static Shim create( Resource.createShared(metrics), Resource.createShared(configManager), Resource.createShared(bgExecutor), - Resource.createOwned(userCallbackExecutor, userCallbackExecutor::shutdown)); + Resource.createOwned( + userCallbackExecutor, () -> Client.shutdownAndAwait(userCallbackExecutor))); return new ShimImpl(configManager, client); } From 5e619a4f057112f9a0a56f499f6ea0e4dd2e1f18 Mon Sep 17 00:00:00 2001 From: Igor Bernstein Date: Mon, 15 Jun 2026 21:23:50 +0000 Subject: [PATCH 24/24] chore: replace fully-qualified type names with imports Also swap Guava Objects.hashCode(id) for Long.hashCode(id) in ChannelPoolDpImpl.AfeId to avoid per-call boxing and array allocation. --- .../v2/internal/api/AuthorizedViewAsync.java | 3 ++- .../v2/internal/api/MaterializedViewAsync.java | 3 ++- .../data/v2/internal/api/TableAsync.java | 3 ++- .../v2/internal/channels/ChannelPoolDpImpl.java | 2 +- .../data/v2/internal/compat/ShimImpl.java | 9 ++++++--- .../data/v2/internal/api/TableBaseTest.java | 3 ++- .../v2/internal/session/SessionImplTest.java | 16 +++++++++------- .../v2/internal/session/SessionPoolImplTest.java | 5 +++-- 8 files changed, 27 insertions(+), 17 deletions(-) diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java index 13dc6d7a5d72..99f044347d9b 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java @@ -33,6 +33,7 @@ import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; public class AuthorizedViewAsync implements AutoCloseable, Closeable { @@ -49,7 +50,7 @@ static AuthorizedViewAsync createAndStart( Permission permission, Metrics metrics, BigtableTimer timer, - java.util.concurrent.Executor userCallbackExecutor) { + Executor userCallbackExecutor) { AuthorizedViewName viewName = AuthorizedViewName.builder() diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java index 7b955f3d0718..bf798e2eb141 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java @@ -30,6 +30,7 @@ import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; public class MaterializedViewAsync implements AutoCloseable, Closeable { @@ -45,7 +46,7 @@ public static MaterializedViewAsync createAndStart( OpenMaterializedViewRequest.Permission permission, Metrics metrics, BigtableTimer timer, - java.util.concurrent.Executor userCallbackExecutor) { + Executor userCallbackExecutor) { MaterializedViewName viewName = MaterializedViewName.builder() diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java index 86478e3f0f1f..4f390c110486 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java @@ -34,6 +34,7 @@ import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; public class TableAsync implements AutoCloseable, Closeable { private final TableBase base; @@ -48,7 +49,7 @@ public static TableAsync createAndStart( Permission permission, Metrics metrics, BigtableTimer timer, - java.util.concurrent.Executor userCallbackExecutor) { + Executor userCallbackExecutor) { TableName tableName = TableName.builder() diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java index 9a60a271ea7f..402151676c7c 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java @@ -573,7 +573,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - return com.google.common.base.Objects.hashCode(id); + return Long.hashCode(id); } @Override diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java index 1dd38b1af36e..24ae2e546939 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java @@ -47,6 +47,7 @@ import com.google.cloud.bigtable.data.v2.stub.MetadataExtractorInterceptor; import com.google.common.base.Strings; import com.google.common.collect.ImmutableMap; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.grpc.ClientInterceptor; import io.grpc.ManagedChannel; import io.grpc.Metadata; @@ -54,6 +55,8 @@ import java.io.IOException; import java.time.Duration; import java.util.Optional; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.logging.Level; @@ -160,9 +163,9 @@ public static Shim create( featureFlags = featureFlags.toBuilder().setSessionsRequired(true).build(); } - java.util.concurrent.ExecutorService userCallbackExecutor = - java.util.concurrent.Executors.newCachedThreadPool( - new com.google.common.util.concurrent.ThreadFactoryBuilder() + ExecutorService userCallbackExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder() .setNameFormat("bigtable-callback-shim-%d") .setDaemon(true) .build()); diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java index 257b63549f4a..c88895e4e370 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java @@ -34,6 +34,7 @@ import com.google.protobuf.Message; import io.grpc.Deadline; import io.grpc.Metadata; +import java.time.Duration; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; @@ -178,7 +179,7 @@ public void start(OpenTableRequest openReq, Metadata md) {} public void close(CloseSessionRequest req) {} @Override - public boolean awaitTerminated(java.time.Duration timeout) { + public boolean awaitTerminated(Duration timeout) { return true; } diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java index f78e210ea35c..ddbf697b9e8e 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java @@ -70,10 +70,12 @@ import java.io.IOException; import java.time.Duration; import java.time.Instant; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -579,9 +581,9 @@ void abortFiresWhenListenerOnReadyThrows() throws Exception { SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); - java.util.concurrent.CountDownLatch onCloseLatch = new java.util.concurrent.CountDownLatch(1); - java.util.concurrent.atomic.AtomicReference capturedStatus = - new java.util.concurrent.atomic.AtomicReference<>(); + CountDownLatch onCloseLatch = new CountDownLatch(1); + AtomicReference capturedStatus = + new AtomicReference<>(); Session.Listener throwingListener = new Session.Listener() { @@ -621,8 +623,8 @@ void abortDoesNotHangWhenListenerOnCloseThrows() throws Exception { SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); - java.util.concurrent.CountDownLatch onReadyLatch = new java.util.concurrent.CountDownLatch(1); - java.util.concurrent.CountDownLatch onCloseLatch = new java.util.concurrent.CountDownLatch(1); + CountDownLatch onReadyLatch = new CountDownLatch(1); + CountDownLatch onCloseLatch = new CountDownLatch(1); Session.Listener throwingListener = new Session.Listener() { @@ -675,8 +677,8 @@ void abortDoesNotInfiniteLoopWhenRecoveryListenerAlsoThrows() throws Exception { SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); - java.util.concurrent.CountDownLatch onCloseInvoked = - new java.util.concurrent.CountDownLatch(1); + CountDownLatch onCloseInvoked = + new CountDownLatch(1); Session.Listener doublyThrowingListener = new Session.Listener() { diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java index 6dc2ead454f8..47a1cacf8a97 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java @@ -81,6 +81,7 @@ import java.time.Instant; import java.util.List; import java.util.concurrent.CompletableFuture; +import java.util.function.LongPredicate; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; @@ -466,9 +467,9 @@ public void test() throws Exception { // retry-create-session site computes its delay against the real wall clock and the fake // budget clock, so it can land anywhere from sub-second to a couple of penalty intervals. // Match anything that isn't one of the two fixed cadences. - long watchdogMs = java.time.Duration.ofMinutes(5).toMillis(); + long watchdogMs = Duration.ofMinutes(5).toMillis(); long afePruneMs = SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(); - java.util.function.LongPredicate isRetrySchedule = + LongPredicate isRetrySchedule = d -> d > 0 && d != watchdogMs && d != afePruneMs; // start the pool