diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java index 3edacf7766e0..99f044347d9b 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/AuthorizedViewAsync.java @@ -26,13 +26,14 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.Executor; public class AuthorizedViewAsync implements AutoCloseable, Closeable { @@ -48,7 +49,8 @@ static AuthorizedViewAsync createAndStart( String viewId, Permission permission, Metrics metrics, - ScheduledExecutorService executorService) { + BigtableTimer timer, + Executor userCallbackExecutor) { AuthorizedViewName viewName = AuthorizedViewName.builder() @@ -78,7 +80,8 @@ static AuthorizedViewAsync createAndStart( callOptions, viewName.toString(), metrics, - executorService); + timer, + userCallbackExecutor); return new AuthorizedViewAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java index 82ddff08c3cb..44aeb4ca18c2 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/Client.java @@ -33,21 +33,38 @@ import com.google.cloud.bigtable.data.v2.internal.csm.MetricsImpl; import com.google.cloud.bigtable.data.v2.internal.csm.NoopMetrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; +import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.grpc.CallOptions; import io.opencensus.stats.Stats; import io.opencensus.tags.Tags; import io.opentelemetry.sdk.OpenTelemetrySdk; import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Set; import java.util.WeakHashMap; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.logging.Level; +import java.util.logging.Logger; public class Client implements AutoCloseable { + private static final Logger logger = Logger.getLogger(Client.class.getName()); + + // Per-pool drain budget during close. One full watchdog tick (5 min) plus 1 min buffer; if a + // pool can't drain in that window, something is genuinely wrong on the server side and we give + // up on it so close() returns. The watchdog interval is what makes the worst case finite. + private static final Duration POOL_DRAIN_TIMEOUT = Duration.ofMinutes(6); + public static final FeatureFlags BASE_FEATURE_FLAGS = FeatureFlags.newBuilder() .setReverseScans(false) @@ -71,6 +88,15 @@ public class Client implements AutoCloseable { private final FeatureFlags featureFlags; private final ClientInfo clientInfo; private final Resource backgroundExecutor; + // Drains the per-op SerializingExecutor. Cached pool so a blocked user callback does not starve + // heartbeats, retry delays, or other vRPCs (which all run on backgroundExecutor). + // TODO: source from the gax TransportChannelProvider so transport and user-callback dispatch + // share the same pool. Blocked on missing APIs to extract the configured executor from gax. + private final Resource userCallbackExecutor; + // Hashed-wheel timer for heartbeat / deadline / watchdog / retry scheduling. Built over + // backgroundExecutor (the timer's tick thread dispatches bodies onto it). Single tick thread per + // Client, shared across every SessionPoolImpl. + private final BigtableTimer sessionTimer; private final CallOptions defaultCallOptions; private final ChannelPool channelPool; @@ -78,6 +104,10 @@ public class Client implements AutoCloseable { private final Resource configManager; private final Set> sessionPools = Collections.newSetFromMap(new WeakHashMap<>()); + // Set true at the start of close(); guards openTableAsync / openAuthorizedViewAsync / + // openMaterializedViewAsync so concurrent opens during shutdown don't create pools the close + // path won't see. + private final AtomicBoolean closed = new AtomicBoolean(false); public static Client create(ClientSettings settings) throws IOException { FeatureFlags featureFlags = @@ -90,6 +120,12 @@ public static Client create(ClientSettings settings) throws IOException { .build(); ScheduledExecutorService backgroundExecutor = Executors.newScheduledThreadPool(4); + ExecutorService userCallbackExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder() + .setNameFormat("bigtable-callback-%d") + .setDaemon(true) + .build()); // TODO: compat layer: get this from settings String universeDomain = "googleapis.com"; @@ -137,6 +173,7 @@ public static Client create(ClientSettings settings) throws IOException { } metrics.close(); backgroundExecutor.shutdown(); + userCallbackExecutor.shutdown(); throw new RuntimeException("Failed to fetch initial config", e); } @@ -150,7 +187,8 @@ public static Client create(ClientSettings settings) throws IOException { settings.getChannelProvider(), Resource.createOwned(metrics, metrics::close), Resource.createOwned(configManager, configManager::close), - Resource.createOwned(backgroundExecutor, backgroundExecutor::shutdown)); + Resource.createOwned(backgroundExecutor, backgroundExecutor::shutdown), + Resource.createOwned(userCallbackExecutor, () -> shutdownAndAwait(userCallbackExecutor))); } public Client( @@ -159,13 +197,18 @@ public Client( ChannelProvider channelProvider, Resource metrics, Resource configManager, - Resource bgExecutor) + Resource bgExecutor, + Resource userCallbackExecutor) throws IOException { this.featureFlags = featureFlags; this.clientInfo = clientInfo; this.metrics = metrics; this.configManager = configManager; this.backgroundExecutor = bgExecutor; + this.userCallbackExecutor = userCallbackExecutor; + // Timer's tick thread dispatches bodies onto backgroundExecutor — tick-thread-blocking work + // (anything that takes a pool lock) gets handed off there instead of stalling the wheel. + this.sessionTimer = new NettyWheelTimer("bigtable-session-timer", bgExecutor.get()); defaultCallOptions = CallOptions.DEFAULT; @@ -192,20 +235,66 @@ public Client( @Override public void close() { - sessionPools.forEach( - pool -> - pool.close( - CloseSessionRequest.newBuilder() - .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) - .setDescription("Client closing") - .build())); + if (!closed.compareAndSet(false, true)) { + return; // idempotent + } + + List> toClose; + synchronized (sessionPools) { + toClose = new ArrayList<>(sessionPools); + } + + CloseSessionRequest closeReq = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("Client closing") + .build(); + + // Phase 1: initiate graceful close on each pool. Returns immediately; sessions transition + // CLOSING → graceful CloseSessionRequest → WAIT_SERVER_CLOSE → CLOSED asynchronously. + toClose.forEach(p -> p.close(closeReq)); + + // Phase 2: wait for sessions to drain. The pool's watchdog stays alive during this wait and + // escalates anything stuck in WAIT_SERVER_CLOSE longer than its tick interval (5 min). Once + // a pool's last session reaches CLOSED, drainedFuture completes and awaitTerminated returns. + // Sequential: worst case is POOL_DRAIN_TIMEOUT * N pools, but the happy path drains in << 1s. + for (SessionPool pool : toClose) { + try { + if (!pool.awaitTerminated(POOL_DRAIN_TIMEOUT)) { + logger.warning( + "SessionPool did not drain within " + + POOL_DRAIN_TIMEOUT + + "; abandoning and continuing shutdown"); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.log(Level.WARNING, "Interrupted while draining SessionPool", e); + break; + } + } + + // Phase 3: tear down infrastructure. By this point all listener.onClose tasks for in-flight + // RPCs are queued on their op executors (which run on userCallbackExecutor), and no new + // session responses are coming since every session is CLOSED. The 5s await inside + // userCallbackExecutor.close() is therefore just a guard for tasks in flight — it should + // return immediately in the typical case. + userCallbackExecutor.close(); metrics.close(); channelPool.close(); configManager.close(); + // Stop the timer before tearing down backgroundExecutor (the timer's dispatcher). + sessionTimer.stop(); backgroundExecutor.close(); } + private void checkNotClosed() { + if (closed.get()) { + throw new IllegalStateException("Client is closed"); + } + } + public TableAsync openTableAsync(String tableId, Permission permission) { + checkNotClosed(); TableAsync tableAsync = TableAsync.createAndStart( featureFlags, @@ -216,13 +305,17 @@ public TableAsync openTableAsync(String tableId, Permission permission) { tableId, permission, metrics.get(), - backgroundExecutor.get()); - sessionPools.add(tableAsync.getSessionPool()); + sessionTimer, + userCallbackExecutor.get()); + synchronized (sessionPools) { + sessionPools.add(tableAsync.getSessionPool()); + } return tableAsync; } public AuthorizedViewAsync openAuthorizedViewAsync( String tableId, String viewId, OpenAuthorizedViewRequest.Permission permission) { + checkNotClosed(); AuthorizedViewAsync viewAsync = AuthorizedViewAsync.createAndStart( featureFlags, @@ -234,13 +327,17 @@ public AuthorizedViewAsync openAuthorizedViewAsync( viewId, permission, metrics.get(), - backgroundExecutor.get()); - sessionPools.add(viewAsync.getSessionPool()); + sessionTimer, + userCallbackExecutor.get()); + synchronized (sessionPools) { + sessionPools.add(viewAsync.getSessionPool()); + } return viewAsync; } public MaterializedViewAsync openMaterializedViewAsync( String viewId, OpenMaterializedViewRequest.Permission permission) { + checkNotClosed(); MaterializedViewAsync viewAsync = MaterializedViewAsync.createAndStart( featureFlags, @@ -251,14 +348,19 @@ public MaterializedViewAsync openMaterializedViewAsync( viewId, permission, metrics.get(), - backgroundExecutor.get()); - sessionPools.add(viewAsync.getSessionPool()); + sessionTimer, + userCallbackExecutor.get()); + synchronized (sessionPools) { + sessionPools.add(viewAsync.getSessionPool()); + } return viewAsync; } public static class Resource { - private T value; - private Runnable closer; + private final T value; + private final Runnable closer; + private final java.util.concurrent.atomic.AtomicBoolean closed = + new java.util.concurrent.atomic.AtomicBoolean(false); public static Resource createOwned(T value, Runnable closer) { return new Resource<>(value, closer); @@ -273,12 +375,31 @@ private Resource(T value, Runnable closer) { this.closer = closer; } + /** Idempotent. Repeat calls are no-ops. */ public void close() { - this.closer.run(); + if (closed.compareAndSet(false, true)) { + this.closer.run(); + } } public T get() { return value; } } + + // Drain in-flight listener.onClose tasks before the executor is shut down; bound the wait at 5s + // so close() doesn't hang the caller on a pathological listener. Public so the compat + // ShimImpl (different package) can reuse the same shutdown semantics for the user-callback + // executor it owns. + public static void shutdownAndAwait(ExecutorService exec) { + exec.shutdown(); + try { + if (!exec.awaitTermination(5, TimeUnit.SECONDS)) { + exec.shutdownNow(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + exec.shutdownNow(); + } + } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java index 70023645efc2..bf798e2eb141 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/MaterializedViewAsync.java @@ -23,13 +23,14 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.Executor; public class MaterializedViewAsync implements AutoCloseable, Closeable { @@ -44,7 +45,8 @@ public static MaterializedViewAsync createAndStart( String viewId, OpenMaterializedViewRequest.Permission permission, Metrics metrics, - ScheduledExecutorService executorService) { + BigtableTimer timer, + Executor userCallbackExecutor) { MaterializedViewName viewName = MaterializedViewName.builder() @@ -73,7 +75,8 @@ public static MaterializedViewAsync createAndStart( callOptions, viewId, metrics, - executorService); + timer, + userCallbackExecutor); return new MaterializedViewAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java index 0bc699d4f146..4f390c110486 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableAsync.java @@ -27,13 +27,14 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import io.grpc.CallOptions; import io.grpc.Deadline; import java.io.Closeable; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.Executor; public class TableAsync implements AutoCloseable, Closeable { private final TableBase base; @@ -47,7 +48,8 @@ public static TableAsync createAndStart( String tableId, Permission permission, Metrics metrics, - ScheduledExecutorService executorService) { + BigtableTimer timer, + Executor userCallbackExecutor) { TableName tableName = TableName.builder() @@ -76,7 +78,8 @@ public static TableAsync createAndStart( callOptions, tableId, metrics, - executorService); + timer, + userCallbackExecutor); return new TableAsync(base); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java index 8feef399d625..4082c306dae8 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/api/TableBase.java @@ -25,12 +25,12 @@ import com.google.cloud.bigtable.data.v2.internal.csm.Metrics; import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; -import com.google.cloud.bigtable.data.v2.internal.middleware.CancellableVRpc; +import com.google.cloud.bigtable.data.v2.internal.middleware.VOperationImpl; import com.google.cloud.bigtable.data.v2.internal.middleware.RetryingVRpc; -import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcCallContext; import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolImpl; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.cloud.bigtable.data.v2.internal.util.ClientConfigurationManager; import com.google.common.annotations.VisibleForTesting; @@ -39,11 +39,12 @@ import io.grpc.Context; import io.grpc.Deadline; import io.grpc.Metadata; -import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.Executor; class TableBase implements AutoCloseable { private final SessionPool sessionPool; - private final ScheduledExecutorService backgroundExecutor; + private final BigtableTimer timer; + private final Executor userCallbackExecutor; private final Metrics metrics; private final VRpcDescriptor readRowDescriptor; private final VRpcDescriptor @@ -61,7 +62,8 @@ static TableBase createAndStart( CallOptions callOptions, String sessionPoolName, Metrics metrics, - ScheduledExecutorService executor) { + BigtableTimer timer, + Executor userCallbackExecutor) { SessionPool sessionPool = new SessionPoolImpl<>( @@ -73,11 +75,12 @@ static TableBase createAndStart( callOptions, sessionDescriptor, sessionPoolName, - executor); + timer); sessionPool.start(openReq, new Metadata()); - return new TableBase(sessionPool, readRowDescriptor, mutateRowDescriptor, metrics, executor); + return new TableBase( + sessionPool, readRowDescriptor, mutateRowDescriptor, metrics, timer, userCallbackExecutor); } @VisibleForTesting @@ -86,12 +89,14 @@ static TableBase createAndStart( VRpcDescriptor readRowDescriptor, VRpcDescriptor mutateRowDescriptor, Metrics metrics, - ScheduledExecutorService executor) { + BigtableTimer timer, + Executor userCallbackExecutor) { this.sessionPool = sessionPool; this.readRowDescriptor = readRowDescriptor; this.mutateRowDescriptor = mutateRowDescriptor; this.metrics = metrics; - this.backgroundExecutor = executor; + this.timer = timer; + this.userCallbackExecutor = userCallbackExecutor; } @Override @@ -110,15 +115,11 @@ public SessionPool getSessionPool() { public void readRow( SessionReadRowRequest req, VRpcListener listener, Deadline deadline) { RetryingVRpc retry = - new RetryingVRpc<>(() -> sessionPool.newCall(readRowDescriptor), backgroundExecutor); - + new RetryingVRpc<>(() -> sessionPool.newCall(readRowDescriptor), timer); VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), readRowDescriptor, deadline); - VRpcCallContext ctx = VRpcCallContext.create(deadline, true, tracer); - - CancellableVRpc cancellableVRpc = - new CancellableVRpc<>(retry, Context.current()); - cancellableVRpc.start(req, ctx, listener); + new VOperationImpl<>(retry, Context.current(), userCallbackExecutor, tracer, deadline, true) + .start(req, listener); } public void mutateRow( @@ -126,17 +127,13 @@ public void mutateRow( VRpcListener listener, Deadline deadline) { RetryingVRpc retry = - new RetryingVRpc<>(() -> sessionPool.newCall(mutateRowDescriptor), backgroundExecutor); - + new RetryingVRpc<>(() -> sessionPool.newCall(mutateRowDescriptor), timer); boolean idempotent = Util.isIdempotent(req.getMutationsList()); - VRpcTracer tracer = metrics.newTableTracer(sessionPool.getInfo(), mutateRowDescriptor, deadline); - VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer); - - CancellableVRpc cancellable = - new CancellableVRpc<>(retry, Context.current()); - cancellable.start(req, ctx, listener); + new VOperationImpl<>( + retry, Context.current(), userCallbackExecutor, tracer, deadline, idempotent) + .start(req, listener); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java index 7913b28ef14c..402151676c7c 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/ChannelPoolDpImpl.java @@ -26,6 +26,7 @@ import com.google.common.base.Ticker; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; +import com.google.common.util.concurrent.MoreExecutors; import io.grpc.CallOptions; import io.grpc.ClientCall; import io.grpc.ManagedChannel; @@ -246,8 +247,11 @@ public synchronized SessionStream newStream( channelWrapper.group.numStreams++; totalStreams++; + // DirectExecutor: gRPC/Netty delivers SessionStream.Listener callbacks directly on the + // I/O thread. All work must be fast and non-blocking; blocking work goes to sessionSyncContext. ClientCall innerCall = - channelWrapper.channel.newCall(desc, callOptions); + channelWrapper.channel.newCall( + desc, callOptions.withExecutor(MoreExecutors.directExecutor())); return new SessionStreamImpl(innerCall) { // mark as null so that onClose can tell if onBeforeSessionStart was never called @@ -569,7 +573,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - return com.google.common.base.Objects.hashCode(id); + return Long.hashCode(id); } @Override diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java index ebbc39af7f2c..eb6dbfa339eb 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SessionStream.java @@ -37,6 +37,14 @@ public interface SessionStream { public void forceClose(@Nullable String message, @Nullable Throwable cause); + /** + * Callbacks for session stream events. + * + *

Invariant: callbacks are delivered on Netty I/O threads via {@code DirectExecutor}. + * All work must be fast and non-blocking — any user-facing or potentially blocking work must be + * dispatched onto the session {@code SynchronizationContext} (which then forwards to the op + * executor) before returning. Violating this stalls the channel. + */ public interface Listener { void onBeforeSessionStart(PeerInfo peerInfo); diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java index 6d40b58d53d4..bfd099f60d51 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/channels/SingleChannelPool.java @@ -19,6 +19,7 @@ import com.google.bigtable.v2.SessionClientConfiguration.ChannelPoolConfiguration; import com.google.bigtable.v2.SessionRequest; import com.google.bigtable.v2.SessionResponse; +import com.google.common.util.concurrent.MoreExecutors; import io.grpc.CallOptions; import io.grpc.ManagedChannel; import io.grpc.MethodDescriptor; @@ -45,7 +46,10 @@ public void close() { @Override public SessionStream newStream( MethodDescriptor desc, CallOptions callOptions) { - return new SessionStreamImpl(channel.newCall(desc, callOptions)); + // DirectExecutor: gRPC/Netty delivers SessionStream.Listener callbacks directly on the + // I/O thread. All work must be fast and non-blocking; blocking work goes to sessionSyncContext. + return new SessionStreamImpl( + channel.newCall(desc, callOptions.withExecutor(MoreExecutors.directExecutor()))); } @Override diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java index 353973dc8e58..24ae2e546939 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/compat/ShimImpl.java @@ -47,6 +47,7 @@ import com.google.cloud.bigtable.data.v2.stub.MetadataExtractorInterceptor; import com.google.common.base.Strings; import com.google.common.collect.ImmutableMap; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.grpc.ClientInterceptor; import io.grpc.ManagedChannel; import io.grpc.Metadata; @@ -54,6 +55,8 @@ import java.io.IOException; import java.time.Duration; import java.util.Optional; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.logging.Level; @@ -160,6 +163,13 @@ public static Shim create( featureFlags = featureFlags.toBuilder().setSessionsRequired(true).build(); } + ExecutorService userCallbackExecutor = + Executors.newCachedThreadPool( + new ThreadFactoryBuilder() + .setNameFormat("bigtable-callback-shim-%d") + .setDaemon(true) + .build()); + Client client = new Client( clientChannelProvider.updateFeatureFlags(featureFlags), @@ -167,7 +177,9 @@ public static Shim create( clientChannelProvider, Resource.createShared(metrics), Resource.createShared(configManager), - Resource.createShared(bgExecutor)); + Resource.createShared(bgExecutor), + Resource.createOwned( + userCallbackExecutor, () -> Client.shutdownAndAwait(userCallbackExecutor))); return new ShimImpl(configManager, client); } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java deleted file mode 100644 index ded284e1979c..000000000000 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/CancellableVRpc.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright 2026 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.google.cloud.bigtable.data.v2.internal.middleware; - -import com.google.common.util.concurrent.MoreExecutors; -import io.grpc.Context; -import io.grpc.Deadline; -import java.util.Optional; -import java.util.concurrent.TimeoutException; - -/** - * A {@link VRpc} decorator that propagates gRPC {@link Context} cancellation to the underlying - * VRpc. - */ -public class CancellableVRpc extends ForwardingVRpc { - private final Context context; - private final Context.CancellationListener cancellationListener; - - public CancellableVRpc(VRpc delegate, Context context) { - super(delegate); - this.context = context; - this.cancellationListener = - (c) -> { - boolean deadlineExceeded = - Optional.ofNullable(c.getDeadline()).map(Deadline::isExpired).orElse(false); - deadlineExceeded = deadlineExceeded && c.cancellationCause() instanceof TimeoutException; - // Let VRpc machinery handle deadline exceeded - if (!deadlineExceeded) { - delegate.cancel("gRPC context cancelled", c.cancellationCause()); - } - }; - } - - @Override - public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { - context.addListener(cancellationListener, MoreExecutors.directExecutor()); - super.start( - req, ctx, new CancellationCleanupListener<>(listener, context, cancellationListener)); - } - - private static class CancellationCleanupListener extends ForwardListener { - private final Context context; - private final Context.CancellationListener cancellationListener; - - private CancellationCleanupListener( - VRpcListener delegate, - Context context, - Context.CancellationListener cancellationListener) { - super(delegate); - this.context = context; - this.cancellationListener = cancellationListener; - } - - @Override - public void onClose(VRpcResult result) { - context.removeListener(cancellationListener); - super.onClose(result); - } - } -} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java new file mode 100644 index 000000000000..5a3b3e0b9750 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/OpExecutor.java @@ -0,0 +1,145 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import com.google.common.util.concurrent.MoreExecutors; +import java.util.ArrayDeque; +import java.util.concurrent.Executor; + +/** + * Per-op serializing executor that tracks which thread is currently draining it, so callers can + * assert they are running inside the executor (analogous to {@code + * SynchronizationContext#throwIfNotInThisSynchronizationContext}). + * + *

{@link #runInline} executes synchronously on the caller thread when the executor is idle, or + * falls back to a queued submission when busy. Use it to avoid the queue+drain round-trip for the + * first task of a fresh op executor (e.g. the start() dispatch). + * + *

If a task throws, the registered {@link UncaughtExceptionHandler} is invoked on the same task + * thread (still inside the drain wrapper, so {@link #throwIfNotInThisExecutor} still passes). This + * is the last-resort recovery point for the op chain — without it, an exception inside a callback + * is silently dropped and the caller's listener never sees a terminal close. The handler's own + * throws propagate (caught by the backing executor in production; surfaced to the calling thread + * when the backing is {@link MoreExecutors#directExecutor()}, which is what makes fail-fast test + * handlers like {@code t -> throw new AssertionError(t)} work). + */ +public final class OpExecutor implements Executor { + + public interface UncaughtExceptionHandler { + void uncaught(Throwable t); + } + + private final Executor backing; + private final UncaughtExceptionHandler handler; + + // Guards queue and drainScheduled. runningThread is volatile so throwIfNotInThisExecutor() can + // read it lock-free; writes happen inside the lock to piggy-back the memory barrier. + private final ArrayDeque queue = new ArrayDeque<>(); + private boolean drainScheduled = false; + private volatile Thread runningThread; + + public OpExecutor(Executor backing, UncaughtExceptionHandler handler) { + this.backing = backing; + this.handler = handler; + } + + @Override + public void execute(Runnable r) { + synchronized (queue) { + queue.add(r); + if (!drainScheduled && runningThread == null) { + scheduleDrainLocked(); + } + } + } + + /** + * Runs {@code r} synchronously on the caller thread if this executor is idle, otherwise queues + * it for later drain on the backing executor. Either way, FIFO ordering with other tasks is + * preserved. + */ + public void runInline(Runnable r) { + synchronized (queue) { + if (drainScheduled || runningThread != null || !queue.isEmpty()) { + queue.add(r); + if (!drainScheduled) { + scheduleDrainLocked(); + } + return; + } + runningThread = Thread.currentThread(); + } + try { + try { + r.run(); + } catch (Throwable t) { + handler.uncaught(t); + } + } finally { + synchronized (queue) { + runningThread = null; + if (!queue.isEmpty() && !drainScheduled) { + scheduleDrainLocked(); + } + } + } + } + + // Schedule a drain on the backing executor. If the backing throws (e.g. RejectedExecutionException + // during shutdown), reset drainScheduled before propagating so the next execute() can retry + // instead of wedging the executor with no drainer. + private void scheduleDrainLocked() { + drainScheduled = true; + try { + backing.execute(this::drain); + } catch (Throwable t) { + drainScheduled = false; + throw t; + } + } + + private void drain() { + while (true) { + Runnable r; + synchronized (queue) { + r = queue.poll(); + if (r == null) { + drainScheduled = false; + return; + } + runningThread = Thread.currentThread(); + } + try { + try { + r.run(); + } catch (Throwable t) { + handler.uncaught(t); + } + } finally { + synchronized (queue) { + runningThread = null; + } + } + } + } + + public void throwIfNotInThisExecutor() { + if (Thread.currentThread() != runningThread) { + throw new IllegalStateException("Not running on this op executor"); + } + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java index d6048bfb9140..3edfc2390b03 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpc.java @@ -17,16 +17,15 @@ package com.google.cloud.bigtable.data.v2.internal.middleware; import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.common.base.Stopwatch; import com.google.protobuf.Duration; import com.google.protobuf.util.Durations; import com.google.rpc.RetryInfo; import io.grpc.Context; import io.grpc.Status; -import io.grpc.SynchronizationContext; import java.util.Optional; import java.util.concurrent.RejectedExecutionException; -import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import java.util.logging.Level; @@ -46,31 +45,22 @@ public class RetryingVRpc implements VRpc { private VRpcCallContext context; private VRpcTracer tracer; - private final ScheduledExecutorService executor; - private final SynchronizationContext syncContext; + private final BigtableTimer timer; - // current state and all the flags don't need to be volatile because they're only updated within - // the sync context. + // All mutable state is owned by the op executor; VOperationImpl trampolines every inbound call + // onto it, so no synchronization is needed here. private State currentState; private boolean started; - // Breaks the loop if uncaught exception happens during sync context execution. + // Breaks the loop on uncaught exception during cancel. private boolean isCancelling; - public RetryingVRpc(Supplier> supplier, ScheduledExecutorService executor) { + public RetryingVRpc(Supplier> supplier, BigtableTimer timer) { this.attemptFactory = supplier; grpcContext = Context.current(); otelContext = io.opentelemetry.context.Context.current(); - this.executor = otelContext.wrap(executor); - this.syncContext = - new SynchronizationContext( - (t, e) -> { - this.cancel( - "Unexpected error while notifying the caller of RetryingVRpc. Trying to cancel" - + " vRpc to ensure consistent state", - e); - }); + this.timer = timer; started = false; isCancelling = false; @@ -79,60 +69,80 @@ public RetryingVRpc(Supplier> supplier, ScheduledExecutorServi @Override public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { - syncContext.execute( - () -> { - if (started) { - listener.onClose( - VRpcResult.createRejectedError( - Status.FAILED_PRECONDITION.withDescription("operation is already started"))); - return; - } - started = true; - - this.request = req; - this.listener = listener; - this.context = ctx; - this.tracer = context.getTracer(); - - tracer.onOperationStart(); - currentState.onStart(); - }); + if (started) { + listener.onClose( + VRpcResult.createRejectedError( + Status.FAILED_PRECONDITION.withDescription("operation is already started"))); + return; + } + + // Publish the fields BEFORE the try block. If anything below throws and we recover via + // cancel(), cancel() reads this.context / this.listener — they must be set already, or + // we trade the original failure for an NPE inside the recovery path. + this.request = req; + this.listener = listener; + this.context = ctx; + this.tracer = context.getTracer(); + + // tracer.onOperationStart runs BEFORE started=true so a tracer failure short-circuits to a + // direct listener.onClose without entering the state machine. If started=true were set first, + // a tracer throw would route through cancel→Done.onStart, which would then NPE in its own + // finally on tracer.onOperationFinish/recordApplicationBlockingLatencies, swallowing the + // original cause and surprising the caller with the secondary NPE. + try { + tracer.onOperationStart(); + } catch (Throwable t) { + listener.onClose( + VRpcResult.createRejectedError( + Status.INTERNAL.withDescription("tracer.onOperationStart failed").withCause(t))); + return; + } + started = true; + + try { + currentState.onStart(); + } catch (Throwable t) { + cancel("Unexpected error in start", t); + } } @Override public void cancel(@Nullable String message, @Nullable Throwable cause) { - syncContext.execute( - () -> { - if (currentState.isDone() || isCancelling) { - LOG.fine("Ignoring cancel because the vRPC is already cancelled or done."); - return; - } - // Prevents infinite loop if there's any error thrown during this phase. - isCancelling = true; - Throwable finalCause = cause; - try { - currentState.onCancel(message, cause); - } catch (Throwable t) { - if (finalCause != null) { - finalCause.addSuppressed(t); - } else { - finalCause = t; - } - } - onStateChange( - new Done( - VRpcResult.createRejectedError( - Status.CANCELLED.withDescription(message).withCause(finalCause)))); - }); + if (currentState.isDone() || isCancelling) { + LOG.fine("Ignoring cancel because the vRPC is already cancelled or done."); + return; + } + // Prevents infinite loop if there's any error thrown during this phase. + isCancelling = true; + Throwable finalCause = cause; + try { + currentState.onCancel(message, cause); + } catch (Throwable t) { + if (finalCause != null) { + finalCause.addSuppressed(t); + } else { + finalCause = t; + } + } + onStateChange( + new Done( + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription(message).withCause(finalCause)))); } @Override public void requestNext() { + // Assert the op-executor affinity even though the body is dead today — when streaming lands + // and this becomes real, the missing assertion would silently allow off-thread access. + // Guarded on context being set so a misuse before start() still throws UnsupportedOperationException + // rather than NPE on the assertion. + if (context != null) { + context.getExecutor().throwIfNotInThisExecutor(); + } throw new UnsupportedOperationException("request next is not supported in unary"); } void onStateChange(State state) { - syncContext.throwIfNotInThisSynchronizationContext(); if (currentState.isDone()) { return; } @@ -177,55 +187,68 @@ public void onStart() { new VRpcListener() { @Override public void onMessage(RespT msg) { - syncContext.execute( - () -> { - if (currentState != Active.this) { - LOG.log( - Level.FINE, - "Discarding response {0} because the attempt is no longer active.", - msg); - return; - } - tracer.onResponseReceived(); - Stopwatch appTimer = Stopwatch.createStarted(); - try { - listener.onMessage(msg); - } finally { - tracer.recordApplicationBlockingLatencies(appTimer.elapsed()); - } - }); + context.getExecutor().throwIfNotInThisExecutor(); + if (currentState != Active.this) { + LOG.log( + Level.FINE, + "Discarding response {0} because the attempt is no longer active.", + msg); + return; + } + tracer.onResponseReceived(); + Stopwatch appTimer = Stopwatch.createStarted(); + Throwable userThrow = null; + try { + listener.onMessage(msg); + } catch (Throwable t) { + userThrow = t; + } finally { + tracer.recordApplicationBlockingLatencies(appTimer.elapsed()); + } + if (userThrow != null) { + // Classify as USER_FAILURE (not CANCELLED, which is what the OpExecutor uncaught + // handler would produce via chain.cancel). Finish tracing for the in-flight + // attempt, cancel the underlying gRPC call so no further events arrive (its later + // onClose is dropped by the currentState != Active.this guard), and transition + // directly to Done with the user-error result. + VRpcResult userResult = VRpcResult.createUserError(userThrow); + tracer.onAttemptFinish(userResult); + attempt.cancel("User callback threw", userThrow); + onStateChange(new Done(userResult)); + } } @Override public void onClose(VRpcResult result) { - syncContext.execute( - () -> { - tracer.onAttemptFinish(result); - if (currentState != Active.this) { - LOG.log( - Level.FINE, - "Discarding server close with result {0} because the the attempt is no" - + " longer active.", - result); - return; - } - if (shouldRetry(result)) { - context = context.createForNextAttempt(); - Duration retryDelay = - Optional.ofNullable(result.getRetryInfo()) - .map(RetryInfo::getRetryDelay) - .orElse(Durations.ZERO); - if (Durations.compare(retryDelay, Durations.ZERO) > 0) { - Scheduled scheduled = new Scheduled(retryDelay); - onStateChange(scheduled); - } else { - onStateChange(new Idle()); - } - return; - } - - onStateChange(new Done(result)); - }); + context.getExecutor().throwIfNotInThisExecutor(); + // Stale-state guard first (matches onMessage above): a discarded onClose must not + // count as an attempt finish, otherwise any future path that delivers a late + // onClose would double-fire tracer.onAttemptFinish for the same attempt. + if (currentState != Active.this) { + LOG.log( + Level.FINE, + "Discarding server close with result {0} because the the attempt is no" + + " longer active.", + result); + return; + } + tracer.onAttemptFinish(result); + if (shouldRetry(result)) { + context = context.createForNextAttempt(); + Duration retryDelay = + Optional.ofNullable(result.getRetryInfo()) + .map(RetryInfo::getRetryDelay) + .orElse(Durations.ZERO); + if (Durations.compare(retryDelay, Durations.ZERO) > 0) { + Scheduled scheduled = new Scheduled(retryDelay); + onStateChange(scheduled); + } else { + onStateChange(new Idle()); + } + return; + } + + onStateChange(new Done(result)); } }); } @@ -271,7 +294,7 @@ boolean shouldRetry(VRpcResult result) { class Scheduled extends State { private final Duration retryDelay; - private SynchronizationContext.ScheduledHandle future; + private BigtableTimer.Timeout future; Scheduled(Duration retryDelay) { this.retryDelay = retryDelay; @@ -280,12 +303,21 @@ class Scheduled extends State { @Override public void onStart() { try { + // Wraps go innermost so the captured gRPC + OpenTelemetry contexts are re-established at + // the moment the body runs, not just while the dispatcher is invoking the outer task. future = - syncContext.schedule( - () -> grpcContext.wrap(() -> onStateChange(new Idle())).run(), + timer.newTimeout( + () -> + context.getExecutor().execute( + () -> + grpcContext.wrap( + () -> + otelContext + .wrap(() -> onStateChange(new Idle())) + .run()) + .run()), Durations.toMillis(retryDelay), - TimeUnit.MILLISECONDS, - executor); + TimeUnit.MILLISECONDS); } catch (RejectedExecutionException e) { onStateChange( new Done( @@ -301,9 +333,8 @@ public void onStart() { public void onCancel(String reason, Throwable throwable) { // future can be null if schedule throws an exception that's not RejectedExecutionException. // In which case sync context uncaught exception handler will be called, which calls cancel on - // the current - // state before transition into done state. - if (future != null && future.isPending()) { + // the current state before transition into done state. + if (future != null && !future.isCancelled()) { future.cancel(); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java new file mode 100644 index 000000000000..fcc50904fedd --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperation.java @@ -0,0 +1,37 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; +import javax.annotation.Nullable; + +/** + * Entry point for a single user-facing operation that may translate into one or more {@link VRpc} + * attempts. + * + *

{@code VOperation} sits above the {@code VRpc} chain. It owns the per-op {@link + * VRpc.VRpcCallContext} and the gRPC {@link io.grpc.Context} cancellation listener, so downstream + * middleware only deals with the chain itself. + */ +public interface VOperation { + + /** Start the operation. Results are delivered to {@code listener}. */ + void start(ReqT req, VRpcListener listener); + + /** Cancel a started operation. Best effort. */ + void cancel(@Nullable String message, @Nullable Throwable cause); +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java new file mode 100644 index 000000000000..880b65a6f56f --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImpl.java @@ -0,0 +1,147 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; +import com.google.cloud.bigtable.data.v2.internal.middleware.ForwardingVRpc.ForwardListener; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcCallContext; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcResult; +import com.google.common.util.concurrent.MoreExecutors; +import io.grpc.Context; +import io.grpc.Deadline; +import java.util.Optional; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeoutException; +import javax.annotation.Nullable; + +/** + * The single edge between the user and the VRpc middleware chain. Trampolines all inbound user + * calls onto opExecutor and owns the gRPC {@link Context} cancellation listener so that every + * layer below is single-threaded on opExecutor. + * + *

Precondition: {@link #cancel} must not be called before {@link #start}. + */ +public class VOperationImpl implements VOperation { + + private final VRpc chain; + private final Context grpcContext; + private final Executor userCallbackExecutor; + private final VRpcTracer tracer; + private final Deadline deadline; + private final boolean idempotent; + private final Context.CancellationListener cancellationListener; + + // Written in start() on the caller thread before the listener is registered and before cancel() + // is reachable from any external thread. Volatile for safe publication to those threads. + private volatile OpExecutor opExecutor; + + public VOperationImpl( + VRpc chain, + Context grpcContext, + Executor userCallbackExecutor, + VRpcTracer tracer, + Deadline deadline, + boolean idempotent) { + this.chain = chain; + this.grpcContext = grpcContext; + this.userCallbackExecutor = userCallbackExecutor; + this.tracer = tracer; + this.deadline = deadline; + this.idempotent = idempotent; + this.cancellationListener = + (c) -> { + boolean deadlineExceeded = + Optional.ofNullable(c.getDeadline()).map(Deadline::isExpired).orElse(false); + deadlineExceeded = deadlineExceeded && c.cancellationCause() instanceof TimeoutException; + // Let VRpc machinery handle deadline exceeded + if (!deadlineExceeded) { + cancel("gRPC context cancelled", c.cancellationCause()); + } + }; + } + + @Override + public void start(ReqT req, VRpcListener listener) { + // Last-resort recovery: if any op-executor task throws (typically a user-installed tracer, + // or a listener callback that escapes RetryingVRpc's existing per-state try/catches), drive + // the chain to a terminal state so the caller's listener still receives an onClose. The + // handler runs on the failed task's wrapper, so chain.cancel() — which calls + // OpExecutor#throwIfNotInThisExecutor — passes. RetryingVRpc.cancel is idempotent + // (isCancelling / currentState.isDone() guards), so a cascade of failures collapses to a + // single Done. + OpExecutor exec = + new OpExecutor( + userCallbackExecutor, t -> chain.cancel("Uncaught exception in op executor task", t)); + this.opExecutor = exec; + VRpcCallContext ctx = VRpcCallContext.create(deadline, idempotent, tracer, exec); + CleanupListener wrapped = + new CleanupListener<>(listener, grpcContext, cancellationListener); + exec.runInline(() -> chain.start(req, ctx, wrapped)); + // Register AFTER chain.start so a context-cancel that fires immediately is sequenced behind + // start. Matches ClientCallImpl's ordering (grpc-java issue #1343). + // + // Queueing the registration onto the op executor is what makes the closed-check sound: any + // onClose that chain.start enqueued during runInline drains FIRST (FIFO), so by the time this + // task runs wrapped.closed reflects whether onClose has already fired. If it has, we skip + // addListener — otherwise the listener would pin the chain on grpcContext for its lifetime + // (CleanupListener.onClose already called removeListener as a no-op pre-registration). If + // grpcContext gets cancelled between start() returning and this task running, the + // directExecutor below fires the listener immediately on addListener, so cancel still + // propagates correctly. + // + // runInline is the right verb here: when chain.start enqueued nothing (common path), the + // executor is idle and the body runs inline on this thread — no extra context switch. When + // chain.start did enqueue an onClose, runInline takes the queue branch and FIFO drains both. + exec.runInline( + () -> { + if (!wrapped.closed) { + grpcContext.addListener(cancellationListener, MoreExecutors.directExecutor()); + } + }); + } + + @Override + public void cancel(@Nullable String message, @Nullable Throwable cause) { + opExecutor.execute(() -> chain.cancel(message, cause)); + } + + private static class CleanupListener extends ForwardListener { + private final Context grpcContext; + private final Context.CancellationListener cancellationListener; + // Read by VOperationImpl.start on the caller thread after runInline returns. runInline runs + // chain.start synchronously, so any sync onClose has completed (and this flag been set) by + // the time start() reads it on the same thread — no synchronization needed. + volatile boolean closed = false; + + CleanupListener( + VRpcListener delegate, + Context grpcContext, + Context.CancellationListener cancellationListener) { + super(delegate); + this.grpcContext = grpcContext; + this.cancellationListener = cancellationListener; + } + + @Override + public void onClose(VRpcResult result) { + closed = true; + grpcContext.removeListener(cancellationListener); + super.onClose(result); + } + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java index a29a51fd72c6..d5a605461d37 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/middleware/VRpc.java @@ -24,6 +24,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Ticker; import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.MoreExecutors; import com.google.protobuf.Any; import com.google.rpc.RetryInfo; import io.grpc.Context; @@ -92,12 +93,32 @@ abstract class VRpcCallContext { public abstract VRpcTracer getTracer(); + /** + * The per-op executor that serializes all middleware below {@link VOperationImpl}. Owned by + * {@code VOperationImpl}; downstream layers use it via {@code ctx.getExecutor()}. + */ + public abstract OpExecutor getExecutor(); + // TODO: csm // Clientside metrics instrument // public abstract BigtableTracer getTracer(); + /** + * Defaults the executor to one over {@link MoreExecutors#directExecutor()}. Suitable for tests + * that do not exercise the per-op serialization; production callers go through {@link + * VOperationImpl} which supplies a real serializing executor. + */ public static VRpcCallContext create( Deadline deadline, boolean isIdempotent, VRpcTracer tracer) { + return create( + deadline, + isIdempotent, + tracer, + new OpExecutor(MoreExecutors.directExecutor(), t -> {})); + } + + public static VRpcCallContext create( + Deadline deadline, boolean isIdempotent, VRpcTracer tracer, OpExecutor executor) { Deadline grpcContextDeadline = Context.current().getDeadline(); @@ -114,12 +135,12 @@ public static VRpcCallContext create( } return new AutoValue_VRpc_VRpcCallContext( - OperationInfo.create(operationTimeout, isIdempotent), "TODO", tracer); + OperationInfo.create(operationTimeout, isIdempotent), "TODO", tracer, executor); } public VRpcCallContext createForNextAttempt() { return new AutoValue_VRpc_VRpcCallContext( - getOperationInfo().createForNextAttempt(), getTraceParent(), getTracer()); + getOperationInfo().createForNextAttempt(), getTraceParent(), getTracer(), getExecutor()); } } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java new file mode 100644 index 000000000000..48930ece77e1 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/BigtableTimer.java @@ -0,0 +1,60 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.data.v2.internal.session; + +import java.util.concurrent.TimeUnit; + +/** + * Schedules short-lived callbacks (heartbeat ticks, deadline monitors, watchdog ticks) at + * approximate, low-resolution times. Backed by a hashed wheel: O(1) insert and O(1) cancel, + * regardless of how many pending timeouts the wheel holds. + * + *

{@link #newTimeout} runs the callback on the timer's bundled dispatch executor — callers do + * not have to wrap their bodies in {@code executor.execute(...)} to stay off the tick thread. This + * is the default and is correct for any callback that takes a lock or does real work. + * + *

TODO: once later refactor steps introduce per-op / per-session dispatchers (e.g. {@code + * SerializingExecutor} or {@code SynchronizationContext}), add a {@code newTimeoutOnTickThread} + * variant so callers with their own dispatcher can skip the wasted hop through the bundled + * executor. + * + *

This is a thin abstraction over a single concrete implementation today (see {@code + * NettyWheelTimer}). It exists so the implementation can be swapped after benchmarking establishes + * a baseline. + */ +public interface BigtableTimer { + /** + * Schedules {@code task} to run after {@code delay}. The task body runs on the timer's bundled + * dispatch executor, not on the tick thread, so it is safe to take locks or do bounded work. + * + *

The returned handle can be used to cancel the task; cancel is O(1) and does not leave the + * entry in any heap. + */ + Timeout newTimeout(Runnable task, long delay, TimeUnit unit); + + /** + * Releases the tick thread and discards any pending timeouts. Idempotent. After {@code stop()}, + * subsequent calls to {@link #newTimeout} throw {@link IllegalStateException}. + */ + void stop(); + + interface Timeout { + /** Cancels the scheduled task. Returns true if the task had not yet fired. */ + boolean cancel(); + + boolean isCancelled(); + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java new file mode 100644 index 000000000000..825065055110 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/NettyWheelTimer.java @@ -0,0 +1,78 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.data.v2.internal.session; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import io.grpc.netty.shaded.io.netty.util.HashedWheelTimer; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; + +/** + * {@link BigtableTimer} backed by Netty's {@code HashedWheelTimer}, accessed via the shaded copy + * inside {@code grpc-netty-shaded}. We depend on the shaded class so the library does not pull in + * an additional {@code io.netty:netty-common} artifact. + * + *

Temporary: once threading-refactor benchmarks establish a baseline, this should be replaced + * with an in-tree implementation that does not reach into gRPC's shaded internals. + */ +public final class NettyWheelTimer implements BigtableTimer { + // 10 ms tick × 512 buckets ≈ 5 s per rotation. Heartbeat (100 ms), deadlines (sub-second to + // seconds), and watchdog (5 min) all sit comfortably inside this resolution. + private static final long TICK_DURATION_MS = 10; + private static final int TICKS_PER_WHEEL = 512; + + private final HashedWheelTimer delegate; + private final Executor dispatcher; + + public NettyWheelTimer(String name, Executor dispatcher) { + this.dispatcher = dispatcher; + this.delegate = + new HashedWheelTimer( + new ThreadFactoryBuilder().setNameFormat(name + "-%d").setDaemon(true).build(), + TICK_DURATION_MS, + TimeUnit.MILLISECONDS, + TICKS_PER_WHEEL); + } + + @Override + public Timeout newTimeout(Runnable task, long delay, TimeUnit unit) { + return new TimeoutHandle( + delegate.newTimeout(ignored -> dispatcher.execute(task), delay, unit)); + } + + @Override + public void stop() { + delegate.stop(); + } + + private static final class TimeoutHandle implements Timeout { + private final io.grpc.netty.shaded.io.netty.util.Timeout nettyTimeout; + + TimeoutHandle(io.grpc.netty.shaded.io.netty.util.Timeout nettyTimeout) { + this.nettyTimeout = nettyTimeout; + } + + @Override + public boolean cancel() { + return nettyTimeout.cancel(); + } + + @Override + public boolean isCancelled() { + return nettyTimeout.isCancelled(); + } + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java index 9ee86bffbdbd..b976eef652a2 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImpl.java @@ -44,15 +44,17 @@ import com.google.protobuf.util.Durations; import io.grpc.Metadata; import io.grpc.Status; +import io.grpc.SynchronizationContext; import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.Locale; import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; -import javax.annotation.concurrent.GuardedBy; /** Wraps a Bidi ClientCall and layers session semantics on top. */ @VisibleForTesting @@ -70,57 +72,75 @@ public class SessionImpl implements Session, VRpcSessionApi { // A time in the future to skip heartbeat checks when there's no active vRPCs on the session static final Duration FUTURE_TIME = Duration.ofMinutes(30); - /* - * This lock should be mostly uncontended - all access should be naturally interleaved. Contention - * can only really happen when an unsolicited gRPC control message (ie GOAWAY) arrives at the same - * time as newCall or cancel. - * TODO: Contention will increase when multiplexing is implemented. - */ - private final Object lock = new Object(); + private static final CloseSessionRequest MISSED_HEARTBEAT_CLOSE_REQUEST = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_MISSED_HEARTBEAT) + .setDescription("missed heartbeat") + .build(); private final Clock clock; + private final BigtableTimer timer; + // Serializes all session state mutations. Stream callbacks and the heartbeat tick dispatch + // onto it; every handler below runs inside a syncContext task. + private final SynchronizationContext sessionSyncContext; private final SessionTracer tracer; private final DebugTagTracer debugTagTracer; private final SessionInfo info; - @GuardedBy("lock") private final SessionStream stream; - @GuardedBy("lock") - private SessionState state = SessionState.NEW; + private volatile SessionState state = SessionState.NEW; - @GuardedBy("lock") - private Instant lastStateChangedAt; + private volatile Instant lastStateChangedAt; + // All fields below are owned by sessionSyncContext: every writer runs inside a + // sessionSyncContext task, and the in-class readers do too (handlers, scheduled heartbeat + // tick). They lost their volatile / lock guards when synchronized(lock) was removed; + // SyncContext serialization is what makes plain reads/writes safe. + // + // External callers of getOpenParams / isOpenParamsUpdated / getNextHeartbeat must therefore + // either run on sessionSyncContext themselves (e.g. via a Session.Listener callback, which is + // dispatched from inside the context) or accept a possibly-stale snapshot. There are no + // off-context production readers today. private Listener sessionListener; - private volatile OpenParams openParams; + private OpenParams openParams; - private volatile boolean openParamsUpdated; + private boolean openParamsUpdated; @Nullable private CloseSessionRequest closeReason = null; - @GuardedBy("lock") - private long nextRpcId = 1; + private final AtomicLong nextRpcId = new AtomicLong(1); // TODO: replace with a map when implementing multiplexing - @GuardedBy("lock") private VRpcImpl currentRpc = null; - @GuardedBy("lock") private VRpcResult currentCancel = null; private SessionParametersResponse sessionParameters = DEFAULT_SESSION_PARAMS; - private volatile Duration heartbeatInterval = + + private Duration heartbeatInterval = Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); - private volatile Instant nextHeartbeat; + private Instant nextHeartbeat; + + // Handle for the in-flight heartbeat tick (one outstanding at a time). Cancelled on terminal + // transitions so the wheel doesn't carry a no-op entry until the next fire. + @Nullable private BigtableTimer.Timeout heartbeatTimeout; + + // Set by the global SyncContext handler when an uncaught exception triggers an abort. Read on + // re-entry to break out instead of looping. Only accessed inside sessionSyncContext. + private boolean isAborting = false; public SessionImpl( - Metrics metrics, SessionPoolInfo poolInfo, long sessionNum, SessionStream stream) { - this(metrics, Clock.systemUTC(), poolInfo, sessionNum, stream); + Metrics metrics, + SessionPoolInfo poolInfo, + long sessionNum, + SessionStream stream, + BigtableTimer timer) { + this(metrics, Clock.systemUTC(), poolInfo, sessionNum, stream, timer); } SessionImpl( @@ -128,28 +148,97 @@ public SessionImpl( Clock clock, SessionPoolInfo poolInfo, long sessionNum, - SessionStream stream) { + SessionStream stream, + BigtableTimer timer) { this.clock = clock; + this.timer = timer; this.info = SessionInfo.create(poolInfo, sessionNum); this.stream = stream; this.tracer = metrics.newSessionTracer(poolInfo); this.debugTagTracer = metrics.getDebugTagTracer(); this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); this.openParamsUpdated = false; + // On uncaught exception, drive the session through a clean terminal-close path so the pool + // and the in-flight vRpc are always notified. notifyTerminalClose has local guards, and + // isAborting prevents recursion if the abort path itself throws. + this.sessionSyncContext = + new SynchronizationContext((thread, e) -> abortFromUncaughtException(e)); + } + + private void abortFromUncaughtException(Throwable e) { + if (isAborting) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Secondary uncaught exception during abort, ignoring", + info.getLogName()), + e); + return; + } + isAborting = true; + + logger.log( + Level.SEVERE, + String.format( + "Session error: %s Uncaught exception in session SyncContext in state %s, PeerInfo:" + + " %s — aborting session", + info.getLogName(), state, formatPeerInfo(safeGetPeerInfo())), + e); + + if (state == SessionState.CLOSED) { + return; + } + + // Always overwrite closeReason: the abort is what actually happened, not whatever clean close + // we may have been attempting. Fold the prior reason into the description for forensics so + // downstream metrics (which bucket by reason) attribute this to ERROR rather than the + // interrupted close. + String prevDesc = + (closeReason != null) + ? " (was closing for: " + + closeReason.getReason() + + " — " + + closeReason.getDescription() + + ")" + : ""; + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) + .setDescription("Uncaught exception in session SyncContext: " + e + prevDesc) + .build(); + + VRpcImpl localRpc = currentRpc; + currentRpc = null; + SessionState prevState = state; + updateState(SessionState.CLOSED); + + // Defensively tell the transport we're done. Safe on un-started streams via the try/catch. + try { + stream.forceClose("Session aborted due to uncaught exception", e); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Exception while force-closing stream during abort", + info.getLogName()), + t); + } + + notifyTerminalClose( + Status.INTERNAL.withDescription("Session aborted").withCause(e), + new Metadata(), + localRpc, + prevState); } @Override public SessionState getState() { - synchronized (lock) { - return state; - } + return state; } @Override public Instant getLastStateChange() { - synchronized (lock) { - return lastStateChangedAt; - } + return lastStateChangedAt; } @Override @@ -169,13 +258,7 @@ public Instant getNextHeartbeat() { @Override public PeerInfo getPeerInfo() { - // This lock might not be necessary, its populated once on a gRPC callback which should - // establish a happens before relationship. However access to the underlying stream is guarded - // with errorprone, so sync block is required to get around the lint. - // TODO: consider removing the sync block - synchronized (lock) { - return stream.getPeerInfo(); - } + return stream.getPeerInfo(); } @Override @@ -185,98 +268,101 @@ public String getLogName() { @Override public void forceClose(CloseSessionRequest closeReason) { - synchronized (lock) { - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_force_close_wrong_state", - "Tried to forceClose an unstarted session %s in state %s", - info.getLogName(), - state); - - if (state == SessionState.CLOSED) { - return; - } - - updateState(SessionState.WAIT_SERVER_CLOSE); - this.closeReason = closeReason; - - // Not sending the CloseSessionRequest because cancel() will just drop it - stream.forceClose(closeReason.getDescription(), null); - // Listeners will be notified by dispatchStreamClosed - } + sessionSyncContext.execute( + () -> { + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_force_close_wrong_state", + "Tried to forceClose an unstarted session %s in state %s", + info.getLogName(), + state); + + if (state == SessionState.CLOSED) { + return; + } + + updateState(SessionState.WAIT_SERVER_CLOSE); + this.closeReason = closeReason; + + // Not sending the CloseSessionRequest because cancel() will just drop it + stream.forceClose(closeReason.getDescription(), null); + // Listeners will be notified by dispatchStreamClosed + }); } @Override public void start(OpenSessionRequest req, Metadata headers, Listener sessionListener) { - synchronized (lock) { - debugTagTracer.checkPrecondition( - state == SessionState.NEW, - "session_start_wrong_state", - "Tried to start a started session, current state: %s", - state); - - logger.fine(String.format("Starting session %s", info.getLogName())); - tracer.onStart(); - - updateState(SessionState.STARTING); - openParams = OpenParams.create(headers, req); - this.sessionListener = sessionListener; - - SessionRequest wrappedReq = SessionRequest.newBuilder().setOpenSession(req).build(); - stream.start( - new SessionStream.Listener() { - @Override - public void onBeforeSessionStart(PeerInfo peerInfo) {} - - @Override - public void onMessage(SessionResponse message) { - dispatchResponseMessage(message); - } - - @Override - public void onClose(Status status, Metadata trailers) { - dispatchStreamClosed(status, trailers); - } - }, - headers); - - stream.sendMessage(wrappedReq); - } + sessionSyncContext.execute( + () -> { + debugTagTracer.checkPrecondition( + state == SessionState.NEW, + "session_start_wrong_state", + "Tried to start a started session, current state: %s", + state); + + logger.fine(String.format("Starting session %s", info.getLogName())); + tracer.onStart(); + + updateState(SessionState.STARTING); + openParams = OpenParams.create(headers, req); + this.sessionListener = sessionListener; + + SessionRequest wrappedReq = SessionRequest.newBuilder().setOpenSession(req).build(); + stream.start( + new SessionStream.Listener() { + @Override + public void onBeforeSessionStart(PeerInfo peerInfo) {} + + @Override + public void onMessage(SessionResponse message) { + sessionSyncContext.execute(() -> dispatchResponseMessage(message)); + } + + @Override + public void onClose(Status status, Metadata trailers) { + sessionSyncContext.execute(() -> dispatchStreamClosed(status, trailers)); + } + }, + headers); + + stream.sendMessage(wrappedReq); + }); } @Override public void close(CloseSessionRequest req) { logger.fine(String.format("Closing session %s for reason: %s", info.getLogName(), req)); - synchronized (lock) { - // Throw an exception because this is a bug and we dont have a listener - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_close_wrong_state", - "Session error: Caller tried to close session %s before starting it with the reason: %s", - info.getLogName(), - req); - - // Multiple close is a no-op - if (state.phase >= SessionState.CLOSING.phase) { - logger.fine( - String.format( - "Session error: Caller tried to close a session %s that is %s for reason: %s", - info.getLogName(), state, req)); - return; - } - - closeReason = req; - updateState(SessionState.CLOSING); - - if (currentRpc == null) { - startGracefulClose(); - } - } + sessionSyncContext.execute( + () -> { + // Throw an exception because this is a bug and we dont have a listener + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_close_wrong_state", + "Session error: Caller tried to close session %s before starting it with the reason:" + + " %s", + info.getLogName(), + req); + + // Multiple close is a no-op + if (state.phase >= SessionState.CLOSING.phase) { + logger.fine( + String.format( + "Session error: Caller tried to close a session %s that is %s for reason: %s", + info.getLogName(), state, req)); + return; + } + + closeReason = req; + updateState(SessionState.CLOSING); + + if (currentRpc == null) { + startGracefulClose(); + } + }); } /** Wraps the flow of closing a session. */ - @GuardedBy("lock") private void startGracefulClose() { debugTagTracer.checkPrecondition( state == SessionState.CLOSING, @@ -316,53 +402,87 @@ VRpc newCall(VRpcDescriptor descriptor) { "session_new_call_wrong_type", "wrong VRpc descriptor type"); - synchronized (lock) { - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_new_call_wrong_state", - "Session error: newCall called before start"); + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_new_call_wrong_state", + "Session error: newCall called before start"); - long rpcId = nextRpcId; - nextRpcId = Math.incrementExact(nextRpcId); - return new VRpcImpl<>(this, descriptor, rpcId, stream.getPeerInfo(), debugTagTracer); - } + long rpcId = nextRpcId.getAndIncrement(); + return new VRpcImpl<>(this, descriptor, rpcId, stream.getPeerInfo(), debugTagTracer); } @Override - public Status startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { - // start monitoring for heartbeat when the vrpc is started - this.nextHeartbeat = clock.instant().plus(heartbeatInterval); + public void startRpc(VRpcImpl rpc, VirtualRpcRequest payload) { + sessionSyncContext.execute( + () -> { + if (currentRpc != null) { + rpc.handleError( + VRpcResult.createUncommitedError( + Status.INTERNAL.withDescription( + "Session error: RPC multiplexing is not yet supported"))); + return; + } + if (state != SessionState.READY) { + rpc.handleError( + VRpcResult.createUncommitedError( + Status.INTERNAL.withDescription( + "Session error: Session was not ready, state = " + state))); + return; + } + + this.currentRpc = rpc; + stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); + this.nextHeartbeat = clock.instant().plus(heartbeatInterval); + }); + } - synchronized (lock) { - if (currentRpc != null) { - return Status.INTERNAL.withDescription( - "Session error: RPC multiplexing is not yet supported"); - } - if (state != SessionState.READY) { - return Status.INTERNAL.withDescription( - "Session error: Session was not ready, state = " + state); - } + @Override + public void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable cause) { + sessionSyncContext.execute( + () -> { + if (currentRpc != null && rpcId == currentRpc.rpcId) { + currentCancel = + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription(message).withCause(cause)); + } + // do nothing if the rpc is already finished + }); + } - this.currentRpc = rpc; - stream.sendMessage(SessionRequest.newBuilder().setVirtualRpc(payload).build()); - return Status.OK; + private void scheduleHeartbeatCheck() { + heartbeatTimeout = + timer.newTimeout( + () -> sessionSyncContext.execute(this::checkHeartbeat), + HEARTBEAT_CHECK_INTERVAL.toMillis(), + TimeUnit.MILLISECONDS); + } + + private void cancelHeartbeatTimeout() { + if (heartbeatTimeout != null) { + heartbeatTimeout.cancel(); + heartbeatTimeout = null; } } - @Override - public void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable cause) { - synchronized (lock) { - if (currentRpc != null && rpcId == currentRpc.rpcId) { - currentCancel = - VRpcResult.createRejectedError( - Status.CANCELLED.withDescription(message).withCause(cause)); - } - // do nothing if the rpc is already finished + // Runs on sessionSyncContext (dispatched from the wheel-timer tick body). Checks if the + // heartbeat deadline has passed and force-closes on miss; otherwise re-schedules. + private void checkHeartbeat() { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); + if (state.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { + return; } + if (clock.instant().isAfter(nextHeartbeat)) { + logger.warning( + String.format("Missed heartbeat for %s, forcing session close", info.getLogName())); + forceClose(MISSED_HEARTBEAT_CLOSE_REQUEST); + return; + } + scheduleHeartbeatCheck(); } // region SessionStream event handlers private void dispatchResponseMessage(SessionResponse message) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); switch (message.getPayloadCase()) { case OPEN_SESSION: handleOpenSessionResponse(message.getOpenSession()); @@ -392,219 +512,190 @@ private void dispatchResponseMessage(SessionResponse message) { } private void handleOpenSessionResponse(OpenSessionResponse openSession) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); logger.fine(String.format("%s Session is ready", info.getLogName())); - PeerInfo localPeerInfo; - - synchronized (lock) { - debugTagTracer.checkPrecondition( - state != SessionState.NEW, - "session_open_wrong_state", - "Got session open response before session started"); - debugTagTracer.checkPrecondition( - state != SessionState.CLOSED, - "session_open_wrong_state", - "Got session open response after session was closed"); + debugTagTracer.checkPrecondition( + state != SessionState.NEW, + "session_open_wrong_state", + "Got session open response before session started"); + debugTagTracer.checkPrecondition( + state != SessionState.CLOSED, + "session_open_wrong_state", + "Got session open response after session was closed"); - if (state != SessionState.STARTING) { - logger.fine(String.format("Stream was already %s when session open was received", state)); - return; - } - localPeerInfo = stream.getPeerInfo(); - updateState(SessionState.READY); + if (state != SessionState.STARTING) { + logger.fine(String.format("Stream was already %s when session open was received", state)); + return; } + PeerInfo localPeerInfo = stream.getPeerInfo(); + updateState(SessionState.READY); tracer.onOpen(localPeerInfo); sessionListener.onReady(openSession); + scheduleHeartbeatCheck(); } private void handleSessionParamsResponse(SessionParametersResponse resp) { - synchronized (lock) { - if (state.phase >= SessionState.CLOSING.phase) { - logger.fine( - String.format("Stream was already %s when session params were received", state)); - return; - } + if (state.phase >= SessionState.CLOSING.phase) { + logger.fine(String.format("Stream was already %s when session params were received", state)); + return; + } - if (!sessionParameters.equals(resp)) { - this.sessionParameters = resp; - this.heartbeatInterval = - Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); - logger.log( - Level.CONFIG, - () -> - String.format( - "%s session params changed: %s", - info.getLogName(), - TextFormat.printer().emittingSingleLine(true).printToString(resp))); - } + if (!sessionParameters.equals(resp)) { + this.sessionParameters = resp; + this.heartbeatInterval = + Duration.ofMillis(Durations.toMillis(sessionParameters.getKeepAlive())); + logger.log( + Level.CONFIG, + () -> + String.format( + "%s session params changed: %s", + info.getLogName(), + TextFormat.printer().emittingSingleLine(true).printToString(resp))); } } private void handleVRpcResponse(VirtualRpcResponse vrpc) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); // TODO: when stream is supported this should be updated to the next expected time instead of // session life time this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); - VRpcImpl localRpc; - VRpcResult localCancel; - - boolean needsClose; - synchronized (lock) { - if (state.phase > SessionState.CLOSING.phase) { - debugTagTracer.record( - TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); - logger.warning( - String.format( - "%s Discarding vRPC error because session is past the CLOSING phase with the" - + " reason: %s", - info.getLogName(), closeReason)); - return; - } - - debugTagTracer.checkPrecondition( - state == SessionState.READY || state == SessionState.CLOSING, - "session_vrpc_response_wrong_state", - "Unexpected vRPC response when session is %s", - state); - debugTagTracer.checkPrecondition( - currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); - debugTagTracer.checkPrecondition( - currentRpc.rpcId == vrpc.getRpcId(), - "session_vrpc_id_mismatch", - "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", - currentRpc.rpcId, - vrpc.getRpcId()); - - // reset state of the current rpc - localCancel = currentCancel; - currentCancel = null; - localRpc = currentRpc; - // TODO: handle multiplexing - currentRpc = null; - needsClose = (state == SessionState.CLOSING); + if (state.phase > SessionState.CLOSING.phase) { + debugTagTracer.record( + TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); + logger.warning( + String.format( + "%s Discarding vRPC error because session is past the CLOSING phase with the" + + " reason: %s", + info.getLogName(), closeReason)); + return; } - if (localCancel != null) { - tracer.onVRpcClose(localCancel.getStatus().getCode()); - localRpc.handleError(localCancel); + debugTagTracer.checkPrecondition( + state == SessionState.READY || state == SessionState.CLOSING, + "session_vrpc_response_wrong_state", + "Unexpected vRPC response when session is %s", + state); + debugTagTracer.checkPrecondition( + currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); + debugTagTracer.checkPrecondition( + currentRpc.rpcId == vrpc.getRpcId(), + "session_vrpc_id_mismatch", + "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", + currentRpc.rpcId, + vrpc.getRpcId()); + + // reset state of the current rpc + VRpcImpl rpc = currentRpc; + VRpcResult cancel = currentCancel; + // TODO: handle multiplexing + currentRpc = null; + currentCancel = null; + + if (cancel != null) { + tracer.onVRpcClose(cancel.getStatus().getCode()); + rpc.handleError(cancel); } else { tracer.onVRpcClose(Status.OK.getCode()); - localRpc.handleResponse(vrpc); + rpc.handleResponse(vrpc); } - if (needsClose) { - synchronized (lock) { - if (state == SessionState.CLOSING) { - startGracefulClose(); - } - } + if (state == SessionState.CLOSING) { + startGracefulClose(); } } private void handleHeartBeatResponse(HeartbeatResponse ignored) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); this.nextHeartbeat = clock.instant().plus(heartbeatInterval); } private void handleSessionRefreshConfigResponse(SessionRefreshConfig config) { - synchronized (lock) { - Metadata grpcMetadata = new Metadata(); - config - .getMetadataList() - .forEach( - entry -> - grpcMetadata.put( - Metadata.Key.of(entry.getKey(), Metadata.ASCII_STRING_MARSHALLER), - entry.getValue().toStringUtf8())); - openParams = OpenParams.create(grpcMetadata, config.getOptimizedOpenRequest()); - openParamsUpdated = true; - } + Metadata grpcMetadata = new Metadata(); + config + .getMetadataList() + .forEach( + entry -> + grpcMetadata.put( + Metadata.Key.of(entry.getKey(), Metadata.ASCII_STRING_MARSHALLER), + entry.getValue().toStringUtf8())); + openParams = OpenParams.create(grpcMetadata, config.getOptimizedOpenRequest()); + openParamsUpdated = true; } private void handleVRpcErrorResponse(ErrorResponse error) { + sessionSyncContext.throwIfNotInThisSynchronizationContext(); // Skips the heartbeat check when there's no active vrpc on the session this.nextHeartbeat = clock.instant().plus(FUTURE_TIME); - VRpcImpl localRpc; - boolean needsClose; - VRpcResult localCancel; - - synchronized (lock) { - if (state.phase > SessionState.CLOSING.phase) { - debugTagTracer.record( - TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); - logger.warning( - String.format( - "%s Discarding vRPC error because session is past the CLOSING phase with the" - + " reason: %s, error was: %s", - info.getLogName(), closeReason, error)); - return; - } - - debugTagTracer.checkPrecondition( - state == SessionState.READY || state == SessionState.CLOSING, - "session_vrpc_response_wrong_state", - "Unexpected vRPC response when session is %s", - state); - - debugTagTracer.checkPrecondition( - currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); - debugTagTracer.checkPrecondition( - currentRpc.rpcId == error.getRpcId(), - "session_vrpc_id_mismatch", - "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", - currentRpc.rpcId, - error.getRpcId()); - - // reset the state of the current rpc - localCancel = currentCancel; - currentCancel = null; - localRpc = currentRpc; - currentRpc = null; - needsClose = (state == SessionState.CLOSING); + if (state.phase > SessionState.CLOSING.phase) { + debugTagTracer.record( + TelemetryConfiguration.Level.WARN, "session_closed_discard_vrpc_response"); + logger.warning( + String.format( + "%s Discarding vRPC error because session is past the CLOSING phase with the" + + " reason: %s, error was: %s", + info.getLogName(), closeReason, error)); + return; } - if (localCancel != null) { - tracer.onVRpcClose(localCancel.getStatus().getCode()); - localRpc.handleError(localCancel); + debugTagTracer.checkPrecondition( + state == SessionState.READY || state == SessionState.CLOSING, + "session_vrpc_response_wrong_state", + "Unexpected vRPC response when session is %s", + state); + + debugTagTracer.checkPrecondition( + currentRpc != null, "session_vrpc_null", "Got vRPC response but current vRPC is unset"); + debugTagTracer.checkPrecondition( + currentRpc.rpcId == error.getRpcId(), + "session_vrpc_id_mismatch", + "Got vRPC response for the wrong vRPC: expect: %s, actual: %s", + currentRpc.rpcId, + error.getRpcId()); + + // reset the state of the current rpc + VRpcImpl rpc = currentRpc; + VRpcResult cancel = currentCancel; + currentRpc = null; + currentCancel = null; + + if (cancel != null) { + tracer.onVRpcClose(cancel.getStatus().getCode()); + rpc.handleError(cancel); } else { tracer.onVRpcClose(Status.fromCodeValue(error.getStatus().getCode()).getCode()); - localRpc.handleError(VRpcResult.createServerError(error)); + rpc.handleError(VRpcResult.createServerError(error)); } - if (needsClose) { - synchronized (lock) { - if (state == SessionState.CLOSING) { - startGracefulClose(); - } - } + if (state == SessionState.CLOSING) { + startGracefulClose(); } } private void handleGoAwayResponse(GoAwayResponse goAwayResponse) { - synchronized (lock) { - if (state.phase >= SessionState.CLOSING.phase) { - debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_go_away_ignored"); - logger.warning( - String.format( - "Session error: %s Ignoring goaway because session is %s", - info.getLogName(), state)); - return; - } + if (state.phase >= SessionState.CLOSING.phase) { + debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_go_away_ignored"); + logger.warning( + String.format( + "Session error: %s Ignoring goaway because session is %s", info.getLogName(), state)); + return; + } - debugTagTracer.checkPrecondition( - state.phase >= SessionState.STARTING.phase, - "session_go_away_wrong_state", - "Unexpected goaway when session is %s", - state); + debugTagTracer.checkPrecondition( + state.phase >= SessionState.STARTING.phase, + "session_go_away_wrong_state", + "Unexpected goaway when session is %s", + state); - updateState(SessionState.CLOSING); - closeReason = - CloseSessionRequest.newBuilder() - .setReason(CloseSessionReason.CLOSE_SESSION_REASON_GOAWAY) - .setDescription( - "Server sent GO_AWAY_" + goAwayResponse.getReason().toUpperCase(Locale.ENGLISH)) - .build(); - if (currentRpc == null) { - startGracefulClose(); - } + updateState(SessionState.CLOSING); + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_GOAWAY) + .setDescription( + "Server sent GO_AWAY_" + goAwayResponse.getReason().toUpperCase(Locale.ENGLISH)) + .build(); + if (currentRpc == null) { + startGracefulClose(); } sessionListener.onGoAway(goAwayResponse); } @@ -615,53 +706,79 @@ private void handleUnknownResponseMessage(SessionResponse message) { } private void dispatchStreamClosed(Status status, Metadata trailers) { - SessionState prevState; - VRpcImpl localVRpc; + SessionState prevState = state; - PeerInfo localPeerInfo; - synchronized (lock) { - prevState = state; + if (!status.isOk()) { + String augmentedDescription = + Optional.ofNullable(status.getDescription()).map(d -> d + ". ").orElse("") + + "PeerInfo: " + + formatPeerInfo(getPeerInfo()); - if (!status.isOk()) { - String augmentedDescription = - Optional.ofNullable(status.getDescription()).map(d -> d + ". ").orElse("") - + "PeerInfo: " - + formatPeerInfo(getPeerInfo()); + status = status.withDescription(augmentedDescription); + } - status = status.withDescription(augmentedDescription); - } + if (state == SessionState.WAIT_SERVER_CLOSE) { + logger.fine(String.format("%s closed normally with status %s", info.getLogName(), status)); + } else { + debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_abnormal_close"); + // Unexpected path + String msg = + String.format( + "Session error: %s session closed unexpectedly in state %s. Status: %s", + info.getLogName(), state, status); + logger.warning(msg); - if (state == SessionState.WAIT_SERVER_CLOSE) { - logger.fine(String.format("%s closed normally with status %s", info.getLogName(), status)); - } else { - debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_abnormal_close"); - // Unexpected path - String msg = - String.format( - "Session error: %s session closed unexpectedly in state %s. Status: %s", - info.getLogName(), state, status); - logger.warning(msg); - - if (state == SessionState.CLOSED) { - return; - } - - closeReason = - CloseSessionRequest.newBuilder() - .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) - .setDescription("Unexpected session close with status: " + status.getCode()) - .build(); + if (state == SessionState.CLOSED) { + return; } - localVRpc = currentRpc; - localPeerInfo = stream.getPeerInfo(); - currentRpc = null; - updateState(SessionState.CLOSED); + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) + .setDescription("Unexpected session close with status: " + status.getCode()) + .build(); } - if (localVRpc != null) { + VRpcImpl localVRpc = currentRpc; + currentRpc = null; + updateState(SessionState.CLOSED); + + notifyTerminalClose(status, trailers, localVRpc, prevState); + } + + /** + * Fan out terminal notifications to the in-flight vRpc, tracer, and session listener with local + * guards so a throw in one notification does not suppress the others. + * + *

Caller contract: must have already transitioned to {@link SessionState#CLOSED} and captured + * and cleared {@code currentRpc}. Callers should also set {@code closeReason}; if missing we + * synthesize a fallback here rather than throw, since throwing from this fan-out aborts the + * remaining notifications and (because the state is already CLOSED) defeats the + * sessionSyncContext uncaught handler's cleanup. + */ + private void notifyTerminalClose( + Status status, + Metadata trailers, + @Nullable VRpcImpl localRpc, + SessionState prevState) { + // Should never happen — matches the synthesizer in startGracefulClose. + if (closeReason == null) { + debugTagTracer.record(TelemetryConfiguration.Level.WARN, "session_close_no_reason"); + logger.log( + Level.WARNING, + String.format( + "%s notifyTerminalClose reached without a closeReason; status=%s", + info.getLogName(), status), + new IllegalStateException("notifyTerminalClose without closeReason")); + closeReason = + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_ERROR) + .setDescription("notifyTerminalClose reached without closeReason; status=" + status) + .build(); + } + if (localRpc != null) { try { - localVRpc.handleSessionClose(VRpcResult.createRemoteTransportError(status, trailers)); + localRpc.handleSessionClose(VRpcResult.createRemoteTransportError(status, trailers)); } catch (Throwable t) { logger.log( Level.WARNING, @@ -671,16 +788,55 @@ private void dispatchStreamClosed(Status status, Metadata trailers) { info.getLogName(), status), t); } - tracer.onVRpcClose(Status.UNAVAILABLE.getCode()); + try { + tracer.onVRpcClose(Status.UNAVAILABLE.getCode()); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Unhandled exception in tracer.onVRpcClose", info.getLogName()), + t); + } + } + try { + tracer.onClose(safeGetPeerInfo(), closeReason.getReason(), status); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format("Session error: %s Unhandled exception in tracer.onClose", info.getLogName()), + t); + } + if (sessionListener != null) { + try { + sessionListener.onClose(prevState, status, trailers); + } catch (Throwable t) { + logger.log( + Level.WARNING, + String.format( + "Session error: %s Unhandled exception in sessionListener.onClose", + info.getLogName()), + t); + } + } + } + + private PeerInfo safeGetPeerInfo() { + try { + return stream.getPeerInfo(); + } catch (Throwable t) { + return SessionStream.DISCONNECTED_PEER_INFO; } - tracer.onClose(localPeerInfo, closeReason.getReason(), status); - sessionListener.onClose(prevState, status, trailers); } - @GuardedBy("lock") private void updateState(SessionState newState) { this.state = newState; this.lastStateChangedAt = clock.instant(); + // Once we're past READY, no further heartbeat checks are useful: checkHeartbeat short-circuits + // on state.phase >= WAIT_SERVER_CLOSE. Cancel any pending tick to keep the wheel clean during + // session churn. + if (newState.phase >= SessionState.WAIT_SERVER_CLOSE.phase) { + cancelHeartbeatTimeout(); + } } private static String formatPeerInfo(PeerInfo peerInfo) { diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java index cba048fe0ce0..9beccf40fd85 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionList.java @@ -16,15 +16,12 @@ package com.google.cloud.bigtable.data.v2.internal.session; -import static com.google.bigtable.v2.CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_MISSED_HEARTBEAT; - import com.google.auto.value.AutoValue; import com.google.bigtable.v2.CloseSessionRequest; import com.google.bigtable.v2.PeerInfo; import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcResult; import com.google.cloud.bigtable.data.v2.internal.session.Session.SessionState; import com.google.common.annotations.VisibleForTesting; -import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.time.temporal.ChronoUnit; @@ -41,7 +38,6 @@ import java.util.Queue; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.concurrent.NotThreadSafe; @@ -69,12 +65,6 @@ class SessionList { private final Set allSessions = new HashSet<>(); private final Set inUseSessions = new HashSet<>(); - private final CloseSessionRequest missedHeartbeatCloseRequest = - CloseSessionRequest.newBuilder() - .setReason(CLOSE_SESSION_REASON_MISSED_HEARTBEAT) - .setDescription("missed heartbeat") - .build(); - // pool level statistics across all the afes private final PoolStats poolStats = new PoolStats(); @@ -145,20 +135,6 @@ void prune() { } } - void checkHeartbeat(Clock clock) { - Instant now = clock.instant(); - inUseSessions.forEach( - handle -> { - if (now.isAfter(handle.getSession().getNextHeartbeat())) { - LOG.log( - Level.WARNING, - "Missed heartbeat for {0}, forcing session close", - handle.getSession().getLogName()); - handle.getSession().forceClose(missedHeartbeatCloseRequest); - } - }); - } - @NotThreadSafe class SessionHandle { private final Session session; diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java index 0b8cd1cdaea6..9ff1d6ffe9f8 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPool.java @@ -20,6 +20,7 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc; import com.google.protobuf.Message; import io.grpc.Metadata; +import java.time.Duration; public interface SessionPool { void start(OpenReqT openReq, Metadata md); @@ -29,6 +30,12 @@ VRpc newCall( void close(CloseSessionRequest req); + /** + * Blocks until all sessions in this pool have terminated, or the timeout elapses. Must be called + * after {@link #close} to be meaningful. Returns true if drained, false on timeout. + */ + boolean awaitTerminated(Duration timeout) throws InterruptedException; + SessionPoolInfo getInfo(); int getConsecutiveUnimplementedFailures(); diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java index 35884cb74319..cfab32770dd7 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImpl.java @@ -59,13 +59,15 @@ import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Deque; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.Set; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; @@ -112,6 +114,10 @@ private enum PoolState { private final Watchdog watchdog; + // Shared by every SessionImpl, PendingVRpc, Watchdog, AFE pruner, and retry-create-session in + // this pool. One tick thread per Client (owned by Client); O(1) insert / O(1) cancel. + private final BigtableTimer timer; + @GuardedBy("this") private int consecutiveFailures = 0; @@ -122,14 +128,21 @@ private enum PoolState { */ private volatile int consecutiveUnimplementedFailures = 0; - private final ScheduledFuture afeListPruneTask; + // Self-rescheduling AFE-prune chain. Set by scheduleNextAfePrune; cancelled by close. + @GuardedBy("this") + @Nullable + private BigtableTimer.Timeout afeListPruneTimeout; - private final ScheduledFuture heartbeatMonitor; + @GuardedBy("this") + private boolean closed = false; - private final ScheduledExecutorService executorService; + // Completed when this pool has been close()d AND every session has reached the CLOSED terminal + // state. Drives Client.close()'s drain barrier so that listener.onClose tasks finish queueing + // onto userCallbackExecutor before that executor is shut down. + private final CompletableFuture drainedFuture = new CompletableFuture<>(); @GuardedBy("this") - private ScheduledFuture retryCreateSessionFuture = null; + private BigtableTimer.Timeout retryCreateSessionFuture = null; // TODO: get the max from ClientConfiguration @GuardedBy("this") @@ -140,6 +153,10 @@ private enum PoolState { private final DebugTagTracer debugTagTracer; + // @SuppressWarnings("GuardedBy"): error-prone flags writes to @GuardedBy("this") fields + // (sessions, picker, poolSizer, pendingRpcs, budget, retryCreateSessionFuture) inside the + // constructor without holding the monitor. This is safe because the object is not yet published + // to other threads — no external reference exists until the constructor returns. @SuppressWarnings("GuardedBy") public SessionPoolImpl( Metrics metrics, @@ -150,7 +167,7 @@ public SessionPoolImpl( CallOptions callOptions, SessionDescriptor sessionDescriptor, String name, - ScheduledExecutorService executorService) { + BigtableTimer timer) { this( metrics, featureFlags, @@ -160,10 +177,11 @@ public SessionPoolImpl( callOptions, sessionDescriptor, name, - executorService, + timer, createInitialBudget(configManager.getClientConfiguration())); } + // @SuppressWarnings("GuardedBy"): same rationale as the public constructor above. @SuppressWarnings("GuardedBy") @VisibleForTesting SessionPoolImpl( @@ -175,7 +193,7 @@ public SessionPoolImpl( CallOptions callOptions, SessionDescriptor sessionDescriptor, String name, - ScheduledExecutorService executorService, + BigtableTimer timer, SessionCreationBudget budget) { this.metrics = metrics; this.featureFlags = featureFlags; @@ -183,7 +201,9 @@ public SessionPoolImpl( this.factory = new SessionFactory(channelPool, sessionDescriptor.getMethodDescriptor(), callOptions); this.descriptor = sessionDescriptor; - this.executorService = executorService; + // Timer is owned by the caller (typically Client) and shared across pools — do NOT stop it + // in close(). + this.timer = timer; sessions = new SessionList(); LoadBalancingOptions lbOptions = @@ -205,29 +225,9 @@ public SessionPoolImpl( debugTagTracer = metrics.getDebugTagTracer(); // Watchdog checks for sessions in WAIT_SERVER_CLOSE state and runs every 5 minutes - watchdog = new Watchdog(this, executorService, Duration.ofMinutes(5), sessions, debugTagTracer); - // Heartbeat monitor checks for sessions in READY state with active vRPCs and runs more - // frequently - heartbeatMonitor = - executorService.scheduleAtFixedRate( - () -> { - synchronized (SessionPoolImpl.this) { - sessions.checkHeartbeat(Clock.systemUTC()); - } - }, - SessionImpl.HEARTBEAT_CHECK_INTERVAL.toMillis(), - SessionImpl.HEARTBEAT_CHECK_INTERVAL.toMillis(), - TimeUnit.MILLISECONDS); - afeListPruneTask = - executorService.scheduleAtFixedRate( - () -> { - synchronized (SessionPoolImpl.this) { - sessions.prune(); - } - }, - SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), - SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), - TimeUnit.MILLISECONDS); + watchdog = new Watchdog(this, timer, Duration.ofMinutes(5), sessions, debugTagTracer); + // Heartbeat monitoring is now done per-session via SessionImpl.scheduleHeartbeatCheck. + scheduleNextAfePrune(); this.budget = budget; @@ -244,6 +244,33 @@ public SessionPoolImpl( }); } + @GuardedBy("this") + private void scheduleNextAfePrune() { + if (closed) { + return; + } + afeListPruneTimeout = + timer.newTimeout( + this::runAfePruneAndReschedule, + SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(), + TimeUnit.MILLISECONDS); + } + + private void runAfePruneAndReschedule() { + synchronized (SessionPoolImpl.this) { + try { + if (closed) { + return; + } + sessions.prune(); + } catch (Throwable t) { + logger.log(Level.WARNING, "AFE prune tick threw; continuing", t); + } finally { + scheduleNextAfePrune(); + } + } + } + @Override public SessionPoolInfo getInfo() { return info; @@ -253,6 +280,7 @@ public SessionPoolInfo getInfo() { public void close(CloseSessionRequest req) { configListenerHandle.close(); + List> toCancel; synchronized (this) { if (poolState == PoolState.CLOSED) { logger.fine(String.format("Tried to close a closed SessionPool %s", info.getLogName())); @@ -261,18 +289,52 @@ public void close(CloseSessionRequest req) { logger.fine(String.format("Closing session pool %s for reason %s", info.getLogName(), req)); poolState = PoolState.CLOSED; + closed = true; - for (PendingVRpc pendingRpc : pendingRpcs) { - pendingRpc.cancel("SessionPool closed: " + req, null); + toCancel = new ArrayList<>(pendingRpcs); + pendingRpcs.clear(); + if (afeListPruneTimeout != null) { + afeListPruneTimeout.cancel(); + afeListPruneTimeout = null; } - afeListPruneTask.cancel(false); - heartbeatMonitor.cancel(false); if (retryCreateSessionFuture != null) { - retryCreateSessionFuture.cancel(false); + retryCreateSessionFuture.cancel(); retryCreateSessionFuture = null; } - watchdog.close(); + // Watchdog stays alive past close() so it can escalate any session that lingers in + // WAIT_SERVER_CLOSE during shutdown. awaitTerminated() takes ownership of closing it. sessions.close(req); + // If the pool had no sessions, drainedFuture would never be completed by onSessionClose. + if (sessions.getAllSessions().isEmpty()) { + drainedFuture.complete(null); + } + } + + // cancelWithResult trampolines through ctx.getExecutor() — required because the public + // cancel(String, Throwable) path asserts opExecutor affinity, but close() runs on the + // caller thread. + VRpcResult closeResult = + VRpcResult.createRejectedError( + Status.CANCELLED.withDescription("SessionPool closed: " + req)); + for (PendingVRpc pendingRpc : toCancel) { + pendingRpc.cancelWithResult(closeResult); + } + } + + @Override + public boolean awaitTerminated(Duration timeout) throws InterruptedException { + try { + drainedFuture.get(timeout.toNanos(), TimeUnit.NANOSECONDS); + return true; + } catch (TimeoutException e) { + return false; + } catch (ExecutionException e) { + // drainedFuture is only completed via .complete(null), never .completeExceptionally — + // a CancellationException would still be wrapped here. Treat as a bug. + throw new IllegalStateException("drainedFuture failed unexpectedly", e); + } finally { + // Close the watchdog on the way out — drained or timed out, its job is done. + watchdog.close(); } } @@ -345,7 +407,7 @@ private synchronized void createSession(OpenParams openParams) { try (Scope ignored = io.opentelemetry.context.Context.root().makeCurrent()) { SessionStream stream = factory.createNew(); - Session session = new SessionImpl(metrics, info, sessionNum++, stream); + Session session = new SessionImpl(metrics, info, sessionNum++, stream, timer); SessionHandle handle = sessions.newHandle(session); Metadata localMd = new Metadata(); @@ -388,9 +450,9 @@ private void maybeScheduleCreateSessionRetry() { retryIntervalMs = 1; } retryCreateSessionFuture = - executorService.schedule( + timer.newTimeout( () -> { - synchronized (this) { + synchronized (SessionPoolImpl.this) { retryCreateSessionFuture = null; if (poolState != PoolState.CLOSED && poolSizer.getScaleDelta() > 0) { createSession(openParams); @@ -472,6 +534,10 @@ private void onSessionClose( // If the pool is closed then there is nothing else to do // dont need to create a replacement session and pending vRpcs get cleaned up in close() if (poolState == PoolState.CLOSED) { + // Signal awaitTerminated() once the last session has drained. + if (sessions.getAllSessions().isEmpty()) { + drainedFuture.complete(null); + } return; } @@ -515,9 +581,9 @@ private void onSessionClose( status, trailers))); for (PendingVRpc vrpc : toBeClosed) { try { - vrpc.getListener().onClose(result); + vrpc.cancelWithResult(result); } catch (Throwable t) { - logger.log(Level.WARNING, "Exception when closing request", t); + logger.log(Level.WARNING, "Exception dispatching close to op executor", t); } } } @@ -526,10 +592,6 @@ private void onSessionClose( @GuardedBy("this") private void tryDrainPendingRpcs() { while (!pendingRpcs.isEmpty()) { - if (pendingRpcs.peek().isCancelled) { - pendingRpcs.pop(); - continue; - } Optional handle = picker.pickSession(); if (!handle.isPresent()) { break; @@ -545,11 +607,8 @@ private void tryDrainPendingRpcs() { Iterator> iter = pendingRpcs.iterator(); while (iter.hasNext()) { PendingVRpc vrpc = iter.next(); - // vrpcs that have started on a session gets closed in SessionImpl. Do not double close. - if (!vrpc.isCancelled && vrpc.realCall == null) { - iter.remove(); - toBeClosed.add(vrpc); - } + iter.remove(); + toBeClosed.add(vrpc); } return toBeClosed; } @@ -570,7 +629,6 @@ public synchronized VRpc(desc); } - @GuardedBy("this") private VRpc newRealCall( VRpcDescriptor desc, SessionHandle handle) { @@ -615,7 +673,7 @@ class PendingVRpc implements VRpc realCall; - private ScheduledFuture deadlineMonitor; + private BigtableTimer.Timeout deadlineMonitor; public PendingVRpc(VRpcDescriptor desc) { this.desc = desc; @@ -631,16 +689,20 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { this.req = req; this.ctx = ctx; this.listener = listener; - this.deadlineMonitor = monitorDeadline(executorService, ctx.getOperationInfo().getDeadline()); synchronized (SessionPoolImpl.this) { if (SessionPoolImpl.this.poolState != PoolState.STARTED) { - listener.onClose( + VRpcResult result = VRpcResult.createUncommitedError( Status.UNAVAILABLE.withCause( - new IllegalStateException("SessionPool is closed")))); + new IllegalStateException("SessionPool is closed"))); + ctx.getExecutor().execute(() -> listener.onClose(result)); return; } + // Only arm the deadline monitor after we've committed to queueing; otherwise the + // fast-fail early return above would leak a timer that fires later and emits a phantom + // tracer.onAttemptFinish on the Active state's stale listener. + this.deadlineMonitor = monitorDeadline(ctx.getOperationInfo().getDeadline()); pendingRpcs.add(this); if (logger.isLoggable(Level.FINE)) { @@ -660,9 +722,6 @@ public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { } } - // It's safe to call cancel on a vrpc more than once. It'll be a noop after the initial - // call. Cancelled vrpcs are removed from the pending vrpc queue the next time we - // drain the queue. @Override public void cancel(@Nullable String message, @Nullable Throwable cause) { Status status = Status.CANCELLED; @@ -675,30 +734,31 @@ public void cancel(@Nullable String message, @Nullable Throwable cause) { cancel(status, false); } - // Cancel could race with drainTo which sets the real call. Assign realCall to a NOOP_CALL - // so if drainTo gets called at the same time, it'll just get swallowed and we're only calling - // onClose once on the listener. The cancel could also come from deadline monitor when - // the deadline expires. In this case if the real call is already set, we want to real call - // to handle the deadline and return early. + // cancel() and drainTo() are sequenced via ctx.getExecutor() (a per-op SerializingExecutor), + // so isCancelled and realCall are owned exclusively by that executor — no pool lock needed. private void cancel(Status status, boolean onlyCancelPendingCall) { - boolean delegateToRealCall = true; synchronized (SessionPoolImpl.this) { - if (isCancelled) { - return; - } + pendingRpcs.remove(this); // eager removal; no-op if already drained + } + ctx.getExecutor().execute(() -> { + if (isCancelled) return; isCancelled = true; - if (realCall == null) { - this.realCall = NOOP_CALL; - delegateToRealCall = false; - } else if (onlyCancelPendingCall) { - return; + if (realCall != null) { + if (!onlyCancelPendingCall) { + realCall.cancel(status.getDescription(), status.getCause()); + } + } else { + listener.onClose(VRpcResult.createRejectedError(status)); } - } - if (delegateToRealCall) { - realCall.cancel(status.getDescription(), status.getCause()); - } else { - listener.onClose(VRpcResult.createRejectedError(status)); - } + }); + } + + void cancelWithResult(VRpcResult result) { + ctx.getExecutor().execute(() -> { + if (isCancelled) return; + isCancelled = true; + listener.onClose(result); + }); } @Override @@ -711,29 +771,33 @@ public void requestNext() { } private void drainTo(SessionHandle handle) { - synchronized (SessionPoolImpl.this) { - if (realCall == null) { - this.realCall = newRealCall(desc, handle); - } - } - this.realCall.start(req, ctx, listener); if (deadlineMonitor != null) { - deadlineMonitor.cancel(false); + deadlineMonitor.cancel(); } + ctx.getExecutor().execute(() -> { + if (isCancelled) { + SessionPoolImpl.this.onVRpcComplete( + handle, Duration.ZERO, VRpcResult.createRejectedError(Status.CANCELLED)); + return; + } + realCall = newRealCall(desc, handle); + realCall.start(req, ctx, listener); + }); } private VRpcListener getListener() { return listener; } - private ScheduledFuture monitorDeadline( - ScheduledExecutorService executorService, Deadline deadline) { - return executorService.schedule( + private BigtableTimer.Timeout monitorDeadline(Deadline deadline) { + // Body runs on the timer's bundled dispatcher (off the tick thread). + // onlyCancelPendingCall=true avoids racing with a user cancel that already attached a real + // call. + return timer.newTimeout( () -> - // This could race with user cancel. Setting onlyCancelPendingCall to true - // so that if the real call is set, this cancellation is going to be a noop. cancel( - Status.DEADLINE_EXCEEDED.withDescription("Deadline exceeded waiting for session"), + Status.DEADLINE_EXCEEDED.withDescription( + "Deadline exceeded waiting for session"), true), deadline.timeRemaining(TimeUnit.NANOSECONDS), TimeUnit.NANOSECONDS); @@ -744,33 +808,45 @@ static class Watchdog implements Runnable { private static final Logger LOG = Logger.getLogger(Watchdog.class.getName()); private final Object lock; - private final ScheduledExecutorService executor; + private final BigtableTimer timer; private final Duration interval; private final SessionList sessions; - private ScheduledFuture future; private final Clock clock; private final DebugTagTracer debugTagTracer; - // TODO: fix lock sharing + // Guards currentTimeout and watchdogClosed. Self-contained — never composed with any external + // lock. + private final Object scheduleLock = new Object(); + + @GuardedBy("scheduleLock") + private BigtableTimer.Timeout currentTimeout; + + @GuardedBy("scheduleLock") + private boolean watchdogClosed = false; + + // The `lock` parameter is the pool-wide monitor (SessionPoolImpl.this). It is typed as Object + // because Watchdog is a static nested class and cannot reference the outer instance type in its + // constructor signature without creating a circular dependency. Phase 5 will replace this with + // a properly typed lock once the per-AFE sharding model is established. public Watchdog( Object lock, - ScheduledExecutorService executor, + BigtableTimer timer, Duration interval, SessionList sessionList, DebugTagTracer debugTagTracer) { - this(lock, executor, interval, sessionList, debugTagTracer, Clock.systemUTC()); + this(lock, timer, interval, sessionList, debugTagTracer, Clock.systemUTC()); } @VisibleForTesting Watchdog( Object lock, - ScheduledExecutorService executor, + BigtableTimer timer, Duration interval, SessionList sessionList, DebugTagTracer debugTagTracer, Clock clock) { this.lock = lock; - this.executor = executor; + this.timer = timer; this.interval = interval; this.sessions = sessionList; this.debugTagTracer = debugTagTracer; @@ -778,16 +854,40 @@ public Watchdog( } public void start() { - future = - executor.scheduleAtFixedRate( - this, interval.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); + scheduleNext(); + } + + // Self-reschedule. Called once from start() and again at the end of each tick. + private void scheduleNext() { + synchronized (scheduleLock) { + if (watchdogClosed) return; + currentTimeout = + timer.newTimeout(this::runAndReschedule, interval.toMillis(), TimeUnit.MILLISECONDS); + } + } + + private void runAndReschedule() { + try { + run(); + } catch (Throwable t) { + // Preserve the watchdog across body exceptions — unlike scheduleAtFixedRate, which silently + // stops the schedule on the first exception, we keep going so a transient fault doesn't + // permanently disable session leak detection. + LOG.log(Level.WARNING, "Watchdog tick threw; continuing", t); + } finally { + scheduleNext(); + } } @Override public void run() { + // Snapshot under the pool lock: getAllSessions() returns the live HashSet, and pool state + // mutation (session create/close on another thread) during iteration would throw + // ConcurrentModificationException. Most common trigger is a heartbeat-miss cascade that + // churns sessions while the watchdog is walking the list. Set allSessions; synchronized (lock) { - allSessions = sessions.getAllSessions(); + allSessions = new HashSet<>(sessions.getAllSessions()); } for (SessionHandle handle : allSessions) { @@ -825,21 +925,14 @@ public void run() { } public void close() { - if (future != null) { - future.cancel(false); + synchronized (scheduleLock) { + watchdogClosed = true; + if (currentTimeout != null) { + currentTimeout.cancel(); + currentTimeout = null; + } } } } - private static final VRpc NOOP_CALL = - new VRpc() { - @Override - public void start(Object req, VRpcCallContext ctx, VRpcListener listener) {} - - @Override - public void cancel(@Nullable String message, @Nullable Throwable cause) {} - - @Override - public void requestNext() {} - }; } diff --git a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java index a2d841bd1a68..83228fd43456 100644 --- a/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java +++ b/java-bigtable/google-cloud-bigtable/src/main/java/com/google/cloud/bigtable/data/v2/internal/session/VRpcImpl.java @@ -54,7 +54,11 @@ class VRpcImpl rpc, VirtualRpcRequest payload); + /** + * Submit the vRPC for sending. Async: errors are delivered via {@link + * VRpcImpl#handleError(VRpcResult)}, which dispatches onto {@code ctx.getExecutor()}. + */ + void startRpc(VRpcImpl rpc, VirtualRpcRequest payload); void cancelRpc(long rpcId, @Nullable String message, @Nullable Throwable cause); } @@ -69,6 +73,7 @@ private enum State { private final VRpcDescriptor desc; final long rpcId; private VRpcListener listener; + private VRpcCallContext ctx; private PeerInfo peerInfo; private AtomicReference state; @@ -91,55 +96,47 @@ public VRpcImpl( @Override public void start(ReqT req, VRpcCallContext ctx, VRpcListener listener) { + if (!state.compareAndSet(State.NEW, State.STARTED)) { + // Lost the CAS — a duplicate start. Dispatch to the local listener/ctx without touching + // the shared fields, otherwise we'd corrupt the in-flight call owned by the CAS winner. + VRpcResult result = + VRpcResult.createRejectedError( + Status.INTERNAL.withDescription("VRpc already started in state: " + state.get())); + ctx.getExecutor().execute(() -> listener.onClose(result)); + return; + } + // Won the CAS — publish the fields. this.listener = listener; + this.ctx = ctx; - Status status; - boolean retryable = true; - - if (!state.compareAndSet(State.NEW, State.STARTED)) { - status = Status.INTERNAL.withDescription("VRpc already started in state: " + state.get()); - retryable = false; - } else if (ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.MICROSECONDS) + if (ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.MICROSECONDS) < TimeUnit.MILLISECONDS.toMicros(1)) { - // Don't send RPCs that don't have any hope of succeeding - status = - Status.DEADLINE_EXCEEDED.withDescription("Remaining deadline is too short to send RPC"); - retryable = false; - } else { - Metadata vRpcMetadata = - Metadata.newBuilder() - .setAttemptNumber(ctx.getOperationInfo().getAttemptNumber()) - .setTraceparent(ctx.getTraceParent()) - .build(); - ctx.getTracer().onRequestSent(peerInfo); - status = - session.startRpc( - this, - VirtualRpcRequest.newBuilder() - .setRpcId(rpcId) - .setMetadata(vRpcMetadata) - .setDeadline( - Durations.fromNanos( - ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.NANOSECONDS))) - .setPayload(desc.encode(req)) - .build()); - // if status is not OK, the session might not be ready and the vRPC can be retried on a - // different session + state.set(State.CLOSED); + VRpcResult result = + VRpcResult.createRejectedError( + Status.DEADLINE_EXCEEDED.withDescription( + "Remaining deadline is too short to send RPC")); + ctx.getExecutor().execute(() -> listener.onClose(result)); + return; } - if (!status.isOk()) { - debugTagTracer.checkPrecondition( - state.compareAndSet(State.STARTED, State.CLOSED), - "vrpc_incorrect_start_state", - "VRpc has incorrect state. Expected to be started but was %s", - state); - // TODO: loop through the session executor - if (retryable) { - listener.onClose(VRpcResult.createUncommitedError(status)); - } else { - listener.onClose(VRpcResult.createRejectedError(status)); - } - } + Metadata vRpcMetadata = + Metadata.newBuilder() + .setAttemptNumber(ctx.getOperationInfo().getAttemptNumber()) + .setTraceparent(ctx.getTraceParent()) + .build(); + ctx.getTracer().onRequestSent(peerInfo); + session.startRpc( + this, + VirtualRpcRequest.newBuilder() + .setRpcId(rpcId) + .setMetadata(vRpcMetadata) + .setDeadline( + Durations.fromNanos( + ctx.getOperationInfo().getDeadline().timeRemaining(TimeUnit.NANOSECONDS))) + .setPayload(desc.encode(req)) + .build()); + // Session delivers startRpc errors asynchronously via handleError() on ctx.getExecutor(). } void handleSessionClose(VRpcResult result) { @@ -147,8 +144,7 @@ void handleSessionClose(VRpcResult result) { logger.warning("tried to close a vRPC after it was already closed state: " + state.get()); return; } - - listener.onClose(result); + ctx.getExecutor().execute(() -> listener.onClose(result)); } void handleResponse(VirtualRpcResponse response) { @@ -159,38 +155,40 @@ void handleResponse(VirtualRpcResponse response) { } // TODO: handle streaming - RespT resp; - try { - resp = desc.decode(response.getPayload()); - } catch (Throwable e) { - // TODO: notify Session to cancel the vRPC - // Right now, vrpc streaming & cancellation is not supported, so notifying SessionImpl is - // unnecessary. In the future handleResponse will need to notify that Session that the user - // was already notified of the error and no further notifications should be delivered - VRpcResult result = - VRpcResult.createLocalTransportError( - Status.INTERNAL.withDescription("Failed to decode VRpc payload").withCause(e)); - listener.onClose(result); - return; - } - - try { - listener.onMessage(resp); - } catch (Throwable e) { - VRpcResult result = VRpcResult.createUserError(e); - listener.onClose(result); - return; - } - - listener.onClose(VRpcResult.createServerOk(response)); + // Decode + callback fan-out all run on the op executor: keeps the (potentially heavy) decode + // off the session sync context, and gives every callback a single dispatcher. + ctx.getExecutor() + .execute( + () -> { + RespT resp; + try { + resp = desc.decode(response.getPayload()); + } catch (Throwable e) { + // TODO: notify Session to cancel the vRPC + listener.onClose( + VRpcResult.createLocalTransportError( + Status.INTERNAL + .withDescription("Failed to decode VRpc payload") + .withCause(e))); + return; + } + try { + listener.onMessage(resp); + } catch (Throwable e) { + listener.onClose(VRpcResult.createUserError(e)); + return; + } + listener.onClose(VRpcResult.createServerOk(response)); + }); } void handleError(VRpcResult result) { - if (state.getAndSet(State.CLOSED) == State.CLOSED) { + // CAS STARTED -> CLOSED, matching handleResponse / handleSessionClose. The previous + // getAndSet(CLOSED) would proceed from NEW and dereference null ctx/listener fields. + if (!state.compareAndSet(State.STARTED, State.CLOSED)) { return; } - - listener.onClose(result); + ctx.getExecutor().execute(() -> listener.onClose(result)); } @Override diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java index 641a63f0a573..eb2e0f78c8a0 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/ClientTest.java @@ -55,7 +55,9 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +@Timeout(30) public class ClientTest { private ClientConfiguration defaultConfig; @@ -105,6 +107,41 @@ void tearDown() { executor.shutdownNow(); } + @Test + public void openAfterCloseThrows() { + client.close(); + + IllegalStateException tableEx = + assertThrows( + IllegalStateException.class, + () -> client.openTableAsync("fake-table", OpenTableRequest.Permission.PERMISSION_READ)); + assertThat(tableEx).hasMessageThat().contains("closed"); + + IllegalStateException viewEx = + assertThrows( + IllegalStateException.class, + () -> + client.openAuthorizedViewAsync( + "fake-table", + "fake-view", + OpenAuthorizedViewRequest.Permission.PERMISSION_READ)); + assertThat(viewEx).hasMessageThat().contains("closed"); + + IllegalStateException mvEx = + assertThrows( + IllegalStateException.class, + () -> + client.openMaterializedViewAsync( + "fake-view", OpenMaterializedViewRequest.Permission.PERMISSION_READ)); + assertThat(mvEx).hasMessageThat().contains("closed"); + } + + @Test + public void closeIsIdempotent() { + client.close(); + client.close(); // must not throw or hang + } + @Test public void testRequestFails() { TableAsync table = diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java index 54406abe51a8..c88895e4e370 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/api/TableBaseTest.java @@ -29,19 +29,23 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc; import com.google.cloud.bigtable.data.v2.internal.session.SessionPool; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolInfo; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.VRpcDescriptor; import com.google.protobuf.Message; import io.grpc.Deadline; import io.grpc.Metadata; +import java.time.Duration; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +@Timeout(30) @ExtendWith(MockitoExtension.class) public class TableBaseTest { @@ -50,6 +54,7 @@ public class TableBaseTest { private final Metrics noopMetrics = new NoopMetrics(); private final ScheduledExecutorService mockExecutor = Mockito.mock(ScheduledExecutorService.class); + private final BigtableTimer mockTimer = Mockito.mock(BigtableTimer.class); private static final ClientInfo clientInfo = ClientInfo.builder() .setInstanceName( @@ -70,7 +75,8 @@ public void setup() { VRpcDescriptor.READ_ROW, VRpcDescriptor.MUTATE_ROW, noopMetrics, - mockExecutor); + mockTimer, + com.google.common.util.concurrent.MoreExecutors.directExecutor()); deadline = Deadline.after(1, TimeUnit.MINUTES); f = new UnaryResponseFuture<>(); } @@ -172,6 +178,11 @@ public void start(OpenTableRequest openReq, Metadata md) {} @Override public void close(CloseSessionRequest req) {} + @Override + public boolean awaitTerminated(Duration timeout) { + return true; + } + @Override public SessionPoolInfo getInfo() { return SessionPoolInfo.create(clientInfo, VRpcDescriptor.TABLE_SESSION, "fake-pool"); diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java index 62a802dfeb0d..ec7dc6a01245 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/csm/tracers/VRpcTracerTest.java @@ -45,8 +45,10 @@ import com.google.cloud.bigtable.data.v2.internal.middleware.RetryingVRpc; import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc; import com.google.cloud.bigtable.data.v2.internal.session.FakeDescriptor; +import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.Session; import com.google.cloud.bigtable.data.v2.internal.session.SessionFactory; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionImpl; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolInfo; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeServiceBuilder; @@ -97,6 +99,7 @@ public class VRpcTracerTest { Correspondence.transforming(MetricData::getName, "MetricData name"); private ScheduledExecutorService executor; + private BigtableTimer timer; private Server server; private ChannelPool channelPool; @@ -115,6 +118,7 @@ public class VRpcTracerTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + timer = new NettyWheelTimer("vrpc-tracer-test", com.google.common.util.concurrent.MoreExecutors.directExecutor()); server = FakeServiceBuilder.create(new FakeSessionService(executor)) .intercept(new PeerInfoInterceptor()) @@ -157,7 +161,7 @@ void setUp() throws IOException { SessionFactory sessionFactory = new SessionFactory( channelPool, FakeDescriptor.FAKE_SESSION.getMethodDescriptor(), CallOptions.DEFAULT); - session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); } @AfterEach @@ -173,6 +177,7 @@ void tearDown() { metrics.close(); server.shutdownNow(); executor.shutdownNow(); + timer.stop(); } @Test @@ -199,7 +204,7 @@ public void operationLatencyTest() throws Exception { CompletableFuture opFinished = new CompletableFuture<>(); Stopwatch stopwatch = Stopwatch.createStarted(); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture userFuture = new UnaryResponseFuture<>(); MethodInfo methodInfo = MethodInfo.builder().setName("Bigtable.ReadRow").setStreaming(false).build(); @@ -258,7 +263,7 @@ public void attemptLatencyTest() throws Exception { AtomicLong maxAttemptLatency = new AtomicLong(); DelayedVRpc delayedVRpc = new DelayedVRpc<>( - () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor)); + () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer)); UnaryResponseFuture userFuture = new UnaryResponseFuture<>(); MethodInfo methodInfo = MethodInfo.builder().setName("Bigtable.ReadRow").setStreaming(false).build(); @@ -316,7 +321,7 @@ public void retryCountTest() throws Exception { // Test RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); CompletableFuture opFinished = new CompletableFuture<>(); MethodInfo methodInfo = @@ -367,7 +372,7 @@ public void clientBlockingLatencySessionDelayTest() throws Exception { // Test DelayedVRpc delayedVRpc = new DelayedVRpc<>( - () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor)); + () -> new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer)); UnaryResponseFuture f = new UnaryResponseFuture<>(); CompletableFuture attemptFinished = new CompletableFuture<>(); MethodInfo methodInfo = diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java index b9af925d4871..e823db5b3aea 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/RetryingVRpcTest.java @@ -38,9 +38,11 @@ import com.google.cloud.bigtable.data.v2.internal.csm.attributes.ClientInfo; import com.google.cloud.bigtable.data.v2.internal.csm.tracers.VRpcTracer; import com.google.cloud.bigtable.data.v2.internal.session.FakeDescriptor; +import com.google.cloud.bigtable.data.v2.internal.session.NettyWheelTimer; import com.google.cloud.bigtable.data.v2.internal.session.SessionFactory; import com.google.cloud.bigtable.data.v2.internal.session.SessionImpl; import com.google.cloud.bigtable.data.v2.internal.session.SessionPoolInfo; +import com.google.cloud.bigtable.data.v2.internal.session.BigtableTimer; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeServiceBuilder; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeSessionListener; import com.google.cloud.bigtable.data.v2.internal.session.fake.FakeSessionService; @@ -74,6 +76,7 @@ @ExtendWith(MockitoExtension.class) public class RetryingVRpcTest { private ScheduledExecutorService executor; + private BigtableTimer timer; private Server server; private ChannelPool channelPool; @@ -88,6 +91,7 @@ public class RetryingVRpcTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + timer = new NettyWheelTimer("retrying-vrpc-test", com.google.common.util.concurrent.MoreExecutors.directExecutor()); server = FakeServiceBuilder.create(new FakeSessionService(executor)) .intercept(new PeerInfoInterceptor()) @@ -117,11 +121,12 @@ void tearDown() { channelPool.close(); server.shutdownNow(); executor.shutdownNow(); + timer.stop(); } @Test void noRetryTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); OpenSessionRequest openSessionRequest = @@ -133,7 +138,7 @@ void noRetryTest() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(0).build(), @@ -152,7 +157,7 @@ void noRetryTest() throws Exception { @Test public void retryServerError() throws Exception { int requestTag = 1; - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -199,7 +204,7 @@ public void retryServerError() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(requestTag).build(), @@ -218,7 +223,7 @@ public void retryServerError() throws Exception { @Test public void retryDeadlineRespectedTest() throws Exception { int requestTag = 1; - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -274,7 +279,7 @@ public void retryDeadlineRespectedTest() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(requestTag).build(), @@ -295,7 +300,7 @@ public void retryDeadlineRespectedTest() throws Exception { @Test public void vRpcFailureTest() throws Exception { // vrpc error on the session should not close the stream - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -333,7 +338,7 @@ public void vRpcFailureTest() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(0).build(), @@ -353,7 +358,7 @@ public void vRpcFailureTest() throws Exception { @Test void cancelInScheduledState() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -392,7 +397,7 @@ void cancelInScheduledState() throws Exception { .isInstanceOf(OpenSessionResponse.class); RetryingVRpc retrying = - new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), executor); + new RetryingVRpc<>(() -> session.newCall(FakeDescriptor.SCRIPTED), timer); UnaryResponseFuture f = new UnaryResponseFuture<>(); retrying.start( SessionFakeScriptedRequest.newBuilder().setTag(1).build(), diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java new file mode 100644 index 000000000000..a69bed4c0462 --- /dev/null +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/middleware/VOperationImplTest.java @@ -0,0 +1,155 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.cloud.bigtable.data.v2.internal.middleware; + +import static com.google.common.truth.Truth.assertThat; + +import com.google.cloud.bigtable.data.v2.internal.csm.NoopMetrics; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcCallContext; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcListener; +import com.google.cloud.bigtable.data.v2.internal.middleware.VRpc.VRpcResult; +import com.google.common.util.concurrent.MoreExecutors; +import io.grpc.Context; +import io.grpc.Deadline; +import io.grpc.Status; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import javax.annotation.Nullable; +import org.junit.jupiter.api.Test; + +class VOperationImplTest { + + @Test + void grpcContextCancelPropagatesToChain() throws InterruptedException { + // Normal-path sanity: addListener fires (chain.start doesn't queue an onClose), then a + // later grpcContext.cancel routes through cancellationListener -> opExecutor -> chain.cancel. + // This exercises the new runInline-based addListener path introduced to close the TOCTOU + // between wrapped.closed and addListener. + Context.CancellableContext grpcContext = Context.current().withCancellation(); + AtomicInteger chainCancelCount = new AtomicInteger(); + CountDownLatch cancelLatch = new CountDownLatch(1); + + VRpc chain = + new VRpc() { + @Override + public void start(String req, VRpcCallContext ctx, VRpcListener listener) { + // Hold open — do not call listener.onClose. Cancellation must drive termination. + } + + @Override + public void cancel(@Nullable String msg, @Nullable Throwable cause) { + chainCancelCount.incrementAndGet(); + cancelLatch.countDown(); + } + + @Override + public void requestNext() {} + }; + + VOperationImpl op = + new VOperationImpl<>( + chain, + grpcContext, + MoreExecutors.directExecutor(), + NoopMetrics.NoopVrpcTracer.INSTANCE, + Deadline.after(10, TimeUnit.SECONDS), + true); + + op.start( + "req", + new VRpcListener() { + @Override + public void onMessage(String msg) {} + + @Override + public void onClose(VRpcResult result) {} + }); + + grpcContext.cancel(Status.CANCELLED.withDescription("test").asException()); + + assertThat(cancelLatch.await(2, TimeUnit.SECONDS)).isTrue(); + assertThat(chainCancelCount.get()).isEqualTo(1); + } + + @Test + void asyncOnCloseFromChainDoesNotPropagateLaterContextCancel() throws InterruptedException { + // Regression for the wrapped.closed TOCTOU. When chain.start asynchronously queues an + // onClose via the op executor, the addListener task (also queued through the op executor + // by VOperationImpl.start) drains FIFO after the onClose and observes wrapped.closed=true, + // so the cancellationListener is NOT registered. A later grpcContext.cancel therefore has + // no path to reach chain.cancel — which is correct because the chain has already terminated. + Context.CancellableContext grpcContext = Context.current().withCancellation(); + AtomicInteger chainCancelCount = new AtomicInteger(); + AtomicReference userClose = new AtomicReference<>(); + CountDownLatch onCloseLatch = new CountDownLatch(1); + + VRpc chain = + new VRpc() { + @Override + public void start(String req, VRpcCallContext ctx, VRpcListener listener) { + // Simulate PendingVRpc pool-closed branch / VRpcImpl deadline short-circuit. + ctx.getExecutor() + .execute( + () -> + listener.onClose( + VRpcResult.createUncommitedError( + Status.UNAVAILABLE.withDescription("fast-fail")))); + } + + @Override + public void cancel(@Nullable String msg, @Nullable Throwable cause) { + chainCancelCount.incrementAndGet(); + } + + @Override + public void requestNext() {} + }; + + VOperationImpl op = + new VOperationImpl<>( + chain, + grpcContext, + MoreExecutors.directExecutor(), + NoopMetrics.NoopVrpcTracer.INSTANCE, + Deadline.after(10, TimeUnit.SECONDS), + true); + + op.start( + "req", + new VRpcListener() { + @Override + public void onMessage(String msg) {} + + @Override + public void onClose(VRpcResult result) { + userClose.set(result); + onCloseLatch.countDown(); + } + }); + + assertThat(onCloseLatch.await(2, TimeUnit.SECONDS)).isTrue(); + assertThat(userClose.get().getStatus().getCode()).isEqualTo(Status.UNAVAILABLE.getCode()); + + grpcContext.cancel(Status.CANCELLED.withDescription("test").asException()); + Thread.sleep(50); // give any leaked listener a chance to fire + + // No chain.cancel — the cancellationListener was correctly skipped because the chain had + // already reached its terminal state via the queued onClose. + assertThat(chainCancelCount.get()).isEqualTo(0); + } +} diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java index 838cea0e2ffe..ddbf697b9e8e 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionImplTest.java @@ -70,21 +70,26 @@ import java.io.IOException; import java.time.Duration; import java.time.Instant; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Answers; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +@Timeout(30) @ExtendWith(MockitoExtension.class) public class SessionImplTest { private ScheduledExecutorService executor; + private BigtableTimer timer; @Mock(answer = Answers.RETURNS_DEEP_STUBS) private Metrics metrics; @@ -98,6 +103,7 @@ public class SessionImplTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + timer = new NettyWheelTimer("session-impl-test", com.google.common.util.concurrent.MoreExecutors.directExecutor()); server = FakeServiceBuilder.create(new FakeSessionService(executor)) .intercept(new PeerInfoInterceptor()) @@ -128,12 +134,13 @@ void setUp() throws IOException { void tearDown() { channelPool.close(); server.shutdownNow(); + timer.stop(); executor.shutdownNow(); } @Test void sessionSendAndCloseTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); OpenSessionRequest openSessionRequest = @@ -163,7 +170,7 @@ void sessionSendAndCloseTest() throws Exception { @Test void sessionCloseBeforeInit() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); OpenSessionRequest openSessionRequest = @@ -180,7 +187,7 @@ void sessionCloseBeforeInit() throws Exception { @Test void sessionGoAwayTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); Duration goAwayDelay = Duration.ofMillis(500); FakeSessionListener sessionListener = new FakeSessionListener(); @@ -268,7 +275,7 @@ void sessionGoAwayTest() throws Exception { @Test void streamErrorDuringRpcTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); Status.Code actualCode = Status.Code.INTERNAL; @@ -337,7 +344,7 @@ void streamErrorDuringRpcTest() throws Exception { @Test void rpcErrorDuringRpcTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); com.google.rpc.Status expectedRpcStatus = com.google.rpc.Status.newBuilder() @@ -404,7 +411,7 @@ void rpcErrorDuringRpcTest() throws Exception { @Test void localErrorTest() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); FakeSessionListener sessionListener = new FakeSessionListener(); session.start( @@ -451,7 +458,8 @@ void testHeartbeat() throws Exception { Instant time = clock.instant(); - SessionImpl session = new SessionImpl(metrics, clock, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = + new SessionImpl(metrics, clock, poolInfo, 0, sessionFactory.createNew(), timer); int keepAliveDurationMs = 150; @@ -490,8 +498,14 @@ void testHeartbeat() throws Exception { VRpcCallContext.create(Deadline.after(1, TimeUnit.MINUTES), true, tracer), f); - assertThat(session.getNextHeartbeat()) - .isEqualTo(time.plus(Duration.ofMillis(keepAliveDurationMs))); + // startRpc() is now async; poll until sessionSyncContext processes it. + Instant expectedHeartbeat = time.plus(Duration.ofMillis(keepAliveDurationMs)); + Stopwatch sw = Stopwatch.createStarted(); + while (!session.getNextHeartbeat().equals(expectedHeartbeat) + && sw.elapsed(TimeUnit.SECONDS) < 5) { + Thread.sleep(10); + } + assertThat(session.getNextHeartbeat()).isEqualTo(expectedHeartbeat); assertThat(f.get()).isEqualTo(SessionFakeScriptedResponse.getDefaultInstance()); @@ -507,7 +521,7 @@ void testHeartbeat() throws Exception { @Test void testCancel() throws Exception { - SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew()); + SessionImpl session = new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); int responseDelayMs = 200; // Configure the fake service to delay the response, giving us time to cancel it @@ -559,4 +573,147 @@ void testCancel() throws Exception { session.close(CloseSessionRequest.getDefaultInstance()); sessionListener.popUntil(Status.class); } + + // region uncaught-exception abort behaviors + + @Test + void abortFiresWhenListenerOnReadyThrows() throws Exception { + SessionImpl session = + new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); + + CountDownLatch onCloseLatch = new CountDownLatch(1); + AtomicReference capturedStatus = + new AtomicReference<>(); + + Session.Listener throwingListener = + new Session.Listener() { + @Override + public void onReady(OpenSessionResponse msg) { + throw new RuntimeException("simulated onReady failure"); + } + + @Override + public void onGoAway(GoAwayResponse msg) {} + + @Override + public void onClose(Session.SessionState prevState, Status status, Metadata trailers) { + capturedStatus.set(status); + onCloseLatch.countDown(); + } + }; + + session.start( + OpenSessionRequest.newBuilder() + .setPayload(OpenFakeSessionRequest.getDefaultInstance().toByteString()) + .build(), + new Metadata(), + throwingListener); + + // The abort path must drive the session to CLOSED and notify the listener via onClose, even + // though the original onReady threw. + assertWithMessage("listener.onClose must be invoked after onReady throws") + .that(onCloseLatch.await(5, TimeUnit.SECONDS)) + .isTrue(); + assertThat(session.getState()).isEqualTo(Session.SessionState.CLOSED); + assertThat(capturedStatus.get().getCode()).isEqualTo(Status.Code.INTERNAL); + } + + @Test + void abortDoesNotHangWhenListenerOnCloseThrows() throws Exception { + SessionImpl session = + new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); + + CountDownLatch onReadyLatch = new CountDownLatch(1); + CountDownLatch onCloseLatch = new CountDownLatch(1); + + Session.Listener throwingListener = + new Session.Listener() { + @Override + public void onReady(OpenSessionResponse msg) { + onReadyLatch.countDown(); + } + + @Override + public void onGoAway(GoAwayResponse msg) {} + + @Override + public void onClose(Session.SessionState prevState, Status status, Metadata trailers) { + onCloseLatch.countDown(); + throw new RuntimeException("simulated onClose failure"); + } + }; + + session.start( + OpenSessionRequest.newBuilder() + .setPayload(OpenFakeSessionRequest.getDefaultInstance().toByteString()) + .build(), + new Metadata(), + throwingListener); + + assertThat(onReadyLatch.await(5, TimeUnit.SECONDS)).isTrue(); + + // Close normally. The listener's onClose throws — the local guard inside notifyTerminalClose + // must swallow it so the SyncContext drain doesn't recurse infinitely or hang. + session.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("test") + .build()); + + assertWithMessage("listener.onClose should be invoked exactly once during normal close") + .that(onCloseLatch.await(5, TimeUnit.SECONDS)) + .isTrue(); + + // The session should reach CLOSED state cleanly within the test timeout. + Stopwatch sw = Stopwatch.createStarted(); + while (session.getState() != Session.SessionState.CLOSED && sw.elapsed().getSeconds() < 5) { + Thread.sleep(10); + } + assertThat(session.getState()).isEqualTo(Session.SessionState.CLOSED); + } + + @Test + void abortDoesNotInfiniteLoopWhenRecoveryListenerAlsoThrows() throws Exception { + SessionImpl session = + new SessionImpl(metrics, poolInfo, 0, sessionFactory.createNew(), timer); + + CountDownLatch onCloseInvoked = + new CountDownLatch(1); + + Session.Listener doublyThrowingListener = + new Session.Listener() { + @Override + public void onReady(OpenSessionResponse msg) { + throw new RuntimeException("simulated onReady failure"); + } + + @Override + public void onGoAway(GoAwayResponse msg) {} + + @Override + public void onClose(Session.SessionState prevState, Status status, Metadata trailers) { + onCloseInvoked.countDown(); + throw new RuntimeException("simulated onClose failure during abort"); + } + }; + + session.start( + OpenSessionRequest.newBuilder() + .setPayload(OpenFakeSessionRequest.getDefaultInstance().toByteString()) + .build(), + new Metadata(), + doublyThrowingListener); + + // onReady throws → abort fires → abort calls onClose, which also throws → Guard 4 swallows + // and isAborting prevents the handler from re-driving abort. The session must reach CLOSED + // without hanging (the @Timeout(30) on the class is the safety net for infinite loops). + assertThat(onCloseInvoked.await(5, TimeUnit.SECONDS)).isTrue(); + Stopwatch sw = Stopwatch.createStarted(); + while (session.getState() != Session.SessionState.CLOSED && sw.elapsed().getSeconds() < 5) { + Thread.sleep(10); + } + assertThat(session.getState()).isEqualTo(Session.SessionState.CLOSED); + } + + // endregion } diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java index ea142d8a7449..47a1cacf8a97 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/SessionPoolImplTest.java @@ -21,6 +21,8 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyLong; import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.longThat; +import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; @@ -79,7 +81,9 @@ import java.time.Instant; import java.util.List; import java.util.concurrent.CompletableFuture; +import java.util.function.LongPredicate; import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; @@ -89,6 +93,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Answers; import org.mockito.ArgumentCaptor; @@ -96,7 +101,7 @@ import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -@Nested +@Timeout(30) @ExtendWith(MockitoExtension.class) public class SessionPoolImplTest { private static final ClientInfo CLIENT_INFO = @@ -110,6 +115,7 @@ public class SessionPoolImplTest { Correspondence.transforming(SessionRequest::getOpenSession, "open session"); private ScheduledExecutorService executor; + private NettyWheelTimer testTimer; @Mock(answer = Answers.RETURNS_DEEP_STUBS) private Metrics metrics; @@ -126,6 +132,7 @@ public class SessionPoolImplTest { @BeforeEach void setUp() throws IOException { executor = Executors.newScheduledThreadPool(4); + testTimer = new NettyWheelTimer("test-timer", com.google.common.util.concurrent.MoreExecutors.directExecutor()); fakeService = new FakeSessionService(executor); headerInterceptor = new HeaderInterceptor(); server = @@ -159,20 +166,24 @@ void setUp() throws IOException { CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - executor); + testTimer); } @AfterEach - void tearDown() { + void tearDown() throws InterruptedException { sessionPool.close( CloseSessionRequest.newBuilder() .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) .setDescription("close session") .build()); + // Wait for sessions to drain so the watchdog can be closed before testTimer.stop() races + // with its self-reschedule loop. + sessionPool.awaitTerminated(Duration.ofSeconds(10)); channelPool.close(); // channel gets shutdown in channelPool.close() server.shutdownNow(); executor.shutdownNow(); + testTimer.stop(); } @Test @@ -239,7 +250,7 @@ void pendingVRpcDrainTest() throws ExecutionException, InterruptedException, Tim CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - executor); + testTimer); // session ack should be delayed by at least 10ms testSessionPool.start(OpenFakeSessionRequest.getDefaultInstance(), new Metadata()); @@ -272,6 +283,85 @@ public void onClose(VRpcResult result) { } } + @Test + void awaitTerminatedReturnsTrueWhenPoolIsEmpty() throws InterruptedException { + // A pool that was never started has no sessions; close() should complete drainedFuture + // immediately and awaitTerminated should return true without blocking. + sessionPool.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("empty pool") + .build()); + assertThat(sessionPool.awaitTerminated(Duration.ofMillis(100))).isTrue(); + } + + @Test + void awaitTerminatedReturnsTrueAfterSessionsDrain() + throws InterruptedException, ExecutionException, TimeoutException { + // Start a real session, issue + complete a vRPC so the session is fully open, then close + // the pool and verify awaitTerminated returns true (sessions cleanly drained). + sessionPool.start(OpenFakeSessionRequest.getDefaultInstance(), new Metadata()); + + VRpc vrpc = + sessionPool.newCall(FakeDescriptor.SCRIPTED); + UnaryResponseFuture resultFuture = new UnaryResponseFuture<>(); + vrpc.start( + SessionFakeScriptedRequest.getDefaultInstance(), + VRpcCallContext.create(Deadline.after(10, TimeUnit.SECONDS), true, vrpcTracer), + resultFuture); + resultFuture.get(10, TimeUnit.SECONDS); + + sessionPool.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("after drain") + .build()); + + assertThat(sessionPool.awaitTerminated(Duration.ofSeconds(10))).isTrue(); + } + + @Test + void pendingVRpcOnClosedPoolDoesNotLeakDeadlineMonitor() throws InterruptedException { + // Regression: PendingVRpc.start used to arm the deadline timer before the pool-state + // check, so the fast-fail "pool closed" branch leaked an armed timer that fired later + // and called listener.onClose a second time with DEADLINE_EXCEEDED. + sessionPool.close( + CloseSessionRequest.newBuilder() + .setReason(CloseSessionRequest.CloseSessionReason.CLOSE_SESSION_REASON_USER) + .setDescription("close before issuing rpc") + .build()); + + CopyOnWriteArrayList closes = new CopyOnWriteArrayList<>(); + CountDownLatch firstClose = new CountDownLatch(1); + Duration deadline = Duration.ofMillis(100); + + VRpc vrpc = + sessionPool.newCall(FakeDescriptor.SCRIPTED); + vrpc.start( + SessionFakeScriptedRequest.getDefaultInstance(), + VRpcCallContext.create( + Deadline.after(deadline.toMillis(), TimeUnit.MILLISECONDS), true, vrpcTracer), + new VRpc.VRpcListener() { + @Override + public void onMessage(SessionFakeScriptedResponse msg) {} + + @Override + public void onClose(VRpcResult result) { + closes.add(result); + firstClose.countDown(); + } + }); + + // The fast-fail UNAVAILABLE onClose should arrive immediately. + assertThat(firstClose.await(1, TimeUnit.SECONDS)).isTrue(); + assertThat(closes).hasSize(1); + + // Wait past the deadline. With the bug (leaked deadlineMonitor), a phantom + // onClose(DEADLINE_EXCEEDED) would arrive in this window. With the fix, no second close. + Thread.sleep(deadline.toMillis() * 5); + assertThat(closes).hasSize(1); + } + @Test void testCreateSessionDoesntPropagateDeadline() { DeadlineInterceptor deadlineInterceptor = new DeadlineInterceptor(); @@ -295,7 +385,7 @@ void testCreateSessionDoesntPropagateDeadline() { CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - executor); + testTimer); Context.current() .withDeadlineAfter(1, TimeUnit.MINUTES, executor) @@ -314,7 +404,7 @@ void testCreateSessionDoesntPropagateDeadline() { class RetrySessionCreation { private FakeClock fakeClock; - private ScheduledExecutorService mockExecutor; + private BigtableTimer mockTimer; private FakeSessionService fakeService; private ChannelPool channelPool; private SessionPoolImpl sessionPool; @@ -324,7 +414,9 @@ class RetrySessionCreation { @BeforeEach void setUp() throws Exception { fakeClock = new FakeClock(Instant.now()); - mockExecutor = mock(ScheduledExecutorService.class, Mockito.RETURNS_DEEP_STUBS); + // The retry-create-session site uses timer.newTimeout(); we capture the scheduled body on a + // mock timer and run it inline below. + mockTimer = mock(BigtableTimer.class, Mockito.RETURNS_DEEP_STUBS); Duration penalty = Duration.ofMinutes(1); SessionCreationBudget budget = new SessionCreationBudget(10, penalty, fakeClock); @@ -353,7 +445,7 @@ void setUp() throws Exception { CallOptions.DEFAULT, FakeDescriptor.FAKE_SESSION, "fake-pool", - mockExecutor, + mockTimer, budget); } @@ -371,7 +463,14 @@ void tearDown() { @Test public void test() throws Exception { ArgumentCaptor runnableCaptor = ArgumentCaptor.forClass(Runnable.class); - ArgumentCaptor delayCaptor = ArgumentCaptor.forClass(Long.class); + // Filter out watchdog (5 min, exact) and AFE prune (10 min, exact) ticks. The + // retry-create-session site computes its delay against the real wall clock and the fake + // budget clock, so it can land anywhere from sub-second to a couple of penalty intervals. + // Match anything that isn't one of the two fixed cadences. + long watchdogMs = Duration.ofMinutes(5).toMillis(); + long afePruneMs = SessionList.SESSION_LIST_PRUNE_INTERVAL.toMillis(); + LongPredicate isRetrySchedule = + d -> d > 0 && d != watchdogMs && d != afePruneMs; // start the pool sessionPool.start( @@ -384,12 +483,11 @@ public void test() throws Exception { // The delay should be around budget creation failure penalty. It'll take some time for the // job to exhaust all the creation budget so set a delay before verifying. int waitForReadyMs = 1000; - verify(mockExecutor, Mockito.timeout(waitForReadyMs)) - .schedule(runnableCaptor.capture(), delayCaptor.capture(), eq(TimeUnit.MILLISECONDS)); - assertThat(delayCaptor.getValue()) - .isIn( - Range.openClosed( - penalty.minus(Duration.ofMillis(waitForReadyMs)).toMillis(), penalty.toMillis())); + verify(mockTimer, Mockito.timeout(waitForReadyMs)) + .newTimeout( + runnableCaptor.capture(), + longThat(isRetrySchedule::test), + eq(TimeUnit.MILLISECONDS)); // we should have received some open requests int requestsBefore = fakeService.getOpenRequestCount().get(); @@ -407,13 +505,16 @@ public void test() throws Exception { // Advance the clock so there's more budget to create sessions fakeClock.increment(penalty.plusMillis(1)); - // Run the scheduled task, pool sizer will return a positive scale factor because there's a - // pending vrpc + // Run the scheduled timer body inline. The body executes the retry which schedules another + // timer tick. runnableCaptor.getValue().run(); // The retry task will try to open new sessions. This will fail and schedule another retry. - verify(mockExecutor, Mockito.timeout(1000).times(2)) - .schedule(any(Runnable.class), anyLong(), eq(TimeUnit.MILLISECONDS)); + verify(mockTimer, Mockito.timeout(5000).times(2)) + .newTimeout( + any(Runnable.class), + longThat(isRetrySchedule::test), + eq(TimeUnit.MILLISECONDS)); // the retry will exhaust the budget again. we should see double the request compared to // before diff --git a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java index 94bd6fcc5049..3582e66e1971 100644 --- a/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java +++ b/java-bigtable/google-cloud-bigtable/src/test/java/com/google/cloud/bigtable/data/v2/internal/session/WatchdogTest.java @@ -30,9 +30,7 @@ import io.grpc.Metadata; import java.time.Duration; import java.time.Instant; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import org.junit.jupiter.api.AfterEach; +import java.util.concurrent.TimeUnit; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -41,7 +39,7 @@ @ExtendWith(MockitoExtension.class) public class WatchdogTest { private final Duration interval = Duration.ofMinutes(5); - private ScheduledExecutorService executor; + private BigtableTimer timer; private SessionPoolImpl.Watchdog watchdog; private SessionList sessions; private FakeSession fakeSession = new FakeSession(); @@ -51,7 +49,9 @@ public class WatchdogTest { @BeforeEach void setUp() { - executor = Executors.newScheduledThreadPool(4); + // run() is invoked synchronously in tests; the timer is wired in only so the constructor + // signature is satisfied. start() / close() are not exercised here. + timer = new NoOpBigtableTimer(); now = Instant.now(); fakeClock = new FakeClock(now); @@ -59,16 +59,33 @@ void setUp() { watchdog = new Watchdog( new Object(), - executor, + timer, interval, sessions, NoopMetrics.NoopDebugTracer.INSTANCE, fakeClock); } - @AfterEach - void tearDown() { - executor.shutdownNow(); + // A BigtableTimer that drops every newTimeout(). Used because awaitCloseTest drives the watchdog + // by calling run() directly; the scheduling layer is not under test. + private static final class NoOpBigtableTimer implements BigtableTimer { + @Override + public Timeout newTimeout(Runnable task, long delay, TimeUnit unit) { + return new Timeout() { + @Override + public boolean cancel() { + return false; + } + + @Override + public boolean isCancelled() { + return true; + } + }; + } + + @Override + public void stop() {} } @Test