diff --git a/CMakeLists.txt b/CMakeLists.txt index a12f96223f01c..258444104ab34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23837,6 +23837,7 @@ add_executable(otel_plugin_test src/cpp/ext/otel/otel_client_call_tracer.cc src/cpp/ext/otel/otel_plugin.cc src/cpp/ext/otel/otel_server_call_tracer.cc + test/core/test_util/fail_first_call_filter.cc test/core/test_util/fake_stats_plugin.cc test/cpp/end2end/test_service_impl.cc test/cpp/ext/otel/otel_plugin_test.cc diff --git a/build_autogenerated.yaml b/build_autogenerated.yaml index 34f887efb897a..0b05741788ff2 100644 --- a/build_autogenerated.yaml +++ b/build_autogenerated.yaml @@ -15943,6 +15943,7 @@ targets: - src/cpp/ext/otel/otel_plugin.h - src/cpp/ext/otel/otel_server_call_tracer.h - test/core/promise/test_context.h + - test/core/test_util/fail_first_call_filter.h - test/core/test_util/fake_stats_plugin.h - test/cpp/end2end/test_service_impl.h - test/cpp/ext/otel/otel_test_library.h @@ -15958,6 +15959,7 @@ targets: - src/cpp/ext/otel/otel_client_call_tracer.cc - src/cpp/ext/otel/otel_plugin.cc - src/cpp/ext/otel/otel_server_call_tracer.cc + - test/core/test_util/fail_first_call_filter.cc - test/core/test_util/fake_stats_plugin.cc - test/cpp/end2end/test_service_impl.cc - test/cpp/ext/otel/otel_plugin_test.cc diff --git a/include/grpcpp/ext/otel_plugin.h b/include/grpcpp/ext/otel_plugin.h index 1e475306c28cf..8626115355984 100644 --- a/include/grpcpp/ext/otel_plugin.h +++ b/include/grpcpp/ext/otel_plugin.h @@ -100,6 +100,15 @@ class OpenTelemetryPluginBuilder { kServerCallRcvdTotalCompressedMessageSizeInstrumentName = "grpc.server.call.rcvd_total_compressed_message_size"; + /// Experimental Retry Metrics + static constexpr absl::string_view kClientCallRetriesInstrumentName = + "grpc.client.call.retries"; + static constexpr absl::string_view + kClientCallTransparentRetriesInstrumentName = + "grpc.client.call.transparent_retries"; + static constexpr absl::string_view kClientCallRetryDelayInstrumentName = + "grpc.client.call.retry_delay"; + OpenTelemetryPluginBuilder(); ~OpenTelemetryPluginBuilder(); /// If `SetMeterProvider()` is not called, no metrics are collected. diff --git a/src/cpp/ext/otel/otel_client_call_tracer.cc b/src/cpp/ext/otel/otel_client_call_tracer.cc index 8d6c1b546b45d..3a939f656e546 100644 --- a/src/cpp/ext/otel/otel_client_call_tracer.cc +++ b/src/cpp/ext/otel/otel_client_call_tracer.cc @@ -115,7 +115,7 @@ class OpenTelemetryPluginImpl::ClientCallTracer::CallAttemptTracer< // template OpenTelemetryPluginImpl::ClientCallTracer::CallAttemptTracer:: - CallAttemptTracer(const OpenTelemetryPluginImpl::ClientCallTracer* parent, + CallAttemptTracer(OpenTelemetryPluginImpl::ClientCallTracer* const parent, uint64_t attempt_num, bool is_transparent_retry) : parent_(parent), start_time_(absl::Now()) { if (parent_->otel_plugin_->client_.attempt.started != nullptr) { @@ -310,6 +310,12 @@ void OpenTelemetryPluginImpl::ClientCallTracer:: parent_->otel_plugin_->client_.attempt.rcvd_total_compressed_message_size ->Record(incoming_bytes, labels, opentelemetry::context::Context{}); } + if (parent_->otel_plugin_->client_.call.retry_delay != nullptr) { + grpc_core::MutexLock lock(&parent_->mu_); + if (--parent_->num_active_attempts_ == 0) { + parent_->time_at_last_attempt_end_ = absl::Now(); + } + } if (span_ != nullptr) { if (status.ok()) { span_->SetStatus(opentelemetry::trace::StatusCode::kOk); @@ -441,6 +447,32 @@ OpenTelemetryPluginImpl::ClientCallTracer::ClientCallTracer( } OpenTelemetryPluginImpl::ClientCallTracer::~ClientCallTracer() { + std::array, + 2> + attributes = { + std::pair(AbslStringViewToNoStdStringView(OpenTelemetryMethodKey()), + opentelemetry::common::AttributeValue( + AbslStringViewToNoStdStringView(MethodForStats()))), + std::pair(AbslStringViewToNoStdStringView(OpenTelemetryTargetKey()), + opentelemetry::common::AttributeValue( + AbslStringViewToNoStdStringView( + scope_config_->filtered_target())))}; + if (otel_plugin_->client_.call.retries != nullptr && retries_ > 1) { + otel_plugin_->client_.call.retries->Record( + retries_ - 1, attributes, opentelemetry::context::Context{}); + } + if (otel_plugin_->client_.call.transparent_retries != nullptr && + transparent_retries_ != 0) { + otel_plugin_->client_.call.transparent_retries->Record( + transparent_retries_, attributes, opentelemetry::context::Context{}); + } + if (otel_plugin_->client_.call.retry_delay != nullptr && + retry_delay_ != absl::ZeroDuration() && retries_ > 1) { + otel_plugin_->client_.call.retry_delay->Record( + absl::ToDoubleSeconds(retry_delay_), attributes, + opentelemetry::context::Context{}); + } if (span_ != nullptr) { span_->End(); } @@ -458,6 +490,10 @@ OpenTelemetryPluginImpl::ClientCallTracer::StartNewAttempt( grpc_core::MutexLock lock(&mu_); if (transparent_retries_ != 0 || retries_ != 0) { is_first_attempt = false; + if (otel_plugin_->client_.call.retry_delay != nullptr && + num_active_attempts_ == 0 && !is_transparent_retry) { + retry_delay_ += absl::Now() - time_at_last_attempt_end_; + } } if (is_transparent_retry) { ++transparent_retries_; @@ -465,6 +501,7 @@ OpenTelemetryPluginImpl::ClientCallTracer::StartNewAttempt( ++retries_; } attempt_num = retries_ - 1; // Sequence starts at 0 + ++num_active_attempts_; } if (is_first_attempt) { return arena_ diff --git a/src/cpp/ext/otel/otel_client_call_tracer.h b/src/cpp/ext/otel/otel_client_call_tracer.h index 6d3ec7f7a4772..6a9cc202b51a0 100644 --- a/src/cpp/ext/otel/otel_client_call_tracer.h +++ b/src/cpp/ext/otel/otel_client_call_tracer.h @@ -55,7 +55,7 @@ class OpenTelemetryPluginImpl::ClientCallTracer grpc_core::NonPolymorphicRefCount, UnrefBehavior> { public: - CallAttemptTracer(const OpenTelemetryPluginImpl::ClientCallTracer* parent, + CallAttemptTracer(OpenTelemetryPluginImpl::ClientCallTracer* const parent, uint64_t attempt_num, bool is_transparent_retry); ~CallAttemptTracer() override; @@ -105,7 +105,7 @@ class OpenTelemetryPluginImpl::ClientCallTracer void PopulateLabelInjectors(grpc_metadata_batch* metadata); - const ClientCallTracer* parent_; + ClientCallTracer* const parent_; // Start time (for measuring latency). absl::Time start_time_; std::unique_ptr injected_labels_; @@ -157,11 +157,18 @@ class OpenTelemetryPluginImpl::ClientCallTracer const bool registered_method_; OpenTelemetryPluginImpl* otel_plugin_; std::shared_ptr scope_config_; + // TODO(ctiller@): When refactoring the tracer code, consider the possibility + // of removing this mutex. More discussion in + // https://github.com/grpc/grpc/pull/39195/files#r2191231973. grpc_core::Mutex mu_; - // Non-transparent attempts per call (including first attempt) + // Non-transparent retry attempts per call (includes initial attempt) uint64_t retries_ ABSL_GUARDED_BY(&mu_) = 0; // Transparent retries per call uint64_t transparent_retries_ ABSL_GUARDED_BY(&mu_) = 0; + // Retry delay + absl::Duration retry_delay_ ABSL_GUARDED_BY(&mu_); + absl::Time time_at_last_attempt_end_ ABSL_GUARDED_BY(&mu_); + uint64_t num_active_attempts_ ABSL_GUARDED_BY(&mu_) = 0; opentelemetry::nostd::shared_ptr span_; }; diff --git a/src/cpp/ext/otel/otel_plugin.cc b/src/cpp/ext/otel/otel_plugin.cc index b20eeb7fc1f33..1f31c3230104f 100644 --- a/src/cpp/ext/otel/otel_plugin.cc +++ b/src/cpp/ext/otel/otel_plugin.cc @@ -502,6 +502,33 @@ OpenTelemetryPluginImpl::OpenTelemetryPluginImpl( kServerCallRcvdTotalCompressedMessageSizeInstrumentName), "Compressed message bytes received per server call", "By"); } + if (metrics.contains(grpc::OpenTelemetryPluginBuilder:: + kClientCallRetriesInstrumentName)) { + client_.call.retries = meter->CreateUInt64Histogram( + std::string(grpc::OpenTelemetryPluginBuilder:: + kClientCallRetriesInstrumentName), + "EXPERIMENTAL: Number of retries during the client call. If there " + "were no retries, 0 is not reported.", + "{retry}"); + } + if (metrics.contains(grpc::OpenTelemetryPluginBuilder:: + kClientCallTransparentRetriesInstrumentName)) { + client_.call.transparent_retries = meter->CreateUInt64Histogram( + std::string(grpc::OpenTelemetryPluginBuilder:: + kClientCallTransparentRetriesInstrumentName), + "EXPERIMENTAL: Number of transparent retries during the client call. " + "If there were no transparent retries, 0 is not reported.", + "{transparent_retry}"); + } + if (metrics.contains(grpc::OpenTelemetryPluginBuilder:: + kClientCallRetryDelayInstrumentName)) { + client_.call.retry_delay = meter->CreateDoubleHistogram( + std::string(grpc::OpenTelemetryPluginBuilder:: + kClientCallRetryDelayInstrumentName), + "EXPERIMENTAL: Total time of delay while there is no active attempt " + "during the client call", + "s"); + } // Store optional label keys for per call metrics CHECK(static_cast(grpc_core::ClientCallTracer::CallAttemptTracer:: OptionalLabelKey::kSize) <= diff --git a/src/cpp/ext/otel/otel_plugin.h b/src/cpp/ext/otel/otel_plugin.h index 415d727066db8..40c0eb3cfbf6f 100644 --- a/src/cpp/ext/otel/otel_plugin.h +++ b/src/cpp/ext/otel/otel_plugin.h @@ -356,6 +356,12 @@ class OpenTelemetryPluginImpl }; struct ClientMetrics { + struct Call { + std::unique_ptr> retries; + std::unique_ptr> + transparent_retries; + std::unique_ptr> retry_delay; + } call; struct Attempt { std::unique_ptr> started; std::unique_ptr> duration; diff --git a/test/cpp/ext/otel/BUILD b/test/cpp/ext/otel/BUILD index 7b63dfa2db33a..97761a4986066 100644 --- a/test/cpp/ext/otel/BUILD +++ b/test/cpp/ext/otel/BUILD @@ -59,6 +59,8 @@ grpc_cc_test( "otel/sdk/src/metrics", ], tags = [ + # Incompatible with internal tracing + "grpc:no-internal-poller", # TODO(b/332369798): Remove after fixing bug "grpc:otel-namespace-calamity", ], @@ -67,6 +69,7 @@ grpc_cc_test( "//:grpc++", "//src/cpp/ext/otel:otel_plugin", "//test/core/promise:test_context", + "//test/core/test_util:fail_first_call_filter", "//test/core/test_util:grpc_test_util", ], ) @@ -82,6 +85,8 @@ grpc_cc_test( "otel/sdk/src/metrics", ], tags = [ + # Incompatible with internal tracing + "grpc:no-internal-poller", # TODO(b/332369798): Remove after fixing bug "grpc:otel-namespace-calamity", ], diff --git a/test/cpp/ext/otel/otel_plugin_test.cc b/test/cpp/ext/otel/otel_plugin_test.cc index 805e464dcdf72..d11fe652ee6d2 100644 --- a/test/cpp/ext/otel/otel_plugin_test.cc +++ b/test/cpp/ext/otel/otel_plugin_test.cc @@ -40,6 +40,7 @@ #include "src/core/config/core_configuration.h" #include "src/core/lib/event_engine/channel_args_endpoint_config.h" #include "src/core/telemetry/call_tracer.h" +#include "test/core/test_util/fail_first_call_filter.h" #include "test/core/test_util/fake_stats_plugin.h" #include "test/core/test_util/test_config.h" #include "test/cpp/end2end/test_service_impl.h" @@ -313,6 +314,170 @@ TEST_F(OpenTelemetryPluginEnd2EndTest, EXPECT_EQ(*status_value, "OK"); } +// When there are no retries, no retry stats are exported. +TEST_F(OpenTelemetryPluginEnd2EndTest, RetryStatsWithoutRetries) { + Init(std::move(Options().set_metric_names( + {grpc::OpenTelemetryPluginBuilder::kClientAttemptDurationInstrumentName, + grpc::OpenTelemetryPluginBuilder::kClientCallRetriesInstrumentName, + grpc::OpenTelemetryPluginBuilder:: + kClientCallTransparentRetriesInstrumentName, + grpc::OpenTelemetryPluginBuilder:: + kClientCallRetryDelayInstrumentName}))); + SendRPC(); + const char* kRetryMetricName = "grpc.client.call.retries"; + const char* kTransparentRetryMetricName = + "grpc.client.call.transparent_retries"; + const char* kRetryDelayMetricName = "grpc.client.call.retry_delay"; + const char* kClientAttemptDurationMetricName = "grpc.client.attempt.duration"; + auto data = ReadCurrentMetricsData( + [&](const absl::flat_hash_map< + std::string, + std::vector>& + data) { + // Use grpc.client.attempt.duration as a signal that client metrics have + // been collected for the call. + return !data.contains(kClientAttemptDurationMetricName); + }); + EXPECT_TRUE(data.contains(kClientAttemptDurationMetricName)); + EXPECT_FALSE(data.contains(kRetryMetricName)); + EXPECT_FALSE(data.contains(kTransparentRetryMetricName)); + EXPECT_FALSE(data.contains(kRetryDelayMetricName)); +} + +TEST_F(OpenTelemetryPluginEnd2EndTest, RetryStatsWithRetries) { + Init(std::move( + Options() + .set_metric_names({grpc::OpenTelemetryPluginBuilder:: + kClientCallRetriesInstrumentName, + grpc::OpenTelemetryPluginBuilder:: + kClientCallTransparentRetriesInstrumentName, + grpc::OpenTelemetryPluginBuilder:: + kClientCallRetryDelayInstrumentName}) + .set_service_config( + "{\n" + " \"methodConfig\": [ {\n" + " \"name\": [\n" + " { \"service\": \"grpc.testing.EchoTestService\" }\n" + " ],\n" + " \"retryPolicy\": {\n" + " \"maxAttempts\": 3,\n" + " \"initialBackoff\": \"0.1s\",\n" + " \"maxBackoff\": \"120s\",\n" + " \"backoffMultiplier\": 1,\n" + " \"retryableStatusCodes\": [ \"ABORTED\" ]\n" + " }\n" + " } ]\n" + "}"))); + { + EchoRequest request; + request.mutable_param()->mutable_expected_error()->set_code( + StatusCode::ABORTED); + request.set_message("foo"); + EchoResponse response; + grpc::ClientContext context; + grpc::Status status = stub_->Echo(&context, request, &response); + EXPECT_EQ(status.error_code(), StatusCode::ABORTED); + } + const char* kRetryMetricName = "grpc.client.call.retries"; + const char* kTransparentRetryMetricName = + "grpc.client.call.transparent_retries"; + const char* kRetryDelayMetricName = "grpc.client.call.retry_delay"; + auto data = ReadCurrentMetricsData( + [&](const absl::flat_hash_map< + std::string, + std::vector>& + data) { + return !data.contains(kRetryMetricName) || + !data.contains(kRetryDelayMetricName); + }); + + ASSERT_EQ(data.size(), 2); + // Check for retry stats + EXPECT_THAT(data[kRetryMetricName], + ::testing::UnorderedElementsAre(::testing::AllOf( + AttributesEq(std::array{"grpc.method", + "grpc.target"}, + std::array{ + kMethodName, canonical_server_address_}, + std::array{}, + std::array{}), + HistogramResultEq(::testing::Eq(int64_t(2)), + ::testing::Eq(int64_t(2)), + ::testing::Eq(int64_t(2)), 1)))); + // Check for retry delay stats. + EXPECT_THAT( + data[kRetryDelayMetricName], + ::testing::ElementsAre(::testing::AllOf( + AttributesEq( + std::array{"grpc.method", "grpc.target"}, + std::array{kMethodName, + canonical_server_address_}, + std::array{}, + std::array{}), + HistogramResultEq( + IsWithinRange(0.1, 0.3 * grpc_test_slowdown_factor()), + IsWithinRange(0.1, 0.3 * grpc_test_slowdown_factor()), + IsWithinRange(0.1, 0.3 * grpc_test_slowdown_factor()), 1)))); + // No transparent retry stats reported + EXPECT_FALSE(data.contains(kTransparentRetryMetricName)); +} + +class OTelMetricsTestForTransparentRetries + : public OpenTelemetryPluginEnd2EndTest { + protected: + void SetUp() override { + grpc_core::CoreConfiguration::RegisterEphemeralBuilder( + [](grpc_core::CoreConfiguration::Builder* builder) { + // Register FailFirstCallFilter to simulate transparent retries. + builder->channel_init()->RegisterFilter( + GRPC_CLIENT_SUBCHANNEL, + &grpc_core::testing::FailFirstCallFilter::kFilterVtable); + }); + OpenTelemetryPluginEnd2EndTest::SetUp(); + } +}; + +TEST_F(OTelMetricsTestForTransparentRetries, RetryStatsWithTransparentRetries) { + Init(std::move(Options().set_metric_names( + {grpc::OpenTelemetryPluginBuilder::kClientCallRetriesInstrumentName, + grpc::OpenTelemetryPluginBuilder:: + kClientCallTransparentRetriesInstrumentName, + grpc::OpenTelemetryPluginBuilder:: + kClientCallRetryDelayInstrumentName}))); + ChannelArguments args; + args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, 1); + auto channel = grpc::CreateCustomChannel( + server_address_, grpc::InsecureChannelCredentials(), args); + ResetStub(std::move(channel)); + SendRPC(); + const char* kRetryMetricName = "grpc.client.call.retries"; + const char* kTransparentRetryMetricName = + "grpc.client.call.transparent_retries"; + const char* kRetryDelayMetricName = "grpc.client.call.retry_delay"; + auto data = ReadCurrentMetricsData( + [&](const absl::flat_hash_map< + std::string, + std::vector>& + data) { return !data.contains(kTransparentRetryMetricName); }); + ASSERT_EQ(data.size(), 1); + // No retry stats reported + EXPECT_FALSE(data.contains(kRetryMetricName)); + // Check for transparent retry stats + EXPECT_THAT(data[kTransparentRetryMetricName], + ::testing::UnorderedElementsAre(::testing::AllOf( + AttributesEq(std::array{"grpc.method", + "grpc.target"}, + std::array{ + kMethodName, canonical_server_address_}, + std::array{}, + std::array{}), + HistogramResultEq(::testing::Eq(int64_t(1)), + ::testing::Eq(int64_t(1)), + ::testing::Eq(int64_t(1)), 1)))); + // No retry delay reported + EXPECT_FALSE(data.contains(kRetryDelayMetricName)); +} + // Make sure that no meter provider results in normal operations. TEST_F(OpenTelemetryPluginEnd2EndTest, NoMeterProviderRegistered) { Init( diff --git a/test/cpp/ext/otel/otel_test_library.cc b/test/cpp/ext/otel/otel_test_library.cc index d1f666c04c0da..ff1c016812d1d 100644 --- a/test/cpp/ext/otel/otel_test_library.cc +++ b/test/cpp/ext/otel/otel_test_library.cc @@ -230,8 +230,6 @@ OpenTelemetryPluginEnd2EndTest::MetricsCollectorThread::Stop() { } void OpenTelemetryPluginEnd2EndTest::Init(Options config) { - grpc_core::CoreConfiguration:: - ResetEverythingIncludingPersistentBuildersAbsolutelyNotRecommended(); ChannelArguments channel_args; if (!config.labels_to_inject.empty()) { labels_to_inject_ = std::move(config.labels_to_inject); @@ -277,6 +275,7 @@ void OpenTelemetryPluginEnd2EndTest::TearDown() { grpc_core::ServerCallTracerFactory::TestOnlyReset(); grpc_core::GlobalStatsPluginRegistryTestPeer:: ResetGlobalStatsPluginRegistry(); + grpc_core::CoreConfiguration::Reset(); } void OpenTelemetryPluginEnd2EndTest::ResetStub( diff --git a/test/cpp/ext/otel/otel_test_library.h b/test/cpp/ext/otel/otel_test_library.h index f7e667e58cee7..8de9dd8100b89 100644 --- a/test/cpp/ext/otel/otel_test_library.h +++ b/test/cpp/ext/otel/otel_test_library.h @@ -407,6 +407,21 @@ MATCHER_P7(GaugeDataIsIncrementalForSpecificMetricAndLabelSet, metric_name, return result; } +// Helper matcher to check whether a value is within a certain range +MATCHER_P2(IsWithinRange, lo, hi, + absl::StrCat(negation ? "isn't" : "is", " between ", + ::testing::PrintToString(lo), " and ", + ::testing::PrintToString(hi))) { + return (lo) <= arg && arg <= (hi); +} + +// Specialization of Extract to be able to use `IsWithinRange` matcher within +// the `HistogramResultEq` matcher defined above. +template +struct Extract> { + using Type = T; +}; + } // namespace testing } // namespace grpc