Skip to content

Commit 40bceea

Browse files
authored
fix: nats timing flakes (#26944)
1 parent 2ab3d10 commit 40bceea

2 files changed

Lines changed: 39 additions & 10 deletions

File tree

coderd/x/nats/metrics_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,13 @@ func TestPubsub_Metrics(t *testing.T) {
5858
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_nats_pubsub_messages_total", "normal") &&
5959
testutil.PromCounterHasValue(t, metrics, float64(len(data))+latencyBytes, "coder_nats_pubsub_received_bytes_total") &&
6060
testutil.PromCounterHasValue(t, metrics, float64(len(data))+latencyBytes, "coder_nats_pubsub_published_bytes_total") &&
61-
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_nats_pubsub_send_latency_seconds") &&
62-
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_nats_pubsub_receive_latency_seconds") &&
61+
// The latency gauges can be exactly 0 on Windows, whose monotonic
62+
// clock advances in coarse (up to 15.6ms) ticks; a sub-tick publish
63+
// measures a zero duration. An absent gauge reads as -1, so >= 0
64+
// still proves the gauge was gathered, and the measures_total/errs
65+
// counters below prove measurement ran and succeeded.
66+
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in >= 0 }, "coder_nats_pubsub_send_latency_seconds") &&
67+
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in >= 0 }, "coder_nats_pubsub_receive_latency_seconds") &&
6368
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_nats_pubsub_latency_measures_total") &&
6469
!testutil.PromCounterGathered(t, metrics, "coder_nats_pubsub_latency_measure_errs_total")
6570
}, testutil.WaitShort, testutil.IntervalFast)
@@ -95,8 +100,10 @@ func TestPubsub_Metrics(t *testing.T) {
95100
testutil.PromCounterHasValue(t, metrics, 1, "coder_nats_pubsub_messages_total", "colossal") &&
96101
testutil.PromCounterHasValue(t, metrics, float64(colossalSize+len(data))+latencyBytes, "coder_nats_pubsub_received_bytes_total") &&
97102
testutil.PromCounterHasValue(t, metrics, float64(colossalSize+len(data))+latencyBytes, "coder_nats_pubsub_published_bytes_total") &&
98-
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_nats_pubsub_send_latency_seconds") &&
99-
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in > 0 }, "coder_nats_pubsub_receive_latency_seconds") &&
103+
// Zero latency is accepted for coarse clocks; see the comment on
104+
// the first Eventually above.
105+
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in >= 0 }, "coder_nats_pubsub_send_latency_seconds") &&
106+
testutil.PromGaugeAssertion(t, metrics, func(in float64) bool { return in >= 0 }, "coder_nats_pubsub_receive_latency_seconds") &&
100107
testutil.PromCounterHasValue(t, metrics, gatherCount, "coder_nats_pubsub_latency_measures_total") &&
101108
!testutil.PromCounterGathered(t, metrics, "coder_nats_pubsub_latency_measure_errs_total")
102109
}, testutil.WaitShort, testutil.IntervalFast)

coderd/x/nats/natsbench/natsbench_internal_test.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package main
22

33
import (
44
"testing"
5-
"time"
65

76
"github.com/stretchr/testify/require"
87

@@ -66,9 +65,22 @@ func TestRunSingleNode(t *testing.T) {
6665
require.EqualValues(t, pl.totalExpected, res.Delivered)
6766
require.Greater(t, res.Delivered, res.Published, "fan-out must exceed publishes")
6867
require.Zero(t, res.Drops)
69-
require.Greater(t, res.PubsPerSec, 0.0)
70-
require.Greater(t, res.DeliveriesPerSec, 0.0)
71-
require.Greater(t, res.DeliverDuration, time.Duration(0))
68+
// The exact count assertions above are the authoritative correctness
69+
// checks; a run that published or delivered nothing fails there. The
70+
// rates below are derived (count / duration), and Windows' monotonic
71+
// clock advances in coarse (up to 15.6ms) ticks, so a sub-tick phase
72+
// measures a 0 duration and must yield a 0 rate rather than a
73+
// fabricated one. On fine-grained clocks the strict branch applies.
74+
if res.PublishDuration > 0 {
75+
require.Greater(t, res.PubsPerSec, 0.0)
76+
} else {
77+
require.Zero(t, res.PubsPerSec)
78+
}
79+
if res.DeliverDuration > 0 {
80+
require.Greater(t, res.DeliveriesPerSec, 0.0)
81+
} else {
82+
require.Zero(t, res.DeliveriesPerSec)
83+
}
7284
require.GreaterOrEqual(t, res.DeliverDuration, res.PublishDuration)
7385
}
7486

@@ -101,6 +113,16 @@ func TestRunCluster(t *testing.T) {
101113
require.EqualValues(t, pl.totalExpected, res.Expected)
102114
require.EqualValues(t, pl.totalExpected, res.Delivered)
103115
require.Zero(t, res.Drops)
104-
require.Greater(t, res.PubsPerSec, 0.0)
105-
require.Greater(t, res.DeliveriesPerSec, 0.0)
116+
// See TestRunSingleNode: counts above are the correctness checks;
117+
// coarse clocks can quantize a phase duration, and thus its rate, to 0.
118+
if res.PublishDuration > 0 {
119+
require.Greater(t, res.PubsPerSec, 0.0)
120+
} else {
121+
require.Zero(t, res.PubsPerSec)
122+
}
123+
if res.DeliverDuration > 0 {
124+
require.Greater(t, res.DeliveriesPerSec, 0.0)
125+
} else {
126+
require.Zero(t, res.DeliveriesPerSec)
127+
}
106128
}

0 commit comments

Comments
 (0)