From 33cfa62d4d03ffb74e40bed318d439a2a1f232b6 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 12 Jun 2026 17:56:15 +0000 Subject: [PATCH 1/5] feat(coderd/x/nats): add cluster mTLS config builder Mints an ephemeral IP-SAN leaf certificate from an injected CA for mutually verified TLS on NATS cluster routes. Co-authored-by: Mux --- coderd/x/nats/tls.go | 117 ++++++++++++++++++ coderd/x/nats/tls_internal_test.go | 185 +++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 coderd/x/nats/tls.go create mode 100644 coderd/x/nats/tls_internal_test.go diff --git a/coderd/x/nats/tls.go b/coderd/x/nats/tls.go new file mode 100644 index 0000000000000..dcf65bf3c0e09 --- /dev/null +++ b/coderd/x/nats/tls.go @@ -0,0 +1,117 @@ +package nats + +import ( + "crypto" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "math/big" + "net" + "time" + + "golang.org/x/xerrors" +) + +// leafCertValidity is how long a replica's ephemeral cluster leaf +// certificate remains valid. Leaves are minted in memory at startup and +// die with the process, so this only needs to exceed the maximum +// expected process lifetime between restarts. +const leafCertValidity = 30 * 24 * time.Hour + +// ClusterTLSOptions configures mutual TLS for the inter-replica NATS +// cluster route listener. The CA signs an ephemeral per-replica leaf +// certificate at startup; the leaf private key is never persisted. +type ClusterTLSOptions struct { + // CACert is the deployment-wide cluster CA certificate. Peers are + // verified against it in both directions of a route handshake. + CACert *x509.Certificate + + // CAKey is the private key for CACert, used to sign this replica's + // leaf certificate. + CAKey crypto.Signer + + // SANHost is this replica's relay-URL host, embedded as an IP SAN + // in the leaf certificate. It must be an IP address and must match + // the address peers dial, or route TLS handshakes fail with a + // hostname verification error. + SANHost string +} + +// buildClusterTLSConfig mints an ephemeral ECDSA P-256 leaf certificate +// signed by the configured CA and returns a *tls.Config suitable for +// natsserver.ClusterOpts.TLSConfig. The same config serves both route +// roles: the NATS server uses it when accepting routes and clones it +// (setting ServerName from the dialed route URL) when soliciting them. +func buildClusterTLSConfig(opts ClusterTLSOptions) (*tls.Config, error) { + if opts.CACert == nil { + return nil, xerrors.New("cluster TLS: CA certificate is required") + } + if opts.CAKey == nil { + return nil, xerrors.New("cluster TLS: CA private key is required") + } + ip := net.ParseIP(opts.SANHost) + if ip == nil { + return nil, xerrors.Errorf("cluster TLS: SAN host %q is not an IP address; NATS cluster TLS requires an IP-based relay URL", opts.SANHost) + } + + leafKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return nil, xerrors.Errorf("generate leaf key: %w", err) + } + + serialLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serial, err := rand.Int(rand.Reader, serialLimit) + if err != nil { + return nil, xerrors.Errorf("generate leaf serial: %w", err) + } + + now := time.Now() + template := &x509.Certificate{ + SerialNumber: serial, + Subject: pkix.Name{ + CommonName: opts.SANHost, + }, + IPAddresses: []net.IP{ip}, + NotBefore: now, + NotAfter: now.Add(leafCertValidity), + KeyUsage: x509.KeyUsageDigitalSignature, + // Both usages are required: on a route each server acts as the + // TLS server when accepting and the TLS client when dialing. + ExtKeyUsage: []x509.ExtKeyUsage{ + x509.ExtKeyUsageServerAuth, + x509.ExtKeyUsageClientAuth, + }, + BasicConstraintsValid: true, + } + + leafDER, err := x509.CreateCertificate(rand.Reader, template, opts.CACert, &leafKey.PublicKey, opts.CAKey) + if err != nil { + return nil, xerrors.Errorf("sign leaf certificate: %w", err) + } + leaf, err := x509.ParseCertificate(leafDER) + if err != nil { + return nil, xerrors.Errorf("parse leaf certificate: %w", err) + } + + // A pool rather than a single cert so multiple CA certificates can + // coexist during a future CA rotation overlap window. + caPool := x509.NewCertPool() + caPool.AddCert(opts.CACert) + + return &tls.Config{ + Certificates: []tls.Certificate{{ + Certificate: [][]byte{leafDER}, + PrivateKey: leafKey, + Leaf: leaf, + }}, + // RootCAs verifies peers when dialing routes; ClientCAs + // verifies the dialing peer's certificate when accepting them. + RootCAs: caPool, + ClientCAs: caPool, + ClientAuth: tls.RequireAndVerifyClientCert, + MinVersion: tls.VersionTLS12, + }, nil +} diff --git a/coderd/x/nats/tls_internal_test.go b/coderd/x/nats/tls_internal_test.go new file mode 100644 index 0000000000000..66bcc8c05435e --- /dev/null +++ b/coderd/x/nats/tls_internal_test.go @@ -0,0 +1,185 @@ +package nats + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "math/big" + "net" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// generateTestCA returns a self-signed CA certificate and its private +// key for signing test leaf certificates. +func generateTestCA(t *testing.T) (*x509.Certificate, *ecdsa.PrivateKey) { + t.Helper() + + caKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + + template := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{ + CommonName: "test-cluster-ca", + }, + NotBefore: time.Now(), + NotAfter: time.Now().Add(time.Hour), + KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature, + BasicConstraintsValid: true, + IsCA: true, + } + der, err := x509.CreateCertificate(rand.Reader, template, template, &caKey.PublicKey, caKey) + require.NoError(t, err) + caCert, err := x509.ParseCertificate(der) + require.NoError(t, err) + return caCert, caKey +} + +func Test_buildClusterTLSConfig_Validation(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + + testCases := []struct { + name string + opts ClusterTLSOptions + errPart string + }{ + { + name: "MissingCACert", + opts: ClusterTLSOptions{ + CAKey: caKey, + SANHost: "127.0.0.1", + }, + errPart: "CA certificate is required", + }, + { + name: "MissingCAKey", + opts: ClusterTLSOptions{ + CACert: caCert, + SANHost: "127.0.0.1", + }, + errPart: "CA private key is required", + }, + { + name: "EmptySANHost", + opts: ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + }, + errPart: "is not an IP address", + }, + { + name: "HostnameSANHost", + opts: ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "coderd-0.coderd.svc.cluster.local", + }, + errPart: "is not an IP address", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + cfg, err := buildClusterTLSConfig(tc.opts) + require.ErrorContains(t, err, tc.errPart) + require.Nil(t, cfg) + }) + } +} + +func Test_buildClusterTLSConfig_Leaf(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + + testCases := []struct { + name string + sanHost string + }{ + {name: "IPv4", sanHost: "10.0.1.5"}, + {name: "IPv6", sanHost: "fd12:3456:789a::1"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + cfg, err := buildClusterTLSConfig(ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: tc.sanHost, + }) + require.NoError(t, err) + + require.Equal(t, uint16(tls.VersionTLS12), cfg.MinVersion) + require.Equal(t, tls.RequireAndVerifyClientCert, cfg.ClientAuth) + require.NotNil(t, cfg.RootCAs) + require.NotNil(t, cfg.ClientCAs) + require.False(t, cfg.InsecureSkipVerify) + + require.Len(t, cfg.Certificates, 1) + leaf := cfg.Certificates[0].Leaf + require.NotNil(t, leaf) + + // The leaf must verify against the CA pool for both route + // roles: ServerAuth when accepting and ClientAuth when + // dialing. + wantIP := net.ParseIP(tc.sanHost) + require.Len(t, leaf.IPAddresses, 1) + require.True(t, leaf.IPAddresses[0].Equal(wantIP)) + require.Equal(t, tc.sanHost, leaf.Subject.CommonName) + require.ElementsMatch(t, []x509.ExtKeyUsage{ + x509.ExtKeyUsageServerAuth, + x509.ExtKeyUsageClientAuth, + }, leaf.ExtKeyUsage) + + for _, usage := range leaf.ExtKeyUsage { + _, err = leaf.Verify(x509.VerifyOptions{ + Roots: cfg.RootCAs, + KeyUsages: []x509.ExtKeyUsage{usage}, + }) + require.NoError(t, err) + } + require.NoError(t, leaf.VerifyHostname(tc.sanHost)) + + now := time.Now() + require.True(t, leaf.NotBefore.Before(now.Add(time.Minute))) + require.WithinDuration(t, now.Add(leafCertValidity), leaf.NotAfter, time.Minute) + }) + } +} + +func Test_buildClusterTLSConfig_UniqueLeafKeys(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + opts := ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "127.0.0.1", + } + + first, err := buildClusterTLSConfig(opts) + require.NoError(t, err) + second, err := buildClusterTLSConfig(opts) + require.NoError(t, err) + + // Each replica mints its own ephemeral key; two builds must never + // share key material or serial numbers. + firstKey, ok := first.Certificates[0].PrivateKey.(*ecdsa.PrivateKey) + require.True(t, ok) + secondKey, ok := second.Certificates[0].PrivateKey.(*ecdsa.PrivateKey) + require.True(t, ok) + require.False(t, firstKey.Equal(secondKey)) + require.NotEqual(t, first.Certificates[0].Leaf.SerialNumber, second.Certificates[0].Leaf.SerialNumber) +} From a5a51908c1dab166774aeadcfb8e94305db09b76 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 12 Jun 2026 18:02:08 +0000 Subject: [PATCH 2/5] feat(coderd/x/nats): wire cluster mTLS into embedded server options Adds Options.ClusterTLS so buildServerOptions configures the route listener with the minted leaf and a 10s handshake timeout; nil preserves plaintext routes. Co-authored-by: Mux --- coderd/x/nats/pubsub.go | 5 ++++ coderd/x/nats/server.go | 10 +++++++ coderd/x/nats/tls.go | 4 +++ coderd/x/nats/tls_internal_test.go | 47 ++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/coderd/x/nats/pubsub.go b/coderd/x/nats/pubsub.go index 4c6d902fd2a50..5274feaf2e654 100644 --- a/coderd/x/nats/pubsub.go +++ b/coderd/x/nats/pubsub.go @@ -93,6 +93,11 @@ type Options struct { // default when cluster mode is enabled. RoutePoolSize int + // ClusterTLS enables mutual TLS on the cluster route listener. A + // per-replica leaf certificate is minted from the configured CA at + // startup. Nil keeps routes plaintext (token auth only). + ClusterTLS *ClusterTLSOptions + // disableCluster is intended only for testing. Since we cannot reload a server // with a cluster host/port after initialization, we start all production servers // with clustering enabled. diff --git a/coderd/x/nats/server.go b/coderd/x/nats/server.go index 47194c8a75160..32096fe7d5d9a 100644 --- a/coderd/x/nats/server.go +++ b/coderd/x/nats/server.go @@ -62,6 +62,16 @@ func buildServerOptions(opts Options) (*natsserver.Options, error) { sopts.Cluster.Username = defaultClusterTokenUsername sopts.Cluster.Password = opts.ClusterAuthToken } + if opts.ClusterTLS != nil { + tlsConfig, err := buildClusterTLSConfig(*opts.ClusterTLS) + if err != nil { + return nil, err + } + sopts.Cluster.TLSConfig = tlsConfig + // The NATS default route TLS timeout is 2s, which is tight + // under CI load. + sopts.Cluster.TLSTimeout = clusterTLSTimeout.Seconds() + } } return sopts, nil diff --git a/coderd/x/nats/tls.go b/coderd/x/nats/tls.go index dcf65bf3c0e09..ace26db434ba2 100644 --- a/coderd/x/nats/tls.go +++ b/coderd/x/nats/tls.go @@ -21,6 +21,10 @@ import ( // expected process lifetime between restarts. const leafCertValidity = 30 * 24 * time.Hour +// clusterTLSTimeout bounds the route TLS handshake. It replaces the +// NATS default of 2s, which is tight under CI load. +const clusterTLSTimeout = 10 * time.Second + // ClusterTLSOptions configures mutual TLS for the inter-replica NATS // cluster route listener. The CA signs an ephemeral per-replica leaf // certificate at startup; the leaf private key is never persisted. diff --git a/coderd/x/nats/tls_internal_test.go b/coderd/x/nats/tls_internal_test.go index 66bcc8c05435e..8e5ea0b64d6a0 100644 --- a/coderd/x/nats/tls_internal_test.go +++ b/coderd/x/nats/tls_internal_test.go @@ -183,3 +183,50 @@ func Test_buildClusterTLSConfig_UniqueLeafKeys(t *testing.T) { require.False(t, firstKey.Equal(secondKey)) require.NotEqual(t, first.Certificates[0].Leaf.SerialNumber, second.Certificates[0].Leaf.SerialNumber) } + +func Test_buildServerOptions_ClusterTLS(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + + t.Run("Enabled", func(t *testing.T) { + t.Parallel() + + sopts, err := buildServerOptions(Options{ + ClusterAuthToken: "token", + ClusterTLS: &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "127.0.0.1", + }, + }) + require.NoError(t, err) + require.NotNil(t, sopts.Cluster.TLSConfig) + require.Equal(t, tls.RequireAndVerifyClientCert, sopts.Cluster.TLSConfig.ClientAuth) + require.Equal(t, clusterTLSTimeout.Seconds(), sopts.Cluster.TLSTimeout) + }) + + t.Run("Disabled", func(t *testing.T) { + t.Parallel() + + sopts, err := buildServerOptions(Options{ + ClusterAuthToken: "token", + }) + require.NoError(t, err) + require.Nil(t, sopts.Cluster.TLSConfig) + require.Zero(t, sopts.Cluster.TLSTimeout) + }) + + t.Run("InvalidOptions", func(t *testing.T) { + t.Parallel() + + _, err := buildServerOptions(Options{ + ClusterTLS: &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "not-an-ip", + }, + }) + require.ErrorContains(t, err, "is not an IP address") + }) +} From 274cdcbf0bdc5f1f3c8d2aabc9a4e9cfc018ace7 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 12 Jun 2026 21:15:43 +0000 Subject: [PATCH 3/5] test(coderd/x/nats): add cluster TLS mesh tests and relay URL constructor Adds ClusterTLSOptionsFromRelayURL for deriving the leaf SAN from a replica's relay URL, plus mesh tests covering TLS delivery, wrong-CA rejection, SAN mismatch, and mixed TLS/plaintext. Co-authored-by: Mux --- coderd/x/nats/tls.go | 22 ++++ coderd/x/nats/tls_internal_test.go | 51 +++++++++ coderd/x/nats/tls_mesh_internal_test.go | 143 ++++++++++++++++++++++++ 3 files changed, 216 insertions(+) create mode 100644 coderd/x/nats/tls_mesh_internal_test.go diff --git a/coderd/x/nats/tls.go b/coderd/x/nats/tls.go index ace26db434ba2..8b052df42fb04 100644 --- a/coderd/x/nats/tls.go +++ b/coderd/x/nats/tls.go @@ -10,6 +10,7 @@ import ( "crypto/x509/pkix" "math/big" "net" + "net/url" "time" "golang.org/x/xerrors" @@ -44,6 +45,27 @@ type ClusterTLSOptions struct { SANHost string } +// ClusterTLSOptionsFromRelayURL derives ClusterTLSOptions from this +// replica's relay URL, whose host is the address peers dial for +// cluster routes and therefore the leaf certificate's IP SAN. The +// relay host must be an IP address. +func ClusterTLSOptionsFromRelayURL(relayURL *url.URL, caCert *x509.Certificate, caKey crypto.Signer) (*ClusterTLSOptions, error) { + if relayURL == nil { + return nil, xerrors.New("cluster TLS: relay URL is required") + } + opts := &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: relayURL.Hostname(), + } + // Surface invalid options at construction time rather than at + // server startup. + if _, err := buildClusterTLSConfig(*opts); err != nil { + return nil, err + } + return opts, nil +} + // buildClusterTLSConfig mints an ephemeral ECDSA P-256 leaf certificate // signed by the configured CA and returns a *tls.Config suitable for // natsserver.ClusterOpts.TLSConfig. The same config serves both route diff --git a/coderd/x/nats/tls_internal_test.go b/coderd/x/nats/tls_internal_test.go index 8e5ea0b64d6a0..c82fb70ebb52e 100644 --- a/coderd/x/nats/tls_internal_test.go +++ b/coderd/x/nats/tls_internal_test.go @@ -9,6 +9,7 @@ import ( "crypto/x509/pkix" "math/big" "net" + "net/url" "testing" "time" @@ -230,3 +231,53 @@ func Test_buildServerOptions_ClusterTLS(t *testing.T) { require.ErrorContains(t, err, "is not an IP address") }) } + +func Test_ClusterTLSOptionsFromRelayURL(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + + mustParse := func(raw string) *url.URL { + u, err := url.Parse(raw) + require.NoError(t, err) + return u + } + + t.Run("OK", func(t *testing.T) { + t.Parallel() + + opts, err := ClusterTLSOptionsFromRelayURL(mustParse("http://10.0.1.5:3457"), caCert, caKey) + require.NoError(t, err) + require.Equal(t, "10.0.1.5", opts.SANHost) + require.Equal(t, caCert, opts.CACert) + }) + + t.Run("IPv6", func(t *testing.T) { + t.Parallel() + + opts, err := ClusterTLSOptionsFromRelayURL(mustParse("http://[fd12:3456:789a::1]:3457"), caCert, caKey) + require.NoError(t, err) + require.Equal(t, "fd12:3456:789a::1", opts.SANHost) + }) + + t.Run("NilRelayURL", func(t *testing.T) { + t.Parallel() + + _, err := ClusterTLSOptionsFromRelayURL(nil, caCert, caKey) + require.ErrorContains(t, err, "relay URL is required") + }) + + t.Run("HostnameRelayURL", func(t *testing.T) { + t.Parallel() + + _, err := ClusterTLSOptionsFromRelayURL(mustParse("http://coderd-0.coderd:3457"), caCert, caKey) + require.ErrorContains(t, err, "is not an IP address") + }) + + t.Run("MissingCA", func(t *testing.T) { + t.Parallel() + + _, err := ClusterTLSOptionsFromRelayURL(mustParse("http://10.0.1.5:3457"), nil, nil) + require.ErrorContains(t, err, "CA certificate is required") + }) +} diff --git a/coderd/x/nats/tls_mesh_internal_test.go b/coderd/x/nats/tls_mesh_internal_test.go new file mode 100644 index 0000000000000..cc03124ed1029 --- /dev/null +++ b/coderd/x/nats/tls_mesh_internal_test.go @@ -0,0 +1,143 @@ +package nats + +import ( + "context" + "testing" + + natsserver "github.com/nats-io/nats-server/v2/server" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/testutil" +) + +func numRoutes(ps *Pubsub) int { + routez, err := ps.Server.Routez(&natsserver.RoutezOptions{}) + if err != nil { + return 0 + } + return len(routez.Routes) +} + +func TestPubsub_ClusterTLS(t *testing.T) { + t.Parallel() + + t.Run("Mesh", func(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + opts := clusterTestOptions(t) + opts.ClusterTLS = &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "127.0.0.1", + } + + a := newTestPubsub(t, opts) + b := newTestPubsub(t, opts) + c := newTestPubsub(t, opts) + + addrA := clusterRouteAddress(t, a) + require.NoError(t, b.setPeerAddresses([]string{addrA})) + require.NoError(t, c.setPeerAddresses([]string{addrA})) + + ctx := testutil.Context(t, testutil.WaitLong) + received := make(chan string, 4) + cancelSub, err := c.Subscribe("tls-mesh", func(_ context.Context, msg []byte) { + select { + case received <- string(msg): + default: + } + }) + require.NoError(t, err) + defer cancelSub() + + // b -> a -> c crosses two TLS route hops (gossip meshes b and c + // through a). + waitForRouteSubscription(t, b, "tls-mesh") + require.NoError(t, b.Publish("tls-mesh", []byte("hello"))) + require.Equal(t, "hello", testutil.TryReceive(ctx, t, received)) + }) + + t.Run("WrongCA", func(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + otherCACert, otherCAKey := generateTestCA(t) + + optsA := clusterTestOptions(t) + optsA.ClusterTLS = &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "127.0.0.1", + } + a := newTestPubsub(t, optsA) + + optsB := optsA + optsB.ClusterTLS = &ClusterTLSOptions{ + CACert: otherCACert, + CAKey: otherCAKey, + SANHost: "127.0.0.1", + } + b := newTestPubsub(t, optsB) + + require.NoError(t, b.setPeerAddresses([]string{clusterRouteAddress(t, a)})) + require.Never(t, func() bool { + return numRoutes(a) > 0 || numRoutes(b) > 0 + }, testutil.WaitShort, testutil.IntervalMedium) + }) + + t.Run("SANMismatch", func(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + opts := clusterTestOptions(t) + opts.ClusterTLS = &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "127.0.0.1", + } + a := newTestPubsub(t, opts) + + // b's leaf is signed by the same CA but for a different IP. SAN + // verification happens on the dialing side (the accept side + // checks only the chain, as Go does not SAN-check client + // certs), so a must dial b to hit b's mismatched SAN. + optsB := opts + optsB.ClusterTLS = &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "10.99.99.99", + } + b := newTestPubsub(t, optsB) + + require.NoError(t, a.setPeerAddresses([]string{clusterRouteAddress(t, b)})) + require.Never(t, func() bool { + return numRoutes(a) > 0 || numRoutes(b) > 0 + }, testutil.WaitShort, testutil.IntervalMedium) + }) + + t.Run("MixedTLSAndPlaintext", func(t *testing.T) { + t.Parallel() + + caCert, caKey := generateTestCA(t) + optsTLS := clusterTestOptions(t) + optsTLS.ClusterTLS = &ClusterTLSOptions{ + CACert: caCert, + CAKey: caKey, + SANHost: "127.0.0.1", + } + a := newTestPubsub(t, optsTLS) + + optsPlain := optsTLS + optsPlain.ClusterTLS = nil + b := newTestPubsub(t, optsPlain) + + // Routes cannot form in either direction; rollout must enable + // TLS on every replica of a deployment. + require.NoError(t, b.setPeerAddresses([]string{clusterRouteAddress(t, a)})) + require.NoError(t, a.setPeerAddresses([]string{clusterRouteAddress(t, b)})) + require.Never(t, func() bool { + return numRoutes(a) > 0 || numRoutes(b) > 0 + }, testutil.WaitShort, testutil.IntervalMedium) + }) +} From 978710a95668654f9c7be692178bbdf8f414a480 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 12 Jun 2026 22:18:37 +0000 Subject: [PATCH 4/5] feat: add nats_ca crypto key feature for NATS cluster mTLS CA Adds cryptokeys support for a CA that will sign the ephemeral leaf certificates replicas use for NATS cluster mTLS: - New crypto_key_feature enum value nats_ca; the rotator generates a self-signed ECDSA P-256 CA (PEM cert+key bundle in the secret column) and rotates it on the shared key duration with a 30 day token duration matching the maximum leaf lifetime. - FetchNATSCA accessor returns the active CA (parsed cert and signer) plus a trust bundle of all valid CA rows and the active row sequence so callers can detect rotation. It creates the CA on first fetch under an advisory lock since the NATS pubsub is constructed before the rotator starts. - The CA bundle contains a private key and stays excluded from the wsproxy crypto-keys endpoint allowlist and the signing/encryption keycaches. The feature is dormant: nothing fetches the CA until cli/server.go is wired up in a follow-up branch. --- coderd/cryptokeys/ca.go | 272 ++++++++++++++++++ coderd/cryptokeys/ca_internal_test.go | 182 ++++++++++++ coderd/cryptokeys/rotate.go | 13 +- coderd/cryptokeys/rotate_internal_test.go | 63 +++- coderd/database/dbgen/dbgen.go | 40 +++ coderd/database/dump.sql | 3 +- coderd/database/lock.go | 1 + ...000520_nats_ca_crypto_key_feature.down.sql | 16 ++ .../000520_nats_ca_crypto_key_feature.up.sql | 1 + coderd/database/models.go | 5 +- enterprise/coderd/workspaceproxy_test.go | 6 + 11 files changed, 597 insertions(+), 5 deletions(-) create mode 100644 coderd/cryptokeys/ca.go create mode 100644 coderd/cryptokeys/ca_internal_test.go create mode 100644 coderd/database/migrations/000520_nats_ca_crypto_key_feature.down.sql create mode 100644 coderd/database/migrations/000520_nats_ca_crypto_key_feature.up.sql diff --git a/coderd/cryptokeys/ca.go b/coderd/cryptokeys/ca.go new file mode 100644 index 0000000000000..05f89477c50e9 --- /dev/null +++ b/coderd/cryptokeys/ca.go @@ -0,0 +1,272 @@ +package cryptokeys + +import ( + "context" + "crypto" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "database/sql" + "encoding/pem" + "math/big" + "time" + + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/database/dbtime" +) + +const ( + caCertPEMBlockType = "CERTIFICATE" + caKeyPEMBlockType = "EC PRIVATE KEY" +) + +// NATSCA is the parsed state of the nats_ca crypto key feature at one point in +// time. The CA signs the ephemeral leaf certificates that replicas use for +// NATS cluster mTLS. +// +// Callers that need to react to CA rotation (re-minting leaves and reloading +// the NATS server config) should poll FetchNATSCA and compare Sequence to +// detect when the active CA has changed. +type NATSCA struct { + // Sequence is the crypto_keys sequence of the active row. + Sequence int32 + // Cert is the active CA certificate used to sign leaf certificates. + Cert *x509.Certificate + // Key is the active CA private key. + Key crypto.Signer + // TrustBundle contains the certificates of all currently valid CA rows, + // including Cert. During a rotation overlap window it has two entries; + // installing the full bundle as the trust root lets replicas on either + // side of a rotation verify each other. + TrustBundle []*x509.Certificate +} + +// FetchNATSCA returns the current NATS cluster CA, creating it if no valid CA +// exists. The NATS pubsub is constructed before the key rotator starts, so on +// fresh deployments the CA row will not exist at first fetch; creation here is +// guarded by an advisory lock and is idempotent under concurrent callers. +// After creation the rotator owns the key's lifecycle. +func FetchNATSCA(ctx context.Context, db database.Store) (*NATSCA, error) { + //nolint:gocritic // The CA accessor requires the same crypto key access as the rotator. + ctx = dbauthz.AsKeyRotator(ctx) + + now := dbtime.Now() + + keys, err := db.GetCryptoKeysByFeature(ctx, database.CryptoKeyFeatureNatsCa) + if err != nil { + return nil, xerrors.Errorf("get crypto keys by feature: %w", err) + } + + ca, ok, err := parseNATSCAKeys(keys, now) + if err != nil { + return nil, err + } + if ok { + return ca, nil + } + + // No active CA exists. Create one inside a transaction under an advisory + // lock, re-checking after the lock is acquired so that concurrent callers + // insert exactly one row. This mirrors rotator.rotateKeys. + err = db.InTx(func(tx database.Store) error { + err := tx.AcquireLock(ctx, database.LockIDNATSCACreate) + if err != nil { + return xerrors.Errorf("acquire lock: %w", err) + } + + keys, err = tx.GetCryptoKeysByFeature(ctx, database.CryptoKeyFeatureNatsCa) + if err != nil { + return xerrors.Errorf("get crypto keys by feature: %w", err) + } + + // Recompute now after acquiring the lock: a concurrent creator may + // have committed a row with a StartsAt later than the time captured + // before we blocked on the lock. + now = dbtime.Now() + var ok bool + ca, ok, err = parseNATSCAKeys(keys, now) + if err != nil { + return err + } + if ok { + return nil + } + + secret, err := generateCASecret(now) + if err != nil { + return xerrors.Errorf("generate CA secret: %w", err) + } + + latestKey, err := tx.GetLatestCryptoKeyByFeature(ctx, database.CryptoKeyFeatureNatsCa) + if err != nil && !xerrors.Is(err, sql.ErrNoRows) { + return xerrors.Errorf("get latest key: %w", err) + } + + newKey, err := tx.InsertCryptoKey(ctx, database.InsertCryptoKeyParams{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: latestKey.Sequence + 1, + Secret: sql.NullString{ + String: secret, + Valid: true, + }, + // Set by dbcrypt if it's required. + SecretKeyID: sql.NullString{}, + StartsAt: now, + }) + if err != nil { + return xerrors.Errorf("insert crypto key: %w", err) + } + + ca, ok, err = parseNATSCAKeys([]database.CryptoKey{newKey}, now) + if err != nil { + return err + } + if !ok { + return xerrors.New("inserted NATS CA is not usable for signing") + } + return nil + }, &database.TxOptions{ + // Read committed (the default) is required here: with repeatable + // read, the snapshot is taken before the advisory lock is granted, + // so the post-lock re-check would not see a row committed by a + // concurrent creator and we would insert a duplicate. + TxIdentifier: "fetch_nats_ca", + }) + if err != nil { + return nil, err + } + return ca, nil +} + +// parseNATSCAKeys builds a NATSCA from the database rows for the nats_ca +// feature. Rows must be ordered by sequence descending (the order returned by +// GetCryptoKeysByFeature). The active CA is the newest row that is usable for +// signing; the trust bundle contains the certificates of every row that is +// still valid for verification. The boolean reports whether a row could act +// as the active CA. +func parseNATSCAKeys(keys []database.CryptoKey, now time.Time) (*NATSCA, bool, error) { + ca := &NATSCA{} + for _, key := range keys { + if !key.CanVerify(now) { + continue + } + cert, signer, err := parseCASecret(key.Secret.String) + if err != nil { + return nil, false, xerrors.Errorf("parse CA secret for sequence %d: %w", key.Sequence, err) + } + ca.TrustBundle = append(ca.TrustBundle, cert) + if ca.Cert == nil && key.CanSign(now) { + ca.Sequence = key.Sequence + ca.Cert = cert + ca.Key = signer + } + } + if ca.Cert == nil { + return nil, false, nil + } + return ca, true, nil +} + +// generateCASecret generates a new self-signed CA certificate and private key +// for signing NATS cluster leaf certificates, PEM-encoded into a single +// bundle for storage in the crypto_keys secret column. +// +// The certificate outlives the key row on purpose: a row is rotated after +// DefaultKeyDuration but remains a valid trust root until its deletes_at +// (an hour plus NATSCATokenDuration after rotation), and leaves minted just +// before rotation live for up to NATSCATokenDuration. +func generateCASecret(now time.Time) (string, error) { + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return "", xerrors.Errorf("generate key: %w", err) + } + + // 128-bit random serial per CA/Browser Forum conventions. + serial, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 128)) + if err != nil { + return "", xerrors.Errorf("generate serial: %w", err) + } + + template := &x509.Certificate{ + SerialNumber: serial, + Subject: pkix.Name{ + CommonName: "coder-nats-ca", + }, + // Backdate NotBefore to tolerate clock skew between replicas. + NotBefore: now.Add(-time.Hour), + NotAfter: now.Add(DefaultKeyDuration + NATSCATokenDuration + time.Hour), + KeyUsage: x509.KeyUsageCertSign, + BasicConstraintsValid: true, + IsCA: true, + MaxPathLenZero: true, + } + + der, err := x509.CreateCertificate(rand.Reader, template, template, key.Public(), key) + if err != nil { + return "", xerrors.Errorf("create certificate: %w", err) + } + + keyDER, err := x509.MarshalECPrivateKey(key) + if err != nil { + return "", xerrors.Errorf("marshal private key: %w", err) + } + + var secret []byte + secret = append(secret, pem.EncodeToMemory(&pem.Block{Type: caCertPEMBlockType, Bytes: der})...) + secret = append(secret, pem.EncodeToMemory(&pem.Block{Type: caKeyPEMBlockType, Bytes: keyDER})...) + return string(secret), nil +} + +// parseCASecret parses a PEM bundle produced by generateCASecret back into +// the CA certificate and private key. +func parseCASecret(secret string) (*x509.Certificate, crypto.Signer, error) { + var ( + cert *x509.Certificate + key *ecdsa.PrivateKey + ) + rest := []byte(secret) + for { + var block *pem.Block + block, rest = pem.Decode(rest) + if block == nil { + break + } + switch block.Type { + case caCertPEMBlockType: + if cert != nil { + return nil, nil, xerrors.New("multiple certificates in CA secret") + } + var err error + cert, err = x509.ParseCertificate(block.Bytes) + if err != nil { + return nil, nil, xerrors.Errorf("parse certificate: %w", err) + } + case caKeyPEMBlockType: + if key != nil { + return nil, nil, xerrors.New("multiple private keys in CA secret") + } + var err error + key, err = x509.ParseECPrivateKey(block.Bytes) + if err != nil { + return nil, nil, xerrors.Errorf("parse private key: %w", err) + } + default: + return nil, nil, xerrors.Errorf("unexpected PEM block type: %q", block.Type) + } + } + if cert == nil { + return nil, nil, xerrors.New("no certificate in CA secret") + } + if key == nil { + return nil, nil, xerrors.New("no private key in CA secret") + } + if !key.PublicKey.Equal(cert.PublicKey) { + return nil, nil, xerrors.New("private key does not match certificate") + } + return cert, key, nil +} diff --git a/coderd/cryptokeys/ca_internal_test.go b/coderd/cryptokeys/ca_internal_test.go new file mode 100644 index 0000000000000..bdd28eb49d4e8 --- /dev/null +++ b/coderd/cryptokeys/ca_internal_test.go @@ -0,0 +1,182 @@ +package cryptokeys + +import ( + "crypto/x509" + "database/sql" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbgen" + "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/testutil" +) + +func TestCASecretRoundTrip(t *testing.T) { + t.Parallel() + + now := time.Now().UTC().Truncate(time.Second) + secret, err := generateCASecret(now) + require.NoError(t, err) + + cert, signer, err := parseCASecret(secret) + require.NoError(t, err) + + require.True(t, cert.IsCA) + require.True(t, cert.BasicConstraintsValid) + require.True(t, cert.MaxPathLenZero) + require.Equal(t, x509.KeyUsageCertSign, cert.KeyUsage) + require.Equal(t, now.Add(-time.Hour), cert.NotBefore) + require.Equal(t, now.Add(DefaultKeyDuration+NATSCATokenDuration+time.Hour), cert.NotAfter) + require.Equal(t, cert.PublicKey, signer.Public()) + + // The cert must be able to verify itself as a trust root. + pool := x509.NewCertPool() + pool.AddCert(cert) + _, err = cert.Verify(x509.VerifyOptions{Roots: pool}) + require.NoError(t, err) +} + +func TestParseCASecretErrors(t *testing.T) { + t.Parallel() + + _, _, err := parseCASecret("") + require.ErrorContains(t, err, "no certificate") + + _, _, err = parseCASecret("not pem at all") + require.ErrorContains(t, err, "no certificate") +} + +func TestFetchNATSCA(t *testing.T) { + t.Parallel() + + t.Run("CreatesWhenMissing", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + + ca, err := FetchNATSCA(ctx, db) + require.NoError(t, err) + require.NotNil(t, ca.Cert) + require.NotNil(t, ca.Key) + require.Len(t, ca.TrustBundle, 1) + require.Equal(t, ca.Cert, ca.TrustBundle[0]) + + // A second fetch returns the same CA without inserting another row. + again, err := FetchNATSCA(ctx, db) + require.NoError(t, err) + require.Equal(t, ca.Sequence, again.Sequence) + require.Equal(t, ca.Cert.Raw, again.Cert.Raw) + + keys, err := db.GetCryptoKeysByFeature(ctx, database.CryptoKeyFeatureNatsCa) + require.NoError(t, err) + require.Len(t, keys, 1) + }) + + t.Run("ConcurrentCreate", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitLong) + + const fetchers = 8 + cas := make([]*NATSCA, fetchers) + errs := make([]error, fetchers) + var wg sync.WaitGroup + for i := range fetchers { + wg.Add(1) + go func() { + defer wg.Done() + cas[i], errs[i] = FetchNATSCA(ctx, db) + }() + } + wg.Wait() + + for i := range fetchers { + require.NoError(t, errs[i]) + require.Equal(t, cas[0].Sequence, cas[i].Sequence) + require.Equal(t, cas[0].Cert.Raw, cas[i].Cert.Raw) + } + + keys, err := db.GetCryptoKeysByFeature(ctx, database.CryptoKeyFeatureNatsCa) + require.NoError(t, err) + require.Len(t, keys, 1) + }) + + t.Run("RotationOverlap", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + now := time.Now().UTC() + + // Old CA scheduled for deletion in the future: still a trust root, + // no longer the active signer. + oldKey := dbgen.CryptoKey(t, db, database.CryptoKey{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: 1, + StartsAt: now.Add(-2 * time.Hour), + DeletesAt: sql.NullTime{Time: now.Add(time.Hour), Valid: true}, + }) + newKey := dbgen.CryptoKey(t, db, database.CryptoKey{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: 2, + StartsAt: now.Add(-time.Hour), + }) + // Deleted key: excluded entirely. + deletedKey := dbgen.CryptoKey(t, db, database.CryptoKey{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: 3, + StartsAt: now.Add(-3 * time.Hour), + DeletesAt: sql.NullTime{Time: now.Add(-time.Hour), Valid: true}, + }) + + ca, err := FetchNATSCA(ctx, db) + require.NoError(t, err) + require.Equal(t, newKey.Sequence, ca.Sequence) + + newCert, _, err := parseCASecret(newKey.Secret.String) + require.NoError(t, err) + oldCert, _, err := parseCASecret(oldKey.Secret.String) + require.NoError(t, err) + deletedCert, _, err := parseCASecret(deletedKey.Secret.String) + require.NoError(t, err) + + require.Equal(t, newCert.Raw, ca.Cert.Raw) + require.Len(t, ca.TrustBundle, 2) + bundle := [][]byte{ca.TrustBundle[0].Raw, ca.TrustBundle[1].Raw} + require.Contains(t, bundle, newCert.Raw) + require.Contains(t, bundle, oldCert.Raw) + require.NotContains(t, bundle, deletedCert.Raw) + }) + + t.Run("FutureKeyNotActive", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + now := time.Now().UTC() + + current := dbgen.CryptoKey(t, db, database.CryptoKey{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: 1, + StartsAt: now.Add(-time.Hour), + }) + // A rotated-in key that hasn't started yet must not be the active + // signer, but its cert belongs in the trust bundle. + _ = dbgen.CryptoKey(t, db, database.CryptoKey{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: 2, + StartsAt: now.Add(time.Hour), + }) + + ca, err := FetchNATSCA(ctx, db) + require.NoError(t, err) + require.Equal(t, current.Sequence, ca.Sequence) + require.Len(t, ca.TrustBundle, 2) + }) +} diff --git a/coderd/cryptokeys/rotate.go b/coderd/cryptokeys/rotate.go index e768d53273dd3..f6848c2b2b835 100644 --- a/coderd/cryptokeys/rotate.go +++ b/coderd/cryptokeys/rotate.go @@ -20,6 +20,11 @@ const ( WorkspaceAppsTokenDuration = time.Minute OIDCConvertTokenDuration = time.Minute * 5 TailnetResumeTokenDuration = time.Hour * 24 + // NATSCATokenDuration is the maximum lifetime of a leaf certificate + // minted under the NATS cluster CA. Old CA rows must remain valid trust + // roots for this long after rotation so that replicas holding leaves + // signed by the old CA can still be verified. + NATSCATokenDuration = time.Hour * 24 * 30 // defaultRotationInterval is the default interval at which keys are checked for rotation. defaultRotationInterval = time.Minute * 10 @@ -170,7 +175,7 @@ func (k *rotator) rotateKeys(ctx context.Context) error { } func (k *rotator) insertNewKey(ctx context.Context, tx database.Store, feature database.CryptoKeyFeature, startsAt time.Time) (database.CryptoKey, error) { - secret, err := generateNewSecret(feature) + secret, err := generateNewSecret(feature, startsAt) if err != nil { return database.CryptoKey{}, xerrors.Errorf("generate new secret: %w", err) } @@ -227,7 +232,7 @@ func (k *rotator) rotateKey(ctx context.Context, tx database.Store, key database return []database.CryptoKey{updatedKey, newKey}, nil } -func generateNewSecret(feature database.CryptoKeyFeature) (string, error) { +func generateNewSecret(feature database.CryptoKeyFeature, startsAt time.Time) (string, error) { switch feature { case database.CryptoKeyFeatureWorkspaceAppsAPIKey: return generateKey(32) @@ -237,6 +242,8 @@ func generateNewSecret(feature database.CryptoKeyFeature) (string, error) { return generateKey(64) case database.CryptoKeyFeatureTailnetResume: return generateKey(64) + case database.CryptoKeyFeatureNatsCa: + return generateCASecret(startsAt) } return "", xerrors.Errorf("unknown feature: %s", feature) } @@ -260,6 +267,8 @@ func tokenDuration(feature database.CryptoKeyFeature) time.Duration { return OIDCConvertTokenDuration case database.CryptoKeyFeatureTailnetResume: return TailnetResumeTokenDuration + case database.CryptoKeyFeatureNatsCa: + return NATSCATokenDuration default: return 0 } diff --git a/coderd/cryptokeys/rotate_internal_test.go b/coderd/cryptokeys/rotate_internal_test.go index a8202320aea09..862a5d9554a09 100644 --- a/coderd/cryptokeys/rotate_internal_test.go +++ b/coderd/cryptokeys/rotate_internal_test.go @@ -104,6 +104,58 @@ func Test_rotateKeys(t *testing.T) { require.Equal(t, newKey, keys[0]) }) + t.Run("RotatesNATSCA", func(t *testing.T) { + t.Parallel() + + var ( + db, _ = dbtestutil.NewDB(t) + clock = quartz.NewMock(t) + keyDuration = time.Hour * 24 * 7 + logger = testutil.Logger(t) + ctx = testutil.Context(t, testutil.WaitShort) + ) + + kr := &rotator{ + db: db, + keyDuration: keyDuration, + clock: clock, + logger: logger, + features: []database.CryptoKeyFeature{ + database.CryptoKeyFeatureNatsCa, + }, + } + + now := dbnow(clock) + + oldKey := dbgen.CryptoKey(t, db, database.CryptoKey{ + Feature: database.CryptoKeyFeatureNatsCa, + StartsAt: now, + Sequence: 4, + }) + + // Advance the window to just inside rotation time. + _ = clock.Advance(keyDuration - time.Minute*59) + err := kr.rotateKeys(ctx) + require.NoError(t, err) + + // The old CA must remain a valid trust root for the maximum leaf + // lifetime after rotation. + expectedDeletesAt := oldKey.ExpiresAt(keyDuration).Add(NATSCATokenDuration + time.Hour) + oldKey, err = db.GetCryptoKeyByFeatureAndSequence(ctx, database.GetCryptoKeyByFeatureAndSequenceParams{ + Feature: oldKey.Feature, + Sequence: oldKey.Sequence, + }) + require.NoError(t, err) + require.Equal(t, expectedDeletesAt, oldKey.DeletesAt.Time.UTC()) + + newKey, err := db.GetCryptoKeyByFeatureAndSequence(ctx, database.GetCryptoKeyByFeatureAndSequenceParams{ + Feature: database.CryptoKeyFeatureNatsCa, + Sequence: oldKey.Sequence + 1, + }) + require.NoError(t, err) + requireKey(t, newKey, database.CryptoKeyFeatureNatsCa, oldKey.ExpiresAt(keyDuration), nullTime, oldKey.Sequence+1) + }) + t.Run("DoesNotRotateValidKeys", func(t *testing.T) { t.Parallel() @@ -407,7 +459,7 @@ func Test_rotateKeys(t *testing.T) { keys, err := db.GetCryptoKeys(ctx) require.NoError(t, err) - require.Len(t, keys, 5) + require.Len(t, keys, 6) kbf, err := keysByFeature(keys, database.AllCryptoKeyFeatureValues()) require.NoError(t, err) @@ -420,6 +472,7 @@ func Test_rotateKeys(t *testing.T) { // caused a key to be inserted. require.Len(t, kbf[database.CryptoKeyFeatureTailnetResume], 1) require.Len(t, kbf[database.CryptoKeyFeatureWorkspaceAppsToken], 1) + require.Len(t, kbf[database.CryptoKeyFeatureNatsCa], 1) oidcKey := kbf[database.CryptoKeyFeatureOIDCConvert][0] tailnetKey := kbf[database.CryptoKeyFeatureTailnetResume][0] @@ -586,6 +639,14 @@ func requireKey(t *testing.T, key database.CryptoKey, feature database.CryptoKey require.Equal(t, deletesAt.Time.UTC(), key.DeletesAt.Time.UTC()) require.Equal(t, sequence, key.Sequence) + // The NATS CA secret is a PEM bundle rather than hex-encoded bytes. + if key.Feature == database.CryptoKeyFeatureNatsCa { + cert, _, err := parseCASecret(key.Secret.String) + require.NoError(t, err) + require.True(t, cert.IsCA) + return + } + secret, err := hex.DecodeString(key.Secret.String) require.NoError(t, err) diff --git a/coderd/database/dbgen/dbgen.go b/coderd/database/dbgen/dbgen.go index 3955220efeeda..6ed85c122d487 100644 --- a/coderd/database/dbgen/dbgen.go +++ b/coderd/database/dbgen/dbgen.go @@ -2,13 +2,19 @@ package dbgen import ( "context" + "crypto/ecdsa" + "crypto/elliptic" "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" "database/sql" "encoding/hex" "encoding/json" + "encoding/pem" "errors" "fmt" "maps" + "math/big" "net" "strings" "testing" @@ -2208,10 +2214,44 @@ func newCryptoKeySecret(feature database.CryptoKeyFeature) (string, error) { return generateCryptoKey(64) case database.CryptoKeyFeatureTailnetResume: return generateCryptoKey(64) + case database.CryptoKeyFeatureNatsCa: + return generateCACryptoKeySecret() } return "", xerrors.Errorf("unknown feature: %s", feature) } +// generateCACryptoKeySecret generates a self-signed CA certificate and +// private key as a PEM bundle, matching the secret format that +// coderd/cryptokeys produces for the nats_ca feature. +func generateCACryptoKeySecret() (string, error) { + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return "", xerrors.Errorf("generate key: %w", err) + } + template := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "dbgen-ca"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageCertSign, + BasicConstraintsValid: true, + IsCA: true, + MaxPathLenZero: true, + } + der, err := x509.CreateCertificate(rand.Reader, template, template, key.Public(), key) + if err != nil { + return "", xerrors.Errorf("create certificate: %w", err) + } + keyDER, err := x509.MarshalECPrivateKey(key) + if err != nil { + return "", xerrors.Errorf("marshal private key: %w", err) + } + var secret []byte + secret = append(secret, pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der})...) + secret = append(secret, pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER})...) + return string(secret), nil +} + func generateCryptoKey(length int) (string, error) { b := make([]byte, length) _, err := rand.Read(b) diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 6f4d588d67f7a..384f1adf9e1bf 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -373,7 +373,8 @@ CREATE TYPE crypto_key_feature AS ENUM ( 'workspace_apps_token', 'workspace_apps_api_key', 'oidc_convert', - 'tailnet_resume' + 'tailnet_resume', + 'nats_ca' ); CREATE TYPE display_app AS ENUM ( diff --git a/coderd/database/lock.go b/coderd/database/lock.go index 8d0894abc8756..fa3a4ac20b763 100644 --- a/coderd/database/lock.go +++ b/coderd/database/lock.go @@ -16,6 +16,7 @@ const ( LockIDReconcileSystemRoles LockIDBoundaryUsageStats LockIDAIProvidersEnvSeed + LockIDNATSCACreate ) // GenLockID generates a unique and consistent lock ID from a given string. diff --git a/coderd/database/migrations/000520_nats_ca_crypto_key_feature.down.sql b/coderd/database/migrations/000520_nats_ca_crypto_key_feature.down.sql new file mode 100644 index 0000000000000..ec36128a5146a --- /dev/null +++ b/coderd/database/migrations/000520_nats_ca_crypto_key_feature.down.sql @@ -0,0 +1,16 @@ +DELETE FROM crypto_keys WHERE feature = 'nats_ca'; + +CREATE TYPE old_crypto_key_feature AS ENUM ( + 'workspace_apps_token', + 'workspace_apps_api_key', + 'oidc_convert', + 'tailnet_resume' +); + +ALTER TABLE crypto_keys + ALTER COLUMN feature TYPE old_crypto_key_feature + USING (feature::text::old_crypto_key_feature); + +DROP TYPE crypto_key_feature; + +ALTER TYPE old_crypto_key_feature RENAME TO crypto_key_feature; diff --git a/coderd/database/migrations/000520_nats_ca_crypto_key_feature.up.sql b/coderd/database/migrations/000520_nats_ca_crypto_key_feature.up.sql new file mode 100644 index 0000000000000..c37227451d263 --- /dev/null +++ b/coderd/database/migrations/000520_nats_ca_crypto_key_feature.up.sql @@ -0,0 +1 @@ +ALTER TYPE crypto_key_feature ADD VALUE IF NOT EXISTS 'nats_ca'; diff --git a/coderd/database/models.go b/coderd/database/models.go index e52d5f6c4a96d..89ee4e41ce994 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1884,6 +1884,7 @@ const ( CryptoKeyFeatureWorkspaceAppsAPIKey CryptoKeyFeature = "workspace_apps_api_key" CryptoKeyFeatureOIDCConvert CryptoKeyFeature = "oidc_convert" CryptoKeyFeatureTailnetResume CryptoKeyFeature = "tailnet_resume" + CryptoKeyFeatureNatsCa CryptoKeyFeature = "nats_ca" ) func (e *CryptoKeyFeature) Scan(src interface{}) error { @@ -1926,7 +1927,8 @@ func (e CryptoKeyFeature) Valid() bool { case CryptoKeyFeatureWorkspaceAppsToken, CryptoKeyFeatureWorkspaceAppsAPIKey, CryptoKeyFeatureOIDCConvert, - CryptoKeyFeatureTailnetResume: + CryptoKeyFeatureTailnetResume, + CryptoKeyFeatureNatsCa: return true } return false @@ -1938,6 +1940,7 @@ func AllCryptoKeyFeatureValues() []CryptoKeyFeature { CryptoKeyFeatureWorkspaceAppsAPIKey, CryptoKeyFeatureOIDCConvert, CryptoKeyFeatureTailnetResume, + CryptoKeyFeatureNatsCa, } } diff --git a/enterprise/coderd/workspaceproxy_test.go b/enterprise/coderd/workspaceproxy_test.go index 41956485521b8..a9cf27fe71b47 100644 --- a/enterprise/coderd/workspaceproxy_test.go +++ b/enterprise/coderd/workspaceproxy_test.go @@ -1092,6 +1092,12 @@ func TestGetCryptoKeys(t *testing.T) { require.Error(t, err) require.ErrorAs(t, err, &sdkErr) require.Equal(t, http.StatusBadRequest, sdkErr.StatusCode()) + // The NATS cluster CA bundle contains a private key and must never be + // served to workspace proxies. + _, err = proxy.SDKClient.CryptoKeys(ctx, codersdk.CryptoKeyFeature(database.CryptoKeyFeatureNatsCa)) + require.Error(t, err) + require.ErrorAs(t, err, &sdkErr) + require.Equal(t, http.StatusBadRequest, sdkErr.StatusCode()) _, err = proxy.SDKClient.CryptoKeys(ctx, "invalid") require.Error(t, err) require.ErrorAs(t, err, &sdkErr) From dff598b02eb6334aeed150c3e0db36d6ad77a2dc Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 19 Jun 2026 21:36:55 +0000 Subject: [PATCH 5/5] feat(cli): wire NATS cluster mTLS into server startup When the NATS pubsub experiment is enabled and a DERP relay URL is configured (HA), enable mutual TLS on inter-replica cluster routes. The per-replica leaf certificate's IP SAN is derived from the relay URL host (the address peers dial), and the cluster CA is fetched from cryptokeys. Adds an integration test that meshes three nodes over TLS using a CA minted by cryptokeys.FetchNATSCA and verifies a cross-route pubsub round-trip. Co-authored-by: Mux --- cli/server.go | 24 ++++++- .../x/nats/tls_integration_internal_test.go | 71 +++++++++++++++++++ 2 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 coderd/x/nats/tls_integration_internal_test.go diff --git a/cli/server.go b/cli/server.go index 1dbdc5a152c53..350f1ef12b53d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -66,6 +66,7 @@ import ( "github.com/coder/coder/v2/coderd" "github.com/coder/coder/v2/coderd/aibridged" "github.com/coder/coder/v2/coderd/autobuild" + "github.com/coder/coder/v2/coderd/cryptokeys" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/awsiamrds" "github.com/coder/coder/v2/coderd/database/dbauthz" @@ -792,9 +793,26 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. // Use NATS for pubsub if the experiment is enabled. if experiments.Enabled(codersdk.ExperimentNATSPubsub) { token := fmt.Sprintf("%x", sha256.Sum256([]byte(dbURL))) - natsps, err := nats.New(ctx, logger.Named("pubsub"), nats.Options{ - ClusterAuthToken: token, - }) + natsOpts := nats.Options{ClusterAuthToken: token} + + // Enable mutual TLS on inter-replica cluster routes when a + // relay URL is configured (i.e. HA). The per-replica leaf + // certificate's IP-SAN is derived from the relay URL host, + // which is the address peers dial for cluster routes. + if vals.DERP.Server.RelayURL.String() != "" { + ca, err := cryptokeys.FetchNATSCA(ctx, options.Database) + if err != nil { + return xerrors.Errorf("fetch nats cluster CA: %w", err) + } + clusterTLS, err := nats.ClusterTLSOptionsFromRelayURL( + vals.DERP.Server.RelayURL.Value(), ca.Cert, ca.Key) + if err != nil { + return xerrors.Errorf("configure nats cluster TLS: %w", err) + } + natsOpts.ClusterTLS = clusterTLS + } + + natsps, err := nats.New(ctx, logger.Named("pubsub"), natsOpts) if err != nil { return xerrors.Errorf("create nats pubsub: %w", err) } diff --git a/coderd/x/nats/tls_integration_internal_test.go b/coderd/x/nats/tls_integration_internal_test.go new file mode 100644 index 0000000000000..3ce7b6b2240b3 --- /dev/null +++ b/coderd/x/nats/tls_integration_internal_test.go @@ -0,0 +1,71 @@ +package nats + +import ( + "context" + "net/url" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/coder/coder/v2/coderd/cryptokeys" + "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/testutil" +) + +// TestPubsub_ClusterTLS_RealCA stands up a three-node TLS mesh whose trust +// root is a real CA fetched from cryptokeys (create-at-fetch against a real +// DB), then verifies a cross-route publish/subscribe round-trip. This +// exercises the integration seam between the cryptokeys CA and the x/nats +// cluster TLS constructor, including the real PEM/x509 round-trip that the +// synthetic generateTestCA helper does not cover. +// +// The three-node star topology (b and c both peer with a, message flows +// b -> a -> c) mirrors TestPubsub_ClusterTLS/Mesh. A two-node single-route +// topology is avoided because route-interest propagation across a lone NATS +// route is timing-sensitive and makes such tests flaky. +func TestPubsub_ClusterTLS_RealCA(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitLong) + + // Real CA from the cryptokeys accessor. On an empty DB this creates the + // nats_ca row under an advisory lock and returns the parsed cert+key. + ca, err := cryptokeys.FetchNATSCA(ctx, db) + require.NoError(t, err) + + // Nodes mesh on loopback, so the leaf IP-SAN must be 127.0.0.1. Driving + // it through ClusterTLSOptionsFromRelayURL also exercises the production + // seam (relay URL host -> SANHost). + relayURL, err := url.Parse("nats://127.0.0.1:6222") + require.NoError(t, err) + tlsOpts, err := ClusterTLSOptionsFromRelayURL(relayURL, ca.Cert, ca.Key) + require.NoError(t, err) + + opts := clusterTestOptions(t) + opts.ClusterTLS = tlsOpts + + a := newTestPubsub(t, opts) + b := newTestPubsub(t, opts) + c := newTestPubsub(t, opts) + + addrA := clusterRouteAddress(t, a) + require.NoError(t, b.setPeerAddresses([]string{addrA})) + require.NoError(t, c.setPeerAddresses([]string{addrA})) + + received := make(chan string, 4) + cancelSub, err := c.Subscribe("tls-realca", func(_ context.Context, msg []byte) { + select { + case received <- string(msg): + default: + } + }) + require.NoError(t, err) + defer cancelSub() + + // b -> a -> c crosses two TLS route hops (gossip meshes b and c + // through a). + waitForRouteSubscription(t, b, "tls-realca") + require.NoError(t, b.Publish("tls-realca", []byte("hello"))) + require.Equal(t, "hello", testutil.TryReceive(ctx, t, received)) +}