Skip to content

Commit 85f56e4

Browse files
dannykoppingclaude
andauthored
fix: recreate ai_provider_type instead of ADD VALUE (coder#25895)
Coder runs all migrations in a single transaction (`pgTxnDriver`). Postgres forbids using an enum value added by `ALTER TYPE ... ADD VALUE` within the same transaction that added it. Migration `000499` widened `ai_provider_type` with `ADD VALUE`, and `000504` casts existing `chat_providers` rows to that enum in the same transaction. On deployments with a legacy provider using one of the new values (for example `openai-compat`), the batch failed with `unsafe use of new value` and the server could not start. Recreate the type (create a new enum, alter the column, drop and rename) instead of using `ADD VALUE`, matching the existing precedent in `000144_user_status_dormant`. A freshly created enum's values are usable immediately in the same transaction, so the cast in `000504` succeeds. The resulting schema is identical, so `make gen` produces no `dump.sql` diff and databases that already applied these migrations see no drift. Added a regression test that seeds an `openai-compat` provider and applies `000499` through `000504` in a single transaction, reproducing the production path. The per-step `Stepper` used by the other migration tests commits each migration separately and cannot surface this class of bug. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Signed-off-by: Danny Kopping <danny@coder.com> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent a85462b commit 85f56e4

4 files changed

Lines changed: 143 additions & 9 deletions

File tree

.claude/docs/DATABASE.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,41 @@
4141
dimension.
4242
- Avoid `ByX` names for grouped queries.
4343

44+
### Enum Changes Run in a Single Transaction
45+
46+
All migrations run inside one transaction (`pgTxnDriver`). Postgres forbids
47+
*using* an enum value added by `ALTER TYPE ... ADD VALUE` within the same
48+
transaction that added it, so it fails with `unsafe use of new value`.
49+
50+
Adding the value is fine; using it in the same batch is not. "Using it"
51+
includes a later migration that casts to it (`col::my_enum`), inserts or
52+
updates a row with it, or sets it as a column default. This only fails when a
53+
row actually materializes the new value, so fresh databases and CI pass while
54+
deployments with existing data break.
55+
56+
**MUST DO**: If any migration uses a newly added enum value, recreate the type
57+
instead of using `ADD VALUE`. A freshly created enum's values are usable
58+
immediately in the same transaction. Precedent: `000144_user_status_dormant`.
59+
60+
```sql
61+
CREATE TYPE new_my_enum AS ENUM ('existing', 'value', 'new_value');
62+
63+
ALTER TABLE my_table
64+
ALTER COLUMN col TYPE new_my_enum USING (col::text::new_my_enum);
65+
66+
DROP TYPE my_enum;
67+
68+
ALTER TYPE new_my_enum RENAME TO my_enum;
69+
```
70+
71+
Recreating produces an identical schema, so `make gen` yields no `dump.sql`
72+
diff and databases that already applied the migration see no drift.
73+
74+
**Testing**: `migrations.Stepper` commits each migration separately, so tests
75+
built on it cannot surface this. To catch it, seed a row using the new value,
76+
then apply the affected migrations in a single transaction (see
77+
`TestMigration000504AIProvidersBackfillEnumInSingleTxn`).
78+
4479
## Handling Nullable Fields
4580

4681
Use `sql.NullString`, `sql.NullBool`, etc. for optional database fields:
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
-- No-op: Postgres does not allow removing enum values safely.
2-
-- Matches the precedent in 000495_ai_providers.down.sql for ALTER
3-
-- TYPE resource_type / api_key_scope ADD VALUE.
1+
-- No-op: the up recreates ai_provider_type with a wider value set, but the
2+
-- down does not narrow it back. Narrowing would drop rows that already use the
3+
-- new values, and 000495_ai_providers.down.sql drops the type wholesale when
4+
-- migrating all the way down.

coderd/database/migrations/000499_ai_provider_type_chatd_values.up.sql

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,27 @@
77
-- OpenAI-compatible endpoints. Native gateway-side support for these
88
-- providers comes later, at which point this enum already carries the
99
-- right discriminator and no further migration is needed.
10-
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'azure';
11-
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'bedrock';
12-
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'google';
13-
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'openai-compat';
14-
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'openrouter';
15-
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'vercel';
10+
--
11+
-- Recreate the type rather than using ALTER TYPE ... ADD VALUE. Postgres
12+
-- forbids using a value added by ADD VALUE within the same transaction, and
13+
-- all migrations run in one transaction. 000504 casts existing chat_providers
14+
-- rows to these new values in that same transaction, so ADD VALUE fails with
15+
-- "unsafe use of new value". A freshly created enum's values are usable
16+
-- immediately, so the cast in 000504 succeeds.
17+
CREATE TYPE new_ai_provider_type AS ENUM (
18+
'openai',
19+
'anthropic',
20+
'azure',
21+
'bedrock',
22+
'google',
23+
'openai-compat',
24+
'openrouter',
25+
'vercel'
26+
);
27+
28+
ALTER TABLE ai_providers
29+
ALTER COLUMN type TYPE new_ai_provider_type USING (type::text::new_ai_provider_type);
30+
31+
DROP TYPE ai_provider_type;
32+
33+
ALTER TYPE new_ai_provider_type RENAME TO ai_provider_type;

coderd/database/migrations/migrate_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"os"
88
"path/filepath"
99
"slices"
10+
"strings"
1011
"sync"
1112
"testing"
1213
"time"
@@ -1502,6 +1503,85 @@ func TestMigration000504AIProvidersBackfillOverridesNameConflict(t *testing.T) {
15021503
require.True(t, fresh.Enabled)
15031504
}
15041505

1506+
// TestMigration000504AIProvidersBackfillEnumInSingleTxn reproduces the
1507+
// production migration path, where every pending migration runs inside a
1508+
// single transaction (see pgTxnDriver). Migration 000499 widens
1509+
// ai_provider_type with ALTER TYPE ... ADD VALUE, and 000504 casts existing
1510+
// chat_providers rows to that enum. Postgres forbids using an enum value
1511+
// added by ADD VALUE within the same transaction, so when a legacy provider
1512+
// uses one of the new values (for example openai-compat) the batch fails with
1513+
// "unsafe use of new value". The per-step Stepper used by the other tests
1514+
// commits each migration separately and cannot surface this.
1515+
func TestMigration000504AIProvidersBackfillEnumInSingleTxn(t *testing.T) {
1516+
t.Parallel()
1517+
1518+
sqlDB := testSQLDB(t)
1519+
ctx := testutil.Context(t, testutil.WaitSuperLong)
1520+
1521+
// Apply everything through 498 and commit, so chat_providers exists and is
1522+
// populated before the batch under test runs, matching a deployment that
1523+
// ran an earlier migration batch before this one.
1524+
applyMigrationsInTxn(ctx, t, sqlDB, 1, 498)
1525+
1526+
now := time.Now().UTC().Truncate(time.Microsecond)
1527+
providerID := uuid.New()
1528+
1529+
// A legacy provider whose type is one of the values added in 000499.
1530+
_, err := sqlDB.ExecContext(ctx, `
1531+
INSERT INTO chat_providers (id, provider, display_name, api_key, enabled, base_url, created_at, updated_at)
1532+
VALUES ($1, 'openai-compat', 'OpenAI Compatible', '', TRUE, 'https://api.example.com/v1', $2, $2)
1533+
`, providerID, now)
1534+
require.NoError(t, err)
1535+
1536+
// Apply 000499 through 000504 in a single transaction, as production does.
1537+
applyMigrationsInTxn(ctx, t, sqlDB, 499, 504)
1538+
1539+
var typ string
1540+
err = sqlDB.QueryRowContext(ctx,
1541+
`SELECT type FROM ai_providers WHERE id = $1`, providerID,
1542+
).Scan(&typ)
1543+
require.NoError(t, err)
1544+
require.Equal(t, "openai-compat", typ)
1545+
}
1546+
1547+
// applyMigrationsInTxn executes the up SQL for every migration whose version is
1548+
// in [from, to] inside a single transaction, mirroring pgTxnDriver. The whole
1549+
// batch commits or rolls back together.
1550+
func applyMigrationsInTxn(ctx context.Context, t *testing.T, sqlDB *sql.DB, from, to int) {
1551+
t.Helper()
1552+
1553+
entries, err := os.ReadDir(".")
1554+
require.NoError(t, err)
1555+
1556+
var files []string
1557+
for _, entry := range entries {
1558+
name := entry.Name()
1559+
if !strings.HasSuffix(name, ".up.sql") {
1560+
continue
1561+
}
1562+
var version int
1563+
if _, err := fmt.Sscanf(name, "%06d_", &version); err != nil {
1564+
continue
1565+
}
1566+
if version >= from && version <= to {
1567+
files = append(files, name)
1568+
}
1569+
}
1570+
slices.Sort(files)
1571+
1572+
tx, err := sqlDB.BeginTx(ctx, nil)
1573+
require.NoError(t, err)
1574+
defer tx.Rollback()
1575+
1576+
for _, name := range files {
1577+
query, err := os.ReadFile(name)
1578+
require.NoError(t, err)
1579+
_, err = tx.ExecContext(ctx, string(query))
1580+
require.NoErrorf(t, err, "apply migration %s", name)
1581+
}
1582+
require.NoError(t, tx.Commit())
1583+
}
1584+
15051585
func TestMigration000498SoftDeleteStaleWorkspaceAgents(t *testing.T) {
15061586
t.Parallel()
15071587

0 commit comments

Comments
 (0)