diff --git a/coderd/x/chatd/chatd.go b/coderd/x/chatd/chatd.go index 0e18f600b60cf..524cd95d04222 100644 --- a/coderd/x/chatd/chatd.go +++ b/coderd/x/chatd/chatd.go @@ -206,7 +206,11 @@ type Server struct { chatWorker *chatWorker messagePartBuffer *messagepartbuffer.Buffer streamSyncPoller *streamSyncPoller - recordingSem chan struct{} + // templateRecommendations tracks list_templates recommendations to classify + // the template a later create_workspace builds. In-memory and best-effort; + // see chattool.RecommendationTracker. + templateRecommendations *chattool.RecommendationTracker + recordingSem chan struct{} aibridgeTransportFactory *atomic.Pointer[aibridge.TransportFactory] aiGatewayRoutingEnabled bool @@ -3447,6 +3451,7 @@ func New(ps pubsub.Pubsub, cfg Config) *Server { panic("chatd: create chat worker: " + err.Error()) } p.chatWorker = chatWorker + p.templateRecommendations = chattool.NewRecommendationTracker(clk, 0, 0) //nolint:gocritic // The chat processor uses a scoped chatd context. 
ctx = dbauthz.AsChatd(ctx) @@ -4202,6 +4207,9 @@ func (p *Server) appendRootChatTools( Logger: p.logger, Clock: p.clock, AllowedTemplateIDs: p.chatTemplateAllowlist, + ChatID: opts.chat.ID, + Metrics: p.metrics, + Recommendations: p.templateRecommendations, }), chattool.ReadTemplate(p.db, opts.chat.OrganizationID, chattool.ReadTemplateOptions{ OwnerID: opts.chat.OwnerID, @@ -4216,6 +4224,8 @@ func (p *Server) appendRootChatTools( OnChatUpdated: onChatUpdated, Logger: p.logger, AllowedTemplateIDs: p.chatTemplateAllowlist, + Metrics: p.metrics, + Recommendations: p.templateRecommendations, }), chattool.StartWorkspace(p.db, opts.chat.ID, chattool.StartWorkspaceOptions{ OwnerID: opts.chat.OwnerID, diff --git a/coderd/x/chatd/chatloop/metrics.go b/coderd/x/chatd/chatloop/metrics.go index 6f13663017b97..94b7fb42dbd02 100644 --- a/coderd/x/chatd/chatloop/metrics.go +++ b/coderd/x/chatd/chatloop/metrics.go @@ -38,6 +38,11 @@ type Metrics struct { StepsTotal *prometheus.CounterVec StreamRetriesTotal *prometheus.CounterVec StreamBufferDroppedTotal prometheus.Counter + + ListTemplatesOutcomeTotal *prometheus.CounterVec + ListTemplatesSignalsFailuresTotal prometheus.Counter + ListTemplatesAffinityGap *prometheus.HistogramVec + TemplateRecommendationFollowupTotal *prometheus.CounterVec } // NewMetrics creates a new Metrics instance registered with the @@ -109,6 +114,31 @@ func NewMetrics(reg prometheus.Registerer) *Metrics { Name: "stream_buffer_dropped_total", Help: "Number of chat stream buffer events dropped due to the per-chat buffer cap.", }), + ListTemplatesOutcomeTotal: factory.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "list_templates_outcome_total", + Help: "Total list_templates calls by recommendation outcome (recommended, ask_user, no_matches, no_templates).", + }, []string{"outcome"}), + ListTemplatesSignalsFailuresTotal: factory.NewCounter(prometheus.CounterOpts{ + Namespace: metricsNamespace, + 
Subsystem: metricsSubsystem, + Name: "list_templates_signals_failures_total", + Help: "Total list_templates calls where ranking signals failed to load, degrading the result toward asking the user.", + }), + ListTemplatesAffinityGap: factory.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "list_templates_affinity_gap", + Help: "Affinity score gap between the top two candidates when affinity is the deciding signal, labeled by whether a recommendation was made.", + Buckets: prometheus.ExponentialBuckets(0.1, 2, 9), // 0.1 .. 25.6 + }, []string{"recommended"}), + TemplateRecommendationFollowupTotal: factory.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsSubsystem, + Name: "template_recommendation_followup_total", + Help: "Total create_workspace calls by how the chosen template related to the prior list_templates recommendation (accepted_recommendation, overrode_with_listed_template, created_listed_without_recommendation, created_unlisted_template, no_recent_list_templates).", + }, []string{"outcome"}), } } @@ -165,6 +195,45 @@ func (m *Metrics) RecordToolError(provider, model, toolLabel string) { m.ToolErrorsTotal.WithLabelValues(provider, model, toolLabel).Inc() } +// RecordListTemplatesOutcome increments list_templates_outcome_total for the +// given recommendation outcome. No-op when m is nil. +func (m *Metrics) RecordListTemplatesOutcome(outcome string) { + if m == nil { + return + } + m.ListTemplatesOutcomeTotal.WithLabelValues(outcome).Inc() +} + +// RecordListTemplatesSignalsFailure increments +// list_templates_signals_failures_total. No-op when m is nil. +func (m *Metrics) RecordListTemplatesSignalsFailure() { + if m == nil { + return + } + m.ListTemplatesSignalsFailuresTotal.Inc() +} + +// RecordListTemplatesAffinityGap observes the affinity gap between the top two +// list_templates candidates, labeled by whether a recommendation was made. 
+// Callers must only record when affinity is the deciding signal so the gap is +// non-negative and meaningful. No-op when m is nil. +func (m *Metrics) RecordListTemplatesAffinityGap(recommended bool, gap float64) { + if m == nil { + return + } + m.ListTemplatesAffinityGap.WithLabelValues(strconv.FormatBool(recommended)).Observe(gap) +} + +// RecordTemplateRecommendationFollowup increments +// template_recommendation_followup_total for how a create_workspace call +// related to the prior list_templates recommendation. No-op when m is nil. +func (m *Metrics) RecordTemplateRecommendationFollowup(outcome string) { + if m == nil { + return + } + m.TemplateRecommendationFollowupTotal.WithLabelValues(outcome).Inc() +} + // RecordStreamBufferDropped increments stream_buffer_dropped_total // once per dropped event. No-op when m is nil. func (m *Metrics) RecordStreamBufferDropped() { diff --git a/coderd/x/chatd/chattool/createworkspace.go b/coderd/x/chatd/chattool/createworkspace.go index a20db20ac5d39..0ea3b2faa2d65 100644 --- a/coderd/x/chatd/chattool/createworkspace.go +++ b/coderd/x/chatd/chattool/createworkspace.go @@ -62,7 +62,16 @@ type AgentConnFunc func( agentID uuid.UUID, ) (workspacesdk.AgentConn, func(), error) -// CreateWorkspaceOptions configures the create_workspace tool. +// CreateWorkspaceMetrics records create_workspace telemetry. It is implemented +// by *chatloop.Metrics and declared here (rather than imported) because +// chatloop imports chattool, so chattool must not import chatloop. +type CreateWorkspaceMetrics interface { + RecordTemplateRecommendationFollowup(outcome string) +} + +// CreateWorkspaceOptions configures the create_workspace tool. Metrics and +// Recommendations are optional telemetry hooks that classify how the chosen +// template related to the prior list_templates recommendation for this chat. 
type CreateWorkspaceOptions struct { OwnerID uuid.UUID CreateFn CreateWorkspaceFn @@ -72,6 +81,8 @@ type CreateWorkspaceOptions struct { OnChatUpdated func(database.Chat) Logger slog.Logger AllowedTemplateIDs func() map[uuid.UUID]bool + Metrics CreateWorkspaceMetrics + Recommendations *RecommendationTracker } type createWorkspaceArgs struct { @@ -277,6 +288,10 @@ func CreateWorkspace(db database.Store, organizationID, chatID uuid.UUID, option options.OnChatUpdated(updatedChat) } + // Only genuine creations reach here; the idempotent + // existing-workspace path returns earlier. + recordRecommendationFollowup(ctx, options, chatID, workspace.ID, templateID) + // Wait for the build to complete and the agent to // come online so subsequent tools can use the // workspace immediately. @@ -362,6 +377,29 @@ type existingWorkspaceResult struct { Err error } +// recordRecommendationFollowup classifies how a freshly created workspace's +// template related to the prior list_templates recommendation for the chat and +// records it as a metric plus a structured log. Classification is best-effort: +// it degrades to "no_recent_list_templates" when no in-memory record exists +// (restart, replica handoff, expiry, or list_templates was never called). +func recordRecommendationFollowup( + ctx context.Context, + options CreateWorkspaceOptions, + chatID, workspaceID, templateID uuid.UUID, +) { + followup := options.Recommendations.Classify(chatID, templateID) + if options.Metrics != nil { + options.Metrics.RecordTemplateRecommendationFollowup(followup) + } + options.Logger.Info(ctx, "create_workspace recommendation follow-up", + slog.F("chat_id", chatID), + slog.F("owner_id", options.OwnerID), + slog.F("workspace_id", workspaceID), + slog.F("template_id", templateID), + slog.F("recommendation_followup", followup), + ) +} + // checkExistingWorkspace checks whether the given chat // already has a usable workspace. 
Returns an // existingWorkspaceResult with Done set when the caller should diff --git a/coderd/x/chatd/chattool/createworkspace_internal_test.go b/coderd/x/chatd/chattool/createworkspace_internal_test.go index 13f009d6686d8..10230b4e521db 100644 --- a/coderd/x/chatd/chattool/createworkspace_internal_test.go +++ b/coderd/x/chatd/chattool/createworkspace_internal_test.go @@ -2100,3 +2100,68 @@ func TestCreateWorkspace_WithPresetAndParams(t *testing.T) { require.Equal(t, "region", capturedReq.RichParameterValues[0].Name) require.Equal(t, "us-east", capturedReq.RichParameterValues[0].Value) } + +type fakeCreateWorkspaceMetrics struct { + followups []string +} + +func (m *fakeCreateWorkspaceMetrics) RecordTemplateRecommendationFollowup(outcome string) { + m.followups = append(m.followups, outcome) +} + +// TestCreateWorkspace_RecordsRecommendationFollowup verifies the create path +// classifies the chosen template against a prior list_templates recommendation +// shared through the tracker, and records the follow-up metric. +func TestCreateWorkspace_RecordsRecommendationFollowup(t *testing.T) { + t.Parallel() + + ctrl := gomock.NewController(t) + db := newCreateWorkspaceMockStore(ctrl) + + ownerID := uuid.New() + orgID := uuid.New() + chatID := uuid.New() + templateID := uuid.New() + workspaceID := uuid.New() + + db.EXPECT().GetChatByID(gomock.Any(), chatID).Return(database.Chat{ID: chatID}, nil) + db.EXPECT().UpdateChatWorkspaceBinding(gomock.Any(), gomock.Any()).Return(database.Chat{ID: chatID}, nil) + db.EXPECT().GetAuthorizationUserRoles(gomock.Any(), ownerID).Return(database.GetAuthorizationUserRolesRow{ + ID: ownerID, Roles: []string{}, Groups: []string{}, Status: database.UserStatusActive, + }, nil) + db.EXPECT().GetTemplateByID(gomock.Any(), templateID).Return(database.Template{ + ID: templateID, OrganizationID: orgID, + }, nil) + db.EXPECT().GetChatWorkspaceTTL(gomock.Any()).Return("0s", nil) + // Empty agent list short-circuits the agent-ready wait. 
+ db.EXPECT().GetWorkspaceAgentsInLatestBuildByWorkspaceID(gomock.Any(), workspaceID). + Return([]database.WorkspaceAgent{}, nil) + + // A nil-build workspace skips the build-completion wait. + createFn := func(_ context.Context, _ uuid.UUID, req codersdk.CreateWorkspaceRequest) (codersdk.Workspace, error) { + return codersdk.Workspace{ID: workspaceID, Name: req.Name, OwnerName: "testuser"}, nil + } + + // Seed the tracker so the chat already has a recommendation for templateID. + tracker := NewRecommendationTracker(nil, 0, 0) + tracker.Record(chatID, templateID, []uuid.UUID{templateID}, 1) + metrics := &fakeCreateWorkspaceMetrics{} + + tool := CreateWorkspace(db, orgID, chatID, CreateWorkspaceOptions{ + OwnerID: ownerID, + CreateFn: createFn, + WorkspaceMu: &sync.Mutex{}, + Logger: slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), + Metrics: metrics, + Recommendations: tracker, + }) + + input := fmt.Sprintf(`{"template_id":%q}`, templateID.String()) + resp, err := tool.Run(context.Background(), fantasy.ToolCall{ID: "call-1", Name: "create_workspace", Input: input}) + require.NoError(t, err) + require.False(t, resp.IsError) + + require.Equal(t, []string{recommendationFollowupAccepted}, metrics.followups) + // The recommendation was consumed: a repeat classification finds nothing. + require.Equal(t, recommendationFollowupNoRecord, tracker.Classify(chatID, templateID)) +} diff --git a/coderd/x/chatd/chattool/listtemplates.go b/coderd/x/chatd/chattool/listtemplates.go index 09828b0b6b1b3..69e96f1a11e7d 100644 --- a/coderd/x/chatd/chattool/listtemplates.go +++ b/coderd/x/chatd/chattool/listtemplates.go @@ -76,14 +76,65 @@ const ( queryScoreDescriptionMatch = 1 ) +// listTemplatesRankingVersion identifies the ranking policy (scoring formula +// and confidence thresholds) for telemetry. Bump it whenever the policy +// changes so recorded decisions can be segmented by version. 
+const listTemplatesRankingVersion = 1 + +// list_templates outcomes are the label values for +// list_templates_outcome_total and the "outcome" field of the decision log. +const ( + listTemplatesOutcomeRecommended = "recommended" + listTemplatesOutcomeAskUser = "ask_user" + listTemplatesOutcomeNoMatches = "no_matches" + listTemplatesOutcomeNoTemplates = "no_templates" + // listTemplatesOutcomeUnknown is a defensive bucket for a next-step value + // that listTemplatesOutcome does not map. It should never appear in + // practice; an increment signals a new NextStep* constant that was not + // wired into the outcome mapping. + listTemplatesOutcomeUnknown = "unknown" +) + +// Recommendation reasons explain which branch produced the outcome. They are +// recorded in the decision log (not as a metric label) so the "why" survives +// without reconstructing it from the raw scores. +const ( + recommendationReasonNoTemplates = "no_templates" + recommendationReasonNoMatches = "no_matches" + recommendationReasonOnlyAvailable = "only_available_template" + recommendationReasonDecisiveQuery = "decisive_query_match" + recommendationReasonSignalsUnavailable = "signals_unavailable" + recommendationReasonQueryTieConfident = "query_tie_broken_by_affinity" + recommendationReasonQueryTieAmbiguous = "ambiguous_query_tie" + recommendationReasonAffinityConfident = "affinity_confident" + recommendationReasonAffinityLow = "affinity_below_floor" + recommendationReasonAffinityAmbiguous = "ambiguous_affinity" +) + +// ListTemplatesMetrics records list_templates ranking telemetry. It is +// implemented by *chatloop.Metrics and declared here (rather than imported) +// because chatloop imports chattool, so chattool must not import chatloop. +type ListTemplatesMetrics interface { + RecordListTemplatesOutcome(outcome string) + RecordListTemplatesSignalsFailure() + RecordListTemplatesAffinityGap(recommended bool, gap float64) +} + // ListTemplatesOptions configures the list_templates tool. 
OwnerID is // required; Clock defaults to a real clock when nil. AllowedTemplateIDs -// optionally restricts which templates can be returned. +// optionally restricts which templates can be returned. ChatID, Metrics, and +// Recommendations are optional telemetry hooks: ChatID correlates a result +// with a later create_workspace call, Metrics records aggregate ranking +// outcomes, and Recommendations remembers the result for follow-up +// classification. type ListTemplatesOptions struct { OwnerID uuid.UUID Logger slog.Logger Clock quartz.Clock AllowedTemplateIDs func() map[uuid.UUID]bool + ChatID uuid.UUID + Metrics ListTemplatesMetrics + Recommendations *RecommendationTracker } type listTemplatesArgs struct { @@ -122,7 +173,7 @@ func ListTemplates(db database.Store, organizationID uuid.UUID, options ListTemp "by a query matching template name, display name, or description. "+ "Follow the "+NextStepField+" field in the result. Returns 10 per "+ "page; fetch next_page only when no listed template fits the request.", - func(ctx context.Context, args listTemplatesArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) { + func(ctx context.Context, args listTemplatesArgs, toolCall fantasy.ToolCall) (fantasy.ToolResponse, error) { ctx, err := asOwner(ctx, db, options.OwnerID) if err != nil { return fantasy.NewTextErrorResponse(xerrors.Errorf("authorize list_templates owner: %w", err).Error()), nil @@ -176,7 +227,7 @@ func ListTemplates(db database.Store, organizationID uuid.UUID, options ListTemp } rankTemplates(ranked, query) - recommendedID, nextStep := selectTemplateRecommendation( + recommendedID, nextStep, reason := selectTemplateRecommendation( ranked, visibleTemplateCount, signalsErr, @@ -191,10 +242,26 @@ func ListTemplates(db database.Store, organizationID uuid.UUID, options ListTemp end := min(start+listTemplatesPageSize, totalCount) items := make([]map[string]any, 0, end-start) + pageTemplateIDs := make([]uuid.UUID, 0, end-start) for _, t := range 
ranked[start:end] { items = append(items, templateItem(t)) + pageTemplateIDs = append(pageTemplateIDs, t.Template.ID) } + recordListTemplatesTelemetry(ctx, options, toolCall.ID, organizationID, listTemplatesTelemetry{ + query: query, + page: page, + visibleTemplateCount: visibleTemplateCount, + candidateCount: totalCount, + returnedCount: len(items), + ranked: ranked, + recommendedID: recommendedID, + nextStep: nextStep, + reason: reason, + signalsErr: signalsErr, + }) + options.Recommendations.Record(options.ChatID, recommendedID, pageTemplateIDs, page) + result := map[string]any{ "templates": items, "page": page, @@ -302,55 +369,165 @@ func rankTemplates(ranked []rankedTemplate, query string) { } // selectTemplateRecommendation returns the recommended template (uuid.Nil for -// none) and the next-step instruction. A decisive query match recommends on +// none), the next-step instruction, and a reason identifying which branch +// decided the outcome (for telemetry). A decisive query match recommends on // its own; otherwise the affinity score must clear a floor and lead the // runner-up by a margin. func selectTemplateRecommendation( ranked []rankedTemplate, visibleTemplateCount int, rankingSignalsErr error, -) (uuid.UUID, string) { +) (recommendedID uuid.UUID, nextStep string, reason string) { if len(ranked) == 0 { if visibleTemplateCount == 0 { - return uuid.Nil, NextStepNoTemplates + return uuid.Nil, NextStepNoTemplates, recommendationReasonNoTemplates } - return uuid.Nil, NextStepNoMatches + return uuid.Nil, NextStepNoMatches, recommendationReasonNoMatches } top := ranked[0] if visibleTemplateCount == 1 && len(ranked) == 1 { - return top.Template.ID, NextStepUseRecommended + return top.Template.ID, NextStepUseRecommended, recommendationReasonOnlyAvailable } // A decisive query match recommends even when signals failed to load. 
if top.QueryScore > 0 && (len(ranked) == 1 || top.QueryScore > ranked[1].QueryScore) { - return top.Template.ID, NextStepUseRecommended + return top.Template.ID, NextStepUseRecommended, recommendationReasonDecisiveQuery } // Beyond a decisive query match, confidence comes from the affinity // score, so a failed signal load means asking the user. if rankingSignalsErr != nil { - return uuid.Nil, NextStepAskUser + return uuid.Nil, NextStepAskUser, recommendationReasonSignalsUnavailable } - // Query tie: break it with a clear affinity gap. + // Query tie: both candidates matched the query at the same relevance tier, + // so the query itself is the baseline confidence signal and affinity only + // breaks the tie. A clear affinity gap is enough here; unlike the no-query + // branch below, the top score need not clear minConfidentAffinityScore on + // its own. if top.QueryScore > 0 { if len(ranked) > 1 && affinityScoreAtLeast(top.AffinityScore-ranked[1].AffinityScore, minConfidentGap) { - return top.Template.ID, NextStepUseRecommended + return top.Template.ID, NextStepUseRecommended, recommendationReasonQueryTieConfident } - return uuid.Nil, NextStepAskUser + return uuid.Nil, NextStepAskUser, recommendationReasonQueryTieAmbiguous } // No query: the affinity score alone decides. if !affinityScoreAtLeast(top.AffinityScore, minConfidentAffinityScore) { - return uuid.Nil, NextStepAskUser + return uuid.Nil, NextStepAskUser, recommendationReasonAffinityLow } if len(ranked) > 1 && affinityScoreAtLeast(ranked[1].AffinityScore, minConfidentAffinityScore) && !affinityScoreAtLeast(top.AffinityScore-ranked[1].AffinityScore, minConfidentGap) { - return uuid.Nil, NextStepAskUser + return uuid.Nil, NextStepAskUser, recommendationReasonAffinityAmbiguous + } + return top.Template.ID, NextStepUseRecommended, recommendationReasonAffinityConfident +} + +// listTemplatesOutcome maps a next-step instruction to its telemetry outcome. 
+func listTemplatesOutcome(nextStep string) string { + switch nextStep { + case NextStepNoTemplates: + return listTemplatesOutcomeNoTemplates + case NextStepNoMatches: + return listTemplatesOutcomeNoMatches + case NextStepUseRecommended: + return listTemplatesOutcomeRecommended + case NextStepAskUser: + return listTemplatesOutcomeAskUser + default: + return listTemplatesOutcomeUnknown + } +} + +// listTemplatesTelemetry carries the data recorded for one list_templates call: +// aggregate ranking metrics plus the inputs for a structured decision log. +type listTemplatesTelemetry struct { + query string + page int + visibleTemplateCount int + candidateCount int + returnedCount int + ranked []rankedTemplate + recommendedID uuid.UUID + nextStep string + reason string + signalsErr error +} + +// recordListTemplatesTelemetry records the aggregate ranking metrics and emits +// the structured decision log. The raw user query text is never logged: only +// its presence and length are, to avoid leaking task content. The affinity +// score is not included in the tool result shown to the model; the log records +// the inputs (scores, gap, thresholds) so a decision is reconstructable. +func recordListTemplatesTelemetry( + ctx context.Context, + options ListTemplatesOptions, + toolCallID string, + organizationID uuid.UUID, + t listTemplatesTelemetry, +) { + outcome := listTemplatesOutcome(t.nextStep) + recommended := t.recommendedID != uuid.Nil + + if options.Metrics != nil { + options.Metrics.RecordListTemplatesOutcome(outcome) + if t.signalsErr != nil { + options.Metrics.RecordListTemplatesSignalsFailure() + } + // The affinity gap is only meaningful when affinity is the deciding + // signal: no query, or the top two share the same query tier. In those + // cases the sort guarantees a non-negative gap. 
A failed signal load + // leaves every affinity score at its zero default and forces an + // ask_user outcome, so recording the gap then would only add + // meaningless zero samples; the signals-failure counter covers it. + if t.signalsErr == nil && len(t.ranked) > 1 { + top, runner := t.ranked[0], t.ranked[1] + if t.query == "" || top.QueryScore == runner.QueryScore { + options.Metrics.RecordListTemplatesAffinityGap(recommended, top.AffinityScore-runner.AffinityScore) + } + } + } + + fields := []slog.Field{ + slog.F("tool_call_id", toolCallID), + slog.F("chat_id", options.ChatID), + slog.F("owner_id", options.OwnerID), + slog.F("organization_id", organizationID), + slog.F("query_present", t.query != ""), + slog.F("query_len", len(t.query)), + slog.F("page", t.page), + slog.F("visible_template_count", t.visibleTemplateCount), + slog.F("candidate_count", t.candidateCount), + slog.F("returned_count", t.returnedCount), + slog.F("outcome", outcome), + slog.F("recommendation_reason", t.reason), + slog.F("signals_load_failed", t.signalsErr != nil), + slog.F("ranking_version", listTemplatesRankingVersion), + slog.F("min_confident_affinity_score", minConfidentAffinityScore), + slog.F("min_confident_gap", minConfidentGap), + } + if recommended { + fields = append(fields, slog.F("recommended_template_id", t.recommendedID)) + } + if len(t.ranked) > 0 { + top := t.ranked[0] + fields = append(fields, + slog.F("top_template_id", top.Template.ID), + slog.F("top_query_score", top.QueryScore), + slog.F("top_affinity_score", top.AffinityScore), + ) + } + if len(t.ranked) > 1 { + runner := t.ranked[1] + fields = append(fields, + slog.F("runner_up_query_score", runner.QueryScore), + slog.F("runner_up_affinity_score", runner.AffinityScore), + slog.F("affinity_gap", t.ranked[0].AffinityScore-runner.AffinityScore), + ) } - return top.Template.ID, NextStepUseRecommended + options.Logger.Info(ctx, "list_templates decision", fields...) 
} func templateItem(t rankedTemplate) map[string]any { diff --git a/coderd/x/chatd/chattool/listtemplates_internal_test.go b/coderd/x/chatd/chattool/listtemplates_internal_test.go index af7674e1f9194..53cfae59fc646 100644 --- a/coderd/x/chatd/chattool/listtemplates_internal_test.go +++ b/coderd/x/chatd/chattool/listtemplates_internal_test.go @@ -56,14 +56,14 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoTemplatesAvailable", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation(nil, 0, nil) + id, next, _ := selectTemplateRecommendation(nil, 0, nil) require.Equal(t, uuid.Nil, id) require.Equal(t, NextStepNoTemplates, next) }) t.Run("QueryFiltersEverything", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation(nil, 2, nil) + id, next, _ := selectTemplateRecommendation(nil, 2, nil) require.Equal(t, uuid.Nil, id) require.Equal(t, NextStepNoMatches, next) }) @@ -71,7 +71,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("OnlyAvailable", func(t *testing.T) { t.Parallel() only := uuid.New() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{{Template: database.Template{ID: only}}}, 1, loadErr, ) require.Equal(t, only, id) @@ -82,7 +82,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Parallel() top := uuid.New() for _, err := range []error{nil, loadErr} { - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: top}, QueryScore: queryScoreExactName}, {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreDescriptionMatch}, @@ -96,7 +96,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("QueryTieBrokenByAffinityGap", func(t *testing.T) { t.Parallel() top := uuid.New() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: top}, 
QueryScore: queryScoreNamePrefix, AffinityScore: 10, Signals: templateRankingSignals{ActiveCount: 1}}, {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 0}, @@ -108,7 +108,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("QueryTieWithSmallGapIsAmbiguous", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 0.1}, {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 0}, @@ -120,7 +120,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("QueryTieWithLoadErrorAsksUser", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix}, {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix}, @@ -132,7 +132,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoQueryNoSignal", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: uuid.New()}}, {Template: database.Template{ID: uuid.New()}}, @@ -145,7 +145,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoQueryWeakSignalBelowFloor", func(t *testing.T) { t.Parallel() // One active developer scores ln(2), below the ln(3) floor. 
- id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: uuid.New()}, AffinityScore: math.Log1p(1), Signals: templateRankingSignals{OrgDevs: 1}}, {Template: database.Template{ID: uuid.New()}, AffinityScore: 0}, @@ -158,7 +158,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoQueryConfidentWhenLeadsRunnerUp", func(t *testing.T) { t.Parallel() top := uuid.New() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: top}, AffinityScore: math.Log1p(3), Signals: templateRankingSignals{OrgDevs: 3}}, {Template: database.Template{ID: uuid.New()}, AffinityScore: math.Log1p(1), Signals: templateRankingSignals{OrgDevs: 1}}, @@ -170,7 +170,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoQueryAmbiguousWhenBothClearFloorAndClose", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: uuid.New()}, AffinityScore: 1.20, Signals: templateRankingSignals{OrgDevs: 2}}, {Template: database.Template{ID: uuid.New()}, AffinityScore: 1.15, Signals: templateRankingSignals{OrgDevs: 2}}, @@ -183,7 +183,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoQueryConfidentWhenBothClearFloorWithLargeGap", func(t *testing.T) { t.Parallel() top := uuid.New() - id, next := selectTemplateRecommendation( + id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: top}, AffinityScore: 2.0, Signals: templateRankingSignals{OrgDevs: 6}}, {Template: database.Template{ID: uuid.New()}, AffinityScore: 1.2, Signals: templateRankingSignals{OrgDevs: 2}}, @@ -195,7 +195,7 @@ func TestSelectTemplateRecommendation(t *testing.T) { t.Run("NoQueryLoadErrorAsksUser", func(t *testing.T) { t.Parallel() - id, next := selectTemplateRecommendation( 
+ id, next, _ := selectTemplateRecommendation( []rankedTemplate{ {Template: database.Template{ID: uuid.New()}, AffinityScore: math.Log1p(3), Signals: templateRankingSignals{OrgDevs: 3}}, {Template: database.Template{ID: uuid.New()}}, diff --git a/coderd/x/chatd/chattool/listtemplates_telemetry_internal_test.go b/coderd/x/chatd/chattool/listtemplates_telemetry_internal_test.go new file mode 100644 index 0000000000000..89297b92b51e6 --- /dev/null +++ b/coderd/x/chatd/chattool/listtemplates_telemetry_internal_test.go @@ -0,0 +1,215 @@ +package chattool + +import ( + "context" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/database" +) + +type gapObservation struct { + recommended bool + gap float64 +} + +type fakeListTemplatesMetrics struct { + outcomes []string + signalsFailures int + gaps []gapObservation +} + +func (m *fakeListTemplatesMetrics) RecordListTemplatesOutcome(outcome string) { + m.outcomes = append(m.outcomes, outcome) +} + +func (m *fakeListTemplatesMetrics) RecordListTemplatesSignalsFailure() { + m.signalsFailures++ +} + +func (m *fakeListTemplatesMetrics) RecordListTemplatesAffinityGap(recommended bool, gap float64) { + m.gaps = append(m.gaps, gapObservation{recommended: recommended, gap: gap}) +} + +func rankedWith(queryScore int, affinity float64) rankedTemplate { + return rankedTemplate{ + Template: database.Template{ID: uuid.New()}, + QueryScore: queryScore, + AffinityScore: affinity, + } +} + +func TestRecordListTemplatesTelemetry_Metrics(t *testing.T) { + t.Parallel() + + t.Run("OutcomeAndSignalsFailure", func(t *testing.T) { + t.Parallel() + m := &fakeListTemplatesMetrics{} + recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{Metrics: m}, "tc", uuid.New(), listTemplatesTelemetry{ + ranked: []rankedTemplate{rankedWith(0, 0)}, + nextStep: NextStepAskUser, + reason: recommendationReasonAffinityLow, + signalsErr: 
xerrors.New("boom"), + }) + require.Equal(t, []string{listTemplatesOutcomeAskUser}, m.outcomes) + require.Equal(t, 1, m.signalsFailures) + }) + + t.Run("GapRecordedWhenNoQuery", func(t *testing.T) { + t.Parallel() + m := &fakeListTemplatesMetrics{} + recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{Metrics: m}, "tc", uuid.New(), listTemplatesTelemetry{ + ranked: []rankedTemplate{rankedWith(0, 5), rankedWith(0, 2)}, + recommendedID: uuid.New(), + nextStep: NextStepUseRecommended, + }) + require.Len(t, m.gaps, 1) + require.True(t, m.gaps[0].recommended) + require.InDelta(t, 3.0, m.gaps[0].gap, 1e-9) + require.Zero(t, m.signalsFailures) + }) + + t.Run("GapRecordedWhenEqualQueryTier", func(t *testing.T) { + t.Parallel() + m := &fakeListTemplatesMetrics{} + recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{Metrics: m}, "tc", uuid.New(), listTemplatesTelemetry{ + query: "py", + ranked: []rankedTemplate{rankedWith(queryScoreNamePrefix, 5), rankedWith(queryScoreNamePrefix, 1)}, + nextStep: NextStepAskUser, + }) + require.Len(t, m.gaps, 1) + require.False(t, m.gaps[0].recommended) + require.InDelta(t, 4.0, m.gaps[0].gap, 1e-9) + }) + + t.Run("GapSkippedWhenQueryTierDecides", func(t *testing.T) { + t.Parallel() + m := &fakeListTemplatesMetrics{} + // Different query tiers: the order is decided by relevance, not + // affinity, so the affinity gap would be misleading. 
+ recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{Metrics: m}, "tc", uuid.New(), listTemplatesTelemetry{ + query: "py", + ranked: []rankedTemplate{rankedWith(queryScoreExactName, 0), rankedWith(queryScoreDescriptionMatch, 9)}, + recommendedID: uuid.New(), + nextStep: NextStepUseRecommended, + }) + require.Empty(t, m.gaps) + }) + + t.Run("GapSkippedWhenSingleCandidate", func(t *testing.T) { + t.Parallel() + m := &fakeListTemplatesMetrics{} + recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{Metrics: m}, "tc", uuid.New(), listTemplatesTelemetry{ + ranked: []rankedTemplate{rankedWith(0, 5)}, + recommendedID: uuid.New(), + nextStep: NextStepUseRecommended, + }) + require.Empty(t, m.gaps) + }) + + t.Run("GapSkippedWhenSignalsFailed", func(t *testing.T) { + t.Parallel() + m := &fakeListTemplatesMetrics{} + // Signals failed to load, so affinity scores are meaningless zero + // defaults; only the signals-failure counter should move. + recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{Metrics: m}, "tc", uuid.New(), listTemplatesTelemetry{ + ranked: []rankedTemplate{rankedWith(0, 0), rankedWith(0, 0)}, + nextStep: NextStepAskUser, + reason: recommendationReasonSignalsUnavailable, + signalsErr: xerrors.New("boom"), + }) + require.Empty(t, m.gaps) + require.Equal(t, 1, m.signalsFailures) + }) + + t.Run("NilMetricsDoesNotPanic", func(t *testing.T) { + t.Parallel() + recordListTemplatesTelemetry(context.Background(), ListTemplatesOptions{}, "tc", uuid.New(), listTemplatesTelemetry{ + ranked: []rankedTemplate{rankedWith(0, 0)}, + nextStep: NextStepAskUser, + }) + }) +} + +func TestListTemplatesOutcome(t *testing.T) { + t.Parallel() + require.Equal(t, listTemplatesOutcomeNoTemplates, listTemplatesOutcome(NextStepNoTemplates)) + require.Equal(t, listTemplatesOutcomeNoMatches, listTemplatesOutcome(NextStepNoMatches)) + require.Equal(t, listTemplatesOutcomeRecommended, listTemplatesOutcome(NextStepUseRecommended)) + 
require.Equal(t, listTemplatesOutcomeAskUser, listTemplatesOutcome(NextStepAskUser)) + // An unmapped next-step value falls into the defensive unknown bucket + // instead of silently reporting ask_user. + require.Equal(t, listTemplatesOutcomeUnknown, listTemplatesOutcome("some-future-next-step")) +} + +func TestSelectTemplateRecommendation_Reasons(t *testing.T) { + t.Parallel() + + loadErr := xerrors.New("signals failed to load") + + t.Run("Outcomes", func(t *testing.T) { + t.Parallel() + _, _, reason := selectTemplateRecommendation(nil, 0, nil) + require.Equal(t, recommendationReasonNoTemplates, reason) + + _, _, reason = selectTemplateRecommendation(nil, 2, nil) + require.Equal(t, recommendationReasonNoMatches, reason) + + _, _, reason = selectTemplateRecommendation( + []rankedTemplate{{Template: database.Template{ID: uuid.New()}}}, 1, loadErr, + ) + require.Equal(t, recommendationReasonOnlyAvailable, reason) + }) + + t.Run("Query", func(t *testing.T) { + t.Parallel() + _, _, reason := selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreExactName}, + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreDescriptionMatch}, + }, 2, nil) + require.Equal(t, recommendationReasonDecisiveQuery, reason) + + // A failed signal load past a decisive query falls back to asking. 
+ _, _, reason = selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix}, + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix}, + }, 2, loadErr) + require.Equal(t, recommendationReasonSignalsUnavailable, reason) + + _, _, reason = selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 10, Signals: templateRankingSignals{ActiveCount: 1}}, + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 0}, + }, 2, nil) + require.Equal(t, recommendationReasonQueryTieConfident, reason) + + _, _, reason = selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 0.1}, + {Template: database.Template{ID: uuid.New()}, QueryScore: queryScoreNamePrefix, AffinityScore: 0}, + }, 2, nil) + require.Equal(t, recommendationReasonQueryTieAmbiguous, reason) + }) + + t.Run("Affinity", func(t *testing.T) { + t.Parallel() + _, _, reason := selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, AffinityScore: 0.1}, + {Template: database.Template{ID: uuid.New()}, AffinityScore: 0}, + }, 2, nil) + require.Equal(t, recommendationReasonAffinityLow, reason) + + _, _, reason = selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, AffinityScore: 1.20, Signals: templateRankingSignals{OrgDevs: 2}}, + {Template: database.Template{ID: uuid.New()}, AffinityScore: 1.15, Signals: templateRankingSignals{OrgDevs: 2}}, + }, 2, nil) + require.Equal(t, recommendationReasonAffinityAmbiguous, reason) + + _, _, reason = selectTemplateRecommendation([]rankedTemplate{ + {Template: database.Template{ID: uuid.New()}, AffinityScore: 2.0, Signals: templateRankingSignals{OrgDevs: 6}}, + {Template: database.Template{ID: uuid.New()}, 
AffinityScore: 1.2, Signals: templateRankingSignals{OrgDevs: 2}}, + }, 2, nil) + require.Equal(t, recommendationReasonAffinityConfident, reason) + }) +} diff --git a/coderd/x/chatd/chattool/listtemplates_test.go b/coderd/x/chatd/chattool/listtemplates_test.go index 060589ba90363..d7f4bcc53e028 100644 --- a/coderd/x/chatd/chattool/listtemplates_test.go +++ b/coderd/x/chatd/chattool/listtemplates_test.go @@ -1006,3 +1006,58 @@ func listTemplateItems(t *testing.T, result map[string]any) []map[string]any { } return templates } + +type fakeListTemplatesMetrics struct { + outcomes []string + signalsFailures int + gaps int +} + +func (m *fakeListTemplatesMetrics) RecordListTemplatesOutcome(outcome string) { + m.outcomes = append(m.outcomes, outcome) +} +func (m *fakeListTemplatesMetrics) RecordListTemplatesSignalsFailure() { m.signalsFailures++ } +func (m *fakeListTemplatesMetrics) RecordListTemplatesAffinityGap(bool, float64) { + m.gaps++ +} + +// TestListTemplates_RecordsTelemetry exercises the handler's telemetry wiring +// end to end: the outcome metric fires and the per-chat recommendation is +// recorded so a later create_workspace can classify it. 
+func TestListTemplates_RecordsTelemetry(t *testing.T) { + t.Parallel() + + ctx := testutil.Context(t, testutil.WaitShort) + db, _ := dbtestutil.NewDB(t) + user := dbgen.User(t, db, database.User{}) + org := dbgen.Organization(t, db, database.Organization{}) + _ = dbgen.OrganizationMember(t, db, database.OrganizationMember{UserID: user.ID, OrganizationID: org.ID}) + tmpl := dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + Name: "only-template", + }) + + chatID := uuid.New() + metrics := &fakeListTemplatesMetrics{} + tracker := chattool.NewRecommendationTracker(nil, 0, 0) + + tool := chattool.ListTemplates(db, org.ID, chattool.ListTemplatesOptions{ + OwnerID: user.ID, + ChatID: chatID, + Metrics: metrics, + Recommendations: tracker, + }) + + resp, err := tool.Run(ctx, fantasy.ToolCall{ID: "lt-1", Name: "list_templates", Input: "{}"}) + require.NoError(t, err) + require.False(t, resp.IsError) + + // The lone template is recommended, so the outcome metric records it. + require.Equal(t, []string{"recommended"}, metrics.outcomes) + require.Zero(t, metrics.signalsFailures) + + // The recommendation was recorded for the chat: classifying the chosen + // template as the recommended one yields an acceptance. + require.Equal(t, "accepted_recommendation", tracker.Classify(chatID, tmpl.ID)) +} diff --git a/coderd/x/chatd/chattool/recommendationtracker.go b/coderd/x/chatd/chattool/recommendationtracker.go new file mode 100644 index 0000000000000..244fa3d2912ca --- /dev/null +++ b/coderd/x/chatd/chattool/recommendationtracker.go @@ -0,0 +1,190 @@ +package chattool + +import ( + "sync" + "time" + + "github.com/google/uuid" + + "github.com/coder/quartz" +) + +// Recommendation follow-up outcomes describe how a create_workspace call +// related to the most recent list_templates recommendation for the same chat. +// They are the label values for template_recommendation_followup_total. 
+const (
+	// recommendationFollowupAccepted: the chosen template is the one
+	// list_templates recommended.
+	recommendationFollowupAccepted = "accepted_recommendation"
+	// recommendationFollowupOverrodeListed: a recommendation existed but the
+	// agent built a different template that was still on the shown page.
+	recommendationFollowupOverrodeListed = "overrode_with_listed_template"
+	// recommendationFollowupListedNoRecommendation: no recommendation was made,
+	// and the agent built a template from the shown page.
+	recommendationFollowupListedNoRecommendation = "created_listed_without_recommendation"
+	// recommendationFollowupUnlisted: the agent built a template that was not
+	// on the shown page (e.g. user named it, or an older list result).
+	recommendationFollowupUnlisted = "created_unlisted_template"
+	// recommendationFollowupNoRecord: no fresh list_templates result is known
+	// for the chat (restart, replica handoff, expiry, or none was called).
+	recommendationFollowupNoRecord = "no_recent_list_templates"
+)
+
+const (
+	// defaultRecommendationTTL bounds how long a recorded recommendation stays
+	// eligible for follow-up classification.
+	defaultRecommendationTTL = 30 * time.Minute
+	// defaultRecommendationMaxEntries bounds the tracker's memory footprint.
+	defaultRecommendationMaxEntries = 4096
+)
+
+// RecommendationTracker correlates the most recent list_templates result per
+// chat with the template that create_workspace later builds, so we can measure
+// whether the agent followed the recommendation.
+//
+// State is in-memory and best-effort: it is lost on restart and is not shared
+// across replicas, so a follow-up handled elsewhere classifies as
+// "no_recent_list_templates". The durable source of truth for offline analysis
+// is the persisted chat transcript (the list_templates result and the
+// create_workspace call) plus the chats.workspace_id binding; this tracker
+// exists only to surface a live, aggregate acceptance signal.
+type RecommendationTracker struct {
+	clock      quartz.Clock
+	ttl        time.Duration
+	maxEntries int
+
+	// mu guards entries.
+	mu      sync.Mutex
+	entries map[uuid.UUID]recommendationEntry
+}
+
+// recommendationEntry is the record kept for one chat: the recommended
+// template (uuid.Nil when none), the set of template IDs shown so far, and the
+// time of the most recent Record call that touched it.
+type recommendationEntry struct {
+	recommendedID uuid.UUID
+	listed        map[uuid.UUID]struct{}
+	recordedAt    time.Time
+}
+
+// NewRecommendationTracker constructs a tracker. A nil clock defaults to a real
+// clock; non-positive ttl or maxEntries fall back to defaults.
+func NewRecommendationTracker(clock quartz.Clock, ttl time.Duration, maxEntries int) *RecommendationTracker {
+	if clock == nil {
+		clock = quartz.NewReal()
+	}
+	if ttl <= 0 {
+		ttl = defaultRecommendationTTL
+	}
+	if maxEntries <= 0 {
+		maxEntries = defaultRecommendationMaxEntries
+	}
+	return &RecommendationTracker{
+		clock:      clock,
+		ttl:        ttl,
+		maxEntries: maxEntries,
+		entries:    make(map[uuid.UUID]recommendationEntry),
+	}
+}
+
+// Record stores the latest list_templates outcome for a chat. recommendedID may
+// be uuid.Nil when no template was recommended. listedIDs are the template IDs
+// shown on the returned page (what the agent actually saw). page is the 1-based
+// page number: page 1 starts a fresh record, while later pages of the same
+// unexpired result accumulate their listed IDs so a follow-up build from any
+// shown page still classifies as "listed" rather than "unlisted". No-op when t
+// is nil or chatID is uuid.Nil.
+func (t *RecommendationTracker) Record(chatID, recommendedID uuid.UUID, listedIDs []uuid.UUID, page int) {
+	if t == nil || chatID == uuid.Nil {
+		return
+	}
+	now := t.clock.Now()
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	existing, exists := t.entries[chatID]
+
+	// Later pages of the same result continue an existing record, so union
+	// their listed IDs instead of overwriting. Page 1, a missing or expired
+	// entry, or a changed recommendation starts fresh. The expiry check keeps
+	// a late page from reviving a record that Classify would already report
+	// as gone.
+	if page > 1 && exists &&
+		existing.recommendedID == recommendedID &&
+		now.Sub(existing.recordedAt) <= t.ttl {
+		for _, id := range listedIDs {
+			if id != uuid.Nil {
+				existing.listed[id] = struct{}{}
+			}
+		}
+		existing.recordedAt = now
+		t.entries[chatID] = existing
+		return
+	}
+
+	listed := make(map[uuid.UUID]struct{}, len(listedIDs))
+	for _, id := range listedIDs {
+		if id != uuid.Nil {
+			listed[id] = struct{}{}
+		}
+	}
+	// Replacing this chat's own entry does not grow the map, so only a chat
+	// without an entry needs room made for it. Evicting on overwrite would
+	// drop another chat's record for nothing. Expired entries left behind
+	// here are swept on the next insert and are ignored by Classify anyway.
+	if !exists {
+		t.evictLocked(now)
+	}
+	t.entries[chatID] = recommendationEntry{
+		recommendedID: recommendedID,
+		listed:        listed,
+		recordedAt:    now,
+	}
+}
+
+// Classify consumes the recorded recommendation for a chat and reports how the
+// chosen template relates to it. The entry is removed so a single creation is
+// counted once. Returns recommendationFollowupNoRecord when t is nil, chatID is
+// uuid.Nil, or no fresh record exists.
+func (t *RecommendationTracker) Classify(chatID, chosenID uuid.UUID) string {
+	if t == nil || chatID == uuid.Nil {
+		return recommendationFollowupNoRecord
+	}
+	now := t.clock.Now()
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	entry, ok := t.entries[chatID]
+	if !ok {
+		return recommendationFollowupNoRecord
+	}
+	// Remove before the TTL check so an expired entry is dropped as well.
+	delete(t.entries, chatID)
+	if now.Sub(entry.recordedAt) > t.ttl {
+		return recommendationFollowupNoRecord
+	}
+
+	_, listed := entry.listed[chosenID]
+	switch {
+	case entry.recommendedID != uuid.Nil && chosenID == entry.recommendedID:
+		return recommendationFollowupAccepted
+	case listed && entry.recommendedID != uuid.Nil:
+		return recommendationFollowupOverrodeListed
+	case listed:
+		return recommendationFollowupListedNoRecommendation
+	default:
+		return recommendationFollowupUnlisted
+	}
+}
+
+// evictLocked drops expired entries and, if still at capacity, the oldest
+// remaining entry to make room for one more. Callers must hold t.mu.
+func (t *RecommendationTracker) evictLocked(now time.Time) { + for id, e := range t.entries { + if now.Sub(e.recordedAt) > t.ttl { + delete(t.entries, id) + } + } + if len(t.entries) < t.maxEntries { + return + } + var ( + oldestID uuid.UUID + oldestAt time.Time + found bool + ) + for id, e := range t.entries { + if !found || e.recordedAt.Before(oldestAt) { + oldestID, oldestAt, found = id, e.recordedAt, true + } + } + if found { + delete(t.entries, oldestID) + } +} diff --git a/coderd/x/chatd/chattool/recommendationtracker_internal_test.go b/coderd/x/chatd/chattool/recommendationtracker_internal_test.go new file mode 100644 index 0000000000000..353a0b9c06dde --- /dev/null +++ b/coderd/x/chatd/chattool/recommendationtracker_internal_test.go @@ -0,0 +1,126 @@ +package chattool + +import ( + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + + "github.com/coder/quartz" +) + +func TestRecommendationTracker_Classify(t *testing.T) { + t.Parallel() + + t.Run("AcceptedRecommendation", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + chat, rec, other := uuid.New(), uuid.New(), uuid.New() + tr.Record(chat, rec, []uuid.UUID{rec, other}, 1) + require.Equal(t, recommendationFollowupAccepted, tr.Classify(chat, rec)) + }) + + t.Run("OverrodeWithListedTemplate", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + chat, rec, other := uuid.New(), uuid.New(), uuid.New() + tr.Record(chat, rec, []uuid.UUID{rec, other}, 1) + require.Equal(t, recommendationFollowupOverrodeListed, tr.Classify(chat, other)) + }) + + t.Run("AccumulatesListedAcrossPages", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + chat, rec := uuid.New(), uuid.New() + page1, page2 := uuid.New(), uuid.New() + // Page 1 seeds the record; a later page of the same result accumulates + // its listed IDs instead of replacing page 1's, so a build of 
a + // page-1 template is still "listed" rather than "unlisted". + tr.Record(chat, rec, []uuid.UUID{rec, page1}, 1) + tr.Record(chat, rec, []uuid.UUID{page2}, 2) + require.Equal(t, recommendationFollowupOverrodeListed, tr.Classify(chat, page1)) + }) + + t.Run("ListedWithoutRecommendation", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + chat, listed := uuid.New(), uuid.New() + // uuid.Nil recommendation: list_templates returned templates but + // recommended none. + tr.Record(chat, uuid.Nil, []uuid.UUID{listed}, 1) + require.Equal(t, recommendationFollowupListedNoRecommendation, tr.Classify(chat, listed)) + }) + + t.Run("UnlistedTemplate", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + chat, rec, unlisted := uuid.New(), uuid.New(), uuid.New() + tr.Record(chat, rec, []uuid.UUID{rec}, 1) + require.Equal(t, recommendationFollowupUnlisted, tr.Classify(chat, unlisted)) + }) + + t.Run("NoRecord", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + require.Equal(t, recommendationFollowupNoRecord, tr.Classify(uuid.New(), uuid.New())) + }) + + t.Run("ConsumedOnce", func(t *testing.T) { + t.Parallel() + tr := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + chat, rec := uuid.New(), uuid.New() + tr.Record(chat, rec, []uuid.UUID{rec}, 1) + require.Equal(t, recommendationFollowupAccepted, tr.Classify(chat, rec)) + // A second classification finds nothing: the entry was consumed. 
+ require.Equal(t, recommendationFollowupNoRecord, tr.Classify(chat, rec)) + }) + + t.Run("ExpiredByTTL", func(t *testing.T) { + t.Parallel() + clock := quartz.NewMock(t) + tr := NewRecommendationTracker(clock, time.Minute, 0) + chat, rec := uuid.New(), uuid.New() + tr.Record(chat, rec, []uuid.UUID{rec}, 1) + clock.Advance(time.Minute + time.Second) + require.Equal(t, recommendationFollowupNoRecord, tr.Classify(chat, rec)) + }) + + t.Run("NilTrackerAndNilChat", func(t *testing.T) { + t.Parallel() + var tr *RecommendationTracker + require.Equal(t, recommendationFollowupNoRecord, tr.Classify(uuid.New(), uuid.New())) + // Record on a nil tracker or with a nil chat ID must not panic. + tr.Record(uuid.New(), uuid.New(), nil, 1) + live := NewRecommendationTracker(quartz.NewMock(t), 0, 0) + live.Record(uuid.Nil, uuid.New(), nil, 1) + require.Equal(t, recommendationFollowupNoRecord, live.Classify(uuid.Nil, uuid.New())) + }) +} + +func TestRecommendationTracker_EvictsOldestAtCapacity(t *testing.T) { + t.Parallel() + + clock := quartz.NewMock(t) + const maxEntries = 3 + tr := NewRecommendationTracker(clock, time.Hour, maxEntries) + + // Record the oldest entry, then advance so later entries are strictly + // newer, filling capacity beyond maxEntries. + oldest := uuid.New() + oldestRec := uuid.New() + tr.Record(oldest, oldestRec, []uuid.UUID{oldestRec}, 1) + + var survivor, survivorRec uuid.UUID + for range maxEntries { + clock.Advance(time.Second) + chat, rec := uuid.New(), uuid.New() + tr.Record(chat, rec, []uuid.UUID{rec}, 1) + survivor, survivorRec = chat, rec + } + + // The oldest entry was evicted to keep the map bounded, while a newer + // entry recorded within TTL remains classifiable. 
+ require.Equal(t, recommendationFollowupNoRecord, tr.Classify(oldest, oldestRec)) + require.Equal(t, recommendationFollowupAccepted, tr.Classify(survivor, survivorRec)) +} diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index 8df2126633d1f..84efe8508b63d 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -212,11 +212,15 @@ deployment. They will always be available from the agent. | `coderd_chat_auto_archive_records_archived_total` | counter | Total number of chats archived by the auto-archive job (counting both roots and cascaded children). | | | `coderd_chatd_chats` | gauge | Number of chats being processed, by state. | `state` | | `coderd_chatd_compaction_total` | counter | Total compaction outcomes (only recorded when compaction was triggered or failed). | `model` `provider` `result` | +| `coderd_chatd_list_templates_affinity_gap` | histogram | Affinity score gap between the top two candidates when affinity is the deciding signal, labeled by whether a recommendation was made. | `recommended` | +| `coderd_chatd_list_templates_outcome_total` | counter | Total list_templates calls by recommendation outcome (recommended, ask_user, no_matches, no_templates). | `outcome` | +| `coderd_chatd_list_templates_signals_failures_total` | counter | Total list_templates calls where ranking signals failed to load, degrading the result toward asking the user. | | | `coderd_chatd_message_count` | histogram | Number of messages in the prompt per LLM request. | `model` `provider` | | `coderd_chatd_prompt_size_bytes` | histogram | Estimated byte size of the prompt per LLM request. | `model` `provider` | | `coderd_chatd_steps_total` | counter | Total agentic loop steps across all chats. | `model` `provider` | | `coderd_chatd_stream_buffer_dropped_total` | counter | Number of chat stream buffer events dropped due to the per-chat buffer cap. 
| | | `coderd_chatd_stream_retries_total` | counter | Total LLM stream retries. | `chain_broken` `kind` `model` `provider` | +| `coderd_chatd_template_recommendation_followup_total` | counter | Total create_workspace calls by how the chosen template related to the prior list_templates recommendation (accepted_recommendation, overrode_with_listed_template, created_listed_without_recommendation, created_unlisted_template, no_recent_list_templates). | `outcome` | | `coderd_chatd_tool_errors_total` | counter | Total tool calls that returned an error result. | `model` `provider` `tool_name` | | `coderd_chatd_tool_result_size_bytes` | histogram | Size in bytes of each tool execution result. | `model` `provider` `tool_name` | | `coderd_chatd_ttft_seconds` | histogram | Time-to-first-token: wall time from LLM request to first streamed chunk. | `model` `provider` | diff --git a/scripts/metricsdocgen/generated_metrics b/scripts/metricsdocgen/generated_metrics index 76d25ef341ade..9029d13096303 100644 --- a/scripts/metricsdocgen/generated_metrics +++ b/scripts/metricsdocgen/generated_metrics @@ -238,6 +238,15 @@ coderd_chatd_chats{state=""} 0 # HELP coderd_chatd_compaction_total Total compaction outcomes (only recorded when compaction was triggered or failed). # TYPE coderd_chatd_compaction_total counter coderd_chatd_compaction_total{provider="",model="",result=""} 0 +# HELP coderd_chatd_list_templates_affinity_gap Affinity score gap between the top two candidates when affinity is the deciding signal, labeled by whether a recommendation was made. +# TYPE coderd_chatd_list_templates_affinity_gap histogram +coderd_chatd_list_templates_affinity_gap{recommended=""} 0 +# HELP coderd_chatd_list_templates_outcome_total Total list_templates calls by recommendation outcome (recommended, ask_user, no_matches, no_templates). 
+# TYPE coderd_chatd_list_templates_outcome_total counter +coderd_chatd_list_templates_outcome_total{outcome=""} 0 +# HELP coderd_chatd_list_templates_signals_failures_total Total list_templates calls where ranking signals failed to load, degrading the result toward asking the user. +# TYPE coderd_chatd_list_templates_signals_failures_total counter +coderd_chatd_list_templates_signals_failures_total 0 # HELP coderd_chatd_message_count Number of messages in the prompt per LLM request. # TYPE coderd_chatd_message_count histogram coderd_chatd_message_count{provider="",model=""} 0 @@ -253,6 +262,9 @@ coderd_chatd_stream_buffer_dropped_total 0 # HELP coderd_chatd_stream_retries_total Total LLM stream retries. # TYPE coderd_chatd_stream_retries_total counter coderd_chatd_stream_retries_total{provider="",model="",kind="",chain_broken=""} 0 +# HELP coderd_chatd_template_recommendation_followup_total Total create_workspace calls by how the chosen template related to the prior list_templates recommendation (accepted_recommendation, overrode_with_listed_template, created_listed_without_recommendation, created_unlisted_template, no_recent_list_templates). +# TYPE coderd_chatd_template_recommendation_followup_total counter +coderd_chatd_template_recommendation_followup_total{outcome=""} 0 # HELP coderd_chatd_tool_errors_total Total tool calls that returned an error result. # TYPE coderd_chatd_tool_errors_total counter coderd_chatd_tool_errors_total{provider="",model="",tool_name=""} 0