-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Expand file tree
/
Copy pathapphealth.go
More file actions
192 lines (171 loc) · 6.02 KB
/
apphealth.go
File metadata and controls
192 lines (171 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
package agent
import (
"context"
"net/http"
"sync"
"time"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog/v3"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/quartz"
)
// PostWorkspaceAgentAppHealth updates the workspace app health.
type PostWorkspaceAgentAppHealth func(context.Context, agentsdk.PostAppHealthsRequest) error
// WorkspaceAppHealthReporter is a function that checks and reports the health of the workspace apps until the passed context is canceled.
type WorkspaceAppHealthReporter func(ctx context.Context)
// NewWorkspaceAppHealthReporter creates a WorkspaceAppHealthReporter that reports app health to coderd.
func NewWorkspaceAppHealthReporter(logger slog.Logger, apps []codersdk.WorkspaceApp, postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth) WorkspaceAppHealthReporter {
return NewAppHealthReporterWithClock(logger, apps, postWorkspaceAgentAppHealth, quartz.NewReal())
}
// NewAppHealthReporterWithClock is only called directly by test code. Product code should call
// NewAppHealthReporter.
func NewAppHealthReporterWithClock(
logger slog.Logger,
apps []codersdk.WorkspaceApp,
postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth,
clk quartz.Clock,
) WorkspaceAppHealthReporter {
logger = logger.Named("apphealth")
return func(ctx context.Context) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
// no need to run this loop if no apps for this workspace.
if len(apps) == 0 {
return
}
hasHealthchecksEnabled := false
health := make(map[uuid.UUID]codersdk.WorkspaceAppHealth, 0)
for _, app := range apps {
if app.Health == codersdk.WorkspaceAppHealthDisabled {
continue
}
health[app.ID] = app.Health
hasHealthchecksEnabled = true
}
// no need to run this loop if no health checks are configured.
if !hasHealthchecksEnabled {
return
}
// run a ticker for each app health check.
var mu sync.RWMutex
failures := make(map[uuid.UUID]int, 0)
client := &http.Client{}
for _, nextApp := range apps {
if !shouldStartTicker(nextApp) {
continue
}
app := nextApp
go func() {
_ = clk.TickerFunc(ctx, time.Duration(app.Healthcheck.Interval)*time.Second, func() error {
// We time out at the healthcheck interval to prevent getting too backed up, but
// set it 1ms early so that it's not simultaneous with the next tick in testing,
// which makes the test easier to understand.
//
// It would be idiomatic to use the http.Client.Timeout or a context.WithTimeout,
// but we are passing this off to the native http library, which is not aware
// of the clock library we are using. That means in testing, with a mock clock
// it will compare mocked times with real times, and we will get strange results.
// So, we just implement the timeout as a context we cancel with an AfterFunc
reqCtx, reqCancel := context.WithCancel(ctx)
timeout := clk.AfterFunc(
time.Duration(app.Healthcheck.Interval)*time.Second-time.Millisecond,
reqCancel,
"timeout", app.Slug)
defer timeout.Stop()
err := func() error {
req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, app.Healthcheck.URL, nil)
if err != nil {
return err
}
res, err := client.Do(req)
if err != nil {
return err
}
// successful healthcheck is a non-5XX status code
_ = res.Body.Close()
if res.StatusCode >= http.StatusInternalServerError {
return xerrors.Errorf("error status code: %d", res.StatusCode)
}
return nil
}()
if err != nil {
nowUnhealthy := false
mu.Lock()
if failures[app.ID] < int(app.Healthcheck.Threshold) {
// increment the failure count and keep status the same.
// we will change it when we hit the threshold.
failures[app.ID]++
} else {
// set to unhealthy if we hit the failure threshold.
// we stop incrementing at the threshold to prevent the failure value from increasing forever.
health[app.ID] = codersdk.WorkspaceAppHealthUnhealthy
nowUnhealthy = true
}
mu.Unlock()
logger.Debug(ctx, "error checking app health",
slog.F("id", app.ID.String()),
slog.F("slug", app.Slug),
slog.F("now_unhealthy", nowUnhealthy), slog.Error(err),
)
} else {
mu.Lock()
// we only need one successful health check to be considered healthy.
health[app.ID] = codersdk.WorkspaceAppHealthHealthy
failures[app.ID] = 0
mu.Unlock()
logger.Debug(ctx, "workspace app healthy", slog.F("id", app.ID.String()), slog.F("slug", app.Slug))
}
return nil
}, "healthcheck", app.Slug)
}()
}
mu.Lock()
lastHealth := copyHealth(health)
mu.Unlock()
reportTicker := clk.TickerFunc(ctx, time.Second, func() error {
mu.RLock()
changed := healthChanged(lastHealth, health)
mu.RUnlock()
if !changed {
return nil
}
mu.Lock()
lastHealth = copyHealth(health)
mu.Unlock()
err := postWorkspaceAgentAppHealth(ctx, agentsdk.PostAppHealthsRequest{
Healths: lastHealth,
})
if err != nil {
logger.Error(ctx, "failed to report workspace app health", slog.Error(err))
} else {
logger.Debug(ctx, "sent workspace app health", slog.F("health", lastHealth))
}
return nil
}, "report")
_ = reportTicker.Wait() // only possible error is context done
}
}
func shouldStartTicker(app codersdk.WorkspaceApp) bool {
return app.Healthcheck.URL != "" && app.Healthcheck.Interval > 0 && app.Healthcheck.Threshold > 0
}
func healthChanged(old map[uuid.UUID]codersdk.WorkspaceAppHealth, updated map[uuid.UUID]codersdk.WorkspaceAppHealth) bool {
for name, newValue := range updated {
oldValue, found := old[name]
if !found {
return true
}
if newValue != oldValue {
return true
}
}
return false
}
func copyHealth(h1 map[uuid.UUID]codersdk.WorkspaceAppHealth) map[uuid.UUID]codersdk.WorkspaceAppHealth {
h2 := make(map[uuid.UUID]codersdk.WorkspaceAppHealth, 0)
for k, v := range h1 {
h2[k] = v
}
return h2
}