@@ -71,6 +71,7 @@ type Client interface {
7171 WorkspaceAgentMetadata (ctx context.Context ) (codersdk.WorkspaceAgentMetadata , error )
7272 ListenWorkspaceAgent (ctx context.Context ) (net.Conn , error )
7373 AgentReportStats (ctx context.Context , log slog.Logger , stats func () * codersdk.AgentStats ) (io.Closer , error )
74+ PostWorkspaceAgentLifecycle (ctx context.Context , state codersdk.PostWorkspaceAgentLifecycleRequest ) error
7475 PostWorkspaceAgentAppHealth (ctx context.Context , req codersdk.PostWorkspaceAppHealthsRequest ) error
7576 PostWorkspaceAgentVersion (ctx context.Context , version string ) error
7677}
@@ -101,6 +102,7 @@ func New(options Options) io.Closer {
101102 exchangeToken : options .ExchangeToken ,
102103 filesystem : options .Filesystem ,
103104 tempDir : options .TempDir ,
105+ lifecycleUpdate : make (chan struct {}, 1 ),
104106 }
105107 a .init (ctx )
106108 return a
@@ -127,6 +129,10 @@ type agent struct {
127129 sessionToken atomic.Pointer [string ]
128130 sshServer * ssh.Server
129131
132+ lifecycleUpdate chan struct {}
133+ lifecycleMu sync.Mutex // Protects following.
134+ lifecycleState codersdk.WorkspaceAgentLifecycle
135+
130136 network * tailnet.Conn
131137}
132138
@@ -135,6 +141,8 @@ type agent struct {
135141// may be happening, but regardless after the intermittent
136142// failure, you'll want the agent to reconnect.
137143func (a * agent ) runLoop (ctx context.Context ) {
144+ go a .reportLifecycleLoop (ctx )
145+
138146 for retrier := retry .New (100 * time .Millisecond , 10 * time .Second ); retrier .Wait (ctx ); {
139147 a .logger .Info (ctx , "running loop" )
140148 err := a .run (ctx )
@@ -156,6 +164,58 @@ func (a *agent) runLoop(ctx context.Context) {
156164 }
157165}
158166
167+ // reportLifecycleLoop reports the current lifecycle state once.
168+ // Only the latest state is reported, intermediate states may be
169+ // lost if the agent can't communicate with the API.
170+ func (a * agent ) reportLifecycleLoop (ctx context.Context ) {
171+ var lastReported codersdk.WorkspaceAgentLifecycle
172+ for {
173+ select {
174+ case <- a .lifecycleUpdate :
175+ case <- ctx .Done ():
176+ return
177+ }
178+
179+ for r := retry .New (time .Second , 15 * time .Second ); r .Wait (ctx ); {
180+ a .lifecycleMu .Lock ()
181+ state := a .lifecycleState
182+ a .lifecycleMu .Unlock ()
183+
184+ if state == lastReported {
185+ break
186+ }
187+
188+ a .logger .Debug (ctx , "post lifecycle state" , slog .F ("state" , state ))
189+
190+ err := a .client .PostWorkspaceAgentLifecycle (ctx , codersdk.PostWorkspaceAgentLifecycleRequest {
191+ State : state ,
192+ })
193+ if err == nil {
194+ lastReported = state
195+ break
196+ }
197+ if xerrors .Is (err , context .Canceled ) || xerrors .Is (err , context .DeadlineExceeded ) {
198+ return
199+ }
200+ // If we fail to report the state we probably shouldn't exit, log only.
201+ a .logger .Error (ctx , "post state" , slog .Error (err ))
202+ }
203+ }
204+ }
205+
206+ func (a * agent ) setLifecycle (ctx context.Context , state codersdk.WorkspaceAgentLifecycle ) {
207+ a .lifecycleMu .Lock ()
208+ defer a .lifecycleMu .Unlock ()
209+
210+ a .logger .Debug (ctx , "set lifecycle state" , slog .F ("state" , state ), slog .F ("previous" , a .lifecycleState ))
211+
212+ a .lifecycleState = state
213+ select {
214+ case a .lifecycleUpdate <- struct {}{}:
215+ default :
216+ }
217+ }
218+
159219func (a * agent ) run (ctx context.Context ) error {
160220 // This allows the agent to refresh its token if necessary.
161221 // For instance identity this is required, since the instance
@@ -180,22 +240,60 @@ func (a *agent) run(ctx context.Context) error {
180240
181241 // The startup script should only execute on the first run!
182242 if oldMetadata == nil {
243+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleStarting )
244+
245+ // Perform overrides early so that Git auth can work even if users
246+ // connect to a workspace that is not yet ready. We don't run this
247+ // concurrently with the startup script to avoid conflicts between
248+ // them.
249+ if metadata .GitAuthConfigs > 0 {
250+ // If this fails, we should consider surfacing the error in the
251+ // startup log and setting the lifecycle state to be "start_error"
252+ // (after startup script completion), but for now we'll just log it.
253+ err := gitauth .OverrideVSCodeConfigs (a .filesystem )
254+ if err != nil {
255+ a .logger .Warn (ctx , "failed to override vscode git auth configs" , slog .Error (err ))
256+ }
257+ }
258+
259+ scriptDone := make (chan error , 1 )
260+ scriptStart := time .Now ()
261+ go func () {
262+ defer close (scriptDone )
263+ scriptDone <- a .runStartupScript (ctx , metadata .StartupScript )
264+ }()
183265 go func () {
184- err := a .runStartupScript (ctx , metadata .StartupScript )
266+ var timeout <- chan time.Time
267+ // If timeout is zero, an older version of the coder
268+ // provider was used. Otherwise a timeout is always > 0.
269+ if metadata .StartupScriptTimeout > 0 {
270+ t := time .NewTimer (metadata .StartupScriptTimeout )
271+ defer t .Stop ()
272+ timeout = t .C
273+ }
274+
275+ var err error
276+ select {
277+ case err = <- scriptDone :
278+ case <- timeout :
279+ a .logger .Warn (ctx , "startup script timed out" )
280+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleStartTimeout )
281+ err = <- scriptDone // The script can still complete after a timeout.
282+ }
185283 if errors .Is (err , context .Canceled ) {
186284 return
187285 }
286+ execTime := time .Since (scriptStart )
287+ lifecycleStatus := codersdk .WorkspaceAgentLifecycleReady
188288 if err != nil {
189- a .logger .Warn (ctx , "agent script failed" , slog .Error (err ))
289+ a .logger .Warn (ctx , "startup script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
290+ lifecycleStatus = codersdk .WorkspaceAgentLifecycleStartError
291+ } else {
292+ a .logger .Info (ctx , "startup script completed" , slog .F ("execution_time" , execTime ))
190293 }
191- }()
192- }
193294
194- if metadata .GitAuthConfigs > 0 {
195- err = gitauth .OverrideVSCodeConfigs (a .filesystem )
196- if err != nil {
197- return xerrors .Errorf ("override vscode configuration for git auth: %w" , err )
198- }
295+ a .setLifecycle (ctx , lifecycleStatus )
296+ }()
199297 }
200298
201299 // This automatically closes when the context ends!
0 commit comments