coder/agent/agent.go at main · MatrixShoo/coder

History

2007 lines (1818 loc) · 61.5 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

package agent

import (

"bytes"

"context"

"encoding/binary"

"encoding/json"

"errors"

"fmt"

"io"

"net"

"net/http"

"net/netip"

"os"

"os/user"

"path/filepath"

"runtime"

"runtime/debug"

"sort"

"strconv"

"strings"

"sync"

"time"

"github.com/go-chi/chi/v5"

"github.com/google/uuid"

"github.com/prometheus/client_golang/prometheus"

"github.com/prometheus/common/expfmt"

"github.com/spf13/afero"

"go.uber.org/atomic"

"golang.org/x/exp/slices"

"golang.org/x/sync/errgroup"

"golang.org/x/xerrors"

"storj.io/drpc"

"tailscale.com/net/speedtest"

"tailscale.com/tailcfg"

"tailscale.com/types/netlogtype"

"tailscale.com/util/clientmetric"

"cdr.dev/slog"

"github.com/coder/retry"

"github.com/coder/coder/v2/agent/agentproc"

"github.com/coder/coder/v2/agent/agentscripts"

"github.com/coder/coder/v2/agent/agentssh"

"github.com/coder/coder/v2/agent/proto"

"github.com/coder/coder/v2/agent/reconnectingpty"

"github.com/coder/coder/v2/buildinfo"

"github.com/coder/coder/v2/cli/gitauth"

"github.com/coder/coder/v2/coderd/database/dbtime"

"github.com/coder/coder/v2/codersdk"

"github.com/coder/coder/v2/codersdk/agentsdk"

"github.com/coder/coder/v2/tailnet"

tailnetproto "github.com/coder/coder/v2/tailnet/proto"

)

// Protocol name constants exported by the agent package.
// NOTE(review): presumably these identify the kinds of connection the agent
// serves; the consumers are not visible in this part of the file.
const (
	// ProtocolReconnectingPTY is the string "reconnecting-pty".
	ProtocolReconnectingPTY = "reconnecting-pty"
	// ProtocolSSH is the string "ssh".
	ProtocolSSH = "ssh"
	// ProtocolDial is the string "dial".
	ProtocolDial = "dial"
)

// EnvProcPrioMgmt is the name of the environment variable that determines
// whether the agent attempts to manage process CPU and OOM Killer priority.
const EnvProcPrioMgmt = "CODER_PROC_PRIO_MGMT"

// Options configures a new agent. New fills in defaults for several fields
// when they are left at their zero value; those defaults are noted per field.
type Options struct {
	// Filesystem defaults to afero.NewOsFs() when nil.
	Filesystem afero.Fs
	// LogDir defaults to TempDir when empty.
	LogDir string
	// TempDir defaults to os.TempDir() when empty.
	TempDir string
	// ScriptDataDir defaults to TempDir when empty.
	ScriptDataDir string
	// ExchangeToken is called at the start of every connection attempt to
	// obtain the session token. It defaults to a function returning an empty
	// token and no error.
	ExchangeToken func(ctx context.Context) (string, error)
	// Client is used to dial the dRPC connection to coderd.
	Client Client
	ReconnectingPTYTimeout time.Duration
	EnvironmentVariables   map[string]string
	Logger                 slog.Logger
	// IgnorePorts lists ports to ignore when listing listening ports.
	IgnorePorts map[int]string
	// PortCacheDuration defaults to one second when zero.
	PortCacheDuration time.Duration
	SSHMaxTimeout     time.Duration
	TailnetListenPort uint16
	Subsystems        []codersdk.AgentSubsystem
	Addresses         []netip.Prefix
	// PrometheusRegistry defaults to a fresh prometheus.NewRegistry() when nil.
	PrometheusRegistry *prometheus.Registry
	// ReportMetadataInterval defaults to one second when zero.
	ReportMetadataInterval time.Duration
	// ServiceBannerRefreshInterval defaults to two minutes when zero.
	ServiceBannerRefreshInterval time.Duration
	// Syscaller defaults to agentproc.NewSyscaller() when nil.
	Syscaller agentproc.Syscaller
	// ModifiedProcesses is used for testing process priority management.
	ModifiedProcesses chan []*agentproc.Process
	// ProcessManagementTick is used for testing process priority management.
	ProcessManagementTick <-chan time.Time
}

// Client is the connection provider the agent depends on to reach coderd.
type Client interface {
	// ConnectRPC dials and returns the dRPC connection the agent uses for its
	// API routines.
	ConnectRPC(ctx context.Context) (drpc.Conn, error)
	// RewriteDERPMap is invoked with the DERP map received in the manifest
	// before the agent uses it.
	// NOTE(review): presumably it mutates derpMap in place; confirm against
	// the implementations.
	RewriteDERPMap(derpMap *tailcfg.DERPMap)
}

// Agent is the public surface of a running agent, as returned by New.
type Agent interface {
	// HTTPDebug returns an HTTP handler.
	// NOTE(review): presumably it serves debugging endpoints; the
	// implementation is not visible in this part of the file.
	HTTPDebug() http.Handler
	// TailnetConn may be nil.
	TailnetConn() *tailnet.Conn
	// Closer shuts the agent down.
	io.Closer
}

func New(options Options) Agent {

if options.Filesystem == nil {

options.Filesystem = afero.NewOsFs()

}

if options.TempDir == "" {

options.TempDir = os.TempDir()

}

if options.LogDir == "" {

if options.TempDir != os.TempDir() {

options.Logger.Debug(context.Background(), "log dir not set, using temp dir", slog.F("temp_dir", options.TempDir))

} else {

options.Logger.Debug(context.Background(), "using log dir", slog.F("log_dir", options.LogDir))

}

options.LogDir = options.TempDir

}

if options.ScriptDataDir == "" {

if options.TempDir != os.TempDir() {

options.Logger.Debug(context.Background(), "script data dir not set, using temp dir", slog.F("temp_dir", options.TempDir))

} else {

options.Logger.Debug(context.Background(), "using script data dir", slog.F("script_data_dir", options.ScriptDataDir))

}

options.ScriptDataDir = options.TempDir

}

if options.ExchangeToken == nil {

options.ExchangeToken = func(ctx context.Context) (string, error) {

return "", nil

}

if options.ReportMetadataInterval == 0 {

options.ReportMetadataInterval = time.Second

}

if options.ServiceBannerRefreshInterval == 0 {

options.ServiceBannerRefreshInterval = 2 * time.Minute

}

if options.PortCacheDuration == 0 {

options.PortCacheDuration = 1 * time.Second

}

prometheusRegistry := options.PrometheusRegistry

if prometheusRegistry == nil {

prometheusRegistry = prometheus.NewRegistry()

}

if options.Syscaller == nil {

options.Syscaller = agentproc.NewSyscaller()

}

hardCtx, hardCancel := context.WithCancel(context.Background())

gracefulCtx, gracefulCancel := context.WithCancel(hardCtx)

a := &agent{

tailnetListenPort: options.TailnetListenPort,

reconnectingPTYTimeout: options.ReconnectingPTYTimeout,

logger: options.Logger,

gracefulCtx: gracefulCtx,

gracefulCancel: gracefulCancel,

hardCtx: hardCtx,

hardCancel: hardCancel,

coordDisconnected: make(chan struct{}),

environmentVariables: options.EnvironmentVariables,

client: options.Client,

exchangeToken: options.ExchangeToken,

filesystem: options.Filesystem,

logDir: options.LogDir,

tempDir: options.TempDir,

scriptDataDir: options.ScriptDataDir,

lifecycleUpdate: make(chan struct{}, 1),

lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),

lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},

ignorePorts: options.IgnorePorts,

portCacheDuration: options.PortCacheDuration,

reportMetadataInterval: options.ReportMetadataInterval,

serviceBannerRefreshInterval: options.ServiceBannerRefreshInterval,

sshMaxTimeout: options.SSHMaxTimeout,

subsystems: options.Subsystems,

addresses: options.Addresses,

syscaller: options.Syscaller,

modifiedProcs: options.ModifiedProcesses,

processManagementTick: options.ProcessManagementTick,

logSender: agentsdk.NewLogSender(options.Logger),

prometheusRegistry: prometheusRegistry,

metrics: newAgentMetrics(prometheusRegistry),

}

// Initially, we have a closed channel, reflecting the fact that we are not initially connected.

// Each time we connect we replace the channel (while holding the closeMutex) with a new one

// that gets closed on disconnection. This is used to wait for graceful disconnection from the

// coordinator during shut down.

close(a.coordDisconnected)

a.serviceBanner.Store(new(codersdk.ServiceBannerConfig))

a.sessionToken.Store(new(string))

a.init()

return a

}

// agent is the concrete implementation returned by New. Its fields are
// populated from Options in New, or lazily once a connection to coderd has
// been established (for example manifest, network and statsReporter).
type agent struct {
	logger            slog.Logger
	client            Client
	exchangeToken     func(ctx context.Context) (string, error)
	tailnetListenPort uint16
	filesystem        afero.Fs
	logDir            string
	tempDir           string
	scriptDataDir     string
	// ignorePorts tells the api handler which ports to ignore when
	// listing all listening ports. This is helpful to hide ports that
	// are used by the agent, that the user does not care about.
	ignorePorts       map[int]string
	portCacheDuration time.Duration
	subsystems        []codersdk.AgentSubsystem

	reconnectingPTYs       sync.Map
	reconnectingPTYTimeout time.Duration

	// we track 2 contexts and associated cancel functions: "graceful" which is Done when it is time
	// to start gracefully shutting down and "hard" which is Done when it is time to close
	// everything down (regardless of whether graceful shutdown completed).
	gracefulCtx       context.Context
	gracefulCancel    context.CancelFunc
	hardCtx           context.Context
	hardCancel        context.CancelFunc
	closeWaitGroup    sync.WaitGroup
	closeMutex        sync.Mutex
	// coordDisconnected starts out closed (see New) because the agent is not
	// initially connected.
	coordDisconnected chan struct{}

	environmentVariables map[string]string

	manifest                     atomic.Pointer[agentsdk.Manifest] // manifest is atomic because values can change after reconnection.
	reportMetadataInterval       time.Duration
	scriptRunner                 *agentscripts.Runner
	serviceBanner                atomic.Pointer[codersdk.ServiceBannerConfig] // serviceBanner is atomic because it is periodically updated.
	serviceBannerRefreshInterval time.Duration
	sessionToken                 atomic.Pointer[string]
	sshServer                    *agentssh.Server
	sshMaxTimeout                time.Duration

	// lifecycleUpdate wakes the lifecycle reporting loop; lifecycleReported
	// carries the most recently reported state.
	lifecycleUpdate   chan struct{}
	lifecycleReported chan codersdk.WorkspaceAgentLifecycle
	lifecycleMu       sync.RWMutex // Protects following.
	lifecycleStates   []agentsdk.PostLifecycleRequest

	// network is nil until the tailnet has been created.
	network       *tailnet.Conn
	addresses     []netip.Prefix
	statsReporter *statsReporter
	logSender     *agentsdk.LogSender

	connCountReconnectingPTY atomic.Int64

	prometheusRegistry *prometheus.Registry
	// metrics are prometheus registered metrics that will be collected and
	// labeled in Coder with the agent + workspace.
	metrics *agentMetrics
	syscaller agentproc.Syscaller

	// modifiedProcs is used for testing process priority management.
	modifiedProcs chan []*agentproc.Process
	// processManagementTick is used for testing process priority management.
	processManagementTick <-chan time.Time
}

// TailnetConn returns the agent's current tailnet connection. The result is
// nil until a network has been stored on the agent.
// NOTE(review): the field is read here without holding closeMutex, whereas
// the network setup code writes it under that mutex; confirm this is intended.
func (a *agent) TailnetConn() *tailnet.Conn {
	conn := a.network
	return conn
}

func (a *agent) init() {

// pass the "hard" context because we explicitly close the SSH server as part of graceful shutdown.

sshSrv, err := agentssh.NewServer(a.hardCtx, a.logger.Named("ssh-server"), a.prometheusRegistry, a.filesystem, &agentssh.Config{

MaxTimeout: a.sshMaxTimeout,

MOTDFile: func() string { return a.manifest.Load().MOTDFile },

ServiceBanner: func() *codersdk.ServiceBannerConfig { return a.serviceBanner.Load() },

UpdateEnv: a.updateCommandEnv,

WorkingDirectory: func() string { return a.manifest.Load().Directory },

})

if err != nil {

panic(err)

}

a.sshServer = sshSrv

a.scriptRunner = agentscripts.New(agentscripts.Options{

LogDir: a.logDir,

DataDirBase: a.scriptDataDir,

Logger: a.logger,

SSHServer: sshSrv,

Filesystem: a.filesystem,

GetScriptLogger: func(logSourceID uuid.UUID) agentscripts.ScriptLogger {

return a.logSender.GetScriptLogger(logSourceID)

})

// Register runner metrics. If the prom registry is nil, the metrics

// will not report anywhere.

a.scriptRunner.RegisterMetrics(a.prometheusRegistry)

go a.runLoop()

}

// runLoop attempts to start the agent in a retry loop.

// Coder may be offline temporarily, a connection issue

// may be happening, but regardless after the intermittent

// failure, you'll want the agent to reconnect.

func (a *agent) runLoop() {

go a.manageProcessPriorityUntilGracefulShutdown()

// need to keep retrying up to the hardCtx so that we can send graceful shutdown-related

// messages.

ctx := a.hardCtx

for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {

a.logger.Info(ctx, "connecting to coderd")

err := a.run()

if err == nil {

continue

}

if ctx.Err() != nil {

// Context canceled errors may come from websocket pings, so we

// don't want to use `errors.Is(err, context.Canceled)` here.

return

}

if a.isClosed() {

return

}

if errors.Is(err, io.EOF) {

a.logger.Info(ctx, "disconnected from coderd")

continue

}

a.logger.Warn(ctx, "run exited with error", slog.Error(err))

}

// collectMetadata runs the script of a single metadata description through
// the SSH server's command machinery and returns its combined stdout/stderr.
//
// The returned result is never nil. Failures are reported through
// result.Error, prefixed with the stage that failed ("create cmd",
// "start cmd" or "run cmd"). Output longer than 10 KiB is truncated and the
// truncation is recorded in result.Error.
func (a *agent) collectMetadata(ctx context.Context, md codersdk.WorkspaceAgentMetadataDescription, now time.Time) *codersdk.WorkspaceAgentMetadataResult {
	// Maximum number of output bytes kept in the result.
	const bufLimit = 10 << 10

	res := &codersdk.WorkspaceAgentMetadataResult{
		// CollectedAt is set here for testing purposes and overrode by
		// coderd to the time of server receipt to solve clock skew.
		//
		// In the future, the server may accept the timestamp from the agent
		// if it can guarantee the clocks are synchronized.
		CollectedAt: now,
	}

	ptyCmd, createErr := a.sshServer.CreateCommand(ctx, md.Script, nil)
	if createErr != nil {
		res.Error = fmt.Sprintf("create cmd: %+v", createErr)
		return res
	}

	// Stdout and stderr share one buffer; stdin is an empty reader.
	var output bytes.Buffer
	execCmd := ptyCmd.AsExec()
	execCmd.Stdout = &output
	execCmd.Stderr = &output
	execCmd.Stdin = io.LimitReader(nil, 0)

	// We split up Start and Wait instead of calling Run so that we can return a more precise error.
	if startErr := execCmd.Start(); startErr != nil {
		res.Error = fmt.Sprintf("start cmd: %+v", startErr)
		return res
	}

	// This error isn't mutually exclusive with useful output.
	runErr := execCmd.Wait()
	if size := output.Len(); size > bufLimit {
		runErr = errors.Join(
			runErr,
			xerrors.Errorf("output truncated from %v to %v bytes", size, bufLimit),
		)
		output.Truncate(bufLimit)
	}

	// Important: if the command times out, we may see a misleading error like
	// "exit status 1", so it's important to include the context error.
	if runErr = errors.Join(runErr, ctx.Err()); runErr != nil {
		res.Error = fmt.Sprintf("run cmd: %+v", runErr)
	}
	res.Value = output.String()
	return res
}

// metadataResultAndKey pairs a collected metadata result with the key of the
// metadata description that produced it, so both can travel over a channel.
type metadataResultAndKey struct {
	// result is the outcome of one collection run.
	result *codersdk.WorkspaceAgentMetadataResult
	// key is the metadata key the result belongs to.
	key string
}

// trySingleflight ensures that at most one call per key is executing at any
// time. Unlike a blocking singleflight, a caller that finds its key already
// in flight returns immediately instead of waiting.
//
// The map m must be initialized by the creator; the zero value is not usable.
type trySingleflight struct {
	mu sync.Mutex
	// m holds the keys that currently have a call in flight.
	m map[string]struct{}
}

// Do runs fn unless another call with the same key is currently running, in
// which case it returns without calling fn. The key is released once fn
// returns.
func (t *trySingleflight) Do(key string, fn func()) {
	t.mu.Lock()
	_, inFlight := t.m[key]
	if !inFlight {
		// Claim the key while still holding the lock.
		t.m[key] = struct{}{}
	}
	t.mu.Unlock()

	if inFlight {
		return
	}

	// Release the key once fn has finished.
	defer func() {
		t.mu.Lock()
		delete(t.m, key)
		t.mu.Unlock()
	}()

	fn()
}

func (a *agent) reportMetadata(ctx context.Context, conn drpc.Conn) error {

tickerDone := make(chan struct{})

collectDone := make(chan struct{})

ctx, cancel := context.WithCancel(ctx)

defer func() {

cancel()

<-collectDone

<-tickerDone

}()

var (

logger = a.logger.Named("metadata")

report = make(chan struct{}, 1)

collect = make(chan struct{}, 1)

metadataResults = make(chan metadataResultAndKey, 1)

)

// Set up collect and report as a single ticker with two channels,

// this is to allow collection and reporting to be triggered

// independently of each other.

go func() {

t := time.NewTicker(a.reportMetadataInterval)

defer func() {

t.Stop()

close(report)

close(collect)

close(tickerDone)

}()

wake := func(c chan<- struct{}) {

select {

case c <- struct{}{}:

default:

}

wake(collect) // Start immediately.

for {

select {

case <-ctx.Done():

return

case <-t.C:

wake(report)

wake(collect)

}

}()

go func() {

defer close(collectDone)

var (

// We use a custom singleflight that immediately returns if there is already

// a goroutine running for a given key. This is to prevent a build-up of

// goroutines waiting on Do when the script takes many multiples of

// baseInterval to run.

flight = trySingleflight{m: map[string]struct{}{}}

lastCollectedAtMu sync.RWMutex

lastCollectedAts = make(map[string]time.Time)

)

for {

select {

case <-ctx.Done():

return

case <-collect:

}

manifest := a.manifest.Load()

if manifest == nil {

continue

}

// If the manifest changes (e.g. on agent reconnect) we need to

// purge old cache values to prevent lastCollectedAt from growing

// boundlessly.

lastCollectedAtMu.Lock()

for key := range lastCollectedAts {

if slices.IndexFunc(manifest.Metadata, func(md codersdk.WorkspaceAgentMetadataDescription) bool {

return md.Key == key

}) < 0 {

logger.Debug(ctx, "deleting lastCollected key, missing from manifest",

slog.F("key", key),

)

delete(lastCollectedAts, key)

}

lastCollectedAtMu.Unlock()

// Spawn a goroutine for each metadata collection, and use a

// channel to synchronize the results and avoid both messy

// mutex logic and overloading the API.

for _, md := range manifest.Metadata {

md := md

// We send the result to the channel in the goroutine to avoid

// sending the same result multiple times. So, we don't care about

// the return values.

go flight.Do(md.Key, func() {

ctx := slog.With(ctx, slog.F("key", md.Key))

lastCollectedAtMu.RLock()

collectedAt, ok := lastCollectedAts[md.Key]

lastCollectedAtMu.RUnlock()

if ok {

// If the interval is zero, we assume the user just wants

// a single collection at startup, not a spinning loop.

if md.Interval == 0 {

return

}

intervalUnit := time.Second

// reportMetadataInterval is only less than a second in tests,

// so adjust the interval unit for them.

if a.reportMetadataInterval < time.Second {

intervalUnit = 100 * time.Millisecond

}

// The last collected value isn't quite stale yet, so we skip it.

if collectedAt.Add(time.Duration(md.Interval) * intervalUnit).After(time.Now()) {

return

}

timeout := md.Timeout

if timeout == 0 {

if md.Interval != 0 {

timeout = md.Interval

} else if interval := int64(a.reportMetadataInterval.Seconds()); interval != 0 {

// Fallback to the report interval

timeout = interval * 3

} else {

// If the interval is still 0 (possible if the interval

// is less than a second), default to 5. This was

// randomly picked.

timeout = 5

}

ctxTimeout := time.Duration(timeout) * time.Second

ctx, cancel := context.WithTimeout(ctx, ctxTimeout)

defer cancel()

now := time.Now()

select {

case <-ctx.Done():

logger.Warn(ctx, "metadata collection timed out", slog.F("timeout", ctxTimeout))

case metadataResults <- metadataResultAndKey{

key: md.Key,

result: a.collectMetadata(ctx, md, now),

lastCollectedAtMu.Lock()

lastCollectedAts[md.Key] = now

lastCollectedAtMu.Unlock()

}

})

}

}()

// Gather metadata updates and report them once every interval. If a

// previous report is in flight, wait for it to complete before

// sending a new one. If the network conditions are bad, we won't

// benefit from canceling the previous send and starting a new one.

var (

updatedMetadata = make(map[string]*codersdk.WorkspaceAgentMetadataResult)

reportTimeout = 30 * time.Second

reportError = make(chan error, 1)

reportInFlight = false

aAPI = proto.NewDRPCAgentClient(conn)

)

for {

select {

case <-ctx.Done():

return ctx.Err()

case mr := <-metadataResults:

// This can overwrite unsent values, but that's fine because

// we're only interested about up-to-date values.

updatedMetadata[mr.key] = mr.result

continue

case err := <-reportError:

a.logger.Debug(ctx, "batch update metadata complete", slog.Error(err))

if err != nil {

return xerrors.Errorf("failed to report metadata: %w", err)

}

reportInFlight = false

case <-report:

if len(updatedMetadata) == 0 {

continue

}

if reportInFlight {

// If there's already a report in flight, don't send

// another one, wait for next tick instead.

a.logger.Debug(ctx, "skipped metadata report tick because report is in flight")

continue

}

metadata := make([]*proto.Metadata, 0, len(updatedMetadata))

for key, result := range updatedMetadata {

pr := agentsdk.ProtoFromMetadataResult(*result)

metadata = append(metadata, &proto.Metadata{

Key: key,

Result: pr,

})

delete(updatedMetadata, key)

}

reportInFlight = true

go func() {

a.logger.Debug(ctx, "batch updating metadata")

ctx, cancel := context.WithTimeout(ctx, reportTimeout)

defer cancel()

_, err := aAPI.BatchUpdateMetadata(ctx, &proto.BatchUpdateMetadataRequest{Metadata: metadata})

reportError <- err

}()

}

// reportLifecycle reports the current lifecycle state once. All state

// changes are reported in order.

func (a *agent) reportLifecycle(ctx context.Context, conn drpc.Conn) error {

aAPI := proto.NewDRPCAgentClient(conn)

lastReportedIndex := 0 // Start off with the created state without reporting it.

for {

select {

case <-a.lifecycleUpdate:

case <-ctx.Done():

return ctx.Err()

}

for {

a.lifecycleMu.RLock()

lastIndex := len(a.lifecycleStates) - 1

report := a.lifecycleStates[lastReportedIndex]

if len(a.lifecycleStates) > lastReportedIndex+1 {

report = a.lifecycleStates[lastReportedIndex+1]

}

a.lifecycleMu.RUnlock()

if lastIndex == lastReportedIndex {

break

}

l, err := agentsdk.ProtoFromLifecycle(report)

if err != nil {

a.logger.Critical(ctx, "failed to convert lifecycle state", slog.F("report", report))

// Skip this report; there is no point retrying. Maybe we can successfully convert the next one?

lastReportedIndex++

continue

}

payload := &proto.UpdateLifecycleRequest{Lifecycle: l}

logger := a.logger.With(slog.F("payload", payload))

logger.Debug(ctx, "reporting lifecycle state")

_, err = aAPI.UpdateLifecycle(ctx, payload)

if err != nil {

return xerrors.Errorf("failed to update lifecycle: %w", err)

}

logger.Debug(ctx, "successfully reported lifecycle state")

lastReportedIndex++

select {

case a.lifecycleReported <- report.State:

case <-a.lifecycleReported:

a.lifecycleReported <- report.State

}

if lastReportedIndex < lastIndex {

// Keep reporting until we've sent all messages, we can't

// rely on the channel triggering us before the backlog is

// consumed.

continue

}

break

}

// setLifecycle sets the lifecycle state and notifies the lifecycle loop.

// The state is only updated if it's a valid state transition.

func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {

report := agentsdk.PostLifecycleRequest{

State: state,

ChangedAt: dbtime.Now(),

}

a.lifecycleMu.Lock()

lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]

if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {

a.logger.Warn(context.Background(), "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))

a.lifecycleMu.Unlock()

return

}

a.lifecycleStates = append(a.lifecycleStates, report)

a.logger.Debug(context.Background(), "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))

a.lifecycleMu.Unlock()

select {

case a.lifecycleUpdate <- struct{}{}:

default:

}

// fetchServiceBannerLoop fetches the service banner on an interval. It will

// not be fetched immediately; the expectation is that it is primed elsewhere

// (and must be done before the session actually starts).

func (a *agent) fetchServiceBannerLoop(ctx context.Context, conn drpc.Conn) error {

aAPI := proto.NewDRPCAgentClient(conn)

ticker := time.NewTicker(a.serviceBannerRefreshInterval)

defer ticker.Stop()

for {

select {

case <-ctx.Done():

return ctx.Err()

case <-ticker.C:

sbp, err := aAPI.GetServiceBanner(ctx, &proto.GetServiceBannerRequest{})

if err != nil {

if ctx.Err() != nil {

return ctx.Err()

}

a.logger.Error(ctx, "failed to update service banner", slog.Error(err))

return err

}

serviceBanner := agentsdk.ServiceBannerFromProto(sbp)

a.serviceBanner.Store(&serviceBanner)

}

func (a *agent) run() (retErr error) {

// This allows the agent to refresh it's token if necessary.

// For instance identity this is required, since the instance

// may not have re-provisioned, but a new agent ID was created.

sessionToken, err := a.exchangeToken(a.hardCtx)

if err != nil {

return xerrors.Errorf("exchange token: %w", err)

}

a.sessionToken.Store(&sessionToken)

// ConnectRPC returns the dRPC connection we use for the Agent and Tailnet v2+ APIs

conn, err := a.client.ConnectRPC(a.hardCtx)

if err != nil {

return err

}

defer func() {

cErr := conn.Close()

if cErr != nil {

a.logger.Debug(a.hardCtx, "error closing drpc connection", slog.Error(err))

}

}()

// A lot of routines need the agent API / tailnet API connection. We run them in their own

// goroutines in parallel, but errors in any routine will cause them all to exit so we can

// redial the coder server and retry.

connMan := newAPIConnRoutineManager(a.gracefulCtx, a.hardCtx, a.logger, conn)

connMan.start("init service banner", gracefulShutdownBehaviorStop,

func(ctx context.Context, conn drpc.Conn) error {

aAPI := proto.NewDRPCAgentClient(conn)

sbp, err := aAPI.GetServiceBanner(ctx, &proto.GetServiceBannerRequest{})

if err != nil {

return xerrors.Errorf("fetch service banner: %w", err)

}

serviceBanner := agentsdk.ServiceBannerFromProto(sbp)

a.serviceBanner.Store(&serviceBanner)

return nil

)

// sending logs gets gracefulShutdownBehaviorRemain because we want to send logs generated by

// shutdown scripts.

connMan.start("send logs", gracefulShutdownBehaviorRemain,

func(ctx context.Context, conn drpc.Conn) error {

err := a.logSender.SendLoop(ctx, proto.NewDRPCAgentClient(conn))

if xerrors.Is(err, agentsdk.LogLimitExceededError) {

// we don't want this error to tear down the API connection and propagate to the

// other routines that use the API. The LogSender has already dropped a warning

// log, so just return nil here.

return nil

}

return err

})

// part of graceful shut down is reporting the final lifecycle states, e.g "ShuttingDown" so the

// lifecycle reporting has to be via gracefulShutdownBehaviorRemain

connMan.start("report lifecycle", gracefulShutdownBehaviorRemain, a.reportLifecycle)

// metadata reporting can cease as soon as we start gracefully shutting down

connMan.start("report metadata", gracefulShutdownBehaviorStop, a.reportMetadata)

// channels to sync goroutines below

// handle manifest

// |

// manifestOK

// | |

// | +----------------------+

// V |

// app health reporter |

// V

// create or update network

// |

// networkOK

// |

// coordination <--------------------------+

// derp map subscriber <----------------+

// stats report loop <---------------+

networkOK := make(chan struct{})

manifestOK := make(chan struct{})

connMan.start("handle manifest", gracefulShutdownBehaviorStop, a.handleManifest(manifestOK))

connMan.start("app health reporter", gracefulShutdownBehaviorStop,

func(ctx context.Context, conn drpc.Conn) error {

select {

case <-ctx.Done():

return nil

case <-manifestOK:

manifest := a.manifest.Load()

NewWorkspaceAppHealthReporter(

a.logger, manifest.Apps, agentsdk.AppHealthPoster(proto.NewDRPCAgentClient(conn)),

)(ctx)

return nil

}

})

connMan.start("create or update network", gracefulShutdownBehaviorStop,

a.createOrUpdateNetwork(manifestOK, networkOK))

connMan.start("coordination", gracefulShutdownBehaviorStop,

func(ctx context.Context, conn drpc.Conn) error {

select {

case <-ctx.Done():

return nil

case <-networkOK:

}

return a.runCoordinator(ctx, conn, a.network)

)

connMan.start("derp map subscriber", gracefulShutdownBehaviorStop,

func(ctx context.Context, conn drpc.Conn) error {

select {

case <-ctx.Done():

return nil

case <-networkOK:

}

return a.runDERPMapSubscriber(ctx, conn, a.network)

})

connMan.start("fetch service banner loop", gracefulShutdownBehaviorStop, a.fetchServiceBannerLoop)

connMan.start("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, conn drpc.Conn) error {

select {

case <-ctx.Done():

return nil

case <-networkOK:

}

return a.statsReporter.reportLoop(ctx, proto.NewDRPCAgentClient(conn))

})

return connMan.wait()

}

// handleManifest returns a function that fetches and processes the manifest

func (a *agent) handleManifest(manifestOK chan<- struct{}) func(ctx context.Context, conn drpc.Conn) error {

return func(ctx context.Context, conn drpc.Conn) error {

aAPI := proto.NewDRPCAgentClient(conn)

mp, err := aAPI.GetManifest(ctx, &proto.GetManifestRequest{})

if err != nil {

return xerrors.Errorf("fetch metadata: %w", err)

}

a.logger.Info(ctx, "fetched manifest", slog.F("manifest", mp))

manifest, err := agentsdk.ManifestFromProto(mp)

if err != nil {

a.logger.Critical(ctx, "failed to convert manifest", slog.F("manifest", mp), slog.Error(err))

return xerrors.Errorf("convert manifest: %w", err)

}

if manifest.AgentID == uuid.Nil {

return xerrors.New("nil agentID returned by manifest")

}

a.client.RewriteDERPMap(manifest.DERPMap)

// Expand the directory and send it back to coderd so external

// applications that rely on the directory can use it.

// An example is VS Code Remote, which must know the directory

// before initializing a connection.

manifest.Directory, err = expandDirectory(manifest.Directory)

if err != nil {

return xerrors.Errorf("expand directory: %w", err)

}

subsys, err := agentsdk.ProtoFromSubsystems(a.subsystems)

if err != nil {

a.logger.Critical(ctx, "failed to convert subsystems", slog.Error(err))

return xerrors.Errorf("failed to convert subsystems: %w", err)

}

_, err = aAPI.UpdateStartup(ctx, &proto.UpdateStartupRequest{Startup: &proto.Startup{

Version: buildinfo.Version(),

ExpandedDirectory: manifest.Directory,

Subsystems: subsys,

}})

if err != nil {

if xerrors.Is(err, context.Canceled) {

return nil

}

return xerrors.Errorf("update workspace agent startup: %w", err)

}

oldManifest := a.manifest.Swap(&manifest)

close(manifestOK)

// The startup script should only execute on the first run!

if oldManifest == nil {

a.setLifecycle(codersdk.WorkspaceAgentLifecycleStarting)

// Perform overrides early so that Git auth can work even if users

// connect to a workspace that is not yet ready. We don't run this

// concurrently with the startup script to avoid conflicts between

// them.

if manifest.GitAuthConfigs > 0 {

// If this fails, we should consider surfacing the error in the

// startup log and setting the lifecycle state to be "start_error"

// (after startup script completion), but for now we'll just log it.

err := gitauth.OverrideVSCodeConfigs(a.filesystem)

if err != nil {

a.logger.Warn(ctx, "failed to override vscode git auth configs", slog.Error(err))

}

err = a.scriptRunner.Init(manifest.Scripts)

if err != nil {

return xerrors.Errorf("init script runner: %w", err)

}

err = a.trackGoroutine(func() {

start := time.Now()

// here we use the graceful context because the script runner is not directly tied

// to the agent API.

err := a.scriptRunner.Execute(a.gracefulCtx, func(script codersdk.WorkspaceAgentScript) bool {

return script.RunOnStart

})

// Measure the time immediately after the script has finished

dur := time.Since(start).Seconds()

if err != nil {

a.logger.Warn(ctx, "startup script(s) failed", slog.Error(err))

if errors.Is(err, agentscripts.ErrTimeout) {

a.setLifecycle(codersdk.WorkspaceAgentLifecycleStartTimeout)

} else {

a.setLifecycle(codersdk.WorkspaceAgentLifecycleStartError)

}

} else {

a.setLifecycle(codersdk.WorkspaceAgentLifecycleReady)

}

label := "false"

if err == nil {

label = "true"

}

a.metrics.startupScriptSeconds.WithLabelValues(label).Set(dur)

a.scriptRunner.StartCron()

})

if err != nil {

return xerrors.Errorf("track conn goroutine: %w", err)

}

return nil

}

// createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates

// the tailnet using the information in the manifest

func (a *agent) createOrUpdateNetwork(manifestOK <-chan struct{}, networkOK chan<- struct{}) func(context.Context, drpc.Conn) error {

return func(ctx context.Context, _ drpc.Conn) error {

select {

case <-ctx.Done():

return nil

case <-manifestOK:

}

var err error

manifest := a.manifest.Load()

a.closeMutex.Lock()

network := a.network

a.closeMutex.Unlock()

if network == nil {

// use the graceful context here, because creating the tailnet is not itself tied to the

// agent API.

network, err = a.createTailnet(a.gracefulCtx, manifest.AgentID, manifest.DERPMap, manifest.DERPForceWebSockets, manifest.DisableDirectConnections)

if err != nil {

return xerrors.Errorf("create tailnet: %w", err)

}

a.closeMutex.Lock()

// Re-check if agent was closed while initializing the network.

closed := a.isClosed()

if !closed {

a.network = network

a.statsReporter = newStatsReporter(a.logger, network, a)

}

a.closeMutex.Unlock()

if closed {

_ = network.Close()

return xerrors.New("agent is closed")

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

agent.go

Latest commit

History

agent.go

File metadata and controls