Skip to content

Commit 6972d07

Browse files
authored
fix: improve background process handling for agent tools (#23132)
## Problem Models frequently use shell `&` instead of `run_in_background=true` when starting long-running processes through `/agents`, causing them to die shortly after starting. This happens because: 1. **No guidance in tool schema** — The `ExecuteArgs` struct had zero `description` tags. The model saw `run_in_background: boolean (optional)` with no explanation of when/why to use it. 2. **Shell `&` is silently broken** — `sh -c "command &"` forks the process, the shell exits immediately, and the forked child becomes an orphan not tracked by the process manager. 3. **No process group isolation** — The SSH subsystem sets `Setsid: true` on spawned processes, but the agent process manager set no `SysProcAttr` at all. Signals only hit the top-level `sh`, not child processes. ## Investigation Compared our implementation against **openai/codex** and **coder/mux**: | Aspect | codex | mux | coder/coder (before) | |--------|-------|-----|---------------------| | Background flag | Yield/resume with `session_id` | `run_in_background` with rich description | `run_in_background` with **no description** | | `&` handling | `setsid()` + `killpg()` | `detached: true` + `killProcessTree()` | **Nothing** — orphaned children escape | | Process isolation | `setsid()` on every spawn | `set -m; nohup ... setsid` for background | **No `SysProcAttr` at all** | | Signal delivery | `killpg(pgid, sig)` — entire group | `kill -15 -\$pid` — negative PID | `proc.cmd.Process.Signal()` — **PID only** | ## Changes ### Fix 1: Add descriptions to `ExecuteArgs` (highest impact) The model now sees explicit guidance: *"Use for long-running processes like dev servers, file watchers, or builds. Do NOT use shell & — it will not work correctly."* ### Fix 2: Update tool description The top-level execute tool description now reinforces: *"Use run_in_background=true for long-running processes. Never use shell '&' for backgrounding."* ### Fix 3: Detect trailing `&` and auto-promote to background Defense-in-depth: if the model still uses `command &`, we strip the `&` and promote to `run_in_background=true` automatically. Correctly distinguishes `&` from `&&`. ### Fix 4: Process group isolation (`Setpgid`) New platform-specific files (`proc_other.go` / `proc_windows.go`) following the same pattern as `agentssh/exec_other.go`. Every spawned process gets its own process group. ### Fix 5: Process group signaling `signal()` now uses `syscall.Kill(-pid, sig)` on Unix to signal the entire process group, ensuring child processes from shell pipelines are also cleaned up. ## Testing All existing `agent/agentproc` tests pass. Both packages compile cleanly.
1 parent 89bb5bb commit 6972d07

4 files changed

Lines changed: 68 additions & 9 deletions

File tree

agent/agentproc/proc_other.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//go:build !windows
2+
3+
package agentproc
4+
5+
import (
6+
"os"
7+
"syscall"
8+
)
9+
10+
// procSysProcAttr returns the SysProcAttr to use when spawning
11+
// processes. On Unix, Setpgid creates a new process group so
12+
// that signals can be delivered to the entire group (the shell
13+
// and all its children).
14+
func procSysProcAttr() *syscall.SysProcAttr {
15+
return &syscall.SysProcAttr{
16+
Setpgid: true,
17+
}
18+
}
19+
20+
// signalProcess sends a signal to the process group rooted at p.
21+
// Using the negative PID sends the signal to every process in the
22+
// group, ensuring child processes (e.g. from shell pipelines) are
23+
// also signaled.
24+
func signalProcess(p *os.Process, sig syscall.Signal) error {
25+
return syscall.Kill(-p.Pid, sig)
26+
}

agent/agentproc/proc_windows.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package agentproc
2+
3+
import (
4+
"os"
5+
"syscall"
6+
)
7+
8+
// procSysProcAttr returns the SysProcAttr to use when spawning
9+
// processes. On Windows, process groups are not supported in the
10+
// same way as Unix, so this returns an empty struct.
11+
func procSysProcAttr() *syscall.SysProcAttr {
12+
return &syscall.SysProcAttr{}
13+
}
14+
15+
// signalProcess sends a signal directly to the process. Windows
16+
// does not support process group signaling, so we fall back to
17+
// sending the signal to the process itself.
18+
func signalProcess(p *os.Process, _ syscall.Signal) error {
19+
return p.Kill()
20+
}

agent/agentproc/process.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ func (m *manager) start(req workspacesdk.StartProcessRequest, chatID string) (*p
113113
cmd.Dir = req.WorkDir
114114
}
115115
cmd.Stdin = nil
116+
cmd.SysProcAttr = procSysProcAttr()
116117

117118
// WaitDelay ensures cmd.Wait returns promptly after
118119
// the process is killed, even if child processes are
@@ -272,13 +273,15 @@ func (m *manager) signal(id string, sig string) error {
272273

273274
switch sig {
274275
case "kill":
275-
if err := proc.cmd.Process.Kill(); err != nil {
276+
// Use process group kill to ensure child processes
277+
// (e.g. from shell pipelines) are also killed.
278+
if err := signalProcess(proc.cmd.Process, syscall.SIGKILL); err != nil {
276279
return xerrors.Errorf("kill process: %w", err)
277280
}
278281
case "terminate":
279-
//nolint:revive // syscall.SIGTERM is portable enough
280-
// for our supported platforms.
281-
if err := proc.cmd.Process.Signal(syscall.SIGTERM); err != nil {
282+
// Use process group signal to ensure child processes
283+
// are also terminated.
284+
if err := signalProcess(proc.cmd.Process, syscall.SIGTERM); err != nil {
282285
return xerrors.Errorf("terminate process: %w", err)
283286
}
284287
default:

coderd/chatd/chattool/execute.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,18 +78,18 @@ type ProcessToolOptions struct {
7878

7979
// ExecuteArgs are the parameters accepted by the execute tool.
8080
type ExecuteArgs struct {
81-
Command string `json:"command"`
82-
Timeout *string `json:"timeout,omitempty"`
83-
WorkDir *string `json:"workdir,omitempty"`
84-
RunInBackground *bool `json:"run_in_background,omitempty"`
81+
Command string `json:"command" description:"The shell command to execute."`
82+
Timeout *string `json:"timeout,omitempty" description:"Timeout duration (e.g. '30s', '5m'). Default is 10s. Only applies to foreground commands."`
83+
WorkDir *string `json:"workdir,omitempty" description:"Working directory for the command."`
84+
RunInBackground *bool `json:"run_in_background,omitempty" description:"Run this command in the background without blocking. Use for long-running processes like dev servers, file watchers, or builds that run longer than 5 seconds. Do NOT use shell & to background processes — it will not work correctly. Always use this parameter instead."`
8585
}
8686

8787
// Execute returns an AgentTool that runs a shell command in the
8888
// workspace via the agent HTTP API.
8989
func Execute(options ExecuteOptions) fantasy.AgentTool {
9090
return fantasy.NewAgentTool(
9191
"execute",
92-
"Execute a shell command in the workspace.",
92+
"Execute a shell command in the workspace. Use run_in_background=true for long-running processes (dev servers, file watchers, builds). Never use shell '&' for backgrounding.",
9393
func(ctx context.Context, args ExecuteArgs, _ fantasy.ToolCall) (fantasy.ToolResponse, error) {
9494
if options.GetWorkspaceConn == nil {
9595
return fantasy.NewTextErrorResponse("workspace connection resolver is not configured"), nil
@@ -122,6 +122,16 @@ func executeTool(
122122

123123
background := args.RunInBackground != nil && *args.RunInBackground
124124

125+
// Detect shell-style backgrounding (trailing &) and promote to
126+
// background mode. Models sometimes use "cmd &" instead of the
127+
// run_in_background parameter, which causes the shell to fork
128+
// and exit immediately, leaving an untracked orphan process.
129+
trimmed := strings.TrimSpace(args.Command)
130+
if !background && strings.HasSuffix(trimmed, "&") && !strings.HasSuffix(trimmed, "&&") {
131+
background = true
132+
args.Command = strings.TrimSpace(strings.TrimSuffix(trimmed, "&"))
133+
}
134+
125135
var workDir string
126136
if args.WorkDir != nil {
127137
workDir = *args.WorkDir

0 commit comments

Comments
 (0)