Skip to content

Commit 91fb38e

Browse files
committed
remote_worker: Serialize fork() calls. Fixes bazelbuild#3356
Spawning processes concurrently from multiple threads doesn't work reliably on Linux (see bug for details). This change introduces simply puts a synchronized block around fork(). I have tested this change by repeatedly running a build of bazel for 2 hours on a 64 core machine with up to 200 concurrent actions. There wasn't a single failure. PiperOrigin-RevId: 164450559
1 parent 904563c commit 91fb38e

File tree

1 file changed

+34
-30
lines changed

1 file changed

+34
-30
lines changed

src/tools/remote_worker/src/main/java/com/google/devtools/build/remote/ExecutionServer.java

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import com.google.devtools.build.lib.shell.Command;
3333
import com.google.devtools.build.lib.shell.CommandException;
3434
import com.google.devtools.build.lib.shell.CommandResult;
35+
import com.google.devtools.build.lib.shell.FutureCommandResult;
3536
import com.google.devtools.build.lib.shell.TimeoutKillableObserver;
3637
import com.google.devtools.build.lib.vfs.FileSystemUtils;
3738
import com.google.devtools.build.lib.vfs.Path;
@@ -49,6 +50,7 @@
4950
import io.grpc.StatusException;
5051
import io.grpc.protobuf.StatusProto;
5152
import io.grpc.stub.StreamObserver;
53+
import java.io.ByteArrayInputStream;
5254
import java.io.ByteArrayOutputStream;
5355
import java.io.File;
5456
import java.io.IOException;
@@ -71,6 +73,8 @@
7173
final class ExecutionServer extends ExecutionImplBase {
7274
private static final Logger logger = Logger.getLogger(ExecutionServer.class.getName());
7375

76+
private final Object lock = new Object();
77+
7478
// The name of the container image entry in the Platform proto
7579
// (see third_party/googleapis/devtools/remoteexecution/*/remote_execution.proto and
7680
// experimental_remote_platform_override in
@@ -200,41 +204,41 @@ private ActionResult execute(Action action, Path execRoot)
200204
execRoot.getPathString());
201205
long startTime = System.currentTimeMillis();
202206
CommandResult cmdResult = null;
203-
// Linux does not provide a safe API for a multi-threaded program to fork a subprocess. Consider
204-
// the case where two threads both write an executable file and then try to execute it. It can
205-
// happen that the first thread writes its executable file, with the file descriptor still
206-
// being open when the second thread forks, with the fork inheriting a copy of the file
207-
// descriptor. Then the first thread closes the original file descriptor, and proceeds to
208-
// execute the file. At that point Linux sees an open file descriptor to the file and returns
209-
// ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec duality, with
210-
// fork always inheriting a copy of the file descriptor table; if there was a way to fork
211-
// without copying the entire file descriptor table (e.g., only copy specific entries), we could
212-
// avoid this race.
213-
//
214-
// I was able to reproduce this problem reliably by running significantly more threads than
215-
// there are CPU cores on my workstation - the more threads the more likely it happens.
216-
//
217-
// As a workaround, we retry up to two times before we let the exception propagate.
218-
int attempt = 0;
219-
while (true) {
207+
208+
FutureCommandResult futureCmdResult = null;
209+
synchronized (lock) {
210+
// Linux does not provide a safe API for a multi-threaded program to fork a subprocess.
211+
// Consider the case where two threads both write an executable file and then try to execute
212+
// it. It can happen that the first thread writes its executable file, with the file
213+
// descriptor still being open when the second thread forks, with the fork inheriting a copy
214+
// of the file descriptor. Then the first thread closes the original file descriptor, and
215+
// proceeds to execute the file. At that point Linux sees an open file descriptor to the file
216+
// and returns ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec
217+
// duality, with fork always inheriting a copy of the file descriptor table; if there was a
218+
// way to fork without copying the entire file descriptor table (e.g., only copy specific
219+
// entries), we could avoid this race.
220+
//
221+
// I was able to reproduce this problem reliably by running significantly more threads than
222+
// there are CPU cores on my workstation - the more threads the more likely it happens.
223+
//
224+
// As a workaround, we put a synchronized block around the fork.
225+
try {
226+
futureCmdResult = cmd
227+
.executeAsynchronously(new ByteArrayInputStream(new byte[0]), Command.NO_OBSERVER,
228+
stdoutBuffer, stderrBuffer, true, false);
229+
} catch (CommandException e) {
230+
Throwables.throwIfInstanceOf(e.getCause(), IOException.class);
231+
}
232+
}
233+
234+
if (futureCmdResult != null) {
220235
try {
221-
cmdResult =
222-
cmd.execute(Command.NO_INPUT, Command.NO_OBSERVER, stdoutBuffer, stderrBuffer, true);
236+
cmdResult = futureCmdResult.get();
223237
} catch (AbnormalTerminationException e) {
224238
cmdResult = e.getResult();
225-
} catch (CommandException e) {
226-
// As of this writing, the cause can only be an IOException from the underlying library.
227-
IOException cause = (IOException) e.getCause();
228-
if ((attempt++ < 3) && cause.getMessage().endsWith("Text file busy")) {
229-
// We wait a bit to give the other forks some time to close their open file descriptors.
230-
Thread.sleep(10);
231-
continue;
232-
} else {
233-
throw cause;
234-
}
235239
}
236-
break;
237240
}
241+
238242
long timeoutMillis =
239243
action.hasTimeout()
240244
? Durations.toMillis(action.getTimeout())

0 commit comments

Comments
 (0)