@@ -12,7 +12,7 @@ import {
1212} from "@trigger.dev/core/v3" ;
1313import { ZodNamespace } from "@trigger.dev/core/v3/zodNamespace" ;
1414import { ZodSocketConnection } from "@trigger.dev/core/v3/zodSocket" ;
15- import { HttpReply , getTextBody , SimpleLogger } from "@trigger.dev/core-apps" ;
15+ import { HttpReply , getTextBody , SimpleLogger , testDockerCheckpoint } from "@trigger.dev/core-apps" ;
1616import { ExponentialBackoff } from "./backoff" ;
1717
1818import { collectDefaultMetrics , register , Gauge } from "prom-client" ;
@@ -72,7 +72,10 @@ type CheckpointAndPushOptions = {
7272
7373type CheckpointAndPushResult =
7474 | { success : true ; checkpoint : CheckpointData }
75- | { success : false ; reason ?: "CANCELED" | "DISABLED" | "ERROR" | "IN_PROGRESS" | "NO_SUPPORT" } ;
75+ | {
76+ success : false ;
77+ reason ?: "CANCELED" | "DISABLED" | "ERROR" | "IN_PROGRESS" | "NO_SUPPORT" | "SKIP_RETRYING" ;
78+ } ;
7679
7780type CheckpointData = {
7881 location : string ;
@@ -125,65 +128,53 @@ class Checkpointer {
125128
126129 constructor ( private opts = { forceSimulate : false } ) { }
127130
128- async initialize ( ) : Promise < CheckpointerInitializeReturn > {
131+ async init ( ) : Promise < CheckpointerInitializeReturn > {
129132 if ( this . #initialized) {
130- return this . #getInitializeReturn ( ) ;
133+ return this . #getInitReturn ( this . #canCheckpoint ) ;
131134 }
132135
133136 this . #logger. log ( `${ this . #dockerMode ? "Docker" : "Kubernetes" } mode` ) ;
134137
135138 if ( this . #dockerMode) {
136- try {
137- await $ `criu --version` ;
138- } catch ( error ) {
139- this . #logger. error ( "No checkpoint support: Missing CRIU binary" ) ;
140- this . #logger. error ( "Will simulate instead" ) ;
141- this . #canCheckpoint = false ;
142- this . #initialized = true ;
139+ const testCheckpoint = await testDockerCheckpoint ( ) ;
143140
144- return this . #getInitializeReturn( ) ;
141+ if ( testCheckpoint . ok ) {
142+ return this . #getInitReturn( true ) ;
145143 }
146144
147- try {
148- await $ `docker checkpoint` ;
149- } catch ( error ) {
150- this . #logger. error (
151- "No checkpoint support: Docker needs to have experimental features enabled"
152- ) ;
153- this . #logger. error ( "Will simulate instead" ) ;
154- this . #canCheckpoint = false ;
155- this . #initialized = true ;
156-
157- return this . #getInitializeReturn( ) ;
158- }
145+ this . #logger. error ( testCheckpoint . message , testCheckpoint . error ?? "" ) ;
146+ return this . #getInitReturn( false ) ;
159147 } else {
160148 try {
161149 await $ `buildah login --get-login ${ REGISTRY_HOST } ` ;
162150 } catch ( error ) {
163151 this . #logger. error ( `No checkpoint support: Not logged in to registry ${ REGISTRY_HOST } ` ) ;
164- this . #canCheckpoint = false ;
165- this . #initialized = true ;
166-
167- return this . #getInitializeReturn( ) ;
152+ return this . #getInitReturn( false ) ;
168153 }
169154 }
170155
171- this . #logger. log (
172- `Full checkpoint support${
173- this . #dockerMode && this . opts . forceSimulate ? " with forced simulation enabled." : "!"
174- } `
175- ) ;
156+ return this . #getInitReturn( true ) ;
157+ }
176158
159+ #getInitReturn( canCheckpoint : boolean ) : CheckpointerInitializeReturn {
177160 this . #initialized = true ;
178- this . #canCheckpoint = true ;
161+ this . #canCheckpoint = canCheckpoint ;
179162
180- return this . #getInitializeReturn( ) ;
181- }
163+ if ( canCheckpoint ) {
164+ this . #logger. log ( "Full checkpoint support!" ) ;
165+ }
166+
167+ const willSimulate = this . #dockerMode && ( ! this . #canCheckpoint || this . opts . forceSimulate ) ;
168+
169+ if ( willSimulate ) {
170+ this . #logger. log ( "Simulation mode enabled. Containers will be paused, not checkpointed." , {
171+ forceSimulate : this . opts . forceSimulate ,
172+ } ) ;
173+ }
182174
183- #getInitializeReturn( ) : CheckpointerInitializeReturn {
184175 return {
185- canCheckpoint : this . #canCheckpoint ,
186- willSimulate : this . #dockerMode && ( ! this . #canCheckpoint || this . opts . forceSimulate ) ,
176+ canCheckpoint,
177+ willSimulate,
187178 } ;
188179 }
189180
@@ -327,6 +318,11 @@ class Checkpointer {
327318 return result ;
328319 }
329320
321+ if ( result . reason === "SKIP_RETRYING" ) {
322+ this . #logger. log ( "Skipping retrying" , { runId } ) ;
323+ return result ;
324+ }
325+
330326 continue ;
331327 } catch ( error ) {
332328 this . #logger. error ( "Checkpoint error" , {
@@ -355,7 +351,7 @@ class Checkpointer {
355351 projectRef,
356352 deploymentVersion,
357353 } : CheckpointAndPushOptions ) : Promise < CheckpointAndPushResult > {
358- await this . initialize ( ) ;
354+ await this . init ( ) ;
359355
360356 const options = {
361357 runId,
@@ -473,7 +469,8 @@ class Checkpointer {
473469
474470 // Create checkpoint (CRI)
475471 if ( ! this . #canCheckpoint) {
476- throw new Error ( "No checkpoint support in kubernetes mode." ) ;
472+ this . #logger. error ( "No checkpoint support in kubernetes mode." ) ;
473+ return { success : false , reason : "SKIP_RETRYING" } ;
477474 }
478475
479476 const containerId = this . #logger. debug (
@@ -484,7 +481,8 @@ class Checkpointer {
484481 ) ;
485482
486483 if ( ! containerId . stdout ) {
487- throw new Error ( "could not find container id" ) ;
484+ this . #logger. error ( "could not find container id" , { options, containterName } ) ;
485+ return { success : false , reason : "SKIP_RETRYING" } ;
488486 }
489487
490488 const start = performance . now ( ) ;
@@ -617,7 +615,7 @@ class TaskCoordinator {
617615 private host = "0.0.0.0"
618616 ) {
619617 this . #httpServer = this . #createHttpServer( ) ;
620- this . #checkpointer. initialize ( ) ;
618+ this . #checkpointer. init ( ) ;
621619 this . #delayThresholdInMs = this . #getDelayThreshold( ) ;
622620
623621 if ( process . env . DELAY_THRESHOLD_IN_MS ) {
@@ -1034,7 +1032,7 @@ class TaskCoordinator {
10341032 return ;
10351033 }
10361034
1037- const { canCheckpoint, willSimulate } = await this . #checkpointer. initialize ( ) ;
1035+ const { canCheckpoint, willSimulate } = await this . #checkpointer. init ( ) ;
10381036
10391037 const willCheckpointAndRestore = canCheckpoint || willSimulate ;
10401038
@@ -1131,7 +1129,7 @@ class TaskCoordinator {
11311129 return ;
11321130 }
11331131
1134- const { canCheckpoint, willSimulate } = await this . #checkpointer. initialize ( ) ;
1132+ const { canCheckpoint, willSimulate } = await this . #checkpointer. init ( ) ;
11351133
11361134 const willCheckpointAndRestore = canCheckpoint || willSimulate ;
11371135
@@ -1185,7 +1183,7 @@ class TaskCoordinator {
11851183 socket . on ( "WAIT_FOR_TASK" , async ( message , callback ) => {
11861184 logger . log ( "[WAIT_FOR_TASK]" , message ) ;
11871185
1188- const { canCheckpoint, willSimulate } = await this . #checkpointer. initialize ( ) ;
1186+ const { canCheckpoint, willSimulate } = await this . #checkpointer. init ( ) ;
11891187
11901188 const willCheckpointAndRestore = canCheckpoint || willSimulate ;
11911189
@@ -1227,7 +1225,7 @@ class TaskCoordinator {
12271225 socket . on ( "WAIT_FOR_BATCH" , async ( message , callback ) => {
12281226 logger . log ( "[WAIT_FOR_BATCH]" , message ) ;
12291227
1230- const { canCheckpoint, willSimulate } = await this . #checkpointer. initialize ( ) ;
1228+ const { canCheckpoint, willSimulate } = await this . #checkpointer. init ( ) ;
12311229
12321230 const willCheckpointAndRestore = canCheckpoint || willSimulate ;
12331231
0 commit comments