@@ -49,7 +49,11 @@ const PLATFORM_SECRET = process.env.PLATFORM_SECRET || "coordinator-secret";
4949const SECURE_CONNECTION = [ "1" , "true" ] . includes ( process . env . SECURE_CONNECTION ?? "false" ) ;
5050
5151const logger = new SimpleLogger ( `[${ NODE_NAME } ]` ) ;
52- const chaosMonkey = new ChaosMonkey ( ! ! process . env . CHAOS_MONKEY_ENABLED ) ;
52+ const chaosMonkey = new ChaosMonkey (
53+ ! ! process . env . CHAOS_MONKEY_ENABLED ,
54+ ! ! process . env . CHAOS_MONKEY_DISABLE_ERRORS ,
55+ ! ! process . env . CHAOS_MONKEY_DISABLE_DELAYS
56+ ) ;
5357
5458class TaskCoordinator {
5559 #httpServer: ReturnType < typeof createServer > ;
@@ -290,6 +294,7 @@ class TaskCoordinator {
290294 setSocketDataFromHeader ( "projectRef" , "x-trigger-project-ref" ) ;
291295 setSocketDataFromHeader ( "runId" , "x-trigger-run-id" ) ;
292296 setSocketDataFromHeader ( "attemptFriendlyId" , "x-trigger-attempt-friendly-id" , false ) ;
297+ setSocketDataFromHeader ( "attemptNumber" , "x-trigger-attempt-number" , false ) ;
293298 setSocketDataFromHeader ( "envId" , "x-trigger-env-id" ) ;
294299 setSocketDataFromHeader ( "deploymentId" , "x-trigger-deployment-id" ) ;
295300 setSocketDataFromHeader ( "deploymentVersion" , "x-trigger-deployment-version" ) ;
@@ -306,6 +311,10 @@ class TaskCoordinator {
306311 onConnection : async ( socket , handler , sender ) => {
307312 const logger = new SimpleLogger ( `[prod-worker][${ socket . id } ]` ) ;
308313
314+ const getAttemptNumber = ( ) => {
315+ return socket . data . attemptNumber ? parseInt ( socket . data . attemptNumber ) : undefined ;
316+ } ;
317+
309318 const crashRun = async ( error : { name : string ; message : string ; stack ?: string } ) => {
310319 try {
311320 this . #platformSocket?. send ( "RUN_CRASHED" , {
@@ -381,6 +390,10 @@ class TaskCoordinator {
381390 socket . data . attemptFriendlyId = attemptFriendlyId ;
382391 } ;
383392
393+ const updateAttemptNumber = ( attemptNumber : string | number ) => {
394+ socket . data . attemptNumber = String ( attemptNumber ) ;
395+ } ;
396+
384397 this . #platformSocket?. send ( "LOG" , {
385398 metadata : socket . data ,
386399 text : "connected" ,
@@ -430,6 +443,7 @@ class TaskCoordinator {
430443 } ) ;
431444
432445 updateAttemptFriendlyId ( executionAck . payload . execution . attempt . id ) ;
446+ updateAttemptNumber ( executionAck . payload . execution . attempt . number ) ;
433447 } catch ( error ) {
434448 logger . error ( "Error" , { error } ) ;
435449
@@ -505,11 +519,17 @@ class TaskCoordinator {
505519
506520 updateAttemptFriendlyId ( message . attemptFriendlyId ) ;
507521
508- this . #platformSocket?. send ( "READY_FOR_RESUME" , message ) ;
522+ if ( message . version === "v2" ) {
523+ updateAttemptNumber ( message . attemptNumber ) ;
524+ }
525+
526+ this . #platformSocket?. send ( "READY_FOR_RESUME" , { ...message , version : "v1" } ) ;
509527 } ) ;
510528
511529 // MARK: RUN COMPLETED
512- socket . on ( "TASK_RUN_COMPLETED" , async ( { completion, execution } , callback ) => {
530+ socket . on ( "TASK_RUN_COMPLETED" , async ( message , callback ) => {
531+ const { completion, execution } = message ;
532+
513533 logger . log ( "completed task" , { completionId : completion . id } ) ;
514534
515535 // Cancel all in-progress checkpoints (if any)
@@ -518,8 +538,10 @@ class TaskCoordinator {
518538 await chaosMonkey . call ( { throwErrors : false } ) ;
519539
520540 const completeWithoutCheckpoint = ( shouldExit : boolean ) => {
541+ const supportsRetryCheckpoints = message . version === "v1" ;
542+
521543 this . #platformSocket?. send ( "TASK_RUN_COMPLETED" , {
522- version : "v1" ,
544+ version : supportsRetryCheckpoints ? "v1" : "v2 ",
523545 execution,
524546 completion,
525547 } ) ;
@@ -549,6 +571,11 @@ class TaskCoordinator {
549571 return ;
550572 }
551573
574+ if ( message . version === "v2" ) {
575+ completeWithoutCheckpoint ( true ) ;
576+ return ;
577+ }
578+
552579 const { canCheckpoint, willSimulate } = await this . #checkpointer. init ( ) ;
553580
554581 const willCheckpointAndRestore = canCheckpoint || willSimulate ;
@@ -681,6 +708,7 @@ class TaskCoordinator {
681708 runId : socket . data . runId ,
682709 projectRef : socket . data . projectRef ,
683710 deploymentVersion : socket . data . deploymentVersion ,
711+ attemptNumber : getAttemptNumber ( ) ,
684712 } ) ;
685713
686714 if ( ! checkpoint ) {
@@ -752,6 +780,7 @@ class TaskCoordinator {
752780 runId : socket . data . runId ,
753781 projectRef : socket . data . projectRef ,
754782 deploymentVersion : socket . data . deploymentVersion ,
783+ attemptNumber : getAttemptNumber ( ) ,
755784 } ) ;
756785
757786 if ( ! checkpoint ) {
@@ -821,6 +850,7 @@ class TaskCoordinator {
821850 runId : socket . data . runId ,
822851 projectRef : socket . data . projectRef ,
823852 deploymentVersion : socket . data . deploymentVersion ,
853+ attemptNumber : getAttemptNumber ( ) ,
824854 } ) ;
825855
826856 if ( ! checkpoint ) {
@@ -905,6 +935,7 @@ class TaskCoordinator {
905935 }
906936
907937 updateAttemptFriendlyId ( createAttempt . executionPayload . execution . attempt . id ) ;
938+ updateAttemptNumber ( createAttempt . executionPayload . execution . attempt . number ) ;
908939
909940 callback ( {
910941 success : true ,
@@ -924,6 +955,10 @@ class TaskCoordinator {
924955 if ( message . attemptFriendlyId ) {
925956 updateAttemptFriendlyId ( message . attemptFriendlyId ) ;
926957 }
958+
959+ if ( message . attemptNumber ) {
960+ updateAttemptNumber ( message . attemptNumber ) ;
961+ }
927962 } ) ;
928963 } ,
929964 onDisconnect : async ( socket , handler , sender , logger ) => {
0 commit comments