Skip to content

Commit 3b6a2dc

Browse files
authored
Add sampled logging when workflow query task times out (temporalio#5521)
## What changed? Adding a sampled logging when a workflow query task times out. ## Why? Need this to be able to debug issues in the future when workflow queries timeout. ## How did you test it? <!-- How have you verified this change? Tested locally? Added a unit test? Checked in staging env? --> ## Potential risks <!-- Assuming the worst case, what can be broken when deploying this change to production? --> ## Documentation <!-- Have you made sure this change doesn't falsify anything currently stated in `docs/`? If significant new behavior is added, have you described that in `docs/`? --> ## Is hotfix candidate? <!-- Is this PR a hotfix candidate or does it require a notification to be sent to the broader community? (Yes/No) -->
1 parent 243ae88 commit 3b6a2dc

5 files changed

Lines changed: 28 additions & 0 deletions

File tree

common/dynamicconfig/constants.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,9 @@ const (
480480
// MatchingMembershipUnloadDelay is how long to wait to re-confirm loss of ownership before unloading a task queue.
481481
// Set to zero to disable proactive unload.
482482
MatchingMembershipUnloadDelay = "matching.membershipUnloadDelay"
483+
// MatchingQueryWorkflowTaskTimeoutLogRate defines the sampling rate for logs when a query workflow task times out. Since
484+
// these log lines can be noisy, we want to be able to turn on and sample selectively for each affected namespace.
485+
MatchingQueryWorkflowTaskTimeoutLogRate = "matching.queryWorkflowTaskTimeoutLogRate"
483486

484487
// for matching testing only:
485488

config/dynamicconfig/development-cass.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,5 @@ frontend.adminDeleteAccessHistoryFraction:
4646
- value: 1.0
4747
frontend.enableNexusHTTPHandler:
4848
- value: true
49+
matching.queryWorkflowTaskTimeoutLogRate:
50+
- value: 1.0

config/dynamicconfig/development-sql.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,5 @@ frontend.adminDeleteAccessHistoryFraction:
4949
- value: 1.0
5050
frontend.enableNexusHTTPHandler:
5151
- value: true
52+
matching.queryWorkflowTaskTimeoutLogRate:
53+
- value: 1.0

service/matching/config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ type (
7272
BacklogNegligibleAge dynamicconfig.DurationPropertyFnWithTaskQueueInfoFilters
7373
MaxWaitForPollerBeforeFwd dynamicconfig.DurationPropertyFnWithTaskQueueInfoFilters
7474
QueryPollerUnavailableWindow dynamicconfig.DurationPropertyFn
75+
QueryWorkflowTaskTimeoutLogRate dynamicconfig.FloatPropertyFnWithTaskQueueInfoFilters
7576
MembershipUnloadDelay dynamicconfig.DurationPropertyFn
7677

7778
// Time to hold a poll request before returning an empty response if there are no tasks
@@ -205,6 +206,7 @@ func NewConfig(
205206
BacklogNegligibleAge: dc.GetDurationPropertyFilteredByTaskQueueInfo(dynamicconfig.MatchingBacklogNegligibleAge, 24*365*10*time.Hour),
206207
MaxWaitForPollerBeforeFwd: dc.GetDurationPropertyFilteredByTaskQueueInfo(dynamicconfig.MatchingMaxWaitForPollerBeforeFwd, 200*time.Millisecond),
207208
QueryPollerUnavailableWindow: dc.GetDurationProperty(dynamicconfig.QueryPollerUnavailableWindow, 20*time.Second),
209+
QueryWorkflowTaskTimeoutLogRate: dc.GetFloatPropertyFilteredByTaskQueueInfo(dynamicconfig.MatchingQueryWorkflowTaskTimeoutLogRate, 0.0),
208210
MembershipUnloadDelay: dc.GetDurationProperty(dynamicconfig.MatchingMembershipUnloadDelay, 500*time.Millisecond),
209211

210212
AdminNamespaceToPartitionDispatchRate: dc.GetFloatPropertyFilteredByNamespace(dynamicconfig.AdminMatchingNamespaceToPartitionDispatchRate, 10000),

service/matching/matching_engine.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"errors"
3131
"fmt"
3232
"math"
33+
"math/rand"
3334
"sync"
3435
"sync/atomic"
3536
"time"
@@ -864,6 +865,24 @@ func (e *matchingEngineImpl) QueryWorkflow(
864865
return nil, serviceerror.NewInternal("unknown query completed type")
865866
}
866867
case <-ctx.Done():
868+
// task timed out. log (optionally) and return the timeout error
869+
ns, err := e.namespaceRegistry.GetNamespaceByID(namespaceID)
870+
if err != nil {
871+
e.logger.Error("Failed to get the namespace by ID",
872+
tag.WorkflowNamespaceID(string(namespaceID)),
873+
tag.Error(err))
874+
} else {
875+
sampleRate := e.config.QueryWorkflowTaskTimeoutLogRate(ns.Name().String(), taskQueueName, enumspb.TASK_QUEUE_TYPE_WORKFLOW)
876+
if rand.Float64() < sampleRate {
877+
e.logger.Info("Workflow Query Task timed out",
878+
tag.WorkflowNamespaceID(ns.ID().String()),
879+
tag.WorkflowNamespace(ns.Name().String()),
880+
tag.WorkflowID(queryRequest.GetQueryRequest().GetExecution().GetWorkflowId()),
881+
tag.WorkflowRunID(queryRequest.GetQueryRequest().GetExecution().GetRunId()),
882+
tag.WorkflowTaskRequestId(taskID),
883+
tag.WorkflowTaskQueueName(taskQueueName))
884+
}
885+
}
867886
return nil, ctx.Err()
868887
}
869888
}

0 commit comments

Comments
 (0)