From 938f2a8aa8f634dfedd7ab4c57a4fb3b681c8664 Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Mon, 8 Jul 2019 14:50:49 -0700 Subject: [PATCH 1/3] Support waiting for kube-apiserver to be ready with timeout during NPD startup --- cmd/node_problem_detector.go | 19 +++++++++++++++++++ cmd/options/options.go | 9 +++++++++ pkg/problemclient/fake_problem_client.go | 6 +++++- pkg/problemclient/problem_client.go | 11 +++++++++-- 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/cmd/node_problem_detector.go b/cmd/node_problem_detector.go index c64b1113a..e62108f97 100644 --- a/cmd/node_problem_detector.go +++ b/cmd/node_problem_detector.go @@ -26,6 +26,7 @@ import ( "github.com/golang/glog" "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/node-problem-detector/cmd/options" "k8s.io/node-problem-detector/pkg/custompluginmonitor" "k8s.io/node-problem-detector/pkg/problemclient" @@ -95,7 +96,25 @@ func main() { startHTTPServer(p, npdo) } + // This function may be blocked (until a timeout occurs) before + // kube-apiserver becomes ready. + glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout) + if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil { + glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err) + } + if err := p.Run(); err != nil { glog.Fatalf("Problem detector failed with error: %v", err) } } + +func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error { + return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) { + // If NPD can get the node object from kube-apiserver, the server is + // ready and the RBAC permission is set correctly. 
+ if _, err := c.GetNode(); err == nil { + return true, nil + } + return false, nil + }) +} diff --git a/cmd/options/options.go b/cmd/options/options.go index 4c10ec8cd..afd49bc7d 100644 --- a/cmd/options/options.go +++ b/cmd/options/options.go @@ -20,6 +20,7 @@ import ( "flag" "fmt" "os" + "time" "net/url" @@ -46,6 +47,12 @@ type NodeProblemDetectorOptions struct { ServerPort int // ServerAddress is the address to bind the node problem detector server. ServerAddress string + // APIServerWaitTimeout is the timeout on waiting for kube-apiserver to be + // ready. + APIServerWaitTimeout time.Duration + // APIServerWaitInterval is the interval between the checks on the + // readiness of kube-apiserver. + APIServerWaitInterval time.Duration // application options @@ -65,6 +72,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) { []string{}, "List of paths to custom plugin monitor config files, comma separated.") fs.StringVar(&npdo.ApiServerOverride, "apiserver-override", "", "Custom URI used to connect to Kubernetes ApiServer") + fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.") + fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. 
This is ignored if --enable-k8s-exporter is false.") fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit") fs.StringVar(&npdo.HostnameOverride, "hostname-override", "", "Custom node name used to override hostname") diff --git a/pkg/problemclient/fake_problem_client.go b/pkg/problemclient/fake_problem_client.go index 6f878493b..50e47350e 100644 --- a/pkg/problemclient/fake_problem_client.go +++ b/pkg/problemclient/fake_problem_client.go @@ -21,7 +21,7 @@ import ( "reflect" "sync" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" ) // FakeProblemClient is a fake problem client for debug. @@ -92,3 +92,7 @@ func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.N // Eventf does nothing now. func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) { } + +func (f *FakeProblemClient) GetNode() (*v1.Node, error) { + return nil, fmt.Errorf("GetNode() not implemented") +} diff --git a/pkg/problemclient/problem_client.go b/pkg/problemclient/problem_client.go index c2f841416..70a022ad6 100644 --- a/pkg/problemclient/problem_client.go +++ b/pkg/problemclient/problem_client.go @@ -26,7 +26,7 @@ import ( typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/kubernetes/pkg/api/legacyscheme" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/clock" @@ -47,6 +47,9 @@ type Client interface { SetConditions(conditions []v1.NodeCondition) error // Eventf reports the event. Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) + // GetNode returns the Node object of the node on which the + // node-problem-detector runs. 
+ GetNode() (*v1.Node, error) } type nodeProblemClient struct { @@ -79,7 +82,7 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client { } func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) { - node, err := c.client.Nodes().Get(c.nodeName, metav1.GetOptions{}) + node, err := c.GetNode() if err != nil { return nil, err } @@ -116,6 +119,10 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...) } +func (c *nodeProblemClient) GetNode() (*v1.Node, error) { + return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{}) +} + // generatePatch generates condition patch func generatePatch(conditions []v1.NodeCondition) ([]byte, error) { raw, err := json.Marshal(&conditions) From e9ed821d480392ac646f1fe035ccab68b1d51d9f Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Thu, 1 Aug 2019 17:10:53 -0700 Subject: [PATCH 2/3] Update the detection method for docker overlay2 issue --- config/docker-monitor.json | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/config/docker-monitor.json b/config/docker-monitor.json index d6294d4c4..6b6f3794e 100644 --- a/config/docker-monitor.json +++ b/config/docker-monitor.json @@ -7,12 +7,24 @@ "lookback": "5m", "bufferSize": 10, "source": "docker-monitor", - "conditions": [], + "conditions": [ + { + "type": "CorruptDockerOverlay2", + "reason": "NoCorruptDockerOverlay2", + "message": "docker overlay2 is functioning properly" + } + ], "rules": [ { "type": "temporary", "reason": "CorruptDockerImage", "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*" + }, + { + "type": "permanent", + "condition": "CorruptDockerOverlay2", + "reason": "CorruptDockerOverlay2", + "pattern": "returned error: readlink /var/lib/docker/overlay2.*: invalid argument.*" } ] } From 
318dcfef3599bea42981afa48ae7fefa8252c94e Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 13 Aug 2019 11:02:51 -0700 Subject: [PATCH 3/3] Don't update condition if status stays False/Unknown for custom plugin --- .../custom_plugin_monitor.go | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/pkg/custompluginmonitor/custom_plugin_monitor.go b/pkg/custompluginmonitor/custom_plugin_monitor.go index 9edd787c1..629b2f94e 100644 --- a/pkg/custompluginmonitor/custom_plugin_monitor.go +++ b/pkg/custompluginmonitor/custom_plugin_monitor.go @@ -121,74 +121,74 @@ func (c *customPluginMonitor) generateStatus(result cpmtypes.Result) *types.Stat }) } } else { - // For permanent error changes the condition + // For permanent error that changes the condition for i := range c.conditions { condition := &c.conditions[i] if condition.Type == result.Rule.Condition { + // The condition reason specified in the rule and the result message + // represent the problem happened. We need to know the default condition + // from the config, so that we can set the new condition reason/message + // back when such problem goes away. 
+ var defaultConditionReason string + var defaultConditionMessage string + for j := range c.config.DefaultConditions { + defaultCondition := &c.config.DefaultConditions[j] + if defaultCondition.Type == result.Rule.Condition { + defaultConditionReason = defaultCondition.Reason + defaultConditionMessage = defaultCondition.Message + break + } + } + + needToUpdateCondition := true + var newReason string + var newMessage string status := toConditionStatus(result.ExitStatus) - // change 1: Condition status change from True to False/Unknown if condition.Status == types.True && status != types.True { - condition.Transition = timestamp - var defaultConditionReason string - var defaultConditionMessage string - for j := range c.config.DefaultConditions { - defaultCondition := &c.config.DefaultConditions[j] - if defaultCondition.Type == result.Rule.Condition { - defaultConditionReason = defaultCondition.Reason - defaultConditionMessage = defaultCondition.Message - break - } + // Scenario 1: Condition status changes from True to False/Unknown + newReason = defaultConditionReason + if newMessage == "" { + newMessage = defaultConditionMessage + } else { + newMessage = result.Message } - - events = append(events, util.GenerateConditionChangeEvent( - condition.Type, - status, - defaultConditionReason, - timestamp, - )) - - condition.Status = status - condition.Message = defaultConditionMessage - condition.Reason = defaultConditionReason } else if condition.Status != types.True && status == types.True { - // change 2: Condition status change from False/Unknown to True - condition.Transition = timestamp - condition.Message = result.Message - events = append(events, util.GenerateConditionChangeEvent( - condition.Type, - status, - result.Rule.Reason, - timestamp, - )) - - condition.Status = status - condition.Reason = result.Rule.Reason + // Scenario 2: Condition status changes from False/Unknown to True + newReason = result.Rule.Reason + newMessage = result.Message } else if 
condition.Status != status { - // change 3: Condition status change from False to Unknown or vice versa - condition.Transition = timestamp - condition.Message = result.Message - events = append(events, util.GenerateConditionChangeEvent( - condition.Type, - status, - result.Rule.Reason, - timestamp, - )) - - condition.Status = status - condition.Reason = result.Rule.Reason - } else if condition.Status == status && + // Scenario 3: Condition status changes from False to Unknown or vice versa + newReason = defaultConditionReason + if newMessage == "" { + newMessage = defaultConditionMessage + } else { + newMessage = result.Message + } + } else if condition.Status == types.True && status == types.True && (condition.Reason != result.Rule.Reason || (*c.config.PluginGlobalConfig.EnableMessageChangeBasedConditionUpdate && condition.Message != result.Message)) { - // change 4: Condition status do not change. + // Scenario 4: Condition status does not change and it stays true. // condition reason changes or // condition message changes when message based condition update is enabled. + newReason = result.Rule.Reason + newMessage = result.Message + } else { + // Scenario 5: Condition status does not change and it stays False/Unknown. + // This should just be the default reason or message (as a consequence + // of scenario 1 and scenario 3 above). + needToUpdateCondition = false + } + + if needToUpdateCondition { condition.Transition = timestamp - condition.Reason = result.Rule.Reason - condition.Message = result.Message + condition.Status = status + condition.Reason = newReason + condition.Message = newMessage + events = append(events, util.GenerateConditionChangeEvent( condition.Type, status, - condition.Reason, + newReason, timestamp, )) }