Skip to content

Commit 875b1b7

Browse files
aiwantaozi authored and Alena Prokharchyk committed
fix alert notify template lacking info
Problem: 1. workload replica percentage isn't shown in alert info 2. cluster/project id is not easy to identify in alert info Solution: 1. update alert notify template 2. use cluster/project display name instead of id Issue: rancher#18156 rancher#18160
1 parent f434208 commit 875b1b7

8 files changed

Lines changed: 80 additions & 89 deletions

File tree

pkg/controllers/user/alert/common/common.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ package common
22

33
import (
44
"fmt"
5+
6+
"github.com/rancher/rancher/pkg/ref"
7+
"github.com/rancher/types/apis/management.cattle.io/v3"
8+
"github.com/sirupsen/logrus"
59
)
610

711
func GetRuleID(groupID string, ruleName string) string {
@@ -19,3 +23,32 @@ func GetAlertManagerSecretName(appName string) string {
1923
func GetAlertManagerDaemonsetName(appName string) string {
2024
return fmt.Sprintf("alertmanager-%s", appName)
2125
}
26+
27+
func formatProjectDisplayName(projectDisplayName, projectID string) string {
28+
return fmt.Sprintf("%s (ID: %s)", projectDisplayName, projectID)
29+
}
30+
31+
func formatClusterDisplayName(clusterDisplayName, clusterID string) string {
32+
return fmt.Sprintf("%s (ID: %s)", clusterDisplayName, clusterID)
33+
}
34+
35+
func GetClusterDisplayName(clusterName string, clusterLister v3.ClusterLister) string {
36+
cluster, err := clusterLister.Get("", clusterName)
37+
if err != nil {
38+
logrus.Warnf("Failed to get cluster for %s: %v", clusterName, err)
39+
return clusterName
40+
}
41+
42+
return formatClusterDisplayName(cluster.Spec.DisplayName, clusterName)
43+
}
44+
45+
func GetProjectDisplayName(projectID string, projectLister v3.ProjectLister) string {
46+
clusterName, projectName := ref.Parse(projectID)
47+
project, err := projectLister.Get(clusterName, projectName)
48+
if err != nil {
49+
logrus.Warnf("Failed to get project %s: %v", projectID, err)
50+
return projectID
51+
}
52+
53+
return formatProjectDisplayName(project.Spec.DisplayName, projectID)
54+
}

pkg/controllers/user/alert/configsyncer/configsyncer.go

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ func NewConfigSyncer(ctx context.Context, cluster *config.UserContext, alertMana
4141
clusterAlertRuleLister: cluster.Management.Management.ClusterAlertRules(cluster.ClusterName).Controller().Lister(),
4242
projectAlertRuleLister: cluster.Management.Management.ProjectAlertRules("").Controller().Lister(),
4343
notifierLister: cluster.Management.Management.Notifiers(cluster.ClusterName).Controller().Lister(),
44+
clusterLister: cluster.Management.Management.Clusters(metav1.NamespaceAll).Controller().Lister(),
45+
projectLister: cluster.Management.Management.Projects(cluster.ClusterName).Controller().Lister(),
4446
clusterName: cluster.ClusterName,
4547
alertManager: alertManager,
4648
operatorCRDManager: operatorCRDManager,
@@ -54,6 +56,8 @@ type ConfigSyncer struct {
5456
projectAlertRuleLister v3.ProjectAlertRuleLister
5557
clusterAlertRuleLister v3.ClusterAlertRuleLister
5658
notifierLister v3.NotifierLister
59+
clusterLister v3.ClusterLister
60+
projectLister v3.ProjectLister
5761
clusterName string
5862
alertManager *manager.AlertManager
5963
operatorCRDManager *manager.PromOperatorCRDManager
@@ -86,6 +90,8 @@ func (d *ConfigSyncer) sync() error {
8690
return nil
8791
}
8892

93+
clusterDisplayName := common.GetClusterDisplayName(d.clusterName, d.clusterLister)
94+
8995
if _, err := d.alertManager.GetAlertManagerEndpoint(); err != nil {
9096
return err
9197
}
@@ -161,11 +167,11 @@ func (d *ConfigSyncer) sync() error {
161167
}
162168
sort.Strings(pAlertsKey)
163169

164-
if err := d.addClusterAlert2Operator(cAlertsMap, cAlertsKey); err != nil {
170+
if err := d.addClusterAlert2Operator(clusterDisplayName, cAlertsMap, cAlertsKey); err != nil {
165171
return err
166172
}
167173

168-
if err := d.addProjectAlert2Operator(pAlertsMap, pAlertsKey); err != nil {
174+
if err := d.addProjectAlert2Operator(clusterDisplayName, pAlertsMap, pAlertsKey); err != nil {
169175
return err
170176
}
171177

@@ -221,13 +227,15 @@ func (d *ConfigSyncer) getNotifier(id string, notifiers []*v3.Notifier) *v3.Noti
221227
return nil
222228
}
223229

224-
func (d *ConfigSyncer) addProjectAlert2Operator(projectGroups map[string]map[string][]*v3.ProjectAlertRule, keys []string) error {
225-
for _, projectID := range keys {
226-
groupRules := projectGroups[projectID]
227-
_, projectName := ref.Parse(projectID)
230+
func (d *ConfigSyncer) addProjectAlert2Operator(clusterDisplayName string, projectGroups map[string]map[string][]*v3.ProjectAlertRule, keys []string) error {
231+
for _, projectName := range keys {
232+
groupRules := projectGroups[projectName]
228233
_, namespace := monitorutil.ProjectMonitoringInfo(projectName)
229234
promRule := d.operatorCRDManager.GetDefaultPrometheusRule(namespace, projectName)
230235

236+
projectID := fmt.Sprintf("%s:%s", d.clusterName, projectName)
237+
projectDisplayName := common.GetProjectDisplayName(projectID, d.projectLister)
238+
231239
var groupIDs []string
232240
for k := range groupRules {
233241
groupIDs = append(groupIDs, k)
@@ -242,7 +250,7 @@ func (d *ConfigSyncer) addProjectAlert2Operator(projectGroups map[string]map[str
242250
for _, alertRule := range alertRules {
243251
if alertRule.Spec.MetricRule != nil {
244252
ruleID := common.GetRuleID(alertRule.Spec.GroupName, alertRule.Name)
245-
promRule := manager.Metric2Rule(groupID, ruleID, alertRule.Spec.Severity, alertRule.Spec.DisplayName, d.clusterName, projectName, alertRule.Spec.MetricRule)
253+
promRule := manager.Metric2Rule(groupID, ruleID, alertRule.Spec.Severity, alertRule.Spec.DisplayName, clusterDisplayName, projectDisplayName, alertRule.Spec.MetricRule)
246254
d.operatorCRDManager.AddRule(ruleGroup, promRule)
247255
}
248256
}
@@ -261,7 +269,7 @@ func (d *ConfigSyncer) addProjectAlert2Operator(projectGroups map[string]map[str
261269
return nil
262270
}
263271

264-
func (d *ConfigSyncer) addClusterAlert2Operator(groupRules map[string][]*v3.ClusterAlertRule, keys []string) error {
272+
func (d *ConfigSyncer) addClusterAlert2Operator(clusterDisplayName string, groupRules map[string][]*v3.ClusterAlertRule, keys []string) error {
265273
var enabled bool
266274
_, namespace := monitorutil.ClusterMonitoringInfo()
267275
promRule := d.operatorCRDManager.GetDefaultPrometheusRule(namespace, d.clusterName)
@@ -273,7 +281,7 @@ func (d *ConfigSyncer) addClusterAlert2Operator(groupRules map[string][]*v3.Clus
273281
for _, alertRule := range alertRules {
274282
if alertRule.Spec.MetricRule != nil {
275283
ruleID := common.GetRuleID(alertRule.Spec.GroupName, alertRule.Name)
276-
promRule := manager.Metric2Rule(groupID, ruleID, alertRule.Spec.Severity, alertRule.Spec.DisplayName, d.clusterName, "", alertRule.Spec.MetricRule)
284+
promRule := manager.Metric2Rule(groupID, ruleID, alertRule.Spec.Severity, alertRule.Spec.DisplayName, clusterDisplayName, "", alertRule.Spec.MetricRule)
277285
d.operatorCRDManager.AddRule(ruleGroup, promRule)
278286
}
279287
}
@@ -535,7 +543,7 @@ func getClusterAlertGroupBy(spec v3.ClusterAlertRuleSpec) []model.LabelName {
535543

536544
func getProjectAlertGroupBy(spec v3.ProjectAlertRuleSpec) []model.LabelName {
537545
if spec.PodRule != nil {
538-
return []model.LabelName{"rule_id", "namespace", "pod_name"}
546+
return []model.LabelName{"rule_id", "namespace", "pod_name", "alert_type"}
539547
} else if spec.WorkloadRule != nil {
540548
return []model.LabelName{"rule_id", "workload_namespace", "workload_name", "workload_kind"}
541549
} else if spec.MetricRule != nil {

pkg/controllers/user/alert/deployer/notification_template.go

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ The Pod {{ if .GroupLabels.namespace}}{{.GroupLabels.namespace}}:{{end}}{{.Group
2525
The Pod {{ if .GroupLabels.namespace}}{{.GroupLabels.namespace}}:{{end}}{{.GroupLabels.pod_name}} is not running
2626
2727
{{- else if eq .CommonLabels.alert_type "podRestarts" }}
28-
The Pod {{ if .GroupLabels.namespace}}{{.GroupLabels.namespace}}:{{end}}{{.GroupLabels.pod_name}} restarts {{ .GroupLabels.restart_times}} times in {{ .GroupLabels.restart_interval}} sec
28+
The Pod {{ if .GroupLabels.namespace}}{{.GroupLabels.namespace}}:{{end}}{{.GroupLabels.pod_name}} restarts {{ .CommonLabels.restart_times}} times in {{ .CommonLabels.restart_interval}} sec
2929
3030
{{- else if eq .CommonLabels.alert_type "workload" }}
31-
The workload {{ if .GroupLabels.workload_namespace}}{{.GroupLabels.workload_namespace}}:{{end}}{{.GroupLabels.workload_name}} has available replicas less than {{ .GroupLabels.available_percentage}}%
31+
The workload {{ if .GroupLabels.workload_namespace}}{{.GroupLabels.workload_namespace}}:{{end}}{{.GroupLabels.workload_name}} has available replicas less than {{ .CommonLabels.available_percentage}}%
3232
3333
{{- else if eq .CommonLabels.alert_type "metric" }}
3434
The metric {{ .CommonLabels.alert_name}} crossed the threshold
@@ -99,15 +99,12 @@ Project Name: {{ .Labels.project_name}}
9999
Available Replicas: {{ .Labels.available_replicas}}
100100
Desired Replicas: {{ .Labels.desired_replicas}}
101101
{{- else if eq .Labels.alert_type "metric" }}
102-
{{ if .Labels.project_name }}
103-
Project Name: {{ .Labels.project_name}}
104-
{{ end -}}
105-
{{ if .Labels.pod_name }}
106-
Pod Name: {{ .Labels.pod_name}}{{ else if .Labels.pod -}}Pod Name: {{ .Labels.pod}}
107-
{{ end -}}
108-
{{ if .Labels.namespace }}
109-
Namespace: {{ .Labels.namespace}}
110-
{{ end -}}
102+
{{- if .Labels.namespace }}
103+
Namespace: {{ .Labels.namespace}}{{ end }}
104+
{{- if .Labels.project_name }}
105+
Project Name: {{ .Labels.project_name}}{{ end }}
106+
{{- if .Labels.pod_name }}
107+
Pod Name: {{ .Labels.pod_name}}{{ else if .Labels.pod -}}Pod Name: {{ .Labels.pod}}{{ end }}
111108
Expression: {{ .Labels.expression}}
112109
Description: Threshold Crossed: datapoint value {{ .Annotations.current_value}} was {{ .Labels.comparison}} to the threshold ({{ .Labels.threshold_value}}) for ({{ .Labels.duration}})
113110
{{ end -}}

pkg/controllers/user/alert/watcher/event.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,7 @@ func (l *EventWatcher) Sync(key string, obj *corev1.Event) (runtime.Object, erro
7070
if alert.Spec.EventRule.EventType == obj.Type && alert.Spec.EventRule.ResourceKind == obj.InvolvedObject.Kind {
7171
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
7272

73-
clusterDisplayName := l.clusterName
74-
cluster, err := l.clusterLister.Get("", l.clusterName)
75-
if err != nil {
76-
logrus.Warnf("Failed to get cluster for %s: %v", l.clusterName, err)
77-
} else {
78-
clusterDisplayName = cluster.Spec.DisplayName
79-
}
73+
clusterDisplayName := common.GetClusterDisplayName(l.clusterName, l.clusterLister)
8074

8175
data := map[string]string{}
8276
data["rule_id"] = ruleID

pkg/controllers/user/alert/watcher/node.go

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,7 @@ func (w *NodeWatcher) checkNodeMemUsage(alert *v3.ClusterAlertRule, machine *v3.
143143
if used.Value()*100.0/total.Value() > int64(alert.Spec.NodeRule.MemThreshold) {
144144
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
145145

146-
clusterDisplayName := w.clusterName
147-
cluster, err := w.clusterLister.Get("", w.clusterName)
148-
if err != nil {
149-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
150-
} else {
151-
clusterDisplayName = cluster.Spec.DisplayName
152-
}
146+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
153147

154148
data := map[string]string{}
155149
data["rule_id"] = ruleID
@@ -177,13 +171,7 @@ func (w *NodeWatcher) checkNodeCPUUsage(alert *v3.ClusterAlertRule, machine *v3.
177171
if used.MilliValue()*100.0/total.MilliValue() > int64(alert.Spec.NodeRule.CPUThreshold) {
178172
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
179173

180-
clusterDisplayName := w.clusterName
181-
cluster, err := w.clusterLister.Get("", w.clusterName)
182-
if err != nil {
183-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
184-
} else {
185-
clusterDisplayName = cluster.Spec.DisplayName
186-
}
174+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
187175

188176
data := map[string]string{}
189177
data["rule_id"] = ruleID
@@ -210,13 +198,7 @@ func (w *NodeWatcher) checkNodeReady(alert *v3.ClusterAlertRule, machine *v3.Nod
210198
if cond.Status != corev1.ConditionTrue {
211199
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
212200

213-
clusterDisplayName := w.clusterName
214-
cluster, err := w.clusterLister.Get("", w.clusterName)
215-
if err != nil {
216-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
217-
} else {
218-
clusterDisplayName = cluster.Spec.DisplayName
219-
}
201+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
220202

221203
data := map[string]string{}
222204
data["rule_id"] = ruleID

pkg/controllers/user/alert/watcher/pod.go

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type PodWatcher struct {
3232
clusterName string
3333
podRestartTrack sync.Map
3434
clusterLister v3.ClusterLister
35+
projectLister v3.ProjectLister
3536
workloadFetcher workloadFetcher
3637
}
3738

@@ -54,6 +55,7 @@ func StartPodWatcher(ctx context.Context, cluster *config.UserContext, manager *
5455
clusterName: cluster.ClusterName,
5556
podRestartTrack: sync.Map{},
5657
clusterLister: cluster.Management.Management.Clusters("").Controller().Lister(),
58+
projectLister: cluster.Management.Management.Projects(cluster.ClusterName).Controller().Lister(),
5759
workloadFetcher: workloadFetcher,
5860
}
5961

@@ -160,20 +162,14 @@ func (w *PodWatcher) checkPodRestarts(pod *corev1.Pod, alert *v3.ProjectAlertRul
160162

161163
if curCount-preCount >= int32(alert.Spec.PodRule.RestartTimes) {
162164
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
163-
projectName := alert.Namespace
164165

165166
details := ""
166167
if containerStatus.State.Waiting != nil {
167168
details = containerStatus.State.Waiting.Message
168169
}
169170

170-
clusterDisplayName := w.clusterName
171-
cluster, err := w.clusterLister.Get("", w.clusterName)
172-
if err != nil {
173-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
174-
} else {
175-
clusterDisplayName = cluster.Spec.DisplayName
176-
}
171+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
172+
projectDisplayName := common.GetProjectDisplayName(alert.Spec.ProjectName, w.projectLister)
177173

178174
data := map[string]string{}
179175
data["rule_id"] = ruleID
@@ -182,7 +178,7 @@ func (w *PodWatcher) checkPodRestarts(pod *corev1.Pod, alert *v3.ProjectAlertRul
182178
data["alert_type"] = "podRestarts"
183179
data["severity"] = alert.Spec.Severity
184180
data["cluster_name"] = clusterDisplayName
185-
data["project_name"] = projectName
181+
data["project_name"] = projectDisplayName
186182
data["namespace"] = pod.Namespace
187183
data["pod_name"] = pod.Name
188184
data["container_name"] = containerStatus.Name
@@ -262,13 +258,8 @@ func (w *PodWatcher) checkPodRunning(pod *corev1.Pod, alert *v3.ProjectAlertRule
262258
details = containerStatus.State.Terminated.Message
263259
}
264260

265-
clusterDisplayName := w.clusterName
266-
cluster, err := w.clusterLister.Get("", w.clusterName)
267-
if err != nil {
268-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
269-
} else {
270-
clusterDisplayName = cluster.Spec.DisplayName
271-
}
261+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
262+
projectDisplayName := common.GetProjectDisplayName(alert.Spec.ProjectName, w.projectLister)
272263

273264
data := map[string]string{}
274265
data["rule_id"] = ruleID
@@ -278,6 +269,7 @@ func (w *PodWatcher) checkPodRunning(pod *corev1.Pod, alert *v3.ProjectAlertRule
278269
data["severity"] = alert.Spec.Severity
279270
data["cluster_name"] = clusterDisplayName
280271
data["namespace"] = pod.Namespace
272+
data["project_name"] = projectDisplayName
281273
data["pod_name"] = pod.Name
282274
data["container_name"] = containerStatus.Name
283275

@@ -306,16 +298,10 @@ func (w *PodWatcher) checkPodScheduled(pod *corev1.Pod, alert *v3.ProjectAlertRu
306298
for _, condition := range pod.Status.Conditions {
307299
if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
308300
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
309-
310301
details := condition.Message
311302

312-
clusterDisplayName := w.clusterName
313-
cluster, err := w.clusterLister.Get("", w.clusterName)
314-
if err != nil {
315-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
316-
} else {
317-
clusterDisplayName = cluster.Spec.DisplayName
318-
}
303+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
304+
projectDisplayName := common.GetProjectDisplayName(alert.Spec.ProjectName, w.projectLister)
319305

320306
data := map[string]string{}
321307
data["rule_id"] = ruleID
@@ -325,6 +311,7 @@ func (w *PodWatcher) checkPodScheduled(pod *corev1.Pod, alert *v3.ProjectAlertRu
325311
data["severity"] = alert.Spec.Severity
326312
data["cluster_name"] = clusterDisplayName
327313
data["namespace"] = pod.Namespace
314+
data["project_name"] = projectDisplayName
328315
data["pod_name"] = pod.Name
329316

330317
if details != "" {

pkg/controllers/user/alert/watcher/syscomponent.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,7 @@ func (w *SysComponentWatcher) checkComponentHealthy(statuses *v1.ComponentStatus
8080
if cond.Status == corev1.ConditionFalse {
8181
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
8282

83-
clusterDisplayName := w.clusterName
84-
cluster, err := w.clusterLister.Get("", w.clusterName)
85-
if err != nil {
86-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
87-
} else {
88-
clusterDisplayName = cluster.Spec.DisplayName
89-
}
83+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
9084

9185
data := map[string]string{}
9286
data["rule_id"] = ruleID

pkg/controllers/user/alert/watcher/workload.go

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type WorkloadWatcher struct {
3434
projectAlertRuleLister v3.ProjectAlertRuleLister
3535
clusterName string
3636
clusterLister v3.ClusterLister
37+
projectLister v3.ProjectLister
3738
namespaceIndexer cache.Indexer
3839
}
3940

@@ -51,6 +52,7 @@ func StartWorkloadWatcher(ctx context.Context, cluster *config.UserContext, mana
5152
alertManager: manager,
5253
clusterName: cluster.ClusterName,
5354
clusterLister: cluster.Management.Management.Clusters("").Controller().Lister(),
55+
projectLister: cluster.Management.Management.Projects(cluster.ClusterName).Controller().Lister(),
5456
namespaceIndexer: nsInformer.GetIndexer(),
5557
}
5658

@@ -143,15 +145,9 @@ func (w *WorkloadWatcher) checkWorkloadCondition(wl *workload.Workload, alert *v
143145

144146
if wl.Status.AvailableReplicas < availableThreshold {
145147
ruleID := common.GetRuleID(alert.Spec.GroupName, alert.Name)
146-
projectName := alert.Namespace
147148

148-
clusterDisplayName := w.clusterName
149-
cluster, err := w.clusterLister.Get("", w.clusterName)
150-
if err != nil {
151-
logrus.Warnf("Failed to get cluster for %s: %v", w.clusterName, err)
152-
} else {
153-
clusterDisplayName = cluster.Spec.DisplayName
154-
}
149+
clusterDisplayName := common.GetClusterDisplayName(w.clusterName, w.clusterLister)
150+
projectDisplayName := common.GetProjectDisplayName(alert.Spec.ProjectName, w.projectLister)
155151

156152
data := map[string]string{}
157153
data["rule_id"] = ruleID
@@ -160,7 +156,7 @@ func (w *WorkloadWatcher) checkWorkloadCondition(wl *workload.Workload, alert *v
160156
data["alert_name"] = alert.Spec.DisplayName
161157
data["severity"] = alert.Spec.Severity
162158
data["cluster_name"] = clusterDisplayName
163-
data["project_name"] = projectName
159+
data["project_name"] = projectDisplayName
164160
data["workload_name"] = wl.Name
165161
data["workload_namespace"] = wl.Namespace
166162
data["workload_kind"] = wl.Kind

0 commit comments

Comments
 (0)