Skip to content

Commit 7c15260

Browse files
feat: update health probe to make use of healthChecker
healthChecker was introduced in open-cluster-management-io/addon-framework#289 and it allow us to define a list of ProbeFields as before but it also allows us to define HealthChecker. HealthChecker has the ability to go through all the resultFields and decide which resources are mandatory and should trigger unheathly probes and which resources are optional
1 parent 53e1907 commit 7c15260

File tree

4 files changed

+85
-60
lines changed

4 files changed

+85
-60
lines changed

go.mod

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@ require (
1717
k8s.io/client-go v0.30.3
1818
k8s.io/component-base v0.30.3
1919
k8s.io/klog/v2 v2.130.1
20-
open-cluster-management.io/addon-framework v0.11.0
21-
open-cluster-management.io/api v0.15.0
20+
open-cluster-management.io/addon-framework v0.0.0-00010101000000-000000000000
21+
open-cluster-management.io/api v0.15.1-0.20241120090202-cb7ce98ab874
2222
sigs.k8s.io/controller-runtime v0.18.5
2323
)
2424

25+
replace open-cluster-management.io/addon-framework => github.com/zhiweiyin318/addon-framework v0.0.0-20241125074036-754aecccaf2b
26+
2527
replace github.com/prometheus/common => github.com/prometheus/common v0.60.1
2628

2729
replace github.com/prometheus/client_golang => github.com/prometheus/client_golang v1.20.5

go.sum

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q
219219
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
220220
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
221221
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
222+
github.com/zhiweiyin318/addon-framework v0.0.0-20241125074036-754aecccaf2b h1:xBnQWUqASEhZfyUZ8Bsl7fBe78ZF+Y2od524n6dZfCA=
223+
github.com/zhiweiyin318/addon-framework v0.0.0-20241125074036-754aecccaf2b/go.mod h1:tsBSNs9mGfVQQjXBnjgpiX6r0UM+G3iNfmzQgKhEfw4=
222224
go.etcd.io/bbolt v1.3.8 h1:xs88BrvEv273UsB79e0hcVrlUWmS0a8upikMFhSyAtA=
223225
go.etcd.io/bbolt v1.3.8/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=
224226
go.etcd.io/etcd/api/v3 v3.5.12 h1:W4sw5ZoU2Juc9gBWuLk5U6fHfNVyY1WC5g9uiXZio/c=
@@ -378,10 +380,8 @@ k8s.io/kube-openapi v0.0.0-20240620174524-b456828f718b h1:Q9xmGWBvOGd8UJyccgpYlL
378380
k8s.io/kube-openapi v0.0.0-20240620174524-b456828f718b/go.mod h1:UxDHUPsUwTOOxSU+oXURfFBcAS6JwiRXTYqYwfuGowc=
379381
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak=
380382
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
381-
open-cluster-management.io/addon-framework v0.11.0 h1:ZJxphgHQ36VUJF0RIag+nzcEn5PNyep2rsEPdz6wT7o=
382-
open-cluster-management.io/addon-framework v0.11.0/go.mod h1:ruMU8i/dciz3qCv2CQ46Cu1b7rkK7TpvB+W4bRwHf+I=
383-
open-cluster-management.io/api v0.15.0 h1:lRee1KOlGHZb2scTA7ff9E9Fxt2hJc7jpkHnaCbvkOU=
384-
open-cluster-management.io/api v0.15.0/go.mod h1:9erZEWEn4bEqh0nIX2wA7f/s3KCuFycQdBrPrRzi0QM=
383+
open-cluster-management.io/api v0.15.1-0.20241120090202-cb7ce98ab874 h1:WgkuYXTbJV7EK+qtiMq3soa21faGUKeTG5w0C8Mn1Ok=
384+
open-cluster-management.io/api v0.15.1-0.20241120090202-cb7ce98ab874/go.mod h1:9erZEWEn4bEqh0nIX2wA7f/s3KCuFycQdBrPrRzi0QM=
385385
open-cluster-management.io/sdk-go v0.15.0 h1:2IAJnPfUoY6rPC5w7LhqAnvIlgekPoVW03LdZO1unIM=
386386
open-cluster-management.io/sdk-go v0.15.0/go.mod h1:fi5WBsbC5K3txKb8eRLuP0Sim/Oqz/PHX18skAEyjiA=
387387
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.29.0 h1:/U5vjBbQn3RChhv7P11uhYvCSm5G2GaIi5AIGBS6r4c=

internal/addon/addon.go

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@ import (
1414
)
1515

1616
var (
17+
errMissingFields = errors.New("no fields found in health checker")
1718
errProbeConditionNotSatisfied = errors.New("probe condition is not satisfied")
1819
errProbeValueIsNil = errors.New("probe value is nil")
19-
errValueNotProbed = errors.New("value not probed")
20+
errUnknownProbeKey = errors.New("probe has key that doesn't match the key defined")
21+
errUnknownResource = errors.New("undefined health check for resource")
2022
)
2123

2224
func NewRegistrationOption(agentName string) *agent.RegistrationOption {
@@ -90,42 +92,53 @@ func AgentHealthProber() *agent.HealthProber {
9092
},
9193
},
9294
},
93-
HealthCheck: func(identifier workv1.ResourceIdentifier, result workv1.StatusFeedbackResult) error {
94-
for _, value := range result.Values {
95+
HealthChecker: func(fields []agent.ResultField) error {
96+
if len(fields) == 0 {
97+
return errMissingFields
98+
}
99+
for _, field := range fields {
100+
if len(field.FeedbackResult.Values) == 0 {
101+
// If a probe didn't get any values maybe the resources was not deployed
102+
continue
103+
}
104+
identifier := field.ResourceIdentifier
95105
switch {
96106
case identifier.Resource == ClusterLogForwardersResource:
97-
if value.Name != clfProbeKey {
98-
continue
99-
}
107+
for _, value := range field.FeedbackResult.Values {
108+
if value.Name != clfProbeKey {
109+
return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
110+
}
100111

101-
if value.Value.String == nil {
102-
return fmt.Errorf("%w: clusterlogforwarder with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
103-
}
112+
if value.Value.String == nil {
113+
return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
114+
}
104115

105-
if *value.Value.String != "True" {
106-
return fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, *value.Value.String, identifier.Namespace, identifier.Name)
116+
if *value.Value.String != "True" {
117+
return fmt.Errorf("%w: %s status condition type is %s for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.String, identifier.Namespace, identifier.Name)
118+
}
119+
// everything checks we should skip to the next field
107120
}
108-
109-
return nil
110121
case identifier.Resource == OpenTelemetryCollectorsResource:
111-
if value.Name != otelColProbeKey {
112-
continue
113-
}
122+
for _, value := range field.FeedbackResult.Values {
123+
if value.Name != otelColProbeKey {
124+
return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
125+
}
114126

115-
if value.Value.Integer == nil {
116-
return fmt.Errorf("%w: opentelemetrycollector with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
117-
}
127+
if value.Value.Integer == nil {
128+
return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
129+
}
118130

119-
if *value.Value.Integer < 1 {
120-
return fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, *value.Value.Integer, identifier.Namespace, identifier.Name)
131+
if *value.Value.Integer < 1 {
132+
return fmt.Errorf("%w: %s replicas is %d for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.Integer, identifier.Namespace, identifier.Name)
133+
}
134+
// everything checks we should skip to the next field
121135
}
122-
123-
return nil
124136
default:
125-
continue
137+
// If a resource is being probed it should have a health check defined
138+
return fmt.Errorf("%w: %s with key %s/%s", errUnknownResource, identifier.Resource, identifier.Namespace, identifier.Name)
126139
}
127140
}
128-
return fmt.Errorf("%w: for resource %s with key %s/%s", errValueNotProbed, identifier.Resource, identifier.Namespace, identifier.Name)
141+
return nil
129142
},
130143
},
131144
}

internal/addon/addon_test.go

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ import (
44
"fmt"
55
"testing"
66

7-
"github.com/stretchr/testify/require"
8-
workv1 "open-cluster-management.io/api/work/v1"
9-
107
otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
118
loggingv1 "github.com/openshift/cluster-logging-operator/api/observability/v1"
9+
"github.com/stretchr/testify/require"
10+
"open-cluster-management.io/addon-framework/pkg/agent"
11+
workv1 "open-cluster-management.io/api/work/v1"
1212
)
1313

1414
func Test_AgentHealthProber_CLF(t *testing.T) {
15-
unhealthyError := fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
15+
unhealthyError := fmt.Errorf("%w: clusterlogforwarders status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
1616
for _, tc := range []struct {
1717
name string
1818
status string
@@ -30,18 +30,23 @@ func Test_AgentHealthProber_CLF(t *testing.T) {
3030
} {
3131
t.Run(tc.name, func(t *testing.T) {
3232
healthProber := AgentHealthProber()
33-
err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
34-
Group: loggingv1.GroupVersion.Group,
35-
Resource: ClusterLogForwardersResource,
36-
Name: SpokeCLFName,
37-
Namespace: SpokeCLFNamespace,
38-
}, workv1.StatusFeedbackResult{
39-
Values: []workv1.FeedbackValue{
40-
{
41-
Name: "isReady",
42-
Value: workv1.FieldValue{
43-
Type: workv1.String,
44-
String: &tc.status,
33+
err := healthProber.WorkProber.HealthChecker([]agent.ResultField{
34+
{
35+
ResourceIdentifier: workv1.ResourceIdentifier{
36+
Group: loggingv1.GroupVersion.Group,
37+
Resource: ClusterLogForwardersResource,
38+
Name: SpokeCLFName,
39+
Namespace: SpokeCLFNamespace,
40+
},
41+
FeedbackResult: workv1.StatusFeedbackResult{
42+
Values: []workv1.FeedbackValue{
43+
{
44+
Name: "isReady",
45+
Value: workv1.FieldValue{
46+
Type: workv1.String,
47+
String: &tc.status,
48+
},
49+
},
4550
},
4651
},
4752
},
@@ -56,7 +61,7 @@ func Test_AgentHealthProber_CLF(t *testing.T) {
5661
}
5762

5863
func Test_AgentHealthProber_OTELCol(t *testing.T) {
59-
unhealthyError := fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
64+
unhealthyError := fmt.Errorf("%w: opentelemetrycollectors replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
6065
for _, tc := range []struct {
6166
name string
6267
replicas int64
@@ -74,18 +79,23 @@ func Test_AgentHealthProber_OTELCol(t *testing.T) {
7479
} {
7580
t.Run(tc.name, func(t *testing.T) {
7681
healthProber := AgentHealthProber()
77-
err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
78-
Group: otelv1alpha1.GroupVersion.Group,
79-
Resource: OpenTelemetryCollectorsResource,
80-
Name: SpokeOTELColName,
81-
Namespace: SpokeOTELColNamespace,
82-
}, workv1.StatusFeedbackResult{
83-
Values: []workv1.FeedbackValue{
84-
{
85-
Name: "replicas",
86-
Value: workv1.FieldValue{
87-
Type: workv1.Integer,
88-
Integer: &tc.replicas,
82+
err := healthProber.WorkProber.HealthChecker([]agent.ResultField{
83+
{
84+
ResourceIdentifier: workv1.ResourceIdentifier{
85+
Group: otelv1alpha1.GroupVersion.Group,
86+
Resource: OpenTelemetryCollectorsResource,
87+
Name: SpokeOTELColName,
88+
Namespace: SpokeOTELColNamespace,
89+
},
90+
FeedbackResult: workv1.StatusFeedbackResult{
91+
Values: []workv1.FeedbackValue{
92+
{
93+
Name: "replicas",
94+
Value: workv1.FieldValue{
95+
Type: workv1.Integer,
96+
Integer: &tc.replicas,
97+
},
98+
},
8999
},
90100
},
91101
},

0 commit comments

Comments
 (0)