Commit 7b4b9b9

feat: update health probe to make use of healthChecker (#94)
1 parent 914f908 commit 7b4b9b9

File tree

3 files changed: +183 −59 lines

  internal/addon/addon.go
  internal/addon/addon_test.go
  internal/addon/var.go

internal/addon/addon.go

Lines changed: 75 additions & 25 deletions
@@ -6,15 +6,20 @@ import (
 
 	otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
 	loggingv1 "github.com/openshift/cluster-logging-operator/api/observability/v1"
+	prometheusalpha1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1"
 	"open-cluster-management.io/addon-framework/pkg/agent"
 	"open-cluster-management.io/addon-framework/pkg/utils"
+	"open-cluster-management.io/api/addon/v1alpha1"
+	v1 "open-cluster-management.io/api/cluster/v1"
 	workv1 "open-cluster-management.io/api/work/v1"
 )
 
 var (
+	errMissingFields              = errors.New("no fields found in health checker")
 	errProbeConditionNotSatisfied = errors.New("probe condition is not satisfied")
 	errProbeValueIsNil            = errors.New("probe value is nil")
-	errValueNotProbed             = errors.New("value not probed")
+	errUnknownProbeKey            = errors.New("probe has key that doesn't match the key defined")
+	errUnknownResource            = errors.New("undefined health check for resource")
 )
 
 func NewRegistrationOption(agentName string) *agent.RegistrationOption {
@@ -31,6 +36,25 @@ func AgentHealthProber() *agent.HealthProber {
 		Type: agent.HealthProberTypeWork,
 		WorkProber: &agent.WorkHealthProber{
 			ProbeFields: []agent.ProbeField{
+				{
+					ResourceIdentifier: workv1.ResourceIdentifier{
+						Group:     prometheusalpha1.SchemeGroupVersion.Group,
+						Resource:  PrometheusAgentResource,
+						Name:      PPAName,
+						Namespace: InstallNamespace,
+					},
+					ProbeRules: []workv1.FeedbackRule{
+						{
+							Type: workv1.JSONPathsType,
+							JsonPaths: []workv1.JsonPath{
+								{
+									Name: paProbeKey,
+									Path: paProbePath,
+								},
+							},
+						},
+					},
+				},
 				{
 					ResourceIdentifier: workv1.ResourceIdentifier{
 						Group: loggingv1.GroupVersion.Group,
@@ -70,42 +94,68 @@ func AgentHealthProber() *agent.HealthProber {
 				},
 			},
 		},
-		HealthCheck: func(identifier workv1.ResourceIdentifier, result workv1.StatusFeedbackResult) error {
-			for _, value := range result.Values {
+		HealthChecker: func(fields []agent.FieldResult, mc *v1.ManagedCluster, mcao *v1alpha1.ManagedClusterAddOn) error {
+			if len(fields) == 0 {
+				return errMissingFields
+			}
+			for _, field := range fields {
+				if len(field.FeedbackResult.Values) == 0 {
+					// If a probe didn't get any values maybe the resources were not deployed
+					continue
+				}
+				identifier := field.ResourceIdentifier
 				switch identifier.Resource {
-				case ClusterLogForwardersResource:
-					if value.Name != clfProbeKey {
-						continue
-					}
+				case PrometheusAgentResource:
+					for _, value := range field.FeedbackResult.Values {
+						if value.Name != paProbeKey {
+							return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
+						}
 
-					if value.Value.String == nil {
-						return fmt.Errorf("%w: clusterlogforwarder with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
-					}
+						if value.Value.String == nil {
+							return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
+						}
 
-					if *value.Value.String != "True" {
-						return fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, *value.Value.String, identifier.Namespace, identifier.Name)
+						if *value.Value.String != "True" {
+							return fmt.Errorf("%w: %s status condition type is %s for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.String, identifier.Namespace, identifier.Name)
+						}
+						// pa passes the health check
 					}
+				case ClusterLogForwardersResource:
+					for _, value := range field.FeedbackResult.Values {
+						if value.Name != clfProbeKey {
+							return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
+						}
 
-					return nil
-				case OpenTelemetryCollectorsResource:
-					if value.Name != otelColProbeKey {
-						continue
-					}
+						if value.Value.String == nil {
+							return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
+						}
 
-					if value.Value.Integer == nil {
-						return fmt.Errorf("%w: opentelemetrycollector with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
+						if *value.Value.String != "True" {
+							return fmt.Errorf("%w: %s status condition type is %s for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.String, identifier.Namespace, identifier.Name)
+						}
+						// clf passes the health check
 					}
+				case OpenTelemetryCollectorsResource:
+					for _, value := range field.FeedbackResult.Values {
+						if value.Name != otelColProbeKey {
+							return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
+						}
 
-					if *value.Value.Integer < 1 {
-						return fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, *value.Value.Integer, identifier.Namespace, identifier.Name)
-					}
+						if value.Value.Integer == nil {
+							return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
+						}
 
-					return nil
+						if *value.Value.Integer < 1 {
+							return fmt.Errorf("%w: %s replicas is %d for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.Integer, identifier.Namespace, identifier.Name)
+						}
+						// otel collector passes the health check
+					}
 				default:
-					continue
+					// If a resource is being probed it should have a health check defined
+					return fmt.Errorf("%w: %s with key %s/%s", errUnknownResource, identifier.Resource, identifier.Namespace, identifier.Name)
 				}
 			}
-			return fmt.Errorf("%w: for resource %s with key %s/%s", errValueNotProbed, identifier.Resource, identifier.Namespace, identifier.Name)
+			return nil
 		},
 	},
 }
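The core of this change is the move from the per-resource HealthCheck callback to the aggregated HealthChecker, which receives every probed field in a single call and can therefore fail fast on empty input or on a resource with no defined check. The self-contained Go sketch below distills that pattern for the string-valued probes only; FieldResult and FeedbackValue here are simplified stand-ins for the addon-framework and work API types, not the real ones, and the resource names are copied from the diff purely for illustration.

// healthcheck_sketch.go — a minimal sketch of the aggregated health-check
// pattern, under the assumptions stated above.
package main

import (
	"errors"
	"fmt"
)

// Simplified stand-in for workv1.FeedbackValue (string-valued probes only).
type FeedbackValue struct {
	Name   string
	String *string
}

// Simplified stand-in for agent.FieldResult.
type FieldResult struct {
	Resource  string
	Namespace string
	Name      string
	Values    []FeedbackValue
}

var (
	errMissingFields   = errors.New("no fields found in health checker")
	errNotSatisfied    = errors.New("probe condition is not satisfied")
	errUnknownResource = errors.New("undefined health check for resource")
)

// checkAll mirrors the new HealthChecker shape: empty input is an error,
// fields with no values are skipped (the resource may not be deployed yet),
// and a probed resource without a defined check fails instead of being ignored.
func checkAll(fields []FieldResult) error {
	if len(fields) == 0 {
		return errMissingFields
	}
	for _, f := range fields {
		if len(f.Values) == 0 {
			continue
		}
		switch f.Resource {
		case "prometheusagents", "clusterlogforwarders":
			for _, v := range f.Values {
				if v.String == nil || *v.String != "True" {
					return fmt.Errorf("%w: %s %s/%s", errNotSatisfied, f.Resource, f.Namespace, f.Name)
				}
			}
		default:
			return fmt.Errorf("%w: %s %s/%s", errUnknownResource, f.Resource, f.Namespace, f.Name)
		}
	}
	return nil
}

func main() {
	ok := "True"
	fmt.Println(checkAll([]FieldResult{{
		Resource:  "prometheusagents",
		Namespace: "open-cluster-management",
		Name:      "acm-platform-metrics-collector-config",
		Values:    []FeedbackValue{{Name: "isAvailable", String: &ok}},
	}})) // prints: <nil>
}

The key design consequence is visible in the default case: with the old HealthCheck, an unrecognized resource was silently skipped and a missing probe value surfaced only as errValueNotProbed at the end; the aggregated form turns both situations into explicit errors up front.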

internal/addon/addon_test.go

Lines changed: 97 additions & 28 deletions
@@ -7,11 +7,67 @@ import (
 	otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
 	loggingv1 "github.com/openshift/cluster-logging-operator/api/observability/v1"
 	"github.com/stretchr/testify/require"
+	"open-cluster-management.io/addon-framework/pkg/addonmanager/addontesting"
+	"open-cluster-management.io/addon-framework/pkg/agent"
 	workv1 "open-cluster-management.io/api/work/v1"
 )
 
+func Test_AgentHealthProber_PPA(t *testing.T) {
+	unhealthyError := fmt.Errorf("%w: prometheusagents status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", InstallNamespace, PPAName)
+	managedCluster := addontesting.NewManagedCluster("cluster-1")
+	managedClusterAddOn := addontesting.NewAddon("test", "cluster-1")
+	for _, tc := range []struct {
+		name        string
+		status      string
+		expectedErr string
+	}{
+		{
+			name:   "healthy",
+			status: "True",
+		},
+		{
+			name:        "unhealthy",
+			status:      "False",
+			expectedErr: unhealthyError.Error(),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			healthProber := AgentHealthProber()
+			err := healthProber.WorkProber.HealthChecker(
+				[]agent.FieldResult{
+					{
+						ResourceIdentifier: workv1.ResourceIdentifier{
+							Group:     loggingv1.GroupVersion.Group,
+							Resource:  PrometheusAgentResource,
+							Name:      PPAName,
+							Namespace: InstallNamespace,
+						},
+						FeedbackResult: workv1.StatusFeedbackResult{
+							Values: []workv1.FeedbackValue{
+								{
+									Name: "isAvailable",
+									Value: workv1.FieldValue{
+										Type:   workv1.String,
+										String: &tc.status,
+									},
+								},
+							},
+						},
+					},
+				}, managedCluster, managedClusterAddOn)
+			if tc.expectedErr != "" {
+				require.EqualError(t, err, tc.expectedErr)
+				return
+			}
+			require.NoError(t, err)
+		})
+	}
+}
+
 func Test_AgentHealthProber_CLF(t *testing.T) {
-	unhealthyError := fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
+	unhealthyError := fmt.Errorf("%w: clusterlogforwarders status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
+	managedCluster := addontesting.NewManagedCluster("cluster-1")
+	managedClusterAddOn := addontesting.NewAddon("test", "cluster-1")
 	for _, tc := range []struct {
 		name        string
 		status      string
@@ -29,22 +85,28 @@ func Test_AgentHealthProber_CLF(t *testing.T) {
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			healthProber := AgentHealthProber()
-			err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
-				Group:     loggingv1.GroupVersion.Group,
-				Resource:  ClusterLogForwardersResource,
-				Name:      SpokeCLFName,
-				Namespace: SpokeCLFNamespace,
-			}, workv1.StatusFeedbackResult{
-				Values: []workv1.FeedbackValue{
+			err := healthProber.WorkProber.HealthChecker(
+				[]agent.FieldResult{
 					{
-						Name: "isReady",
-						Value: workv1.FieldValue{
-							Type:   workv1.String,
-							String: &tc.status,
+						ResourceIdentifier: workv1.ResourceIdentifier{
+							Group:     loggingv1.GroupVersion.Group,
+							Resource:  ClusterLogForwardersResource,
+							Name:      SpokeCLFName,
+							Namespace: SpokeCLFNamespace,
+						},
+						FeedbackResult: workv1.StatusFeedbackResult{
+							Values: []workv1.FeedbackValue{
+								{
+									Name: "isReady",
+									Value: workv1.FieldValue{
+										Type:   workv1.String,
+										String: &tc.status,
+									},
+								},
+							},
 						},
 					},
-				},
-			})
+				}, managedCluster, managedClusterAddOn)
 			if tc.expectedErr != "" {
 				require.EqualError(t, err, tc.expectedErr)
 				return
@@ -55,7 +117,9 @@ func Test_AgentHealthProber_CLF(t *testing.T) {
 }
 
 func Test_AgentHealthProber_OTELCol(t *testing.T) {
-	unhealthyError := fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
+	unhealthyError := fmt.Errorf("%w: opentelemetrycollectors replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
+	managedCluster := addontesting.NewManagedCluster("cluster-1")
+	managedClusterAddOn := addontesting.NewAddon("test", "cluster-1")
 	for _, tc := range []struct {
 		name        string
 		replicas    int64
@@ -73,22 +137,27 @@ func Test_AgentHealthProber_OTELCol(t *testing.T) {
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			healthProber := AgentHealthProber()
-			err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
-				Group:     otelv1alpha1.GroupVersion.Group,
-				Resource:  OpenTelemetryCollectorsResource,
-				Name:      SpokeOTELColName,
-				Namespace: SpokeOTELColNamespace,
-			}, workv1.StatusFeedbackResult{
-				Values: []workv1.FeedbackValue{
-					{
-						Name: "replicas",
-						Value: workv1.FieldValue{
-							Type:    workv1.Integer,
-							Integer: &tc.replicas,
+			err := healthProber.WorkProber.HealthChecker([]agent.FieldResult{
+				{
+					ResourceIdentifier: workv1.ResourceIdentifier{
+						Group:     otelv1alpha1.GroupVersion.Group,
+						Resource:  OpenTelemetryCollectorsResource,
+						Name:      SpokeOTELColName,
+						Namespace: SpokeOTELColNamespace,
+					},
+					FeedbackResult: workv1.StatusFeedbackResult{
+						Values: []workv1.FeedbackValue{
+							{
+								Name: "replicas",
+								Value: workv1.FieldValue{
+									Type:    workv1.Integer,
+									Integer: &tc.replicas,
+								},
+							},
						},
 					},
 				},
-			})
+			}, managedCluster, managedClusterAddOn)
 			if tc.expectedErr != "" {
 				require.EqualError(t, err, tc.expectedErr)
 				return
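Assuming standard Go tooling, the updated table-driven tests can be run from the repository root with:

	go test ./internal/addon/ -run Test_AgentHealthProber

Note that every test now builds the full []agent.FieldResult slice and passes fake ManagedCluster and ManagedClusterAddOn objects from the addontesting package, since HealthChecker takes all three arguments even when a test only exercises one resource.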

internal/addon/var.go

Lines changed: 11 additions & 6 deletions
@@ -16,12 +16,17 @@ const (
 	TracingChartDir = "manifests/charts/mcoa/charts/tracing"
 
 	AddonDeploymentConfigResource = "addondeploymentconfigs"
-	ClusterLogForwardersResource  = "clusterlogforwarders"
-	SpokeCLFName                  = "mcoa-instance"
-	SpokeCLFNamespace             = "openshift-logging"
-	clfProbeKey                   = "isReady"
-	// TODO @JoaoBraveCoding this most likely needs to be updated to reflect the new path
-	clfProbePath                  = ".status.conditions[?(@.type==\"Ready\")].status"
+
+	PrometheusAgentResource = "prometheusagents"
+	PPAName                 = "acm-platform-metrics-collector-config"
+	paProbeKey              = "isAvailable"
+	paProbePath             = ".status.conditions[?(@.type==\"Available\")].status"
+
+	ClusterLogForwardersResource = "clusterlogforwarders"
+	SpokeCLFName                 = "mcoa-instance"
+	SpokeCLFNamespace            = "openshift-logging"
+	clfProbeKey                  = "isReady"
+	clfProbePath                 = ".status.conditions[?(@.type==\"Ready\")].status"
 
 	OpenTelemetryCollectorsResource = "opentelemetrycollectors"
 	InstrumentationResource         = "instrumentations"
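The probe paths are Kubernetes JSONPath expressions that the work agent evaluates against the manifest's status. As a quick way to sanity-check what paProbePath selects, here is a sketch (not part of the commit) that evaluates it with k8s.io/client-go/util/jsonpath against a hand-written, simplified status; the sample conditions are illustrative, client-go must be available as a dependency, and its parser expects the expression wrapped in braces.

// jsonpath_sketch.go — evaluates the Available-condition probe path against
// a stand-in status object, under the assumptions stated above.
package main

import (
	"fmt"
	"os"

	"k8s.io/client-go/util/jsonpath"
)

func main() {
	// Simplified stand-in for a PrometheusAgent object's status.
	obj := map[string]any{
		"status": map[string]any{
			"conditions": []any{
				map[string]any{"type": "Available", "status": "True"},
				map[string]any{"type": "Reconciled", "status": "True"},
			},
		},
	}

	// paProbePath from var.go, wrapped in {} as client-go's jsonpath expects.
	jp := jsonpath.New("pa")
	if err := jp.Parse(`{.status.conditions[?(@.type=="Available")].status}`); err != nil {
		panic(err)
	}
	if err := jp.Execute(os.Stdout, obj); err != nil { // prints: True
		panic(err)
	}
	fmt.Println()
}

The feedback rule in addon.go names this extracted value paProbeKey ("isAvailable"), which is exactly the key the HealthChecker matches on; anything other than the string "True" fails the probe.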
