Skip to content

Commit 5ff7dcd

Browse files
openshift-cherrypick-robotzhujian7claude
authored
[release-1.1] 🐛 Fix health check to support partial probe results with multi-probe configuration (#343)
* 🐛 Fix health check to support partial probe results with multi-probe configuration When multiple health check probes are configured, if the first probe returns empty feedback results, the health check was incorrectly marking the addon as Unknown and returning early, preventing other probes from being evaluated. This fix changes the logic to: - Collect all empty probe fields instead of returning immediately - Only mark the addon as Unknown if ALL probes have no results - Allow partial probe results to be considered valid - Remove redundant check for non-empty results after healthChecker Fixes #341 Signed-off-by: Claude <[email protected]> Signed-off-by: zhujian <[email protected]> * Refactor health check probe result handling Simplify the probe result collection logic by: - Collecting all probe results upfront, regardless of healthChecker - Checking for empty results based on FieldResults length - Removing unnecessary tracking of empty probe fields - Updating test expectations for multi-probe scenarios This makes the code cleaner and maintains the fix for supporting partial probe results with multi-probe configurations. Signed-off-by: Jia Zhu <[email protected]> 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]> Signed-off-by: zhujian <[email protected]> --------- Signed-off-by: Claude <[email protected]> Signed-off-by: zhujian <[email protected]> Co-authored-by: zhujian <[email protected]> Co-authored-by: Claude <[email protected]>
1 parent c2f5c52 commit 5ff7dcd

File tree

3 files changed

+89
-17
lines changed

3 files changed

+89
-17
lines changed

pkg/addonmanager/controllers/agentdeploy/healthcheck_sync.go

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -187,29 +187,19 @@ func (s *healthCheckSyncer) probeAddonStatusByWorks(
187187
return err
188188
}
189189

190-
var FieldResults []agent.FieldResult
190+
var fieldResults []agent.FieldResult
191191

192192
for _, field := range probeFields {
193193
results := findResultsByIdentifier(field.ResourceIdentifier, manifestConditions)
194194
// if no results are returned. it is possible that work agent has not returned the feedback value.
195-
// mark condition to unknown
195+
// collect these fields and check if all probes are empty later
196196
if len(results) == 0 {
197-
meta.SetStatusCondition(&addon.Status.Conditions, metav1.Condition{
198-
Type: addonapiv1alpha1.ManagedClusterAddOnConditionAvailable,
199-
Status: metav1.ConditionUnknown,
200-
Reason: addonapiv1alpha1.AddonAvailableReasonNoProbeResult,
201-
Message: fmt.Sprintf("Probe results are not returned for %s/%s: %s/%s",
202-
field.ResourceIdentifier.Group, field.ResourceIdentifier.Resource,
203-
field.ResourceIdentifier.Namespace, field.ResourceIdentifier.Name),
204-
})
205-
return nil
197+
continue
206198
}
207199

200+
fieldResults = append(fieldResults, results...)
208201
// healthCheck will be ignored if healthChecker is set
209202
if healthChecker != nil {
210-
if len(results) != 0 {
211-
FieldResults = append(FieldResults, results...)
212-
}
213203
continue
214204
}
215205

@@ -238,8 +228,22 @@ func (s *healthCheckSyncer) probeAddonStatusByWorks(
238228

239229
}
240230

231+
// If all probe fields have no results, mark condition to unknown
232+
if len(probeFields) > 0 && len(fieldResults) == 0 {
233+
meta.SetStatusCondition(&addon.Status.Conditions, metav1.Condition{
234+
Type: addonapiv1alpha1.ManagedClusterAddOnConditionAvailable,
235+
Status: metav1.ConditionUnknown,
236+
Reason: addonapiv1alpha1.AddonAvailableReasonNoProbeResult,
237+
Message: "Probe results are not returned",
238+
})
239+
return nil
240+
}
241+
242+
// If we have fieldResults but some probes are empty, still proceed with healthChecker
243+
// This allows partial probe results to be considered valid
244+
241245
if healthChecker != nil {
242-
if err := healthChecker(FieldResults, cluster, addon); err != nil {
246+
if err := healthChecker(fieldResults, cluster, addon); err != nil {
243247
meta.SetStatusCondition(&addon.Status.Conditions, metav1.Condition{
244248
Type: addonapiv1alpha1.ManagedClusterAddOnConditionAvailable,
245249
Status: metav1.ConditionFalse,

pkg/addonmanager/controllers/agentdeploy/healthcheck_sync_test.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1259,6 +1259,74 @@ func TestHealthCheckReconcile(t *testing.T) {
12591259
Message: "test add-on is available.",
12601260
},
12611261
},
1262+
{
1263+
name: "Health check mode is work with multi probes where first probe has empty results but second has valid results",
1264+
testAddon: &healthCheckTestAgent{name: "test",
1265+
health: newDeploymentsCheckAllProber(
1266+
types.NamespacedName{Name: "test-deployment-nonexistent", Namespace: "default"},
1267+
types.NamespacedName{Name: "test-deployment1", Namespace: "default"}),
1268+
},
1269+
addon: addontesting.NewAddonWithConditions("test", "cluster1", manifestAppliedCondition),
1270+
existingWork: []runtime.Object{
1271+
&v1.ManifestWork{
1272+
ObjectMeta: metav1.ObjectMeta{
1273+
Name: "addon-test-deploy-01",
1274+
Namespace: "cluster1",
1275+
Labels: map[string]string{
1276+
"open-cluster-management.io/addon-name": "test",
1277+
},
1278+
},
1279+
Spec: v1.ManifestWorkSpec{},
1280+
Status: v1.ManifestWorkStatus{
1281+
ResourceStatus: v1.ManifestResourceStatus{
1282+
Manifests: []v1.ManifestCondition{
1283+
{
1284+
ResourceMeta: v1.ManifestResourceMeta{
1285+
Ordinal: 0,
1286+
Group: "apps",
1287+
Version: "",
1288+
Kind: "",
1289+
Resource: "deployments",
1290+
Name: "test-deployment1",
1291+
Namespace: "default",
1292+
},
1293+
StatusFeedbacks: v1.StatusFeedbackResult{
1294+
Values: []v1.FeedbackValue{
1295+
{
1296+
Name: "Replicas",
1297+
Value: v1.FieldValue{
1298+
Integer: boolPtr(1),
1299+
},
1300+
},
1301+
{
1302+
Name: "ReadyReplicas",
1303+
Value: v1.FieldValue{
1304+
Integer: boolPtr(1),
1305+
},
1306+
},
1307+
},
1308+
},
1309+
},
1310+
},
1311+
},
1312+
Conditions: []metav1.Condition{
1313+
{
1314+
Type: v1.WorkAvailable,
1315+
Status: metav1.ConditionTrue,
1316+
},
1317+
},
1318+
},
1319+
},
1320+
},
1321+
expectedErr: nil,
1322+
expectedHealthCheckMode: addonapiv1alpha1.HealthCheckModeCustomized,
1323+
expectAvailableCondition: metav1.Condition{
1324+
Type: addonapiv1alpha1.ManagedClusterAddOnConditionAvailable,
1325+
Status: metav1.ConditionTrue,
1326+
Reason: addonapiv1alpha1.AddonAvailableReasonProbeAvailable,
1327+
Message: "test add-on is available.",
1328+
},
1329+
},
12621330
}
12631331

12641332
for _, c := range cases {

test/integration/kube/agent_deploy_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -922,8 +922,8 @@ var _ = ginkgo.Describe("Agent deploy", func() {
922922
return err
923923
}
924924

925-
// the daemonset is not probed, so the addon available condition is unknown
926-
if !meta.IsStatusConditionPresentAndEqual(addon.Status.Conditions, "Available", metav1.ConditionUnknown) {
925+
// the daemonset is not probed, so the addon available condition is false
926+
if !meta.IsStatusConditionPresentAndEqual(addon.Status.Conditions, "Available", metav1.ConditionFalse) {
927927
return fmt.Errorf("Unexpected addon available condition, %v", addon.Status.Conditions)
928928
}
929929
return nil

0 commit comments

Comments
 (0)