Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 75 additions & 25 deletions internal/addon/addon.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,20 @@ import (

otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
loggingv1 "github.com/openshift/cluster-logging-operator/api/observability/v1"
prometheusalpha1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1"
"open-cluster-management.io/addon-framework/pkg/agent"
"open-cluster-management.io/addon-framework/pkg/utils"
"open-cluster-management.io/api/addon/v1alpha1"
v1 "open-cluster-management.io/api/cluster/v1"
workv1 "open-cluster-management.io/api/work/v1"
)

var (
errMissingFields = errors.New("no fields found in health checker")
errProbeConditionNotSatisfied = errors.New("probe condition is not satisfied")
errProbeValueIsNil = errors.New("probe value is nil")
errValueNotProbed = errors.New("value not probed")
errUnknownProbeKey = errors.New("probe has key that doesn't match the key defined")
errUnknownResource = errors.New("undefined health check for resource")
)

func NewRegistrationOption(agentName string) *agent.RegistrationOption {
Expand All @@ -31,6 +36,25 @@ func AgentHealthProber() *agent.HealthProber {
Type: agent.HealthProberTypeWork,
WorkProber: &agent.WorkHealthProber{
ProbeFields: []agent.ProbeField{
{
ResourceIdentifier: workv1.ResourceIdentifier{
Group: prometheusalpha1.SchemeGroupVersion.Group,
Resource: PrometheusAgentResource,
Name: PPAName,
Namespace: InstallNamespace,
},
ProbeRules: []workv1.FeedbackRule{
{
Type: workv1.JSONPathsType,
JsonPaths: []workv1.JsonPath{
{
Name: paProbeKey,
Path: paProbePath,
},
},
},
},
},
Comment on lines +39 to +57
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thibaultmg I've added a probe to the Platform prometheus, feel free to request any changes in this. It I can also add for UWM if you want but I would also have to update the HealthChecker

{
ResourceIdentifier: workv1.ResourceIdentifier{
Group: loggingv1.GroupVersion.Group,
Expand Down Expand Up @@ -70,42 +94,68 @@ func AgentHealthProber() *agent.HealthProber {
},
},
},
HealthCheck: func(identifier workv1.ResourceIdentifier, result workv1.StatusFeedbackResult) error {
for _, value := range result.Values {
HealthChecker: func(fields []agent.FieldResult, mc *v1.ManagedCluster, mcao *v1alpha1.ManagedClusterAddOn) error {
if len(fields) == 0 {
return errMissingFields
}
for _, field := range fields {
if len(field.FeedbackResult.Values) == 0 {
// If a probe didn't get any values maybe the resources were not deployed
continue
}
identifier := field.ResourceIdentifier
switch identifier.Resource {
case ClusterLogForwardersResource:
if value.Name != clfProbeKey {
continue
}
case PrometheusAgentResource:
for _, value := range field.FeedbackResult.Values {
if value.Name != paProbeKey {
return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
}

if value.Value.String == nil {
return fmt.Errorf("%w: clusterlogforwarder with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
}
if value.Value.String == nil {
return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
}

if *value.Value.String != "True" {
return fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, *value.Value.String, identifier.Namespace, identifier.Name)
if *value.Value.String != "True" {
return fmt.Errorf("%w: %s status condition type is %s for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.String, identifier.Namespace, identifier.Name)
}
// pa passes the health check
}
case ClusterLogForwardersResource:
for _, value := range field.FeedbackResult.Values {
if value.Name != clfProbeKey {
return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
}

return nil
case OpenTelemetryCollectorsResource:
if value.Name != otelColProbeKey {
continue
}
if value.Value.String == nil {
return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
}

if value.Value.Integer == nil {
return fmt.Errorf("%w: opentelemetrycollector with key %s/%s", errProbeValueIsNil, identifier.Namespace, identifier.Name)
if *value.Value.String != "True" {
return fmt.Errorf("%w: %s status condition type is %s for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.String, identifier.Namespace, identifier.Name)
}
// clf passes the health check
}
case OpenTelemetryCollectorsResource:
for _, value := range field.FeedbackResult.Values {
if value.Name != otelColProbeKey {
return fmt.Errorf("%w: %s with key %s/%s unknown probe keys %s", errUnknownProbeKey, identifier.Resource, identifier.Namespace, identifier.Name, value.Name)
}

if *value.Value.Integer < 1 {
return fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, *value.Value.Integer, identifier.Namespace, identifier.Name)
}
if value.Value.Integer == nil {
return fmt.Errorf("%w: %s with key %s/%s", errProbeValueIsNil, identifier.Resource, identifier.Namespace, identifier.Name)
}

return nil
if *value.Value.Integer < 1 {
return fmt.Errorf("%w: %s replicas is %d for %s/%s", errProbeConditionNotSatisfied, identifier.Resource, *value.Value.Integer, identifier.Namespace, identifier.Name)
}
// otel collector passes the health check
}
default:
continue
// If a resource is being probed it should have a health check defined
return fmt.Errorf("%w: %s with key %s/%s", errUnknownResource, identifier.Resource, identifier.Namespace, identifier.Name)
}
}
return fmt.Errorf("%w: for resource %s with key %s/%s", errValueNotProbed, identifier.Resource, identifier.Namespace, identifier.Name)
return nil
},
},
}
Expand Down
125 changes: 97 additions & 28 deletions internal/addon/addon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,67 @@ import (
otelv1alpha1 "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
loggingv1 "github.com/openshift/cluster-logging-operator/api/observability/v1"
"github.com/stretchr/testify/require"
"open-cluster-management.io/addon-framework/pkg/addonmanager/addontesting"
"open-cluster-management.io/addon-framework/pkg/agent"
workv1 "open-cluster-management.io/api/work/v1"
)

func Test_AgentHealthProber_PPA(t *testing.T) {
unhealthyError := fmt.Errorf("%w: prometheusagents status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", InstallNamespace, PPAName)
managedCluster := addontesting.NewManagedCluster("cluster-1")
managedClusterAddOn := addontesting.NewAddon("test", "cluster-1")
for _, tc := range []struct {
name string
status string
expectedErr string
}{
{
name: "healthy",
status: "True",
},
{
name: "unhealthy",
status: "False",
expectedErr: unhealthyError.Error(),
},
} {
t.Run(tc.name, func(t *testing.T) {
healthProber := AgentHealthProber()
err := healthProber.WorkProber.HealthChecker(
[]agent.FieldResult{
{
ResourceIdentifier: workv1.ResourceIdentifier{
Group: loggingv1.GroupVersion.Group,
Resource: PrometheusAgentResource,
Name: PPAName,
Namespace: InstallNamespace,
},
FeedbackResult: workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
{
Name: "isAvailable",
Value: workv1.FieldValue{
Type: workv1.String,
String: &tc.status,
},
},
},
},
},
}, managedCluster, managedClusterAddOn)
if tc.expectedErr != "" {
require.EqualError(t, err, tc.expectedErr)
return
}
require.NoError(t, err)
})
}
}

func Test_AgentHealthProber_CLF(t *testing.T) {
unhealthyError := fmt.Errorf("%w: clusterlogforwarder status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
unhealthyError := fmt.Errorf("%w: clusterlogforwarders status condition type is %s for %s/%s", errProbeConditionNotSatisfied, "False", SpokeCLFNamespace, SpokeCLFName)
managedCluster := addontesting.NewManagedCluster("cluster-1")
managedClusterAddOn := addontesting.NewAddon("test", "cluster-1")
for _, tc := range []struct {
name string
status string
Expand All @@ -29,22 +85,28 @@ func Test_AgentHealthProber_CLF(t *testing.T) {
} {
t.Run(tc.name, func(t *testing.T) {
healthProber := AgentHealthProber()
err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
Group: loggingv1.GroupVersion.Group,
Resource: ClusterLogForwardersResource,
Name: SpokeCLFName,
Namespace: SpokeCLFNamespace,
}, workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
err := healthProber.WorkProber.HealthChecker(
[]agent.FieldResult{
{
Name: "isReady",
Value: workv1.FieldValue{
Type: workv1.String,
String: &tc.status,
ResourceIdentifier: workv1.ResourceIdentifier{
Group: loggingv1.GroupVersion.Group,
Resource: ClusterLogForwardersResource,
Name: SpokeCLFName,
Namespace: SpokeCLFNamespace,
},
FeedbackResult: workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
{
Name: "isReady",
Value: workv1.FieldValue{
Type: workv1.String,
String: &tc.status,
},
},
},
},
},
},
})
}, managedCluster, managedClusterAddOn)
if tc.expectedErr != "" {
require.EqualError(t, err, tc.expectedErr)
return
Expand All @@ -55,7 +117,9 @@ func Test_AgentHealthProber_CLF(t *testing.T) {
}

func Test_AgentHealthProber_OTELCol(t *testing.T) {
unhealthyError := fmt.Errorf("%w: opentelemetrycollector replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
unhealthyError := fmt.Errorf("%w: opentelemetrycollectors replicas is %d for %s/%s", errProbeConditionNotSatisfied, 0, SpokeOTELColNamespace, SpokeOTELColName)
managedCluster := addontesting.NewManagedCluster("cluster-1")
managedClusterAddOn := addontesting.NewAddon("test", "cluster-1")
for _, tc := range []struct {
name string
replicas int64
Expand All @@ -73,22 +137,27 @@ func Test_AgentHealthProber_OTELCol(t *testing.T) {
} {
t.Run(tc.name, func(t *testing.T) {
healthProber := AgentHealthProber()
err := healthProber.WorkProber.HealthCheck(workv1.ResourceIdentifier{
Group: otelv1alpha1.GroupVersion.Group,
Resource: OpenTelemetryCollectorsResource,
Name: SpokeOTELColName,
Namespace: SpokeOTELColNamespace,
}, workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
{
Name: "replicas",
Value: workv1.FieldValue{
Type: workv1.Integer,
Integer: &tc.replicas,
err := healthProber.WorkProber.HealthChecker([]agent.FieldResult{
{
ResourceIdentifier: workv1.ResourceIdentifier{
Group: otelv1alpha1.GroupVersion.Group,
Resource: OpenTelemetryCollectorsResource,
Name: SpokeOTELColName,
Namespace: SpokeOTELColNamespace,
},
FeedbackResult: workv1.StatusFeedbackResult{
Values: []workv1.FeedbackValue{
{
Name: "replicas",
Value: workv1.FieldValue{
Type: workv1.Integer,
Integer: &tc.replicas,
},
},
},
},
},
})
}, managedCluster, managedClusterAddOn)
if tc.expectedErr != "" {
require.EqualError(t, err, tc.expectedErr)
return
Expand Down
17 changes: 11 additions & 6 deletions internal/addon/var.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,17 @@ const (
TracingChartDir = "manifests/charts/mcoa/charts/tracing"

AddonDeploymentConfigResource = "addondeploymentconfigs"
ClusterLogForwardersResource = "clusterlogforwarders"
SpokeCLFName = "mcoa-instance"
SpokeCLFNamespace = "openshift-logging"
clfProbeKey = "isReady"
// TODO @JoaoBraveCoding this most likely needs to be updated to reflect the new path
clfProbePath = ".status.conditions[?(@.type==\"Ready\")].status"

PrometheusAgentResource = "prometheusagents"
PPAName = "acm-platform-metrics-collector-config"
paProbeKey = "isAvailable"
paProbePath = ".status.conditions[?(@.type==\"Available\")].status"

ClusterLogForwardersResource = "clusterlogforwarders"
SpokeCLFName = "mcoa-instance"
SpokeCLFNamespace = "openshift-logging"
clfProbeKey = "isReady"
clfProbePath = ".status.conditions[?(@.type==\"Ready\")].status"

OpenTelemetryCollectorsResource = "opentelemetrycollectors"
InstrumentationResource = "instrumentations"
Expand Down