Skip to content

Commit 5f5dc9e

Browse files
authored
Merge pull request #57 from BaizeAI/add-metrics
add metrics
2 parents 637aca0 + 5846f82 commit 5f5dc9e

File tree

11 files changed

+595
-6
lines changed

11 files changed

+595
-6
lines changed

README.md

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# snapshot-pod
2-
// TODO(user): Add simple overview of use/purpose
2+
Snapshot and export running Pod container images on Kubernetes nodes.
33

44
## Description
5-
// TODO(user): An in-depth paragraph about your project and overview of use
5+
This controller creates point-in-time snapshots of running Pods by committing container filesystem layers and optionally pushing the resulting image to a registry. It offers Prometheus metrics for observability of task states, failures, and controller health.
66

77
## Getting Started
88

@@ -66,6 +66,48 @@ make uninstall
6666
make undeploy
6767
```
6868

69+
## Metrics
70+
71+
All metrics are exposed on the controller's Prometheus endpoint.
72+
73+
- Snapshot tasks
74+
- `snapshot_tasks_total{status, namespace, snapshot_name}`: Counter. Number of tasks by final/observed status. Use with rate/increase if needed.
75+
- `snapshot_task_duration_seconds{status, namespace, snapshot_name}`: Histogram. Task duration seconds by status.
76+
- `snapshot_task_retries_total{namespace, snapshot_name, task_name}`: Counter. Retry attempts count.
77+
- `snapshot_active_tasks{namespace, snapshot_name}`: Gauge. Current active tasks per SnapshotPod.
78+
- `snapshot_tasks_by_node{node_name, status}`: Gauge. Task counts per node and status.
79+
- `snapshot_task_failed_total{namespace, snapshot_name, task_name}`: Counter. Increments when a task step errors or task exhausts max retries.
80+
81+
- Controller
82+
- `snapshot_controller_reconcile_total{controller, result}`: Counter. Reconcile outcomes (success/error/requeue).
83+
- `snapshot_controller_reconcile_duration_seconds{controller}`: Histogram. Reconcile latency.
84+
- `snapshot_controller_errors_total{controller, error_type}`: Counter. Controller error events.
85+
86+
- Image push
87+
- `snapshot_image_push_total{result, registry, namespace, snapshot_name}`: Counter. Push attempts.
88+
- `snapshot_image_push_duration_seconds{registry, namespace, snapshot_name}`: Histogram. Push latency.
89+
90+
### Alerting examples (PromQL)
91+
92+
- New failures in last 5 minutes (per task):
93+
```promql
94+
sum by (namespace, snapshot_name, task_name) (increase(snapshot_task_failed_total[5m])) > 0
95+
```
96+
97+
- Any new failure in a namespace:
98+
```promql
99+
sum by (namespace) (increase(snapshot_task_failed_total[5m])) > 0
100+
```
101+
102+
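- Optional stability: add a `for` clause to the alerting rule so the condition must hold continuously before the alert fires: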
103+
```yaml
104+
for: 1m
105+
```
106+
107+
Notes:
108+
- Always alert on a rate/increase over a window for counters; do not compare raw counters to 0.
109+
- Tune the lookback window (e.g., 5m/10m) to balance sensitivity vs stability.
110+
69111
## Project Distribution
70112
71113
Following are the steps to build the installer and distribute this project to users.

cmd/main.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import (
4343

4444
snapshotpodv1alpha1 "github.com/baizeai/kube-snapshot/api/v1alpha1"
4545
"github.com/baizeai/kube-snapshot/internal/controller"
46+
"github.com/baizeai/kube-snapshot/internal/metrics"
4647
// +kubebuilder:scaffold:imports
4748
)
4849

@@ -66,8 +67,8 @@ func main() {
6667
var enableHTTP2 bool
6768
var systemWideDockerConfigPath string
6869
var certDir string
69-
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metric endpoint binds to. "+
70-
"Use the port :8080. If not set, it will be 0 in order to disable the metrics server")
70+
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to. "+
71+
"Set to '0' to disable the metrics server.")
7172
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
7273
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
7374
"Enable leader election for controller manager. "+
@@ -212,6 +213,11 @@ func main() {
212213
}
213214
// +kubebuilder:scaffold:builder
214215

216+
// Start metrics collector with shared signal context
217+
ctx := ctrl.SetupSignalHandler()
218+
metricsCollector := metrics.NewMetricsCollector(mgr.GetClient())
219+
go metricsCollector.Start(ctx)
220+
215221
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
216222
setupLog.Error(err, "unable to set up health check")
217223
os.Exit(1)
@@ -222,7 +228,7 @@ func main() {
222228
}
223229

224230
setupLog.Info("starting manager")
225-
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
231+
if err := mgr.Start(ctx); err != nil {
226232
setupLog.Error(err, "problem running manager")
227233
os.Exit(1)
228234
}

internal/controller/snapshotpod_controller.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"sigs.k8s.io/controller-runtime/pkg/log"
3838

3939
snapshotpodv1alpha1 "github.com/baizeai/kube-snapshot/api/v1alpha1"
40+
"github.com/baizeai/kube-snapshot/internal/metrics"
4041
"github.com/baizeai/kube-snapshot/pkg/apis/snapshotpod/v1alpha1"
4142
)
4243

@@ -322,11 +323,15 @@ func renderNewImageName(originImage, format string, appendSnappedSuffix bool) (s
322323
// For more details, check Reconcile and its Result here:
323324
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
324325
func (r *SnapshotPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
326+
startTime := time.Now()
325327
logger := log.FromContext(ctx)
328+
326329
sp := snapshotpodv1alpha1.SnapshotPod{}
327330
err := r.Client.Get(ctx, req.NamespacedName, &sp)
328331
if err != nil {
329332
logger.Error(err, "get instance error")
333+
metrics.RecordControllerError("snapshotpod", "get_instance_error")
334+
metrics.RecordControllerReconcile("snapshotpod", metrics.ControllerResultError, time.Since(startTime))
330335
return ctrl.Result{}, err
331336
}
332337
if sp.Spec.TriggerRound <= 0 {
@@ -392,8 +397,11 @@ func (r *SnapshotPodReconciler) Reconcile(ctx context.Context, req ctrl.Request)
392397
if find || lo.ContainsBy(sp.Status.Conditions, func(item metav1.Condition) bool {
393398
return item.Status != metav1.ConditionTrue
394399
}) {
400+
metrics.RecordControllerReconcile("snapshotpod", metrics.ControllerResultRequeue, time.Since(startTime))
395401
return ctrl.Result{RequeueAfter: time.Second * 15}, nil
396402
}
403+
404+
metrics.RecordControllerReconcile("snapshotpod", metrics.ControllerResultSuccess, time.Since(startTime))
397405
return ctrl.Result{}, nil
398406
}
399407

internal/controller/snapshotpodtask_controller.go

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535

3636
snapshotpodv1alpha1 "github.com/baizeai/kube-snapshot/api/v1alpha1"
3737
criruntime "github.com/baizeai/kube-snapshot/internal/controller/runtime"
38+
"github.com/baizeai/kube-snapshot/internal/metrics"
39+
"github.com/baizeai/kube-snapshot/pkg/apis/snapshotpod/v1alpha1"
3840
)
3941

4042
const (
@@ -85,16 +87,33 @@ func (r *SnapshotPodTaskReconciler) getImageAuthWithSecret(ctx context.Context,
8587
return &a, nil
8688
}
8789

90+
// getSnapshotName extracts the snapshot name from the task's labels, falling back to "unknown" if the label is missing or empty
91+
func (r *SnapshotPodTaskReconciler) getSnapshotName(spt snapshotpodv1alpha1.SnapshotPodTask) string {
92+
snapshotName := spt.Labels[v1alpha1.SnapshotNameLabel]
93+
if snapshotName == "" {
94+
return "unknown"
95+
}
96+
return snapshotName
97+
}
98+
8899
func (r *SnapshotPodTaskReconciler) reconcilePushImage(ctx context.Context, spt *snapshotpodv1alpha1.SnapshotPodTask) error {
100+
startTime := time.Now()
101+
102+
// Get snapshot name from label
103+
snapshotName := r.getSnapshotName(*spt)
104+
89105
rtName, rt, _, err := r.getRuntimeAndContainerID(spt.Spec.ContainerID)
90106
if err != nil {
107+
metrics.RecordImagePush(metrics.ImagePushResultFailed, "unknown", spt.Namespace, snapshotName, time.Since(startTime))
91108
return err
92109
}
93110
if !rt.ImageExists(ctx, spt.Spec.CommitImage) {
111+
metrics.RecordImagePush(metrics.ImagePushResultFailed, "unknown", spt.Namespace, snapshotName, time.Since(startTime))
94112
return fmt.Errorf("image %s not exists", spt.Spec.CommitImage)
95113
}
96114
auth, err := r.getImageAuthWithSecret(ctx, spt.Namespace, spt.Spec.RegistrySecretRef, spt.Spec.CommitImage)
97115
if err != nil {
116+
metrics.RecordImagePush(metrics.ImagePushResultFailed, "unknown", spt.Namespace, snapshotName, time.Since(startTime))
98117
return err
99118
}
100119
if rtName == containerdRuntime {
@@ -103,7 +122,21 @@ func (r *SnapshotPodTaskReconciler) reconcilePushImage(ctx context.Context, spt
103122
a, _ := r.getImageAuthWithSecret(ctx, spt.Namespace, spt.Spec.OriginRegistrySecretRef, spt.Spec.OriginImage)
104123
_ = rt.Pull(ctx, spt.Spec.OriginImage, a, "--unpack=false")
105124
}
106-
return rt.Push(ctx, spt.Spec.CommitImage, auth)
125+
126+
// Extract image registry for metrics label
127+
registry := "unknown"
128+
if ref, err := docker.ParseReference("//" + spt.Spec.CommitImage); err == nil {
129+
registry = reference.Domain(ref.DockerReference())
130+
}
131+
132+
err = rt.Push(ctx, spt.Spec.CommitImage, auth)
133+
if err != nil {
134+
metrics.RecordImagePush(metrics.ImagePushResultFailed, registry, spt.Namespace, snapshotName, time.Since(startTime))
135+
return err
136+
}
137+
138+
metrics.RecordImagePush(metrics.ImagePushResultSuccess, registry, spt.Namespace, snapshotName, time.Since(startTime))
139+
return nil
107140
}
108141

109142
func (r *SnapshotPodTaskReconciler) reconcileCommit(ctx context.Context, spt *snapshotpodv1alpha1.SnapshotPodTask) error {
@@ -142,6 +175,7 @@ func (r *SnapshotPodTaskReconciler) reconcileAccept(ctx context.Context, spt *sn
142175
// For more details, check Reconcile and its Result here:
143176
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
144177
func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
178+
startTime := time.Now()
145179
logger := log.FromContext(ctx)
146180

147181
spt := snapshotpodv1alpha1.SnapshotPodTask{}
@@ -150,6 +184,8 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
150184
if errors.IsNotFound(err) {
151185
return ctrl.Result{}, nil
152186
}
187+
metrics.RecordControllerError("snapshotpodtask", "get_instance_error")
188+
metrics.RecordControllerReconcile("snapshotpodtask", metrics.ControllerResultError, time.Since(startTime))
153189
return ctrl.Result{}, err
154190
}
155191
if spt.Spec.NodeName != r.NodeName {
@@ -208,6 +244,7 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
208244
spt.Status.Conditions[i].Status = metav1.ConditionFalse
209245
spt.Status.Conditions[i].Message = err.Error()
210246
lastError = err
247+
211248
// Update status immediately after error
212249
if updateErr := r.Status().Update(ctx, &spt); updateErr != nil {
213250
return ctrl.Result{}, updateErr
@@ -227,6 +264,9 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
227264
}
228265
}
229266

267+
// Get snapshot name from label for metrics
268+
snapshotName := r.getSnapshotName(spt)
269+
230270
// Update phase based on conditions
231271
switch {
232272
case lo.EveryBy(spt.Status.Conditions, func(item metav1.Condition) bool {
@@ -243,6 +283,9 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
243283
}):
244284
spt.Status.RetryCount++
245285
spt.Status.LastRetryTime = lo.ToPtr(metav1.Now())
286+
287+
// Record retry count when it's actually incremented
288+
metrics.RecordSnapshotTaskRetry(spt.Namespace, snapshotName, spt.Name)
246289
maxRetries := spt.Spec.MaxRetries
247290
if maxRetries == 0 {
248291
maxRetries = 3
@@ -252,6 +295,8 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
252295
logger.Info("task failed after max retries",
253296
"retryCount", spt.Status.RetryCount,
254297
"maxRetries", maxRetries)
298+
299+
metrics.RecordTaskFailed(spt.Namespace, snapshotName, spt.Name)
255300
}
256301
default:
257302
spt.Status.Phase = snapshotpodv1alpha1.SnapshotPodTaskPhaseCreated
@@ -262,10 +307,34 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
262307
return ctrl.Result{}, err
263308
}
264309

310+
// Record metrics only when task reaches terminal state
311+
// For running tasks, we should use gauge metrics instead of counter/histogram
312+
if spt.Status.Phase == snapshotpodv1alpha1.SnapshotPodTaskPhaseCompleted ||
313+
spt.Status.Phase == snapshotpodv1alpha1.SnapshotPodTaskPhaseFailed {
314+
315+
var status string
316+
switch spt.Status.Phase {
317+
case snapshotpodv1alpha1.SnapshotPodTaskPhaseCompleted:
318+
status = metrics.SnapshotTaskStatusSuccess
319+
case snapshotpodv1alpha1.SnapshotPodTaskPhaseFailed:
320+
status = metrics.SnapshotTaskStatusFailed
321+
}
322+
323+
// Calculate task duration
324+
var duration time.Duration
325+
if !spt.CreationTimestamp.Time.IsZero() {
326+
duration = time.Since(spt.CreationTimestamp.Time)
327+
}
328+
metrics.RecordSnapshotTask(status, spt.Namespace, snapshotName, duration)
329+
}
330+
265331
switch spt.Status.Phase {
266332
case snapshotpodv1alpha1.SnapshotPodTaskPhaseFailed, snapshotpodv1alpha1.SnapshotPodTaskPhaseCompleted:
333+
metrics.RecordControllerReconcile("snapshotpodtask", metrics.ControllerResultSuccess, time.Since(startTime))
267334
return ctrl.Result{}, nil
268335
}
336+
337+
metrics.RecordControllerReconcile("snapshotpodtask", metrics.ControllerResultRequeue, time.Since(startTime))
269338
retryDelay := time.Second * 30
270339
if spt.Spec.RetryDelaySeconds > 0 {
271340
retryDelay = time.Second * time.Duration(spt.Spec.RetryDelaySeconds)

0 commit comments

Comments
 (0)