@@ -35,6 +35,8 @@ import (
3535
3636 snapshotpodv1alpha1 "github.com/baizeai/kube-snapshot/api/v1alpha1"
3737 criruntime "github.com/baizeai/kube-snapshot/internal/controller/runtime"
38+ "github.com/baizeai/kube-snapshot/internal/metrics"
39+ "github.com/baizeai/kube-snapshot/pkg/apis/snapshotpod/v1alpha1"
3840)
3941
4042const (
@@ -85,16 +87,33 @@ func (r *SnapshotPodTaskReconciler) getImageAuthWithSecret(ctx context.Context,
8587 return & a , nil
8688}
8789
90+ // getSnapshotName extracts the snapshot name from the task's labels, falling back to the task name if not found
91+ func (r * SnapshotPodTaskReconciler ) getSnapshotName (spt snapshotpodv1alpha1.SnapshotPodTask ) string {
92+ snapshotName := spt .Labels [v1alpha1 .SnapshotNameLabel ]
93+ if snapshotName == "" {
94+ return "unknown"
95+ }
96+ return snapshotName
97+ }
98+
8899func (r * SnapshotPodTaskReconciler ) reconcilePushImage (ctx context.Context , spt * snapshotpodv1alpha1.SnapshotPodTask ) error {
100+ startTime := time .Now ()
101+
102+ // Get snapshot name from label
103+ snapshotName := r .getSnapshotName (* spt )
104+
89105 rtName , rt , _ , err := r .getRuntimeAndContainerID (spt .Spec .ContainerID )
90106 if err != nil {
107+ metrics .RecordImagePush (metrics .ImagePushResultFailed , "unknown" , spt .Namespace , snapshotName , time .Since (startTime ))
91108 return err
92109 }
93110 if ! rt .ImageExists (ctx , spt .Spec .CommitImage ) {
111+ metrics .RecordImagePush (metrics .ImagePushResultFailed , "unknown" , spt .Namespace , snapshotName , time .Since (startTime ))
94112 return fmt .Errorf ("image %s not exists" , spt .Spec .CommitImage )
95113 }
96114 auth , err := r .getImageAuthWithSecret (ctx , spt .Namespace , spt .Spec .RegistrySecretRef , spt .Spec .CommitImage )
97115 if err != nil {
116+ metrics .RecordImagePush (metrics .ImagePushResultFailed , "unknown" , spt .Namespace , snapshotName , time .Since (startTime ))
98117 return err
99118 }
100119 if rtName == containerdRuntime {
@@ -103,7 +122,21 @@ func (r *SnapshotPodTaskReconciler) reconcilePushImage(ctx context.Context, spt
103122 a , _ := r .getImageAuthWithSecret (ctx , spt .Namespace , spt .Spec .OriginRegistrySecretRef , spt .Spec .OriginImage )
104123 _ = rt .Pull (ctx , spt .Spec .OriginImage , a , "--unpack=false" )
105124 }
106- return rt .Push (ctx , spt .Spec .CommitImage , auth )
125+
126+ // Extract image registry for metrics label
127+ registry := "unknown"
128+ if ref , err := docker .ParseReference ("//" + spt .Spec .CommitImage ); err == nil {
129+ registry = reference .Domain (ref .DockerReference ())
130+ }
131+
132+ err = rt .Push (ctx , spt .Spec .CommitImage , auth )
133+ if err != nil {
134+ metrics .RecordImagePush (metrics .ImagePushResultFailed , registry , spt .Namespace , snapshotName , time .Since (startTime ))
135+ return err
136+ }
137+
138+ metrics .RecordImagePush (metrics .ImagePushResultSuccess , registry , spt .Namespace , snapshotName , time .Since (startTime ))
139+ return nil
107140}
108141
109142func (r * SnapshotPodTaskReconciler ) reconcileCommit (ctx context.Context , spt * snapshotpodv1alpha1.SnapshotPodTask ) error {
@@ -142,6 +175,7 @@ func (r *SnapshotPodTaskReconciler) reconcileAccept(ctx context.Context, spt *sn
142175// For more details, check Reconcile and its Result here:
143176// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
144177func (r * SnapshotPodTaskReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
178+ startTime := time .Now ()
145179 logger := log .FromContext (ctx )
146180
147181 spt := snapshotpodv1alpha1.SnapshotPodTask {}
@@ -150,6 +184,8 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
150184 if errors .IsNotFound (err ) {
151185 return ctrl.Result {}, nil
152186 }
187+ metrics .RecordControllerError ("snapshotpodtask" , "get_instance_error" )
188+ metrics .RecordControllerReconcile ("snapshotpodtask" , metrics .ControllerResultError , time .Since (startTime ))
153189 return ctrl.Result {}, err
154190 }
155191 if spt .Spec .NodeName != r .NodeName {
@@ -208,6 +244,7 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
208244 spt .Status .Conditions [i ].Status = metav1 .ConditionFalse
209245 spt .Status .Conditions [i ].Message = err .Error ()
210246 lastError = err
247+
211248 // Update status immediately after error
212249 if updateErr := r .Status ().Update (ctx , & spt ); updateErr != nil {
213250 return ctrl.Result {}, updateErr
@@ -227,6 +264,9 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
227264 }
228265 }
229266
267+ // Get snapshot name from label for metrics
268+ snapshotName := r .getSnapshotName (spt )
269+
230270 // Update phase based on conditions
231271 switch {
232272 case lo .EveryBy (spt .Status .Conditions , func (item metav1.Condition ) bool {
@@ -243,6 +283,9 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
243283 }):
244284 spt .Status .RetryCount ++
245285 spt .Status .LastRetryTime = lo .ToPtr (metav1 .Now ())
286+
287+ // Record retry count when it's actually incremented
288+ metrics .RecordSnapshotTaskRetry (spt .Namespace , snapshotName , spt .Name )
246289 maxRetries := spt .Spec .MaxRetries
247290 if maxRetries == 0 {
248291 maxRetries = 3
@@ -252,6 +295,8 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
252295 logger .Info ("task failed after max retries" ,
253296 "retryCount" , spt .Status .RetryCount ,
254297 "maxRetries" , maxRetries )
298+
299+ metrics .RecordTaskFailed (spt .Namespace , snapshotName , spt .Name )
255300 }
256301 default :
257302 spt .Status .Phase = snapshotpodv1alpha1 .SnapshotPodTaskPhaseCreated
@@ -262,10 +307,34 @@ func (r *SnapshotPodTaskReconciler) Reconcile(ctx context.Context, req ctrl.Requ
262307 return ctrl.Result {}, err
263308 }
264309
310+ // Record metrics only when task reaches terminal state
311+ // For running tasks, we should use gauge metrics instead of counter/histogram
312+ if spt .Status .Phase == snapshotpodv1alpha1 .SnapshotPodTaskPhaseCompleted ||
313+ spt .Status .Phase == snapshotpodv1alpha1 .SnapshotPodTaskPhaseFailed {
314+
315+ var status string
316+ switch spt .Status .Phase {
317+ case snapshotpodv1alpha1 .SnapshotPodTaskPhaseCompleted :
318+ status = metrics .SnapshotTaskStatusSuccess
319+ case snapshotpodv1alpha1 .SnapshotPodTaskPhaseFailed :
320+ status = metrics .SnapshotTaskStatusFailed
321+ }
322+
323+ // Calculate task duration
324+ var duration time.Duration
325+ if ! spt .CreationTimestamp .Time .IsZero () {
326+ duration = time .Since (spt .CreationTimestamp .Time )
327+ }
328+ metrics .RecordSnapshotTask (status , spt .Namespace , snapshotName , duration )
329+ }
330+
265331 switch spt .Status .Phase {
266332 case snapshotpodv1alpha1 .SnapshotPodTaskPhaseFailed , snapshotpodv1alpha1 .SnapshotPodTaskPhaseCompleted :
333+ metrics .RecordControllerReconcile ("snapshotpodtask" , metrics .ControllerResultSuccess , time .Since (startTime ))
267334 return ctrl.Result {}, nil
268335 }
336+
337+ metrics .RecordControllerReconcile ("snapshotpodtask" , metrics .ControllerResultRequeue , time .Since (startTime ))
269338 retryDelay := time .Second * 30
270339 if spt .Spec .RetryDelaySeconds > 0 {
271340 retryDelay = time .Second * time .Duration (spt .Spec .RetryDelaySeconds )
0 commit comments