Commit 1384645

🌱 optimize the requeue timing for lease controller (#1236)
Signed-off-by: Yang Le <[email protected]>
1 parent 3722a00 commit 1384645

2 files changed: +137 −3 lines

pkg/registration/hub/lease/controller.go

Lines changed: 9 additions & 3 deletions
@@ -124,15 +124,21 @@ func (c *leaseController) sync(ctx context.Context, syncCtx factory.SyncContext)
 	}
 
 	now := time.Now()
-	if !now.Before(observedLease.Spec.RenewTime.Add(gracePeriod)) {
+	leaseExpired := !now.Before(observedLease.Spec.RenewTime.Add(gracePeriod))
+
+	if leaseExpired {
 		// the lease is not updated constantly, change the cluster available condition to unknown
 		if err := c.updateClusterStatus(ctx, cluster); err != nil {
 			return err
 		}
+		// Requeue after grace period. Recovery will be detected immediately via lease watch.
+		syncCtx.Queue().AddAfter(clusterName, gracePeriod)
+	} else {
+		// Lease is fresh, requeue exactly when it will expire to detect expiration immediately
+		timeUntilExpiry := observedLease.Spec.RenewTime.Add(gracePeriod).Sub(now)
+		syncCtx.Queue().AddAfter(clusterName, timeUntilExpiry)
 	}
 
-	// always requeue this cluster to check its lease constantly
-	syncCtx.Queue().AddAfter(clusterName, gracePeriod)
 	return nil
 }
 
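Taken on its own, the change replaces a fixed requeue of one grace period with a delay derived from the observed lease: an expired lease is re-checked after a full grace period (a renewal is picked up immediately by the lease watch), while a fresh lease is re-checked exactly when it would expire, so expiration no longer goes unnoticed for up to an extra grace period. A minimal, self-contained sketch of that timing rule follows; the requeueDelay helper and its parameters are illustrative and not part of the patch, and the 5 × LeaseDurationSeconds grace period is taken from the test comments below.

// Illustrative sketch only — not code from the patch.
package main

import (
	"fmt"
	"time"
)

func requeueDelay(renewTime, now time.Time, gracePeriod time.Duration) time.Duration {
	expireAt := renewTime.Add(gracePeriod)
	if !now.Before(expireAt) {
		// Lease already expired: re-check after a full grace period; a renewal
		// in the meantime is detected via the lease watch, not via this requeue.
		return gracePeriod
	}
	// Lease still fresh: wake up exactly when it would expire.
	return expireAt.Sub(now)
}

func main() {
	now := time.Now()
	grace := 5 * time.Minute // grace period = 5 * LeaseDurationSeconds(60s), as in the tests
	fmt.Println(requeueDelay(now.Add(-4*time.Minute), now, grace)) // lease renewed 4m ago -> ~1m until expiry
	fmt.Println(requeueDelay(now.Add(-6*time.Minute), now, grace)) // lease already expired -> full 5m grace period
}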

pkg/registration/hub/lease/controller_test.go

Lines changed: 128 additions & 0 deletions
@@ -7,11 +7,14 @@ import (
 	"testing"
 	"time"
 
+	"github.com/openshift/library-go/pkg/operator/events"
+	"github.com/openshift/library-go/pkg/operator/events/eventstesting"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	kubeinformers "k8s.io/client-go/informers"
 	kubefake "k8s.io/client-go/kubernetes/fake"
 	clienttesting "k8s.io/client-go/testing"
+	"k8s.io/client-go/util/workqueue"
 
 	clusterfake "open-cluster-management.io/api/client/cluster/clientset/versioned/fake"
 	clusterscheme "open-cluster-management.io/api/client/cluster/clientset/versioned/scheme"
@@ -202,3 +205,128 @@ func newDeletingManagedCluster() *clusterv1.ManagedCluster {
 	cluster.DeletionTimestamp = &now
 	return cluster
 }
+
+// spyQueue wraps a real queue and captures AddAfter calls
+type spyQueue struct {
+	workqueue.RateLimitingInterface
+	addAfterDelay time.Duration
+	addAfterKey   interface{}
+}
+
+func (s *spyQueue) AddAfter(item interface{}, duration time.Duration) {
+	s.addAfterDelay = duration
+	s.addAfterKey = item
+	s.RateLimitingInterface.AddAfter(item, duration)
+}
+
+// testSyncContext is a custom sync context for testing requeue timing
+type testSyncContext struct {
+	queueKey string
+	recorder events.Recorder
+	queue    *spyQueue
+}
+
+func (t *testSyncContext) Queue() workqueue.RateLimitingInterface { return t.queue }
+func (t *testSyncContext) QueueKey() string                       { return t.queueKey }
+func (t *testSyncContext) Recorder() events.Recorder              { return t.recorder }
+
+func newManagedClusterWithLeaseDuration(seconds int32) *clusterv1.ManagedCluster {
+	cluster := testinghelpers.NewAvailableManagedCluster()
+	cluster.Spec.LeaseDurationSeconds = seconds
+	return cluster
+}
+
+func TestRequeueTime(t *testing.T) {
+	// Using 60 second lease duration (grace period = 5 * 60 = 300 seconds)
+	cases := []struct {
+		name               string
+		cluster            runtime.Object
+		clusterLease       runtime.Object
+		expectedRequeueMin time.Duration
+		expectedRequeueMax time.Duration
+	}{
+		{
+			name:    "requeue for expired lease - recovery detected via watch, use grace period",
+			cluster: newManagedClusterWithLeaseDuration(60),
+			// Lease expired 5 minutes ago (grace period is 5 * 60 = 300 seconds)
+			clusterLease:       testinghelpers.NewManagedClusterLease("managed-cluster-lease", now.Add(-5*time.Minute)),
+			expectedRequeueMin: 295 * time.Second, // Should be ~300s (grace period)
+			expectedRequeueMax: 305 * time.Second, // Allow some tolerance
+		},
+		{
+			name:    "requeue for fresh lease - should use time until expiry for immediate detection",
+			cluster: newManagedClusterWithLeaseDuration(60),
+			// Lease renewed 4 minutes ago, will expire in 1 minute (grace period is 5 minutes)
+			clusterLease:       testinghelpers.NewManagedClusterLease("managed-cluster-lease", now.Add(-4*time.Minute)),
+			expectedRequeueMin: 55 * time.Second, // Should be ~60s (1 minute)
+			expectedRequeueMax: 65 * time.Second, // Allow some tolerance
+		},
+		{
+			name:    "requeue for recently renewed lease - should check just before expiry",
+			cluster: newManagedClusterWithLeaseDuration(60),
+			// Lease renewed 30 seconds ago, will expire in 4.5 minutes
+			clusterLease:       testinghelpers.NewManagedClusterLease("managed-cluster-lease", now.Add(-30*time.Second)),
+			expectedRequeueMin: 265 * time.Second, // Should be ~270s (4.5 minutes)
+			expectedRequeueMax: 275 * time.Second, // Allow some tolerance
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			clusterClient := clusterfake.NewSimpleClientset(c.cluster)
+			clusterInformerFactory := clusterinformers.NewSharedInformerFactory(clusterClient, time.Minute*10)
+			clusterStore := clusterInformerFactory.Cluster().V1().ManagedClusters().Informer().GetStore()
+			if err := clusterStore.Add(c.cluster); err != nil {
+				t.Fatal(err)
+			}
+
+			hubClient := kubefake.NewSimpleClientset(c.clusterLease)
+			leaseInformerFactory := kubeinformers.NewSharedInformerFactory(hubClient, time.Minute*10)
+			leaseStore := leaseInformerFactory.Coordination().V1().Leases().Informer().GetStore()
+			if err := leaseStore.Add(c.clusterLease); err != nil {
+				t.Fatal(err)
+			}
+
+			ctx := context.TODO()
+			mcEventRecorder, err := helpers.NewEventRecorder(ctx, clusterscheme.Scheme, hubClient.EventsV1(), "test")
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			// Create a custom sync context with spy queue to capture AddAfter calls
+			spyQ := &spyQueue{
+				RateLimitingInterface: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
+			}
+			syncCtx := &testSyncContext{
+				queueKey: testinghelpers.TestManagedClusterName,
+				recorder: eventstesting.NewTestingEventRecorder(t),
+				queue:    spyQ,
+			}
+
+			ctrl := &leaseController{
+				kubeClient: hubClient,
+				patcher: patcher.NewPatcher[
+					*clusterv1.ManagedCluster, clusterv1.ManagedClusterSpec, clusterv1.ManagedClusterStatus](
+					clusterClient.ClusterV1().ManagedClusters()),
+				clusterLister:   clusterInformerFactory.Cluster().V1().ManagedClusters().Lister(),
+				leaseLister:     leaseInformerFactory.Coordination().V1().Leases().Lister(),
+				mcEventRecorder: mcEventRecorder,
+			}
+
+			syncErr := ctrl.sync(context.TODO(), syncCtx)
+			if syncErr != nil {
+				t.Errorf("unexpected err: %v", syncErr)
+			}
+
+			// Verify the requeue delay captured by the spy
+			actualDelay := spyQ.addAfterDelay
+			if actualDelay < c.expectedRequeueMin || actualDelay > c.expectedRequeueMax {
+				t.Errorf("Unexpected requeue delay: got %v, expected between %v and %v",
+					actualDelay, c.expectedRequeueMin, c.expectedRequeueMax)
+			} else {
+				t.Logf("✓ Requeue delay %v is within expected range [%v, %v]",
+					actualDelay, c.expectedRequeueMin, c.expectedRequeueMax)
+			}
+		})
+	}
+}
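For reference, the expected ranges above follow directly from the new requeue rule: an already-expired lease is requeued after the full 300 s grace period; a lease renewed 4 minutes ago is requeued after 300 s − 240 s = 60 s; one renewed 30 seconds ago after 300 s − 30 s = 270 s; each case allows ±5 s of tolerance.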
