Skip to content

Commit 91efaf0

Browse files
committed
fix: Runtime Operator Resiliency Pass
Signed-off-by: Lucas Fontes <[email protected]>
1 parent 9d380b4 commit 91efaf0

File tree

6 files changed

+49
-8
lines changed

6 files changed

+49
-8
lines changed

runtime-operator/api/condition/conditioned_reconciler.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ type ReconcilerContext struct {
166166

167167
const ctxKeyReconcilerContext = ctxKey("reconciler-context")
168168

169+
// GetReconcilerContext retrieves the ReconcilerContext from the context.
169170
func GetReconcilerContext(ctx context.Context) *ReconcilerContext {
170171
if v, ok := ctx.Value(ctxKeyReconcilerContext).(*ReconcilerContext); ok {
171172
return v
@@ -179,6 +180,11 @@ func ForceStatusUpdate(ctx context.Context) {
179180
GetReconcilerContext(ctx).ForceUpdate = true
180181
}
181182

183+
// ForceRequeue forces an immediate requeue, regardless if status has changed.
184+
func ForceRequeue(ctx context.Context) {
185+
GetReconcilerContext(ctx).ForceRequeue = true
186+
}
187+
182188
// Reconcile reconciles the object with the given request.
183189
// implements the `AnyConditionedReconciler` and `reconcile.Reconciler` interfaces.
184190
func (r *ConditionedReconciler[T]) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
@@ -203,6 +209,8 @@ func (r *ConditionedReconciler[T]) Reconcile(ctx context.Context, req reconcile.
203209
}
204210
}
205211

212+
originalObject := obj.DeepCopyObject().(T)
213+
206214
reconcilerCtx := &ReconcilerContext{
207215
ReconcileInterval: r.interval,
208216
}
@@ -226,7 +234,6 @@ func (r *ConditionedReconciler[T]) Reconcile(ctx context.Context, req reconcile.
226234

227235
condErr := condEntry.fn(condCtx, obj)
228236
if condErr != nil {
229-
reconcilerCtx.ForceRequeue = true
230237
if errors.Is(condErr, errStatusUnknown) {
231238
conditions.SetConditions(UnknownCondition(condEntry.condition, "Reconcile", condErr.Error()))
232239
} else if errors.Is(condErr, errNoop) {
@@ -255,7 +262,7 @@ func (r *ConditionedReconciler[T]) Reconcile(ctx context.Context, req reconcile.
255262
}
256263

257264
if reconcilerCtx.ForceUpdate {
258-
if err := r.client.Status().Update(ctx, obj); err != nil {
265+
if err := r.client.Status().Patch(ctx, obj, client.MergeFrom(originalObject)); err != nil {
259266
return reconcile.Result{}, err
260267
}
261268
}
@@ -272,7 +279,7 @@ func (r *ConditionedReconciler[T]) Reconcile(ctx context.Context, req reconcile.
272279
}
273280

274281
if reconcilerCtx.ForceRequeue {
275-
return reconcile.Result{Requeue: true}, nil
282+
return reconcile.Result{RequeueAfter: time.Second}, nil
276283
}
277284

278285
return reconcile.Result{RequeueAfter: reconcilerCtx.ReconcileInterval}, nil

runtime-operator/internal/controller/runtime/host_client.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import (
1212
"google.golang.org/protobuf/types/known/emptypb"
1313
)
1414

15+
// HostRoundtripTimeout is the max timeout for host RPC calls.
16+
// Callers can set lower context timeouts as needed.
1517
const HostRoundtripTimeout = 1 * time.Minute
1618

1719
func NewWashHostClient(bus wasmbus.Bus, hostID string) *WashHostClient {

runtime-operator/internal/controller/runtime/host_controller.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
)
2222

2323
const (
24+
hostHeartbeatTimeout = 5 * time.Second
2425
hostReconcileInterval = 1 * time.Minute
2526
)
2627

@@ -43,7 +44,7 @@ func (r *HostReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
4344
func (r *HostReconciler) reconcileReporting(ctx context.Context, host *runtimev1alpha1.Host) error {
4445
client := NewWashHostClient(r.Bus, host.HostID)
4546

46-
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
47+
ctx, cancel := context.WithTimeout(ctx, hostHeartbeatTimeout)
4748
defer cancel()
4849

4950
heartbeat, err := client.Heartbeat(ctx)

runtime-operator/internal/controller/runtime/workload_controller.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@ import (
1818
)
1919

2020
const (
21-
workloadReconcileInterval = 1 * time.Minute
21+
workloadReconcileInterval = 30 * time.Second
22+
workloadStartTimeout = 60 * time.Second
23+
workloadStatusTimeout = 5 * time.Second
24+
workloadStopTimeout = 30 * time.Second
2225
workloadFinalizerName = "runtime.wasmcloud.dev/workload-finalizer"
2326
workloadSchedulableHostsIndex = "spec.isSchedulable"
2427
)
@@ -241,6 +244,9 @@ func (r *WorkloadReconciler) reconcilePlacement(ctx context.Context, workload *r
241244
}
242245

243246
client := NewWashHostClient(r.Bus, workload.Status.HostID)
247+
ctx, cancel := context.WithTimeout(ctx, workloadStartTimeout)
248+
defer cancel()
249+
244250
resp, err := client.Start(ctx, req)
245251
if err != nil {
246252
return err
@@ -262,6 +268,9 @@ func (r *WorkloadReconciler) reconcileSync(ctx context.Context, workload *runtim
262268
WorkloadId: workload.Status.WorkloadID,
263269
}
264270

271+
ctx, cancel := context.WithTimeout(ctx, workloadStatusTimeout)
272+
defer cancel()
273+
265274
resp, err := client.Status(ctx, req)
266275
if err != nil {
267276
return err
@@ -293,6 +302,9 @@ func (r *WorkloadReconciler) finalize(ctx context.Context, workload *runtimev1al
293302
WorkloadId: workload.Status.WorkloadID,
294303
}
295304

305+
ctx, cancel := context.WithTimeout(ctx, workloadStopTimeout)
306+
defer cancel()
307+
296308
_, err := client.Stop(ctx, req)
297309
if err != nil {
298310
logger := ctrl.LoggerFrom(ctx)

runtime-operator/internal/controller/runtime/workload_deployment_controller.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,5 +305,16 @@ func resolveArtifacts(ctx context.Context, kubeClient client.Client, namespace s
305305
tpl.Spec.Components[i] = comp
306306
}
307307

308+
if tpl.Spec.Service != nil {
309+
if strings.HasPrefix(tpl.Spec.Service.Image, "artifact://") {
310+
artifactName := strings.TrimPrefix(tpl.Spec.Service.Image, "artifact://")
311+
artifact, ok := artifactMap[artifactName]
312+
if !ok {
313+
return fmt.Errorf("artifact %s not found in deployment spec", artifactName)
314+
}
315+
tpl.Spec.Service.Image = artifact.Status.ArtifactURL
316+
}
317+
}
318+
308319
return nil
309320
}

runtime-operator/internal/controller/runtime/workload_replicaset_controller.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -237,13 +237,21 @@ func (r *WorkloadReplicaSetReconciler) cleanupUnhealthyWorkloads(ctx context.Con
237237
continue
238238
}
239239

240+
// catches workloads that are still syncing
241+
// they might have not been placed or sync failing. in most cases host is down/gone.
242+
240243
syncStatus := workload.Status.GetCondition(runtimev1alpha1.WorkloadConditionSync)
241-
// catches unknown & true sync status
242-
if syncStatus.Status != condition.ConditionFalse {
244+
if syncStatus.Status == condition.ConditionTrue {
243245
continue
244246
}
247+
248+
// this was the first failure, give it some time to recover
249+
if syncStatus.LastTransitionTime.IsZero() {
250+
continue
251+
}
252+
245253
// If the workload has failed recently, give it some time to recover
246-
if syncStatus.LastProbeTime.Time.Add(workloadReplicaSetReplicaGracePeriod).After(time.Now()) {
254+
if syncStatus.LastTransitionTime.Time.Add(workloadReplicaSetReplicaGracePeriod).After(time.Now()) {
247255
continue
248256
}
249257
// At this point, the workload is not ready, has failed, and has not recovered within the grace period

0 commit comments

Comments
 (0)