Skip to content

Commit 188cb38

Browse files
committed
fix: add scaling adapter
Signed-off-by: Julien Mancuso <[email protected]>
1 parent 7ec54a6 commit 188cb38

14 files changed

+299
-49
lines changed

deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10189,8 +10189,12 @@ spec:
1018910189
type: integer
1019010190
type: object
1019110191
replicas:
10192-
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
10192+
description: |-
10193+
Replicas is the desired number of Pods for this component.
10194+
When scalingAdapter is enabled (default), this field is managed by the
10195+
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
1019310196
format: int32
10197+
minimum: 0
1019410198
type: integer
1019510199
resources:
1019610200
description: |-
@@ -10269,6 +10273,20 @@ spec:
1026910273
type: string
1027010274
type: object
1027110275
type: object
10276+
scalingAdapter:
10277+
description: |-
10278+
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
10279+
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
10280+
the service using the Scale subresource. When disabled, replicas can be modified directly.
10281+
properties:
10282+
disable:
10283+
default: false
10284+
description: |-
10285+
Disable indicates whether the ScalingAdapter should be disabled for this service.
10286+
When false (default), a DGDSA is created and owns the replicas field.
10287+
When true, no DGDSA is created and replicas can be modified directly in the DGD.
10288+
type: boolean
10289+
type: object
1027210290
serviceName:
1027310291
description: The name of the component
1027410292
type: string

deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10324,8 +10324,12 @@ spec:
1032410324
type: integer
1032510325
type: object
1032610326
replicas:
10327-
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
10327+
description: |-
10328+
Replicas is the desired number of Pods for this component.
10329+
When scalingAdapter is enabled (default), this field is managed by the
10330+
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
1032810331
format: int32
10332+
minimum: 0
1032910333
type: integer
1033010334
resources:
1033110335
description: |-
@@ -10404,6 +10408,20 @@ spec:
1040410408
type: string
1040510409
type: object
1040610410
type: object
10411+
scalingAdapter:
10412+
description: |-
10413+
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
10414+
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
10415+
the service using the Scale subresource. When disabled, replicas can be modified directly.
10416+
properties:
10417+
disable:
10418+
default: false
10419+
description: |-
10420+
Disable indicates whether the ScalingAdapter should be disabled for this service.
10421+
When false (default), a DGDSA is created and owns the replicas field.
10422+
When true, no DGDSA is created and replicas can be modified directly in the DGD.
10423+
type: boolean
10424+
type: object
1040710425
serviceName:
1040810426
description: The name of the component
1040910427
type: string

deploy/cloud/operator/api/v1alpha1/common.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,15 @@ type ExtraPodSpec struct {
123123
*corev1.PodSpec `json:",inline"`
124124
MainContainer *corev1.Container `json:"mainContainer,omitempty"`
125125
}
126+
127+
// ScalingAdapter is the per-service switch for the DynamoGraphDeploymentScalingAdapter
// (DGDSA). The adapter is on unless Disable is set: in that mode the DGDSA owns the
// replicas field, and external autoscalers (HPA, KEDA, Planner) drive scaling through
// the Scale subresource.
type ScalingAdapter struct {
	// Disable indicates whether the ScalingAdapter should be disabled for this service.
	// When false (default), a DGDSA is created and owns the replicas field.
	// When true, no DGDSA is created and replicas can be modified directly in the DGD.
	// +optional
	// +kubebuilder:default=false
	Disable bool `json:"disable,omitempty"`
}

deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,18 @@ type DynamoComponentDeploymentSharedSpec struct {
110110
LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty"`
111111
// ReadinessProbe to signal when the container is ready to receive traffic.
112112
ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"`
113-
// Replicas is the desired number of Pods for this component when autoscaling is not used.
113+
// Replicas is the desired number of Pods for this component.
114+
// When scalingAdapter is enabled (default), this field is managed by the
115+
// DynamoGraphDeploymentScalingAdapter and should not be modified directly.
116+
// +kubebuilder:validation:Minimum=0
114117
Replicas *int32 `json:"replicas,omitempty"`
115118
// Multinode is the configuration for multinode components.
116119
Multinode *MultinodeSpec `json:"multinode,omitempty"`
120+
// ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
121+
// When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
122+
// the service using the Scale subresource. When disabled, replicas can be modified directly.
123+
// +optional
124+
ScalingAdapter *ScalingAdapter `json:"scalingAdapter,omitempty"`
117125
}
118126

119127
type MultinodeSpec struct {

deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10189,8 +10189,12 @@ spec:
1018910189
type: integer
1019010190
type: object
1019110191
replicas:
10192-
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
10192+
description: |-
10193+
Replicas is the desired number of Pods for this component.
10194+
When scalingAdapter is enabled (default), this field is managed by the
10195+
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
1019310196
format: int32
10197+
minimum: 0
1019410198
type: integer
1019510199
resources:
1019610200
description: |-
@@ -10269,6 +10273,20 @@ spec:
1026910273
type: string
1027010274
type: object
1027110275
type: object
10276+
scalingAdapter:
10277+
description: |-
10278+
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
10279+
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
10280+
the service using the Scale subresource. When disabled, replicas can be modified directly.
10281+
properties:
10282+
disable:
10283+
default: false
10284+
description: |-
10285+
Disable indicates whether the ScalingAdapter should be disabled for this service.
10286+
When false (default), a DGDSA is created and owns the replicas field.
10287+
When true, no DGDSA is created and replicas can be modified directly in the DGD.
10288+
type: boolean
10289+
type: object
1027210290
serviceName:
1027310291
description: The name of the component
1027410292
type: string

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10324,8 +10324,12 @@ spec:
1032410324
type: integer
1032510325
type: object
1032610326
replicas:
10327-
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
10327+
description: |-
10328+
Replicas is the desired number of Pods for this component.
10329+
When scalingAdapter is enabled (default), this field is managed by the
10330+
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
1032810331
format: int32
10332+
minimum: 0
1032910333
type: integer
1033010334
resources:
1033110335
description: |-
@@ -10404,6 +10408,20 @@ spec:
1040410408
type: string
1040510409
type: object
1040610410
type: object
10411+
scalingAdapter:
10412+
description: |-
10413+
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
10414+
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
10415+
the service using the Scale subresource. When disabled, replicas can be modified directly.
10416+
properties:
10417+
disable:
10418+
default: false
10419+
description: |-
10420+
Disable indicates whether the ScalingAdapter should be disabled for this service.
10421+
When false (default), a DGDSA is created and owns the replicas field.
10422+
When true, no DGDSA is created and replicas can be modified directly in the DGD.
10423+
type: boolean
10424+
type: object
1040710425
serviceName:
1040810426
description: The name of the component
1040910427
type: string

deploy/cloud/operator/internal/webhook/common.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ package webhook
1919

2020
import (
2121
"context"
22+
"strings"
2223

24+
authenticationv1 "k8s.io/api/authentication/v1"
2325
"k8s.io/apimachinery/pkg/runtime"
2426
"sigs.k8s.io/controller-runtime/pkg/client"
2527
logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -118,3 +120,54 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool {
118120

119121
return false
120122
}
123+
124+
// DGDReplicasModifierSuffixes lists the service-account name suffixes that are
// permitted to change DGD replicas while the scaling adapter is enabled.
// A service account whose name ends with any entry is authorized, whatever
// namespace it belongs to.
var DGDReplicasModifierSuffixes = []string{
	// Dynamo operator controller manager, which handles DGDSA reconciliation,
	// e.g. "dynamo-platform-dynamo-operator-controller-manager".
	"-dynamo-operator-controller-manager",

	// Planner service account, which manages DGD replicas for autoscaling,
	// e.g. "planner-serviceaccount".
	"planner-serviceaccount",
}
136+
137+
// CanModifyDGDReplicas checks if the request comes from a service account authorized
138+
// to modify DGD replicas when scaling adapter is enabled.
139+
// Service accounts are identified by username format: system:serviceaccount:<namespace>:<name>
140+
//
141+
// Authorized service accounts (by suffix):
142+
// - *-dynamo-operator-controller-manager (for DGDSA reconciliation)
143+
// - *planner-serviceaccount (for Planner autoscaling)
144+
func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool {
145+
username := userInfo.Username
146+
147+
// Service accounts have username format: system:serviceaccount:<namespace>:<name>
148+
if !strings.HasPrefix(username, "system:serviceaccount:") {
149+
return false
150+
}
151+
152+
// Parse: system:serviceaccount:<namespace>:<name>
153+
parts := strings.Split(username, ":")
154+
if len(parts) != 4 {
155+
return false
156+
}
157+
158+
namespace := parts[2]
159+
saName := parts[3]
160+
161+
// Check against authorized suffixes
162+
for _, suffix := range DGDReplicasModifierSuffixes {
163+
if strings.HasSuffix(saName, suffix) {
164+
webhookCommonLog.V(1).Info("allowing DGD replicas modification",
165+
"serviceAccount", saName,
166+
"namespace", namespace,
167+
"matchedSuffix", suffix)
168+
return true
169+
}
170+
}
171+
172+
return false
173+
}

deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,10 @@ func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoC
4242
func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
4343
// Validate shared spec fields using SharedSpecValidator
4444
sharedValidator := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")
45-
if err := sharedValidator.Validate(); err != nil {
46-
return nil, err
47-
}
48-
49-
// Collect deprecation warnings
50-
warnings := sharedValidator.GetWarnings()
5145

5246
// DCD-specific validation would go here (currently none)
5347

54-
return warnings, nil
48+
return sharedValidator.Validate()
5549
}
5650

5751
// ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment.

deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go

Lines changed: 69 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"fmt"
2323

2424
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
25+
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
26+
authenticationv1 "k8s.io/api/authentication/v1"
2527
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
2628
)
2729

@@ -68,29 +70,88 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error)
6870
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment.
6971
// Returns warnings and error.
7072
func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) {
73+
return v.ValidateUpdateWithUserInfo(old, nil)
74+
}
75+
76+
// ValidateUpdateWithUserInfo performs stateful validation with user identity checking.
77+
// When userInfo is provided, it validates that only allowed controllers can modify
78+
// replicas for services with scaling adapter enabled.
79+
// Returns warnings and error.
80+
func (v *DynamoGraphDeploymentValidator) ValidateUpdateWithUserInfo(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) {
7181
// Validate that BackendFramework is not changed (immutable)
7282
if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework {
7383
warning := "Changing spec.backendFramework may cause unexpected behavior"
7484
return admission.Warnings{warning}, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
7585
}
7686

87+
// Validate replicas changes for services with scaling adapter enabled
88+
if userInfo != nil {
89+
if err := v.validateReplicasChanges(old, *userInfo); err != nil {
90+
return nil, err
91+
}
92+
}
93+
7794
return nil, nil
7895
}
7996

97+
// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled.
98+
// Only authorized service accounts (operator controller, planner) can modify these fields.
99+
func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo authenticationv1.UserInfo) error {
100+
// If the request comes from an authorized service account, allow the change
101+
if internalwebhook.CanModifyDGDReplicas(userInfo) {
102+
return nil
103+
}
104+
105+
var errs []error
106+
107+
for serviceName, newService := range v.deployment.Spec.Services {
108+
// Check if scaling adapter is enabled for this service (enabled by default)
109+
scalingAdapterEnabled := true
110+
if newService.ScalingAdapter != nil && newService.ScalingAdapter.Disable {
111+
scalingAdapterEnabled = false
112+
}
113+
114+
if !scalingAdapterEnabled {
115+
// Scaling adapter is disabled, users can modify replicas directly
116+
continue
117+
}
118+
119+
// Get old service (if exists)
120+
oldService, exists := old.Spec.Services[serviceName]
121+
if !exists {
122+
// New service, no comparison needed
123+
continue
124+
}
125+
126+
// Check if replicas changed
127+
oldReplicas := int32(1) // default
128+
if oldService.Replicas != nil {
129+
oldReplicas = *oldService.Replicas
130+
}
131+
132+
newReplicas := int32(1) // default
133+
if newService.Replicas != nil {
134+
newReplicas = *newService.Replicas
135+
}
136+
137+
if oldReplicas != newReplicas {
138+
errs = append(errs, fmt.Errorf(
139+
"spec.services[%s].replicas cannot be modified directly when scaling adapter is enabled; "+
140+
"use 'kubectl scale dgdsa/%s-%s --replicas=%d' or update the DynamoGraphDeploymentScalingAdapter instead",
141+
serviceName, v.deployment.Name, serviceName, newReplicas))
142+
}
143+
}
144+
145+
return errors.Join(errs...)
146+
}
147+
80148
// validateService validates a single service configuration using SharedSpecValidator.
81149
// Returns warnings and error.
82150
func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) {
83151
// Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec)
84152
fieldPath := fmt.Sprintf("spec.services[%s]", serviceName)
85153
sharedValidator := NewSharedSpecValidator(service, fieldPath)
86-
87-
if err := sharedValidator.Validate(); err != nil {
88-
return nil, err
89-
}
90-
91-
// Collect deprecation warnings
92-
warnings := sharedValidator.GetWarnings()
93-
return warnings, nil
154+
return sharedValidator.Validate()
94155
}
95156

96157
// validatePVCs validates the PVC configurations.

0 commit comments

Comments
 (0)