diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml index 558a5b973d..c90e3bdfe7 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml @@ -77,12 +77,13 @@ spec: (such as Pod, Service, and Ingress when applicable). type: object autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. properties: behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). + description: 'Deprecated: This field is ignored.' properties: scaleDown: description: |- @@ -231,10 +232,13 @@ spec: type: object type: object enabled: + description: 'Deprecated: This field is ignored.' type: boolean maxReplicas: + description: 'Deprecated: This field is ignored.' type: integer metrics: + description: 'Deprecated: This field is ignored.' items: description: |- MetricSpec specifies how to scale based on a single metric @@ -665,6 +669,7 @@ spec: type: object type: array minReplicas: + description: 'Deprecated: This field is ignored.' type: integer type: object backendFramework: @@ -10184,8 +10189,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10264,6 +10273,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. + type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml index ba2b19fef9..4db1e902b8 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml @@ -219,12 +219,13 @@ spec: (such as Pod, Service, and Ingress when applicable). type: object autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). + description: |- + Deprecated: This field is deprecated and ignored. 
Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. properties: behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). + description: 'Deprecated: This field is ignored.' properties: scaleDown: description: |- @@ -373,10 +374,13 @@ spec: type: object type: object enabled: + description: 'Deprecated: This field is ignored.' type: boolean maxReplicas: + description: 'Deprecated: This field is ignored.' type: integer metrics: + description: 'Deprecated: This field is ignored.' items: description: |- MetricSpec specifies how to scale based on a single metric @@ -807,6 +811,7 @@ spec: type: object type: array minReplicas: + description: 'Deprecated: This field is ignored.' type: integer type: object componentType: @@ -10319,8 +10324,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10399,6 +10408,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. + type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml new file mode 100644 index 0000000000..f822bb91db --- /dev/null +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
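+#
+# Illustrative usage once this CRD is installed (the "dgdsa" short name is
+# declared below; the adapter name shown is hypothetical):
+#   kubectl get dgdsa -n my-namespace
+#   kubectl scale dgdsa my-dgd-decode --replicas=3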
+ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: dynamographdeploymentscalingadapters.nvidia.com +spec: + group: nvidia.com + names: + kind: DynamoGraphDeploymentScalingAdapter + listKind: DynamoGraphDeploymentScalingAdapterList + plural: dynamographdeploymentscalingadapters + shortNames: + - dgdsa + singular: dynamographdeploymentscalingadapter + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: DynamoGraphDeployment name + jsonPath: .spec.dgdRef.name + name: DGD + type: string + - description: Service name + jsonPath: .spec.dgdRef.serviceName + name: SERVICE + type: string + - description: Current replicas + jsonPath: .status.replicas + name: REPLICAS + type: integer + - jsonPath: .metadata.creationTimestamp + name: AGE + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services + within a DynamoGraphDeployment. It implements the Kubernetes scale + subresource, enabling integration with HPA, KEDA, and custom autoscalers. + + The adapter acts as an intermediary between autoscalers and the DGD, + ensuring that only the adapter controller modifies the DGD's service replicas. + This prevents conflicts when multiple autoscaling mechanisms are in play. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter + properties: + dgdRef: + description: DGDRef references the DynamoGraphDeployment and the specific service to scale. + properties: + name: + description: Name of the DynamoGraphDeployment + minLength: 1 + type: string + serviceName: + description: ServiceName is the key name of the service within the DGD's spec.services map to scale + minLength: 1 + type: string + required: + - name + - serviceName + type: object + replicas: + description: |- + Replicas is the desired number of replicas for the target service. + This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. + format: int32 + minimum: 0 + type: integer + required: + - dgdRef + - replicas + type: object + status: + description: DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter + properties: + lastScaleTime: + description: LastScaleTime is the last time the adapter scaled the target service. + format: date-time + type: string + replicas: + description: |- + Replicas is the current number of replicas for the target service. + This is synced from the DGD's service replicas and is required for the scale subresource. 
+ format: int32 + type: integer + selector: + description: |- + Selector is a label selector string for the pods managed by this adapter. + Required for HPA compatibility via the scale subresource. + type: string + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} diff --git a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml index 8ab42c0988..7ae1eb6c5d 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml @@ -369,6 +369,7 @@ rules: - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments + - dynamographdeploymentscalingadapters - dynamomodels verbs: - create @@ -393,6 +394,7 @@ rules: - dynamocomponentdeployments/status - dynamographdeploymentrequests/status - dynamographdeployments/status + - dynamographdeploymentscalingadapters/status - dynamomodels/status verbs: - get diff --git a/deploy/cloud/operator/api/v1alpha1/common.go b/deploy/cloud/operator/api/v1alpha1/common.go index 5673fd5cfd..b68dd818c0 100644 --- a/deploy/cloud/operator/api/v1alpha1/common.go +++ b/deploy/cloud/operator/api/v1alpha1/common.go @@ -53,12 +53,20 @@ type VolumeMount struct { UseAsCompilationCache bool `json:"useAsCompilationCache,omitempty"` } +// Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter +// with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md +// for migration guidance. This field will be removed in a future API version. type Autoscaling struct { - Enabled bool `json:"enabled,omitempty"` - MinReplicas int `json:"minReplicas,omitempty"` - MaxReplicas int `json:"maxReplicas,omitempty"` - Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` - Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` + // Deprecated: This field is ignored. + Enabled bool `json:"enabled,omitempty"` + // Deprecated: This field is ignored. + MinReplicas int `json:"minReplicas,omitempty"` + // Deprecated: This field is ignored. + MaxReplicas int `json:"maxReplicas,omitempty"` + // Deprecated: This field is ignored. + Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` + // Deprecated: This field is ignored. + Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` } type SharedMemorySpec struct { @@ -115,3 +123,15 @@ type ExtraPodSpec struct { *corev1.PodSpec `json:",inline"` MainContainer *corev1.Container `json:"mainContainer,omitempty"` } + +// ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter +// for replica management. When enabled (default), the DGDSA owns the replicas field and +// external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource. +type ScalingAdapter struct { + // Disable indicates whether the ScalingAdapter should be disabled for this service. + // When false (default), a DGDSA is created and owns the replicas field. + // When true, no DGDSA is created and replicas can be modified directly in the DGD. 
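+	//
+	// Minimal sketch of opting a service out of adapter-managed scaling in a
+	// DGD manifest (the service key "decode" is illustrative):
+	//
+	//	services:
+	//	  decode:
+	//	    replicas: 3
+	//	    scalingAdapter:
+	//	      disable: true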
+ // +optional + // +kubebuilder:default=false + Disable bool `json:"disable,omitempty"` +} diff --git a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go index 8f484057ab..8a2abb78f2 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go @@ -74,7 +74,9 @@ type DynamoComponentDeploymentSharedSpec struct { // Resources requested and limits for this component, including CPU, memory, // GPUs/devices, and any runtime-specific resources. Resources *Resources `json:"resources,omitempty"` - // Autoscaling config for this component (replica range, target utilization, etc.). + // Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + // with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + // for migration guidance. This field will be removed in a future API version. Autoscaling *Autoscaling `json:"autoscaling,omitempty"` // Envs defines additional environment variables to inject into the component containers. Envs []corev1.EnvVar `json:"envs,omitempty"` @@ -108,10 +110,18 @@ type DynamoComponentDeploymentSharedSpec struct { LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty"` // ReadinessProbe to signal when the container is ready to receive traffic. ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"` - // Replicas is the desired number of Pods for this component when autoscaling is not used. + // Replicas is the desired number of Pods for this component. + // When scalingAdapter is enabled (default), this field is managed by the + // DynamoGraphDeploymentScalingAdapter and should not be modified directly. + // +kubebuilder:validation:Minimum=0 Replicas *int32 `json:"replicas,omitempty"` // Multinode is the configuration for multinode components. Multinode *MultinodeSpec `json:"multinode,omitempty"` + // ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + // When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + // the service using the Scale subresource. When disabled, replicas can be modified directly. + // +optional + ScalingAdapter *ScalingAdapter `json:"scalingAdapter,omitempty"` } type MultinodeSpec struct { diff --git a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go new file mode 100644 index 0000000000..d4da1a0ccf --- /dev/null +++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter +type DynamoGraphDeploymentScalingAdapterSpec struct { + // Replicas is the desired number of replicas for the target service. + // This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Minimum=0 + Replicas int32 `json:"replicas"` + + // DGDRef references the DynamoGraphDeployment and the specific service to scale. + // +kubebuilder:validation:Required + DGDRef DynamoGraphDeploymentServiceRef `json:"dgdRef"` +} + +// DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment +type DynamoGraphDeploymentServiceRef struct { + // Name of the DynamoGraphDeployment + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Name string `json:"name"` + + // ServiceName is the key name of the service within the DGD's spec.services map to scale + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + ServiceName string `json:"serviceName"` +} + +// DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter +type DynamoGraphDeploymentScalingAdapterStatus struct { + // Replicas is the current number of replicas for the target service. + // This is synced from the DGD's service replicas and is required for the scale subresource. + // +optional + Replicas int32 `json:"replicas,omitempty"` + + // Selector is a label selector string for the pods managed by this adapter. + // Required for HPA compatibility via the scale subresource. + // +optional + Selector string `json:"selector,omitempty"` + + // LastScaleTime is the last time the adapter scaled the target service. + // +optional + LastScaleTime *metav1.Time `json:"lastScaleTime,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector +// +kubebuilder:printcolumn:name="DGD",type="string",JSONPath=".spec.dgdRef.name",description="DynamoGraphDeployment name" +// +kubebuilder:printcolumn:name="SERVICE",type="string",JSONPath=".spec.dgdRef.serviceName",description="Service name" +// +kubebuilder:printcolumn:name="REPLICAS",type="integer",JSONPath=".status.replicas",description="Current replicas" +// +kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:resource:shortName={dgdsa} + +// DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services +// within a DynamoGraphDeployment. It implements the Kubernetes scale +// subresource, enabling integration with HPA, KEDA, and custom autoscalers. +// +// The adapter acts as an intermediary between autoscalers and the DGD, +// ensuring that only the adapter controller modifies the DGD's service replicas. +// This prevents conflicts when multiple autoscaling mechanisms are in play. 
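+//
+// Hedged sketch (names illustrative): because the adapter exposes the scale
+// subresource, a standard HPA can target it directly:
+//
+//	apiVersion: autoscaling/v2
+//	kind: HorizontalPodAutoscaler
+//	metadata:
+//	  name: decode-hpa
+//	spec:
+//	  scaleTargetRef:
+//	    apiVersion: nvidia.com/v1alpha1
+//	    kind: DynamoGraphDeploymentScalingAdapter
+//	    name: my-dgd-decode
+//	  minReplicas: 1
+//	  maxReplicas: 8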
+type DynamoGraphDeploymentScalingAdapter struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec DynamoGraphDeploymentScalingAdapterSpec `json:"spec,omitempty"` + Status DynamoGraphDeploymentScalingAdapterStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// DynamoGraphDeploymentScalingAdapterList contains a list of DynamoGraphDeploymentScalingAdapter +type DynamoGraphDeploymentScalingAdapterList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []DynamoGraphDeploymentScalingAdapter `json:"items"` +} + +func init() { + SchemeBuilder.Register(&DynamoGraphDeploymentScalingAdapter{}, &DynamoGraphDeploymentScalingAdapterList{}) +} diff --git a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go index 56d33cd498..d3ecbb44ec 100644 --- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -371,6 +371,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent *out = new(MultinodeSpec) **out = **in } + if in.ScalingAdapter != nil { + in, out := &in.ScalingAdapter, &out.ScalingAdapter + *out = new(ScalingAdapter) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec. @@ -599,6 +604,115 @@ func (in *DynamoGraphDeploymentRequestStatus) DeepCopy() *DynamoGraphDeploymentR return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentScalingAdapter) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapter) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapter. +func (in *DynamoGraphDeploymentScalingAdapter) DeepCopy() *DynamoGraphDeploymentScalingAdapter { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapter) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DynamoGraphDeploymentScalingAdapter) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]DynamoGraphDeploymentScalingAdapter, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterList. +func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopy() *DynamoGraphDeploymentScalingAdapterList { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapterList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentScalingAdapterSpec) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterSpec) { + *out = *in + out.DGDRef = in.DGDRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterSpec. +func (in *DynamoGraphDeploymentScalingAdapterSpec) DeepCopy() *DynamoGraphDeploymentScalingAdapterSpec { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapterSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentScalingAdapterStatus) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterStatus) { + *out = *in + if in.LastScaleTime != nil { + in, out := &in.LastScaleTime, &out.LastScaleTime + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterStatus. +func (in *DynamoGraphDeploymentScalingAdapterStatus) DeepCopy() *DynamoGraphDeploymentScalingAdapterStatus { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapterStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentServiceRef) DeepCopyInto(out *DynamoGraphDeploymentServiceRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentServiceRef. +func (in *DynamoGraphDeploymentServiceRef) DeepCopy() *DynamoGraphDeploymentServiceRef { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentServiceRef) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DynamoGraphDeploymentSpec) DeepCopyInto(out *DynamoGraphDeploymentSpec) { *out = *in @@ -1085,6 +1199,21 @@ func (in *Resources) DeepCopy() *Resources { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScalingAdapter) DeepCopyInto(out *ScalingAdapter) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScalingAdapter. +func (in *ScalingAdapter) DeepCopy() *ScalingAdapter { + if in == nil { + return nil + } + out := new(ScalingAdapter) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) { *out = *in diff --git a/deploy/cloud/operator/cmd/main.go b/deploy/cloud/operator/cmd/main.go index 4d79cfe3f0..dc1a33b262 100644 --- a/deploy/cloud/operator/cmd/main.go +++ b/deploy/cloud/operator/cmd/main.go @@ -578,6 +578,16 @@ func main() { os.Exit(1) } + if err = (&controller.DynamoGraphDeploymentScalingAdapterReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("dgdscalingadapter"), + Config: ctrlConfig, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "DGDScalingAdapter") + os.Exit(1) + } + if err = (&controller.DynamoGraphDeploymentRequestReconciler{ Client: mgr.GetClient(), Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"), diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml index 558a5b973d..c90e3bdfe7 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml @@ -77,12 +77,13 @@ spec: (such as Pod, Service, and Ingress when applicable). type: object autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. properties: behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). + description: 'Deprecated: This field is ignored.' properties: scaleDown: description: |- @@ -231,10 +232,13 @@ spec: type: object type: object enabled: + description: 'Deprecated: This field is ignored.' type: boolean maxReplicas: + description: 'Deprecated: This field is ignored.' type: integer metrics: + description: 'Deprecated: This field is ignored.' items: description: |- MetricSpec specifies how to scale based on a single metric @@ -665,6 +669,7 @@ spec: type: object type: array minReplicas: + description: 'Deprecated: This field is ignored.' type: integer type: object backendFramework: @@ -10184,8 +10189,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10264,6 +10273,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. 
+ When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. + type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml index ba2b19fef9..4db1e902b8 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml @@ -219,12 +219,13 @@ spec: (such as Pod, Service, and Ingress when applicable). type: object autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. properties: behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). + description: 'Deprecated: This field is ignored.' properties: scaleDown: description: |- @@ -373,10 +374,13 @@ spec: type: object type: object enabled: + description: 'Deprecated: This field is ignored.' type: boolean maxReplicas: + description: 'Deprecated: This field is ignored.' type: integer metrics: + description: 'Deprecated: This field is ignored.' items: description: |- MetricSpec specifies how to scale based on a single metric @@ -807,6 +811,7 @@ spec: type: object type: array minReplicas: + description: 'Deprecated: This field is ignored.' type: integer type: object componentType: @@ -10319,8 +10324,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10399,6 +10408,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. 
+ type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml new file mode 100644 index 0000000000..f822bb91db --- /dev/null +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: dynamographdeploymentscalingadapters.nvidia.com +spec: + group: nvidia.com + names: + kind: DynamoGraphDeploymentScalingAdapter + listKind: DynamoGraphDeploymentScalingAdapterList + plural: dynamographdeploymentscalingadapters + shortNames: + - dgdsa + singular: dynamographdeploymentscalingadapter + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: DynamoGraphDeployment name + jsonPath: .spec.dgdRef.name + name: DGD + type: string + - description: Service name + jsonPath: .spec.dgdRef.serviceName + name: SERVICE + type: string + - description: Current replicas + jsonPath: .status.replicas + name: REPLICAS + type: integer + - jsonPath: .metadata.creationTimestamp + name: AGE + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services + within a DynamoGraphDeployment. It implements the Kubernetes scale + subresource, enabling integration with HPA, KEDA, and custom autoscalers. + + The adapter acts as an intermediary between autoscalers and the DGD, + ensuring that only the adapter controller modifies the DGD's service replicas. + This prevents conflicts when multiple autoscaling mechanisms are in play. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter + properties: + dgdRef: + description: DGDRef references the DynamoGraphDeployment and the specific service to scale. + properties: + name: + description: Name of the DynamoGraphDeployment + minLength: 1 + type: string + serviceName: + description: ServiceName is the key name of the service within the DGD's spec.services map to scale + minLength: 1 + type: string + required: + - name + - serviceName + type: object + replicas: + description: |- + Replicas is the desired number of replicas for the target service. + This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. + format: int32 + minimum: 0 + type: integer + required: + - dgdRef + - replicas + type: object + status: + description: DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter + properties: + lastScaleTime: + description: LastScaleTime is the last time the adapter scaled the target service. + format: date-time + type: string + replicas: + description: |- + Replicas is the current number of replicas for the target service. + This is synced from the DGD's service replicas and is required for the scale subresource. + format: int32 + type: integer + selector: + description: |- + Selector is a label selector string for the pods managed by this adapter. + Required for HPA compatibility via the scale subresource. + type: string + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} diff --git a/deploy/cloud/operator/config/rbac/role.yaml b/deploy/cloud/operator/config/rbac/role.yaml index b473aa1ad7..2a3a00c6f8 100644 --- a/deploy/cloud/operator/config/rbac/role.yaml +++ b/deploy/cloud/operator/config/rbac/role.yaml @@ -182,6 +182,7 @@ rules: - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments + - dynamographdeploymentscalingadapters - dynamomodels verbs: - create @@ -206,6 +207,7 @@ rules: - dynamocomponentdeployments/status - dynamographdeploymentrequests/status - dynamographdeployments/status + - dynamographdeploymentscalingadapters/status - dynamomodels/status verbs: - get diff --git a/deploy/cloud/operator/internal/consts/consts.go b/deploy/cloud/operator/internal/consts/consts.go index 882f9f18d9..6dd3bc0712 100644 --- a/deploy/cloud/operator/internal/consts/consts.go +++ b/deploy/cloud/operator/internal/consts/consts.go @@ -7,8 +7,6 @@ import ( ) const ( - HPACPUDefaultAverageUtilization = 80 - DefaultUserId = "default" DefaultOrgId = "default" diff --git a/deploy/cloud/operator/internal/controller/common.go b/deploy/cloud/operator/internal/controller/common.go index 70a70fdead..e41cbe1deb 100644 --- a/deploy/cloud/operator/internal/controller/common.go +++ b/deploy/cloud/operator/internal/controller/common.go @@ -53,3 +53,43 @@ type dockerSecretRetriever interface { // returns a list of secret names associated with the docker registry GetSecrets(namespace, registry string) ([]string, error) } + +// getServiceKeys returns the keys of the services map for logging purposes +func getServiceKeys(services map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec) []string { + keys := 
make([]string, 0, len(services)) + for k := range services { + keys = append(keys, k) + } + return keys +} + +// servicesEqual compares two services maps to detect changes in replica counts +func servicesEqual(old, new map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec) bool { + if len(old) != len(new) { + return false + } + + for key, oldSvc := range old { + newSvc, exists := new[key] + if !exists { + return false + } + + // Compare replicas + oldReplicas := int32(1) + if oldSvc.Replicas != nil { + oldReplicas = *oldSvc.Replicas + } + + newReplicas := int32(1) + if newSvc.Replicas != nil { + newReplicas = *newSvc.Replicas + } + + if oldReplicas != newReplicas { + return false + } + } + + return true +} diff --git a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go index 307bf7ac05..88d92e2f42 100644 --- a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go @@ -338,21 +338,6 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req } deployment = obj - - // create or update api-server hpa - modified_, _, err = commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*autoscalingv2.HorizontalPodAutoscaler, bool, error) { - return r.generateHPA(generateResourceOption{ - dynamoComponentDeployment: dynamoComponentDeployment, - }) - }) - if err != nil { - return ctrl.Result{}, err - } - - if modified_ { - modified = true - } - } // create or update api-server service @@ -1114,63 +1099,6 @@ type generateResourceOption struct { instanceID *int } -func (r *DynamoComponentDeploymentReconciler) generateHPA(opt generateResourceOption) (*autoscalingv2.HorizontalPodAutoscaler, bool, error) { - labels := r.getKubeLabels(opt.dynamoComponentDeployment) - - annotations := r.getKubeAnnotations(opt.dynamoComponentDeployment) - - kubeName := r.getKubeName(opt.dynamoComponentDeployment, false) - - kubeNs := opt.dynamoComponentDeployment.Namespace - - hpaConf := opt.dynamoComponentDeployment.Spec.Autoscaling - - kubeHpa := &autoscalingv2.HorizontalPodAutoscaler{ - ObjectMeta: metav1.ObjectMeta{ - Name: kubeName, - Namespace: kubeNs, - Labels: labels, - Annotations: annotations, - }, - } - - if hpaConf == nil || !hpaConf.Enabled { - // if hpa is not enabled, we need to delete the hpa - return kubeHpa, true, nil - } - - minReplica := int32(hpaConf.MinReplicas) - - kubeHpa.Spec = autoscalingv2.HorizontalPodAutoscalerSpec{ - MinReplicas: &minReplica, - MaxReplicas: int32(hpaConf.MaxReplicas), - ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ - APIVersion: "apps/v1", - Kind: "Deployment", - Name: kubeName, - }, - Metrics: hpaConf.Metrics, - } - - if len(kubeHpa.Spec.Metrics) == 0 { - averageUtilization := int32(commonconsts.HPACPUDefaultAverageUtilization) - kubeHpa.Spec.Metrics = []autoscalingv2.MetricSpec{ - { - Type: autoscalingv2.ResourceMetricSourceType, - Resource: &autoscalingv2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: autoscalingv2.MetricTarget{ - Type: autoscalingv2.UtilizationMetricType, - AverageUtilization: &averageUtilization, - }, - }, - }, - } - } - - return kubeHpa, false, nil -} - //nolint:gocyclo,nakedret func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx context.Context, opt generateResourceOption, role dynamo.Role) (podTemplateSpec 
*corev1.PodTemplateSpec, err error) { podLabels := r.getKubeLabels(opt.dynamoComponentDeployment) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 22dcdb5490..823818ac1e 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -86,6 +86,7 @@ type DynamoGraphDeploymentReconciler struct { // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch @@ -225,6 +226,13 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context return "", "", "", fmt.Errorf("failed to reconcile top-level PVCs: %w", err) } + // Reconcile DynamoGraphDeploymentScalingAdapters for each service + err = r.reconcileScalingAdapters(ctx, dynamoDeployment) + if err != nil { + logger.Error(err, "Failed to reconcile scaling adapters") + return "", "", "", fmt.Errorf("failed to reconcile scaling adapters: %w", err) + } + // Reconcile the SA, Role and RoleBinding if k8s discovery is enabled err = r.reconcileK8sDiscoveryResources(ctx, dynamoDeployment) if err != nil { @@ -607,6 +615,89 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn return nil } +// reconcileScalingAdapters ensures a DynamoGraphDeploymentScalingAdapter exists for each service in the DGD +// that has scaling adapter enabled (default). Services with scalingAdapter.disable=true will not have a DGDSA. +// This enables pluggable autoscaling via HPA, KEDA, or Planner. 
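+//
+// For example (names illustrative), a DGD named "my-dgd" with services
+// "Frontend" and "decode" yields adapters "my-dgd-frontend" and "my-dgd-decode";
+// see generateAdapterName below for the lowercasing rule.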
+func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { + logger := log.FromContext(ctx) + + // Process each service - SyncResource handles create, update, and delete via toDelete flag + for serviceName, component := range dynamoDeployment.Spec.Services { + // Check if scaling adapter is disabled for this service + scalingAdapterDisabled := component.ScalingAdapter != nil && component.ScalingAdapter.Disable + + // Get current replicas (default to 1 if not set) + currentReplicas := int32(1) + if component.Replicas != nil { + currentReplicas = *component.Replicas + } + + // Use SyncResource to handle creation/updates/deletion + // When toDelete=true, SyncResource will delete the existing resource if it exists + _, _, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter, bool, error) { + adapterName := generateAdapterName(dynamoDeployment.Name, serviceName) + adapter := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: adapterName, + Namespace: dynamoDeployment.Namespace, + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name, + consts.KubeLabelDynamoComponent: serviceName, + }, + }, + Spec: nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: currentReplicas, + DGDRef: nvidiacomv1alpha1.DynamoGraphDeploymentServiceRef{ + Name: dynamoDeployment.Name, + ServiceName: serviceName, + }, + }, + } + // Return toDelete=true if scaling adapter is disabled + return adapter, scalingAdapterDisabled, nil + }) + + if err != nil { + logger.Error(err, "Failed to sync DynamoGraphDeploymentScalingAdapter", "service", serviceName) + return err + } + } + + // Clean up adapters for services that were removed from DGD entirely + adapterList := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterList{} + if err := r.List(ctx, adapterList, + client.InNamespace(dynamoDeployment.Namespace), + client.MatchingLabels{consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name}, + ); err != nil { + logger.Error(err, "Failed to list DynamoGraphDeploymentScalingAdapters") + return err + } + + for i := range adapterList.Items { + adapter := &adapterList.Items[i] + serviceName := adapter.Spec.DGDRef.ServiceName + + // Delete adapter if service no longer exists in DGD + if _, exists := dynamoDeployment.Spec.Services[serviceName]; !exists { + logger.Info("Deleting orphaned DynamoGraphDeploymentScalingAdapter", "adapter", adapter.Name, "service", serviceName) + if err := r.Delete(ctx, adapter); err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Failed to delete orphaned adapter", "adapter", adapter.Name) + return err + } + r.Recorder.Eventf(dynamoDeployment, corev1.EventTypeNormal, "AdapterDeleted", + "Deleted orphaned scaling adapter %s for removed service %s", adapter.Name, serviceName) + } + } + + return nil +} + +// generateAdapterName creates a consistent name for a DynamoGraphDeploymentScalingAdapter +// Service names are lowercased to comply with Kubernetes DNS subdomain naming requirements +func generateAdapterName(dgdName, serviceName string) string { + return fmt.Sprintf("%s-%s", dgdName, strings.ToLower(serviceName)) +} + func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { // for now doing nothing return nil @@ 
-626,6 +717,13 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err UpdateFunc: func(de event.UpdateEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true }, })). + Owns(&nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{}, builder.WithPredicates(predicate.Funcs{ + // ignore creation cause we don't want to be called again after we create the adapter + CreateFunc: func(ce event.CreateEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return true }, + UpdateFunc: func(de event.UpdateEvent) bool { return false }, // Adapter updates are handled by adapter controller + GenericFunc: func(ge event.GenericEvent) bool { return false }, + })). Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.Funcs{ // ignore creation cause we don't want to be called again after we create the PVC CreateFunc: func(ce event.CreateEvent) bool { return false }, diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go new file mode 100644 index 0000000000..a217fd403c --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go @@ -0,0 +1,321 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "testing" + + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestDynamoGraphDeploymentReconciler_reconcileScalingAdapters(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + tests := []struct { + name string + dgd *v1alpha1.DynamoGraphDeployment + existingAdapters []v1alpha1.DynamoGraphDeploymentScalingAdapter + expectedAdapterCount int + expectedAdapters map[string]int32 // map of adapter name to expected replicas + expectDeleted []string // adapter names that should be deleted + }{ + { + name: "creates adapters for all services", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + "decode": { + Replicas: ptr.To(int32(3)), + }, + }, + }, + }, + expectedAdapterCount: 2, + expectedAdapters: map[string]int32{ + "test-dgd-frontend": 2, + "test-dgd-decode": 3, + }, + }, + { + name: "uses default replicas when not specified", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "worker": {}, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "test-dgd-worker": 1, // default replicas + }, + }, + { + name: "skips adapter creation when disabled", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + "decode": { + Replicas: ptr.To(int32(3)), + ScalingAdapter: &v1alpha1.ScalingAdapter{ + Disable: true, + }, + }, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "test-dgd-frontend": 2, + }, + }, + { + name: "deletes adapter when service is removed", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + UID: "test-uid", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + }, + }, + }, + existingAdapters: []v1alpha1.DynamoGraphDeploymentScalingAdapter{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "nvidia.com/v1alpha1", + Kind: "DynamoGraphDeployment", + Name: "test-dgd", + UID: "test-uid", + }, + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 2, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + 
}, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-removed", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "nvidia.com/v1alpha1", + Kind: "DynamoGraphDeployment", + Name: "test-dgd", + UID: "test-uid", + }, + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 1, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "removed", + }, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "test-dgd-frontend": 2, + }, + expectDeleted: []string{"test-dgd-removed"}, + }, + { + name: "deletes adapter when scalingAdapter.disable is set to true", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + UID: "test-uid", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + ScalingAdapter: &v1alpha1.ScalingAdapter{ + Disable: true, + }, + }, + }, + }, + }, + existingAdapters: []v1alpha1.DynamoGraphDeploymentScalingAdapter{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "nvidia.com/v1alpha1", + Kind: "DynamoGraphDeployment", + Name: "test-dgd", + UID: "test-uid", + }, + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 2, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + }, + expectedAdapterCount: 0, + expectedAdapters: map[string]int32{}, + expectDeleted: []string{"test-dgd-frontend"}, + }, + { + name: "adapter name uses lowercase service name", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "MyService": { + Replicas: ptr.To(int32(1)), + }, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "my-dgd-myservice": 1, // lowercase + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Build initial objects + var initObjs []client.Object + initObjs = append(initObjs, tt.dgd) + for i := range tt.existingAdapters { + initObjs = append(initObjs, &tt.existingAdapters[i]) + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(initObjs...). 
+ Build() + + // Create reconciler + r := &DynamoGraphDeploymentReconciler{ + Client: fakeClient, + Recorder: record.NewFakeRecorder(10), + } + + // Run reconcileScalingAdapters + ctx := context.Background() + err := r.reconcileScalingAdapters(ctx, tt.dgd) + if err != nil { + t.Fatalf("reconcileScalingAdapters() error = %v", err) + } + + // Verify adapters + adapterList := &v1alpha1.DynamoGraphDeploymentScalingAdapterList{} + if err := fakeClient.List(ctx, adapterList, client.InNamespace("default")); err != nil { + t.Fatalf("Failed to list adapters: %v", err) + } + + if len(adapterList.Items) != tt.expectedAdapterCount { + t.Errorf("Expected %d adapters, got %d", tt.expectedAdapterCount, len(adapterList.Items)) + } + + // Check expected adapters exist with correct replicas + for name, expectedReplicas := range tt.expectedAdapters { + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{} + err := fakeClient.Get(ctx, types.NamespacedName{Name: name, Namespace: "default"}, adapter) + if err != nil { + t.Errorf("Expected adapter %s to exist, but got error: %v", name, err) + continue + } + if adapter.Spec.Replicas != expectedReplicas { + t.Errorf("Adapter %s has replicas=%d, expected %d", name, adapter.Spec.Replicas, expectedReplicas) + } + } + + // Check that deleted adapters don't exist + for _, name := range tt.expectDeleted { + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{} + err := fakeClient.Get(ctx, types.NamespacedName{Name: name, Namespace: "default"}, adapter) + if err == nil { + t.Errorf("Expected adapter %s to be deleted, but it still exists", name) + } + } + }) + } +} diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go new file mode 100644 index 0000000000..edaa4323ae --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go @@ -0,0 +1,213 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" + commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" +) + +// DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object +type DynamoGraphDeploymentScalingAdapterReconciler struct { + client.Client + Scheme *runtime.Scheme + Recorder record.EventRecorder + Config commonController.Config +} + +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;update;patch + +// Reconcile implements the reconciliation loop for DynamoGraphDeploymentScalingAdapter +func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // 1. Fetch the DynamoGraphDeploymentScalingAdapter + adapter := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{} + if err := r.Get(ctx, req.NamespacedName, adapter); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Skip reconciliation if being deleted + if !adapter.GetDeletionTimestamp().IsZero() { + logger.V(1).Info("Adapter is being deleted, skipping reconciliation") + return ctrl.Result{}, nil + } + + // 2. Fetch the referenced DGD + dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{} + dgdKey := types.NamespacedName{ + Name: adapter.Spec.DGDRef.Name, + Namespace: adapter.Namespace, + } + if err := r.Get(ctx, dgdKey, dgd); err != nil { + if errors.IsNotFound(err) { + logger.Error(err, "Referenced DGD not found", "dgd", dgdKey) + // DGD doesn't exist, can't proceed + return ctrl.Result{}, err + } + return ctrl.Result{}, err + } + + // 3. Find the target service in DGD's spec.services map + component, exists := dgd.Spec.Services[adapter.Spec.DGDRef.ServiceName] + if !exists || component == nil { + logger.Error(nil, "Service not found in DGD", + "service", adapter.Spec.DGDRef.ServiceName, + "dgd", dgd.Name, + "availableServices", getServiceKeys(dgd.Spec.Services)) + return ctrl.Result{}, fmt.Errorf("service %s not found in DGD", adapter.Spec.DGDRef.ServiceName) + } + + // Get current replicas from DGD (default to 1 if not set) + currentReplicas := int32(1) + if component.Replicas != nil { + currentReplicas = *component.Replicas + } + + // 4. 
Update DGD if replicas changed (DGDSA is the source of truth) + if currentReplicas != adapter.Spec.Replicas { + // Update the service's replicas in DGD + component.Replicas = &adapter.Spec.Replicas + dgd.Spec.Services[adapter.Spec.DGDRef.ServiceName] = component + + if err := r.Update(ctx, dgd); err != nil { + logger.Error(err, "Failed to update DGD") + r.Recorder.Eventf(adapter, corev1.EventTypeWarning, "UpdateFailed", + "Failed to update DGD %s: %v", dgd.Name, err) + return ctrl.Result{}, err + } + + logger.Info("Scaled service", + "dgd", dgd.Name, + "service", adapter.Spec.DGDRef.ServiceName, + "from", currentReplicas, + "to", adapter.Spec.Replicas) + + r.Recorder.Eventf(adapter, corev1.EventTypeNormal, "Scaled", + "Scaled service %s from %d to %d replicas", adapter.Spec.DGDRef.ServiceName, currentReplicas, adapter.Spec.Replicas) + + // Record scaling event + now := metav1.Now() + adapter.Status.LastScaleTime = &now + } + + // 5. Update adapter status + adapter.Status.Replicas = adapter.Spec.Replicas + adapter.Status.Selector = r.buildPodSelector(dgd, adapter.Spec.DGDRef.ServiceName) + + if err := r.Status().Update(ctx, adapter); err != nil { + logger.Error(err, "Failed to update adapter status") + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// buildPodSelector constructs a label selector for the pods managed by this service +func (r *DynamoGraphDeploymentScalingAdapterReconciler) buildPodSelector(dgd *nvidiacomv1alpha1.DynamoGraphDeployment, serviceName string) string { + // Pods are labeled with: + // - nvidia.com/dynamo-graph-deployment-name = dgd.Name + // - nvidia.com/dynamo-component = serviceName (the key from spec.services map) + return fmt.Sprintf("%s=%s,%s=%s", + consts.KubeLabelDynamoGraphDeploymentName, dgd.Name, + consts.KubeLabelDynamoComponent, serviceName) +} + +// SetupWithManager sets up the controller with the Manager +func (r *DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{}, builder.WithPredicates( + predicate.GenerationChangedPredicate{}, + )). + Named("dgdscalingadapter"). + // Watch DGDs to sync status when DGD service replicas change + Watches( + &nvidiacomv1alpha1.DynamoGraphDeployment{}, + handler.EnqueueRequestsFromMapFunc(r.findAdaptersForDGD), + builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(ce event.CreateEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return true }, + UpdateFunc: func(ue event.UpdateEvent) bool { + // Only trigger on spec changes (not status) + oldDGD, okOld := ue.ObjectOld.(*nvidiacomv1alpha1.DynamoGraphDeployment) + newDGD, okNew := ue.ObjectNew.(*nvidiacomv1alpha1.DynamoGraphDeployment) + if !okOld || !okNew { + return false + } + // Trigger if services map changed + return !servicesEqual(oldDGD.Spec.Services, newDGD.Spec.Services) + }, + GenericFunc: func(ge event.GenericEvent) bool { return false }, + }), + ). + WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)). 
+ Complete(r) +} + +// findAdaptersForDGD maps DGD changes to adapter reconcile requests +// Uses label selector to efficiently query only adapters for this specific DGD +func (r *DynamoGraphDeploymentScalingAdapterReconciler) findAdaptersForDGD(ctx context.Context, obj client.Object) []reconcile.Request { + dgd, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment) + if !ok { + return nil + } + + // Use label selector to filter at API level (more efficient than in-memory filtering) + adapterList := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterList{} + if err := r.List(ctx, adapterList, + client.InNamespace(dgd.Namespace), + client.MatchingLabels{consts.KubeLabelDynamoGraphDeploymentName: dgd.Name}, + ); err != nil { + log.FromContext(ctx).Error(err, "Failed to list adapters for DGD", "dgd", dgd.Name) + return nil + } + + // All returned adapters are guaranteed to belong to this DGD + requests := make([]reconcile.Request, 0, len(adapterList.Items)) + for i := range adapterList.Items { + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: adapterList.Items[i].Name, + Namespace: adapterList.Items[i].Namespace, + }, + }) + } + + return requests +} diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go new file mode 100644 index 0000000000..33c6b9f5e8 --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go @@ -0,0 +1,512 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "testing" + + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + tests := []struct { + name string + adapter *v1alpha1.DynamoGraphDeploymentScalingAdapter + dgd *v1alpha1.DynamoGraphDeployment + expectedDGDReplicas int32 + expectedStatusReplicas int32 + expectError bool + expectRequeue bool + }{ + { + name: "updates DGD replicas when DGDSA spec differs", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 5, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + }, + }, + }, + expectedDGDReplicas: 5, + expectedStatusReplicas: 5, + expectError: false, + }, + { + name: "no update when replicas already match", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 3, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(3)), + }, + }, + }, + }, + expectedDGDReplicas: 3, + expectedStatusReplicas: 3, + expectError: false, + }, + { + name: "uses default replicas (1) when DGD service has no replicas set", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-worker", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 4, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "worker", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "worker": {}, // no replicas set + }, + }, + }, + expectedDGDReplicas: 4, + expectedStatusReplicas: 4, + expectError: false, + }, + { + name: "error when service not found in DGD", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-missing", + Namespace: "default", + }, + Spec: 
v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 2, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "nonexistent", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(1)), + }, + }, + }, + }, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Build initial objects + var initObjs []client.Object + initObjs = append(initObjs, tt.adapter, tt.dgd) + + // Create fake client with status subresource support + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(initObjs...). + WithStatusSubresource(&v1alpha1.DynamoGraphDeploymentScalingAdapter{}). + Build() + + // Create reconciler + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + // Run Reconcile + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: tt.adapter.Name, + Namespace: tt.adapter.Namespace, + }, + } + + result, err := r.Reconcile(ctx, req) + + // Check error expectation + if tt.expectError && err == nil { + t.Errorf("Expected error, but got none") + } + if !tt.expectError && err != nil { + t.Errorf("Unexpected error: %v", err) + } + + // Skip further checks if error was expected + if tt.expectError { + return + } + + // Check requeue + if tt.expectRequeue && result.RequeueAfter == 0 { + t.Errorf("Expected requeue, but got none") + } + + // Verify DGD replicas were updated + updatedDGD := &v1alpha1.DynamoGraphDeployment{} + if err := fakeClient.Get(ctx, types.NamespacedName{Name: tt.dgd.Name, Namespace: tt.dgd.Namespace}, updatedDGD); err != nil { + t.Fatalf("Failed to get updated DGD: %v", err) + } + + service, exists := updatedDGD.Spec.Services[tt.adapter.Spec.DGDRef.ServiceName] + if !exists { + t.Fatalf("Service %s not found in updated DGD", tt.adapter.Spec.DGDRef.ServiceName) + } + + actualReplicas := int32(1) + if service.Replicas != nil { + actualReplicas = *service.Replicas + } + + if actualReplicas != tt.expectedDGDReplicas { + t.Errorf("DGD service replicas = %d, expected %d", actualReplicas, tt.expectedDGDReplicas) + } + + // Verify adapter status was updated + updatedAdapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{} + if err := fakeClient.Get(ctx, types.NamespacedName{Name: tt.adapter.Name, Namespace: tt.adapter.Namespace}, updatedAdapter); err != nil { + t.Fatalf("Failed to get updated adapter: %v", err) + } + + if updatedAdapter.Status.Replicas != tt.expectedStatusReplicas { + t.Errorf("Adapter status.replicas = %d, expected %d", updatedAdapter.Status.Replicas, tt.expectedStatusReplicas) + } + + // Verify selector is set + if updatedAdapter.Status.Selector == "" { + t.Errorf("Adapter status.selector is empty, expected non-empty") + } + }) + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile_NotFound(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + // Create fake client with no objects + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). 
+ Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "nonexistent", + Namespace: "default", + }, + } + + // Should return no error when adapter not found (client.IgnoreNotFound) + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Errorf("Expected no error for not found adapter, got: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("Expected no requeueAfter for not found adapter, got: %v", result.RequeueAfter) + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile_DGDNotFound(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 5, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "nonexistent-dgd", + ServiceName: "Frontend", + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(adapter). + Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: adapter.Name, + Namespace: adapter.Namespace, + }, + } + + // Should return error when DGD not found + _, err := r.Reconcile(ctx, req) + if err == nil { + t.Errorf("Expected error when DGD not found, got none") + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile_BeingDeleted(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + now := metav1.Now() + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + DeletionTimestamp: &now, + Finalizers: []string{"test-finalizer"}, // Required for deletion timestamp to be set + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 5, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + } + + dgd := &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(adapter, dgd). 
+ Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: adapter.Name, + Namespace: adapter.Namespace, + }, + } + + // Should return no error and skip reconciliation + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Errorf("Expected no error for deleting adapter, got: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("Expected no requeueAfter for deleting adapter, got: %v", result.RequeueAfter) + } + + // DGD replicas should NOT be updated (still 2) + updatedDGD := &v1alpha1.DynamoGraphDeployment{} + if err := fakeClient.Get(ctx, types.NamespacedName{Name: dgd.Name, Namespace: dgd.Namespace}, updatedDGD); err != nil { + t.Fatalf("Failed to get DGD: %v", err) + } + + if *updatedDGD.Spec.Services["Frontend"].Replicas != 2 { + t.Errorf("DGD replicas should remain unchanged, got %d", *updatedDGD.Spec.Services["Frontend"].Replicas) + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_findAdaptersForDGD(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + dgd := &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + } + + // Adapters belonging to test-dgd + adapter1 := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + } + + adapter2 := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-decode", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "decode", + }, + }, + } + + // Adapter belonging to different DGD + adapterOther := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "other-dgd", + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "other-dgd", + ServiceName: "Frontend", + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(adapter1, adapter2, adapterOther). 
+ Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + } + + ctx := context.Background() + requests := r.findAdaptersForDGD(ctx, dgd) + + // Should return 2 requests (for test-dgd adapters only) + if len(requests) != 2 { + t.Errorf("findAdaptersForDGD() returned %d requests, expected 2", len(requests)) + } + + // Verify correct adapters are returned + expectedNames := map[string]bool{ + "test-dgd-frontend": true, + "test-dgd-decode": true, + } + + for _, req := range requests { + if !expectedNames[req.Name] { + t.Errorf("Unexpected adapter in results: %s", req.Name) + } + } +} diff --git a/deploy/cloud/operator/internal/dynamo/graph.go b/deploy/cloud/operator/internal/dynamo/graph.go index 706dcec234..e644e5e881 100644 --- a/deploy/cloud/operator/internal/dynamo/graph.go +++ b/deploy/cloud/operator/internal/dynamo/graph.go @@ -1034,7 +1034,7 @@ func GenerateGrovePodCliqueSet( PodSpec: *podSpec, }, } - labels, err := generateLabels(component, dynamoDeployment, r.Name) + labels, err := generateLabels(component, dynamoDeployment, serviceName) if err != nil { return nil, fmt.Errorf("failed to generate labels: %w", err) } @@ -1075,6 +1075,7 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn labels := make(map[string]string) labels[commonconsts.KubeLabelDynamoSelector] = GetDynamoComponentName(dynamoDeployment, componentName) labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name + labels[commonconsts.KubeLabelDynamoComponent] = componentName if component.DynamoNamespace != nil { labels[commonconsts.KubeLabelDynamoNamespace] = *component.DynamoNamespace } diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index d93a60459b..6a126cf445 100644 --- a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -121,7 +121,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, }, }, }, @@ -153,7 +152,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, }, }, }, @@ -229,7 +227,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, }, }, }, @@ -261,7 +258,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, }, }, }, @@ -341,7 +337,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, Ingress: &v1alpha1.IngressSpec{ Enabled: true, Host: "test-dynamographdeployment", @@ -377,7 +372,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, }, }, }, @@ -465,7 +459,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, Envs: []corev1.EnvVar{ { Name: 
"DYN_DEPLOYMENT_CONFIG", @@ -503,7 +496,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, Envs: []corev1.EnvVar{ { Name: "DYN_DEPLOYMENT_CONFIG", @@ -599,7 +591,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, ExtraPodSpec: &v1alpha1.ExtraPodSpec{ MainContainer: &corev1.Container{ Command: []string{"sh", "-c"}, @@ -644,7 +635,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, Envs: []corev1.EnvVar{ { Name: "TEST_ENV", @@ -1307,6 +1297,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "frontend", Labels: map[string]string{ commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend", + commonconsts.KubeLabelDynamoComponent: "Frontend", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend, commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component", @@ -1483,6 +1474,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Labels: map[string]string{ commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner", + commonconsts.KubeLabelDynamoComponent: "Planner", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner, commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", @@ -1884,8 +1876,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-ldr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", "nvidia.com/label1": "label1", "nvidia.com/label2": "label2", @@ -2059,8 +2052,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-wkr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", "nvidia.com/label1": "label1", "nvidia.com/label2": "label2", @@ -2200,6 +2194,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend, commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + 
commonconsts.KubeLabelDynamoComponent: "Frontend", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", }, Annotations: map[string]string{}, @@ -2358,6 +2353,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "planner", Labels: map[string]string{ commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner", + commonconsts.KubeLabelDynamoComponent: "Planner", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner, @@ -2779,7 +2775,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { { Name: "worker-ldr", Labels: map[string]string{ - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-ldr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", @@ -2943,7 +2940,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Labels: map[string]string{ commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-wkr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", "nvidia.com/label1": "label1", @@ -3084,6 +3082,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "Frontend", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", }, Annotations: map[string]string{}, @@ -3243,6 +3242,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Labels: map[string]string{ commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner", + commonconsts.KubeLabelDynamoComponent: "Planner", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner, commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", diff --git a/deploy/cloud/operator/internal/webhook/common.go b/deploy/cloud/operator/internal/webhook/common.go index 6333738739..c18edd98f4 100644 --- a/deploy/cloud/operator/internal/webhook/common.go +++ b/deploy/cloud/operator/internal/webhook/common.go @@ -19,7 +19,9 @@ package webhook import ( "context" + "strings" + authenticationv1 "k8s.io/api/authentication/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -118,3 +120,54 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool { return false } + +// DGDReplicasModifierSuffixes defines 
suffixes for service accounts that are authorized
+// to modify DGD replicas when scaling adapter is enabled.
+// Service accounts matching any of these suffixes are allowed regardless of namespace.
+var DGDReplicasModifierSuffixes = []string{
+	// Dynamo operator controller manager (handles DGDSA reconciliation)
+	// Example: "dynamo-platform-dynamo-operator-controller-manager"
+	"-dynamo-operator-controller-manager",
+
+	// Planner service account (manages DGD replicas for autoscaling)
+	// Example: "planner-serviceaccount"
+	"planner-serviceaccount",
+}
+
+// CanModifyDGDReplicas checks if the request comes from a service account authorized
+// to modify DGD replicas when scaling adapter is enabled.
+// Service accounts are identified by username format: system:serviceaccount:<namespace>:<name>
+//
+// Authorized service accounts (by suffix):
+// - *-dynamo-operator-controller-manager (for DGDSA reconciliation)
+// - *planner-serviceaccount (for Planner autoscaling)
+func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool {
+	username := userInfo.Username
+
+	// Service accounts have username format: system:serviceaccount:<namespace>:<name>
+	if !strings.HasPrefix(username, "system:serviceaccount:") {
+		return false
+	}
+
+	// Parse: system:serviceaccount:<namespace>:<name>
+	parts := strings.Split(username, ":")
+	if len(parts) != 4 {
+		return false
+	}
+
+	namespace := parts[2]
+	saName := parts[3]
+
+	// Check against authorized suffixes
+	for _, suffix := range DGDReplicasModifierSuffixes {
+		if strings.HasSuffix(saName, suffix) {
+			webhookCommonLog.V(1).Info("allowing DGD replicas modification",
+				"serviceAccount", saName,
+				"namespace", namespace,
+				"matchedSuffix", suffix)
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
index c77303fde2..c0e0628834 100644
--- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
@@ -42,13 +42,10 @@ func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoC
 func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
 	// Validate shared spec fields using SharedSpecValidator
 	sharedValidator := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")
-	if err := sharedValidator.Validate(); err != nil {
-		return nil, err
-	}
 
 	// DCD-specific validation would go here (currently none)
 
-	return nil, nil
+	return sharedValidator.Validate()
 }
 
 // ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment.
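A note on the identity gate in common.go above: CanModifyDGDReplicas admits only usernames of the form system:serviceaccount:<namespace>:<name> (exactly four colon-separated segments) whose name segment ends in an allow-listed suffix. A minimal sketch of that behavior under illustrative identities — this test and its service-account names are not part of the patch:

```go
package webhook

import (
	"testing"

	authenticationv1 "k8s.io/api/authentication/v1"
)

// Illustrative cases only; the service-account names are hypothetical.
func TestCanModifyDGDReplicas_Examples(t *testing.T) {
	cases := map[string]bool{
		// Operator controller manager in any namespace: allowed by suffix match.
		"system:serviceaccount:dynamo:dp-dynamo-operator-controller-manager": true,
		// Planner service account: allowed by suffix match.
		"system:serviceaccount:dynamo:planner-serviceaccount": true,
		// Non-serviceaccount identity (no system:serviceaccount: prefix): rejected.
		"kubernetes-admin": false,
		// Malformed service-account username (missing the name segment): rejected.
		"system:serviceaccount:dynamo": false,
	}
	for username, want := range cases {
		got := CanModifyDGDReplicas(authenticationv1.UserInfo{Username: username})
		if got != want {
			t.Errorf("CanModifyDGDReplicas(%q) = %v, want %v", username, got, want)
		}
	}
}
```

The suffix match is deliberately namespace-agnostic, so the webhook trusts any namespace's operator or planner service account; tightening that would mean matching the full username rather than only the name segment.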
diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go index 0324856dfd..f38240c8ee 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go @@ -47,11 +47,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) { Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{ DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ Replicas: &validReplicas, - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 1, - MaxReplicas: 10, - }, }, BackendFramework: "sglang", }, @@ -74,26 +69,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) { wantErr: true, errMsg: "spec.replicas must be non-negative", }, - { - name: "invalid autoscaling", - deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-deployment", - Namespace: "default", - }, - Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{ - DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 5, - MaxReplicas: 3, - }, - }, - }, - }, - wantErr: true, - errMsg: "spec.autoscaling.maxReplicas must be > minReplicas", - }, { name: "invalid ingress", deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{ diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go index e6bf9e3893..00a1668806 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go @@ -22,6 +22,8 @@ import ( "fmt" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook" + authenticationv1 "k8s.io/api/authentication/v1" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) @@ -51,30 +53,106 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error) return nil, err } + var allWarnings admission.Warnings + // Validate each service for serviceName, service := range v.deployment.Spec.Services { - if err := v.validateService(serviceName, service); err != nil { + warnings, err := v.validateService(serviceName, service) + if err != nil { return nil, err } + allWarnings = append(allWarnings, warnings...) } - return nil, nil + return allWarnings, nil } // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment. +// userInfo is used for identity-based validation (replica protection). +// If userInfo is nil, replica changes for DGDSA-enabled services are rejected (fail closed). // Returns warnings and error. 
-func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) { - // Validate that BackendFramework is not changed (immutable) +func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) { + var warnings admission.Warnings + + // Validate immutable fields + if err := v.validateImmutableFields(old, &warnings); err != nil { + return warnings, err + } + + // Validate replicas changes for services with scaling adapter enabled + // Pass userInfo (may be nil - will fail closed for DGDSA-enabled services) + if err := v.validateReplicasChanges(old, userInfo); err != nil { + return warnings, err + } + + return warnings, nil +} + +// validateImmutableFields checks that immutable fields have not been changed. +// Appends warnings to the provided slice. +func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv1alpha1.DynamoGraphDeployment, warnings *admission.Warnings) error { if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework { - warning := "Changing spec.backendFramework may cause unexpected behavior" - return admission.Warnings{warning}, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation") + *warnings = append(*warnings, "Changing spec.backendFramework may cause unexpected behavior") + return fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation") } + return nil +} - return nil, nil +// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled. +// Only authorized service accounts (operator controller, planner) can modify these fields. +// If userInfo is nil, all replica changes for DGDSA-enabled services are rejected (fail closed). +func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) error { + // If the request comes from an authorized service account, allow the change + if userInfo != nil && internalwebhook.CanModifyDGDReplicas(*userInfo) { + return nil + } + + var errs []error + + for serviceName, newService := range v.deployment.Spec.Services { + // Check if scaling adapter is enabled for this service (enabled by default) + scalingAdapterEnabled := true + if newService.ScalingAdapter != nil && newService.ScalingAdapter.Disable { + scalingAdapterEnabled = false + } + + if !scalingAdapterEnabled { + // Scaling adapter is disabled, users can modify replicas directly + continue + } + + // Get old service (if exists) + oldService, exists := old.Spec.Services[serviceName] + if !exists { + // New service, no comparison needed + continue + } + + // Check if replicas changed + oldReplicas := int32(1) // default + if oldService.Replicas != nil { + oldReplicas = *oldService.Replicas + } + + newReplicas := int32(1) // default + if newService.Replicas != nil { + newReplicas = *newService.Replicas + } + + if oldReplicas != newReplicas { + errs = append(errs, fmt.Errorf( + "spec.services[%s].replicas cannot be modified directly when scaling adapter is enabled; "+ + "scale or update the related DynamoGraphDeploymentScalingAdapter instead", + serviceName)) + } + } + + return errors.Join(errs...) } // validateService validates a single service configuration using SharedSpecValidator. 
-func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error { +// Returns warnings and error. +func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) { // Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec) fieldPath := fmt.Sprintf("spec.services[%s]", serviceName) sharedValidator := NewSharedSpecValidator(service, fieldPath) diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go index 074a4c5cc2..e98bd03442 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go @@ -23,6 +23,7 @@ import ( nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook" + authenticationv1 "k8s.io/api/authentication/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -91,9 +92,24 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb return warnings, err } - // Validate stateful rules (immutability) - updateWarnings, err := validator.ValidateUpdate(oldDeployment) + // Get user info from admission request context for identity-based validation + var userInfo *authenticationv1.UserInfo + req, err := admission.RequestFromContext(ctx) if err != nil { + logger.Error(err, "failed to get admission request from context, replica changes for DGDSA-enabled services will be rejected") + // userInfo remains nil - validateReplicasChanges will fail closed + } else { + userInfo = &req.UserInfo + } + + // Validate stateful rules (immutability + replicas protection) + updateWarnings, err := validator.ValidateUpdate(oldDeployment, userInfo) + if err != nil { + username := "" + if userInfo != nil { + username = userInfo.Username + } + logger.Info("validation failed", "error", err.Error(), "user", username) return updateWarnings, err } diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go index 75c18dd33f..71228327b6 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go @@ -93,28 +93,6 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) { wantErr: true, errMsg: "spec.services[main].replicas must be non-negative", }, - { - name: "service with invalid autoscaling", - deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-graph", - Namespace: "default", - }, - Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{ - Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - "prefill": { - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 10, - MaxReplicas: 5, - }, - }, - }, - }, - }, - wantErr: true, - errMsg: "spec.services[prefill].autoscaling.maxReplicas must be > minReplicas", - }, { name: "service with invalid ingress", deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{ @@ -441,7 
+419,8 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { validator := NewDynamoGraphDeploymentValidator(tt.newDeployment) - warnings, err := validator.ValidateUpdate(tt.oldDeployment) + // Pass nil userInfo - these tests don't modify replicas, so it's safe + warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil) if (err != nil) != tt.wantErr { t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr) diff --git a/deploy/cloud/operator/internal/webhook/validation/shared.go b/deploy/cloud/operator/internal/webhook/validation/shared.go index 5348193f3f..30edb0500d 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared.go @@ -21,6 +21,7 @@ import ( "fmt" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) // SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields. @@ -41,61 +42,45 @@ func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSha } // Validate performs validation on the shared spec fields. -// Returns an error if validation fails. -func (v *SharedSpecValidator) Validate() error { +// Returns warnings (e.g., deprecation notices) and error if validation fails. +func (v *SharedSpecValidator) Validate() (admission.Warnings, error) { // Validate replicas if specified if v.spec.Replicas != nil && *v.spec.Replicas < 0 { - return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath) - } - - // Validate autoscaling configuration if specified - if v.spec.Autoscaling != nil { - if err := v.validateAutoscaling(); err != nil { - return err - } + return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath) } // Validate ingress configuration if enabled if v.spec.Ingress != nil && v.spec.Ingress.Enabled { if err := v.validateIngress(); err != nil { - return err + return nil, err } } // Validate volume mounts if err := v.validateVolumeMounts(); err != nil { - return err + return nil, err } // Validate shared memory if v.spec.SharedMemory != nil { if err := v.validateSharedMemory(); err != nil { - return err + return nil, err } } - return nil -} - -// validateAutoscaling validates the autoscaling configuration. -func (v *SharedSpecValidator) validateAutoscaling() error { - autoscaling := v.spec.Autoscaling - - if !autoscaling.Enabled { - return nil - } - - // Validate minReplicas - if autoscaling.MinReplicas < 1 { - return fmt.Errorf("%s.autoscaling.minReplicas must be >= 1", v.fieldPath) - } + // Collect warnings (e.g., deprecation notices) + var warnings admission.Warnings - // Validate maxReplicas - if autoscaling.MaxReplicas <= autoscaling.MinReplicas { - return fmt.Errorf("%s.autoscaling.maxReplicas must be > minReplicas", v.fieldPath) + // Check for deprecated autoscaling field + //nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users + if v.spec.Autoscaling != nil { + warnings = append(warnings, fmt.Sprintf( + "%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+ + "with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md", + v.fieldPath)) } - return nil + return warnings, nil } // validateIngress validates the ingress configuration. 
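Because the deprecated autoscaling block now only yields an admission warning, the migration target is the DGDSA Scale subresource. A minimal sketch of that replacement wiring — a stock autoscaling/v2 HPA pointed at an adapter named with the <dgd-name>-<lowercased-service> convention exercised in the controller tests earlier in this diff; the manifest is illustrative and not shipped in this patch:

```yaml
# Hypothetical HPA scaling the "Frontend" service of DGD "test-dgd" through
# its scaling adapter. The adapter owns the DGD service's replicas, so the
# HPA drives scale via the adapter's Scale subresource instead of the DGD.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: frontend-hpa
spec:
  scaleTargetRef:
    apiVersion: nvidia.com/v1alpha1
    kind: DynamoGraphDeploymentScalingAdapter
    name: test-dgd-frontend   # <dgd-name>-<lowercased-service>
  minReplicas: 1
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 80
```

The adapter's status.selector (built by buildPodSelector from the graph-deployment-name and component labels) is what lets the HPA resolve the pods behind the Scale subresource.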
diff --git a/deploy/cloud/operator/internal/webhook/validation/shared_test.go b/deploy/cloud/operator/internal/webhook/validation/shared_test.go index 472bb7d990..b7a2687cbd 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared_test.go @@ -41,11 +41,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) { name: "valid spec with all fields", spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ Replicas: &validReplicas, - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 1, - MaxReplicas: 10, - }, Ingress: &nvidiacomv1alpha1.IngressSpec{ Enabled: true, Host: "example.com", @@ -77,44 +72,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) { wantErr: true, errMsg: "spec.replicas must be non-negative", }, - { - name: "autoscaling minReplicas too low", - spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 0, - MaxReplicas: 10, - }, - }, - fieldPath: "spec", - wantErr: true, - errMsg: "spec.autoscaling.minReplicas must be >= 1", - }, - { - name: "autoscaling maxReplicas less than minReplicas", - spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 5, - MaxReplicas: 3, - }, - }, - fieldPath: "spec", - wantErr: true, - errMsg: "spec.autoscaling.maxReplicas must be > minReplicas", - }, - { - name: "autoscaling disabled - no validation", - spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: false, - MinReplicas: 0, - MaxReplicas: 0, - }, - }, - fieldPath: "spec", - wantErr: false, - }, { name: "ingress enabled without host", spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ @@ -227,7 +184,7 @@ func TestSharedSpecValidator_Validate(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { validator := NewSharedSpecValidator(tt.spec, tt.fieldPath) - err := validator.Validate() + _, err := validator.Validate() if (err != nil) != tt.wantErr { t.Errorf("SharedSpecValidator.Validate() error = %v, wantErr %v", err, tt.wantErr) @@ -240,3 +197,53 @@ func TestSharedSpecValidator_Validate(t *testing.T) { }) } } + +func TestSharedSpecValidator_Validate_Warnings(t *testing.T) { + validReplicas := int32(3) + + tests := []struct { + name string + spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec + fieldPath string + wantWarnings int + }{ + { + name: "no warnings for spec without autoscaling", + spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ + Replicas: &validReplicas, + }, + fieldPath: "spec", + wantWarnings: 0, + }, + { + name: "warning for deprecated autoscaling field enabled", + spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ + Replicas: &validReplicas, + //nolint:staticcheck // SA1019: Intentionally testing deprecated field + Autoscaling: &nvidiacomv1alpha1.Autoscaling{ + Enabled: true, + MinReplicas: 1, + MaxReplicas: 10, + }, + }, + fieldPath: "spec", + wantWarnings: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + validator := NewSharedSpecValidator(tt.spec, tt.fieldPath) + warnings, err := validator.Validate() + + if err != nil { + t.Errorf("SharedSpecValidator.Validate() unexpected error = %v", err) + return + } + + if len(warnings) != tt.wantWarnings { + t.Errorf("SharedSpecValidator.Validate() warnings count = %d, want %d", 
len(warnings), tt.wantWarnings) + } + }) + } +} diff --git a/docs/_sections/k8s_deployment.rst b/docs/_sections/k8s_deployment.rst index 81d06513cb..cdd7d2029a 100644 --- a/docs/_sections/k8s_deployment.rst +++ b/docs/_sections/k8s_deployment.rst @@ -10,3 +10,4 @@ Deployment Guide Webhooks <../kubernetes/webhooks> Minikube Setup <../kubernetes/deployment/minikube> Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide> + Autoscaling <../kubernetes/autoscaling> diff --git a/docs/kubernetes/api_reference.md b/docs/kubernetes/api_reference.md index 09e7415769..4ae3246155 100644 --- a/docs/kubernetes/api_reference.md +++ b/docs/kubernetes/api_reference.md @@ -37,6 +37,7 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API - [DynamoComponentDeployment](#dynamocomponentdeployment) - [DynamoGraphDeployment](#dynamographdeployment) - [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest) +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) - [DynamoModel](#dynamomodel) @@ -45,7 +46,9 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API - +Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter +with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md +for migration guidance. This field will be removed in a future API version. @@ -55,11 +58,11 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `enabled` _boolean_ | | | | -| `minReplicas` _integer_ | | | | -| `maxReplicas` _integer_ | | | | -| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | | | | -| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | | | | +| `enabled` _boolean_ | Deprecated: This field is ignored. | | | +| `minReplicas` _integer_ | Deprecated: This field is ignored. | | | +| `maxReplicas` _integer_ | Deprecated: This field is ignored. | | | +| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. | | | +| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. | | | @@ -165,7 +168,7 @@ _Appears in:_ | `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | -| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). | | | +| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version. | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | @@ -176,8 +179,9 @@ _Appears in:_ | `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.
It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field
that allows overriding the main container configuration. | | | | `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | | | `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | | -| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. | | | +| `replicas` _integer_ | Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0
| | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | | +| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly. | | | #### DynamoComponentDeploymentSpec @@ -202,7 +206,7 @@ _Appears in:_ | `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | -| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). | | | +| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version. | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | @@ -213,8 +217,9 @@ _Appears in:_ | `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.
It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field
that allows overriding the main container configuration. | | | | `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | | | `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | | -| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. | | | +| `replicas` _integer_ | Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0
| | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | | +| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly. | | | #### DynamoGraphDeployment @@ -314,6 +319,83 @@ _Appears in:_ | `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\}
| +#### DynamoGraphDeploymentScalingAdapter + + + +DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services +within a DynamoGraphDeployment. It implements the Kubernetes scale +subresource, enabling integration with HPA, KEDA, and custom autoscalers. + +The adapter acts as an intermediary between autoscalers and the DGD, +ensuring that only the adapter controller modifies the DGD's service replicas. +This prevents conflicts when multiple autoscaling mechanisms are in play. + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ | | | | +| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ | | | | + + +#### DynamoGraphDeploymentScalingAdapterSpec + + + +DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.
This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0
Required: \{\}
| +| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: \{\}
| + + +#### DynamoGraphDeploymentScalingAdapterStatus + + + +DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replicas` _integer_ | Replicas is the current number of replicas for the target service.
This is synced from the DGD's service replicas and is required for the scale subresource. | | | +| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.
Required for HPA compatibility via the scale subresource. | | | +| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. | | | + + +#### DynamoGraphDeploymentServiceRef + + + +DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1
Required: \{\}
| +| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: \{\}
| + + #### DynamoGraphDeploymentSpec @@ -638,6 +720,25 @@ _Appears in:_ | `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation | | | +#### ScalingAdapter + + + +ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter +for replica management. When enabled (default), the DGDSA owns the replicas field and +external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource. + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `disable` _boolean_ | Disable indicates whether the ScalingAdapter should be disabled for this service.
When false (default), a DGDSA is created and owns the replicas field.
When true, no DGDSA is created and replicas can be modified directly in the DGD. | false | | + + #### SharedMemorySpec diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md new file mode 100644 index 0000000000..8adaf09107 --- /dev/null +++ b/docs/kubernetes/autoscaling.md @@ -0,0 +1,733 @@ +# Autoscaling + +This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services using the `sglang-agg` example from `examples/backends/sglang/deploy/agg.yaml`. + +## Example DGD + +All examples in this guide use the following DGD: + +```yaml +# examples/backends/sglang/deploy/agg.yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg + namespace: default +spec: + services: + Frontend: + dynamoNamespace: sglang-agg + componentType: frontend + replicas: 1 + + decode: + dynamoNamespace: sglang-agg + componentType: worker + replicas: 1 + resources: + limits: + gpu: "1" +``` + +**Key identifiers:** +- **DGD name**: `sglang-agg` +- **Namespace**: `default` +- **Services**: `Frontend`, `decode` +- **dynamo_namespace label**: `default-sglang-agg` (used for metric filtering) + +## Overview + +Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator automatically creates one adapter per service (unless explicitly disabled). These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with: + +| Autoscaler | Description | Best For | +|------------|-------------|----------| +| **KEDA** | Event-driven autoscaling (recommended) | Most use cases | +| **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling | +| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads | +| **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements | + +> **⚠️ Deprecation Notice**: The `spec.services[X].autoscaling` field in DGD is **deprecated and ignored**. Use DGDSA with HPA, KEDA, or Planner instead. If you have existing DGDs with `autoscaling` configured, you'll see a warning. Remove the field to silence the warning. + +## Architecture + +``` +┌──────────────────────────────────┐ ┌─────────────────────────────────────┐ +│ DynamoGraphDeployment │ │ Scaling Adapters (auto-created) │ +│ "sglang-agg" │ │ (one per service) │ +├──────────────────────────────────┤ ├─────────────────────────────────────┤ +│ │ │ │ +│ spec.services: │ │ ┌─────────────────────────────┐ │ ┌──────────────────┐ +│ │ │ │ sglang-agg-frontend │◄───┼──────│ Autoscalers │ +│ ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 1 │ │ │ │ +│ │ Frontend: 1 replica │ │ │ └─────────────────────────────┘ │ │ • KEDA │ +│ └────────────────────────┘ │ │ │ │ • HPA │ +│ │ │ ┌─────────────────────────────┐ │ │ • Planner │ +│ ┌────────────────────────┐◄───┼──────────┼──│ sglang-agg-decode │◄───┼──────│ • Custom │ +│ │ decode: 1 replica │ │ │ │ spec.replicas: 1 │ │ │ │ +│ └────────────────────────┘ │ │ └─────────────────────────────┘ │ └──────────────────┘ +│ │ │ │ +└──────────────────────────────────┘ └─────────────────────────────────────┘ +``` + +**How it works:** + +1. You deploy a DGD with services (Frontend, decode) +2. The operator auto-creates one DGDSA per service +3. Autoscalers (KEDA, HPA, Planner) target the adapters via `/scale` subresource +4. 
Adapter controller syncs replica changes to the DGD +5. DGD controller reconciles the underlying pods + +## Viewing Scaling Adapters + +After deploying the `sglang-agg` DGD, verify the auto-created adapters: + +```bash +kubectl get dgdsa -n default + +# Example output: +# NAME DGD SERVICE REPLICAS AGE +# sglang-agg-frontend sglang-agg Frontend 1 5m +# sglang-agg-decode sglang-agg decode 1 5m +``` + +## Replica Ownership Model + +When DGDSA is enabled (the default), it becomes the **source of truth** for replica counts. This follows the same pattern as Kubernetes Deployments owning ReplicaSets. + +### How It Works + +1. **DGDSA owns replicas**: Autoscalers (HPA, KEDA, Planner) update the DGDSA's `spec.replicas` +2. **DGDSA syncs to DGD**: The DGDSA controller writes the replica count to the DGD's service +3. **Direct DGD edits blocked**: A validating webhook prevents users from directly editing `spec.services[X].replicas` in the DGD +4. **Controllers allowed**: Only authorized controllers (operator, Planner) can modify DGD replicas + +### Manual Scaling with DGDSA Enabled + +When DGDSA is enabled, use `kubectl scale` on the adapter (not the DGD): + +```bash +# ✅ Correct - scale via DGDSA +kubectl scale dgdsa sglang-agg-decode --replicas=3 + +# ❌ Blocked - direct DGD edit rejected by webhook +kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}' +# Error: spec.services[decode].replicas cannot be modified directly when scaling adapter is enabled; +# use 'kubectl scale dgdsa/sglang-agg-decode --replicas=3' or update the DynamoGraphDeploymentScalingAdapter instead +``` + +## Disabling DGDSA for a Service + +If you want to manage replicas directly in the DGD (without autoscaling), you can disable the scaling adapter per service: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg +spec: + services: + Frontend: + replicas: 2 + scalingAdapter: + disable: true # ← No DGDSA created, direct edits allowed + + decode: + replicas: 1 # ← DGDSA created by default, managed via adapter +``` + +**When to disable DGDSA:** +- You want simple, manual replica management +- You don't need autoscaling for that service +- You prefer direct DGD edits over adapter-based scaling + +**When to keep DGDSA enabled (default):** +- You want to use HPA, KEDA, or Planner for autoscaling +- You want a clear separation between "desired scale" (adapter) and "deployment config" (DGD) +- You want protection against accidental direct replica edits + +## Autoscaling with Dynamo Planner + +The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization. + +**When to use Planner:** +- You want LLM-optimized autoscaling out of the box +- You need coordinated scaling across prefill/decode services +- You want SLA-driven scaling (e.g., target TTFT < 500ms) + +**How Planner works:** + +Planner is deployed as a service component within your DGD. It: +1. Queries Prometheus for frontend metrics (request rate, latency, etc.) +2. Uses profiling data to predict optimal replica counts +3. Scales prefill/decode workers to meet SLA targets + +**Deployment:** + +The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR). See the [SLA Planner Quick Start](../planner/sla_planner_quickstart.md) for complete instructions. 
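+
+Once Planner is running, it drives the same scaling adapters as any other autoscaler, so you can observe its decisions as replica counts change on the adapters (a quick check, using the `sglang-agg` adapters shown earlier):
+
+```bash
+# Watch Planner-driven replica changes on the scaling adapters
+kubectl get dgdsa -n default -w
+```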
+ +Example configurations with Planner: +- `examples/backends/vllm/deploy/disagg_planner.yaml` +- `examples/backends/sglang/deploy/disagg_planner.yaml` +- `examples/backends/trtllm/deploy/disagg_planner.yaml` + +For more details, see the [SLA Planner documentation](../planner/sla_planner.md). + +## Autoscaling with Kubernetes HPA + +The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution. + +**When to use HPA:** +- You have simple, predictable scaling requirements +- You want to use standard Kubernetes tooling +- You need CPU or memory-based scaling + +> **Note**: For custom metrics (like TTFT or queue depth), consider using [KEDA](#autoscaling-with-keda-recommended) instead - it's simpler to configure. + +### Basic HPA (CPU-based) + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-frontend-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-frontend + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 0 +``` + +### HPA with Dynamo Metrics + +Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod. + +> **See also**: For a complete list of all Dynamo metrics, see the [Metrics Reference](../observability/metrics.md). For Prometheus and Grafana setup, see the [Prometheus and Grafana Setup Guide](../observability/prometheus-grafana.md). + +#### Available Dynamo Metrics + +| Metric | Type | Description | Good for scaling | +|--------|------|-------------|------------------| +| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Workers | +| `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services | +| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Workers | +| `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode | +| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General | +| `kvstats_gpu_cache_usage_percent` | Gauge | GPU KV cache usage (0-1) | ✅ Decode | + +#### Metric Labels + +Dynamo metrics include these labels for filtering: + +| Label | Description | Example | +|-------|-------------|---------| +| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` | +| `model` | Model being served | `Qwen/Qwen3-0.6B` | + +> **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD. + +#### Example: Scale Decode Service Based on TTFT + +Using HPA with Prometheus Adapter requires configuring external metrics. 
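+
+Before configuring the Prometheus Adapter, you can confirm the TTFT series actually exists in Prometheus (a quick sanity check, assuming the kube-prometheus service name used throughout this guide):
+
+```bash
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# In another terminal, query for the raw histogram series:
+curl -s 'http://localhost:9090/api/v1/query' \
+  --data-urlencode 'query=dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}'
+```
+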
+ +**Step 1: Configure Prometheus Adapter** + +Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`): + +```yaml +# prometheus-adapter-values.yaml +prometheus: + url: http://prometheus-kube-prometheus-prometheus.monitoring.svc + port: 9090 + +rules: + external: + # TTFT p95 from frontend - used to scale decode + - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_ttft_p95_seconds" + metricsQuery: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) + by (le, namespace, dynamo_namespace) + ) +``` + +**Step 2: Install Prometheus Adapter** + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \ + -n monitoring --create-namespace \ + -f prometheus-adapter-values.yaml +``` + +**Step 3: Verify the metric is available** + +```bash +kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces//dynamo_ttft_p95_seconds" | jq +``` + +**Step 4: Create the HPA** + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-decode-hpa +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode # ← DGD name + service name (lowercase) + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: External + external: + metric: + name: dynamo_ttft_p95_seconds + selector: + matchLabels: + dynamo_namespace: "default-sglang-agg" # ← {namespace}-{dynamoNamespace} + target: + type: Value + value: "500m" # Scale up when TTFT p95 > 500ms + behavior: + scaleDown: + stabilizationWindowSeconds: 60 # Wait 1 min before scaling down + policies: + - type: Pods + value: 1 + periodSeconds: 30 + scaleUp: + stabilizationWindowSeconds: 0 # Scale up immediately + policies: + - type: Pods + value: 2 + periodSeconds: 30 +``` + +**How it works:** +1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram +2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace` +3. HPA monitors this metric filtered by `dynamo_namespace: "default-sglang-agg"` +4. When TTFT p95 > 500ms, HPA scales up the `sglang-agg-decode` adapter +5. Adapter controller syncs the replica count to the DGD's `decode` service +6. 
More decode workers are created, reducing TTFT + +#### Example: Scale Based on Queue Depth + +Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule): + +```yaml +# Add to rules.external in prometheus-adapter-values.yaml +- seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_queued_requests" + metricsQuery: | + sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace) +``` + +Then create the HPA: + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: sglang-agg-decode-queue-hpa + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: External + external: + metric: + name: dynamo_queued_requests + selector: + matchLabels: + dynamo_namespace: "default-sglang-agg" + target: + type: Value + value: "10" # Scale up when queue > 10 requests +``` + +## Autoscaling with KEDA (Recommended) + +KEDA (Kubernetes Event-driven Autoscaling) extends Kubernetes with event-driven autoscaling, supporting 50+ scalers including Prometheus. + +**Advantages over HPA + Prometheus Adapter:** +- No Prometheus Adapter configuration needed +- PromQL queries are defined in the ScaledObject itself (declarative, per-deployment) +- Easy to update - just `kubectl apply` the ScaledObject +- Can scale to zero when idle +- Supports multiple triggers per object + +**When to use KEDA:** +- You want simpler configuration (no Prometheus Adapter to manage) +- You need event-driven scaling (e.g., queue depth, Kafka, etc.) +- You want to scale to zero when idle + +### Installing KEDA + +```bash +# Add KEDA Helm repo +helm repo add kedacore https://kedacore.github.io/charts +helm repo update + +# Install KEDA +helm install keda kedacore/keda \ + --namespace keda \ + --create-namespace + +# Verify installation +kubectl get pods -n keda +``` + +> **Note**: If you have Prometheus Adapter installed, either uninstall it first (`helm uninstall prometheus-adapter -n monitoring`) or install KEDA with `--set metricsServer.enabled=false` to avoid API conflicts. 
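+
+You can confirm which backend currently serves the external metrics API, since only one apiservice can own it at a time (with the Helm install above, the service typically belongs to KEDA):
+
+```bash
+kubectl get apiservice v1beta1.external.metrics.k8s.io
+# The SERVICE column should point at KEDA's metrics apiserver (e.g., keda/keda-operator-metrics-apiserver)
+```
+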
+ +### Example: Scale Decode Based on TTFT + +Using the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`: + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: sglang-agg-decode-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicaCount: 1 + maxReplicaCount: 10 + pollingInterval: 15 # Check metrics every 15 seconds + cooldownPeriod: 60 # Wait 60s before scaling down + triggers: + - type: prometheus + metadata: + # Update this URL to match your Prometheus service + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + metricName: dynamo_ttft_p95 + query: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + by (le) + ) + threshold: "0.5" # Scale up when TTFT p95 > 500ms (0.5 seconds) + activationThreshold: "0.1" # Start scaling when TTFT > 100ms +``` + +Apply it: + +```bash +kubectl apply -f sglang-agg-decode-scaler.yaml +``` + +### Verify KEDA Scaling + +```bash +# Check ScaledObject status +kubectl get scaledobject -n default + +# KEDA creates an HPA under the hood - you can see it +kubectl get hpa -n default + +# Example output: +# NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS +# keda-hpa-sglang-agg-decode-scaler DynamoGraphDeploymentScalingAdapter/sglang-agg-decode 45m/500m 1 10 1 + +# Get detailed status +kubectl describe scaledobject sglang-agg-decode-scaler -n default +``` + +### Example: Scale Based on Queue Depth + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: sglang-agg-decode-queue-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode + minReplicaCount: 1 + maxReplicaCount: 10 + pollingInterval: 15 + cooldownPeriod: 60 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + metricName: dynamo_queued_requests + query: | + sum(dynamo_frontend_queued_requests{dynamo_namespace="default-sglang-agg"}) + threshold: "10" # Scale up when queue > 10 requests +``` + +### How KEDA Works + +KEDA creates and manages an HPA under the hood: + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ You create: ScaledObject │ +│ - scaleTargetRef: sglang-agg-decode │ +│ - triggers: prometheus query │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ KEDA Operator automatically creates: HPA │ +│ - name: keda-hpa-sglang-agg-decode-scaler │ +│ - scaleTargetRef: sglang-agg-decode │ +│ - metrics: External (from KEDA metrics server) │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ DynamoGraphDeploymentScalingAdapter: sglang-agg-decode │ +│ - spec.replicas: updated by HPA │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ DynamoGraphDeployment: sglang-agg │ +│ - spec.services.decode.replicas: synced from adapter │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +## Mixed Autoscaling + +For disaggregated deployments (prefill + decode), you can use 
different autoscaling strategies for different services:
+
+```yaml
+---
+# HPA for Frontend (CPU-based)
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: sglang-agg-frontend-hpa
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-frontend
+  minReplicas: 1
+  maxReplicas: 5
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 70
+
+---
+# KEDA for Decode (TTFT-based)
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: sglang-agg-decode-scaler
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-decode
+  minReplicaCount: 1
+  maxReplicaCount: 10
+  triggers:
+  - type: prometheus
+    metadata:
+      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
+      query: |
+        histogram_quantile(0.95,
+          sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
+          by (le)
+        )
+      threshold: "0.5"
+```
+
+## Manual Scaling
+
+### With DGDSA Enabled (Default)
+
+When DGDSA is enabled (the default), scale via the adapter:
+
+```bash
+kubectl scale dgdsa sglang-agg-decode -n default --replicas=3
+```
+
+Verify the scaling:
+
+```bash
+kubectl get dgdsa sglang-agg-decode -n default
+
+# Output:
+# NAME                DGD          SERVICE   REPLICAS   AGE
+# sglang-agg-decode   sglang-agg   decode    3          10m
+```
+
+> **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle.
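+
+For illustration, the same change can be made by patching the adapter's scale subresource directly, which is what autoscalers do under the hood (an equivalent sketch, not an additional step; requires kubectl v1.24 or newer):
+
+```bash
+# Patch spec.replicas through the /scale subresource of the adapter
+kubectl patch dgdsa sglang-agg-decode -n default \
+  --subresource=scale --type=merge -p '{"spec":{"replicas":3}}'
+```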
+
+### With DGDSA Disabled
+
+If you've disabled the scaling adapter for a service, edit the DGD directly:
+
+```bash
+kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
+```
+
+Or edit the YAML:
+
+```yaml
+spec:
+  services:
+    decode:
+      replicas: 3
+      scalingAdapter:
+        disable: true
+```
+
+## Best Practices
+
+### 1. Choose One Autoscaler Per Service
+
+Avoid configuring multiple autoscalers for the same service:
+
+| Configuration | Status |
+|---------------|--------|
+| HPA for frontend, Planner for prefill/decode | ✅ Good |
+| KEDA for all services | ✅ Good |
+| Planner only (default) | ✅ Good |
+| HPA + Planner both targeting decode | ❌ Bad - they will fight |
+
+### 2. Use Appropriate Metrics
+
+| Service Type | Recommended Metrics | Dynamo Metric |
+|--------------|---------------------|---------------|
+| Frontend | CPU utilization, request rate | `dynamo_frontend_requests_total` |
+| Prefill | Queue depth, TTFT | `dynamo_frontend_queued_requests`, `dynamo_frontend_time_to_first_token_seconds` |
+| Decode | KV cache utilization, ITL | `kvstats_gpu_cache_usage_percent`, `dynamo_frontend_inter_token_latency_seconds` |
+
+### 3. Configure Stabilization Windows
+
+Prevent thrashing with appropriate stabilization:
+
+```yaml
+# HPA
+behavior:
+  scaleDown:
+    stabilizationWindowSeconds: 300  # Wait 5 min before scaling down
+  scaleUp:
+    stabilizationWindowSeconds: 0    # Scale up immediately
+
+# KEDA
+spec:
+  cooldownPeriod: 300
+```
+
+### 4. Set Sensible Min/Max Replicas
+
+Always configure minimum and maximum replicas in your HPA/KEDA to prevent:
+- Scaling to zero (unless intentional)
+- Unbounded scaling that exhausts cluster resources
+
+## Troubleshooting
+
+### Adapters Not Created
+
+```bash
+# Check DGD status
+kubectl describe dgd sglang-agg -n default
+
+# Check operator logs
+kubectl logs -n dynamo-system deployment/dynamo-operator
+```
+
+### Scaling Not Working
+
+```bash
+# Check adapter status
+kubectl describe dgdsa sglang-agg-decode -n default
+
+# Check HPA/KEDA status
+kubectl describe hpa sglang-agg-decode-hpa -n default
+kubectl describe scaledobject sglang-agg-decode-scaler -n default
+
+# Verify metrics are available in the Kubernetes metrics API
+kubectl get --raw /apis/external.metrics.k8s.io/v1beta1
+```
+
+### Metrics Not Available
+
+If HPA/KEDA shows `<unknown>` for metrics:
+
+```bash
+# Check if Dynamo metrics are being scraped
+kubectl port-forward -n default svc/sglang-agg-frontend 8000:8000
+curl http://localhost:8000/metrics | grep dynamo_frontend
+
+# Example output:
+# dynamo_frontend_queued_requests{model="Qwen/Qwen3-0.6B"} 2
+# dynamo_frontend_inflight_requests{model="Qwen/Qwen3-0.6B"} 5
+
+# Verify Prometheus is scraping the metrics
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# Then query: dynamo_frontend_time_to_first_token_seconds_bucket
+
+# Check KEDA operator logs
+kubectl logs -n keda deployment/keda-operator
+```
+
+### Rapid Scaling Up and Down
+
+If you see unstable scaling:
+
+1. Check if multiple autoscalers are targeting the same adapter
+2. Increase `cooldownPeriod` in the KEDA ScaledObject
+3. Increase `stabilizationWindowSeconds` in the HPA behavior
+
+## References
+
+- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
+- [KEDA Documentation](https://keda.sh/)
+- [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter)
+- [Planner Documentation](../planner/sla_planner.md)
+- [Dynamo Metrics Reference](../observability/metrics.md)
+- [Prometheus and Grafana Setup](../observability/prometheus-grafana.md)