Skip to content

Commit 9573d34

Browse files
feat: make planner use DGD Scaling Adapters (#4825)
Signed-off-by: Julien Mancuso <[email protected]>
1 parent 01bfbea commit 9573d34

File tree

3 files changed

+139
-6
lines changed

3 files changed

+139
-6
lines changed

components/src/dynamo/planner/kube.py

Lines changed: 55 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -78,11 +78,48 @@ def get_graph_deployment(self, graph_deployment_name: str) -> dict:
7878
)
7979
raise
8080

81-
def update_graph_replicas(
82-
self, graph_deployment_name: str, component_name: str, replicas: int
81+
def update_service_replicas(
82+
self, graph_deployment_name: str, service_name: str, replicas: int
83+
) -> None:
84+
"""
85+
Update replicas for a service using Scale subresource when DGDSA exists.
86+
Falls back to DGD patch for backward compatibility with older operators.
87+
88+
Args:
89+
graph_deployment_name: Name of the DynamoGraphDeployment
90+
service_name: Name of the service in DGD.spec.services
91+
replicas: Desired number of replicas
92+
"""
93+
# DGDSA naming convention: <dgd-name>-<lowercase-service-name>
94+
adapter_name = f"{graph_deployment_name}-{service_name.lower()}"
95+
96+
try:
97+
# Try to scale via DGDSA Scale subresource
98+
self.custom_api.patch_namespaced_custom_object_scale(
99+
group="nvidia.com",
100+
version="v1alpha1",
101+
namespace=self.current_namespace,
102+
plural="dynamographdeploymentscalingadapters",
103+
name=adapter_name,
104+
body={"spec": {"replicas": replicas}},
105+
)
106+
logger.info(f"Scaled DGDSA {adapter_name} to {replicas} replicas")
107+
108+
except client.ApiException as e:
109+
if e.status == 404:
110+
# DGDSA doesn't exist - fall back to DGD patch (old operator)
111+
logger.info(
112+
f"DGDSA {adapter_name} not found, falling back to DGD update"
113+
)
114+
self._update_dgd_replicas(graph_deployment_name, service_name, replicas)
115+
else:
116+
raise
117+
118+
def _update_dgd_replicas(
119+
self, graph_deployment_name: str, service_name: str, replicas: int
83120
) -> None:
84-
"""Update the replicas count for a component in a DynamoGraphDeployment"""
85-
patch = {"spec": {"services": {component_name: {"replicas": replicas}}}}
121+
"""Update replicas directly in DGD (fallback for old operators)"""
122+
patch = {"spec": {"services": {service_name: {"replicas": replicas}}}}
86123
self.custom_api.patch_namespaced_custom_object(
87124
group="nvidia.com",
88125
version="v1alpha1",
@@ -91,6 +128,20 @@ def update_graph_replicas(
91128
name=graph_deployment_name,
92129
body=patch,
93130
)
131+
logger.info(
132+
f"Updated DGD {graph_deployment_name} service {service_name} to {replicas} replicas"
133+
)
134+
135+
def update_graph_replicas(
136+
self, graph_deployment_name: str, component_name: str, replicas: int
137+
) -> None:
138+
"""
139+
Update replicas for a service. Now uses DGDSA when available.
140+
141+
Deprecated: Use update_service_replicas() instead for clarity.
142+
This method is kept for backward compatibility.
143+
"""
144+
self.update_service_replicas(graph_deployment_name, component_name, replicas)
94145

95146
def is_deployment_ready(self, deployment: dict) -> bool:
96147
"""Check if a graph deployment is ready"""

deploy/cloud/helm/platform/components/operator/templates/planner.yaml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,9 @@ rules:
3939
- apiGroups: ["nvidia.com"]
4040
resources: ["dynamocomponentdeployments", "dynamographdeployments"]
4141
verbs: ["get", "list", "create", "update", "patch"]
42+
- apiGroups: ["nvidia.com"]
43+
resources: ["dynamographdeploymentscalingadapters/scale"]
44+
verbs: ["patch"]
4245
---
4346
apiVersion: rbac.authorization.k8s.io/v1
4447
kind: RoleBinding
@@ -68,4 +71,7 @@ rules:
6871
- apiGroups: ["nvidia.com"]
6972
resources: ["dynamocomponentdeployments", "dynamographdeployments"]
7073
verbs: ["get", "list", "create", "update", "patch"]
71-
{{- end }}
74+
- apiGroups: ["nvidia.com"]
75+
resources: ["dynamographdeploymentscalingadapters/scale"]
76+
verbs: ["patch"]
77+
{{- end }}

tests/planner/unit/kube.py

Lines changed: 77 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,11 +76,87 @@ def test_get_graph_deployment_from_name(k8s_api, mock_custom_api):
7676
)
7777

7878

79-
def test_update_graph_replicas(k8s_api, mock_custom_api):
79+
def test_update_service_replicas_uses_dgdsa_scale(k8s_api, mock_custom_api):
80+
"""Test that update_service_replicas uses DGDSA Scale API when available"""
81+
mock_custom_api.patch_namespaced_custom_object_scale.return_value = None
82+
83+
k8s_api.update_service_replicas("test-deployment", "Frontend", 3)
84+
85+
# Should use Scale subresource with lowercase adapter name
86+
mock_custom_api.patch_namespaced_custom_object_scale.assert_called_once_with(
87+
group="nvidia.com",
88+
version="v1alpha1",
89+
namespace=k8s_api.current_namespace,
90+
plural="dynamographdeploymentscalingadapters",
91+
name="test-deployment-frontend", # lowercase service name
92+
body={"spec": {"replicas": 3}},
93+
)
94+
# Should NOT fall back to DGD patch
95+
mock_custom_api.patch_namespaced_custom_object.assert_not_called()
96+
97+
98+
def test_update_service_replicas_fallback_to_dgd(k8s_api, mock_custom_api):
99+
"""Test that update_service_replicas falls back to DGD when DGDSA not found"""
100+
# DGDSA doesn't exist (404)
101+
mock_custom_api.patch_namespaced_custom_object_scale.side_effect = (
102+
client.ApiException(status=404)
103+
)
80104
mock_custom_api.patch_namespaced_custom_object.return_value = None
81105

106+
k8s_api.update_service_replicas("test-deployment", "test-component", 1)
107+
108+
# Should have tried DGDSA first
109+
mock_custom_api.patch_namespaced_custom_object_scale.assert_called_once()
110+
111+
# Should fall back to DGD patch
112+
mock_custom_api.patch_namespaced_custom_object.assert_called_once_with(
113+
group="nvidia.com",
114+
version="v1alpha1",
115+
namespace=k8s_api.current_namespace,
116+
plural="dynamographdeployments",
117+
name="test-deployment",
118+
body={"spec": {"services": {"test-component": {"replicas": 1}}}},
119+
)
120+
121+
122+
def test_update_service_replicas_propagates_other_errors(k8s_api, mock_custom_api):
123+
"""Test that update_service_replicas propagates non-404 errors"""
124+
mock_custom_api.patch_namespaced_custom_object_scale.side_effect = (
125+
client.ApiException(status=500, reason="Internal Server Error")
126+
)
127+
128+
with pytest.raises(client.ApiException) as exc_info:
129+
k8s_api.update_service_replicas("test-deployment", "test-component", 1)
130+
131+
assert exc_info.value.status == 500
132+
# Should NOT fall back to DGD
133+
mock_custom_api.patch_namespaced_custom_object.assert_not_called()
134+
135+
136+
def test_update_graph_replicas_calls_update_service_replicas(k8s_api, mock_custom_api):
137+
"""Test that deprecated update_graph_replicas calls update_service_replicas"""
138+
mock_custom_api.patch_namespaced_custom_object_scale.return_value = None
139+
140+
# Use the deprecated method
82141
k8s_api.update_graph_replicas("test-deployment", "test-component", 1)
83142

143+
# Should delegate to update_service_replicas which uses Scale API
144+
mock_custom_api.patch_namespaced_custom_object_scale.assert_called_once_with(
145+
group="nvidia.com",
146+
version="v1alpha1",
147+
namespace=k8s_api.current_namespace,
148+
plural="dynamographdeploymentscalingadapters",
149+
name="test-deployment-test-component",
150+
body={"spec": {"replicas": 1}},
151+
)
152+
153+
154+
def test_update_dgd_replicas_directly(k8s_api, mock_custom_api):
155+
"""Test the internal _update_dgd_replicas method"""
156+
mock_custom_api.patch_namespaced_custom_object.return_value = None
157+
158+
k8s_api._update_dgd_replicas("test-deployment", "test-component", 1)
159+
84160
mock_custom_api.patch_namespaced_custom_object.assert_called_once_with(
85161
group="nvidia.com",
86162
version="v1alpha1",

0 commit comments

Comments
 (0)