
Commit 31163f7

Merge branch 'main' into bis/dep-681-add-agg-lora-tests

2 parents 4f6394d + 0ce7280

File tree

8 files changed: +396 −24 lines changed

.github/workflows/templates/akamai-eccu-flush.xslt

Lines changed: 16 additions & 1 deletion
@@ -1,4 +1,19 @@
-<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="akamai-eccu-flush.xslt"?>
+<!--
+Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
 <!--
 Akamai ECCU (Edge Content Control Utility) XML Generator

components/src/dynamo/planner/kube.py

Lines changed: 55 additions & 4 deletions
@@ -78,11 +78,48 @@ def get_graph_deployment(self, graph_deployment_name: str) -> dict:
             )
             raise

-    def update_graph_replicas(
-        self, graph_deployment_name: str, component_name: str, replicas: int
+    def update_service_replicas(
+        self, graph_deployment_name: str, service_name: str, replicas: int
+    ) -> None:
+        """
+        Update replicas for a service using Scale subresource when DGDSA exists.
+        Falls back to DGD patch for backward compatibility with older operators.
+
+        Args:
+            graph_deployment_name: Name of the DynamoGraphDeployment
+            service_name: Name of the service in DGD.spec.services
+            replicas: Desired number of replicas
+        """
+        # DGDSA naming convention: <dgd-name>-<lowercase-service-name>
+        adapter_name = f"{graph_deployment_name}-{service_name.lower()}"
+
+        try:
+            # Try to scale via DGDSA Scale subresource
+            self.custom_api.patch_namespaced_custom_object_scale(
+                group="nvidia.com",
+                version="v1alpha1",
+                namespace=self.current_namespace,
+                plural="dynamographdeploymentscalingadapters",
+                name=adapter_name,
+                body={"spec": {"replicas": replicas}},
+            )
+            logger.info(f"Scaled DGDSA {adapter_name} to {replicas} replicas")
+
+        except client.ApiException as e:
+            if e.status == 404:
+                # DGDSA doesn't exist - fall back to DGD patch (old operator)
+                logger.info(
+                    f"DGDSA {adapter_name} not found, falling back to DGD update"
+                )
+                self._update_dgd_replicas(graph_deployment_name, service_name, replicas)
+            else:
+                raise
+
+    def _update_dgd_replicas(
+        self, graph_deployment_name: str, service_name: str, replicas: int
     ) -> None:
-        """Update the replicas count for a component in a DynamoGraphDeployment"""
-        patch = {"spec": {"services": {component_name: {"replicas": replicas}}}}
+        """Update replicas directly in DGD (fallback for old operators)"""
+        patch = {"spec": {"services": {service_name: {"replicas": replicas}}}}
         self.custom_api.patch_namespaced_custom_object(
             group="nvidia.com",
             version="v1alpha1",
@@ -91,6 +128,20 @@ def update_graph_replicas(
             name=graph_deployment_name,
             body=patch,
         )
+        logger.info(
+            f"Updated DGD {graph_deployment_name} service {service_name} to {replicas} replicas"
+        )
+
+    def update_graph_replicas(
+        self, graph_deployment_name: str, component_name: str, replicas: int
+    ) -> None:
+        """
+        Update replicas for a service. Now uses DGDSA when available.
+
+        Deprecated: Use update_service_replicas() instead for clarity.
+        This method is kept for backward compatibility.
+        """
+        self.update_service_replicas(graph_deployment_name, component_name, replicas)

     def is_deployment_ready(self, deployment: dict) -> bool:
         """Check if a graph deployment is ready"""

components/src/dynamo/trtllm/request_handlers/handler_base.py

Lines changed: 99 additions & 0 deletions
@@ -106,6 +106,76 @@ def check_error(self, result: dict):
             result["finish_reason"] == "stop" or result["finish_reason"] == "error"
         )

+    @staticmethod
+    def _extract_logprobs(
+        output, num_output_tokens_so_far: int
+    ) -> tuple[list[float] | None, list[list[dict]] | None]:
+        """
+        Extract logprobs from the TRTLLM output for new tokens.
+
+        Args:
+            output: TRTLLM CompletionOutput object
+            num_output_tokens_so_far: Number of tokens already processed
+        Returns:
+            Tuple of (log_probs, top_logprobs) in Dynamo's expected format:
+            - log_probs: List of log probabilities for each new token
+            - top_logprobs: List of top logprobs dicts for each new token
+        """
+        if output.logprobs is None:
+            return None, None
+
+        # Get logprobs for new tokens only
+        new_logprobs = output.logprobs[num_output_tokens_so_far:]
+        if not new_logprobs:
+            return None, None
+
+        # From TRTLLM CompletionOutput API, logprobs: (TokenLogprobs | List[float], optional)
+        # Expect TokenLogprobs output when logprobs is set, check edge case where list[float] is returned instead
+        if isinstance(new_logprobs[0], float):
+            return [float(lp) for lp in new_logprobs], None
+
+        log_probs = []
+        top_logprobs = []
+
+        for token_idx, token_logprobs_dict in enumerate(new_logprobs):
+            if token_logprobs_dict is None:
+                continue
+
+            # Get the actual token_id that was generated at this position
+            actual_token_id = output.token_ids[num_output_tokens_so_far + token_idx]
+
+            # Extract log probability for the selected token
+            if actual_token_id in token_logprobs_dict:
+                selected_logprob = token_logprobs_dict[actual_token_id]
+                log_probs.append(float(selected_logprob.logprob))
+            else:
+                # Fallback: use the first logprob if selected token not found
+                first_logprob = next(iter(token_logprobs_dict.values()), None)
+                if first_logprob:
+                    log_probs.append(float(first_logprob.logprob))
+
+            # Build top_logprobs list for this token position
+            # NOTE: TRTLLM LogProb API doesn't have decoded_token, will default to None
+            token_top_logprobs = []
+            for tok_id, logprob_info in token_logprobs_dict.items():
+                token_top_logprobs.append(
+                    {
+                        "rank": logprob_info.rank
+                        if hasattr(logprob_info, "rank")
+                        else 0,
+                        "token_id": tok_id,
+                        "token": (
+                            logprob_info.decoded_token
+                            if hasattr(logprob_info, "decoded_token")
+                            else None
+                        ),
+                        "logprob": float(logprob_info.logprob),
+                    }
+                )
+            top_logprobs.append(token_top_logprobs)
+
+        return log_probs if log_probs else None, top_logprobs if top_logprobs else None
+
     async def _handle_cancellation(
         self, generation_result: GenerationResult, context: Context
     ):
@@ -236,6 +306,26 @@ async def generate_locally(
             if hasattr(sampling_params, key):
                 setattr(sampling_params, key, value)

+        # Additional sampling params in output options
+        output_options = request.get("output_options", {})
+        if output_options:
+            logprobs_value = output_options.get("logprobs")
+
+            # Handle logprobs
+            if logprobs_value is not None:
+                if hasattr(sampling_params, "logprobs"):
+                    setattr(
+                        sampling_params, "logprobs", max(1, int(logprobs_value))
+                    )  # If top_logprobs = 0, still want to see chosen token logprob
+
+            # Handle prompt_logprobs
+            prompt_logprobs_value = output_options.get("prompt_logprobs")
+            if prompt_logprobs_value:
+                if hasattr(sampling_params, "prompt_logprobs"):
+                    setattr(
+                        sampling_params, "prompt_logprobs", int(prompt_logprobs_value)
+                    )
+
         max_tokens = request["stop_conditions"]["max_tokens"]
         if max_tokens:
             sampling_params.max_tokens = max_tokens
@@ -302,6 +392,15 @@

         out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}

+        # Extract logprobs from the output
+        log_probs, top_logprobs = self._extract_logprobs(
+            output, num_output_tokens_so_far
+        )
+        if log_probs:
+            out["log_probs"] = log_probs
+        if top_logprobs:
+            out["top_logprobs"] = top_logprobs
+
         if output.finish_reason:
             out["finish_reason"] = output.finish_reason
         if output.stop_reason:
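
To make the conversion concrete, here is a small worked sketch of _extract_logprobs using stand-in objects in place of TRTLLM's CompletionOutput and Logprob types; the HandlerBase class name and the SimpleNamespace stand-ins are assumptions for illustration, not part of the commit:

# Worked sketch with stand-in objects (SimpleNamespace mimics the shape of
# TRTLLM's Logprob entries; HandlerBase is an assumed class name).
from types import SimpleNamespace

output = SimpleNamespace(
    token_ids=[11, 42],  # token 11 was already emitted; 42 is the new token
    logprobs=[
        None,  # position 0: already processed, sliced away inside the helper
        {
            42: SimpleNamespace(logprob=-0.1, rank=1),
            7: SimpleNamespace(logprob=-2.3, rank=2),
        },
    ],
)

log_probs, top_logprobs = HandlerBase._extract_logprobs(output, 1)
# log_probs    -> [-0.1]  (chosen token 42 at the single new position)
# top_logprobs -> [[{"rank": 1, "token_id": 42, "token": None, "logprob": -0.1},
#                   {"rank": 2, "token_id": 7, "token": None, "logprob": -2.3}]]
# "token" is None because the stand-in Logprob has no decoded_token attribute.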

deploy/cloud/helm/platform/components/operator/templates/planner.yaml

Lines changed: 7 additions & 1 deletion
@@ -39,6 +39,9 @@ rules:
 - apiGroups: ["nvidia.com"]
   resources: ["dynamocomponentdeployments", "dynamographdeployments"]
   verbs: ["get", "list", "create", "update", "patch"]
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamographdeploymentscalingadapters/scale"]
+  verbs: ["patch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
@@ -68,4 +71,7 @@ rules:
 - apiGroups: ["nvidia.com"]
   resources: ["dynamocomponentdeployments", "dynamographdeployments"]
   verbs: ["get", "list", "create", "update", "patch"]
-{{- end }}
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamographdeploymentscalingadapters/scale"]
+  verbs: ["patch"]
+{{- end }}

tests/planner/unit/kube.py

Lines changed: 77 additions & 1 deletion
@@ -76,11 +76,87 @@ def test_get_graph_deployment_from_name(k8s_api, mock_custom_api):
     )


-def test_update_graph_replicas(k8s_api, mock_custom_api):
+def test_update_service_replicas_uses_dgdsa_scale(k8s_api, mock_custom_api):
+    """Test that update_service_replicas uses DGDSA Scale API when available"""
+    mock_custom_api.patch_namespaced_custom_object_scale.return_value = None
+
+    k8s_api.update_service_replicas("test-deployment", "Frontend", 3)
+
+    # Should use Scale subresource with lowercase adapter name
+    mock_custom_api.patch_namespaced_custom_object_scale.assert_called_once_with(
+        group="nvidia.com",
+        version="v1alpha1",
+        namespace=k8s_api.current_namespace,
+        plural="dynamographdeploymentscalingadapters",
+        name="test-deployment-frontend",  # lowercase service name
+        body={"spec": {"replicas": 3}},
+    )
+    # Should NOT fall back to DGD patch
+    mock_custom_api.patch_namespaced_custom_object.assert_not_called()
+
+
+def test_update_service_replicas_fallback_to_dgd(k8s_api, mock_custom_api):
+    """Test that update_service_replicas falls back to DGD when DGDSA not found"""
+    # DGDSA doesn't exist (404)
+    mock_custom_api.patch_namespaced_custom_object_scale.side_effect = (
+        client.ApiException(status=404)
+    )
     mock_custom_api.patch_namespaced_custom_object.return_value = None

+    k8s_api.update_service_replicas("test-deployment", "test-component", 1)
+
+    # Should have tried DGDSA first
+    mock_custom_api.patch_namespaced_custom_object_scale.assert_called_once()
+
+    # Should fall back to DGD patch
+    mock_custom_api.patch_namespaced_custom_object.assert_called_once_with(
+        group="nvidia.com",
+        version="v1alpha1",
+        namespace=k8s_api.current_namespace,
+        plural="dynamographdeployments",
+        name="test-deployment",
+        body={"spec": {"services": {"test-component": {"replicas": 1}}}},
+    )
+
+
+def test_update_service_replicas_propagates_other_errors(k8s_api, mock_custom_api):
+    """Test that update_service_replicas propagates non-404 errors"""
+    mock_custom_api.patch_namespaced_custom_object_scale.side_effect = (
+        client.ApiException(status=500, reason="Internal Server Error")
+    )
+
+    with pytest.raises(client.ApiException) as exc_info:
+        k8s_api.update_service_replicas("test-deployment", "test-component", 1)
+
+    assert exc_info.value.status == 500
+    # Should NOT fall back to DGD
+    mock_custom_api.patch_namespaced_custom_object.assert_not_called()
+
+
+def test_update_graph_replicas_calls_update_service_replicas(k8s_api, mock_custom_api):
+    """Test that deprecated update_graph_replicas calls update_service_replicas"""
+    mock_custom_api.patch_namespaced_custom_object_scale.return_value = None
+
+    # Use the deprecated method
     k8s_api.update_graph_replicas("test-deployment", "test-component", 1)

+    # Should delegate to update_service_replicas which uses Scale API
+    mock_custom_api.patch_namespaced_custom_object_scale.assert_called_once_with(
+        group="nvidia.com",
+        version="v1alpha1",
+        namespace=k8s_api.current_namespace,
+        plural="dynamographdeploymentscalingadapters",
+        name="test-deployment-test-component",
+        body={"spec": {"replicas": 1}},
+    )
+
+
+def test_update_dgd_replicas_directly(k8s_api, mock_custom_api):
+    """Test the internal _update_dgd_replicas method"""
+    mock_custom_api.patch_namespaced_custom_object.return_value = None
+
+    k8s_api._update_dgd_replicas("test-deployment", "test-component", 1)
+
     mock_custom_api.patch_namespaced_custom_object.assert_called_once_with(
         group="nvidia.com",
         version="v1alpha1",

tests/serve/test_trtllm.py

Lines changed: 32 additions & 0 deletions
@@ -14,7 +14,10 @@
 )
 from tests.utils.engine_process import EngineConfig
 from tests.utils.payload_builder import (
+    TEXT_PROMPT,
+    chat_payload,
     chat_payload_default,
+    completion_payload,
     completion_payload_default,
     metric_payload_default,
     multimodal_payload_default,
@@ -91,6 +94,34 @@ class TRTLLMConfig(EngineConfig):
             metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"),
         ],
     ),
+    "aggregated_logprobs": TRTLLMConfig(
+        name="aggregated_logprobs",
+        directory=trtllm_dir,
+        script_name="agg.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
+        model="Qwen/Qwen3-0.6B",
+        models_port=8000,
+        request_payloads=[
+            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
+            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
+            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
+            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
+        ],
+    ),
+    "disaggregated_logprobs": TRTLLMConfig(
+        name="disaggregated_logprobs",
+        directory=trtllm_dir,
+        script_name="disagg.sh",
+        marks=[pytest.mark.gpu_2, pytest.mark.post_merge, pytest.mark.trtllm],
+        model="Qwen/Qwen3-0.6B",
+        models_port=8000,
+        request_payloads=[
+            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=5),
+            chat_payload(content=TEXT_PROMPT, logprobs=False, top_logprobs=5),
+            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=None),
+            chat_payload(content=TEXT_PROMPT, logprobs=True, top_logprobs=0),
+        ],
+    ),
     "aggregated_router": TRTLLMConfig(
         name="aggregated_router",
         directory=trtllm_dir,
@@ -159,6 +190,7 @@ class TRTLLMConfig(EngineConfig):
         },
         request_payloads=[
             completion_payload_default(),
+            completion_payload(prompt=TEXT_PROMPT, logprobs=3),
         ],
     ),
 }
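
For reference, the logprobs payloads above correspond to an OpenAI-style chat request along these lines; the URL, prompt text, and max_tokens are illustrative assumptions (the actual value of TEXT_PROMPT lives in tests.utils.payload_builder):

# Illustrative request matching chat_payload(content=TEXT_PROMPT, logprobs=True,
# top_logprobs=5); endpoint, prompt, and max_tokens are assumed values.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "Tell me about GPUs"}],
        "logprobs": True,
        "top_logprobs": 5,
        "max_tokens": 16,
    },
)
# Each returned choice should then carry per-token logprobs, with up to
# five alternatives per position.
print(resp.json()["choices"][0].get("logprobs"))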
