@@ -201,6 +201,18 @@ def _patch_service_for_injection(
201201 {"name" : "cuda-fault-lib" , "emptyDir" : {}}
202202 )
203203
204+ # Add hostPath volume for persistent fault marker (survives pod restarts on same node)
205+ # This simulates persistent hardware failure!
206+ service ["extraPodSpec" ]["volumes" ].append (
207+ {
208+ "name" : "node-fault-marker" ,
209+ "hostPath" : {
210+ "path" : "/var/lib/cuda-fault-test" ,
211+ "type" : "DirectoryOrCreate" ,
212+ },
213+ }
214+ )
215+
204216 # Add init container to decode base64
205217 if "initContainers" not in service ["extraPodSpec" ]:
206218 service ["extraPodSpec" ]["initContainers" ] = []
@@ -247,7 +259,7 @@ def _patch_service_for_injection(
247259 if vm .get ("name" ) != "cuda-fault-lib"
248260 ]
249261
250- # Add mount
262+ # Add mount for compiled library
251263 service ["extraPodSpec" ]["mainContainer" ]["volumeMounts" ].append (
252264 {
253265 "name" : "cuda-fault-lib" ,
@@ -256,8 +268,18 @@ def _patch_service_for_injection(
256268 }
257269 )
258270
271+ # Add mount for persistent fault marker (hostPath)
272+ service ["extraPodSpec" ]["mainContainer" ]["volumeMounts" ].append (
273+ {
274+ "name" : "node-fault-marker" ,
275+ "mountPath" : "/host-fault" ,
276+ "readOnly" : False , # Need write access
277+ }
278+ )
279+
259280 print (" ✓ Added init container to compile library" )
260281 print (" ✓ Added ConfigMap volume mount" )
282+ print (" ✓ Added hostPath volume for persistent fault marker" )
261283
262284 # Add node affinity to pin pods to target node (simulates real XID 79 behavior)
263285 if target_node and enable :
@@ -287,14 +309,15 @@ def _patch_service_for_injection(
287309 service ["extraPodSpec" ]["volumes" ] = [
288310 v
289311 for v in service ["extraPodSpec" ]["volumes" ]
290- if v .get ("name" ) not in ["cuda-fault-lib" , "cuda-fault-lib-source" ]
312+ if v .get ("name" )
313+ not in ["cuda-fault-lib" , "cuda-fault-lib-source" , "node-fault-marker" ]
291314 ]
292315
293316 if "volumeMounts" in service ["extraPodSpec" ].get ("mainContainer" , {}):
294317 service ["extraPodSpec" ]["mainContainer" ]["volumeMounts" ] = [
295318 vm
296319 for vm in service ["extraPodSpec" ]["mainContainer" ]["volumeMounts" ]
297- if vm .get ("name" ) != "cuda-fault-lib"
320+ if vm .get ("name" ) not in [ "cuda-fault-lib" , "node-fault-marker" ]
298321 ]
299322
300323 # Remove init container
@@ -323,6 +346,7 @@ def patch_deployment_env(
323346 use_configmap = True ,
324347 target_node = None ,
325348 xid_type = 79 ,
349+ passthrough_mode = False ,
326350):
327351 """Patch deployment to add/remove LD_PRELOAD environment variable.
328352
@@ -334,6 +358,8 @@ def patch_deployment_env(
334358 target_node: If provided, adds node affinity to pin pods to this node
335359 (simulates real XID where pods crash on the faulty node)
336360 xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
361+ passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0 (library loaded but disabled)
362+ Allows baseline testing before enabling faults via toggle
337363 """
338364 custom_api = client .CustomObjectsApi ()
339365 apps_api = client .AppsV1Api ()
@@ -385,9 +411,14 @@ def patch_deployment_env(
385411 # Prepare environment variables
386412 new_envs = []
387413 if enable :
414+ # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
415+ fault_enabled_value = "0" if passthrough_mode else "1"
388416 new_envs = [
389417 {"name" : "LD_PRELOAD" , "value" : lib_path },
390- {"name" : "CUDA_FAULT_INJECTION_ENABLED" , "value" : "1" },
418+ {
419+ "name" : "CUDA_FAULT_INJECTION_ENABLED" ,
420+ "value" : fault_enabled_value ,
421+ },
391422 {"name" : "CUDA_XID_TYPE" , "value" : str (xid_type )},
392423 ]
393424
@@ -400,6 +431,28 @@ def patch_deployment_env(
400431 available_services = list (services .keys ())
401432 print (f" → Available services: { available_services } " )
402433
434+ # Set aggressive update strategy when enabling (allow all pods to update at once)
435+ # This ensures all pods get CUDA faults, not just the first few
436+ if enable :
437+ if "updateStrategy" not in spec :
438+ spec ["updateStrategy" ] = {}
439+ if "rollingUpdate" not in spec ["updateStrategy" ]:
440+ spec ["updateStrategy" ]["rollingUpdate" ] = {}
441+
442+ # Allow all pods to be unavailable during update
443+ spec ["updateStrategy" ]["rollingUpdate" ]["maxUnavailable" ] = "100%"
444+ # Don't create surge pods
445+ spec ["updateStrategy" ]["rollingUpdate" ]["maxSurge" ] = 0
446+ print (" → Set update strategy: maxUnavailable=100%, maxSurge=0" )
447+ print (" (All pods will update simultaneously)" )
448+ else :
449+ # Reset update strategy to fixed 25%/25% values when disabling (overwrites any strategy set earlier, not the pre-injection one)
450+ if "updateStrategy" in spec :
451+ spec ["updateStrategy" ] = {
452+ "rollingUpdate" : {"maxUnavailable" : "25%" , "maxSurge" : "25%" }
453+ }
454+ print (" → Restored default update strategy (maxUnavailable=25%)" )
455+
403456 for service_name in services_to_patch :
404457 if service_name in services :
405458 print (f" → Patching service: { service_name } " )
@@ -465,6 +518,38 @@ def patch_deployment_env(
465518 print (f" Services patched: { ', ' .join (patched_services )} " )
466519 if use_configmap and enable :
467520 print (f" Library mounted at: { lib_path } " )
521+
522+ # Force restart all worker pods when enabling to apply changes immediately
523+ if enable :
524+ print (
525+ " → Force-deleting all worker pods to apply changes immediately..."
526+ )
527+ core_api = client .CoreV1Api ()
528+ try :
529+ worker_pods = core_api .list_namespaced_pod (
530+ namespace = namespace ,
531+ label_selector = f"nvidia.com/dynamo-graph-deployment-name={ deployment_name } ,nvidia.com/dynamo-component-type=worker" ,
532+ )
533+ deleted_count = 0
534+ for pod in worker_pods .items :
535+ try :
536+ core_api .delete_namespaced_pod (
537+ name = pod .metadata .name ,
538+ namespace = namespace ,
539+ grace_period_seconds = 0 ,
540+ )
541+ deleted_count += 1
542+ except Exception as e :
543+ print (
544+ f" ⚠ Could not delete pod { pod .metadata .name } : { e } "
545+ )
546+ print (
547+ f" ✓ Deleted { deleted_count } pod(s) - they will restart with CUDA library"
548+ )
549+ except Exception as e :
550+ print (f" ⚠ Could not list/delete pods: { e } " )
551+ print (" Pods will eventually restart, but may take longer" )
552+
468553 return True
469554
470555 except ApiException as e :
@@ -505,11 +590,15 @@ def patch_deployment_env(
505590
506591 if enable :
507592 # Add new env vars
593+ # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
594+ fault_enabled_value = "0" if passthrough_mode else "1"
508595 container .env .append (
509596 client .V1EnvVar (name = "LD_PRELOAD" , value = "/tmp/cuda_intercept.so" )
510597 )
511598 container .env .append (
512- client .V1EnvVar (name = "CUDA_FAULT_INJECTION_ENABLED" , value = "1" )
599+ client .V1EnvVar (
600+ name = "CUDA_FAULT_INJECTION_ENABLED" , value = fault_enabled_value
601+ )
513602 )
514603 container .env .append (
515604 client .V1EnvVar (name = "CUDA_XID_TYPE" , value = str (xid_type ))
0 commit comments