Skip to content

Commit e10319f

Browse files
feat(fault-injection): Add GPU fault injector agent (#4043)
Signed-off-by: Oviya Seeniraj <[email protected]> Signed-off-by: Harrison Saturley-Hall <[email protected]> Signed-off-by: Harrison King Saturley-Hall <[email protected]> Co-authored-by: Harrison Saturley-Hall <[email protected]>
1 parent 39a9d0b commit e10319f

File tree

4 files changed

+635
-0
lines changed

4 files changed

+635
-0
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# GPU Fault Injector Agent - XID injection (e.g. XID 79) via nsenter+kmsg
6+
# Runs as privileged DaemonSet on GPU nodes to inject XID errors
7+
#
8+
# NOTE: GPU nodes are AMD64/x86_64 architecture
9+
# Build with: docker buildx build --platform linux/amd64 --load -t <image> .
10+
11+
FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04

# Install system dependencies used by the agent:
#   util-linux -> nsenter, systemd -> journalctl, curl -> HEALTHCHECK below,
#   kmod / pciutils -> kernel-module and PCI inspection tools.
# NOTE(review): nvidia-smi is not provided by any package listed here; it is
# presumably supplied by the base image / NVIDIA container runtime - confirm.
#
# DEBIAN_FRONTEND is attached to `apt-get install` itself: a VAR=value prefix
# only applies to the single command it precedes, so putting it in front of
# `apt-get update` left the install step without the noninteractive setting.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        curl \
        util-linux \
        systemd \
        kmod \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Install Python packages (copied separately so this layer is cached until
# requirements.txt changes)
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt

# Create working directory
WORKDIR /app

# Copy agent code
COPY agent.py /app/
COPY gpu_xid_injector.py /app/

# Create log directory
RUN mkdir -p /var/log/gpu-fault-injector

# Set environment (unbuffered stdout/stderr so logs show up immediately)
ENV PYTHONUNBUFFERED=1

# Expose port (must match the port agent.py passes to uvicorn)
EXPOSE 8083

# Health check against the agent's /health endpoint
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:8083/health || exit 1

# Run agent
CMD ["python3", "agent.py"]
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
"""
6+
GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes.
7+
8+
This agent provides privileged access for XID error injection:
9+
- XID injection via nsenter+kmsg (writes to host's /dev/kmsg)
10+
- Triggers NVSentinel syslog-health-monitor detection
11+
- Initiates complete fault tolerance workflow
12+
13+
Accepts ANY XID error code for testing flexibility.
14+
Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
15+
- Devastating: 79, 74, 48, 94, 95, 119, 120, 140
16+
- Memory: 31, 32, 43, 63, 64
17+
- PCIe: 38, 39, 42
18+
- Thermal: 60, 61, 62
19+
- Power: 54, 56, 57
20+
- Graphics: 13, 45, 69
21+
22+
Unknown XIDs use generic error message format.
23+
NVSentinel detects XIDs and handles actions based on its own rules.
24+
See gpu_xid_injector.py for complete XID descriptions.
25+
"""
26+
27+
import logging
28+
import os
29+
import subprocess
30+
from datetime import datetime, timezone
31+
from typing import Any, Optional, Type
32+
33+
import uvicorn
34+
from fastapi import FastAPI, HTTPException
35+
from pydantic import BaseModel
36+
37+
# Configure logging
38+
# Process-wide logging: INFO and above, timestamped, tagged with logger name.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The kernel-level XID injector lives in a sibling module and is treated as
# optional: when it cannot be imported the name stays bound to None and the
# availability flag is False, so the rest of the module can still load.
GPUXIDInjectorKernel: Optional[Type[Any]] = None
try:
    from gpu_xid_injector import GPUXIDInjectorKernel  # type: ignore[assignment]
except ImportError:
    logger.warning("Kernel-level XID injector not available")
    KERNEL_XID_AVAILABLE = False
else:
    KERNEL_XID_AVAILABLE = True
52+
53+
54+
# ============================================================================
55+
# Models and Enums
56+
# ============================================================================
57+
58+
59+
class XIDInjectRequest(BaseModel):
    """Request model for XID error injection via nsenter+kmsg"""

    # Caller-supplied identifier for this injection.
    fault_id: str

    # XID error code to inject.
    xid_type: int

    # Index of the target GPU; defaults to GPU 0 when omitted.
    gpu_id: int = 0

    # Optional duration for the fault.
    # NOTE(review): units are not stated in this module - confirm with callers.
    duration: Optional[int] = None
66+
67+
68+
# ============================================================================
69+
# GPU Fault Injector
70+
# ============================================================================
71+
72+
73+
class GPUFaultInjector:
    """
    Node-local state and helpers for the GPU fault injector agent.

    Constructing an instance probes the host once: it checks whether ``dcgmi``
    runs successfully, asks ``nvidia-smi`` for the GPU count, and - when the
    optional ``gpu_xid_injector`` module was importable - creates the
    kernel-level XID injector.

    Attributes:
        active_faults: Mapping of fault id to a dict describing the fault.
            Starts empty.
        node_name: Value of the ``NODE_NAME`` environment variable, or
            ``"unknown"`` when it is not set.
        dcgm_available: Result of the ``dcgmi discovery -l`` probe.
        gpu_count: GPU count reported by ``nvidia-smi`` (0 when unknown).
        kernel_xid_injector: ``GPUXIDInjectorKernel`` instance, or ``None``.
        kernel_xid_available: The injector's ``privileged`` attribute, or
            ``False`` when no injector could be created.
    """

    def __init__(self):
        self.active_faults: dict[str, dict[str, Any]] = {}
        self.node_name = os.getenv("NODE_NAME", "unknown")
        self.dcgm_available = self._check_dcgm()
        self.gpu_count = self._get_gpu_count()

        # Initialize kernel-level XID injector (nsenter+kmsg). Failure here is
        # deliberately non-fatal: the agent still starts and reports the
        # injector as unavailable.
        self.kernel_xid_injector = None
        self.kernel_xid_available = False
        if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel is not None:
            try:
                self.kernel_xid_injector = GPUXIDInjectorKernel()
                self.kernel_xid_available = self.kernel_xid_injector.privileged
                logger.info(
                    "Kernel-level XID injector initialized (privileged: %s)",
                    self.kernel_xid_available,
                )
            except Exception as e:
                logger.warning("Kernel XID injector not available: %s", e)

        logger.info("GPU Fault Injector initialized on node: %s", self.node_name)
        logger.info("DCGM available: %s", self.dcgm_available)
        logger.info("GPU count: %s", self.gpu_count)
        logger.info(
            "XID 79 injection (nsenter+kmsg): %s", self.kernel_xid_available
        )

    def _check_dcgm(self) -> bool:
        """Return True when ``dcgmi discovery -l`` exits with status 0.

        Any failure to run the probe (missing binary, timeout, ...) is logged
        as a warning and reported as "not available".
        """
        try:
            result = subprocess.run(
                ["dcgmi", "discovery", "-l"], capture_output=True, text=True, timeout=5
            )
            return result.returncode == 0
        except Exception as e:
            # Best-effort probe: never let it prevent the agent from starting.
            logger.warning("DCGM not available: %s", e)
            return False

    def _get_gpu_count(self) -> int:
        """Return the GPU count reported by ``nvidia-smi``, or 0 on failure.

        The first line of the query output is parsed as an integer. A non-zero
        exit status, a missing binary, a timeout or unparsable output all
        yield 0.
        """
        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if result.returncode == 0:
                return int(result.stdout.strip().split("\n")[0])
            # Previously this path was silent, which made "0 GPUs" hard to
            # tell apart from "nvidia-smi failed".
            logger.warning(
                "nvidia-smi exited with code %s; reporting 0 GPUs",
                result.returncode,
            )
            return 0
        except Exception as e:
            logger.error("Failed to get GPU count: %s", e)
            return 0

    def _run_command(self, command: list[str], timeout: int = 30) -> tuple[bool, str]:
        """Run a command (argument list, no shell) with a timeout.

        Args:
            command: Program and arguments, passed directly to
                ``subprocess.run``.
            timeout: Seconds to wait before giving up.

        Returns:
            ``(True, stdout)`` when the command exits with status 0.
            ``(False, text)`` otherwise, where ``text`` is the stripped stderr,
            falling back to the stripped stdout when stderr is empty so that
            tools which print their errors to stdout still yield a diagnostic.
            ``(False, "Command timed out")`` on timeout, and
            ``(False, str(exc))`` when the command could not be run at all.
        """
        try:
            result = subprocess.run(
                command, capture_output=True, text=True, timeout=timeout
            )
        except subprocess.TimeoutExpired:
            return False, "Command timed out"
        except Exception as e:
            return False, str(e)

        if result.returncode == 0:
            return True, result.stdout.strip()
        return False, result.stderr.strip() or result.stdout.strip()
145+
146+
147+
# ============================================================================
148+
# FastAPI Application
149+
# ============================================================================
150+
151+
# ASGI application object served by uvicorn.
app = FastAPI(
    title="GPU Fault Injector Agent",
    version="1.0.0",
)

# Single module-level injector shared by every endpoint. Note that creating it
# runs the dcgmi / nvidia-smi probes at import time.
injector = GPUFaultInjector()
153+
154+
155+
@app.get("/health")
async def health_check():
    """Health check endpoint"""
    # Always reports "healthy"; the remaining fields are a snapshot of what
    # the shared injector recorded at startup plus the tracked-fault count.
    snapshot = {
        "status": "healthy",
        "node": injector.node_name,
        "gpu_count": injector.gpu_count,
        "dcgm_available": injector.dcgm_available,
        "active_faults": len(injector.active_faults),
    }
    return snapshot
165+
166+
167+
@app.post("/inject-xid")
async def inject_xid(request: XIDInjectRequest):
    """
    Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection).

    Accepts any XID error code (1-1000) for maximum testing flexibility.

    Pre-defined messages for all DCGM/NVSentinel monitored XIDs:

    Devastating (always FAIL):
    - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors
    - 119/120: GSP errors | 140: ECC unrecovered

    Subsystem (may WARN/escalate):
    - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement)
    - PCIe: 38, 39, 42 (bus, fabric, replay rate)
    - Thermal: 60, 61, 62 (temperature limits)
    - Power: 54, 56, 57 (power/clock state)
    - Graphics: 13, 45, 69 (SM exceptions)

    Unknown XIDs use generic error message - NVSentinel will parse and handle
    based on its own XID database.

    Responses:
    - 200: fault injected; the fault is recorded under `fault_id`
    - 400: `xid_type` outside 1-1000, or negative `gpu_id`
    - 503: kernel-level injector missing or not privileged
    - 500: the injector reported a failure
    """
    logger.info(
        f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}"
    )

    # Validate XID type is a reasonable integer (basic sanity check)
    xid_in_range = (
        isinstance(request.xid_type, int) and 1 <= request.xid_type <= 1000
    )
    if not xid_in_range:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Invalid XID type: {request.xid_type}. "
                f"XID must be an integer between 1-1000. "
                f"Common XIDs: 79 (bus error), 74 (NVLink), 48/94/95 (ECC errors)."
            ),
        )

    # A negative index can never name a GPU; reject it here instead of handing
    # it to the injector. Upper-bound checking is left to the injector itself.
    if request.gpu_id < 0:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Invalid GPU id: {request.gpu_id}. "
                f"GPU id must be a non-negative integer."
            ),
        )

    if not injector.kernel_xid_available or not injector.kernel_xid_injector:
        raise HTTPException(
            status_code=503,
            detail=f"Kernel-level XID injector not available. XID {request.xid_type} requires privileged access to syslog/kmsg.",
        )

    # Use the generic inject_xid method which supports multiple XID types
    success, message = injector.kernel_xid_injector.inject_xid(
        xid_type=request.xid_type, gpu_id=request.gpu_id
    )

    if not success:
        raise HTTPException(status_code=500, detail=message)

    # Take the timestamp once so the tracked record and the response agree.
    injected_at = datetime.now(timezone.utc).isoformat()

    # Track the fault (an existing entry with the same fault_id is replaced)
    injector.active_faults[request.fault_id] = {
        "type": f"xid_{request.xid_type}",
        "gpu_id": request.gpu_id,
        "timestamp": injected_at,
    }

    return {
        "status": "injected",
        "node": injector.node_name,
        "fault_id": request.fault_id,
        "xid_type": request.xid_type,
        "gpu_id": request.gpu_id,
        "message": message,
        "timestamp": injected_at,
    }
239+
240+
241+
@app.get("/faults")
async def list_active_faults():
    """List active faults on this node"""
    # Only the fault ids are exposed, not the per-fault detail dicts.
    fault_ids = list(injector.active_faults)
    return {
        "node": injector.node_name,
        "active_faults": fault_ids,
        "count": len(fault_ids),
    }
249+
250+
251+
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 8083.
    uvicorn.run(app, host="0.0.0.0", port=8083, log_level="info")

0 commit comments

Comments
 (0)