Skip to content

Commit 37d0fc8

Browse files
authored
fix: Use /proc/driver/nvidia/gpus to get PCI addresses (#4717)
Signed-off-by: [email protected] <[email protected]>
1 parent d7c11b6 commit 37d0fc8

File tree

1 file changed

+79
-22
lines changed
  • tests/fault_tolerance/hardware/fault_injection_service/agents/gpu_fault_injector

1 file changed

+79
-22
lines changed

tests/fault_tolerance/hardware/fault_injection_service/agents/gpu_fault_injector/gpu_xid_injector.py

Lines changed: 79 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,83 @@ def _check_privileged(self) -> bool:
153153
"""Check if we have privileged access (required for nsenter)"""
154154
return os.geteuid() == 0
155155

156+
def _get_pci_address_from_proc(
    self,
    gpu_id: int,
    *,
    proc_path: str = "/host/proc/driver/nvidia/gpus",
) -> str:
    """
    Get PCI address for GPU by reading the NVIDIA driver's procfs tree.

    This method works without nvidia-smi: it lists the per-GPU directories
    under ``proc_path`` (each directory name is used as the PCI address) and
    reads the "Device Minor:" line of each ``information`` file until one
    matches ``gpu_id``.

    Directory structure:
        /host/proc/driver/nvidia/gpus/
        ├── 0001:00:00.0/information  (Device Minor: 0 → GPU 0)
        ├── 0002:00:00.0/information  (Device Minor: 1 → GPU 1)
        └── ...

    Args:
        gpu_id: GPU device ID (0, 1, 2, ...), compared against Device Minor.
        proc_path: Root directory holding one sub-directory per GPU.
            Defaults to the host procfs mount used by the pod spec.

    Returns:
        PCI address, i.e. the matching directory name (e.g., "0001:00:00.0")

    Raises:
        FileNotFoundError: If ``proc_path`` does not exist or cannot be listed
        ValueError: If no readable GPU entry reports ``gpu_id`` as its
            Device Minor
    """
    # Check up front so a missing mount yields an actionable message.
    if not os.path.exists(proc_path):
        raise FileNotFoundError(
            f"{proc_path} not found. Ensure /host/proc is mounted in pod spec."
        )

    try:
        gpu_dirs = os.listdir(proc_path)
    except OSError as e:
        # PermissionError is an OSError subclass; keep the documented
        # exception type for callers but preserve the original cause.
        raise FileNotFoundError(f"Cannot access {proc_path}: {e}") from e
    logger.debug(
        f"Found {len(gpu_dirs)} GPU directories in {proc_path}: {gpu_dirs}"
    )

    minor_prefix = "Device Minor:"
    available_minors = []
    for pci_addr in gpu_dirs:
        info_file = os.path.join(proc_path, pci_addr, "information")

        # Only the file access is guarded here; parsing is handled separately
        # so each warning names the step that actually failed.
        try:
            with open(info_file, "r") as f:
                # Stop reading at the first "Device Minor:" line.
                minor_line = next(
                    (line for line in f if line.startswith(minor_prefix)),
                    None,
                )
        except OSError as e:
            logger.warning(f"Could not read {info_file}: {e}")
            continue

        if minor_line is None:
            # No Device Minor entry in this file; nothing to map.
            continue

        try:
            # Parse: "Device Minor: 0" → 0. The prefix check guarantees that
            # split(":") yields at least two fields.
            device_minor = int(minor_line.split(":")[1].strip())
        except ValueError as e:
            logger.warning(f"Could not parse Device Minor from {info_file}: {e}")
            continue

        available_minors.append(device_minor)
        if device_minor == gpu_id:
            logger.info(
                f"GPU {gpu_id} mapped to PCI {pci_addr} "
                f"via /proc (Device Minor: {device_minor})"
            )
            return pci_addr

    # GPU ID not found
    raise ValueError(
        f"GPU {gpu_id} not found in {proc_path}. "
        f"Available Device Minors: {sorted(available_minors)}"
    )
232+
156233
def _normalize_pci_address(self, pci_addr: str) -> str:
157234
"""
158235
Normalize PCI address from nvidia-smi format to kernel sysfs format.
@@ -247,28 +324,8 @@ def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
247324
message template for each XID type.
248325
"""
249326
try:
250-
# Get PCI address for the GPU
251-
pci_result = subprocess.run(
252-
[
253-
"nvidia-smi",
254-
"--query-gpu=pci.bus_id",
255-
"--format=csv,noheader",
256-
"-i",
257-
str(gpu_id),
258-
],
259-
capture_output=True,
260-
text=True,
261-
timeout=10,
262-
)
263-
264-
if pci_result.returncode != 0:
265-
return (
266-
False,
267-
f"Failed to get PCI address for GPU {gpu_id}: {pci_result.stderr}",
268-
)
269-
270-
pci_addr_full = pci_result.stdout.strip()
271-
pci_addr = self._normalize_pci_address(pci_addr_full)
327+
# Get PCI address using /proc method (works without nvidia-smi)
328+
pci_addr = self._get_pci_address_from_proc(gpu_id)
272329

273330
# Get appropriate error message for this XID type
274331
# If XID is known, use specific message; otherwise use generic format

0 commit comments

Comments
 (0)