@@ -153,6 +153,83 @@ def _check_privileged(self) -> bool:
153153 """Check if we have privileged access (required for nsenter)"""
154154 return os .geteuid () == 0
155155
def _get_pci_address_from_proc(
    self,
    gpu_id: int,
    *,
    proc_path: str = "/host/proc/driver/nvidia/gpus",
) -> str:
    """
    Get the PCI address for a GPU by reading the NVIDIA driver's procfs tree.

    This method works without nvidia-smi: it lists every entry under
    ``proc_path`` (one directory per GPU, named after its PCI address),
    reads that entry's ``information`` file and compares the integer on the
    ``Device Minor:`` line with ``gpu_id``.

    Directory structure:
        /host/proc/driver/nvidia/gpus/
        ├── 0001:00:00.0/information  (Device Minor: 0 → GPU 0)
        ├── 0002:00:00.0/information  (Device Minor: 1 → GPU 1)
        └── ...

    Entries whose ``information`` file cannot be read, or whose Device Minor
    value is not an integer, are logged as warnings and skipped so that one
    bad entry does not hide the remaining GPUs.

    Args:
        gpu_id: GPU device ID (0, 1, 2, ...), matched against Device Minor.
        proc_path: Directory holding the per-GPU procfs entries. Defaults to
            the host procfs mount; override it to point at a different
            mount location.

    Returns:
        PCI address, i.e. the name of the matching directory
        (e.g., "0001:00:00.0").

    Raises:
        FileNotFoundError: If ``proc_path`` does not exist or cannot be listed.
        ValueError: If no entry reports a Device Minor equal to ``gpu_id``.
    """
    # Check if proc path exists
    if not os.path.exists(proc_path):
        raise FileNotFoundError(
            f"{proc_path} not found. Ensure /host/proc is mounted in pod spec."
        )

    try:
        gpu_dirs = os.listdir(proc_path)
    except OSError as e:
        # PermissionError/NotADirectoryError are OSError subclasses. Callers
        # are documented to catch FileNotFoundError, so keep that type but
        # chain the real cause for debugging.
        raise FileNotFoundError(f"Cannot access {proc_path}: {e}") from e
    logger.debug(
        f"Found {len(gpu_dirs)} GPU directories in {proc_path}: {gpu_dirs}"
    )

    # Collected only to make the "not found" error message actionable.
    available_minors = []
    for pci_addr in gpu_dirs:
        info_file = f"{proc_path}/{pci_addr}/information"

        try:
            with open(info_file, "r") as f:
                for line in f:
                    if not line.startswith("Device Minor:"):
                        continue

                    # Parse: "Device Minor: 0" → 0
                    device_minor = int(line.partition(":")[2].strip())
                    available_minors.append(device_minor)

                    if device_minor == gpu_id:
                        logger.info(
                            f"GPU {gpu_id} mapped to PCI {pci_addr} "
                            f"via /proc (Device Minor: {device_minor})"
                        )
                        return pci_addr

                    # The Device Minor line has been consumed; the rest of
                    # this file cannot change the answer.
                    break
        except OSError as e:
            logger.warning(f"Could not read {info_file}: {e}")
        except ValueError as e:
            logger.warning(f"Could not parse Device Minor from {info_file}: {e}")

    # GPU ID not found
    raise ValueError(
        f"GPU {gpu_id} not found in {proc_path}. "
        f"Available Device Minors: {sorted(available_minors)}"
    )
232+
156233 def _normalize_pci_address (self , pci_addr : str ) -> str :
157234 """
158235 Normalize PCI address from nvidia-smi format to kernel sysfs format.
@@ -247,28 +324,8 @@ def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
247324 message template for each XID type.
248325 """
249326 try :
250- # Get PCI address for the GPU
251- pci_result = subprocess .run (
252- [
253- "nvidia-smi" ,
254- "--query-gpu=pci.bus_id" ,
255- "--format=csv,noheader" ,
256- "-i" ,
257- str (gpu_id ),
258- ],
259- capture_output = True ,
260- text = True ,
261- timeout = 10 ,
262- )
263-
264- if pci_result .returncode != 0 :
265- return (
266- False ,
267- f"Failed to get PCI address for GPU { gpu_id } : { pci_result .stderr } " ,
268- )
269-
270- pci_addr_full = pci_result .stdout .strip ()
271- pci_addr = self ._normalize_pci_address (pci_addr_full )
327+ # Get PCI address using /proc method (works without nvidia-smi)
328+ pci_addr = self ._get_pci_address_from_proc (gpu_id )
272329
273330 # Get appropriate error message for this XID type
274331 # If XID is known, use specific message; otherwise use generic format
0 commit comments