diff --git a/test/cases/nvidia/unit_test.go b/test/cases/nvidia/unit_test.go index 27918e878..d703620b1 100644 --- a/test/cases/nvidia/unit_test.go +++ b/test/cases/nvidia/unit_test.go @@ -7,6 +7,7 @@ import ( _ "embed" "fmt" "testing" + "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "sigs.k8s.io/e2e-framework/klient/wait" @@ -66,7 +67,8 @@ func TestSingleNodeUnitTest(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"}, } err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), - wait.WithContext(ctx)) + wait.WithContext(ctx), + wait.WithTimeout(10*time.Minute)) if err != nil { t.Fatal(err) } diff --git a/test/images/nvidia/gpu_unit_tests/tests/common.sh b/test/images/nvidia/gpu_unit_tests/tests/common.sh index a6f5222d7..140e05bda 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/common.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/common.sh @@ -53,3 +53,14 @@ generate_data() eval "$cmd" > $expected _assert_data "$expected" "$cmd" "$msg" } + +# Returns 0 when the instance type matches a vGPU family (g6f.*, gr6f.*), 1 otherwise. +# Uses EC2_INSTANCE_TYPE when set and non-empty, else the output of get_instance_type. +is_vgpu() +{ + local instance_type=${EC2_INSTANCE_TYPE:-$(get_instance_type)} + case "${instance_type}" in + g6f.*|gr6f.*) return 0 ;; + *) return 1 ;; # Not a vGPU instance type + esac +} diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh b/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh index 518e1fc16..9ffc0c0c3 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/test_basic.sh @@ -31,6 +31,11 @@ test_03_nvbandwidth() test_04_dcgm_diagnostics() { + # This test is not applicable for vGPU instance types. 
+ if is_vgpu; then + skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" + fi + # https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests if [[ $EC2_INSTANCE_TYPE == g* ]]; then # The G series instance don't have nvlink and GPU p2p communication diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh index 5dc282b3b..909fe9b90 100644 --- a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh @@ -61,9 +61,28 @@ test_nvidia_gpu_unused() test_nvidia_gpu_throttled() { + # vGPU instances don't support GPU clock throttling detection. + # This test is not applicable for vGPU instance types. + if is_vgpu; then + skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" + fi # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons # The only bit allowed is nvmlClocksEventReasonGpuIdle 0x0000000000000001LL filter="egrep -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'" cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader" assert_status_code 1 "$cmd | $filter" "Throttled gpu detected" } + + +# Passes the 'vGPU Software' section of `nvidia-smi -q` (matching line plus the two after it) +# to assert_data with the per-instance expected file; skipped unless is_vgpu succeeds. +test_nvidia_vgpu_license_status() +{ + if ! 
is_vgpu; then + skip "This test only applies to vGPU instances (g6f.*, gr6f.*)" + fi + + assert_data "$data/nvidia_vgpu_license_status.txt" \ + "nvidia-smi -q | grep 'vGPU Software' -A 2" \ + "vGPU license status validation failed" +} diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/efa_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/efa_count.txt new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/efa_count.txt @@ -0,0 +1 @@ +0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/gpu_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/gpu_count.txt new file mode 100644 index 000000000..222c7920e --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/gpu_count.txt @@ -0,0 +1,2 @@ +name, index, pci.bus_id +NVIDIA L4-6Q, 0, 00000000:31:00.0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/numa_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/numa_topo.txt new file mode 100644 index 000000000..d72d887a6 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/numa_topo.txt @@ -0,0 +1,2 @@ +/sys/devices/system/node/node0/cpulist:0-7 +/sys/devices/system/node/node0/distance:10 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_persistence_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..3c4dba324 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_persistence_status.txt @@ -0,0 +1,2 @@ +name, pci.bus_id, persistence_mode +NVIDIA L4-6Q, 00000000:31:00.0, Enabled diff --git 
a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_smi_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..e0cfd1955 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_smi_topo.txt @@ -0,0 +1,2 @@ + GPU0 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X 0-7 0 N/A diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_vgpu_license_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_vgpu_license_status.txt new file mode 100644 index 000000000..0fca4dfc4 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_vgpu_license_status.txt @@ -0,0 +1,3 @@ + vGPU Software Licensed Product + Product Name : NVIDIA RTX Virtual Workstation + License Status : Licensed (Expiry: N/A) diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt @@ -0,0 +1 @@ +0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt new file mode 100644 index 000000000..a6e53172e --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt @@ -0,0 +1,2 @@ +name, index, pci.bus_id +NVIDIA L4-12Q, 0, 00000000:35:00.0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt new file mode 100644 index 000000000..ed6c897aa --- /dev/null 
+++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt @@ -0,0 +1,2 @@ +/sys/devices/system/node/node0/cpulist:0-15 +/sys/devices/system/node/node0/distance:10 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..b6a99abc3 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt @@ -0,0 +1,2 @@ +name, pci.bus_id, persistence_mode +NVIDIA L4-12Q, 00000000:35:00.0, Enabled diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..39eb6d6f2 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt @@ -0,0 +1,2 @@ + GPU0 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X 0-15 0 N/A diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt new file mode 100644 index 000000000..0fca4dfc4 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt @@ -0,0 +1,3 @@ + vGPU Software Licensed Product + Product Name : NVIDIA RTX Virtual Workstation + License Status : Licensed (Expiry: N/A) diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/efa_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/efa_count.txt new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/efa_count.txt 
@@ -0,0 +1 @@ +0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/gpu_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/gpu_count.txt new file mode 100644 index 000000000..38d0e6962 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/gpu_count.txt @@ -0,0 +1,2 @@ +name, index, pci.bus_id +NVIDIA L4-3Q, 0, 00000000:31:00.0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/numa_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/numa_topo.txt new file mode 100644 index 000000000..25a083974 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/numa_topo.txt @@ -0,0 +1,2 @@ +/sys/devices/system/node/node0/cpulist:0-1 +/sys/devices/system/node/node0/distance:10 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_persistence_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_persistence_status.txt new file mode 100644 index 000000000..3f2c93def --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_persistence_status.txt @@ -0,0 +1,2 @@ +name, pci.bus_id, persistence_mode +NVIDIA L4-3Q, 00000000:31:00.0, Enabled diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_smi_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_smi_topo.txt new file mode 100644 index 000000000..67611d6f5 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_smi_topo.txt @@ -0,0 +1,2 @@ + GPU0 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X 0-1 0 N/A diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_vgpu_license_status.txt 
b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_vgpu_license_status.txt new file mode 100644 index 000000000..0fca4dfc4 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_vgpu_license_status.txt @@ -0,0 +1,3 @@ + vGPU Software Licensed Product + Product Name : NVIDIA RTX Virtual Workstation + License Status : Licensed (Expiry: N/A) diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/efa_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/efa_count.txt new file mode 100644 index 000000000..573541ac9 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/efa_count.txt @@ -0,0 +1 @@ +0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/gpu_count.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/gpu_count.txt new file mode 100644 index 000000000..38d0e6962 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/gpu_count.txt @@ -0,0 +1,2 @@ +name, index, pci.bus_id +NVIDIA L4-3Q, 0, 00000000:31:00.0 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/numa_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/numa_topo.txt new file mode 100644 index 000000000..b969b825b --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/numa_topo.txt @@ -0,0 +1,2 @@ +/sys/devices/system/node/node0/cpulist:0-3 +/sys/devices/system/node/node0/distance:10 diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_persistence_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_persistence_status.txt new file mode 100644 index 000000000..3f2c93def --- /dev/null +++ 
b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_persistence_status.txt @@ -0,0 +1,2 @@ +name, pci.bus_id, persistence_mode +NVIDIA L4-3Q, 00000000:31:00.0, Enabled diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_smi_topo.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_smi_topo.txt new file mode 100644 index 000000000..bd19f8e60 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_smi_topo.txt @@ -0,0 +1,2 @@ + GPU0 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X 0-3 0 N/A diff --git a/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_vgpu_license_status.txt b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_vgpu_license_status.txt new file mode 100644 index 000000000..0fca4dfc4 --- /dev/null +++ b/test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_vgpu_license_status.txt @@ -0,0 +1,3 @@ + vGPU Software Licensed Product + Product Name : NVIDIA RTX Virtual Workstation + License Status : Licensed (Expiry: N/A)