diff --git a/Jenkinsfile b/Jenkinsfile
index 53f34c8a148..eb57489525a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,9 +1,10 @@
#!/usr/bin/groovy
/* groovylint-disable-next-line LineLength */
/* groovylint-disable DuplicateMapLiteral, DuplicateNumberLiteral */
-/* groovylint-disable DuplicateStringLiteral, NestedBlockDepth, VariableName */
+/* groovylint-disable DuplicateStringLiteral, NestedBlockDepth */
+/* groovylint-disable ParameterName, VariableName */
/* Copyright 2019-2024 Intel Corporation
- * Copyright 2025 Hewlett Packard Enterprise Development LP
+ * Copyright 2025-2026 Hewlett Packard Enterprise Development LP
* All rights reserved.
*
* This file is part of the DAOS Project. It is subject to the license terms
@@ -133,12 +134,16 @@ String vm9_label(String distro) {
def_val: params.FUNCTIONAL_VM_LABEL))
}
-void rpm_test_post(String stage_name, String node) {
+void rpm_test_post(String stageName, String node) {
+    // Extract first node from comma-delimited list
+ String firstNode = node.split(',')[0].trim()
sh label: 'Fetch and stage artifacts',
- script: 'hostname; ssh -i ci_key jenkins@' + node + ' ls -ltar /tmp; mkdir -p "' + env.STAGE_NAME + '/" && ' +
- 'scp -i ci_key jenkins@' + node + ':/tmp/{{suite_dmg,daos_{server_helper,{control,agent}}}.log,daos_server.log.*} "' +
- env.STAGE_NAME + '/"'
- archiveArtifacts artifacts: env.STAGE_NAME + '/**'
+ script: 'hostname; ssh -i ci_key jenkins@' + firstNode +
+ ' ls -ltar /tmp; mkdir -p "' + stageName + '/" && ' +
+ 'scp -i ci_key jenkins@' + firstNode +
+ ':/tmp/{{suite_dmg,daos_{server_helper,{control,agent}}}.log,daos_server.log.*} "' +
+ stageName + '/"'
+ archiveArtifacts artifacts: stageName + '/**'
job_status_update()
}
@@ -148,7 +153,7 @@ void rpm_test_post(String stage_name, String node) {
Map update_default_commit_pragmas() {
String default_pragmas_str = sh(script: 'ci/gen_commit_pragmas.py --target origin/' + target_branch,
returnStdout: true).trim()
- println("pragmas from gen_commit_pragmas.py:")
+ println('pragmas from gen_commit_pragmas.py:')
println(default_pragmas_str)
if (default_pragmas_str) {
updatePragmas(default_pragmas_str, false)
@@ -851,7 +856,7 @@ pipeline {
unitTestPost artifacts: ['nlt_logs/'],
testResults: 'nlt-junit.xml',
always_script: 'ci/unit/test_nlt_post.sh',
- referenceJobName: 'daos-stack/daos/release%252F2.6',
+ referenceJobName: 'daos-stack/daos/release%252F2.6',
valgrind_stash: 'el8-gcc-nlt-memcheck'
recordIssues enabledForFailure: true,
failOnError: false,
diff --git a/ci/functional/test_main.sh b/ci/functional/test_main.sh
index adcd0f78be8..aa056248bcc 100755
--- a/ci/functional/test_main.sh
+++ b/ci/functional/test_main.sh
@@ -1,7 +1,7 @@
#!/bin/bash
#
# Copyright 2020-2024 Intel Corporation.
-# Copyright 2025 Hewlett Packard Enterprise Development LP
+# Copyright 2025-2026 Hewlett Packard Enterprise Development LP
#
# SPDX-License-Identifier: BSD-2-Clause-Patent
#
@@ -14,6 +14,13 @@ fi
test_tag="$TEST_TAG"
+: "${NODELIST:=localhost}"
+: "${TEST_RPMS:=false}"
+: "${STAGE_NAME:=unknown}"
+
+def_node_count="$(nodeset -c "$NODELIST")"
+: "${NODE_COUNT:=$def_node_count}"
+
tnodes=$(echo "$NODELIST" | cut -d ',' -f 1-"$NODE_COUNT")
first_node=${NODELIST%%,*}
@@ -42,14 +49,17 @@ cluster_reboot () {
test_cluster() {
# Test that all nodes in the cluster are healthy
clush -B -S -o '-i ci_key' -l root -w "${tnodes}" \
- "OPERATIONS_EMAIL=${OPERATIONS_EMAIL} \
+ "OPERATIONS_EMAIL=${OPERATIONS_EMAIL:-} \
FIRST_NODE=${first_node} \
TEST_RPMS=${TEST_RPMS} \
NODELIST=${tnodes} \
BUILD_URL=\"${BUILD_URL:-Unknown in GHA}\" \
- STAGE_NAME=\"$STAGE_NAME\" \
+ STAGE_NAME=\"${STAGE_NAME}\" \
JENKINS_URL=\"${JENKINS_URL:-}\" \
DAOS_DEVOPS_EMAIL=\"${DAOS_DEVOPS_EMAIL:-}\" \
+ DAOS_INFINIBAND=${DAOS_INFINIBAND:-} \
+ DAOS_NVME=${DAOS_NVME:-} \
+ DAOS_PMEM=${DAOS_PMEM:-} \
$(cat ci/functional/test_main_prep_node.sh)"
}
@@ -65,7 +75,7 @@ if ! test_cluster; then
echo "Hardware test failed again after reboot"
fi
else
- echo "Cluster reboot failed"
+ echo "Cluster reboot failed"
fi
else
hardware_ok=true
@@ -88,6 +98,7 @@ trap 'clush -B -S -o "-i ci_key" -l root -w "${tnodes}" '\
# Setup the Jenkins build artifacts directory before running the tests to ensure
# there is enough disk space to report the results.
+# Even though STAGE_NAME is forced to be set, shellcheck wants this syntax.
rm -rf "${STAGE_NAME:?ERROR: STAGE_NAME is not defined}/"
mkdir "${STAGE_NAME:?ERROR: STAGE_NAME is not defined}/"
@@ -98,24 +109,23 @@ rm -rf install/lib/daos/TESTING/ftest/avocado ./*_results.xml
mkdir -p install/lib/daos/TESTING/ftest/avocado/job-results
if "$hardware_ok"; then
- if $TEST_RPMS; then
+ if "$TEST_RPMS"; then
# shellcheck disable=SC2029
- ssh -i ci_key -l jenkins "${first_node}" \
- "TEST_TAG=\"$test_tag\" \
- TNODES=\"$tnodes\" \
- FTEST_ARG=\"${FTEST_ARG:-}\" \
- WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \
- STAGE_NAME=\"$STAGE_NAME\" \
- DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
+ ssh -i ci_key -l jenkins "${first_node}" \
+ "TEST_TAG=\"$test_tag\" \
+ TNODES=\"$tnodes\" \
+ FTEST_ARG=\"${FTEST_ARG:-}\" \
+ WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \
+ STAGE_NAME=\"${STAGE_NAME}\" \
+ DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
$(cat ci/functional/test_main_node.sh)"
else
- ./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG"
+ ./ftest.sh "$test_tag" "$tnodes" "${FTEST_ARG:-}"
fi
fi
# Now rename the previously collected hardware test data for Jenkins
# to use them for Junit processing.
-: "${STAGE_NAME:=}"
mkdir -p "${STAGE_NAME}/hardware_prep/"
for node in ${tnodes//,/ }; do
old_name="./hardware_prep_node_results.xml.$node"
diff --git a/ci/functional/test_main_prep_node.sh b/ci/functional/test_main_prep_node.sh
index d096737d24d..ab75841a54b 100755
--- a/ci/functional/test_main_prep_node.sh
+++ b/ci/functional/test_main_prep_node.sh
@@ -1,7 +1,7 @@
#!/bin/bash
#
# Copyright 2020-2023 Intel Corporation.
-# Copyright 2025 Hewlett Packard Enterprise Development LP
+# Copyright 2025-2026 Hewlett Packard Enterprise Development LP
#
# SPDX-License-Identifier: BSD-2-Clause-Patent
#
@@ -16,9 +16,16 @@ domain1="${JENKINS_URL#https://}"
mail_domain="${domain1%%/*}"
: "${EMAIL_DOMAIN:=$mail_domain}"
: "${DAOS_DEVOPS_EMAIL:="$HOSTNAME"@"$EMAIL_DOMAIN"}"
+: "${DAOS_INFINIBAND:=}"
+: "${DAOS_PMEM:=0}"
+: "${DAOS_NVME:=0}"
+
+# cn is the stage name with non-alphanumeric characters collapsed to underscores.
+cn=$(echo "$STAGE_NAME" | sed 's/[^a-zA-Z0-9_]/_/g' | sed 's/__*/_/g')
result=0
mail_message=''
+mail_type='warning'
nl="
"
@@ -48,7 +55,7 @@ function do_mail {
fi
# shellcheck disable=SC2059
build_info="BUILD_URL = $BUILD_URL$nl STAGE = $STAGE_NAME$nl$nl"
- mail -s "Hardware check failed after reboot!" \
+ mail -s "Hardware check $mail_type after reboot!" \
-r "$DAOS_DEVOPS_EMAIL" "$OPERATIONS_EMAIL" \
<<< "$build_info$mail_message"
set -x
@@ -58,7 +65,7 @@ if ! command -v lspci; then
if command -v dnf; then
dnf -y install pciutils
else
- echo "pciutils not installed, can not test for Infiniband devices"
+ echo "pciutils not installed, can not test for hardware devices"
fi
fi
@@ -106,17 +113,27 @@ The Omni-Path adapters will not be used."
mail_message+="${nl}${ib_message}${nl}"
echo "$ib_message"
fi
+if [ -z "$DAOS_INFINIBAND" ]; then
+ DAOS_INFINIBAND=$ib_count
+fi
set -x
# Wait for at least the expected IB devices to show up.
-# in the case of dual port HBAs, not all IB devices will
-# show up.
+# in the case of dual port HBAs, only the ports that are connected may show up.
# For some unknown reason, sometimes IB devices will not show up
# except in the lspci output unless an ip link set up command for
# at least one device that should be present shows up.
good_ibs=()
function do_wait_for_ib {
- local ib_devs=("$@")
+ # The problem is that we do not know the actual device names
+ # ahead of time. So we try to bring up all possible devices
+ # and see if at least the expected number show up with IP
+ # addresses.
+ local ib_devs=("ib0" "ib1" "ib2" "ib3" "ib4")
+ # Udev rule convention, first digit is the numa node
+ # second digit should be an index of the HBA on that numa node.
+ ib_devs+=("ib_00" "ib_01" "ib_02" "ib_03")
+ ib_devs+=("ib_10" "ib_11" "ib_12" "ib_13")
local working_ib
ib_timeout=300 # 5 minutes
retry_wait=10 # seconds
@@ -147,15 +164,14 @@ function do_wait_for_ib {
return 1
}
-# Migrating to using udev rules for network devices
-if [ -e /etc/udev/rules.d/70-persistent-ipoib.rules ]; then
- ib_list=('ib_cpu0_0' 'ib_cpu1_0')
-else
- ib_list=('ib0')
- if [ "$ib_count" -gt 1 ]; then
- ib_list+=('ib1')
+# Get list of actual InfiniBand devices from /sys/class/net/
+ib_list=()
+for iface in /sys/class/net/ib*; do
+ if [ -e "$iface" ]; then
+ iface_name=$(basename "$iface")
+ ib_list+=("$iface_name")
fi
-fi
+done
function check_ib_devices {
local ib_devs=("$@")
@@ -165,11 +181,10 @@ function check_ib_devices {
set +x
if ! ip addr show "$iface" | grep "inet "; then
ib_message="$({
- echo "Found interface $iface down after reboot on $HOSTNAME."
+ echo "Found interface $iface with no ip address after reboot on $HOSTNAME."
ip addr show "$iface" || true
cat /sys/class/net/"$iface"/mode || true
ip link set up "$iface" || true
- cat /etc/sysconfig/network-scripts/ifcfg-"$iface" || true
} 2>&1)"
mail_message+="${nl}${ib_message}${nl}"
echo "$ib_message"
@@ -190,11 +205,10 @@ function check_ib_devices {
done
}
-
# First check for InfiniBand devices
if [ "$ib_count" -gt 0 ]; then
- if do_wait_for_ib "${ib_list[@]}"; then
- echo "Found at least $ib_count working devices in" "${ib_list[@]}"
+ if do_wait_for_ib; then
+ echo "Found at least $ib_count working devices on $HOSTNAME"
# All good, generate Junit report
check_ib_devices "${good_ibs[@]}"
else
@@ -205,101 +219,111 @@ fi
# having -x just makes the console log harder to read.
# set +x
-if [ "$ib_count" -ge 2 ]; then
- # now check for pmem & NVMe drives when multiple ib are present.
- # ipmctl show -dimm should show an even number of drives, all healthy
- dimm_count=$(ipmctl show -dimm | grep Healthy -c)
- if [ "$dimm_count" -eq 0 ] || [ $((dimm_count%2)) -ne 0 ]; then
- # May not be fatal, the PMEM DIMM should be replaced when downtime can be
- # scheduled for this system.
- dimm_message="FAIL: Wrong number $dimm_count healthy PMEM DIMMs seen."
- mail_message+="$nl$dimm_message$nl$(ipmctl show -dimm)$nl"
- else
- echo "OK: Found $dimm_count PMEM DIMMs."
- fi
- # Should have 2 regions 0x0000 and 0x0001, type AppDirect
- dimm_rcount=0
- while IFS= read -r line; do
- if [[ "$line" != *"| AppDirect"*"| Healthy"* ]]; then continue; fi
- ((dimm_rcount++)) || true
- done < <(ipmctl show -region)
+if [ "$ib_count" -ge 2 ] ; then
+ if [ "$DAOS_PMEM" -gt 0 ]; then
+ # now check for pmem & NVMe drives when multiple ib are present.
+ # ipmctl show -dimm should show an even number of drives, all healthy
+ dimm_count=$(ipmctl show -dimm | grep Healthy -c)
+ if [ "$dimm_count" -eq 0 ] || [ $((dimm_count%2)) -ne 0 ]; then
+ # May not be fatal, the PMEM DIMM should be replaced when downtime
+            # can be scheduled for this system.
+ dimm_message="FAIL: Wrong number $dimm_count healthy PMEM DIMMs seen"
+ dimm_message+=" on $HOSTNAME."
+
+ mail_message+="$nl$dimm_message$nl$(ipmctl show -dimm)$nl"
+ else
+ echo "OK: Found $dimm_count PMEM DIMMs."
+ fi
+ # Should have 2 regions 0x0000 and 0x0001, type AppDirect
+ dimm_rcount=0
+ while IFS= read -r line; do
+ if [[ "$line" != *"| AppDirect"*"| Healthy"* ]]; then continue; fi
+ ((dimm_rcount++)) || true
+ done < <(ipmctl show -region)
- ((testruns++)) || true
- testcases+=" ${nl}"
- if [ "$dimm_rcount" -ne 2 ]; then
- nvme_message="FAIL: Found $dimm_rcount of DIMM PMEM regions, need 2."
- nvme_message+="$nl$(ipmctl show -region)"
- mail_message+="$nl$nvme_message$nl"
- ((testfails++)) || true
- testcases+="
-
+ ((testruns++)) || true
+ testcases+=" ${nl}"
+ if [ "$dimm_rcount" -ne 2 ]; then
+ pmem_message="FAIL: Found $dimm_rcount of DIMM PMEM regions, need 2"
+ pmem_message+=" on $HOSTNAME."
+ pmem_message+="$nl$(ipmctl show -region)"
+ mail_message+="$nl$pmem_message$nl"
+ ((testfails++)) || true
+ testcases+="
+
$nl"
result=3
- else
- echo "OK: Found $dimm_rcount DIMM PMEM regions."
- fi
- testcases+=" $nl"
-
- # While this gets more data than needed, it is the same search that
- # DAOS tests do and records it in the console log.
- nvme_devices="$(lspci -vmm -D | grep -E '^(Slot|Class|Device|NUMANode):' |
- grep -E 'Class:\s+Non-Volatile memory controller' -B 1 -A 2)"
- nvme_count=0
- while IFS= read -r line; do
- if [[ "$line" != *"Class:"*"Non-Volatile memory controller"* ]];then
- continue
+ else
+ echo "OK: Found $dimm_rcount DIMM PMEM regions."
fi
- ((nvme_count++)) || true
- done < <(printf %s "$nvme_devices")
+ testcases+=" $nl"
+ fi
+ if [ "$DAOS_NVME" -gt 0 ]; then
+ # While this gets more data than needed, it is the same search that
+ # DAOS tests do and records it in the console log.
+ nvme_devices="$(lspci -vmm -D | grep -E '^(Slot|Class|Device|NUMANode):' |
+ grep -E 'Class:\s+Non-Volatile memory controller' -B 1 -A 2)"
+ nvme_count=0
+ while IFS= read -r line; do
+ if [[ "$line" != *"Class:"*"Non-Volatile memory controller"* ]];then
+ continue
+ fi
+ ((nvme_count++)) || true
+ done < <(printf %s "$nvme_devices")
- ((testruns++)) || true
- testcases+=" ${nl}"
- if [ $((nvme_count%2)) -ne 0 ]; then
- nvme_message="Fail: Odd number ($nvme_count) of NVMe devices seen."
- mail_message+="$nl$nvme_message$nl$nvme_devices$nl"
- ((testfails++)) || true
- testcases+="
+ ((testruns++)) || true
+ testcases+=" ${nl}"
+ if [ $((nvme_count%2)) -ne 0 ]; then
+ nvme_message="Fail: Odd number ($nvme_count) of NVMe devices seen."
+ mail_message+="$nl$nvme_message$nl$nvme_devices$nl"
+ ((testfails++)) || true
+ testcases+="
$nl"
- result=4
- else
- echo "OK: Even number ($nvme_count) of NVMe devices seen."
+ result=4
+ else
+ echo "OK: Even number ($nvme_count) of NVMe devices seen."
+ fi
+ testcases+=" $nl"
fi
- testcases+=" $nl"
-
# All storage found by lspci should also be in lsblk report
lsblk_nvme=$(lsblk | grep nvme -c)
lsblk_pmem=$(lsblk | grep pmem -c)
- ((testruns++)) || true
- testcases+=" ${nl}"
- if [ "$lsblk_nvme" -ne "$nvme_count" ]; then
- lsblk_nvme_msg="Fail: Only $lsblk_nvme of $nvme_count NVMe devices seen."
- mail_message+="$nl$lsblk_nvme_msg$nl$(lsblk)$nl"
- ((testfails++)) || true
- testcases+="
+ if [ "$DAOS_NVME" -gt 0 ]; then
+ ((testruns++)) || true
+ testcases+=" ${nl}"
+ if [ "$lsblk_nvme" -ne "$nvme_count" ]; then
+ lsblk_nvme_msg="Fail: Only $lsblk_nvme of $nvme_count NVMe devices seen"
+ lsblk_nvme_msg+=" on $HOSTNAME."
+ mail_message+="$nl$lsblk_nvme_msg$nl$(lsblk)$nl"
+ ((testfails++)) || true
+ testcases+="
$nl"
- result=5
- else
- echo "OK: All $nvme_count NVMe devices are in lsblk report."
+ result=5
+ else
+ echo "OK: All $nvme_count NVMe devices are in lsblk report."
+ fi
+ testcases+=" $nl"
fi
- testcases+=" $nl"
-
- ((testruns++)) || true
- testcases+=" ${nl}"
- if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then
- lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen."
- mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl"
- ((testfails++)) || true
- testcases+="
+ if [ "$DAOS_PMEM" -gt 0 ]; then
+ ((testruns++)) || true
+ testcases+=" ${nl}"
+ if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then
+ lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen"
+ lsblk_pmem_msg+=" on $HOSTNAME."
+ mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl"
+ ((testfails++)) || true
+ testcases+="
$nl"
- result=6
- else
- echo "OK: All $dimm_rcount PMEM devices are in lsblk report."
+ result=6
+ else
+ echo "OK: All $dimm_rcount PMEM devices are in lsblk report."
+ fi
+ testcases+=" $nl"
fi
- testcases+=" $nl"
fi
# Additional information if any check failed
@@ -320,21 +344,23 @@ if [ -n "$FIRST_NODE" ] && ! grep /mnt/share /proc/mounts; then
mount "$FIRST_NODE":/export/share /mnt/share
fi
-# Defaulting the package to "(root)" for now as then Jenkins
-# will default to setting putting the outer stage name and
-# inner stage name in the full test name.
-ts="Hardware"
+# The package name defaults to "(root)" unless there is a dot in the
+# testsuite name, in which case the package name is the part before
+# the last dot in the testsuite name.
+pn="Hardware"
tf="failures=\"$testfails\""
te="errors=\"0\""
tc="tests=\"$testruns\""
-# shellcheck disable=SC2089
-junit_xml="$nl
+junit_xml="$nl
$testcases$nl"
# Each junit file needs the same name for when they are collected.
echo "$junit_xml" > "./hardware_prep_node_results.xml"
+if [ "$testfails" -gt 0 ]; then
+ mail_type='failed'
+fi
do_mail
if [ "$result" -ne 0 ]; then
diff --git a/ci/storage/test_main_storage_prepare_node.sh b/ci/storage/test_main_storage_prepare_node.sh
index f87333327b8..489baa21006 100755
--- a/ci/storage/test_main_storage_prepare_node.sh
+++ b/ci/storage/test_main_storage_prepare_node.sh
@@ -1,7 +1,7 @@
#!/bin/bash
#
# Copyright 2021-2023 Intel Corporation.
-# Copyright 2025 Hewlett Packard Enterprise Development LP
+# Copyright 2025-2026 Hewlett Packard Enterprise Development LP
#
# SPDX-License-Identifier: BSD-2-Clause-Patent
#
@@ -44,6 +44,22 @@ if command -v ibv_devinfo; then ibv_devinfo || true; fi
lspci | grep -i "Non-Volatile memory controller" || true
+ib_count=0
+for ib_path in /sys/class/net/ib*; do
+ if [ ! -e "$ib_path" ]; then
+ continue
+ fi
+ ((ib_count++)) || true
+ ip addr show "$(basename "$ib_path")"
+done
+
+# Skip test controller
+if [ "$ib_count" -le 1 ]; then
+ echo "Less than 2 Infiniband devices found ($ib_count)."
+ echo "Assuming this is a test controller node. Skipping PMEM setup."
+ exit
+fi
+
if ipmctl show -dimm; then
ipmctl show -goal
ipmctl show -region
@@ -60,12 +76,7 @@ if ipmctl show -dimm; then
fi
fi
else
- counter=0
- for ib in /sys/class/net/ib*; do
- ((counter++)) || true
- ip addr show "$ib"
- done
- if "$counter" -ge 2; then
+ if [ "$ib_count" -ge 2 ]; then
# All of our CI nodes with two ib adapters should have PMEM DIMMs
echo 'No PMEM DIMM devices found on CI node!'
exit 1