diff --git a/Jenkinsfile b/Jenkinsfile index 53f34c8a148..eb57489525a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,9 +1,10 @@ #!/usr/bin/groovy /* groovylint-disable-next-line LineLength */ /* groovylint-disable DuplicateMapLiteral, DuplicateNumberLiteral */ -/* groovylint-disable DuplicateStringLiteral, NestedBlockDepth, VariableName */ +/* groovylint-disable DuplicateStringLiteral, NestedBlockDepth */ +/* groovylint-disable ParameterName, VariableName */ /* Copyright 2019-2024 Intel Corporation - * Copyright 2025 Hewlett Packard Enterprise Development LP + * Copyright 2025-2026 Hewlett Packard Enterprise Development LP * All rights reserved. * * This file is part of the DAOS Project. It is subject to the license terms @@ -133,12 +134,16 @@ String vm9_label(String distro) { def_val: params.FUNCTIONAL_VM_LABEL)) } -void rpm_test_post(String stage_name, String node) { +void rpm_test_post(String stageName, String node) { + // Extract first node from comma-delimited list + String firstNode = node.split(',')[0].trim() sh label: 'Fetch and stage artifacts', - script: 'hostname; ssh -i ci_key jenkins@' + node + ' ls -ltar /tmp; mkdir -p "' + env.STAGE_NAME + '/" && ' + - 'scp -i ci_key jenkins@' + node + ':/tmp/{{suite_dmg,daos_{server_helper,{control,agent}}}.log,daos_server.log.*} "' + - env.STAGE_NAME + '/"' - archiveArtifacts artifacts: env.STAGE_NAME + '/**' + script: 'hostname; ssh -i ci_key jenkins@' + firstNode + + ' ls -ltar /tmp; mkdir -p "' + stageName + '/" && ' + + 'scp -i ci_key jenkins@' + firstNode + + ':/tmp/{{suite_dmg,daos_{server_helper,{control,agent}}}.log,daos_server.log.*} "' + + stageName + '/"' + archiveArtifacts artifacts: stageName + '/**' job_status_update() } @@ -148,7 +153,7 @@ void rpm_test_post(String stage_name, String node) { Map update_default_commit_pragmas() { String default_pragmas_str = sh(script: 'ci/gen_commit_pragmas.py --target origin/' + target_branch, returnStdout: true).trim() - println("pragmas from 
gen_commit_pragmas.py:") + println('pragmas from gen_commit_pragmas.py:') println(default_pragmas_str) if (default_pragmas_str) { updatePragmas(default_pragmas_str, false) @@ -851,7 +856,7 @@ pipeline { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', always_script: 'ci/unit/test_nlt_post.sh', - referenceJobName: 'daos-stack/daos/release%252F2.6', + referenceJobName: 'daos-stack/daos/release%252F2.6', valgrind_stash: 'el8-gcc-nlt-memcheck' recordIssues enabledForFailure: true, failOnError: false, diff --git a/ci/functional/test_main.sh b/ci/functional/test_main.sh index adcd0f78be8..aa056248bcc 100755 --- a/ci/functional/test_main.sh +++ b/ci/functional/test_main.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2024 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -14,6 +14,13 @@ fi test_tag="$TEST_TAG" +: "${NODELIST:=localhost}" +: "${TEST_RPMS:=false}" +: "${STAGE_NAME:=unknown}" + +def_node_count="$(nodeset -c "$NODELIST")" +: "${NODE_COUNT:=$def_node_count}" + tnodes=$(echo "$NODELIST" | cut -d ',' -f 1-"$NODE_COUNT") first_node=${NODELIST%%,*} @@ -42,14 +49,17 @@ cluster_reboot () { test_cluster() { # Test that all nodes in the cluster are healthy clush -B -S -o '-i ci_key' -l root -w "${tnodes}" \ - "OPERATIONS_EMAIL=${OPERATIONS_EMAIL} \ + "OPERATIONS_EMAIL=${OPERATIONS_EMAIL:-} \ FIRST_NODE=${first_node} \ TEST_RPMS=${TEST_RPMS} \ NODELIST=${tnodes} \ BUILD_URL=\"${BUILD_URL:-Unknown in GHA}\" \ - STAGE_NAME=\"$STAGE_NAME\" \ + STAGE_NAME=\"${STAGE_NAME}\" \ JENKINS_URL=\"${JENKINS_URL:-}\" \ DAOS_DEVOPS_EMAIL=\"${DAOS_DEVOPS_EMAIL:-}\" \ + DAOS_INFINIBAND=${DAOS_INFINIBAND:-} \ + DAOS_NVME=${DAOS_NVME:-} \ + DAOS_PMEM=${DAOS_PMEM:-} \ $(cat ci/functional/test_main_prep_node.sh)" } @@ -65,7 +75,7 @@ if ! 
test_cluster; then echo "Hardware test failed again after reboot" fi else - echo "Cluster reboot failed" + echo "Cluster reboot failed" fi else hardware_ok=true @@ -88,6 +98,7 @@ trap 'clush -B -S -o "-i ci_key" -l root -w "${tnodes}" '\ # Setup the Jenkins build artifacts directory before running the tests to ensure # there is enough disk space to report the results. +# Even though STAGE_NAME is forced to be set, shellcheck wants this syntax. rm -rf "${STAGE_NAME:?ERROR: STAGE_NAME is not defined}/" mkdir "${STAGE_NAME:?ERROR: STAGE_NAME is not defined}/" @@ -98,24 +109,23 @@ rm -rf install/lib/daos/TESTING/ftest/avocado ./*_results.xml mkdir -p install/lib/daos/TESTING/ftest/avocado/job-results if "$hardware_ok"; then - if $TEST_RPMS; then + if "$TEST_RPMS"; then # shellcheck disable=SC2029 - ssh -i ci_key -l jenkins "${first_node}" \ - "TEST_TAG=\"$test_tag\" \ - TNODES=\"$tnodes\" \ - FTEST_ARG=\"${FTEST_ARG:-}\" \ - WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \ - STAGE_NAME=\"$STAGE_NAME\" \ - DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ + ssh -i ci_key -l jenkins "${first_node}" \ + "TEST_TAG=\"$test_tag\" \ + TNODES=\"$tnodes\" \ + FTEST_ARG=\"${FTEST_ARG:-}\" \ + WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \ + STAGE_NAME=\"${STAGE_NAME}\" \ + DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ $(cat ci/functional/test_main_node.sh)" else - ./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG" + ./ftest.sh "$test_tag" "$tnodes" "${FTEST_ARG:-}" fi fi # Now rename the previously collected hardware test data for Jenkins # to use them for Junit processing. -: "${STAGE_NAME:=}" mkdir -p "${STAGE_NAME}/hardware_prep/" for node in ${tnodes//,/ }; do old_name="./hardware_prep_node_results.xml.$node" diff --git a/ci/functional/test_main_prep_node.sh b/ci/functional/test_main_prep_node.sh index d096737d24d..ab75841a54b 100755 --- a/ci/functional/test_main_prep_node.sh +++ b/ci/functional/test_main_prep_node.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2020-2023 Intel Corporation. 
-# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -16,9 +16,16 @@ domain1="${JENKINS_URL#https://}" mail_domain="${domain1%%/*}" : "${EMAIL_DOMAIN:=$mail_domain}" : "${DAOS_DEVOPS_EMAIL:="$HOSTNAME"@"$EMAIL_DOMAIN"}" +: "${DAOS_INFINIBAND:=}" +: "${DAOS_PMEM:=0}" +: "${DAOS_NVME:=0}" + +#cn is for a cleaned up stage name. +cn=$(echo "$STAGE_NAME" | sed 's/[^a-zA-Z0-9_]/_/g' | sed 's/__*/_/g') result=0 mail_message='' +mail_type='warning' nl=" " @@ -48,7 +55,7 @@ function do_mail { fi # shellcheck disable=SC2059 build_info="BUILD_URL = $BUILD_URL$nl STAGE = $STAGE_NAME$nl$nl" - mail -s "Hardware check failed after reboot!" \ + mail -s "Hardware check $mail_type after reboot!" \ -r "$DAOS_DEVOPS_EMAIL" "$OPERATIONS_EMAIL" \ <<< "$build_info$mail_message" set -x @@ -58,7 +65,7 @@ if ! command -v lspci; then if command -v dnf; then dnf -y install pciutils else - echo "pciutils not installed, can not test for Infiniband devices" + echo "pciutils not installed, can not test for hardware devices" fi fi @@ -106,17 +113,27 @@ The Omni-Path adapters will not be used." mail_message+="${nl}${ib_message}${nl}" echo "$ib_message" fi +if [ -z "$DAOS_INFINIBAND" ]; then + DAOS_INFINIBAND=$ib_count +fi set -x # Wait for at least the expected IB devices to show up. -# in the case of dual port HBAs, not all IB devices will -# show up. +# in the case of dual port HBAs, only the ports that are connected may show up. # For some unknown reason, sometimes IB devices will not show up # except in the lspci output unless an ip link set up command for # at least one device that should be present shows up. good_ibs=() function do_wait_for_ib { - local ib_devs=("$@") + # The problem is that we do not know the actual device names + # ahead of time. So we try to bring up all possible devices + # and see if at least the expected number show up with IP + # addresses. 
+ local ib_devs=("ib0" "ib1" "ib2" "ib3" "ib4") + # Udev rule convention, first digit is the numa node + # second digit should be an index of the HBA on that numa node. + ib_devs+=("ib_00" "ib_01" "ib_02" "ib_03") + ib_devs+=("ib_10" "ib_11" "ib_12" "ib_13") local working_ib ib_timeout=300 # 5 minutes retry_wait=10 # seconds @@ -147,15 +164,14 @@ function do_wait_for_ib { return 1 } -# Migrating to using udev rules for network devices -if [ -e /etc/udev/rules.d/70-persistent-ipoib.rules ]; then - ib_list=('ib_cpu0_0' 'ib_cpu1_0') -else - ib_list=('ib0') - if [ "$ib_count" -gt 1 ]; then - ib_list+=('ib1') +# Get list of actual InfiniBand devices from /sys/class/net/ +ib_list=() +for iface in /sys/class/net/ib*; do + if [ -e "$iface" ]; then + iface_name=$(basename "$iface") + ib_list+=("$iface_name") fi -fi +done function check_ib_devices { local ib_devs=("$@") @@ -165,11 +181,10 @@ function check_ib_devices { set +x if ! ip addr show "$iface" | grep "inet "; then ib_message="$({ - echo "Found interface $iface down after reboot on $HOSTNAME." + echo "Found interface $iface with no ip address after reboot on $HOSTNAME." ip addr show "$iface" || true cat /sys/class/net/"$iface"/mode || true ip link set up "$iface" || true - cat /etc/sysconfig/network-scripts/ifcfg-"$iface" || true } 2>&1)" mail_message+="${nl}${ib_message}${nl}" echo "$ib_message" @@ -190,11 +205,10 @@ function check_ib_devices { done } - # First check for InfiniBand devices if [ "$ib_count" -gt 0 ]; then - if do_wait_for_ib "${ib_list[@]}"; then - echo "Found at least $ib_count working devices in" "${ib_list[@]}" + if do_wait_for_ib; then + echo "Found at least $ib_count working devices on $HOSTNAME" # All good, generate Junit report check_ib_devices "${good_ibs[@]}" else @@ -205,101 +219,111 @@ fi # having -x just makes the console log harder to read. # set +x -if [ "$ib_count" -ge 2 ]; then - # now check for pmem & NVMe drives when multiple ib are present. 
- # ipmctl show -dimm should show an even number of drives, all healthy - dimm_count=$(ipmctl show -dimm | grep Healthy -c) - if [ "$dimm_count" -eq 0 ] || [ $((dimm_count%2)) -ne 0 ]; then - # May not be fatal, the PMEM DIMM should be replaced when downtime can be - # scheduled for this system. - dimm_message="FAIL: Wrong number $dimm_count healthy PMEM DIMMs seen." - mail_message+="$nl$dimm_message$nl$(ipmctl show -dimm)$nl" - else - echo "OK: Found $dimm_count PMEM DIMMs." - fi - # Should have 2 regions 0x0000 and 0x0001, type AppDirect - dimm_rcount=0 - while IFS= read -r line; do - if [[ "$line" != *"| AppDirect"*"| Healthy"* ]]; then continue; fi - ((dimm_rcount++)) || true - done < <(ipmctl show -region) +if [ "$ib_count" -ge 2 ] ; then + if [ "$DAOS_PMEM" -gt 0 ]; then + # now check for pmem & NVMe drives when multiple ib are present. + # ipmctl show -dimm should show an even number of drives, all healthy + dimm_count=$(ipmctl show -dimm | grep Healthy -c) + if [ "$dimm_count" -eq 0 ] || [ $((dimm_count%2)) -ne 0 ]; then + # May not be fatal, the PMEM DIMM should be replaced when downtime + # can be scheduled for this system. + dimm_message="FAIL: Wrong number $dimm_count healthy PMEM DIMMs seen" + dimm_message+=" on $HOSTNAME." + + mail_message+="$nl$dimm_message$nl$(ipmctl show -dimm)$nl" + else + echo "OK: Found $dimm_count PMEM DIMMs." + fi + # Should have 2 regions 0x0000 and 0x0001, type AppDirect + dimm_rcount=0 + while IFS= read -r line; do + if [[ "$line" != *"| AppDirect"*"| Healthy"* ]]; then continue; fi + ((dimm_rcount++)) || true + done < <(ipmctl show -region) - ((testruns++)) || true - testcases+=" ${nl}" - if [ "$dimm_rcount" -ne 2 ]; then - nvme_message="FAIL: Found $dimm_rcount of DIMM PMEM regions, need 2." 
- nvme_message+="$nl$(ipmctl show -region)" - mail_message+="$nl$nvme_message$nl" - ((testfails++)) || true - testcases+=" - + ((testruns++)) || true + testcases+=" ${nl}" + if [ "$dimm_rcount" -ne 2 ]; then + pmem_message="FAIL: Found $dimm_rcount of DIMM PMEM regions, need 2" + pmem_message+=" on $HOSTNAME." + pmem_message+="$nl$(ipmctl show -region)" + mail_message+="$nl$pmem_message$nl" + ((testfails++)) || true + testcases+=" + $nl" result=3 - else - echo "OK: Found $dimm_rcount DIMM PMEM regions." - fi - testcases+=" $nl" - - # While this gets more data than needed, it is the same search that - # DAOS tests do and records it in the console log. - nvme_devices="$(lspci -vmm -D | grep -E '^(Slot|Class|Device|NUMANode):' | - grep -E 'Class:\s+Non-Volatile memory controller' -B 1 -A 2)" - nvme_count=0 - while IFS= read -r line; do - if [[ "$line" != *"Class:"*"Non-Volatile memory controller"* ]];then - continue + else + echo "OK: Found $dimm_rcount DIMM PMEM regions." fi - ((nvme_count++)) || true - done < <(printf %s "$nvme_devices") + testcases+=" $nl" + fi + if [ "$DAOS_NVME" -gt 0 ]; then + # While this gets more data than needed, it is the same search that + # DAOS tests do and records it in the console log. + nvme_devices="$(lspci -vmm -D | grep -E '^(Slot|Class|Device|NUMANode):' | + grep -E 'Class:\s+Non-Volatile memory controller' -B 1 -A 2)" + nvme_count=0 + while IFS= read -r line; do + if [[ "$line" != *"Class:"*"Non-Volatile memory controller"* ]];then + continue + fi + ((nvme_count++)) || true + done < <(printf %s "$nvme_devices") - ((testruns++)) || true - testcases+=" ${nl}" - if [ $((nvme_count%2)) -ne 0 ]; then - nvme_message="Fail: Odd number ($nvme_count) of NVMe devices seen." - mail_message+="$nl$nvme_message$nl$nvme_devices$nl" - ((testfails++)) || true - testcases+=" + ((testruns++)) || true + testcases+=" ${nl}" + if [ $((nvme_count%2)) -ne 0 ]; then + nvme_message="Fail: Odd number ($nvme_count) of NVMe devices seen." 
+ mail_message+="$nl$nvme_message$nl$nvme_devices$nl" + ((testfails++)) || true + testcases+=" $nl" - result=4 - else - echo "OK: Even number ($nvme_count) of NVMe devices seen." + result=4 + else + echo "OK: Even number ($nvme_count) of NVMe devices seen." + fi + testcases+=" $nl" fi - testcases+=" $nl" - # All storage found by lspci should also be in lsblk report lsblk_nvme=$(lsblk | grep nvme -c) lsblk_pmem=$(lsblk | grep pmem -c) - ((testruns++)) || true - testcases+=" ${nl}" - if [ "$lsblk_nvme" -ne "$nvme_count" ]; then - lsblk_nvme_msg="Fail: Only $lsblk_nvme of $nvme_count NVMe devices seen." - mail_message+="$nl$lsblk_nvme_msg$nl$(lsblk)$nl" - ((testfails++)) || true - testcases+=" + if [ "$DAOS_NVME" -gt 0 ]; then + ((testruns++)) || true + testcases+=" ${nl}" + if [ "$lsblk_nvme" -ne "$nvme_count" ]; then + lsblk_nvme_msg="Fail: Only $lsblk_nvme of $nvme_count NVMe devices seen" + lsblk_nvme_msg+=" on $HOSTNAME." + mail_message+="$nl$lsblk_nvme_msg$nl$(lsblk)$nl" + ((testfails++)) || true + testcases+=" $nl" - result=5 - else - echo "OK: All $nvme_count NVMe devices are in lsblk report." + result=5 + else + echo "OK: All $nvme_count NVMe devices are in lsblk report." + fi + testcases+=" $nl" fi - testcases+=" $nl" - - ((testruns++)) || true - testcases+=" ${nl}" - if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then - lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen." - mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl" - ((testfails++)) || true - testcases+=" + if [ "$DAOS_PMEM" -gt 0 ]; then + ((testruns++)) || true + testcases+=" ${nl}" + if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then + lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen" + lsblk_pmem_msg+=" on $HOSTNAME." + mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl" + ((testfails++)) || true + testcases+=" $nl" - result=6 - else - echo "OK: All $dimm_rcount PMEM devices are in lsblk report." + result=6 + else + echo "OK: All $dimm_rcount PMEM devices are in lsblk report." 
+ fi + testcases+=" $nl" fi - testcases+=" $nl" fi # Additional information if any check failed @@ -320,21 +344,23 @@ if [ -n "$FIRST_NODE" ] && ! grep /mnt/share /proc/mounts; then mount "$FIRST_NODE":/export/share /mnt/share fi -# Defaulting the package to "(root)" for now as then Jenkins -# will default to setting putting the outer stage name and -# inner stage name in the full test name. -ts="Hardware" +# The package name defaults to "(root)" unless there is a dot in the +# testsuite name, in which case the package name is the part before +# the last dot in the testsuite name. +pn="Hardware" tf="failures=\"$testfails\"" te="errors=\"0\"" tc="tests=\"$testruns\"" -# shellcheck disable=SC2089 -junit_xml="$nl +junit_xml="$nl $testcases$nl" # Each junit file needs the same name for when they are collected. echo "$junit_xml" > "./hardware_prep_node_results.xml" +if [ "$testfails" -gt 0 ]; then + mail_type='failed' +fi do_mail if [ "$result" -ne 0 ]; then diff --git a/ci/storage/test_main_storage_prepare_node.sh b/ci/storage/test_main_storage_prepare_node.sh index f87333327b8..489baa21006 100755 --- a/ci/storage/test_main_storage_prepare_node.sh +++ b/ci/storage/test_main_storage_prepare_node.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Copyright 2021-2023 Intel Corporation. -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # # SPDX-License-Identifier: BSD-2-Clause-Patent # @@ -44,6 +44,22 @@ if command -v ibv_devinfo; then ibv_devinfo || true; fi lspci | grep -i "Non-Volatile memory controller" || true +ib_count=0 +for ib_path in /sys/class/net/ib*; do + if [ ! -e "$ib_path" ]; then + continue + fi + ((ib_count++)) || true + ip addr show "$(basename "$ib_path")" +done + +# Skip test controller +if [ "$ib_count" -le 1 ]; then + echo "Less than 2 Infiniband devices found ($ib_count)." + echo "Assuming this is a test controller node. Skipping PMEM setup." 
+ exit +fi + if ipmctl show -dimm; then ipmctl show -goal ipmctl show -region @@ -60,12 +76,7 @@ if ipmctl show -dimm; then fi fi else - counter=0 - for ib in /sys/class/net/ib*; do - ((counter++)) || true - ip addr show "$ib" - done - if "$counter" -ge 2; then + if [ "$ib_count" -ge 2 ]; then # All of our CI nodes with two ib adapters should have PMEM DIMMs echo 'No PMEM DIMM devices found on CI node!' exit 1