Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/bullseye-coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -563,13 +563,11 @@ jobs:
STAGE_TAGS+=",provider"
if [[ '${{ matrix.stage }}' = *\ Verbs\ * ]]; then
FTEST_ARG+=' --provider ofi+verbs'
INST_RPMS+=' mercury-libfabric'
elif [[ '${{ matrix.stage }}' = *\ UCX\ * ]]; then
FTEST_ARG+=' --provider ucx+dc_x'
INST_RPMS+=' mercury-ucx'
elif [[ '${{ matrix.stage }}' = *\ TCP\ * ]]; then
FTEST_ARG+=' --provider ofi+tcp'
INST_RPMS+=' mercury-libfabric'
else
echo 'Unknown provider in ${{ matrix.stage }}'
exit 1
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/rpm-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -572,13 +572,11 @@ jobs:
STAGE_TAGS+=",provider"
if [[ '${{ matrix.stage }}' = *\ Verbs\ * ]]; then
FTEST_ARG+=' --provider ofi+verbs'
INST_RPMS+=' mercury-libfabric'
elif [[ '${{ matrix.stage }}' = *\ UCX\ * ]]; then
FTEST_ARG+=' --provider ucx+dc_x'
INST_RPMS+=' mercury-ucx'
elif [[ '${{ matrix.stage }}' = *\ TCP\ * ]]; then
FTEST_ARG+=' --provider ofi+tcp'
INST_RPMS+=' mercury-libfabric'
else
echo 'Unknown provider in ${{ matrix.stage }}'
exit 1
Expand Down
6 changes: 2 additions & 4 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -1048,8 +1048,7 @@ pipeline {
steps {
job_step_update(
testRpm(inst_repos: daosRepos(),
daos_pkg_version: daosPackagesVersion(next_version()),
inst_rpms: 'mercury-libfabric')
daos_pkg_version: daosPackagesVersion(next_version()))
)
}
post {
Expand Down Expand Up @@ -1101,8 +1100,7 @@ pipeline {
} */
job_step_update(
testRpm(inst_repos: daosRepos(),
daos_pkg_version: daosPackagesVersion(next_version()),
inst_rpms: 'mercury-libfabric')
daos_pkg_version: daosPackagesVersion(next_version()))
)
}
post {
Expand Down
4 changes: 2 additions & 2 deletions ci/provisioning/post_provision_config_common.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
#
# Copyright 2021-2023 Intel Corporation.
# Copyright 2025-2026 Hewlett Packard Enterprise Development LP
# Copyright 2025 Hewlett Packard Enterprise Development LP
#
# SPDX-License-Identifier: BSD-2-Clause-Patent
#
Expand Down Expand Up @@ -32,7 +32,7 @@ fi
# shellcheck disable=SC1091
. /etc/os-release
# shellcheck disable=SC2034
EXCLUDE_UPGRADE=mercury,mercury-\*,daos,daos-\*
EXCLUDE_UPGRADE=mercury,daos,daos-\*
if rpm -qa | grep mlnx; then
# packages not to allow upgrading if MLNX OFED is installed
EXCLUDE_UPGRADE+=,openmpi,\*mlnx\*,\*ucx\*
Expand Down
8 changes: 1 addition & 7 deletions ci/unit/required_packages.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#!/bin/bash
#
# (C) Copyright 2025 Google LLC
# Copyright 2025-2026 Hewlett Packard Enterprise Development LP
#
# SPDX-License-Identifier: BSD-2-Clause-Patent
#

set -eu

# No longer used but provided by pipeline-lib
Expand All @@ -29,7 +24,6 @@ pkgs="$(utils/rpms/package_version.sh argobots lib) \
$(utils/rpms/package_version.sh libfabric debug) \
$(utils/rpms/package_version.sh mercury dev) \
$(utils/rpms/package_version.sh mercury debug) \
$(utils/rpms/package_version.sh mercury lib mercury_libfabric) \
$(utils/rpms/package_version.sh pmdk lib pmemobj) \
$(utils/rpms/package_version.sh pmdk debug pmemobj) \
$(utils/rpms/package_version.sh pmdk debug pmem) \
Expand Down
74 changes: 0 additions & 74 deletions deps/patches/mercury/0001_dep_versions.patch

This file was deleted.

110 changes: 110 additions & 0 deletions deps/patches/mercury/0001_na_ucx.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c
index 84eb8b0..e4b6676 100644
--- a/src/na/na_ucx.c
+++ b/src/na/na_ucx.c
@@ -614,7 +614,7 @@ na_ucx_addr_map_update(struct na_ucx_class *na_ucx_class,
*/
static na_return_t
na_ucx_addr_map_remove(
- struct na_ucx_map *na_ucx_map, ucs_sock_addr_t *addr_key);
+ struct na_ucx_map *na_ucx_map, struct na_ucx_addr *remove_addr);

/**
* Hash connection ID.
@@ -1688,8 +1688,12 @@ na_ucp_listener_conn_cb(ucp_conn_request_h conn_request, void *arg)
.addr = (const struct sockaddr *) &conn_request_attrs.client_address,
.addrlen = sizeof(conn_request_attrs.client_address)};
na_ucx_addr = na_ucx_addr_map_lookup(&na_ucx_class->addr_map, &addr_key);
- NA_CHECK_SUBSYS_ERROR_NORET(addr, na_ucx_addr != NULL, error,
- "An entry is already present for this address");
+
+ if (na_ucx_addr != NULL) {
+ NA_LOG_SUBSYS_WARNING(addr,
+ "An entry is already present for this address");
+ na_ucx_addr_map_remove(&na_ucx_class->addr_map, na_ucx_addr);
+ }

/* Insert new entry and create new address */
na_ret = na_ucx_addr_map_insert(na_ucx_class, &na_ucx_class->addr_map,
@@ -1937,10 +1941,14 @@ na_ucp_ep_error_cb(
static void
na_ucp_ep_close(ucp_ep_h ep)
{
- ucs_status_ptr_t status_ptr = ucp_ep_close_nb(ep, UCP_EP_CLOSE_MODE_FORCE);
+ const ucp_request_param_t close_params = {
+ .op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS,
+ .flags = UCP_EP_CLOSE_FLAG_FORCE};
+ ucs_status_ptr_t status_ptr = ucp_ep_close_nbx(ep, &close_params);
+
NA_CHECK_SUBSYS_ERROR_DONE(addr,
status_ptr != NULL && UCS_PTR_IS_ERR(status_ptr),
- "ucp_ep_close_nb() failed (%s)",
+ "ucp_ep_close_nbx() failed (%s)",
ucs_status_string(UCS_PTR_STATUS(status_ptr)));
}

@@ -2722,7 +2730,7 @@ unlock:

/*---------------------------------------------------------------------------*/
static na_return_t
-na_ucx_addr_map_remove(struct na_ucx_map *na_ucx_map, ucs_sock_addr_t *addr_key)
+na_ucx_addr_map_remove(struct na_ucx_map *na_ucx_map, struct na_ucx_addr *remove_addr)
{
struct na_ucx_addr *na_ucx_addr = NULL;
na_return_t ret = NA_SUCCESS;
@@ -2731,13 +2739,14 @@ na_ucx_addr_map_remove(struct na_ucx_map *na_ucx_map, ucs_sock_addr_t *addr_key)
hg_thread_rwlock_wrlock(&na_ucx_map->lock);

na_ucx_addr = hg_hash_table_lookup(
- na_ucx_map->key_map, (hg_hash_table_key_t) addr_key);
- if (na_ucx_addr == HG_HASH_TABLE_NULL)
+ na_ucx_map->key_map, (hg_hash_table_key_t) &remove_addr->addr_key);
+
+ if (na_ucx_addr == HG_HASH_TABLE_NULL || na_ucx_addr->ucp_ep != remove_addr->ucp_ep)
goto unlock;

/* Remove addr key from primary map */
rc = hg_hash_table_remove(
- na_ucx_map->key_map, (hg_hash_table_key_t) addr_key);
+ na_ucx_map->key_map, (hg_hash_table_key_t) &na_ucx_addr->addr_key);
NA_CHECK_SUBSYS_ERROR(addr, rc != 1, unlock, ret, NA_NOENTRY,
"hg_hash_table_remove() failed");

@@ -2841,7 +2850,7 @@ na_ucx_addr_release(struct na_ucx_addr *na_ucx_addr)
NA_UCX_PRINT_ADDR_KEY_INFO("Removing address", &na_ucx_addr->addr_key);

na_ucx_addr_map_remove(
- &na_ucx_addr->na_ucx_class->addr_map, &na_ucx_addr->addr_key);
+ &na_ucx_addr->na_ucx_class->addr_map, na_ucx_addr);
}

if (na_ucx_addr->ucp_ep != NULL) {
@@ -3023,6 +3032,18 @@ na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context,

/* There is no need to have a fully resolved address to start an RMA.
* This is only necessary for two-sided communication. */
+ /* The above assumption is now in question, so the following will resolve
+ * the address if required. */
+
+ /* Check addr to ensure the EP for that addr is still valid */
+ if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
+ ret = na_ucx_addr_map_update(
+ na_ucx_class, &na_ucx_class->addr_map, na_ucx_addr);
+ NA_CHECK_SUBSYS_NA_ERROR(
+ addr, error, ret, "Could not update NA UCX address");
+ }
+ NA_CHECK_SUBSYS_ERROR(msg, na_ucx_addr->ucp_ep == NULL, error, ret,
+ NA_ADDRNOTAVAIL, "UCP endpoint is NULL for that address");

/* TODO UCX requires the remote key to be bound to the origin, do we need a
* new API? */
@@ -3061,6 +3082,9 @@ na_ucx_rma_key_resolve(ucp_ep_h ep, struct na_ucx_mem_handle *na_ucx_mem_handle,

hg_thread_mutex_lock(&na_ucx_mem_handle->rkey_unpack_lock);

+ NA_CHECK_SUBSYS_ERROR(
+ mem, ep == NULL, error, ret, NA_INVALID_ARG, "Invalid endpoint (%p)", ep);
+
switch (hg_atomic_get32(&na_ucx_mem_handle->type)) {
case NA_UCX_MEM_HANDLE_REMOTE_PACKED: {
ucs_status_t status = ucp_ep_rkey_unpack(ep,
64 changes: 64 additions & 0 deletions deps/patches/mercury/0002_na_ucx_ep_flush.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c
index 6e9c3b0..2f157da 100644
--- a/src/na/na_ucx.c
+++ b/src/na/na_ucx.c
@@ -441,6 +441,12 @@ na_ucp_ep_create(ucp_worker_h worker, ucp_ep_params_t *ep_params,
static void
na_ucp_ep_error_cb(void *arg, ucp_ep_h ep, ucs_status_t status);

+/**
+ * Flush endpoint.
+ */
+static ucs_status_ptr_t
+na_ucp_ep_flush(ucp_ep_h ep);
+
/**
* Close endpoint.
*/
@@ -1940,6 +1946,21 @@ na_ucp_ep_error_cb(
na_ucx_addr_ref_decr(na_ucx_addr);
}

+/*---------------------------------------------------------------------------*/
+static ucs_status_ptr_t
+na_ucp_ep_flush(ucp_ep_h ep)
+{
+ const ucp_request_param_t flush_params = {
+ .op_attr_mask = 0};
+ ucs_status_ptr_t status_ptr = ucp_ep_flush_nbx(ep, &flush_params);
+
+ NA_CHECK_SUBSYS_ERROR_DONE(addr,
+ status_ptr != NULL && UCS_PTR_IS_ERR(status_ptr),
+ "ucp_ep_flush_nb() failed (%s)",
+ ucs_status_string(UCS_PTR_STATUS(status_ptr)));
+ return status_ptr;
+}
+
/*---------------------------------------------------------------------------*/
static void
na_ucp_ep_close(ucp_ep_h ep)
@@ -2859,8 +2880,23 @@ na_ucx_addr_release(struct na_ucx_addr *na_ucx_addr)
if (na_ucx_addr->ucp_ep != NULL) {
/* NB. for deserialized addresses that are not "connected" addresses, do
* not close the EP */
- if (na_ucx_addr->worker_addr == NULL)
+ if (na_ucx_addr->worker_addr == NULL) {
+ if (!na_ucx_addr->na_ucx_class->ucp_listener) {
+ ucs_status_ptr_t status_ptr = na_ucp_ep_flush(na_ucx_addr->ucp_ep);
+
+ if (UCS_PTR_IS_PTR(status_ptr)) {
+ ucs_status_t status;
+
+ do {
+ ucp_worker_progress(na_ucx_addr->na_ucx_class->ucp_worker);
+ status = ucp_request_check_status(status_ptr);
+ } while (status == UCS_INPROGRESS);
+ ucp_request_free(status_ptr);
+ }
+ }
+
na_ucp_ep_close(na_ucx_addr->ucp_ep);
+ }
na_ucx_addr->ucp_ep = NULL;
}

Loading