Skip to content

Commit

Permalink
UCT: Use fabric info for cuda-ipc reachability check
Browse files Browse the repository at this point in the history
  • Loading branch information
brminich committed Oct 3, 2024
1 parent b1a268b commit 2dbaf37
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 46 deletions.
10 changes: 6 additions & 4 deletions src/ucp/wireup/wireup.c
Original file line number Diff line number Diff line change
Expand Up @@ -1249,10 +1249,12 @@ int ucp_wireup_is_reachable(ucp_ep_h ep, unsigned ep_init_flags,
ucp_context_h context = ep->worker->context;
ucp_worker_iface_t *wiface = ucp_worker_iface(ep->worker, rsc_index);
uct_iface_is_reachable_params_t params = {
.field_mask = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR,
.device_addr = ae->dev_addr,
.iface_addr = ae->iface_addr,
.field_mask = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR |
UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH,
.device_addr = ae->dev_addr,
.iface_addr = ae->iface_addr,
.device_addr_length = ae->dev_addr_len
};

if (info_str != NULL) {
Expand Down
10 changes: 9 additions & 1 deletion src/uct/api/v2/uct_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,8 @@ typedef enum {
UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR = UCS_BIT(1), /**< iface_addr field */
UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING = UCS_BIT(2), /**< info_string field */
UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING_LENGTH = UCS_BIT(3), /**< info_string_length field */
UCT_IFACE_IS_REACHABLE_FIELD_SCOPE = UCS_BIT(4) /**< scope field */
UCT_IFACE_IS_REACHABLE_FIELD_SCOPE = UCS_BIT(4), /**< scope field */
UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH = UCS_BIT(5),
} uct_iface_is_reachable_field_mask_t;


Expand Down Expand Up @@ -611,6 +612,13 @@ typedef struct uct_iface_is_reachable_params {
* Reachability scope.
*/
uct_iface_reachability_scope_t scope;

/**
* Device address length. If not provided, the transport will assume a
* default minimal length according to the address buffer contents.
*/
size_t device_addr_length;

} uct_iface_is_reachable_params_t;


Expand Down
24 changes: 24 additions & 0 deletions src/uct/cuda/base/cuda_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,30 @@ uct_cuda_base_query_md_resources(uct_component_t *component,
num_resources_p);
}

/**
 * Check whether the platform is coherent.
 *
 * Queries CUDA device 0 for the
 * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES attribute.
 * When built without HAVE_CUDA_FABRIC, or when either CUDA driver call does
 * not return UCS_OK, the platform is reported as not coherent.
 *
 * @return The attribute value reported by the driver (non-zero if coherent),
 *         or 0 otherwise.
 */
int uct_cuda_base_is_coherent(void)
{
    int coherent = 0;
#if HAVE_CUDA_FABRIC
    CUdevice cu_device;
    ucs_status_t status;

    /* Only the first CUDA device (ordinal 0) is inspected */
    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
    if (status != UCS_OK) {
        return 0;
    }

    status = UCT_CUDADRV_FUNC_LOG_ERR(
            cuDeviceGetAttribute(&coherent,
                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
                                 cu_device));
    if (status != UCS_OK) {
        /* Do not trust 'coherent' if the query itself failed */
        return 0;
    }

#endif
    return coherent;
}

UCS_STATIC_INIT
{
/* coverity[check_return] */
Expand Down
8 changes: 8 additions & 0 deletions src/uct/cuda/base/cuda_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,12 @@ uct_cuda_base_query_devices(uct_md_h md,
*/
ucs_status_t uct_cuda_base_check_device_name(const uct_iface_params_t *params);


/**
 * Check whether the platform is coherent.
 *
 * The check is based on CUDA device 0 and its
 * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES attribute.
 * Builds without HAVE_CUDA_FABRIC always report a non-coherent platform.
 *
 * @return Non-zero if coherent, or 0 otherwise (including the case when the
 *         CUDA driver query fails).
 */
int uct_cuda_base_is_coherent(void);

#endif
105 changes: 67 additions & 38 deletions src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@
#include <pthread.h>
#include <nvml.h>


/**
 * Multi-node NVLINK (MNNVL) part of the cuda-ipc device address.
 *
 * Both fields are copied from the memory domain's NVML fabric info when the
 * device address is packed, and compared against the local fabric info in the
 * reachability check.
 * NOTE(review): UCS_S_PACKED presumably removes padding so the layout is
 * stable on the wire - confirm against the macro definition.
 */
typedef struct {
/* Fabric clique id (taken from fabric_info.cliqueId) */
uint32_t clique_id;
/* Fabric cluster UUID bytes (taken from fabric_info.clusterUuid) */
uint8_t cluster_uuid[NVML_GPU_FABRIC_UUID_LEN];
} UCS_S_PACKED uct_cuda_ipc_device_mnnvl_addr_t;


/**
 * cuda-ipc device address.
 *
 * The leading 64-bit system id is always present; a peer that advertises a
 * device address of only sizeof(uint64_t) bytes carries no MNNVL part, so
 * mnnvl_addr must only be read when the address is longer than that.
 */
typedef struct {
/* System id of the node, as returned by ucs_get_system_id() */
uint64_t system_uuid;
/* Fabric identification, filled only when MNNVL is enabled on the MD */
uct_cuda_ipc_device_mnnvl_addr_t mnnvl_addr;
} UCS_S_PACKED uct_cuda_ipc_device_addr_t;


static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = {

{"", "", NULL,
Expand Down Expand Up @@ -63,7 +76,22 @@ static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ipc_iface_t)(uct_iface_t*);
ucs_status_t uct_cuda_ipc_iface_get_device_address(uct_iface_t *tl_iface,
uct_device_addr_t *addr)
{
*(uint64_t*)addr = ucs_get_system_id();
uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface,
uct_cuda_ipc_iface_t);
uct_cuda_ipc_device_addr_t *dev_addr = (uct_cuda_ipc_device_addr_t*)addr;
uct_cuda_ipc_md_t *md = ucs_derived_of(iface->super.super.md,
uct_cuda_ipc_md_t);

dev_addr->system_uuid = ucs_get_system_id();

#if HAVE_CUDA_FABRIC
if (md->enable_mnnvl) {
memcpy(dev_addr->mnnvl_addr.cluster_uuid, &md->fabric_info.clusterUuid,
sizeof(dev_addr->mnnvl_addr.cluster_uuid));
dev_addr->mnnvl_addr.clique_id = md->fabric_info.cliqueId;
}
#endif

return UCS_OK;
}

Expand All @@ -74,56 +102,53 @@ static ucs_status_t uct_cuda_ipc_iface_get_address(uct_iface_h tl_iface,
return UCS_OK;
}

/**
 * Tell whether multi-node NVLINK can be used by the given memory domain.
 *
 * Requires a build with HAVE_CUDA_FABRIC, CUDA device 0 reporting the
 * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES attribute,
 * and MNNVL not being explicitly disabled (UCS_NO) in the MD configuration.
 *
 * @param [in] md  cuda-ipc memory domain whose enable_mnnvl setting is read.
 *
 * @return 1 if MNNVL is supported, 0 otherwise.
 */
static int uct_cuda_ipc_iface_is_mnnvl_supported(uct_cuda_ipc_md_t *md)
{
    int supported = 0;
#if HAVE_CUDA_FABRIC
    ucs_status_t status;
    int coherent;
    CUdevice cu_device;

    /* Only the first CUDA device (ordinal 0) is inspected */
    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
    if (status == UCS_OK) {
        status = UCT_CUDADRV_FUNC_LOG_ERR(
                cuDeviceGetAttribute(&coherent,
                                     CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
                                     cu_device));
        if (status == UCS_OK) {
            /* The user setting can only veto support, never force it */
            supported = coherent && (md->enable_mnnvl != UCS_NO);
        }
    }
#endif

    return supported;
}

static int
uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface,
const uct_iface_is_reachable_params_t *params)
{
uct_base_iface_t *base_iface = ucs_derived_of(tl_iface, uct_base_iface_t);
uct_cuda_ipc_md_t *md = ucs_derived_of(base_iface->md, uct_cuda_ipc_md_t);
uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);
uct_cuda_ipc_md_t *md = ucs_derived_of(iface->super.super.md,
uct_cuda_ipc_md_t);
const uct_cuda_ipc_device_addr_t *dev_addr;
size_t device_addr_len;
int same_uuid;

if (!uct_iface_is_reachable_params_addrs_valid(params)) {
return 0;
}

if (getpid() == *(pid_t*)params->iface_addr) {
dev_addr = (const uct_cuda_ipc_device_addr_t *)UCS_PARAM_VALUE(
UCT_IFACE_IS_REACHABLE_FIELD, params, device_addr,
DEVICE_ADDR, NULL);
device_addr_len = UCS_PARAM_VALUE(UCT_IFACE_IS_REACHABLE_FIELD, params,
device_addr_length, DEVICE_ADDR_LENGTH,
sizeof(uint64_t));
same_uuid = ucs_get_system_id() == dev_addr->system_uuid;

if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid) {
uct_iface_fill_info_str_buf(params, "same process");
return 0;
}

/* Either multi-node NVLINK should be supported or iface has to be on the
* same node for cuda-ipc to be reachable */
if ((ucs_get_system_id() != *((const uint64_t*)params->device_addr)) &&
!uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
#if HAVE_CUDA_FABRIC
if ((device_addr_len != sizeof(uint64_t)) && md->enable_mnnvl) {
if (memcmp(&dev_addr->mnnvl_addr.cluster_uuid,
&md->fabric_info.clusterUuid,
sizeof(dev_addr->mnnvl_addr.cluster_uuid))) {
uct_iface_fill_info_str_buf(params, "cluster uuid doesn't match");
return 0;
}
if (dev_addr->mnnvl_addr.clique_id != md->fabric_info.cliqueId){
uct_iface_fill_info_str_buf(params, "clique id doesn't match");
return 0;
}
} else
#endif
if (!same_uuid) {
uct_iface_fill_info_str_buf(params,
"different system id %"PRIx64" vs %"PRIx64"",
ucs_get_system_id(),
*((const uint64_t*)params->device_addr));
ucs_get_system_id(), dev_addr->system_uuid);

return 0;
}

Expand Down Expand Up @@ -239,11 +264,15 @@ static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface,
uct_iface_attr_t *iface_attr)
{
uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);
uct_cuda_ipc_md_t *md = ucs_derived_of(iface->super.super.md,
uct_cuda_ipc_md_t);

uct_base_iface_query(&iface->super.super, iface_attr);

iface_attr->iface_addr_len = sizeof(pid_t);
iface_attr->device_addr_len = sizeof(uint64_t);
iface_attr->device_addr_len = (md->enable_mnnvl) ?
sizeof(uct_cuda_ipc_device_addr_t) :
sizeof(uint64_t);
iface_attr->ep_addr_len = 0;
iface_attr->max_conn_priv = 0;
iface_attr->cap.flags = UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE |
Expand Down Expand Up @@ -608,7 +637,7 @@ uct_cuda_ipc_query_devices(
uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM;
uct_cuda_ipc_md_t *md = ucs_derived_of(uct_md, uct_cuda_ipc_md_t);

if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
if (md->enable_mnnvl) {
dev_type = UCT_DEVICE_TYPE_NET;
}

Expand Down
58 changes: 56 additions & 2 deletions src/uct/cuda/cuda_ipc/cuda_ipc_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,59 @@ uct_cuda_ipc_mem_dereg(uct_md_h md, const uct_md_mem_dereg_params_t *params)
return UCS_OK;
}

/**
 * Detect multi-node NVLINK (MNNVL) support and cache the GPU fabric info.
 *
 * On builds with HAVE_CUDA_FABRIC, and unless the platform is not coherent or
 * MNNVL is explicitly disabled, NVML is initialized, the fabric info of NVML
 * device 0 is stored in md->fabric_info, and NVML is shut down again. MNNVL is
 * considered supported only if the fabric state is
 * NVML_GPU_FABRIC_STATE_COMPLETED and the clique id is not UINT_MAX.
 *
 * @param [in,out] md            Memory domain whose fabric_info is filled.
 * @param [in]     mnnvl_enable  User setting: UCS_NO skips detection, UCS_YES
 *                               triggers a warning if MNNVL is unavailable.
 *
 * @return 1 if MNNVL is supported, 0 otherwise.
 */
static int
uct_cuda_ipc_md_init_fabric_info(uct_cuda_ipc_md_t *md,
                                 ucs_ternary_auto_value_t mnnvl_enable)
{
    int supported = 0;
#if HAVE_CUDA_FABRIC
    ucs_status_t status;
    nvmlDevice_t device;

    if (!uct_cuda_base_is_coherent() || (mnnvl_enable == UCS_NO)) {
        goto out;
    }

    /* NVML may legitimately be unavailable, so this is only a diagnostic */
    status = UCT_NVML_FUNC(nvmlInit_v2(), UCS_LOG_LEVEL_DIAG);
    if (status != UCS_OK) {
        goto out;
    }

    status = UCT_NVML_FUNC_LOG_ERR(nvmlDeviceGetHandleByIndex(0, &device));
    if (status != UCS_OK) {
        goto out_shutdown;
    }

    /* The structure version must be set before querying the fabric info */
    md->fabric_info.version = nvmlGpuFabricInfo_v2;
    status = UCT_NVML_FUNC_LOG_ERR(
            nvmlDeviceGetGpuFabricInfoV(device, &md->fabric_info));
    if (status != UCS_OK) {
        goto out_shutdown;
    }

    ucs_diag("Fabric_info: clique %u healthmask %u state %u",
             md->fabric_info.cliqueId, md->fabric_info.healthMask,
             md->fabric_info.state);

    /* Fabric registration must be complete and a valid clique assigned */
    supported = (md->fabric_info.state == NVML_GPU_FABRIC_STATE_COMPLETED) &&
                (md->fabric_info.cliqueId != UINT_MAX);

out_shutdown:
    /* Reached on success as well: NVML is needed only during detection */
    UCT_NVML_FUNC_LOG_ERR(nvmlShutdown());
out:
#endif
    if (!supported && (mnnvl_enable == UCS_YES)) {
        ucs_warn("multi-node NVLINK support is requested but not supported");
    }

    return supported;
}

static void uct_cuda_ipc_md_close(uct_md_h md)
{
ucs_free(md);
Expand Down Expand Up @@ -421,9 +474,10 @@ uct_cuda_ipc_md_open(uct_component_t *component, const char *md_name,

md->super.ops = &md_ops;
md->super.component = &uct_cuda_ipc_component.super;
md->enable_mnnvl = ipc_config->enable_mnnvl;
*md_p = &md->super;

md->enable_mnnvl = uct_cuda_ipc_md_init_fabric_info(
md, ipc_config->enable_mnnvl);

return UCS_OK;
}

Expand Down
5 changes: 4 additions & 1 deletion src/uct/cuda/cuda_ipc/cuda_ipc_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ typedef CUipcMemHandle uct_cuda_ipc_md_handle_t;
*/
typedef struct uct_cuda_ipc_md {
uct_md_t super; /**< Domain info */
ucs_ternary_auto_value_t enable_mnnvl;
int enable_mnnvl;
#if HAVE_CUDA_FABRIC
nvmlGpuFabricInfoV_t fabric_info;
#endif
} uct_cuda_ipc_md_t;


Expand Down

0 comments on commit 2dbaf37

Please sign in to comment.