diff --git a/src/ucp/wireup/wireup.c b/src/ucp/wireup/wireup.c
index 009feb46426..4a22c34c73f 100644
--- a/src/ucp/wireup/wireup.c
+++ b/src/ucp/wireup/wireup.c
@@ -1249,10 +1249,12 @@ int ucp_wireup_is_reachable(ucp_ep_h ep, unsigned ep_init_flags,
     ucp_context_h context      = ep->worker->context;
     ucp_worker_iface_t *wiface = ucp_worker_iface(ep->worker, rsc_index);
     uct_iface_is_reachable_params_t params = {
-        .field_mask  = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
-                       UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR,
-        .device_addr = ae->dev_addr,
-        .iface_addr  = ae->iface_addr,
+        .field_mask         = UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR |
+                              UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR |
+                              UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH,
+        .device_addr        = ae->dev_addr,
+        .iface_addr         = ae->iface_addr,
+        .device_addr_length = ae->dev_addr_len
     };
 
     if (info_str != NULL) {
diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h
index a6649903483..3c9c7707e2c 100644
--- a/src/uct/api/v2/uct_v2.h
+++ b/src/uct/api/v2/uct_v2.h
@@ -273,7 +273,8 @@ typedef enum {
     UCT_IFACE_IS_REACHABLE_FIELD_IFACE_ADDR         = UCS_BIT(1), /**< iface_addr field */
     UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING        = UCS_BIT(2), /**< info_string field */
     UCT_IFACE_IS_REACHABLE_FIELD_INFO_STRING_LENGTH = UCS_BIT(3), /**< info_string_length field */
-    UCT_IFACE_IS_REACHABLE_FIELD_SCOPE              = UCS_BIT(4)  /**< scope field */
+    UCT_IFACE_IS_REACHABLE_FIELD_SCOPE              = UCS_BIT(4), /**< scope field */
+    UCT_IFACE_IS_REACHABLE_FIELD_DEVICE_ADDR_LENGTH = UCS_BIT(5)  /**< device_addr_length field */
 } uct_iface_is_reachable_field_mask_t;
 
 
@@ -611,6 +612,13 @@ typedef struct uct_iface_is_reachable_params {
      * Reachability scope.
      */
     uct_iface_reachability_scope_t scope;
+
+    /**
+     * Device address length. If not provided, the transport will assume a
+     * default minimal length according to the address buffer contents.
+     */
+    size_t                         device_addr_length;
+
 } uct_iface_is_reachable_params_t;
 
 
diff --git a/src/uct/cuda/base/cuda_md.c b/src/uct/cuda/base/cuda_md.c
index da8e7126af8..10192744445 100644
--- a/src/uct/cuda/base/cuda_md.c
+++ b/src/uct/cuda/base/cuda_md.c
@@ -101,6 +101,30 @@ uct_cuda_base_query_md_resources(uct_component_t *component,
                                         num_resources_p);
 }
 
+int uct_cuda_base_is_coherent(void)
+{
+    int coherent = 0;
+#if HAVE_CUDA_FABRIC
+    CUdevice cu_device;
+    ucs_status_t status;
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
+    if (status != UCS_OK) {
+        return 0;
+    }
+
+    status = UCT_CUDADRV_FUNC_LOG_ERR(
+            cuDeviceGetAttribute(&coherent,
+                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+                                 cu_device));
+    if (status != UCS_OK) {
+        return 0;
+    }
+
+#endif
+    return coherent;
+}
+
 UCS_STATIC_INIT
 {
     /* coverity[check_return] */
diff --git a/src/uct/cuda/base/cuda_md.h b/src/uct/cuda/base/cuda_md.h
index 23eb233796f..9c1d2660ab4 100644
--- a/src/uct/cuda/base/cuda_md.h
+++ b/src/uct/cuda/base/cuda_md.h
@@ -42,4 +42,12 @@ uct_cuda_base_query_devices(uct_md_h md,
  */
 ucs_status_t uct_cuda_base_check_device_name(const uct_iface_params_t *params);
 
+
+/**
+ * Check whether the platform is coherent.
+ *
+ * @return 1 if coherent, or 0 otherwise.
+ */
+int uct_cuda_base_is_coherent(void);
+
 #endif
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
index 079c331f19e..fccb3af2eab 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
@@ -20,6 +20,19 @@
 #include
 #include
 
+
+typedef struct {
+    uint32_t clique_id;                              /* fabric clique id */
+    uint8_t  cluster_uuid[NVML_GPU_FABRIC_UUID_LEN]; /* fabric cluster uuid */
+} UCS_S_PACKED uct_cuda_ipc_device_mnnvl_addr_t;
+
+
+typedef struct {
+    uint64_t                         system_uuid; /* ucs_get_system_id() value */
+    uct_cuda_ipc_device_mnnvl_addr_t mnnvl_addr;  /* packed only with MNNVL enabled */
+} UCS_S_PACKED uct_cuda_ipc_device_addr_t;
+
+
 static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = {
 
     {"", "", NULL,
@@ -63,7 +76,22 @@ static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ipc_iface_t)(uct_iface_t*);
 ucs_status_t uct_cuda_ipc_iface_get_device_address(uct_iface_t *tl_iface,
                                                    uct_device_addr_t *addr)
 {
-    *(uint64_t*)addr = ucs_get_system_id();
+    uct_cuda_ipc_iface_t *iface          = ucs_derived_of(tl_iface,
+                                                          uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_device_addr_t *dev_addr = (uct_cuda_ipc_device_addr_t*)addr;
+    uct_cuda_ipc_md_t *md                = ucs_derived_of(iface->super.super.md,
+                                                          uct_cuda_ipc_md_t);
+
+    dev_addr->system_uuid = ucs_get_system_id();
+
+#if HAVE_CUDA_FABRIC
+    if (md->enable_mnnvl) {
+        memcpy(dev_addr->mnnvl_addr.cluster_uuid, &md->fabric_info.clusterUuid,
+               sizeof(dev_addr->mnnvl_addr.cluster_uuid));
+        dev_addr->mnnvl_addr.clique_id = md->fabric_info.cliqueId;
+    }
+#endif
+
     return UCS_OK;
 }
 
@@ -74,56 +102,55 @@ static ucs_status_t uct_cuda_ipc_iface_get_address(uct_iface_h tl_iface,
     return UCS_OK;
 }
 
-static int uct_cuda_ipc_iface_is_mnnvl_supported(uct_cuda_ipc_md_t *md)
-{
-#if HAVE_CUDA_FABRIC
-    CUdevice cu_device;
-    int coherent;
-    ucs_status_t status;
-
-    status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cu_device, 0));
-    if (status != UCS_OK) {
-        return 0;
-    }
-
-    status = UCT_CUDADRV_FUNC_LOG_ERR(
-            cuDeviceGetAttribute(&coherent,
-                                 CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
-                                 cu_device));
-    if (status != UCS_OK) {
-        return 0;
-    }
-
-    return coherent && (md->enable_mnnvl != UCS_NO);
-#endif
-
-    return 0;
-}
-
 static int
 uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface,
                                    const uct_iface_is_reachable_params_t *params)
 {
-    uct_base_iface_t *base_iface = ucs_derived_of(tl_iface, uct_base_iface_t);
-    uct_cuda_ipc_md_t *md        = ucs_derived_of(base_iface->md, uct_cuda_ipc_md_t);
+    uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_md_t *md       = ucs_derived_of(iface->super.super.md,
+                                                 uct_cuda_ipc_md_t);
+    const uct_cuda_ipc_device_addr_t *dev_addr;
+    size_t device_addr_len;
+    int same_uuid;
 
     if (!uct_iface_is_reachable_params_addrs_valid(params)) {
         return 0;
     }
 
-    if (getpid() == *(pid_t*)params->iface_addr) {
+    dev_addr        = (const uct_cuda_ipc_device_addr_t *)UCS_PARAM_VALUE(
+                            UCT_IFACE_IS_REACHABLE_FIELD, params, device_addr,
+                            DEVICE_ADDR, NULL);
+    device_addr_len = UCS_PARAM_VALUE(UCT_IFACE_IS_REACHABLE_FIELD, params,
+                                      device_addr_length, DEVICE_ADDR_LENGTH,
+                                      sizeof(uint64_t));
+    same_uuid       = ucs_get_system_id() == dev_addr->system_uuid;
+
+    if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid) {
         uct_iface_fill_info_str_buf(params, "same process");
         return 0;
     }
 
-    /* Either multi-node NVLINK should be supported or iface has to be on the
-     * same node for cuda-ipc to be reachable */
-    if ((ucs_get_system_id() != *((const uint64_t*)params->device_addr)) &&
-        !uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+#if HAVE_CUDA_FABRIC
+    /* Compare fabric identifiers only if the peer address is long enough to
+     * carry them; shorter addresses hold just the system id */
+    if ((device_addr_len >= sizeof(*dev_addr)) && md->enable_mnnvl) {
+        if (memcmp(&dev_addr->mnnvl_addr.cluster_uuid,
+                   &md->fabric_info.clusterUuid,
+                   sizeof(dev_addr->mnnvl_addr.cluster_uuid))) {
+            uct_iface_fill_info_str_buf(params, "cluster uuid doesn't match");
+            return 0;
+        }
+        if (dev_addr->mnnvl_addr.clique_id != md->fabric_info.cliqueId) {
+            uct_iface_fill_info_str_buf(params, "clique id doesn't match");
+            return 0;
+        }
+    } else
+#endif
+    if (!same_uuid) {
         uct_iface_fill_info_str_buf(params,
                                     "different system id %"PRIx64" vs %"PRIx64"",
-                                    ucs_get_system_id(),
-                                    *((const uint64_t*)params->device_addr));
+                                    ucs_get_system_id(), dev_addr->system_uuid);
+
         return 0;
     }
 
@@ -239,11 +266,15 @@ static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface,
                                              uct_iface_attr_t *iface_attr)
 {
     uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t);
+    uct_cuda_ipc_md_t *md       = ucs_derived_of(iface->super.super.md,
+                                                 uct_cuda_ipc_md_t);
 
     uct_base_iface_query(&iface->super.super, iface_attr);
 
     iface_attr->iface_addr_len          = sizeof(pid_t);
-    iface_attr->device_addr_len         = sizeof(uint64_t);
+    iface_attr->device_addr_len         = (md->enable_mnnvl) ?
+                                          sizeof(uct_cuda_ipc_device_addr_t) :
+                                          sizeof(uint64_t);
     iface_attr->ep_addr_len             = 0;
     iface_attr->max_conn_priv           = 0;
     iface_attr->cap.flags               = UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE |
@@ -608,7 +639,7 @@ uct_cuda_ipc_query_devices(
     uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM;
     uct_cuda_ipc_md_t *md      = ucs_derived_of(uct_md, uct_cuda_ipc_md_t);
 
-    if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
+    if (md->enable_mnnvl) {
         dev_type = UCT_DEVICE_TYPE_NET;
     }
 
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
index 5922f6e9f57..47c242c72e0 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c
@@ -392,6 +392,67 @@ uct_cuda_ipc_mem_dereg(uct_md_h md, const uct_md_mem_dereg_params_t *params)
     return UCS_OK;
 }
 
+/**
+ * Query the NVML fabric info of GPU index 0 into the memory domain and decide
+ * whether multi-node NVLINK can be used.
+ *
+ * @param [in,out] md            Memory domain whose fabric_info is filled.
+ * @param [in]     mnnvl_enable  Configured multi-node NVLINK setting.
+ *
+ * @return 1 if multi-node NVLINK is supported and not disabled, 0 otherwise.
+ */
+static int
+uct_cuda_ipc_md_init_fabric_info(uct_cuda_ipc_md_t *md,
+                                 ucs_ternary_auto_value_t mnnvl_enable)
+{
+    int mnnvl_supported = 0;
+#if HAVE_CUDA_FABRIC
+    nvmlDevice_t device;
+    ucs_status_t status;
+
+    if (!uct_cuda_base_is_coherent() || (mnnvl_enable == UCS_NO)) {
+        goto err;
+    }
+
+    status = UCT_NVML_FUNC(nvmlInit_v2(), UCS_LOG_LEVEL_DIAG);
+    if (status != UCS_OK) {
+        goto err;
+    }
+
+    status = UCT_NVML_FUNC_LOG_ERR(nvmlDeviceGetHandleByIndex(0, &device));
+    if (status != UCS_OK) {
+        goto err_sd;
+    }
+
+    md->fabric_info.version = nvmlGpuFabricInfo_v2;
+    status = UCT_NVML_FUNC_LOG_ERR(
+            nvmlDeviceGetGpuFabricInfoV(device, &md->fabric_info));
+    if (status != UCS_OK) {
+        goto err_sd;
+    }
+
+    ucs_diag("Fabric_info: clique %u healthmask %u state %u",
+             md->fabric_info.cliqueId, md->fabric_info.healthMask,
+             md->fabric_info.state);
+
+    if ((md->fabric_info.state != NVML_GPU_FABRIC_STATE_COMPLETED) ||
+        (md->fabric_info.cliqueId == UINT_MAX)) {
+        goto err_sd;
+    }
+
+    mnnvl_supported = 1;
+
+err_sd:
+    UCT_NVML_FUNC_LOG_ERR(nvmlShutdown());
+err:
+#endif
+    if ((mnnvl_enable == UCS_YES) && !mnnvl_supported) {
+        ucs_warn("multi-node NVLINK support is requested but not supported");
+    }
+
+    return mnnvl_supported;
+}
+
 static void uct_cuda_ipc_md_close(uct_md_h md)
 {
     ucs_free(md);
@@ -421,9 +482,10 @@ uct_cuda_ipc_md_open(uct_component_t *component, const char *md_name,
 
     md->super.ops       = &md_ops;
     md->super.component = &uct_cuda_ipc_component.super;
-    md->enable_mnnvl    = ipc_config->enable_mnnvl;
     *md_p               = &md->super;
+    md->enable_mnnvl    = uct_cuda_ipc_md_init_fabric_info(
+            md, ipc_config->enable_mnnvl);
 
     return UCS_OK;
 }
 
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
index da2a83d6d80..64cb6982981 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
@@ -42,7 +42,10 @@ typedef CUipcMemHandle uct_cuda_ipc_md_handle_t;
  */
 typedef struct uct_cuda_ipc_md {
     uct_md_t                 super;   /**< Domain info */
-    ucs_ternary_auto_value_t enable_mnnvl;
+    int                      enable_mnnvl; /**< Non-zero if multi-node NVLINK is usable */
+#if HAVE_CUDA_FABRIC
+    nvmlGpuFabricInfoV_t     fabric_info;  /**< NVML fabric info of the local GPU */
+#endif
 } uct_cuda_ipc_md_t;
 
 