Skip to content

Commit

Permalink
Retry GPU devices check during env vars load if instance supports GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
danehlim committed Oct 8, 2024
1 parent 9eb274a commit 096d34a
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 19 deletions.
68 changes: 49 additions & 19 deletions ecs-init/docker/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,21 @@ const (
networkMode = "host"
// usernsMode specifies the userns mode to create the agent container
usernsMode = "host"
// minBackoffDuration specifies the minimum backoff duration for ping to
// pingDockerSocketMinBackoffDuration specifies the minimum backoff duration for ping to
// return a success response from the docker socket
minBackoffDuration = time.Second
// maxBackoffDuration specifies the maximum backoff duration for ping to
pingDockerSocketMinBackoffDuration = time.Second
// pingDockerSocketMaxBackoffDuration specifies the maximum backoff duration for ping to
// return a success response from the docker socket
maxBackoffDuration = 3 * time.Second
// backoffJitterMultiple specifies the backoff jitter multiplier
pingDockerSocketMaxBackoffDuration = 3 * time.Second
// pingDockerSocketBackoffJitterMultiple specifies the backoff jitter multiplier
// coefficient when pinging the docker socket
backoffJitterMultiple = 0.2
// backoffMultiple specifies the backoff multiplier coefficient when
pingDockerSocketBackoffJitterMultiple = 0.2
// pingDockerSocketBackoffMultiple specifies the backoff multiplier coefficient when
// pinging the docker socket
backoffMultiple = 2
// maxRetries specifies the maximum number of retries for ping to return
pingDockerSocketBackoffMultiple = 2
// pingDockerSocketMaxRetries specifies the maximum number of retries for ping to return
// a successful response from the docker socket
maxRetries = 5
pingDockerSocketMaxRetries = 5
// DefaultCgroupMountpoint is the default mount point for the cgroup subsystem
DefaultCgroupMountpoint = "/sys/fs/cgroup"
// pluginSocketFilesDir specifies the location of UNIX domain socket files of
Expand Down Expand Up @@ -110,6 +110,13 @@ const (
execConfigRelativePath = "config"

execAgentLogRelativePath = "/exec"

// nvidiaGPUDevicesPresentRetryTime specifies the duration of time to wait before retrying to check if NVIDIA
// GPU devices are present.
nvidiaGPUDevicesPresentRetryTime = 3 * time.Second
// nvidiaGPUDevicesPresentMaxRetries specifies the maximum number of retries to attempt for checking if NVIDIA
// GPU devices are present.
nvidiaGPUDevicesPresentMaxRetries = 10
)

// Do NOT include "CAP_" in capability string
Expand All @@ -133,12 +140,13 @@ var pluginDirs = []string{
}

var (
dockerOnce sync.Once
dockerClient *client
dockerClientErr error
isPathValid = defaultIsPathValid
execCommand = exec.Command
execLookPath = exec.LookPath
dockerOnce sync.Once
dockerClient *client
dockerClientErr error
isPathValid = defaultIsPathValid
execCommand = exec.Command
execLookPath = exec.LookPath
checkNvidiaGPUDevicesPresence = nvidiaGPUDevicesPresent
)

// client enables business logic for running the Agent inside Docker
Expand All @@ -153,8 +161,8 @@ func Client() (*client, error) {
// Create a backoff for pinging the docker socket. This should result in 17-19
// seconds of delay in the worst-case between different actions that depend on
// docker
pingBackoff := backoff.NewBackoff(minBackoffDuration, maxBackoffDuration, backoffJitterMultiple,
backoffMultiple, maxRetries)
pingBackoff := backoff.NewBackoff(pingDockerSocketMinBackoffDuration, pingDockerSocketMaxBackoffDuration, pingDockerSocketBackoffJitterMultiple,
pingDockerSocketBackoffMultiple, pingDockerSocketMaxRetries)
cl, err := newDockerClient(godockerClientFactory{}, pingBackoff)
if err != nil {
dockerClientErr = err
Expand Down Expand Up @@ -372,7 +380,12 @@ func (c *client) LoadEnvVars() map[string]string {
// merge in instance-specific environment variables
for envKey, envValue := range c.loadCustomInstanceEnvVars() {
if envKey == config.GPUSupportEnvVar && envValue == "true" {
if !nvidiaGPUDevicesPresent() {
// If environment variable with key `config.GPUSupportEnvVar` has value `true`, it is expected that
// NVIDIA GPU devices should eventually be present on the instance. Thus, do NOT give up and continue right
// away in the event that NVIDIA GPU devices are not yet present. Call
// `nvidiaGPUDevicesPresentWithRetries()` (instead of `nvidiaGPUDevicesPresent`) to retry and wait for a
// reasonable amount of time for NVIDIA GPU devices to be present before continuing.
if !nvidiaGPUDevicesPresentWithRetries() {
log.Warn("No GPU devices found, ignoring the GPU support config")
continue
}
Expand Down Expand Up @@ -571,6 +584,23 @@ func nvidiaGPUDevicesPresent() bool {
return true
}

// nvidiaGPUDevicesPresentWithRetries reports whether NVIDIA GPU devices are present on the instance.
//
// The presence check (checkNvidiaGPUDevicesPresence) runs once immediately. If it reports that no devices are
// present, the check is retried up to nvidiaGPUDevicesPresentMaxRetries more times, sleeping
// nvidiaGPUDevicesPresentRetryTime before each retry. It returns true as soon as any check succeeds and false
// once all retries have been exhausted.
func nvidiaGPUDevicesPresentWithRetries() bool {
	if checkNvidiaGPUDevicesPresence() {
		return true
	}
	for attempt := 1; attempt <= nvidiaGPUDevicesPresentMaxRetries; attempt++ {
		// Format the wait with the Duration's own string form (e.g. "3s") rather than a raw nanosecond count.
		log.Warnf("NVIDIA GPU devices are not yet present, retrying (attempt %d/%d) in %s",
			attempt, nvidiaGPUDevicesPresentMaxRetries, nvidiaGPUDevicesPresentRetryTime)
		time.Sleep(nvidiaGPUDevicesPresentRetryTime)
		if checkNvidiaGPUDevicesPresence() {
			return true
		}
	}
	return false
}

var MatchFilePatternForGPU = FilePatternMatchForGPU

func FilePatternMatchForGPU(pattern string) ([]string, error) {
Expand Down
52 changes: 52 additions & 0 deletions ecs-init/docker/docker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,58 @@ func TestStartAgentWithGPUConfigNoDevices(t *testing.T) {
assert.NoError(t, err)
}

// TestNvidiaGPUDevicesPresentWithRetries exercises nvidiaGPUDevicesPresentWithRetries with a stubbed presence
// check. It covers devices being present on the first check, devices appearing after a number of retries, and
// devices never appearing. In every case it verifies both the returned value and how many times the presence
// check was invoked.
func TestNvidiaGPUDevicesPresentWithRetries(t *testing.T) {
	testCases := []struct {
		name                                     string
		nvidiaGPUDevicesWillBePresent            bool
		numRetriesForNvidiaGPUDevicesToBePresent int
	}{
		{
			name:                                     "NVIDIA GPU devices present without retrying",
			nvidiaGPUDevicesWillBePresent:            true,
			numRetriesForNvidiaGPUDevicesToBePresent: 0,
		},
		{
			name:                                     "NVIDIA GPU devices present after retrying",
			nvidiaGPUDevicesWillBePresent:            true,
			numRetriesForNvidiaGPUDevicesToBePresent: 3,
		},
		{
			name:                          "NVIDIA GPU devices not present after retrying",
			nvidiaGPUDevicesWillBePresent: false,
		},
	}

	// Restore the real presence check once all subtests have run.
	defer func() {
		checkNvidiaGPUDevicesPresence = nvidiaGPUDevicesPresent
	}()

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Count every invocation so the retry bound can be asserted in all cases, including the one
			// where devices never show up.
			numChecks := 0
			checkNvidiaGPUDevicesPresence = func() bool {
				numChecks++
				return tc.nvidiaGPUDevicesWillBePresent &&
					numChecks > tc.numRetriesForNvidiaGPUDevicesToBePresent
			}

			devicesPresent := nvidiaGPUDevicesPresentWithRetries()

			if !tc.nvidiaGPUDevicesWillBePresent {
				assert.False(t, devicesPresent)
				// One initial check plus the maximum number of retries, and no more.
				assert.Equal(t, nvidiaGPUDevicesPresentMaxRetries+1, numChecks)
				return
			}
			assert.True(t, devicesPresent)
			// The first call is the initial check; every call after it is a retry.
			assert.Equal(t, tc.numRetriesForNvidiaGPUDevicesToBePresent, numChecks-1)
		})
	}
}

func TestGetContainerConfigWithFileOverrides(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
Expand Down

0 comments on commit 096d34a

Please sign in to comment.