hi
danehlim committed Sep 19, 2024
1 parent 6cb5505 commit 4777487
Showing 1 changed file with 55 additions and 0 deletions.
ecs-init/docker/docker.go (55 additions, 0 deletions)
@@ -16,6 +16,7 @@ package docker
import (
	"bytes"
	"encoding/json"
	"errors"
	"io"
	"os"
	"os/exec"
@@ -235,6 +236,39 @@ func (c *client) findAgentContainer() (string, error) {

// StartAgent starts the Agent in Docker and returns the exit code from the container
func (c *client) StartAgent() (int, error) {
	// To reproduce the NVIDIA GPU device availability race condition, launch an ECS-Optimized GPU AMI with the
	// following userdata:
	// ```
	// #!/bin/bash
	// # Remove NVIDIA modules, stop nvidia-persistenced, and remove NVIDIA GPU device files.
	// sudo modprobe -r nvidia_drm
	// sudo modprobe -r nvidia_modeset
	// sudo modprobe -r nvidia_uvm
	// sudo systemctl stop nvidia-persistenced
	// sudo modprobe -r nvidia
	// sudo rm -rf /dev/nvidia*
	// ```
	//
	// This ensures that NVIDIA GPU device files are not present on the instance during the PRESTART action.
	//
	// Once the START action is reached, we must perform the steps below to add back the NVIDIA modules, start
	// nvidia-persistenced, and regenerate the NVIDIA GPU device files. This must be done before environment
	// variables from files are loaded as part of StartAgent().
	generateAndRunCommandAndLogError("modprobe", "nvidia")
	generateAndRunCommandAndLogError("systemctl", "start", "nvidia-persistenced")
	generateAndRunCommandAndLogError("modprobe", "nvidia_uvm")
	generateAndRunCommandAndLogError("modprobe", "nvidia_modeset")
	generateAndRunCommandAndLogError("modprobe", "nvidia_drm")
	generateAndRunCommandAndLogError("nvidia-smi")

	// In this NVIDIA GPU device availability race condition, NVIDIA GPU info file creation never gets triggered
	// during the PRESTART action, causing the ECS Agent to later run into the error below when attempting to
	// initialize its NVIDIA GPU manager:
	// msg="Config for GPU support is enabled, but GPU information is not found; continuing without it"
	// module=nvidia_gpu_manager_unix.go

	// Wait a conservative amount of time here for the NVIDIA GPU device files to be present on the instance.
	time.Sleep(45 * time.Second)

	envVarsFromFiles := c.LoadEnvVars()

	hostConfig := c.getHostConfig(envVarsFromFiles)
@@ -373,6 +407,9 @@ func (c *client) LoadEnvVars() map[string]string {
	for envKey, envValue := range c.loadCustomInstanceEnvVars() {
		if envKey == config.GPUSupportEnvVar && envValue == "true" {
			if !nvidiaGPUDevicesPresent() {
				// TODO: When the environment variable with key config.GPUSupportEnvVar has value "true",
				// implement a mechanism that waits an adequate amount of time (and/or until some timeout)
				// for NVIDIA GPU devices to be present before continuing.
				log.Warn("No GPU devices found, ignoring the GPU support config")
				continue
			}
@@ -624,3 +661,21 @@ func isDomainJoined() bool {

	return true
}

// generateAndRunCommandAndLogError builds a command from name and arg, runs it, and logs (but does not
// propagate) any error, including the command's stderr output.
func generateAndRunCommandAndLogError(name string, arg ...string) {
	cmd := exec.Command(name, arg...)
	err := runCmd(cmd)
	if err != nil {
		log.Errorf("error running command \"%s %s\": %v", cmd.Path, strings.Join(cmd.Args, " "), err)
	}
}

// runCmd runs the given command and, if it fails, returns the command's stderr output as the error.
func runCmd(cmd *exec.Cmd) error {
	var stderr bytes.Buffer
	cmd.Stderr = &stderr
	err := cmd.Run()
	if err == nil {
		return nil
	}
	return errors.New(stderr.String())
}
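The TODO in LoadEnvVars points toward replacing the fixed 45-second sleep with a wait that polls for NVIDIA GPU device availability up to a timeout. Below is a minimal sketch of such a helper, assuming it lives in this same file alongside the existing nvidiaGPUDevicesPresent() helper; the function name and parameters are illustrative and not part of this commit.

```go
// waitForNvidiaGPUDevices polls for NVIDIA GPU device availability until the devices are present
// or the timeout elapses. Hypothetical sketch only; it relies on the file's existing
// nvidiaGPUDevicesPresent() helper and the time package already used by StartAgent.
func waitForNvidiaGPUDevices(timeout, pollInterval time.Duration) bool {
	deadline := time.Now().Add(timeout)
	for {
		if nvidiaGPUDevicesPresent() {
			return true
		}
		if time.Now().After(deadline) {
			// Timed out; the caller can fall back to warning and continuing without GPU support.
			return false
		}
		time.Sleep(pollInterval)
	}
}
```

StartAgent could then call something like waitForNvidiaGPUDevices(45*time.Second, time.Second) in place of the unconditional sleep, returning as soon as the device files appear.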
