Skip to content

Commit

Permalink
add kmod install script
Browse files Browse the repository at this point in the history
  • Loading branch information
fierlion committed Nov 29, 2023
1 parent 852b682 commit fddc349
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
8 changes: 5 additions & 3 deletions scripts/enable-ecs-agent-gpu-support.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env bash
# -e: abort on the first failing command; -x: trace each command as it runs.
set -ex

# NVIDIA driver version; used further down to pin nvidia-kmod-common.
NVIDIA_VERSION="535.54.03"

# Nothing to do unless AMI_TYPE names one of the GPU AMI variants.
# The third comparison previously lacked the `$`, so it compared the literal
# string "AMI_TYPE" against "opengpu" (always unequal) and "opengpu" builds
# exited here. An unset AMI_TYPE expands to the empty string and exits too.
# NOTE(review): a later section tests AMI_TYPE == "nvidiaopen", a value this
# guard rejects - confirm which name is intended.
if [[ "${AMI_TYPE}" != "al2gpu" && "${AMI_TYPE}" != "al2keplergpu" && "${AMI_TYPE}" != "opengpu" ]]; then
  exit 0
fi
Expand All @@ -9,15 +11,15 @@ if [[ $AMI_TYPE == "nvidiaopen" ]]; then
sudo yum install -y yum-plugin-versionlock \
yum-utils
sudo amazon-linux-extras install epel -y
sudo yum install -y "kernel-devel-uname-r == $(uname -r)"

# disable amzn2 in favor of rh repo
sudo yum-config-manager --disable amzn2-nvidia
sudo yum-config-manager --add-repo=https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
sudo yum-config-manager --enable cuda-rhel7.repo

# install open dkms from rh repo
sudo yum install -y kmod-nvidia-open-dkms \
nvidia-kmod-common
sudo yum install -y nvidia-kmod-common-${NVIDIA_VERSION}

# build nvidia-open kmod tar
DKMS=/usr/sbin/dkms
Expand All @@ -31,7 +33,7 @@ if [[ $AMI_TYPE == "nvidiaopen" ]]; then
sudo cp /var/lib/dkms/${MODULE_NAME}/${MODULE_VERSION}/tarball/*.tar.gz "${DKMS_ARCHIVE_DIR}/${MODULE_NAME}/"

# re-enable amzn2 and clean up
sudo yum remove kmod-nvidia-open-dkms
sudo yum remove -y kmod-nvidia-open-dkms
sudo yum-config-manager --disable cuda-rhel7.repo
sudo rm /etc/yum.repos.d/cuda-rhel7.repo
sudo rm -rf /var/cache/yum
Expand Down
22 changes: 22 additions & 0 deletions scripts/install-nvidia-open-kmod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
#
# Replace the DKMS-managed "nvidia" module at NVIDIA_VERSION with the build
# stored as a DKMS tarball under ${DKMS_ARCHIVE_DIR}/nvidia-open.
#
# Steps:
#   1. Read the module version reported by `dkms status -m nvidia`.
#   2. Verify the matching tarball for the running kernel exists.
#   3. Uninstall and remove the current nvidia module from DKMS.
#   4. Load the tarball into DKMS and install it.
#
# dkms is invoked directly (only systemctl goes through sudo), so the script
# presumably runs as root - confirm against the caller.

set -o errexit
set -o nounset
set -o pipefail # a failing `dkms status` must not be hidden by awk/tr
set -o xtrace

readonly DKMS=/usr/sbin/dkms
readonly DKMS_ARCHIVE_DIR=/var/lib/dkms-archive
readonly NVIDIA_VERSION="535.54.03"

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

KERNEL_VERSION="$(uname -r)"
readonly KERNEL_VERSION

# Second whitespace-separated field of the status output, stripped of ',' and
# ':'. Assumes status lines of the form "nvidia, <version>, ..." - TODO confirm
# against the dkms release shipped on the target image.
MODULE_VERSION="$("${DKMS}" status -m nvidia | awk '{print $2}' | tr -d ',:')"
readonly MODULE_VERSION

# An empty or multi-line result would silently build a bogus archive path.
if [[ -z "${MODULE_VERSION}" || "${MODULE_VERSION}" == *[[:space:]]* ]]; then
  die "could not determine a single nvidia module version from dkms status"
fi
echo "found nvidia kernel module: ${MODULE_VERSION}"

readonly MODULE_ARCHIVE="${DKMS_ARCHIVE_DIR}/nvidia-open/nvidia-open-${MODULE_VERSION}-kernel${KERNEL_VERSION}-x86_64.dkms.tar.gz"

# Check the replacement exists BEFORE removing the working module, so a missing
# tarball does not leave the host without any nvidia driver.
[[ -f "${MODULE_ARCHIVE}" ]] || die "module archive not found: ${MODULE_ARCHIVE}"

# Drop the currently registered module for every kernel.
"${DKMS}" uninstall -m nvidia -v "${NVIDIA_VERSION}"
"${DKMS}" remove "nvidia/${NVIDIA_VERSION}" --all

# Register the archived build with DKMS and install it.
echo "loading from ${MODULE_ARCHIVE}"
"${DKMS}" ldtarball "${MODULE_ARCHIVE}"
"${DKMS}" install -m nvidia -v "${NVIDIA_VERSION}"

sudo systemctl daemon-reload

# Final state, visible in the xtrace log.
"${DKMS}" status -m nvidia

0 comments on commit fddc349

Please sign in to comment.