Commit ac34712: Use HPU device

Your Name committed Oct 3, 2024
1 parent 8150c67 commit ac34712
Showing 10 changed files with 27 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .pin/constraints-hpu-torch.txt

(Generated file; diff not rendered.)

2 changes: 1 addition & 1 deletion benchmarks/llm/recipes/full_finetune_distributed.py

@@ -98,7 +98,7 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):

     def __init__(self, cfg: DictConfig) -> None:

-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)

         if self._dtype == torch.float16:
2 changes: 1 addition & 1 deletion benchmarks/llm/recipes/full_finetune_single_device.py

@@ -97,7 +97,7 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface):
     """

     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
         # enabled necessary features such as gradient scaling.
2 changes: 1 addition & 1 deletion benchmarks/llm/recipes/lora_finetune_distributed.py

@@ -108,7 +108,7 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
     """

     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)

         if self._dtype == torch.float16:
5 changes: 3 additions & 2 deletions benchmarks/llm/recipes/lora_finetune_single_device.py

@@ -101,8 +101,9 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface):
     """

     def __init__(self, cfg: DictConfig) -> None:
-
-        self._device = utils.get_device(device=cfg.device)
+        import torchcompat.core as accelerator
+
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         # Reduced precision logic
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # fp16 precision is explicitly disabled as it is not supported in this
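The four recipe diffs above make the same substitution: device selection moves from torchtune's CUDA-oriented utils.get_device(device=cfg.device) to torchcompat, which resolves the first Gaudi module visible to the process. A minimal standalone sketch of the pattern, assuming torchcompat.core.fetch_device accepts an integer index and returns a torch.device for whatever backend it detects:

import os

import torchcompat.core as accelerator

# HABANA_VISIBLE_MODULES lists the Gaudi modules exposed to this process,
# e.g. "0,1,2,3"; the first entry becomes the local device index, with "0"
# as the fallback when the variable is unset.
device_index = int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0])

# fetch_device maps that index onto the detected backend (an HPU here),
# so the recipes no longer depend on a CUDA-specific cfg.device value.
device = accelerator.fetch_device(device_index)
print(device)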
2 changes: 1 addition & 1 deletion benchmarks/torchatari/requirements.hpu.txt

(Generated file; diff not rendered.)

14 changes: 9 additions & 5 deletions config/base.yaml

@@ -566,6 +566,7 @@ llm-lora-single:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
+    device={device_name}: true


 llm-lora-ddp-gpus:
@@ -587,7 +588,7 @@ llm-lora-ddp-gpus:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
-
+    device={device_name}: true

 llm-lora-ddp-nodes:
   tags:
@@ -610,7 +611,7 @@ llm-lora-ddp-nodes:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
-
+    device={device_name}: true
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
@@ -636,7 +637,8 @@ llm-lora-mp-gpus:
     repo_id="meta-llama/Meta-Llama-3.1-70B": true
     batch_size=8: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-gpus:
   inherits: _llm
   tags:
@@ -658,7 +660,8 @@ llm-full-mp-gpus:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-nodes:
   tags:
     - multinode
@@ -681,7 +684,8 @@ llm-full-mp-nodes:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 num_machines: 2
 requires_capabilities:
   - "len(nodes) >= ${num_machines}"
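Every benchmark plan in config/base.yaml now forwards an explicit device argument. A toy sketch of how the {device_name} placeholder could be resolved into argv entries, assuming str.format-style substitution (the actual milabench templating machinery may differ):

argv = {
    "batch_size=8": True,
    "gradient_accumulation_steps=8": True,
    "device={device_name}": True,
}

# Assumption: device_name would be supplied by the active backend at run time.
device_name = "hpu"

resolved = [arg.format(device_name=device_name) for arg, on in argv.items() if on]
print(resolved)  # ['batch_size=8', 'gradient_accumulation_steps=8', 'device=hpu']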
2 changes: 1 addition & 1 deletion constraints/hpu.txt

@@ -13,4 +13,4 @@ torchtune<0.3.0
 # transformers added torchao support recently
 # but only the most recent version we do not support
 transformers<4.45.0
-torchao
+torchvision
6 changes: 3 additions & 3 deletions milabench/_version.py

@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""

-__tag__ = "v1.0.0_RC1-12-g26cfed1"
-__commit__ = "26cfed102a1ab0b66ae99074d890215bc74ab441"
-__date__ = "2024-10-03 07:55:13 -0700"
+__tag__ = "v1.0.0_RC1-13-g8150c67"
+__commit__ = "8150c678e5b284acf36fdb08fc4e193b0b2956f2"
+__date__ = "2024-10-03 08:50:53 -0700"
9 changes: 6 additions & 3 deletions scripts/article/run_hpu.sh

@@ -38,7 +38,7 @@ install_prepare() {
         export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench"
     fi

-    git clone https://github.com/huggingface/optimum-habana.git
+    git clone https://github.com/huggingface/optimum-habana.git -b v1.13.2

     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
@@ -65,11 +65,11 @@ install_prepare() {
     (
         . $BENCHMARK_VENV/bin/activate
         which pip
-        pip install -e $MILABENCH_WORDIR/optimum-habana
+        pip install --no-deps -e $MILABENCH_WORDIR/optimum-habana

         # Override dependencies for HPU
         # benchmarks need pytorch
-        pip uninstall torch torchvision torchaudio
+        pip uninstall torch torchvision torchaudio -y
         export HABANALABS_VIRTUAL_DIR=$BENCHMARK_VENV
         ./habanalabs-installer.sh install -t dependencies --venv -y
         ./habanalabs-installer.sh install -t pytorch --venv -y
@@ -98,6 +98,9 @@ fi
 if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     cd $MILABENCH_WORDIR

+    # python -c "import torch; print(torch.__version__)"
+    # milabench prepare $ARGS
+
     #
     # Run the benchmakrs
     milabench run $ARGS
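The commented-out torch.__version__ check added above is a useful smoke test once the installer has run. A slightly fuller sketch, assuming habanalabs-installer.sh placed the Habana PyTorch bridge (habana_frameworks) into the same virtualenv:

import torch

# Importing the bridge registers the "hpu" device type with PyTorch
# (assumption: installed by habanalabs-installer.sh, not by pip).
import habana_frameworks.torch.hpu as hthpu

print(torch.__version__)     # Habana builds carry a vendor-specific suffix
print(hthpu.is_available())  # True when a Gaudi device is visible
print(hthpu.device_count())  # number of modules this process can use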
