diff --git a/.pin/constraints-hpu-torch.txt b/.pin/constraints-hpu-torch.txt
index 8d792d53f..92a55858c 100644
--- a/.pin/constraints-hpu-torch.txt
+++ b/.pin/constraints-hpu-torch.txt
@@ -529,7 +529,7 @@ optax==0.2.3
     #   -r benchmarks/purejaxrl/requirements.in
     #   brax
     #   flax
-optree==0.12.1
+optree==0.13.0
     # via envpool
 orbax-checkpoint==0.6.4
     # via
diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 3a51842da..22aecf944 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -98,7 +98,7 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
 
         if self._dtype == torch.float16:
diff --git a/benchmarks/llm/recipes/full_finetune_single_device.py b/benchmarks/llm/recipes/full_finetune_single_device.py
index 98322579f..b92eb2110 100755
--- a/benchmarks/llm/recipes/full_finetune_single_device.py
+++ b/benchmarks/llm/recipes/full_finetune_single_device.py
@@ -97,7 +97,7 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
         # enabled necessary features such as gradient scaling.
diff --git a/benchmarks/llm/recipes/lora_finetune_distributed.py b/benchmarks/llm/recipes/lora_finetune_distributed.py
index 18b736fbf..2ebbb5794 100755
--- a/benchmarks/llm/recipes/lora_finetune_distributed.py
+++ b/benchmarks/llm/recipes/lora_finetune_distributed.py
@@ -108,7 +108,7 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
 
         if self._dtype == torch.float16:
diff --git a/benchmarks/llm/recipes/lora_finetune_single_device.py b/benchmarks/llm/recipes/lora_finetune_single_device.py
index cf5256ead..91630e0d1 100755
--- a/benchmarks/llm/recipes/lora_finetune_single_device.py
+++ b/benchmarks/llm/recipes/lora_finetune_single_device.py
@@ -101,8 +101,9 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-
-        self._device = utils.get_device(device=cfg.device)
+        import torchcompat.core as accelerator
+
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         # Reduced precision logic
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # fp16 precision is explicitly disabled as it is not supported in this
diff --git a/benchmarks/torchatari/requirements.hpu.txt b/benchmarks/torchatari/requirements.hpu.txt
index 185b96b9f..6d7369dfc 100644
--- a/benchmarks/torchatari/requirements.hpu.txt
+++ b/benchmarks/torchatari/requirements.hpu.txt
@@ -192,7 +192,7 @@ omegaconf==2.3.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   voir
-optree==0.12.1
+optree==0.13.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   envpool
diff --git a/config/base.yaml b/config/base.yaml
index d7926799f..3e1a1aa27 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -566,6 +566,7 @@ llm-lora-single:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
+    device={device_name}: true
 
 
 llm-lora-ddp-gpus:
@@ -587,7 +588,7 @@ llm-lora-ddp-gpus:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
-
+    device={device_name}: true
 
 llm-lora-ddp-nodes:
   tags:
@@ -610,7 +611,7 @@ llm-lora-ddp-nodes:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
-
+    device={device_name}: true
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
@@ -636,7 +637,8 @@ llm-lora-mp-gpus:
     repo_id="meta-llama/Meta-Llama-3.1-70B": true
     batch_size=8: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-gpus:
   inherits: _llm
   tags:
@@ -658,7 +660,8 @@ llm-full-mp-gpus:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-nodes:
   tags:
     - multinode
@@ -681,7 +684,8 @@ llm-full-mp-nodes:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
diff --git a/constraints/hpu.txt b/constraints/hpu.txt
index 277e9f506..9f6fe957d 100644
--- a/constraints/hpu.txt
+++ b/constraints/hpu.txt
@@ -13,4 +13,4 @@ torchtune<0.3.0
 # transformers added torchao support recently
 # but only the most recent version we do not support
 transformers<4.45.0
-torchao
\ No newline at end of file
+torchvision
\ No newline at end of file
diff --git a/milabench/_version.py b/milabench/_version.py
index d3b29fabc..184ce9e95 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v1.0.0_RC1-12-g26cfed1"
-__commit__ = "26cfed102a1ab0b66ae99074d890215bc74ab441"
-__date__ = "2024-10-03 07:55:13 -0700"
+__tag__ = "v1.0.0_RC1-13-g8150c67"
+__commit__ = "8150c678e5b284acf36fdb08fc4e193b0b2956f2"
+__date__ = "2024-10-03 08:50:53 -0700"
diff --git a/scripts/article/run_hpu.sh b/scripts/article/run_hpu.sh
index fae8a3544..c240340fa 100644
--- a/scripts/article/run_hpu.sh
+++ b/scripts/article/run_hpu.sh
@@ -38,7 +38,7 @@ install_prepare() {
         export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench"
     fi
 
-    git clone https://github.com/huggingface/optimum-habana.git
+    git clone https://github.com/huggingface/optimum-habana.git -b v1.13.2
 
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
@@ -65,11 +65,11 @@ install_prepare() {
     (
         . $BENCHMARK_VENV/bin/activate
         which pip
-        pip install -e $MILABENCH_WORDIR/optimum-habana
+        pip install --no-deps -e $MILABENCH_WORDIR/optimum-habana
 
         # Override dependencies for HPU
         # benchmarks need pytorch
-        pip uninstall torch torchvision torchaudio
+        pip uninstall torch torchvision torchaudio -y
         export HABANALABS_VIRTUAL_DIR=$BENCHMARK_VENV
         ./habanalabs-installer.sh install -t dependencies --venv -y
         ./habanalabs-installer.sh install -t pytorch --venv -y
@@ -98,6 +98,9 @@ fi
 if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     cd $MILABENCH_WORDIR
 
+    # python -c "import torch; print(torch.__version__)"
+    # milabench prepare $ARGS
+
     #
     # Run the benchmakrs
     milabench run $ARGS