diff --git a/.pin/constraints-hpu-torch.txt b/.pin/constraints-hpu-torch.txt
index 8d792d53f..92a55858c 100644
--- a/.pin/constraints-hpu-torch.txt
+++ b/.pin/constraints-hpu-torch.txt
@@ -529,7 +529,7 @@ optax==0.2.3
     #   -r benchmarks/purejaxrl/requirements.in
     #   brax
     #   flax
-optree==0.12.1
+optree==0.13.0
     # via envpool
 orbax-checkpoint==0.6.4
     # via
diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py
index 3a51842da..22aecf944 100755
--- a/benchmarks/llm/recipes/full_finetune_distributed.py
+++ b/benchmarks/llm/recipes/full_finetune_distributed.py
@@ -98,7 +98,7 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
 
         if self._dtype == torch.float16:
diff --git a/benchmarks/llm/recipes/full_finetune_single_device.py b/benchmarks/llm/recipes/full_finetune_single_device.py
index 98322579f..b92eb2110 100755
--- a/benchmarks/llm/recipes/full_finetune_single_device.py
+++ b/benchmarks/llm/recipes/full_finetune_single_device.py
@@ -97,7 +97,7 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # Disable for fp16, as we haven't validated "full" fp16 with this recipe, nor
         # enabled necessary features such as gradient scaling.
diff --git a/benchmarks/llm/recipes/lora_finetune_distributed.py b/benchmarks/llm/recipes/lora_finetune_distributed.py
index 18b736fbf..2ebbb5794 100755
--- a/benchmarks/llm/recipes/lora_finetune_distributed.py
+++ b/benchmarks/llm/recipes/lora_finetune_distributed.py
@@ -108,7 +108,7 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-        self._device = utils.get_device(device=cfg.device)
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
 
         if self._dtype == torch.float16:
diff --git a/benchmarks/llm/recipes/lora_finetune_single_device.py b/benchmarks/llm/recipes/lora_finetune_single_device.py
index cf5256ead..91630e0d1 100755
--- a/benchmarks/llm/recipes/lora_finetune_single_device.py
+++ b/benchmarks/llm/recipes/lora_finetune_single_device.py
@@ -101,8 +101,9 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface):
     """
 
     def __init__(self, cfg: DictConfig) -> None:
-
-        self._device = utils.get_device(device=cfg.device)
+        import torchcompat.core as accelerator
+
+        self._device = accelerator.fetch_device(int(os.getenv("HABANA_VISIBLE_MODULES", "0").split(",")[0]))
         # Reduced precision logic
         self._dtype = utils.get_dtype(cfg.dtype, device=self._device)
         # fp16 precision is explicitly disabled as it is not supported in this
diff --git a/benchmarks/torchatari/requirements.hpu.txt b/benchmarks/torchatari/requirements.hpu.txt
index 185b96b9f..6d7369dfc 100644
--- a/benchmarks/torchatari/requirements.hpu.txt
+++ b/benchmarks/torchatari/requirements.hpu.txt
@@ -192,7 +192,7 @@ omegaconf==2.3.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   voir
-optree==0.12.1
+optree==0.13.0
     # via
     #   -c .pin/../.pin/constraints-hpu-torch.txt
     #   envpool
diff --git a/config/base.yaml b/config/base.yaml
index d7926799f..3e1a1aa27 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -566,6 +566,7 @@ llm-lora-single:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
+    device={device_name}: true
 
 
 llm-lora-ddp-gpus:
@@ -587,7 +588,7 @@ llm-lora-ddp-gpus:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
-
+    device={device_name}: true
 
 llm-lora-ddp-nodes:
   tags:
@@ -610,7 +611,7 @@ llm-lora-ddp-nodes:
     repo_id="meta-llama/Meta-Llama-3.1-8B": true
     batch_size=8: true
     gradient_accumulation_steps=8: true
-
+    device={device_name}: true
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
@@ -636,7 +637,8 @@ llm-lora-mp-gpus:
     repo_id="meta-llama/Meta-Llama-3.1-70B": true
     batch_size=8: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-gpus:
   inherits: _llm
   tags:
@@ -658,7 +660,8 @@ llm-full-mp-gpus:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
 llm-full-mp-nodes:
   tags:
     - multinode
@@ -681,7 +684,8 @@ llm-full-mp-nodes:
     safetensors=true: true
     batch_size=2: true
     gradient_accumulation_steps=1: true
-
+    device={device_name}: true
+
   num_machines: 2
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
diff --git a/constraints/hpu.txt b/constraints/hpu.txt
index 277e9f506..9f6fe957d 100644
--- a/constraints/hpu.txt
+++ b/constraints/hpu.txt
@@ -13,4 +13,4 @@ torchtune<0.3.0
 # transformers added torchao support recently
 # but only the most recent version we do not support
 transformers<4.45.0
-torchao
\ No newline at end of file
+torchvision
\ No newline at end of file
diff --git a/milabench/_version.py b/milabench/_version.py
index d3b29fabc..184ce9e95 100644
--- a/milabench/_version.py
+++ b/milabench/_version.py
@@ -1,5 +1,5 @@
 """This file is generated, do not modify"""
 
-__tag__ = "v1.0.0_RC1-12-g26cfed1"
-__commit__ = "26cfed102a1ab0b66ae99074d890215bc74ab441"
-__date__ = "2024-10-03 07:55:13 -0700"
+__tag__ = "v1.0.0_RC1-13-g8150c67"
+__commit__ = "8150c678e5b284acf36fdb08fc4e193b0b2956f2"
+__date__ = "2024-10-03 08:50:53 -0700"
diff --git a/scripts/article/run_hpu.sh b/scripts/article/run_hpu.sh
index fae8a3544..c240340fa 100644
--- a/scripts/article/run_hpu.sh
+++ b/scripts/article/run_hpu.sh
@@ -38,7 +38,7 @@ install_prepare() {
         export MILABENCH_SOURCE="$MILABENCH_WORDIR/milabench"
     fi
 
-    git clone https://github.com/huggingface/optimum-habana.git
+    git clone https://github.com/huggingface/optimum-habana.git -b v1.13.2
 
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.15.1/habanalabs-installer.sh
     # wget -nv https://vault.habana.ai/artifactory/gaudi-installer/1.16.1/habanalabs-installer.sh
@@ -65,11 +65,11 @@ install_prepare() {
     (
         . $BENCHMARK_VENV/bin/activate
         which pip
-        pip install -e $MILABENCH_WORDIR/optimum-habana
+        pip install --no-deps -e $MILABENCH_WORDIR/optimum-habana
 
         # Override dependencies for HPU
         # benchmarks need pytorch
-        pip uninstall torch torchvision torchaudio
+        pip uninstall torch torchvision torchaudio -y
         export HABANALABS_VIRTUAL_DIR=$BENCHMARK_VENV
         ./habanalabs-installer.sh install -t dependencies --venv -y
         ./habanalabs-installer.sh install -t pytorch --venv -y
@@ -98,6 +98,9 @@ fi
 if [ "$MILABENCH_PREPARE" -eq 0 ]; then
     cd $MILABENCH_WORDIR
 
+    # python -c "import torch; print(torch.__version__)"
+    # milabench prepare $ARGS
+
     #
     # Run the benchmakrs
     milabench run $ARGS