Un-pin torch version in nv-torch-latest back to latest and skip test_compile_zero tests on v100 (#5459)

Updating torch to 2.3.0 broke some test_compile_zero tests, so we pinned torch at 2.2.2. @tohtana pushed fixes in #5463; this change un-pins torch and moves us back to the latest release.

The failing test showing that the generated code cannot run bf16 on a V100 is
[here](https://github.com/microsoft/DeepSpeed/actions/runs/8838672379/job/24270349996?pr=5459#step:8:5157).
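For reference (not part of this PR): bf16 support is tied to GPU compute capability. A V100 reports compute capability 7.0, while native bf16 kernels require 8.0 (Ampere) or newer, which is why the compiled bf16 path fails on these runners. A minimal check, assuming a CUDA-enabled PyTorch install:

```python
import torch

if torch.cuda.is_available():
    # V100 reports (7, 0); native bf16 compute requires (8, 0) or newer.
    major, minor = torch.cuda.get_device_capability()
    print(f"compute capability: {major}.{minor}")
    print("bf16 usable on this GPU:", major >= 8)
```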
loadams authored Apr 29, 2024
1 parent 059bb20 commit f32ad3e
Showing 2 changed files with 7 additions and 5 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/nv-torch-latest-v100.yml

```diff
@@ -29,7 +29,7 @@ jobs:

       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -55,5 +55,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.2" --cuda_ver="11.8"
-          pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.2" --cuda_ver="11.8"
+          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.3" --cuda_ver="11.8"
+          pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.3" --cuda_ver="11.8"
```
6 changes: 4 additions & 2 deletions tests/unit/runtime/compile/test_compile_zero.py

```diff
@@ -12,7 +12,7 @@

 from unit.runtime.compile.util import compare_loss
 from unit.common import DistributedTest
-from unit.util import bf16_required_version_check
+from unit.util import bf16_required_version_check, skip_on_arch

 pytestmark = pytest.mark.skipif(not required_torch_version(min_version=2.1),
                                 reason="Compile tests requires Pytorch version 2.1 or above")
@@ -26,9 +26,11 @@ class TestZeRO(DistributedTest):
     @pytest.mark.parametrize('zero_stage', [1, 2, 3])
     @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme])
     def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device):
+        if dtype == torch.bfloat16:
+            skip_on_arch(min_arch=8)
         if dtype == torch.bfloat16 and not bf16_required_version_check():
             pytest.skip(
-                " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
+                "DeepSpeed BFloat16 tests need NCCL >= 2.10.3, CUDA >=11.0, and HW support for BFloat16 to run correctly"
             )
         if get_accelerator().device_name() == "cpu":
             pytest.skip("CPU does not support this test yet")
```
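The actual `skip_on_arch` helper is imported from `unit.util`; its implementation is not shown in this diff. A minimal sketch of the kind of check it performs, assuming the compute capability reported by `torch.cuda.get_device_capability()` is the relevant signal (illustrative approximation, not a copy of the real helper):

```python
import pytest
import torch


def skip_on_arch_sketch(min_arch: int = 8) -> None:
    """Illustrative stand-in: skip the current test on GPUs older than `min_arch`.

    With min_arch=8, a V100 (compute capability 7.0) is skipped, while
    Ampere and newer (compute capability 8.0+) run the bf16 path.
    """
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        if major < min_arch:
            pytest.skip(f"needs compute capability >= {min_arch}.0, found {major}.x")
```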
