Skip to content

Commit

Permalink
Fix cloud multi-nodes
Browse files Browse the repository at this point in the history
* Copy SSH key to allow connections from the master to the workers
* Use the manager's local IP so that workers can find it and connect to it
* Fix incompatibility between pandas and numpy 2.0.0
* Fix diffusion benchmarks' file permissions
  • Loading branch information
satyaog committed Aug 21, 2024
1 parent 0f34dd2 commit 4a750e5
Show file tree
Hide file tree
Showing 14 changed files with 185 additions and 56 deletions.
58 changes: 47 additions & 11 deletions .github/workflows/cloud-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ jobs:
cloud-tests:
strategy:
fail-fast: true
max-parallel: 1
matrix:
system: [2x_gpu]
include:
- arch: cuda
exclude: "no-cuda"
run_on: azure__a100
# - arch: rocm
# exclude : "no-rocm"

Expand All @@ -27,7 +28,7 @@ jobs:

# Cancel previous jobs if a new version was pushed
concurrency:
group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}"
group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
cancel-in-progress: true

defaults:
Expand All @@ -36,13 +37,15 @@ jobs:

env:
MILABENCH_CONFIG: "config/standard.yaml"
MILABENCH_SYSTEM: "config/cloud-system.yaml"
MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
MILABENCH_BASE: "output"
MILABENCH_ARGS: ""
MILABENCH_DASH: "no"
ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
AZURE_CORE_OUTPUT: none
_MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
_MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"

steps:
- uses: actions/checkout@v3
Expand Down Expand Up @@ -90,23 +93,51 @@ jobs:
- name: setup cloud
run: |
case "${{ matrix.system }}" in
"1x_gpu")
export MILABENCH_SYSTEM="config/cloud-system.yaml"
export RUN_ON="azure__a100"
export SELECT=
export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
"2x_gpu")
export MILABENCH_SYSTEM="config/cloud-system.yaml"
export RUN_ON="azure__a100_x2"
export SELECT="--select $_MULTI_GPUS"
export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
"2x_node")
export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
export RUN_ON="azure__a100"
export SELECT="--select $_MULTI_NODES"
export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
*)
exit 1
;;
esac
poetry run milabench cloud \
--setup \
--run-on ${{ matrix.run_on }} \
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.${{ matrix.run_on }}
echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.${{ matrix.run_on }}" >>$GITHUB_ENV
--run-on $RUN_ON \
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON
echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV
echo "SELECT=$SELECT" >>$GITHUB_ENV
echo "EXCLUDES=$EXCLUDES" >>$GITHUB_ENV
echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV
- name: install benchmarks
run: |
poetry run milabench install --variant ${{ matrix.arch }}
poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES
- name: prepare benchmarks
run: |
poetry run milabench prepare
poetry run milabench prepare $SELECT $EXCLUDES
- name: run benchmarks
run: |
poetry run milabench run
poetry run milabench run $SELECT $EXCLUDES
- name: Summary
run: |
Expand All @@ -118,6 +149,11 @@ jobs:
env:
GITHUB_TOKEN: ${{ github.token }}

- name: DEBUG state file
if: always()
run: |
cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate
- name: teardown cloud
if: always()
run: |
Expand All @@ -127,10 +163,10 @@ jobs:
fi
poetry run milabench cloud \
--teardown \
--run-on ${{ matrix.run_on }} \
--run-on $RUN_ON \
--all
- name: debug logs
- name: DEBUG logs
if: always()
run: |
cat ~/.cache/covalent/covalent_ui.log
2 changes: 2 additions & 0 deletions benchmarks/diffusion/main.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python

from dataclasses import dataclass

from accelerate import Accelerator
Expand Down
4 changes: 4 additions & 0 deletions config/cloud-multinodes-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ system:
- name: manager
# Use 1.1.1.1 as an ip placeholder
ip: 1.1.1.1
port: 5000
# Use this node as the master node or not
main: true
# User to use in remote milabench operations
Expand All @@ -21,11 +22,14 @@ system:
username: ubuntu
size: Standard_NC24ads_A100_v4
location: eastus2
disk_size: 512
azure__a100_x2:
username: ubuntu
size: Standard_NC48ads_A100_v4
location: eastus2
disk_size: 512
azure__a10_x2:
username: ubuntu
size: Standard_NV72ads_A10_v5
location: eastus2
disk_size: 512
8 changes: 8 additions & 0 deletions config/cloud-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,19 @@ system:
username: ubuntu
size: Standard_NC24ads_A100_v4
location: eastus2
disk_size: 512
azure__a100_x2:
username: ubuntu
size: Standard_NC48ads_A100_v4
location: eastus2
disk_size: 512
azure__a10:
username: ubuntu
size: Standard_NV36ads_A10_v5
location: eastus2
disk_size: 512
azure__a10_x2:
username: ubuntu
size: Standard_NV72ads_A10_v5
location: eastus2
disk_size: 512
12 changes: 6 additions & 6 deletions config/examples/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ _defaults:

test:
inherits: _defaults
group: test_remote
install_group: test_remote
definition: ../../benchmarks/_template
group: simple
install_group: test
definition: ../../benchmarks/_templates/simple
plan:
method: njobs
n: 1

testing:
inherits: _defaults
definition: ../../benchmarks/_template
group: test_remote_2
install_group: test_remote_2
definition: ../../benchmarks/_templates/stdout
group: stdout
install_group: test
plan:
method: njobs
n: 1
20 changes: 16 additions & 4 deletions milabench/cli/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from coleo import Option, tooled

from milabench.remote import is_remote
from milabench.utils import validation_layers

from ..common import (
Expand Down Expand Up @@ -63,32 +64,43 @@ def arguments():
return Arguments(run_name, repeat, fulltrace, report, dash, noterm, validations)



def _fetch_arch(mp):
try:
arch = next(iter(mp.packs.values())).config["system"]["arch"]
except StopIteration:
print("no selected bench")
return None



def _fetch_first_pack(mp):
try:
return next(iter(mp.packs.values()))
except StopIteration:
print("no selected bench")
return None


@tooled
def cli_run(args=None):
"""Run the benchmarks."""
if args is None:
args = arguments()

layers = validation_names(args.validations)

dash_class = {
"short": ShortDashFormatter,
"long": LongDashFormatter,
"no": None,
}.get(args.dash, None)

mp = get_multipack(run_name=args.run_name)
first_pack = _fetch_first_pack(mp)
arch = _fetch_arch(mp)

layers = validation_names(args.validations)
if is_remote(first_pack):
# Remote execution will never send back rates
layers.remove("ensure_rate")

# Initialize the backend here so we can retrieve GPU stats
init_arch(arch)

Expand Down
26 changes: 22 additions & 4 deletions milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,16 @@ def __init__(self, executor: Command, **kwargs) -> None:
main = self.nodes[0]

# node[port] is for SSH
main_host = main["ip"]
# Find local ip such that workers can connect to the port
for main_host in main["ipaddrlist"]:
if ":" in main_host or main_host == "127.0.0.1":
continue
if all(str.isnumeric(n) for n in main_host.split(".")):
break
else:
main_host = main["ip"]
if len(self.nodes) == 1:
main_host = "localhost"
# add them as option so we could tweak them if necessary
main_port = option("torchrun.port", int, default=29400)
backend = option("torchrun.backend", str, default="c10d")
Expand Down Expand Up @@ -939,6 +948,15 @@ def _get_main_and_workers(self):
def _argv(self, **_) -> List:
manager, nodes = self._get_main_and_workers()

# Find local ip such that workers can connect to the port
for manager_ip in manager["ipaddrlist"]:
if ":" in manager_ip or manager_ip == "127.0.0.1":
continue
if all(str.isnumeric(n) for n in manager_ip.split(".")):
break
else:
manager_ip = manager['ip']

num_machines = max(1, len(nodes) + 1)

# Cant do that maybe this run is constrained
Expand Down Expand Up @@ -976,9 +994,9 @@ def _argv(self, **_) -> List:
f"--machine_rank={self.rank}",
f"--num_machines={num_machines}",
*deepspeed_argv,
f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}",
f"--num_cpu_threads_per_process={cpu_per_process}",
f"--main_process_ip={manager['ip']}",
f"--gradient_accumulation_steps={self.pack.config['gradient_accumulation_steps']}",
f"--num_cpu_threads_per_process={self.pack.config['argv']['--cpus_per_gpu']}",
f"--main_process_ip={manager_ip}",
f"--main_process_port={manager['port']}",
f"--num_processes={nproc}",
*self.accelerate_argv,
Expand Down
16 changes: 8 additions & 8 deletions milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,18 +314,18 @@ def _read_reports(*runs):
return all_data


def _filter_reports(*reports):
all_reports = []
def _filter_reports(**reports):
_reports = {}

for report in reports:
for k, report in reports.items():
config = next(iter(e for e in report if e["event"] == "config"), None)
if config is None:
continue

if config["data"]["name"] != "remote":
all_reports.append(report)
_reports[k] = report

return all_reports
return _reports


def _push_reports(reports_repo, runs):
Expand Down Expand Up @@ -356,8 +356,8 @@ def _push_reports(reports_repo, runs):

device_reports = {}
for run in runs:
reports = list(_read_reports(run).values())
reports = _filter_reports(*reports)
reports = _read_reports(run)
reports = list(_filter_reports(**reports).values())

if not reports:
continue
Expand Down Expand Up @@ -392,7 +392,7 @@ def _push_reports(reports_repo, runs):
for (device, build), reports in device_reports.items():
reports_dir = XPath(reports_repo.working_tree_dir) / build
reports = _read_reports(*reports)
reports = _filter_reports(*reports.values())
reports = _filter_reports(**reports)
summary = make_summary(reports)

successes = [s["successes"] for s in summary.values()]
Expand Down
1 change: 1 addition & 0 deletions milabench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ def build_config(*config_files):
for layer in _config_layers(config_files):
all_configs = merge(all_configs, layer)

all_configs.setdefault("*", {})
all_configs["*"]["hash"] = compute_config_hash(all_configs)

all_configs = build_matrix_bench(all_configs)
Expand Down
7 changes: 1 addition & 6 deletions milabench/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,11 @@
import os
import sys

import yaml

from milabench.fs import XPath

from . import ROOT_FOLDER
from .commands import (
CmdCommand,
Command,
ListCommand,
SCPCommand,
SequenceCommand,
SSHCommand,
VoidCommand,
Expand Down Expand Up @@ -291,6 +286,6 @@ def milabench_remote_run(pack) -> Command:

argv = sys.argv[2:]
return SequenceCommand(
milabench_remote_command(pack, "run", *argv, run_for="main"),
milabench_remote_command(pack, "run", *argv, "--run-name", pack.config["run_name"], run_for="main"),
milabench_remote_fetch_reports_plan(pack, run_for="main"),
)
Loading

0 comments on commit 4a750e5

Please sign in to comment.