From a72f73a30bc5663102c10d7ede90c0c9e1f1b92c Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Thu, 8 Aug 2024 06:54:49 +0200 Subject: [PATCH] Fix cloud multi-nodes * Copy ssh key to allow connections from master to workers * Use local ip for manager's ip such that workers can find it and connect to it * Fix incompatibility between pandas and numpy 2.0.0 --- .github/workflows/cloud-ci.yml | 57 ++++++++++++++++---- config/cloud-multinodes-system.yaml | 4 ++ config/cloud-system.yaml | 8 +++ config/examples/test.yaml | 12 ++--- milabench/commands/__init__.py | 13 +++-- milabench/config.py | 1 + milabench/remote.py | 5 -- milabench/scripts/covalent/__main__.py | 75 +++++++++++++++++++++----- milabench/system.py | 4 ++ poetry.lock | 4 +- pyproject.toml | 4 +- 11 files changed, 145 insertions(+), 42 deletions(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index b1c6e7fc6..363636965 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -14,11 +14,12 @@ jobs: cloud-tests: strategy: fail-fast: true + max-parallel: 1 matrix: + system: [1x_gpu, 2x_gpu] include: - arch: cuda exclude: "no-cuda" - run_on: azure__a100 # - arch: rocm # exclude : "no-rocm" @@ -27,7 +28,7 @@ jobs: # Cancel previous jobs if a new version was pushed concurrency: - group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}" + group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}" cancel-in-progress: true defaults: @@ -36,13 +37,14 @@ jobs: env: MILABENCH_CONFIG: "config/standard.yaml" - MILABENCH_SYSTEM: "config/cloud-system.yaml" + MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml" MILABENCH_BASE: "output" MILABENCH_ARGS: "" MILABENCH_DASH: "no" ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}" ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}" AZURE_CORE_OUTPUT: none + _MULTI_GPUS: "dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus" steps: - uses: actions/checkout@v3 @@ -90,23 +92,51 @@ jobs: - name: setup cloud run: | + case "${{ matrix.system }}" in + "1x_gpu") + export MILABENCH_SYSTEM="config/cloud-system.yaml" + export RUN_ON="azure__a100" + export SELECT= + export EXCLUDES="--exclude $_MULTI_GPUS,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single" + ;; + "2x_gpu") + export MILABENCH_SYSTEM="config/cloud-system.yaml" + export RUN_ON="azure__a100_x2" + export SELECT="--select $_MULTI_GPUS" + export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single" + ;; + "2x_node") + export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml" + export RUN_ON="azure__a100" + export SELECT="--select llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single" + export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single" + ;; + *) + exit 1 + ;; + esac + poetry run milabench cloud \ --setup \ - --run-on ${{ matrix.run_on }} \ - --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.${{ matrix.run_on }} - echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.${{ matrix.run_on }}" >>$GITHUB_ENV + --run-on $RUN_ON \ + --system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON + + echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV + echo "SELECT=$SELECT" >>$GITHUB_ENV + echo "EXCLUDES=$EXCLUDES" >>$GITHUB_ENV + echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV - name: install benchmarks run: | - poetry run milabench install --variant ${{ matrix.arch }} + poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES - name: prepare benchmarks run: | - poetry run milabench prepare + poetry run milabench prepare $SELECT $EXCLUDES - name: run benchmarks run: | - poetry run milabench run + poetry run milabench run $SELECT $EXCLUDES - name: Summary run: | @@ -118,6 +148,11 @@ jobs: env: GITHUB_TOKEN: ${{ github.token }} + - name: DEBUG state file + if: always() + run: | + cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate + - name: teardown cloud if: always() run: | @@ -127,10 +162,10 @@ jobs: fi poetry run milabench cloud \ --teardown \ - --run-on ${{ matrix.run_on }} \ + --run-on $RUN_ON \ --all - - name: debug logs + - name: DEBUG logs if: always() run: | cat ~/.cache/covalent/covalent_ui.log diff --git a/config/cloud-multinodes-system.yaml b/config/cloud-multinodes-system.yaml index e5dc14f2b..b6d4d6bee 100644 --- a/config/cloud-multinodes-system.yaml +++ b/config/cloud-multinodes-system.yaml @@ -5,6 +5,7 @@ system: - name: manager # Use 1.1.1.1 as an ip placeholder ip: 1.1.1.1 + port: 5000 # Use this node as the master node or not main: true # User to use in remote milabench operations @@ -21,11 +22,14 @@ system: username: ubuntu size: Standard_NC24ads_A100_v4 location: eastus2 + disk_size: 512 azure__a100_x2: username: ubuntu size: Standard_NC48ads_A100_v4 location: eastus2 + disk_size: 512 azure__a10_x2: username: ubuntu size: Standard_NV72ads_A10_v5 location: eastus2 + disk_size: 512 diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml index 2d1a049ad..2cbe41b0b 100644 --- a/config/cloud-system.yaml +++ b/config/cloud-system.yaml @@ -16,11 +16,19 @@ system: username: ubuntu size: Standard_NC24ads_A100_v4 location: eastus2 + disk_size: 512 azure__a100_x2: username: ubuntu size: Standard_NC48ads_A100_v4 location: eastus2 + disk_size: 512 + azure__a10: + username: ubuntu + size: Standard_NV36ads_A10_v5 + location: eastus2 + disk_size: 512 azure__a10_x2: username: ubuntu size: Standard_NV72ads_A10_v5 location: eastus2 + disk_size: 512 diff --git a/config/examples/test.yaml b/config/examples/test.yaml index 6e155a0bf..4f74ac33b 100644 --- a/config/examples/test.yaml +++ b/config/examples/test.yaml @@ -7,18 +7,18 @@ _defaults: test: inherits: _defaults - group: test_remote - install_group: test_remote - definition: ../../benchmarks/_template + group: simple + install_group: test + definition: ../../benchmarks/_templates/simple plan: method: njobs n: 1 testing: inherits: _defaults - definition: ../../benchmarks/_template - group: test_remote_2 - install_group: test_remote_2 + definition: ../../benchmarks/_templates/stdout + group: stdout + install_group: test plan: method: njobs n: 1 diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index 0de00f756..6c166132b 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -939,6 +939,13 @@ def _get_main_and_workers(self): def _argv(self, **_) -> List: manager, nodes = self._get_main_and_workers() + # Find local ip such that workers can connect to the port + for manager_ip in manager["ipaddrlist"]: + if ":" in manager_ip or manager_ip == "127.0.0.1": + continue + if all(str.isnumeric(n) for n in manager_ip.split(".")): + break + num_machines = max(1, len(nodes) + 1) # Cant do that maybe this run is constrained @@ -976,9 +983,9 @@ def _argv(self, **_) -> List: f"--machine_rank={self.rank}", f"--num_machines={num_machines}", *deepspeed_argv, - f"--gradient_accumulation_steps={self.pack.config.get('gradient_accumulation_steps', 1)}", - f"--num_cpu_threads_per_process={cpu_per_process}", - f"--main_process_ip={manager['ip']}", + f"--gradient_accumulation_steps={self.pack.config['gradient_accumulation_steps']}", + f"--num_cpu_threads_per_process={self.pack.config['argv']['--cpus_per_gpu']}", + f"--main_process_ip={manager_ip}", f"--main_process_port={manager['port']}", f"--num_processes={nproc}", *self.accelerate_argv, diff --git a/milabench/config.py b/milabench/config.py index 726f9f540..4936054dc 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -150,6 +150,7 @@ def build_config(*config_files): for layer in _config_layers(config_files): all_configs = merge(all_configs, layer) + all_configs.setdefault("*", {}) all_configs["*"]["hash"] = compute_config_hash(all_configs) all_configs = build_matrix_bench(all_configs) diff --git a/milabench/remote.py b/milabench/remote.py index b657f98c5..78e8ad736 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -2,16 +2,11 @@ import os import sys -import yaml - -from milabench.fs import XPath - from . import ROOT_FOLDER from .commands import ( CmdCommand, Command, ListCommand, - SCPCommand, SequenceCommand, SSHCommand, VoidCommand, diff --git a/milabench/scripts/covalent/__main__.py b/milabench/scripts/covalent/__main__.py index 995cc856f..e5bedc11a 100644 --- a/milabench/scripts/covalent/__main__.py +++ b/milabench/scripts/covalent/__main__.py @@ -89,15 +89,14 @@ def _popen(cmd, *args, _env=None, **kwargs): return_code = 0 try: if args.setup: - dispatch_id = ct.dispatch( - ct.lattice(executor.get_connection_attributes), disable_run=False - )() - - result = ct.get_result(dispatch_id=dispatch_id, wait=True).result + result = ct.dispatch_sync( + ct.lattice(executor.get_connection_attributes) + )().result assert result and result[0] all_connection_attributes, _ = result + master_host:str = None for hostname, connection_attributes in all_connection_attributes.items(): print(f"hostname::>{hostname}") for attribute,value in connection_attributes.items(): @@ -105,17 +104,65 @@ def _popen(cmd, *args, _env=None, **kwargs): continue print(f"{attribute}::>{value}") - if argv: - dispatch_id = ct.dispatch( - ct.lattice( - lambda:ct.electron(_popen, executor=executor)(argv) - ), - disable_run=False - )() + master_host = master_host or hostname + + if len(all_connection_attributes) > 1: + # Add master node to known host to avoid unknown host error + # The authenticity of host '[hostname] ([IP address])' can't be established. + new_host = subprocess.run( + ["ssh-keyscan", master_host], + stdout=subprocess.PIPE, + check=True + ).stdout.decode("utf8") + known_hosts = pathlib.Path("~/.ssh/known_hosts").expanduser() + with known_hosts.open("at") as _f: + _f.write(new_host) + + # Add ssh file to master node to allow connections to worker + # nodes + ssh_key_file = all_connection_attributes[master_host]["ssh_key_file"] + fn = pathlib.Path(ssh_key_file) + result = ct.dispatch_sync( + ct.lattice(executor.cp_to_remote) + )(f".ssh/{fn.name.split('.')[0]}", str(fn)) + + assert result.status == ct.status.COMPLETED - result = ct.get_result(dispatch_id=dispatch_id, wait=True).result + if argv: + result = ct.dispatch_sync( + ct.lattice(executor.list_running_instances) + )().result + + assert result + + dispatch_ids = set() + for connection_attributes in result.get( + (executor.state_prefix, executor.state_id), + {"env": None} + ).values(): + kwargs = { + **_get_executor_kwargs(args), + **connection_attributes + } + del kwargs["env"] + + _executor:ct.executor.BaseExecutor = executor_cls(**kwargs) + + dispatch_ids.add( + ct.dispatch( + ct.lattice( + lambda:ct.electron(_popen, executor=_executor)(argv) + ), + disable_run=False + )() + ) + + for dispatch_id in dispatch_ids: + result = ct.get_result(dispatch_id=dispatch_id, wait=True).result + + _return_code, _, _ = result if result is not None else (1, "", "") + return_code = return_code or _return_code - return_code, _, _ = result if result is not None else (1, "", "") finally: if args.teardown: result = executor.stop_cloud_instance().result diff --git a/milabench/system.py b/milabench/system.py index 8b9711514..8d137d642 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -258,6 +258,10 @@ def _resolve_ip(ip): if not offline: # Resolve the IP try: + # Workaround error with `gethostbyaddr` on azure DNS (like + # `inmako.eastus2.cloudapp.azure.com`). A proper fix might be a + # correct network config in terraform. + # socket.herror: [Errno 1] Unknown host hostname, aliaslist, ipaddrlist = socket.gethostbyname_ex(ip) lazy_raise = None diff --git a/poetry.lock b/poetry.lock index ec0f16753..b910db129 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "alabaster" @@ -2190,4 +2190,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "59901f6d97314b2a67cac2cf9c4300cb5bde2feba01b0198b20c8ac477adae05" +content-hash = "e8817803c68c0acc023e37a954027d5870b08d0e29cf46e8dd673df7e9d6994d" diff --git a/pyproject.toml b/pyproject.toml index 6a1693bf6..e7f784793 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,9 @@ blessed = "^1.19.1" pathspec = "^0.9.0" cp-template = "^0.3.0" pandas = ">=1.4.2" -numpy = ">=1.23.0,<2.0.0" +# Work around for compatibility issue between numpy 2.0.0 and pandas +# https://github.com/numpy/numpy/issues/26710 +numpy = "^1.23.0" pynvml = "^11.4.1" tqdm = "^4.64.1" pip-tools = "^7.4.1"