From 9ea7f78c586298bc6a26d1d215683e0ce3801096 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 30 Oct 2024 13:33:57 +0000 Subject: [PATCH] multinode tweaks --- benchmarks/llm/recipes/full_finetune_distributed.py | 5 ++++- milabench/_version.py | 6 +++--- milabench/remote.py | 6 +++--- milabench/system.py | 8 ++++++-- scripts/article/run_rocm.sh | 4 +++- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/benchmarks/llm/recipes/full_finetune_distributed.py b/benchmarks/llm/recipes/full_finetune_distributed.py index 3a51842da..c80c652d6 100755 --- a/benchmarks/llm/recipes/full_finetune_distributed.py +++ b/benchmarks/llm/recipes/full_finetune_distributed.py @@ -97,7 +97,10 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - + # import os + # import torchcompat.core as acc + # self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0"))) + self._device = utils.get_device(device=cfg.device) self._dtype = utils.get_dtype(cfg.dtype, device=self._device) diff --git a/milabench/_version.py b/milabench/_version.py index 3973a23a5..a3f4e1b45 100644 --- a/milabench/_version.py +++ b/milabench/_version.py @@ -1,5 +1,5 @@ """This file is generated, do not modify""" -__tag__ = "v1.0.0_RC1-17-gbaf5304" -__commit__ = "baf53044e78d0989600359e9496e9aae682bf640" -__date__ = "2024-10-10 16:12:31 +0000" +__tag__ = "v1.0.0_RC1-18-g784b38e" +__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179" +__date__ = "2024-10-18 15:58:46 +0000" diff --git a/milabench/remote.py b/milabench/remote.py index 7e1eef85c..c92166fdd 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -100,7 +100,7 @@ def worker_commands(pack, worker_plan, setup_for="worker"): def sshnode(node, cmd): host = node["ip"] user = node["user"] - port = node["sshport"] + port = node.get("sshport", 22) return SSHCommand(cmd, user=user, host=host, port=port) @@ -124,7 +124,6 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: nodes = pack.config["system"]["nodes"] copy = [] - node_packs = [] copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for) @@ -132,7 +131,8 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: for i, node in enumerate(nodes): if should_run_for(node, setup_for): - install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER)) + node_pack = worker_pack(pack, node) + install.append(pip_install_milabench(node_pack, node, INSTALL_FOLDER)) return SequenceCommand( copy_source, diff --git a/milabench/system.py b/milabench/system.py index 421fd3f0a..7eaefb467 100644 --- a/milabench/system.py +++ b/milabench/system.py @@ -456,12 +456,16 @@ def gethostname(host): def resolve_hostname(ip): try: hostname, _, iplist = socket.gethostbyaddr(ip) - + for ip in iplist: if is_loopback(ip): return hostname, True - return hostname, hostname == socket.gethostname() + # Dell has a weird hostname config + # csctmp-xe9680-12.hpc.local csctmp-xe9680-12 + # print(hostname, socket.gethostname()) + + return socket.gethostname(), hostname.startswith(socket.gethostname()) except: if offline: diff --git a/scripts/article/run_rocm.sh b/scripts/article/run_rocm.sh index 7d9034316..3ce545fa6 100644 --- a/scripts/article/run_rocm.sh +++ b/scripts/article/run_rocm.sh @@ -117,7 +117,9 @@ fi pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1 ) -# milabench prepare $ARGS +milabench install $ARGS --system $MILABENCH_WORDIR/system.yaml + +milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml # # Run the benchmakrs