Skip to content

Commit

Permalink
multinode tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Oct 30, 2024
1 parent 784b38e commit 9ea7f78
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 10 deletions.
5 changes: 4 additions & 1 deletion benchmarks/llm/recipes/full_finetune_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface):
"""

def __init__(self, cfg: DictConfig) -> None:

# import os
# import torchcompat.core as acc
# self._device = acc.fetch_device(int(os.getenv("LOCAL_RANK", "0")))

self._device = utils.get_device(device=cfg.device)
self._dtype = utils.get_dtype(cfg.dtype, device=self._device)

Expand Down
6 changes: 3 additions & 3 deletions milabench/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""This file is generated, do not modify"""

__tag__ = "v1.0.0_RC1-17-gbaf5304"
__commit__ = "baf53044e78d0989600359e9496e9aae682bf640"
__date__ = "2024-10-10 16:12:31 +0000"
__tag__ = "v1.0.0_RC1-18-g784b38e"
__commit__ = "784b38e77b90116047e3de893c22c2f7d3225179"
__date__ = "2024-10-18 15:58:46 +0000"
6 changes: 3 additions & 3 deletions milabench/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def worker_commands(pack, worker_plan, setup_for="worker"):
def sshnode(node, cmd):
host = node["ip"]
user = node["user"]
port = node["sshport"]
port = node.get("sshport", 22)
return SSHCommand(cmd, user=user, host=host, port=port)


Expand All @@ -124,15 +124,15 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:

nodes = pack.config["system"]["nodes"]
copy = []
node_packs = []

copy_source = copy_folder(pack, INSTALL_FOLDER, setup_for)

install = []

for i, node in enumerate(nodes):
if should_run_for(node, setup_for):
install.append(pip_install_milabench(node_packs[i], node, INSTALL_FOLDER))
node_pack = worker_pack(pack, node)
install.append(pip_install_milabench(node_pack, node, INSTALL_FOLDER))

return SequenceCommand(
copy_source,
Expand Down
8 changes: 6 additions & 2 deletions milabench/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,12 +456,16 @@ def gethostname(host):
def resolve_hostname(ip):
try:
hostname, _, iplist = socket.gethostbyaddr(ip)

for ip in iplist:
if is_loopback(ip):
return hostname, True

return hostname, hostname == socket.gethostname()
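# On the Dell nodes the reverse-DNS name is fully qualified while
# socket.gethostname() returns only the short name, e.g.
#   csctmp-xe9680-12.hpc.local  vs  csctmp-xe9680-12
# so compare by prefix instead of strict equality.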
# print(hostname, socket.gethostname())

return socket.gethostname(), hostname.startswith(socket.gethostname())

except:
if offline:
Expand Down
4 changes: 3 additions & 1 deletion scripts/article/run_rocm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ fi
pip install xformers --index-url https://download.pytorch.org/whl/rocm6.1
)

# milabench prepare $ARGS
milabench install $ARGS --system $MILABENCH_WORDIR/system.yaml

milabench prepare $ARGS --system $MILABENCH_WORDIR/system.yaml

#
# Run the benchmarks
Expand Down

0 comments on commit 9ea7f78

Please sign in to comment.