Skip to content

Commit

Permalink
Fix cloud-ci with gpu arch
Browse files Browse the repository at this point in the history
  • Loading branch information
satyaog committed Apr 2, 2024
1 parent e8d7bea commit 074c588
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 73 deletions.
18 changes: 11 additions & 7 deletions .github/workflows/cloud-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
include:
- arch: cuda
exclude: "no-cuda"
run_on: azure
run_on: azure__a100
# - arch: rocm
# exclude : "no-rocm"

Expand All @@ -34,8 +34,8 @@ jobs:
shell: bash -el {0}

env:
MILABENCH_CONFIG: "config/test.yaml"
MILABENCH_SYSTEM: "config/examples/cloud-system.yaml"
MILABENCH_CONFIG: "config/standard.yaml"
MILABENCH_SYSTEM: "config/cloud-system.yaml"
MILABENCH_BASE: "output"
MILABENCH_ARGS: ""
MILABENCH_GPU_ARCH: "${{ matrix.arch }}"
Expand All @@ -53,15 +53,18 @@ jobs:
with:
python-version: 3.9

# Follow
# https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
# to generate a clientId as well as a clientSecret
- name: Azure login
uses: azure/login@v2
with:
creds: |
{
"clientId": "${{ secrets.ARM_CLIENT_ID }}",
"clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
"subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
"tenantId": "${{ secrets.ARM_TENANT_ID }}",
"clientId": "${{ secrets.ARM_CLIENT_ID }}"
"tenantId": "${{ secrets.ARM_TENANT_ID }}"
}
- name: dependencies
Expand Down Expand Up @@ -108,7 +111,7 @@ jobs:
- name: install benchmarks
run: |
poetry run milabench install
poetry run milabench install --variant ${{ matrix.arch }}
- name: prepare benchmarks
run: |
Expand All @@ -134,4 +137,5 @@ jobs:
fi
poetry run milabench cloud \
--teardown \
--run-on ${{ matrix.run_on }}
--run-on ${{ matrix.run_on }} \
--all
18 changes: 18 additions & 0 deletions config/cloud-system.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
system:
# Nodes list
nodes:
# Alias used to reference the node
- name: manager
# Use 1.1.1.1 as an ip placeholder
ip: 1.1.1.1
# Use this node as the master node or not
main: true
# User to use in remote milabench operations
user: user

# Cloud instances profiles
cloud_profiles:
azure__a100:
username: ubuntu
size: Standard_NC24ads_A100_v4
location: eastus2
7 changes: 7 additions & 0 deletions config/examples/cloud-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,18 @@ system:

# Cloud instances profiles
cloud_profiles:
# The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME}
azure:
# covalent-azure-plugin args
username: ubuntu
size: Standard_B1s
location: eastus2
azure__free:
username: ubuntu
size: Standard_B2ats_v2
location: eastus2
ec2:
# covalent-ec2-plugin args
username: ubuntu
instance_type: t2.micro
volume_size: 8
Expand Down
4 changes: 2 additions & 2 deletions config/test.yaml → config/examples/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ test:
inherits: _defaults
group: test_remote
install_group: test_remote
definition: ../benchmarks/_template
definition: ../../benchmarks/_template
plan:
method: njobs
n: 1

testing:
inherits: _defaults
definition: ../benchmarks/_template
definition: ../../benchmarks/_template
group: test_remote_2
install_group: test_remote_2
plan:
Expand Down
53 changes: 33 additions & 20 deletions milabench/cli/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys

from coleo import Option, tooled
from omegaconf import OmegaConf
import yaml

from ..common import get_multipack
Expand All @@ -15,7 +16,16 @@
_ACTIONS = (_SETUP, _TEARDOWN, _LIST)


def manage_cloud(pack, packs, run_on, action="setup"):
def _flatten_cli_args(**kwargs):
return sum(
(
(f"--{k.replace('_', '-')}", *([v] if v else []))
for k, v in kwargs.items()
), ()
)


def manage_cloud(pack, run_on, action="setup"):
assert run_on in pack.config["system"]["cloud_profiles"]

key_map = {
Expand All @@ -28,11 +38,6 @@ def manage_cloud(pack, packs, run_on, action="setup"):

nodes = iter(enumerate(pack.config["system"]["nodes"]))

state_prefix = []
for p in packs.values():
state_prefix.append(p.config["name"])
state_prefix.append(p.config["install_variant"])

while True:
try:
i, n = next(nodes)
Expand All @@ -41,8 +46,10 @@ def manage_cloud(pack, packs, run_on, action="setup"):
except StopIteration:
break

plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), *state_prefix])
plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), run_on])
plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"]
if i > 0:
plan_params["reuse_resource_group"] = None

import milabench.cli.covalent as cv

Expand All @@ -59,16 +66,9 @@ def manage_cloud(pack, packs, run_on, action="setup"):
cmd = [
sys.executable,
"-m", cv.__name__,
run_on,
run_on.split("__")[0],
f"--{action}",
*list(
sum(
(
(f"--{k.replace('_', '-')}", v)
for k, v in plan_params.items()
), ()
)
)
*_flatten_cli_args(**plan_params)
]
p = subprocess.Popen(
cmd,
Expand Down Expand Up @@ -121,7 +121,8 @@ def _setup():

mp = get_multipack()
setup_pack = mp.setup_pack()
system_config = manage_cloud(setup_pack, mp.packs, run_on, action=_SETUP)
system_config = manage_cloud(setup_pack, run_on, action=_SETUP)
del system_config["arch"]

print(f"# hash::>{setup_pack.config['hash']}")
print(yaml.dump({"system": system_config}))
Expand All @@ -131,12 +132,24 @@ def _setup():
def _teardown():
"""Teardown a cloud infrastructure"""

# Setup cloud on target infra
# Teardown cloud instance on target infra
run_on: Option & str

mp = get_multipack()
# Teardown all cloud instances
all: Option & bool = False

overrides = {}
if all:
overrides = {
"*": OmegaConf.to_object(OmegaConf.from_dotlist([
f"system.cloud_profiles.{run_on}.state_prefix='*'",
f"system.cloud_profiles.{run_on}.state_id='*'",
]))
}

mp = get_multipack(overrides=overrides)
setup_pack = mp.setup_pack()
manage_cloud(setup_pack, mp.packs, run_on, action=_TEARDOWN)
manage_cloud(setup_pack, run_on, action=_TEARDOWN)


@tooled
Expand Down
20 changes: 3 additions & 17 deletions milabench/cli/covalent/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,29 +119,16 @@ def lattice(argv=(), deps_bash = None):
deps_bash = None

if not argv and args.setup:
conda_prefix = "eval \"$(conda shell.bash hook)\""
conda_activate = "conda activate milabench"
deps_bash = []
for _cmd in (
f"{conda_activate} || conda create -n milabench -y",
f"{conda_activate}"
f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
f" || >&2 echo First attempt to install python in milabench env failed",
f"{conda_activate}"
f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
f" || conda remove -n milabench --all -y",
):
deps_bash.append(f"{conda_prefix} && ({_cmd})")
deps_bash = ct.DepsBash(deps_bash)
argv = ["conda", "env", "list"]
deps_bash = ct.DepsBash([])
# Make sure pip is installed
argv = ["python3", "-m", "pip", "freeze"]

if argv:
dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash)
result = ct.get_result(dispatch_id=dispatch_id, wait=True)
return_code, stdout, _ = result.result if result.result is not None else (1, "", "")

if return_code == 0 and args.setup:
assert any([l for l in stdout.split("\n") if l.startswith("milabench ")])
_executor:ct.executor.BaseExecutor = executor_cls(
**{
**_get_executor_kwargs(args),
Expand All @@ -154,7 +141,6 @@ def lattice(argv=(), deps_bash = None):
print(f"hostname::>{_executor.hostname}")
print(f"username::>{_executor.username}")
print(f"ssh_key_file::>{_executor.ssh_key_file}")
print(f"env::>{_executor.env}")
finally:
result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
results_dir = result.results_dir if result else ""
Expand Down
8 changes: 6 additions & 2 deletions milabench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ def resolve_addresses(nodes):
is_local = (
("127.0.0.1" in ipaddrlist)
or (hostname in ("localhost", socket.gethostname()))
# Temporary workaround until Azure networking allows associating the
# local hostname (`hostname.split(".")[0]`) with the public FQDN
# (`hostname.split(".")[0].*.cloudapp.azure.com`)
or (hostname.split(".")[0] == socket.gethostname())
or len(ip_list.intersection(ipaddrlist)) > 0
)
node["local"] = is_local
Expand Down Expand Up @@ -227,9 +231,9 @@ def build_system_config(config_file, defaults=None, gpu=True):
config = yaml.safe_load(cf)

if defaults:
config = merge(defaults, config)
config["system"] = merge(defaults["system"], config["system"])

system = config.get("system", {})
system = config["system"]

# capacity is only required if batch resizer is enabled
if (gpu or is_autoscale_enabled()) and not "gpu" not in system:
Expand Down
29 changes: 4 additions & 25 deletions milabench/remote.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
import os
import sys

Expand Down Expand Up @@ -78,7 +79,7 @@ def pip_install_milabench(pack, node, folder) -> SSHCommand:
host = node["ip"]
user = node["user"]

cmd = ["pip", "install", "-e", folder]
cmd = ["python3", "-m", "pip", "install", "-e", folder]
plan = CmdCommand(pack, *cmd)
return SSHCommand(plan, host=host, user=user)

Expand Down Expand Up @@ -184,8 +185,9 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand:
CmdCommand(
worker_pack(pack, worker),
"cd", f"{INSTALL_FOLDER}", "&&",
f"MILABENCH_CONFIG={pack.config['config_file']}",
f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}",
f"MILABENCH_CONFIG={os.environ.get('MILABENCH_CONFIG', '')}",
f"MILABENCH_SYSTEM={os.environ.get('MILABENCH_SYSTEM', '')}",
"milabench", *command
),
host=host,
Expand Down Expand Up @@ -232,16 +234,6 @@ def _sanity(pack, setup_for):


def milabench_remote_config(pack, packs):
config = {}
config_hash = pack.config["hash"]
config_file = XPath(pack.config["config_file"])
config_file = config_file.with_name(f"{config_file.name}.{config_hash}")
pack.config["config_file"] = str(config_file)
for p in packs.values():
config[p.config["name"]] = p.config
p.config["config_file"] = str(config_file)
config_file.write_text(yaml.dump(config))

for n in pack.config["system"]["nodes"]:
_cmds = [
SSHCommand(
Expand All @@ -253,18 +245,6 @@ def milabench_remote_config(pack, packs):
),
n["ip"],
),
SSHCommand(
CmdCommand(
pack,
"mkdir", "-p", str(config_file.parent),
),
n["ip"],
),
SCPCommand(
pack,
n["ip"],
str(config_file),
),
]

yield SequenceCommand(*_cmds)
Expand All @@ -280,7 +260,6 @@ def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand:
argv = sys.argv[2:]
return SequenceCommand(
milabench_remote_setup_plan(pack, setup_for),
milabench_remote_command(pack, "pin", *argv, run_for=setup_for),
milabench_remote_command(pack, "install", *argv, run_for=setup_for),
)

Expand Down

0 comments on commit 074c588

Please sign in to comment.