From 53a99356ca664765ea836a0c7ec756a7ca8017e8 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 27 Feb 2024 09:57:14 -0500 Subject: [PATCH 01/22] Fix pip-tools https://github.com/jazzband/pip-tools/pull/1906 --- poetry.lock | 61 ++++++++++++++++++++++++++++---------------------- pyproject.toml | 2 +- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/poetry.lock b/poetry.lock index 942bff025..d03ffe232 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "alabaster" @@ -172,25 +172,26 @@ wcwidth = ">=0.1.4" [[package]] name = "build" -version = "0.10.0" +version = "1.1.1" description = "A simple, correct Python build frontend" optional = false python-versions = ">= 3.7" files = [ - {file = "build-0.10.0-py3-none-any.whl", hash = "sha256:af266720050a66c893a6096a2f410989eeac74ff9a68ba194b3f6473e8e26171"}, - {file = "build-0.10.0.tar.gz", hash = "sha256:d5b71264afdb5951d6704482aac78de887c80691c52b88a9ad195983ca2c9269"}, + {file = "build-1.1.1-py3-none-any.whl", hash = "sha256:8ed0851ee76e6e38adce47e4bee3b51c771d86c64cf578d0c2245567ee200e73"}, + {file = "build-1.1.1.tar.gz", hash = "sha256:8eea65bb45b1aac2e734ba2cc8dad3a6d97d97901a395bd0ed3e7b46953d2a31"}, ] [package.dependencies] colorama = {version = "*", markers = "os_name == \"nt\""} +importlib-metadata = {version = ">=4.6", markers = "python_full_version < \"3.10.2\""} packaging = ">=19.0" pyproject_hooks = "*" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} [package.extras] -docs = ["furo (>=2021.08.31)", "sphinx (>=4.0,<5.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)"] -test = ["filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "toml (>=0.10.0)", "wheel (>=0.36.0)"] -typing = ["importlib-metadata (>=5.1)", "mypy (==0.991)", "tomli", "typing-extensions (>=3.7.4.3)"] +docs = ["furo (>=2023.08.17)", "sphinx (>=7.0,<8.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)", "sphinx-issues (>=3.0.0)"] +test = ["filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "setuptools (>=56.0.0)", "setuptools (>=67.8.0)", "wheel (>=0.36.0)"] +typing = ["importlib-metadata (>=5.1)", "mypy (>=1.5.0,<1.6.0)", "tomli", "typing-extensions (>=3.7.4.3)"] virtualenv = ["virtualenv (>=20.0.35)"] [[package]] @@ -1082,25 +1083,27 @@ files = [ [[package]] name = "pip-tools" -version = "6.13.0" +version = "7.4.1" description = "pip-tools keeps your pinned dependencies fresh." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pip-tools-6.13.0.tar.gz", hash = "sha256:61d46bd2eb8016ed4a924e196e6e5b0a268cd3babd79e593048720db23522bb1"}, - {file = "pip_tools-6.13.0-py3-none-any.whl", hash = "sha256:50943f151d87e752abddec8158622c34ad7f292e193836e90e30d87da60b19d9"}, + {file = "pip-tools-7.4.1.tar.gz", hash = "sha256:864826f5073864450e24dbeeb85ce3920cdfb09848a3d69ebf537b521f14bcc9"}, + {file = "pip_tools-7.4.1-py3-none-any.whl", hash = "sha256:4c690e5fbae2f21e87843e89c26191f0d9454f362d8acdbd695716493ec8b3a9"}, ] [package.dependencies] -build = "*" +build = ">=1.0.0" click = ">=8" pip = ">=22.2" +pyproject_hooks = "*" setuptools = "*" +tomli = {version = "*", markers = "python_version < \"3.11\""} wheel = "*" [package.extras] -coverage = ["pytest-cov"] -testing = ["flit-core (>=2,<4)", "poetry-core (>=1.0.0)", "pytest (>=7.2.0)", "pytest-rerunfailures", "pytest-xdist"] +coverage = ["covdefaults", "pytest-cov"] +testing = ["flit_core (>=2,<4)", "poetry_core (>=1.0.0)", "pytest (>=7.2.0)", "pytest-rerunfailures", "pytest-xdist", "tomli-w"] [[package]] name = "platformdirs" @@ -2034,22 +2037,26 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "coverage-enable-subprocess [[package]] name = "voir" -version = "0.2.10" +version = "0.2.12" description = "Instrument, extend and visualize your programs" optional = false -python-versions = ">=3.7,<4.0" -files = [ - {file = "voir-0.2.10-py3-none-any.whl", hash = "sha256:70266f9cab487bb3b6f7aea90d724055f6a88824db37c326473f72cf40e93343"}, - {file = "voir-0.2.10.tar.gz", hash = "sha256:352425923d7186941036f7c9909c8bf3ad13d10b633abd6dd3697609b3b6673b"}, -] +python-versions = "^3.7" +files = [] +develop = false [package.dependencies] -giving = ">=0.4.2,<0.5.0" -omegaconf = ">=2.3.0,<3.0.0" -ovld = ">=0.3.2,<0.4.0" -ptera = ">=1.4.1,<2.0.0" -pynvml = ">=11.5.0,<12.0.0" -rich = ">=13.3.2,<14.0.0" +giving = "^0.4.2" +omegaconf = "^2.3.0" +ovld = "^0.3.2" +ptera = "^1.4.1" +pynvml = "^11.5.0" +rich = "^13.3.2" + +[package.source] +type = "git" +url = "https://github.com/breuleux/voir" +reference = "master" +resolved_reference = "01caa92f5bc49c696dea3090eaee7c8f97e85f4f" [[package]] name = "wcwidth" @@ -2094,4 +2101,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "0407b1f9e231b83ca25d848e4c21033a7016d5825c31a86ce075479b4b419fa8" +content-hash = "dab3de344c4805f5071dec0f4b2b866f6b9d6bd2f16b1ac8d9b2df20b1184494" diff --git a/pyproject.toml b/pyproject.toml index 773986140..956c802b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ pandas = "^1.4.2" numpy = ">=1.23.0" pynvml = "^11.4.1" tqdm = "^4.64.1" -pip-tools = "^6.12.3" +pip-tools = "^7.0.0" rich = "^13.3.2" omegaconf = "^2.3.0" sqlalchemy = "^2.0.15" From c3b9dbf1a011256cae78a9ecd10cfaa65bfdd39d Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 27 Feb 2024 09:57:30 -0500 Subject: [PATCH 02/22] Add covalent cloud aws ec2 infra and report --push covalent is not compatible with milabench as it requires sqlalchemy<2.0.0 --- .github/workflows/cloud-ci.yml | 109 ++++++++++ benchmarks/_template/requirements.cpu.txt | 46 +++++ config/examples/ec2-system.yaml | 19 ++ config/test.yaml | 24 +++ milabench/__init__.py | 5 + milabench/cli/__init__.py | 5 + milabench/cli/badges/__main__.py | 45 +++++ milabench/cli/badges/requirements.txt | 1 + milabench/cli/cloud.py | 153 ++++++++++++++ milabench/cli/covalent/__main__.py | 233 ++++++++++++++++++++++ milabench/cli/covalent/requirements.txt | 2 + milabench/cli/report.py | 43 +++- milabench/commands/__init__.py | 28 ++- milabench/common.py | 168 +++++++++++++++- milabench/config.py | 14 ++ milabench/multi.py | 25 +++ milabench/remote.py | 118 ++++++++++- 17 files changed, 1011 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/cloud-ci.yml create mode 100644 benchmarks/_template/requirements.cpu.txt create mode 100644 config/examples/ec2-system.yaml create mode 100644 config/test.yaml create mode 100644 milabench/cli/badges/__main__.py create mode 100644 milabench/cli/badges/requirements.txt create mode 100644 milabench/cli/cloud.py create mode 100644 milabench/cli/covalent/__main__.py create mode 100644 milabench/cli/covalent/requirements.txt diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml new file mode 100644 index 000000000..ab1211a9f --- /dev/null +++ b/.github/workflows/cloud-ci.yml @@ -0,0 +1,109 @@ +name: tests + +on: + # Runs for pull requests + pull_request: + branches: + - master + +jobs: + tests: + strategy: + fail-fast: true + matrix: + include: + - arch: cuda + exclude: "no-cuda" + run_on: ec2 + # - arch: rocm + # exclude : "no-rocm" + + runs-on: ubuntu-latest + environment: test-cloud-ci + + # Cancel previous jobs if a new version was pushed + concurrency: + group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}" + cancel-in-progress: true + + defaults: + run: + shell: bash -el {0} + + env: + MILABENCH_CONFIG: "config/test.yaml" + MILABENCH_SYSTEM: "config/examples/${{ matrix.run_on }}-system.yaml" + MILABENCH_BASE: "output" + MILABENCH_ARGS: "" + MILABENCH_GPU_ARCH: "${{ matrix.arch }}" + MILABENCH_DASH: "no" + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPORTS_PAT }} + + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: dependencies + run: | + python -m pip install -U pip + python -m pip install -U poetry + poetry lock --no-update + poetry install + + - name: setup cloud credentials + run: | + mkdir -p ~/.aws + mkdir -p ~/.ssh/covalent + echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem + echo "[default]" >~/.aws/credentials + echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials + echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials + chmod -R a-rwx,u+rwX ~/.aws ~/.ssh + + - name: setup cloud + run: | + _system=$( + poetry run milabench cloud \ + --setup \ + --run-on ${{ matrix.run_on }} + ) + { read _hash ; }< <( + echo -n "$_system" | while read l + do + if [[ "$l" == "# hash::>"* ]] + then + echo -n "${l#*::>}" + fi + done + echo + ) + if [[ -z "${_hash}" ]] + then + >&2 echo "Failed to fetch system config hash" + exit 1 + fi + echo -n "$_system" >$MILABENCH_SYSTEM.$_hash + echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$_hash" >>$GITHUB_ENV + + - name: install benchmarks + run: | + poetry run milabench install + + - name: prepare benchmarks + run: | + poetry run milabench prepare + + - name: run benchmarks + run: | + poetry run milabench run + + - name: Summary + run: | + git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)" + git config --global user.email "github-ci@example.com" + git config --global user.name "GitHub CI" + poetry run milabench report --push diff --git a/benchmarks/_template/requirements.cpu.txt b/benchmarks/_template/requirements.cpu.txt new file mode 100644 index 000000000..e0058b822 --- /dev/null +++ b/benchmarks/_template/requirements.cpu.txt @@ -0,0 +1,46 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=benchmarks/_template/requirements.cpu.txt benchmarks/_template/requirements.in +# +antlr4-python3-runtime==4.9.3 + # via omegaconf +asttokens==2.4.1 + # via giving +codefind==0.1.3 + # via ptera +executing==1.2.0 + # via varname +giving==0.4.2 + # via + # ptera + # voir +markdown-it-py==3.0.0 + # via rich +mdurl==0.1.2 + # via markdown-it-py +omegaconf==2.3.0 + # via voir +ovld==0.3.2 + # via voir +ptera==1.4.1 + # via voir +pygments==2.17.2 + # via rich +pynvml==11.5.0 + # via voir +pyyaml==6.0.1 + # via omegaconf +reactivex==4.0.4 + # via giving +rich==13.7.0 + # via voir +six==1.16.0 + # via asttokens +typing-extensions==4.10.0 + # via reactivex +varname==0.10.0 + # via giving +voir==0.2.12 + # via -r benchmarks/_template/requirements.in diff --git a/config/examples/ec2-system.yaml b/config/examples/ec2-system.yaml new file mode 100644 index 000000000..dab1a7a4e --- /dev/null +++ b/config/examples/ec2-system.yaml @@ -0,0 +1,19 @@ +system: + # Nodes list + nodes: + # Alias used to reference the node + - name: manager + # Use 1.1.1.1 as an ip placeholder + ip: 1.1.1.1 + # Use this node as the master node or not + main: true + # User to use in remote milabench operations + user: user + + # Cloud instances profiles + cloud_profiles: + ec2: + username: ubuntu + instance_type: t2.micro + volume_size: 8 + region: us-east-2 diff --git a/config/test.yaml b/config/test.yaml new file mode 100644 index 000000000..060949e40 --- /dev/null +++ b/config/test.yaml @@ -0,0 +1,24 @@ +_defaults: + max_duration: 600 + voir: + options: + stop: 60 + interval: "1s" + +test: + inherits: _defaults + group: test_remote + install_group: test_remote + definition: ../benchmarks/_template + plan: + method: njobs + n: 1 + +testing: + inherits: _defaults + definition: ../benchmarks/_template + group: test_remote_2 + install_group: test_remote_2 + plan: + method: njobs + n: 1 diff --git a/milabench/__init__.py b/milabench/__init__.py index e69de29bb..ac33e6bb3 100644 --- a/milabench/__init__.py +++ b/milabench/__init__.py @@ -0,0 +1,5 @@ +import pathlib + +ROOT_FOLDER = pathlib.Path(__file__).resolve().parent.parent +CONFIG_FOLDER = ROOT_FOLDER / "config" +BENCHMARK_FOLDER = ROOT_FOLDER / "benchmarks" diff --git a/milabench/cli/__init__.py b/milabench/cli/__init__.py index f0eea8d1e..401179944 100644 --- a/milabench/cli/__init__.py +++ b/milabench/cli/__init__.py @@ -3,6 +3,7 @@ from coleo import run_cli +from .cloud import cli_cloud from .compare import cli_compare from .dev import cli_dev from .install import cli_install @@ -37,6 +38,10 @@ def pin(): """Pin the benchmarks' dependencies.""" cli_pin() + def cloud(): + """Setup cloud instances.""" + cli_cloud() + def dev(): """Create a shell in a benchmark's environment for development.""" cli_dev() diff --git a/milabench/cli/badges/__main__.py b/milabench/cli/badges/__main__.py new file mode 100644 index 000000000..027a59a4b --- /dev/null +++ b/milabench/cli/badges/__main__.py @@ -0,0 +1,45 @@ +import pathlib +import subprocess +import sys + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + try: + import pybadges as _ + except ImportError: + module = pathlib.Path(__file__).resolve().parent + cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv") + python3 = str(cache_dir / "bin/python3") + check_module = "import pybadges" + try: + subprocess.run([python3, "-c", check_module], check=True) + except (FileNotFoundError, subprocess.CalledProcessError): + cache_dir.mkdir(parents=True, exist_ok=True) + subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True) + subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True) + subprocess.run([ + python3, + "-m", + "pip", + "install", + "-r", + str(module / "requirements.txt") + ], check=True) + subprocess.run([python3, "-c", check_module], check=True) + return subprocess.call( + [python3, __file__, *argv], + ) + + return subprocess.run([ + sys.executable, + "-m", + "pybadges", + *argv + ], check=True).returncode + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/milabench/cli/badges/requirements.txt b/milabench/cli/badges/requirements.txt new file mode 100644 index 000000000..26620981a --- /dev/null +++ b/milabench/cli/badges/requirements.txt @@ -0,0 +1 @@ +pybadges \ No newline at end of file diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py new file mode 100644 index 000000000..c0f9c9bcb --- /dev/null +++ b/milabench/cli/cloud.py @@ -0,0 +1,153 @@ +from copy import deepcopy +import os +import subprocess +import sys + +from coleo import Option, tooled +import yaml + +# import milabench as mb +from ..common import get_multipack + + +_SETUP = "setup" +_TEARDOWN = "teardown" +_LIST = "list" +_ACTIONS = (_SETUP, _TEARDOWN, _LIST) + + +def manage_cloud(pack, packs, run_on, action="setup"): + assert run_on in pack.config["system"]["cloud_profiles"] + + key_map = { + "hostname":(lambda v: ("ip",v)), + "username":(lambda v: ("user",v)), + "ssh_key_file":(lambda v: ("key",v)), + "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])), + } + plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on]) + + nodes = iter(enumerate(pack.config["system"]["nodes"])) + + state_prefix = [] + for p in packs.values(): + state_prefix.append(p.config["name"]) + state_prefix.append(p.config["install_variant"]) + + while True: + try: + i, n = next(nodes) + if n["ip"] != "1.1.1.1": + continue + except StopIteration: + break + + plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), *state_prefix]) + plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"] + + import milabench.cli.covalent as cv + + subprocess.run( + [ + sys.executable, + "-m", cv.__name__, + "serve", "start" + ] + , stdout=sys.stderr + , check=True + ) + + cmd = [ + sys.executable, + "-m", cv.__name__, + run_on, + f"--{action}", + *[ + f"--{k.replace('_', '-')}={v}" + for k, v in plan_params.items() + ], + ] + p = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + stdout_chunks = [] + while True: + line = p.stdout.readline() + if not line: + break + line_str = line.decode("utf-8").strip() + stdout_chunks.append(line_str) + print(line_str, file=sys.stderr) + + if not line_str: + continue + try: + k, v = line_str.split("::>") + k, v = key_map[k](v) + if k == "ip" and n[k] != "1.1.1.1": + i, n = next(nodes) + n[k] = v + except ValueError: + pass + + _, stderr = p.communicate() + stderr = stderr.decode("utf-8").strip() + print(stderr, file=sys.stderr) + + if p.returncode != 0: + stdout = os.linesep.join(stdout_chunks) + raise subprocess.CalledProcessError( + p.returncode, + cmd, + stdout, + stderr + ) + + return pack.config["system"] + + +@tooled +def _setup(): + """Setup a cloud infrastructure""" + + # Setup cloud on target infra + run_on: Option & str + + mp = get_multipack() + setup_pack = mp.setup_pack() + system_config = manage_cloud(setup_pack, mp.packs, run_on, action=_SETUP) + + print(f"# hash::>{setup_pack.config['hash']}") + print(yaml.dump({"system": system_config})) + + +@tooled +def _teardown(): + """Teardown a cloud infrastructure""" + + # Setup cloud on target infra + run_on: Option & str + + mp = get_multipack() + setup_pack = mp.setup_pack() + manage_cloud(setup_pack, mp.packs, run_on, action=_TEARDOWN) + + +@tooled +def cli_cloud(): + """Manage cloud instances.""" + + # Setup a cloud infrastructure + setup: Option & bool = False + # Teardown a cloud infrastructure + teardown: Option & bool = False + + assert any((setup, teardown)) and not all((setup, teardown)) + + if setup: + _setup() + elif teardown: + _teardown() diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py new file mode 100644 index 000000000..cf5ff8537 --- /dev/null +++ b/milabench/cli/covalent/__main__.py @@ -0,0 +1,233 @@ +import argparse +import asyncio +import os +import pathlib +import subprocess +import sys +import tempfile + + +def serve(*argv): + return subprocess.run([ + str(pathlib.Path(sys.executable).with_name("covalent")), + *argv + ]).returncode + + +def _get_executor_kwargs(args): + return { + **{k:v for k,v in vars(args).items() if k not in ("setup", "teardown")}, + **{"action":k for k,v in vars(args).items() if k in ("setup", "teardown") and v}, + } + + +def executor(executor_cls, args, *argv): + import covalent as ct + + executor:ct.executor.BaseExecutor = executor_cls( + **_get_executor_kwargs(args), + ) + + def _popen(cmd, *args, _env=None, **kwargs): + _env = _env if _env is not None else {} + + for envvar in _env.keys(): + envvar_val = _env[envvar] + + if not envvar_val: + continue + + envvar_val = pathlib.Path(envvar_val).expanduser() + if str(envvar_val) != _env[envvar]: + _env[envvar] = str(envvar_val) + + if "MILABENCH_CONFIG_CONTENT" in _env: + _config_dir = pathlib.Path(_env["MILABENCH_CONFIG"]).parent + with tempfile.NamedTemporaryFile("wt", dir=str(_config_dir), suffix=".yaml", delete=False) as _f: + _f.write(_env["MILABENCH_CONFIG_CONTENT"]) + _env["MILABENCH_CONFIG"] = _f.name + + try: + cmd = (str(pathlib.Path(cmd[0]).expanduser()), *cmd[1:]) + except IndexError: + pass + + cwd = kwargs.pop("cwd", None) + if cwd is not None: + cwd = str(pathlib.Path(cwd).expanduser()) + kwargs["cwd"] = cwd + + _env = {**os.environ.copy(), **kwargs.pop("env", {}), **_env} + + kwargs = { + **kwargs, + "env": _env, + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + } + p = subprocess.Popen(cmd, *args, **kwargs) + + stdout_chunks = [] + while True: + line = p.stdout.readline() + if not line: + break + line_str = line.decode("utf-8").strip() + stdout_chunks.append(line_str) + print(line_str) + + _, stderr = p.communicate() + stderr = stderr.decode("utf-8").strip() + stdout = os.linesep.join(stdout_chunks) + + if p.returncode != 0: + raise subprocess.CalledProcessError( + p.returncode, + (cmd, args, kwargs), + stdout, + stderr + ) + return p.returncode, stdout, stderr + + @ct.lattice + def lattice(argv=(), deps_bash = None): + return ct.electron( + _popen, + executor=executor, + deps_bash=deps_bash, + )( + argv, + ) + + return_code = 0 + try: + dispatch_id = None + result = None + deps_bash = None + + if not argv and args.setup: + conda_prefix = "eval \"$(conda shell.bash hook)\"" + conda_activate = "conda activate milabench" + deps_bash = [] + for _cmd in ( + f"{conda_activate} || conda create -n milabench -y", + f"{conda_activate}" + f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y" + f" || >&2 echo First attempt to install python in milabench env failed", + f"{conda_activate}" + f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y" + f" || conda remove -n milabench --all -y", + ): + deps_bash.append(f"{conda_prefix} && ({_cmd})") + deps_bash = ct.DepsBash(deps_bash) + argv = ["conda", "env", "list"] + + if argv: + dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash) + result = ct.get_result(dispatch_id=dispatch_id, wait=True) + return_code, stdout, _ = result.result if result.result is not None else (1, "", "") + + if return_code == 0 and args.setup: + assert any([l for l in stdout.split("\n") if l.startswith("milabench ")]) + _executor:ct.executor.BaseExecutor = executor_cls( + **{ + **_get_executor_kwargs(args), + **{"action": "teardown"}, + } + ) + asyncio.run(_executor.setup({})) + + assert _executor.hostname + print(f"hostname::>{_executor.hostname}") + print(f"username::>{_executor.username}") + print(f"ssh_key_file::>{_executor.ssh_key_file}") + print(f"env::>~/.condaenvrc") + finally: + result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None + results_dir = result.results_dir if result else "" + if args.teardown: + try: + _executor:ct.executor.BaseExecutor = executor_cls( + **{ + **_get_executor_kwargs(args), + **{"action": "teardown"}, + } + ) + asyncio.run(_executor.setup({})) + asyncio.run( + _executor.teardown( + {"dispatch_id": dispatch_id, "node_id": 0, "results_dir": results_dir} + ) + ) + except FileNotFoundError: + pass + + return return_code + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + try: + import covalent as ct + ct.get_config(f"executors.ec2") + except (KeyError, ImportError): + module = pathlib.Path(__file__).resolve().parent + cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv") + python3 = str(cache_dir / "bin/python3") + check_module = "import covalent ; from covalent.executor import EC2Executor" + try: + subprocess.run([python3, "-c", check_module], check=True) + except (FileNotFoundError, subprocess.CalledProcessError): + cache_dir.mkdir(parents=True, exist_ok=True) + subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True) + subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True) + subprocess.run([ + python3, + "-m", + "pip", + "install", + "-r", + str(module / "requirements.txt") + ], check=True) + subprocess.run([python3, "-c", check_module], check=True) + return subprocess.call( + [python3, __file__, *argv], + ) + + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers() + subparser = subparsers.add_parser("serve") + subparser.add_argument(f"argv", nargs=argparse.REMAINDER) + for p in ("ec2",): + try: + config = ct.get_config(f"executors.{p}") + except KeyError: + continue + subparser = subparsers.add_parser(p) + subparser.add_argument(f"--setup", action="store_true") + subparser.add_argument(f"--teardown", action="store_true") + for param, default in config.items(): + if param == "action": + continue + subparser.add_argument(f"--{param.replace('_', '-')}", default=default) + + try: + cv_argv, argv = argv[:argv.index("--")], argv[argv.index("--")+1:] + except ValueError: + cv_argv, argv = argv, [] + + args = parser.parse_args(cv_argv) + + if cv_argv[0] == "serve": + assert not argv + return serve(*args.argv) + elif cv_argv[0] == "ec2": + return executor(ct.executor.EC2Executor, args, *argv) + else: + raise + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt new file mode 100644 index 000000000..f810e6eaf --- /dev/null +++ b/milabench/cli/covalent/requirements.txt @@ -0,0 +1,2 @@ +covalent +covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench \ No newline at end of file diff --git a/milabench/cli/report.py b/milabench/cli/report.py index cbad44223..b14b49528 100644 --- a/milabench/cli/report.py +++ b/milabench/cli/report.py @@ -1,10 +1,12 @@ +import glob import os import sys from dataclasses import dataclass, field from coleo import Option, config as configuration, tooled -from ..common import Option, _error_report, _get_multipack, _read_reports +from ..common import Option, _error_report, _get_multipack, _push_reports, _read_reports +from ..fs import XPath from ..report import make_report from ..summary import make_summary @@ -12,12 +14,13 @@ # fmt: off @dataclass class Arguments: - runs: list = field(default_factory=list) + runs : list = field(default_factory=list) config : str = os.environ.get("MILABENCH_CONFIG", None) compare : str = None compare_gpus: bool = False html : str = None price : int = None + push : bool = False # fmt: on @@ -42,7 +45,10 @@ def arguments(): # Price per unit price: Option & int = None - return Arguments(runs, config, compare, compare_gpus, html, price) + # Push reports to repo + push: Option & bool = False + + return Arguments(runs, config, compare, compare_gpus, html, price, push) @tooled @@ -68,11 +74,6 @@ def cli_report(args=None): # ------ # 1 errors, details in HTML report. - reports = None - if args.runs: - reports = _read_reports(*args.runs) - summary = make_summary(reports.values()) - if args.config: from milabench.common import arguments as multipack_args @@ -81,6 +82,25 @@ def cli_report(args=None): args.config = _get_multipack(margs, return_config=True) + assert args.config if args.push else None + + if not args.runs and args.config: + run_dirs = {XPath(pack_config["dirs"]["runs"]) for pack_config in args.config.values()} + filter = lambda _p: not any([XPath(_p).name.startswith(f"{prefix}.") for prefix in ("install", "prepare")]) + args.runs = sorted( + {_r + for _rd in run_dirs + for _r in glob.glob(str(_rd / "*.*.*/")) + if filter(_r) + }, + key=lambda _p: XPath(_p).name.split(".")[-2:] + ) + + reports = None + if args.runs: + reports = _read_reports(*args.runs) + summary = make_summary(reports.values()) + make_report( summary, compare=args.compare, @@ -93,3 +113,10 @@ def cli_report(args=None): errdata=reports and _error_report(reports), stream=sys.stdout, ) + + if len(reports) and args.push: + reports_repo = next(iter( + XPath(pack_config["dirs"]["base"]) / "reports" + for pack_config in args.config.values() + )) + _push_reports(reports_repo, args.runs, summary) diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py index 30ce3ffa8..00284208d 100644 --- a/milabench/commands/__init__.py +++ b/milabench/commands/__init__.py @@ -399,6 +399,11 @@ def is_local(self): == localnode["hostname"] # The hostname is the local node ) + def _load_env(self, node): + if node.get("env", None): + return node["env"] + return [] + def _argv(self, **kwargs) -> List: # No-op when executing on a local node if self.is_local(): @@ -410,13 +415,14 @@ def _argv(self, **kwargs) -> List: host = f"{user}@{self.host}" if user else self.host argv = super()._argv(**kwargs) - argv.extend(["-oPasswordAuthentication=no"]) - argv.extend(["-p", str(self.port)]) - if key: - argv.append(f"-i{key}") + # scp apparently needs `-i` to be first + argv.insert(1, f"-i{key}") + argv.append(f"-p{self.port}") argv.append(host) + argv.extend(self._load_env(node)) + return argv @@ -427,21 +433,27 @@ def __init__( self, pack: pack.BasePackage, host: str, - directory: str, + src: str, *scp_argv, + dest: str = None, user: str = None, key: str = None, **kwargs, ) -> None: super().__init__(pack, host, "-r", *scp_argv, user=user, key=key, **kwargs) - self.dir = directory + self.src = src + self.dest = dest if dest is not None else self.src + + def _load_env(self, node): + del node + return [] def _argv(self, **kwargs) -> List: argv = super()._argv(**kwargs) host = argv.pop() - argv.append(self.dir) - argv.append(f"{host}:{self.dir}") + argv.append(self.src) + argv.append(f"{host}:{self.dest}") return argv diff --git a/milabench/common.py b/milabench/common.py index 35f9cf125..01a8976a0 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -1,16 +1,21 @@ +from copy import deepcopy import io import json import os import re import runpy +import subprocess import sys import traceback from dataclasses import dataclass, field from datetime import datetime from coleo import Option, default, tooled +import git from omegaconf import OmegaConf from voir.instruments.gpu import deduce_backend, select_backend +import yaml +from milabench import ROOT_FOLDER from milabench.alt_async import proceed from milabench.utils import available_layers, blabla, multilogger @@ -194,6 +199,13 @@ def _get_multipack( if args.config is None: sys.exit("Error: CONFIG argument not provided and no $MILABENCH_CONFIG") + if args.system is None: + args.system = os.environ.get("MILABENCH_SYSTEM", None) + + if args.system is None: + if XPath(f"{args.config}.system").exists(): + args.system = f"{args.config}.system" + if args.select: args.select = set(args.select.split(",")) @@ -255,7 +267,7 @@ def is_selected(defn): return selected_config else: return MultiPackage( - {name: get_pack(defn) for name, defn in selected_config.items()} + {name: get_pack(deepcopy(defn)) for name, defn in selected_config.items()} ) @@ -296,6 +308,160 @@ def _read_reports(*runs): return all_data +def _find_metas(reports): + local_meta = next(iter(e for _r in reports for e in _r if e["event"] == "meta"), None) + if local_meta: + local_meta = local_meta["data"] + remote_metas = [] + for _r in reports: + meta_lines = [] + for event in _r: + _, event_type, line = None, "", [] + + try: + _, event_type, *line = event["data"].split(" ") + except (AttributeError, ValueError): + pass + + if event_type[:1] + event_type[-1:] != "[]": + event_type = None + line = event["data"] + else: + line = " ".join(line) + + if event_type == "[meta]": + meta_lines.append(line) + elif event_type is None and meta_lines: + meta_lines.append(line) + elif meta_lines: + remote_metas.append(yaml.safe_load("".join(meta_lines))) + meta_lines = [] + + return local_meta, remote_metas + + +def _filter_reports(*reports): + all_reports = [] + + for report in reports: + config = next(iter(e for e in report if e["event"] == "config"), None) + if config is None: + continue + + if config["data"]["name"] != "remote": + all_reports.append(report) + + return all_reports + + +def _push_reports(reports_repo, runs, packs:dict=None): + _SVG_COLORS = { + "pass": "blue", + "partial": "yellow", + "failure": "red", + } + import milabench.cli.badges as badges + + _repo = git.repo.base.Repo(ROOT_FOLDER) + try: + reports_repo = git.repo.base.Repo(str(reports_repo)) + except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError): + repo_url = next(iter(_r.url for _r in _repo.remotes if _r.name == "origin"), None) + reports_repo = git.repo.base.Repo.clone_from(repo_url, str(reports_repo), branch="reports") + + reports_url = ([ + _r.url for _r in _repo.remotes if "mila-iqia" in _r.url + ] or [ + _r.url for _r in _repo.remotes if _r.name == "origin" + ])[0] + reports_url = XPath("github.com".join(reports_url.split("github.com")[1:])[1:]) + reports_url = XPath("https://github.com") / f"{reports_url.with_suffix('')}/tree/{reports_repo.active_branch.name}" + + device_reports = {} + for run in runs: + reports = list(_read_reports(run).values()) + reports = _filter_reports(*reports) + + if not reports: + continue + + meta = [e["data"] for _r in reports for e in _r if e["event"] == "meta"] + + for _meta in meta: + for gpu in _meta["accelerators"]["gpus"].values(): + device = gpu["product"].replace(" ", "_") + break + else: + for _meta in meta: + device = _meta["cpu"]["brand"].replace(" ", "_") + break + + tag = ([ + t.name + for t in _repo.tags + if meta[0]["milabench"]["tag"].startswith(t.name) + ] or [meta[0]["milabench"]["tag"]])[0] + reports_dir = XPath(reports_repo.working_tree_dir) / tag + + run = XPath(run) + try: + run.copy(reports_dir / device / run.name) + except FileExistsError: + pass + + device_reports.setdefault((device, tag), set()) + device_reports[(device, tag)].update( + (reports_dir / device).glob("*/") + ) + + for (device, tag), reports in device_reports.items(): + reports_dir = XPath(reports_repo.working_tree_dir) / tag + reports = _read_reports(*reports) + reports = _filter_reports(*reports.values()) + summary = make_summary(reports) + + successes = [s["successes"] for s in summary.values()] + failures = [s["failures"] for s in summary.values()] + + if sum(successes) == 0: + text = "failure" + elif any(failures): + text = "partial" + else: + text = "pass" + + result = subprocess.run( + [ + sys.executable, + "-m", badges.__name__, + "--left-text", device, + "--right-text", text, + "--right-color", _SVG_COLORS[text], + "--whole-link", str(reports_url / tag / device) + ], + capture_output=True + ) + if result.returncode == 0: + (reports_dir / device / "badge.svg").write_text(result.stdout.decode("utf8")) + + with open(str(reports_dir / device / "README.md"), "wt") as _f: + _f.write("```\n") + make_report(summary, stream=_f) + _f.write("```\n") + + for cmd, _kwargs in ( + (["git", "pull"], {"check": True}), + (["git", "add", tag], {"check": True}), + (["git", "commit", "-m", tag], {"check": False}), + (["git", "push"], {"check": True}) + ): + subprocess.run( + cmd, + cwd=reports_repo.working_tree_dir, + **_kwargs + ) + + def _error_report(reports): out = {} for r, data in reports.items(): diff --git a/milabench/config.py b/milabench/config.py index bfee806e7..e276cb17c 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -1,6 +1,8 @@ import contextvars +import hashlib import os import socket +from copy import deepcopy import psutil import yaml @@ -57,6 +59,16 @@ def resolve_inheritance(bench_config, all_configs): return bench_config +def compute_config_hash(config): + config = deepcopy(config) + for entry in config: + config[entry]["dirs"] = {} + config[entry]["config_base"] = "" + config[entry]["config_file"] = "" + config[entry]["run_name"] = "" + return hashlib.md5(str(config).encode("utf8")).hexdigest() + + def finalize_config(name, bench_config): bench_config["name"] = name if "definition" in bench_config: @@ -76,6 +88,8 @@ def build_config(*config_files): for layer in _config_layers(config_files): all_configs = merge(all_configs, layer) + all_configs["*"]["hash"] = compute_config_hash(all_configs) + for name, bench_config in all_configs.items(): all_configs[name] = resolve_inheritance(bench_config, all_configs) diff --git a/milabench/multi.py b/milabench/multi.py index 9946a3642..4a6cbd58a 100644 --- a/milabench/multi.py +++ b/milabench/multi.py @@ -13,6 +13,7 @@ is_main_local, is_multinode, is_remote, + milabench_remote_config, milabench_remote_install, milabench_remote_prepare, milabench_remote_run, @@ -84,7 +85,10 @@ def setup_pack(self) -> Package: "dirs": pack.config["dirs"], "config_base": pack.config["config_base"], "config_file": pack.config["config_file"], + "plan": pack.config["plan"], "system": pack.config["system"], + "hash": pack.config["hash"], + "install_variant": pack.config["install_variant"], } ) @@ -121,6 +125,13 @@ async def do_install(self): remote_task = None if is_remote(setup): + await asyncio.wait( + [ + asyncio.create_task(t.execute()) + for t in milabench_remote_config(setup, self.packs) + ] + ) + # We are outside system, setup the main node first remote_plan = milabench_remote_install(setup, setup_for="main") remote_task = asyncio.create_task(remote_plan.execute()) @@ -142,6 +153,13 @@ async def do_prepare(self): remote_task = None if is_remote(setup): + await asyncio.wait( + [ + asyncio.create_task(t.execute()) + for t in milabench_remote_config(setup, self.packs) + ] + ) + remote_plan = milabench_remote_prepare(setup, run_for="main") remote_task = asyncio.create_task(remote_plan.execute()) await asyncio.wait([remote_task]) @@ -158,6 +176,13 @@ async def do_run(self, repeat=1): setup = self.setup_pack() if is_remote(setup): + await asyncio.wait( + [ + asyncio.create_task(t.execute()) + for t in milabench_remote_config(setup, self.packs) + ] + ) + # if we are not on the main node right now # ssh to the main node and launch milabench remote_plan = milabench_remote_run(setup) diff --git a/milabench/remote.py b/milabench/remote.py index bf5963183..b1759f2fa 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -1,16 +1,22 @@ import os import sys +import yaml + +from milabench.fs import XPath + +from . import ROOT_FOLDER from .commands import ( CmdCommand, Command, ListCommand, + SCPCommand, SequenceCommand, SSHCommand, VoidCommand, ) -INSTALL_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +INSTALL_FOLDER = str(ROOT_FOLDER) def scp(node, folder, dest=None) -> list: @@ -30,21 +36,41 @@ def scp(node, folder, dest=None) -> list: ] -def rsync(node, folder, dest=None) -> list: +def rsync(node, src=None, remote_src=None, dest=None) -> list: """Copy a folder from local node to remote node""" host = node["ip"] user = node["user"] + key = node.get("key", None) + key = f"-i{key}" if key else "" + + if isinstance(src, str): + src = [src] + + assert not src or not remote_src + assert src or remote_src if dest is None: - dest = os.path.abspath(os.path.join(folder, "..")) + _ = remote_src if remote_src else src[0] + dest = os.path.abspath(os.path.join(_, "..")) + + if remote_src: + remote_src = [f"{user}@{host}:{remote_src}"] + src = [] + else: + dest = f"{user}@{host}:{dest}" + remote_src = [] return [ "rsync", + "--force", "-av", "-e", - "ssh -oCheckHostIP=no -oStrictHostKeyChecking=no", - folder, - f"{user}@{host}:{dest}", + f"ssh {key} -oCheckHostIP=no -oStrictHostKeyChecking=no", + "--include=*/.git/*", + *[f"--exclude=*/{_dir}/*" + for _dir in (".*", "venv", "env", "tmp")], + *src, *remote_src, + dest, ] @@ -84,9 +110,9 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: """ nodes = pack.config["system"]["nodes"] - copy = [] node_packs = [] + copy = [] for node in nodes: node_pack = None @@ -107,6 +133,30 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand: ) +def milabench_remote_fetch_reports_plan(pack, run_for="main") -> SequenceCommand: + """Copy milabench reports from remote + + Notes + ----- + Assume that the filesystem of remote node mirror local system. + """ + + nodes = pack.config["system"]["nodes"] + runs = pack.config["dirs"]["runs"] + + copy = [] + for node in nodes: + node_pack = None + + if should_run_for(node, run_for): + node_pack = worker_pack(pack, node) + copy.append(CmdCommand(node_pack, *rsync(node, remote_src=str(runs)))) + + return SequenceCommand( + ListCommand(*copy), + ) + + def worker_pack(pack, worker): if is_remote(pack): return pack.copy({}) @@ -131,7 +181,13 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand: cmds.append( SSHCommand( - CmdCommand(worker_pack(pack, worker), "milabench", *command), + CmdCommand( + worker_pack(pack, worker), + "cd", f"{INSTALL_FOLDER}", "&&", + f"MILABENCH_CONFIG={pack.config['config_file']}", + f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}", + "milabench", *command + ), host=host, user=user, key=key, @@ -175,6 +231,45 @@ def _sanity(pack, setup_for): assert is_remote(pack), "Only a remote node can setup the main node" +def milabench_remote_config(pack, packs): + config = {} + config_hash = pack.config["hash"] + config_file = XPath(pack.config["config_file"]) + config_file = config_file.with_name(f"{config_file.name}.{config_hash}") + pack.config["config_file"] = str(config_file) + for p in packs.values(): + config[p.config["name"]] = p.config + p.config["config_file"] = str(config_file) + config_file.write_text(yaml.dump(config)) + + for n in pack.config["system"]["nodes"]: + _cmds = [ + SSHCommand( + CmdCommand( + pack, + "(", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")", + "||", "(", "sudo", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], + "&&", "sudo", "chmod", "-R", "a+rwX", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")", + ), + n["ip"], + ), + SSHCommand( + CmdCommand( + pack, + "mkdir", "-p", str(config_file.parent), + ), + n["ip"], + ), + SCPCommand( + pack, + n["ip"], + str(config_file), + ), + ] + + yield SequenceCommand(*_cmds) + + def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand: """Copy milabench code, install milabench, execute milabench install""" _sanity(pack, setup_for) @@ -183,9 +278,9 @@ def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand: return VoidCommand(pack) argv = sys.argv[2:] - return SequenceCommand( milabench_remote_setup_plan(pack, setup_for), + milabench_remote_command(pack, "pin", *argv, run_for=setup_for), milabench_remote_command(pack, "install", *argv, run_for=setup_for), ) @@ -210,4 +305,7 @@ def milabench_remote_run(pack) -> Command: return VoidCommand(pack) argv = sys.argv[2:] - return milabench_remote_command(pack, "run", *argv) + return SequenceCommand( + milabench_remote_command(pack, "run", *argv, run_for="main"), + milabench_remote_fetch_reports_plan(pack, run_for="main"), + ) From aaa92e4c17369aa9b750df07eaa39ac4ffafd4f1 Mon Sep 17 00:00:00 2001 From: satyaog Date: Tue, 19 Mar 2024 09:31:39 -0400 Subject: [PATCH 03/22] Update .github/workflows/cloud-ci.yml --- .github/workflows/cloud-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index ab1211a9f..a215cb0a5 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -7,7 +7,7 @@ on: - master jobs: - tests: + could-tests: strategy: fail-fast: true matrix: From 44ebcea64b0783bd9c501715d1fb0617e71f8d4a Mon Sep 17 00:00:00 2001 From: satyaog Date: Tue, 19 Mar 2024 11:39:43 -0400 Subject: [PATCH 04/22] Apply suggestions from code review --- .github/workflows/cloud-ci.yml | 2 +- milabench/cli/badges/requirements.txt | 2 +- milabench/cli/covalent/requirements.txt | 2 +- milabench/common.py | 32 ------------------------- 4 files changed, 3 insertions(+), 35 deletions(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index a215cb0a5..0dcad45dc 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -19,7 +19,7 @@ jobs: # exclude : "no-rocm" runs-on: ubuntu-latest - environment: test-cloud-ci + environment: cloud-ci # Cancel previous jobs if a new version was pushed concurrency: diff --git a/milabench/cli/badges/requirements.txt b/milabench/cli/badges/requirements.txt index 26620981a..2c1953bd5 100644 --- a/milabench/cli/badges/requirements.txt +++ b/milabench/cli/badges/requirements.txt @@ -1 +1 @@ -pybadges \ No newline at end of file +pybadges diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt index f810e6eaf..158fa227c 100644 --- a/milabench/cli/covalent/requirements.txt +++ b/milabench/cli/covalent/requirements.txt @@ -1,2 +1,2 @@ covalent -covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench \ No newline at end of file +covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench diff --git a/milabench/common.py b/milabench/common.py index 01a8976a0..92bde8c4e 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -308,38 +308,6 @@ def _read_reports(*runs): return all_data -def _find_metas(reports): - local_meta = next(iter(e for _r in reports for e in _r if e["event"] == "meta"), None) - if local_meta: - local_meta = local_meta["data"] - remote_metas = [] - for _r in reports: - meta_lines = [] - for event in _r: - _, event_type, line = None, "", [] - - try: - _, event_type, *line = event["data"].split(" ") - except (AttributeError, ValueError): - pass - - if event_type[:1] + event_type[-1:] != "[]": - event_type = None - line = event["data"] - else: - line = " ".join(line) - - if event_type == "[meta]": - meta_lines.append(line) - elif event_type is None and meta_lines: - meta_lines.append(line) - elif meta_lines: - remote_metas.append(yaml.safe_load("".join(meta_lines))) - meta_lines = [] - - return local_meta, remote_metas - - def _filter_reports(*reports): all_reports = [] From a296410b0f9724ecd8ce65c9085577c24eb4e7d1 Mon Sep 17 00:00:00 2001 From: satyaog Date: Tue, 19 Mar 2024 14:39:23 -0400 Subject: [PATCH 05/22] Update .github/workflows/cloud-ci.yml --- .github/workflows/cloud-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 0dcad45dc..e27b9c14c 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -7,7 +7,7 @@ on: - master jobs: - could-tests: + cloud-tests: strategy: fail-fast: true matrix: From 7996e5646b2ae137f1d0d829c36f1ba678525530 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Fri, 22 Mar 2024 02:32:07 -0400 Subject: [PATCH 06/22] Add azure covalent cloud infra --- .github/workflows/cloud-ci.yml | 32 +++++++++++++++++-- .../{ec2-system.yaml => cloud-system.yaml} | 4 +++ milabench/cli/cloud.py | 13 +++++--- milabench/cli/covalent/__main__.py | 31 ++++++++++++++---- milabench/cli/covalent/requirements.txt | 1 + milabench/config.py | 2 +- milabench/remote.py | 2 +- 7 files changed, 69 insertions(+), 16 deletions(-) rename config/examples/{ec2-system.yaml => cloud-system.yaml} (83%) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index e27b9c14c..b73029626 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -6,6 +6,9 @@ on: branches: - master +permissions: + id-token: write + jobs: cloud-tests: strategy: @@ -14,7 +17,7 @@ jobs: include: - arch: cuda exclude: "no-cuda" - run_on: ec2 + run_on: azure # - arch: rocm # exclude : "no-rocm" @@ -32,11 +35,14 @@ jobs: env: MILABENCH_CONFIG: "config/test.yaml" - MILABENCH_SYSTEM: "config/examples/${{ matrix.run_on }}-system.yaml" + MILABENCH_SYSTEM: "config/examples/cloud-system.yaml" MILABENCH_BASE: "output" MILABENCH_ARGS: "" MILABENCH_GPU_ARCH: "${{ matrix.arch }}" MILABENCH_DASH: "no" + ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}" + ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}" + AZURE_CORE_OUTPUT: none steps: - uses: actions/checkout@v3 @@ -47,6 +53,17 @@ jobs: with: python-version: 3.9 + - name: Azure login + uses: azure/login@v2 + with: + creds: | + { + "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", + "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", + "tenantId": "${{ secrets.ARM_TENANT_ID }}", + "clientId": "${{ secrets.ARM_CLIENT_ID }}" + } + - name: dependencies run: | python -m pip install -U pip @@ -107,3 +124,14 @@ jobs: git config --global user.email "github-ci@example.com" git config --global user.name "GitHub CI" poetry run milabench report --push + + - name: teardown cloud + if: always() + run: | + if [[ -f "${MILABENCH_SYSTEM%.*}" ]] + then + export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*} + fi + poetry run milabench cloud \ + --teardown \ + --run-on ${{ matrix.run_on }} diff --git a/config/examples/ec2-system.yaml b/config/examples/cloud-system.yaml similarity index 83% rename from config/examples/ec2-system.yaml rename to config/examples/cloud-system.yaml index dab1a7a4e..5cf618b53 100644 --- a/config/examples/ec2-system.yaml +++ b/config/examples/cloud-system.yaml @@ -12,6 +12,10 @@ system: # Cloud instances profiles cloud_profiles: + azure: + username: ubuntu + size: Standard_B2ats_v2 + location: eastus2 ec2: username: ubuntu instance_type: t2.micro diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index c0f9c9bcb..310a4506b 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -6,7 +6,6 @@ from coleo import Option, tooled import yaml -# import milabench as mb from ..common import get_multipack @@ -62,10 +61,14 @@ def manage_cloud(pack, packs, run_on, action="setup"): "-m", cv.__name__, run_on, f"--{action}", - *[ - f"--{k.replace('_', '-')}={v}" - for k, v in plan_params.items() - ], + *list( + sum( + ( + (f"--{k.replace('_', '-')}", v) + for k, v in plan_params.items() + ), () + ) + ) ] p = subprocess.Popen( cmd, diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py index cf5ff8537..1c837a83c 100644 --- a/milabench/cli/covalent/__main__.py +++ b/milabench/cli/covalent/__main__.py @@ -1,5 +1,6 @@ import argparse import asyncio +import json import os import pathlib import subprocess @@ -7,9 +8,21 @@ import tempfile +def _load_venv(venv:pathlib.Path) -> dict: + activate = venv / "bin/activate" + if not activate.exists(): + raise FileNotFoundError(str(activate)) + env = subprocess.run( + f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'", + shell=True, + capture_output=True + ).stdout + return json.loads(env) + + def serve(*argv): return subprocess.run([ - str(pathlib.Path(sys.executable).with_name("covalent")), + "covalent", *argv ]).returncode @@ -141,7 +154,7 @@ def lattice(argv=(), deps_bash = None): print(f"hostname::>{_executor.hostname}") print(f"username::>{_executor.username}") print(f"ssh_key_file::>{_executor.ssh_key_file}") - print(f"env::>~/.condaenvrc") + print(f"env::>{_executor.env}") finally: result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None results_dir = result.results_dir if result else "" @@ -171,12 +184,11 @@ def main(argv=None): try: import covalent as ct - ct.get_config(f"executors.ec2") except (KeyError, ImportError): module = pathlib.Path(__file__).resolve().parent cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv") python3 = str(cache_dir / "bin/python3") - check_module = "import covalent ; from covalent.executor import EC2Executor" + check_module = "import covalent" try: subprocess.run([python3, "-c", check_module], check=True) except (FileNotFoundError, subprocess.CalledProcessError): @@ -190,17 +202,18 @@ def main(argv=None): "install", "-r", str(module / "requirements.txt") - ], check=True) + ], stdout=sys.stderr, check=True) subprocess.run([python3, "-c", check_module], check=True) return subprocess.call( [python3, __file__, *argv], + env=_load_venv(cache_dir) ) parser = argparse.ArgumentParser() subparsers = parser.add_subparsers() subparser = subparsers.add_parser("serve") subparser.add_argument(f"argv", nargs=argparse.REMAINDER) - for p in ("ec2",): + for p in ("azure","ec2"): try: config = ct.get_config(f"executors.{p}") except KeyError: @@ -223,11 +236,15 @@ def main(argv=None): if cv_argv[0] == "serve": assert not argv return serve(*args.argv) + elif cv_argv[0] == "azure": + executor_cls = ct.executor.AzureExecutor elif cv_argv[0] == "ec2": - return executor(ct.executor.EC2Executor, args, *argv) + executor_cls = ct.executor.EC2Executor else: raise + return executor(executor_cls, args, *argv) + if __name__ == "__main__": sys.exit(main()) diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt index 158fa227c..c988e26a3 100644 --- a/milabench/cli/covalent/requirements.txt +++ b/milabench/cli/covalent/requirements.txt @@ -1,2 +1,3 @@ covalent covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench +covalent-azure-plugin @ git+https://github.com/satyaog/covalent-azure-plugin.git@feature/milabench \ No newline at end of file diff --git a/milabench/config.py b/milabench/config.py index e276cb17c..694a9e60f 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -128,7 +128,7 @@ def get_remote_ip(): def _resolve_ip(ip): # Resolve the IP try: - hostname, aliaslist, ipaddrlist = socket.gethostbyaddr(ip) + hostname, aliaslist, ipaddrlist = socket.gethostbyname_ex(ip) lazy_raise = None except socket.gaierror as err: # Get Addr Info (GAI) Error diff --git a/milabench/remote.py b/milabench/remote.py index b1759f2fa..e8aaa8312 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -249,7 +249,7 @@ def milabench_remote_config(pack, packs): pack, "(", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")", "||", "(", "sudo", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], - "&&", "sudo", "chmod", "-R", "a+rwX", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")", + "&&", "sudo", "chown", "-R", "$USER:$USER", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")", ), n["ip"], ), From 6dbbd05260a3b902aea6a0a37b421198ed567f53 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 2 Apr 2024 02:10:36 -0400 Subject: [PATCH 07/22] Fix reports --- milabench/cli/badges/__main__.py | 28 +++----------------- milabench/cli/report.py | 2 +- milabench/cli/utils.py | 44 ++++++++++++++++++++++++++++++++ milabench/common.py | 9 +++---- 4 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 milabench/cli/utils.py diff --git a/milabench/cli/badges/__main__.py b/milabench/cli/badges/__main__.py index 027a59a4b..e0a7bdc81 100644 --- a/milabench/cli/badges/__main__.py +++ b/milabench/cli/badges/__main__.py @@ -1,4 +1,3 @@ -import pathlib import subprocess import sys @@ -10,29 +9,10 @@ def main(argv=None): try: import pybadges as _ except ImportError: - module = pathlib.Path(__file__).resolve().parent - cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv") - python3 = str(cache_dir / "bin/python3") - check_module = "import pybadges" - try: - subprocess.run([python3, "-c", check_module], check=True) - except (FileNotFoundError, subprocess.CalledProcessError): - cache_dir.mkdir(parents=True, exist_ok=True) - subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True) - subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True) - subprocess.run([ - python3, - "-m", - "pip", - "install", - "-r", - str(module / "requirements.txt") - ], check=True) - subprocess.run([python3, "-c", check_module], check=True) - return subprocess.call( - [python3, __file__, *argv], - ) - + from ..utils import run_in_module_venv + check_if_module = "import pybadges" + return run_in_module_venv(__file__, check_if_module, argv) + return subprocess.run([ sys.executable, "-m", diff --git a/milabench/cli/report.py b/milabench/cli/report.py index b14b49528..a65d9f31b 100644 --- a/milabench/cli/report.py +++ b/milabench/cli/report.py @@ -119,4 +119,4 @@ def cli_report(args=None): XPath(pack_config["dirs"]["base"]) / "reports" for pack_config in args.config.values() )) - _push_reports(reports_repo, args.runs, summary) + _push_reports(reports_repo, args.runs) diff --git a/milabench/cli/utils.py b/milabench/cli/utils.py new file mode 100644 index 000000000..5aec72d06 --- /dev/null +++ b/milabench/cli/utils.py @@ -0,0 +1,44 @@ +import json +import pathlib +import subprocess +import sys + + +def get_venv(venv:pathlib.Path) -> dict: + activate = venv / "bin/activate" + if not activate.exists(): + raise FileNotFoundError(str(activate)) + env = subprocess.run( + f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'", + shell=True, + capture_output=True + ).stdout + return json.loads(env) + + +def run_in_module_venv(module_main:str, check_if_module:str, argv:list=None): + module = pathlib.Path(module_main).resolve().parent + cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv") + python3 = str(cache_dir / "bin/python3") + try: + subprocess.run([python3, "-c", check_if_module], check=True, + stdout=sys.stderr) + except (FileNotFoundError, subprocess.CalledProcessError): + cache_dir.mkdir(parents=True, exist_ok=True) + subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], + check=True, stdout=sys.stderr) + subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], + check=True, stdout=sys.stderr) + subprocess.run([ + python3, + "-m", + "pip", + "install", + "-r", + str(module / "requirements.txt") + ], stdout=sys.stderr, check=True) + subprocess.run([python3, "-c", check_if_module], check=True, stdout=sys.stderr) + return subprocess.call( + [python3, module_main, *argv], + env=get_venv(cache_dir) + ) \ No newline at end of file diff --git a/milabench/common.py b/milabench/common.py index 92bde8c4e..0fc540dde 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -322,7 +322,7 @@ def _filter_reports(*reports): return all_reports -def _push_reports(reports_repo, runs, packs:dict=None): +def _push_reports(reports_repo, runs): _SVG_COLORS = { "pass": "blue", "partial": "yellow", @@ -355,10 +355,9 @@ def _push_reports(reports_repo, runs, packs:dict=None): meta = [e["data"] for _r in reports for e in _r if e["event"] == "meta"] - for _meta in meta: - for gpu in _meta["accelerators"]["gpus"].values(): - device = gpu["product"].replace(" ", "_") - break + for gpu in (_ for _meta in meta for _ in _meta["accelerators"]["gpus"].values()): + device = gpu["product"].replace(" ", "_") + break else: for _meta in meta: device = _meta["cpu"]["brand"].replace(" ", "_") From a18a03461587a9d36b25806a98ca2dfcf3d40700 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Mon, 1 Apr 2024 15:14:58 -0400 Subject: [PATCH 08/22] Fix cloud-ci with gpu arch --- .github/workflows/cloud-ci.yml | 18 +++++---- config/cloud-system.yaml | 18 +++++++++ config/examples/cloud-system.yaml | 7 ++++ config/{ => examples}/test.yaml | 4 +- milabench/cli/cloud.py | 53 +++++++++++++++++---------- milabench/cli/covalent/__main__.py | 59 +++--------------------------- milabench/config.py | 8 +++- milabench/remote.py | 29 ++------------- 8 files changed, 87 insertions(+), 109 deletions(-) create mode 100644 config/cloud-system.yaml rename config/{ => examples}/test.yaml (79%) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index b73029626..71e471252 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -17,7 +17,7 @@ jobs: include: - arch: cuda exclude: "no-cuda" - run_on: azure + run_on: azure__a100 # - arch: rocm # exclude : "no-rocm" @@ -34,8 +34,8 @@ jobs: shell: bash -el {0} env: - MILABENCH_CONFIG: "config/test.yaml" - MILABENCH_SYSTEM: "config/examples/cloud-system.yaml" + MILABENCH_CONFIG: "config/standard.yaml" + MILABENCH_SYSTEM: "config/cloud-system.yaml" MILABENCH_BASE: "output" MILABENCH_ARGS: "" MILABENCH_GPU_ARCH: "${{ matrix.arch }}" @@ -53,15 +53,18 @@ jobs: with: python-version: 3.9 + # Follow + # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret + # to generate a clientId as well as a clientSecret - name: Azure login uses: azure/login@v2 with: creds: | { + "clientId": "${{ secrets.ARM_CLIENT_ID }}", "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", - "tenantId": "${{ secrets.ARM_TENANT_ID }}", - "clientId": "${{ secrets.ARM_CLIENT_ID }}" + "tenantId": "${{ secrets.ARM_TENANT_ID }}" } - name: dependencies @@ -108,7 +111,7 @@ jobs: - name: install benchmarks run: | - poetry run milabench install + poetry run milabench install --variant ${{ matrix.arch }} - name: prepare benchmarks run: | @@ -134,4 +137,5 @@ jobs: fi poetry run milabench cloud \ --teardown \ - --run-on ${{ matrix.run_on }} + --run-on ${{ matrix.run_on }} \ + --all diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml new file mode 100644 index 000000000..d1889c724 --- /dev/null +++ b/config/cloud-system.yaml @@ -0,0 +1,18 @@ +system: + # Nodes list + nodes: + # Alias used to reference the node + - name: manager + # Use 1.1.1.1 as an ip placeholder + ip: 1.1.1.1 + # Use this node as the master node or not + main: true + # User to use in remote milabench operations + user: user + + # Cloud instances profiles + cloud_profiles: + azure__a100: + username: ubuntu + size: Standard_NC24ads_A100_v4 + location: eastus2 diff --git a/config/examples/cloud-system.yaml b/config/examples/cloud-system.yaml index 5cf618b53..b3d1f70aa 100644 --- a/config/examples/cloud-system.yaml +++ b/config/examples/cloud-system.yaml @@ -12,11 +12,18 @@ system: # Cloud instances profiles cloud_profiles: + # The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME} azure: + # covalent-azure-plugin args + username: ubuntu + size: Standard_B1s + location: eastus2 + azure__free: username: ubuntu size: Standard_B2ats_v2 location: eastus2 ec2: + # covalent-ec2-plugin args username: ubuntu instance_type: t2.micro volume_size: 8 diff --git a/config/test.yaml b/config/examples/test.yaml similarity index 79% rename from config/test.yaml rename to config/examples/test.yaml index 060949e40..6e155a0bf 100644 --- a/config/test.yaml +++ b/config/examples/test.yaml @@ -9,14 +9,14 @@ test: inherits: _defaults group: test_remote install_group: test_remote - definition: ../benchmarks/_template + definition: ../../benchmarks/_template plan: method: njobs n: 1 testing: inherits: _defaults - definition: ../benchmarks/_template + definition: ../../benchmarks/_template group: test_remote_2 install_group: test_remote_2 plan: diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index 310a4506b..c21fe16d9 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -4,6 +4,7 @@ import sys from coleo import Option, tooled +from omegaconf import OmegaConf import yaml from ..common import get_multipack @@ -15,7 +16,16 @@ _ACTIONS = (_SETUP, _TEARDOWN, _LIST) -def manage_cloud(pack, packs, run_on, action="setup"): +def _flatten_cli_args(**kwargs): + return sum( + ( + (f"--{k.replace('_', '-')}", *([v] if v else [])) + for k, v in kwargs.items() + ), () + ) + + +def manage_cloud(pack, run_on, action="setup"): assert run_on in pack.config["system"]["cloud_profiles"] key_map = { @@ -28,11 +38,6 @@ def manage_cloud(pack, packs, run_on, action="setup"): nodes = iter(enumerate(pack.config["system"]["nodes"])) - state_prefix = [] - for p in packs.values(): - state_prefix.append(p.config["name"]) - state_prefix.append(p.config["install_variant"]) - while True: try: i, n = next(nodes) @@ -41,8 +46,10 @@ def manage_cloud(pack, packs, run_on, action="setup"): except StopIteration: break - plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), *state_prefix]) + plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), run_on]) plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"] + if i > 0: + plan_params["reuse_resource_group"] = None import milabench.cli.covalent as cv @@ -59,16 +66,9 @@ def manage_cloud(pack, packs, run_on, action="setup"): cmd = [ sys.executable, "-m", cv.__name__, - run_on, + run_on.split("__")[0], f"--{action}", - *list( - sum( - ( - (f"--{k.replace('_', '-')}", v) - for k, v in plan_params.items() - ), () - ) - ) + *_flatten_cli_args(**plan_params) ] p = subprocess.Popen( cmd, @@ -121,7 +121,8 @@ def _setup(): mp = get_multipack() setup_pack = mp.setup_pack() - system_config = manage_cloud(setup_pack, mp.packs, run_on, action=_SETUP) + system_config = manage_cloud(setup_pack, run_on, action=_SETUP) + del system_config["arch"] print(f"# hash::>{setup_pack.config['hash']}") print(yaml.dump({"system": system_config})) @@ -131,12 +132,24 @@ def _setup(): def _teardown(): """Teardown a cloud infrastructure""" - # Setup cloud on target infra + # Teardown cloud instance on target infra run_on: Option & str - mp = get_multipack() + # Teardown all cloud instances + all: Option & bool = False + + overrides = {} + if all: + overrides = { + "*": OmegaConf.to_object(OmegaConf.from_dotlist([ + f"system.cloud_profiles.{run_on}.state_prefix='*'", + f"system.cloud_profiles.{run_on}.state_id='*'", + ])) + } + + mp = get_multipack(overrides=overrides) setup_pack = mp.setup_pack() - manage_cloud(setup_pack, mp.packs, run_on, action=_TEARDOWN) + manage_cloud(setup_pack, run_on, action=_TEARDOWN) @tooled diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py index 1c837a83c..9537cc2dd 100644 --- a/milabench/cli/covalent/__main__.py +++ b/milabench/cli/covalent/__main__.py @@ -1,6 +1,5 @@ import argparse import asyncio -import json import os import pathlib import subprocess @@ -8,18 +7,6 @@ import tempfile -def _load_venv(venv:pathlib.Path) -> dict: - activate = venv / "bin/activate" - if not activate.exists(): - raise FileNotFoundError(str(activate)) - env = subprocess.run( - f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'", - shell=True, - capture_output=True - ).stdout - return json.loads(env) - - def serve(*argv): return subprocess.run([ "covalent", @@ -119,21 +106,9 @@ def lattice(argv=(), deps_bash = None): deps_bash = None if not argv and args.setup: - conda_prefix = "eval \"$(conda shell.bash hook)\"" - conda_activate = "conda activate milabench" - deps_bash = [] - for _cmd in ( - f"{conda_activate} || conda create -n milabench -y", - f"{conda_activate}" - f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y" - f" || >&2 echo First attempt to install python in milabench env failed", - f"{conda_activate}" - f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y" - f" || conda remove -n milabench --all -y", - ): - deps_bash.append(f"{conda_prefix} && ({_cmd})") - deps_bash = ct.DepsBash(deps_bash) - argv = ["conda", "env", "list"] + deps_bash = ct.DepsBash([]) + # Make sure pip is installed + argv = ["python3", "-m", "pip", "freeze"] if argv: dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash) @@ -141,7 +116,6 @@ def lattice(argv=(), deps_bash = None): return_code, stdout, _ = result.result if result.result is not None else (1, "", "") if return_code == 0 and args.setup: - assert any([l for l in stdout.split("\n") if l.startswith("milabench ")]) _executor:ct.executor.BaseExecutor = executor_cls( **{ **_get_executor_kwargs(args), @@ -154,7 +128,6 @@ def lattice(argv=(), deps_bash = None): print(f"hostname::>{_executor.hostname}") print(f"username::>{_executor.username}") print(f"ssh_key_file::>{_executor.ssh_key_file}") - print(f"env::>{_executor.env}") finally: result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None results_dir = result.results_dir if result else "" @@ -185,29 +158,9 @@ def main(argv=None): try: import covalent as ct except (KeyError, ImportError): - module = pathlib.Path(__file__).resolve().parent - cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv") - python3 = str(cache_dir / "bin/python3") - check_module = "import covalent" - try: - subprocess.run([python3, "-c", check_module], check=True) - except (FileNotFoundError, subprocess.CalledProcessError): - cache_dir.mkdir(parents=True, exist_ok=True) - subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True) - subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True) - subprocess.run([ - python3, - "-m", - "pip", - "install", - "-r", - str(module / "requirements.txt") - ], stdout=sys.stderr, check=True) - subprocess.run([python3, "-c", check_module], check=True) - return subprocess.call( - [python3, __file__, *argv], - env=_load_venv(cache_dir) - ) + from ..utils import run_in_module_venv + check_if_module = "import covalent" + return run_in_module_venv(__file__, check_if_module, argv) parser = argparse.ArgumentParser() subparsers = parser.add_subparsers() diff --git a/milabench/config.py b/milabench/config.py index 694a9e60f..f50d735cb 100644 --- a/milabench/config.py +++ b/milabench/config.py @@ -173,6 +173,10 @@ def resolve_addresses(nodes): is_local = ( ("127.0.0.1" in ipaddrlist) or (hostname in ("localhost", socket.gethostname())) + # Tmp workaround until networking on azure allows to associate the + # local hostname (`hostname.split(".")[0]`) with the public fqdn + # (hostname.split(".")[0].*.cloudapp.azure.com) + or (hostname.split(".")[0] == socket.gethostname()) or len(ip_list.intersection(ipaddrlist)) > 0 ) node["local"] = is_local @@ -227,9 +231,9 @@ def build_system_config(config_file, defaults=None, gpu=True): config = yaml.safe_load(cf) if defaults: - config = merge(defaults, config) + config["system"] = merge(defaults["system"], config["system"]) - system = config.get("system", {}) + system = config["system"] # capacity is only required if batch resizer is enabled if (gpu or is_autoscale_enabled()) and not "gpu" not in system: diff --git a/milabench/remote.py b/milabench/remote.py index e8aaa8312..bbf1b4f0f 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -1,3 +1,4 @@ +from copy import deepcopy import os import sys @@ -78,7 +79,7 @@ def pip_install_milabench(pack, node, folder) -> SSHCommand: host = node["ip"] user = node["user"] - cmd = ["pip", "install", "-e", folder] + cmd = ["python3", "-m", "pip", "install", "-e", folder] plan = CmdCommand(pack, *cmd) return SSHCommand(plan, host=host, user=user) @@ -184,8 +185,9 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand: CmdCommand( worker_pack(pack, worker), "cd", f"{INSTALL_FOLDER}", "&&", - f"MILABENCH_CONFIG={pack.config['config_file']}", f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}", + f"MILABENCH_CONFIG={os.environ.get('MILABENCH_CONFIG', '')}", + f"MILABENCH_SYSTEM={os.environ.get('MILABENCH_SYSTEM', '')}", "milabench", *command ), host=host, @@ -232,16 +234,6 @@ def _sanity(pack, setup_for): def milabench_remote_config(pack, packs): - config = {} - config_hash = pack.config["hash"] - config_file = XPath(pack.config["config_file"]) - config_file = config_file.with_name(f"{config_file.name}.{config_hash}") - pack.config["config_file"] = str(config_file) - for p in packs.values(): - config[p.config["name"]] = p.config - p.config["config_file"] = str(config_file) - config_file.write_text(yaml.dump(config)) - for n in pack.config["system"]["nodes"]: _cmds = [ SSHCommand( @@ -253,18 +245,6 @@ def milabench_remote_config(pack, packs): ), n["ip"], ), - SSHCommand( - CmdCommand( - pack, - "mkdir", "-p", str(config_file.parent), - ), - n["ip"], - ), - SCPCommand( - pack, - n["ip"], - str(config_file), - ), ] yield SequenceCommand(*_cmds) @@ -280,7 +260,6 @@ def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand: argv = sys.argv[2:] return SequenceCommand( milabench_remote_setup_plan(pack, setup_for), - milabench_remote_command(pack, "pin", *argv, run_for=setup_for), milabench_remote_command(pack, "install", *argv, run_for=setup_for), ) From 5267334ba5381623645d12db048983d45ecf0e6f Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 3 Apr 2024 00:29:32 -0400 Subject: [PATCH 09/22] Add multi-node on cloud --- config/examples/cloud-multinodes-system.yaml | 36 ++++++++++++++++++++ milabench/cli/cloud.py | 22 +++++------- milabench/cli/covalent/__main__.py | 9 ++--- 3 files changed, 50 insertions(+), 17 deletions(-) create mode 100644 config/examples/cloud-multinodes-system.yaml diff --git a/config/examples/cloud-multinodes-system.yaml b/config/examples/cloud-multinodes-system.yaml new file mode 100644 index 000000000..a5b45c606 --- /dev/null +++ b/config/examples/cloud-multinodes-system.yaml @@ -0,0 +1,36 @@ +system: + # Nodes list + nodes: + # Alias used to reference the node + - name: manager + # Use 1.1.1.1 as an ip placeholder + ip: 1.1.1.1 + # Use this node as the master node or not + main: true + # User to use in remote milabench operations + user: user + + - name: node1 + ip: 1.1.1.1 + main: false + user: username + + # Cloud instances profiles + cloud_profiles: + # The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME} + azure: + # covalent-azure-plugin args + username: ubuntu + size: Standard_B1s + location: eastus2 + azure__free: + username: ubuntu + size: Standard_B2ats_v2 + location: eastus2 + ec2: + # covalent-ec2-plugin args + username: ubuntu + instance_type: t2.micro + volume_size: 8 + region: us-east-2 + state_id: 71669879043a3864225aabb94f91a2d4 diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index c21fe16d9..d93e4fec4 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -19,7 +19,7 @@ def _flatten_cli_args(**kwargs): return sum( ( - (f"--{k.replace('_', '-')}", *([v] if v else [])) + (f"--{str(k).replace('_', '-')}", *([str(v)] if str(v) else [])) for k, v in kwargs.items() ), () ) @@ -35,21 +35,17 @@ def manage_cloud(pack, run_on, action="setup"): "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])), } plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on]) + run_on, *profile = run_on.split("__") + profile = profile[0] if profile else "" nodes = iter(enumerate(pack.config["system"]["nodes"])) + for i, n in nodes: + if n["ip"] != "1.1.1.1": + continue - while True: - try: - i, n = next(nodes) - if n["ip"] != "1.1.1.1": - continue - except StopIteration: - break - - plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), run_on]) + plan_params["state_prefix"] = plan_params.get("state_prefix", None) or profile or run_on plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"] - if i > 0: - plan_params["reuse_resource_group"] = None + plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1) import milabench.cli.covalent as cv @@ -66,7 +62,7 @@ def manage_cloud(pack, run_on, action="setup"): cmd = [ sys.executable, "-m", cv.__name__, - run_on.split("__")[0], + run_on, f"--{action}", *_flatten_cli_args(**plan_params) ] diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py index 9537cc2dd..f4f2ca47d 100644 --- a/milabench/cli/covalent/__main__.py +++ b/milabench/cli/covalent/__main__.py @@ -124,10 +124,11 @@ def lattice(argv=(), deps_bash = None): ) asyncio.run(_executor.setup({})) - assert _executor.hostname - print(f"hostname::>{_executor.hostname}") - print(f"username::>{_executor.username}") - print(f"ssh_key_file::>{_executor.ssh_key_file}") + assert _executor.hostnames + for hostname in _executor.hostnames: + print(f"hostname::>{hostname}") + print(f"username::>{_executor.username}") + print(f"ssh_key_file::>{_executor.ssh_key_file}") finally: result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None results_dir = result.results_dir if result else "" From 15b2d9c204800c72c7afd363150df818177f5e43 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 9 Apr 2024 13:57:57 -0400 Subject: [PATCH 10/22] Fix cloud data dir * VM on the cloud might not have enough space on all partitions. Add a workaround which should cover most cases * Use branch and commit name to versionize reports directories * Fix parsing error when temperature is not available in nvidia-smi outputs * export MILABENCH_* env vars to remote --- config/cloud-multinodes-system.yaml | 31 ++++++++ config/cloud-system.yaml | 8 ++ config/examples/cloud-multinodes-system.yaml | 3 +- docs/usage.rst | 81 ++++++++++++++++++++ milabench/cli/cloud.py | 21 ++++- milabench/cli/covalent/__main__.py | 2 +- milabench/cli/covalent/requirements.txt | 2 +- milabench/common.py | 28 +++---- milabench/log.py | 13 +++- milabench/remote.py | 12 ++- 10 files changed, 178 insertions(+), 23 deletions(-) create mode 100644 config/cloud-multinodes-system.yaml diff --git a/config/cloud-multinodes-system.yaml b/config/cloud-multinodes-system.yaml new file mode 100644 index 000000000..e5dc14f2b --- /dev/null +++ b/config/cloud-multinodes-system.yaml @@ -0,0 +1,31 @@ +system: + # Nodes list + nodes: + # Alias used to reference the node + - name: manager + # Use 1.1.1.1 as an ip placeholder + ip: 1.1.1.1 + # Use this node as the master node or not + main: true + # User to use in remote milabench operations + user: user + + - name: node1 + ip: 1.1.1.1 + main: false + user: username + + # Cloud instances profiles + cloud_profiles: + azure__a100: + username: ubuntu + size: Standard_NC24ads_A100_v4 + location: eastus2 + azure__a100_x2: + username: ubuntu + size: Standard_NC48ads_A100_v4 + location: eastus2 + azure__a10_x2: + username: ubuntu + size: Standard_NV72ads_A10_v5 + location: eastus2 diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml index d1889c724..2d1a049ad 100644 --- a/config/cloud-system.yaml +++ b/config/cloud-system.yaml @@ -16,3 +16,11 @@ system: username: ubuntu size: Standard_NC24ads_A100_v4 location: eastus2 + azure__a100_x2: + username: ubuntu + size: Standard_NC48ads_A100_v4 + location: eastus2 + azure__a10_x2: + username: ubuntu + size: Standard_NV72ads_A10_v5 + location: eastus2 diff --git a/config/examples/cloud-multinodes-system.yaml b/config/examples/cloud-multinodes-system.yaml index a5b45c606..5066af5eb 100644 --- a/config/examples/cloud-multinodes-system.yaml +++ b/config/examples/cloud-multinodes-system.yaml @@ -17,7 +17,8 @@ system: # Cloud instances profiles cloud_profiles: - # The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME} + # The cloud platform to use in the form of {PLATFORM} or + # {PLATFORM}__{PROFILE_NAME} azure: # covalent-azure-plugin args username: ubuntu diff --git a/docs/usage.rst b/docs/usage.rst index ecea88b75..26c308513 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -69,3 +69,84 @@ The following command will print out a report of the tests that ran, the metrics milabench report --runs $MILABENCH_BASE/runs/some_specific_run --html report.html The report will also print out a score based on a weighting of the metrics, as defined in the file ``$MILABENCH_CONFIG`` points to. + + +Use milabench on the cloud +~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Setup Terraform and a free Azure account +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. | Install azure cli (it does not need to be in the same environment than + milabench) + | ``pip install azure-cli`` + +2. Setup a free account on + `azure.microsoft.com `_ + +3. Follow instructions in the + `azurerm documentation `_ + to generate a ``ARM_CLIENT_ID`` as well as a ``ARM_CLIENT_SECRET``. If you + don't have the permissions to create / assign a role to a service principal, + you can ignore the ``az ad sp create-for-rbac`` command to work directly with + your ``ARM_TENANT_ID`` and ``ARM_SUBSCRIPTION_ID`` + +4. `Install Terraform `_ + +5. Configure the ``azurerm`` Terraform provider by + `exporting the environment variables `_ + + +Create a cloud system configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add a ``cloud_profiles`` section to the ``system`` configuration which lists the +supported cloud profiles. + +.. notes:: + + Nodes that should be created on the cloud should have the ``1.1.1.1`` ip + address placeholder. Other ip addresses will be used as-is and no cloud + instance will be created for that node + +.. notes:: + + A cloud profile entry needs to start with a covalent plugin (e.g. `azure`). To + define multiple profiles on the same cloud platform, use the form + ``{PLATFORM}__{PROFILE_NAME}`` (e.g. ``azure__profile``). All cloud profile + attributes will be used as is as argument for the target covalent plugin + +.. code-block:: yaml + + system: + nodes: + - name: manager + # Use 1.1.1.1 as an ip placeholder + ip: 1.1.1.1 + main: true + user: + - name: node1 + ip: 1.1.1.1 + main: false + user: + + # Cloud instances profiles + cloud_profiles: + # The cloud platform to use in the form of {PLATFORM} or + # {PLATFORM}__{PROFILE_NAME} + azure__free: + # covalent-azure-plugin args + username: ubuntu + size: Standard_B2ats_v2 + location: eastus2 + + +Run milabench on the cloud +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. | Initialize the cloud instances + | ``milabench cloud --system {{SYSTEM_CONFIG.YAML}} --setup --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}`` + +2. | Prepare, install and run milabench + | ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}`` diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index d93e4fec4..150e37749 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -7,8 +7,9 @@ from omegaconf import OmegaConf import yaml -from ..common import get_multipack +from milabench.fs import XPath +from ..common import get_multipack _SETUP = "setup" _TEARDOWN = "teardown" @@ -25,8 +26,12 @@ def _flatten_cli_args(**kwargs): ) +def _or_sudo(cmd:str): + return f"( {cmd} || sudo {cmd} )" + + def manage_cloud(pack, run_on, action="setup"): - assert run_on in pack.config["system"]["cloud_profiles"] + assert run_on in pack.config["system"]["cloud_profiles"], f"{run_on} cloud profile not found in {list(pack.config['system']['cloud_profiles'].keys())}" key_map = { "hostname":(lambda v: ("ip",v)), @@ -38,6 +43,9 @@ def manage_cloud(pack, run_on, action="setup"): run_on, *profile = run_on.split("__") profile = profile[0] if profile else "" + remote_base = XPath("/data") / pack.dirs.base.name + local_base = pack.dirs.base.absolute().parent + nodes = iter(enumerate(pack.config["system"]["nodes"])) for i, n in nodes: if n["ip"] != "1.1.1.1": @@ -66,6 +74,15 @@ def manage_cloud(pack, run_on, action="setup"): f"--{action}", *_flatten_cli_args(**plan_params) ] + if action == _SETUP: + cmd += [ + "--", + "bash", "-c", + _or_sudo(f"mkdir -p '{local_base.parent}'") + + " && " + _or_sudo(f"chmod a+rwX '{local_base.parent}'") + + f" && mkdir -p '{remote_base}'" + f" && ln -sfT '{remote_base}' '{local_base}'" + ] p = subprocess.Popen( cmd, stdout=subprocess.PIPE, diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py index f4f2ca47d..eb602ee27 100644 --- a/milabench/cli/covalent/__main__.py +++ b/milabench/cli/covalent/__main__.py @@ -113,7 +113,7 @@ def lattice(argv=(), deps_bash = None): if argv: dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash) result = ct.get_result(dispatch_id=dispatch_id, wait=True) - return_code, stdout, _ = result.result if result.result is not None else (1, "", "") + return_code, _, _ = result.result if result.result is not None else (1, "", "") if return_code == 0 and args.setup: _executor:ct.executor.BaseExecutor = executor_cls( diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt index c988e26a3..b70efc793 100644 --- a/milabench/cli/covalent/requirements.txt +++ b/milabench/cli/covalent/requirements.txt @@ -1,3 +1,3 @@ -covalent +covalent==0.232 covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench covalent-azure-plugin @ git+https://github.com/satyaog/covalent-azure-plugin.git@feature/milabench \ No newline at end of file diff --git a/milabench/common.py b/milabench/common.py index 0fc540dde..ff0388df2 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -363,12 +363,8 @@ def _push_reports(reports_repo, runs): device = _meta["cpu"]["brand"].replace(" ", "_") break - tag = ([ - t.name - for t in _repo.tags - if meta[0]["milabench"]["tag"].startswith(t.name) - ] or [meta[0]["milabench"]["tag"]])[0] - reports_dir = XPath(reports_repo.working_tree_dir) / tag + build = "-".join([_repo.active_branch.name.replace(os.path.sep, "_"), next(_repo.iter_commits()).hexsha]) + reports_dir = XPath(reports_repo.working_tree_dir) / build run = XPath(run) try: @@ -376,13 +372,16 @@ def _push_reports(reports_repo, runs): except FileExistsError: pass - device_reports.setdefault((device, tag), set()) - device_reports[(device, tag)].update( + for _f in (reports_dir / device / run.name).glob("*.stderr"): + _f.unlink() + + device_reports.setdefault((device, build), set()) + device_reports[(device, build)].update( (reports_dir / device).glob("*/") ) - for (device, tag), reports in device_reports.items(): - reports_dir = XPath(reports_repo.working_tree_dir) / tag + for (device, build), reports in device_reports.items(): + reports_dir = XPath(reports_repo.working_tree_dir) / build reports = _read_reports(*reports) reports = _filter_reports(*reports.values()) summary = make_summary(reports) @@ -404,9 +403,10 @@ def _push_reports(reports_repo, runs): "--left-text", device, "--right-text", text, "--right-color", _SVG_COLORS[text], - "--whole-link", str(reports_url / tag / device) + "--whole-link", str(reports_url / build / device) ], - capture_output=True + capture_output=True, + check=True ) if result.returncode == 0: (reports_dir / device / "badge.svg").write_text(result.stdout.decode("utf8")) @@ -418,8 +418,8 @@ def _push_reports(reports_repo, runs): for cmd, _kwargs in ( (["git", "pull"], {"check": True}), - (["git", "add", tag], {"check": True}), - (["git", "commit", "-m", tag], {"check": False}), + (["git", "add", build], {"check": True}), + (["git", "commit", "-m", build], {"check": False}), (["git", "push"], {"check": True}) ): subprocess.run( diff --git a/milabench/log.py b/milabench/log.py index a6f7388a9..3724b34c3 100644 --- a/milabench/log.py +++ b/milabench/log.py @@ -333,6 +333,16 @@ def on_end(self, entry, data, row): self.refresh() +_NO_DEFAULT_FLAG=("__NO_DEFAULT__",) +def _parse_int(value, default=_NO_DEFAULT_FLAG): + try: + return int(value) + except TypeError: + if default is not _NO_DEFAULT_FLAG: + return default + raise + + class LongDashFormatter(DashFormatter): def make_table(self): table = Table.grid(padding=(0, 3, 0, 0)) @@ -375,7 +385,8 @@ def on_data(self, entry, data, row): for gpuid, data in gpudata.items(): load = int(data.get("load", 0) * 100) currm, totalm = data.get("memory", [0, 0]) - temp = int(data.get("temperature", 0)) + # "temperature" is sometimes reported as None for some GPUs? A10? + temp = _parse_int(data.get("temperature", 0), 0) row[f"gpu:{gpuid}"] = ( f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C" ) diff --git a/milabench/remote.py b/milabench/remote.py index bbf1b4f0f..b657f98c5 100644 --- a/milabench/remote.py +++ b/milabench/remote.py @@ -20,6 +20,14 @@ INSTALL_FOLDER = str(ROOT_FOLDER) +def milabench_env() -> list: + return [ + f"{envvar}={os.environ[envvar]}" + for envvar in os.environ + if envvar.split("_")[0] == "MILABENCH" and os.environ[envvar] + ] + + def scp(node, folder, dest=None) -> list: """Copy a folder from local node to remote node""" host = node["ip"] @@ -185,9 +193,7 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand: CmdCommand( worker_pack(pack, worker), "cd", f"{INSTALL_FOLDER}", "&&", - f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}", - f"MILABENCH_CONFIG={os.environ.get('MILABENCH_CONFIG', '')}", - f"MILABENCH_SYSTEM={os.environ.get('MILABENCH_SYSTEM', '')}", + *milabench_env(), "milabench", *command ), host=host, From 4ccea235126018e349572a27e5b086e7a48c23bc Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Fri, 12 Apr 2024 14:42:50 -0400 Subject: [PATCH 11/22] Add docs --- docs/dev-usage.rst | 13 +++++++++++++ docs/usage.rst | 7 +++++++ 2 files changed, 20 insertions(+) diff --git a/docs/dev-usage.rst b/docs/dev-usage.rst index 42a9871e2..58d66fb0c 100644 --- a/docs/dev-usage.rst +++ b/docs/dev-usage.rst @@ -97,3 +97,16 @@ milabench compare ~~~~~~~~~~~~~~~~~ TODO. + +Using milabench on the cloud +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Milabench uses `Terraform `_ through +`Covalent `_. To add support for a new cloud +platform you will need to develop a new clovalent plugin with it's Terraform +config. An example is the +`covalent-azure-plugin `_. +The interesting parts would be: + +* `Terraform provider's related plugin arguments `_ +* `Terraform provider's configuration `_ diff --git a/docs/usage.rst b/docs/usage.rst index 26c308513..76aed5934 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -150,3 +150,10 @@ Run milabench on the cloud 2. | Prepare, install and run milabench | ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}`` + +3. | Destroy the cloud instances + | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}`` + | or + | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all`` + | to destroy not just a single cloud instance but all instances on a + specified platform that were instanced from the current local machine From d7f9366282db77fb17881b3d947c6efba97d8871 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 11:05:15 -0400 Subject: [PATCH 12/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 71e471252..9762ff3ff 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -6,6 +6,7 @@ on: branches: - master +# Trigger CI permissions: id-token: write From d132c48b060da7f2ed868476606753e67f7f8528 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 11:07:06 -0400 Subject: [PATCH 13/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 9762ff3ff..051c1d4e9 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -48,7 +48,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - token: ${{ secrets.REPORTS_PAT }} + token: ${{ github.token }} - uses: actions/setup-python@v2 with: From 131a98cbcfed1f6ec5235975bf0d00dd6080daf9 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 11:32:36 -0400 Subject: [PATCH 14/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 051c1d4e9..473bab0a2 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -6,7 +6,6 @@ on: branches: - master -# Trigger CI permissions: id-token: write From e8e2a6de36d25b30c14f5a3491aa665a49147d91 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 11:36:40 -0400 Subject: [PATCH 15/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 473bab0a2..89c03df67 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -6,6 +6,7 @@ on: branches: - master +# permissions: id-token: write From a72f9b568b39214138c52ddd89d5ffed76e2e634 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 11:40:44 -0400 Subject: [PATCH 16/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 89c03df67..5c745c057 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -57,17 +57,22 @@ jobs: # Follow # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret # to generate a clientId as well as a clientSecret - - name: Azure login - uses: azure/login@v2 - with: - creds: | - { - "clientId": "${{ secrets.ARM_CLIENT_ID }}", - "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", - "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", - "tenantId": "${{ secrets.ARM_TENANT_ID }}" - } - + # - name: Azure login + # uses: azure/login@v2 + # with: + # creds: | + # { + # "clientId": "${{ secrets.ARM_CLIENT_ID }}", + # "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", + # "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", + # "tenantId": "${{ secrets.ARM_TENANT_ID }}" + # } + + - name: Azure Login + run: | + pip install azure-cli + az login --service-principal -u "${{ secrets.ARM_CLIENT_ID }}" -p "${{ secrets.ARM_CLIENT_SECRET }}" --tenant "${{ secrets.ARM_TENANT_ID }}" + - name: dependencies run: | python -m pip install -U pip From 78c54582b844ca62fa63719ba68bc5df7f1bd1f4 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 11:57:24 -0400 Subject: [PATCH 17/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 5c745c057..e0db94033 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -6,7 +6,6 @@ on: branches: - master -# permissions: id-token: write From 3b3a5acb5ea30f88f97d5c97f3b24ab8c49f2906 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 12:03:34 -0400 Subject: [PATCH 18/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index e0db94033..473bab0a2 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -56,22 +56,17 @@ jobs: # Follow # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret # to generate a clientId as well as a clientSecret - # - name: Azure login - # uses: azure/login@v2 - # with: - # creds: | - # { - # "clientId": "${{ secrets.ARM_CLIENT_ID }}", - # "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", - # "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", - # "tenantId": "${{ secrets.ARM_TENANT_ID }}" - # } - - - name: Azure Login - run: | - pip install azure-cli - az login --service-principal -u "${{ secrets.ARM_CLIENT_ID }}" -p "${{ secrets.ARM_CLIENT_SECRET }}" --tenant "${{ secrets.ARM_TENANT_ID }}" - + - name: Azure login + uses: azure/login@v2 + with: + creds: | + { + "clientId": "${{ secrets.ARM_CLIENT_ID }}", + "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}", + "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}", + "tenantId": "${{ secrets.ARM_TENANT_ID }}" + } + - name: dependencies run: | python -m pip install -U pip From db65d0562ae4b61617dbdd3e67db83225e3743a4 Mon Sep 17 00:00:00 2001 From: Setepenre Date: Mon, 15 Apr 2024 12:48:40 -0400 Subject: [PATCH 19/22] Update cloud-ci.yml --- .github/workflows/cloud-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 473bab0a2..7587c59ab 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -9,6 +9,7 @@ on: permissions: id-token: write +# jobs: cloud-tests: strategy: From bc483750383b4632598074535b6915e6d58c1f4a Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Mon, 15 Apr 2024 15:54:46 -0400 Subject: [PATCH 20/22] Fix cloud instance name conflict This would prevent the CI or multiple contributors to run tests with the same config --- docs/usage.rst | 4 ++++ milabench/cli/cloud.py | 8 ++++++-- milabench/common.py | 6 +++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 76aed5934..b2a25d85d 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -140,6 +140,10 @@ supported cloud profiles. username: ubuntu size: Standard_B2ats_v2 location: eastus2 + # state_prefix and state_id can be set to force a specific cloud + # instance id + # state_prefix: cloud-ci + # state_id: 849897_bivunaku Run milabench on the cloud diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index 150e37749..8d95a47d1 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -1,5 +1,6 @@ from copy import deepcopy import os +import socket import subprocess import sys @@ -8,6 +9,7 @@ import yaml from milabench.fs import XPath +from milabench.utils import blabla from ..common import get_multipack @@ -42,6 +44,8 @@ def manage_cloud(pack, run_on, action="setup"): plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on]) run_on, *profile = run_on.split("__") profile = profile[0] if profile else "" + default_state_prefix = profile or run_on + default_state_id = "_".join((pack.config["hash"][:6], blabla())) remote_base = XPath("/data") / pack.dirs.base.name local_base = pack.dirs.base.absolute().parent @@ -51,8 +55,8 @@ def manage_cloud(pack, run_on, action="setup"): if n["ip"] != "1.1.1.1": continue - plan_params["state_prefix"] = plan_params.get("state_prefix", None) or profile or run_on - plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"] + plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix) + plan_params["state_id"] = plan_params.get("state_id", default_state_id) plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1) import milabench.cli.covalent as cv diff --git a/milabench/common.py b/milabench/common.py index ff0388df2..1f22463b6 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -338,9 +338,9 @@ def _push_reports(reports_repo, runs): reports_repo = git.repo.base.Repo.clone_from(repo_url, str(reports_repo), branch="reports") reports_url = ([ - _r.url for _r in _repo.remotes if "mila-iqia" in _r.url + url for _r in _repo.remotes for url in _r.urls if "mila-iqia" in url ] or [ - _r.url for _r in _repo.remotes if _r.name == "origin" + url for _r in _repo.remotes for url in _r.urls if _r.name == "origin" ])[0] reports_url = XPath("github.com".join(reports_url.split("github.com")[1:])[1:]) reports_url = XPath("https://github.com") / f"{reports_url.with_suffix('')}/tree/{reports_repo.active_branch.name}" @@ -363,7 +363,7 @@ def _push_reports(reports_repo, runs): device = _meta["cpu"]["brand"].replace(" ", "_") break - build = "-".join([_repo.active_branch.name.replace(os.path.sep, "_"), next(_repo.iter_commits()).hexsha]) + build = meta[0]["milabench"]["tag"] reports_dir = XPath(reports_repo.working_tree_dir) / build run = XPath(run) From 86598c3ae286ddada95c4ab57361a3ccb64d3972 Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Tue, 16 Apr 2024 15:06:50 -0400 Subject: [PATCH 21/22] Fix github push in CI --- .github/workflows/cloud-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml index 7587c59ab..c19954c71 100644 --- a/.github/workflows/cloud-ci.yml +++ b/.github/workflows/cloud-ci.yml @@ -9,7 +9,6 @@ on: permissions: id-token: write -# jobs: cloud-tests: strategy: @@ -124,7 +123,7 @@ jobs: - name: Summary run: | - git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)" + # git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)" git config --global user.email "github-ci@example.com" git config --global user.name "GitHub CI" poetry run milabench report --push From 5b5700ca90ed3926d9cd4db1c897f54b356082eb Mon Sep 17 00:00:00 2001 From: Satya Ortiz-Gagne Date: Wed, 22 May 2024 14:13:20 -0400 Subject: [PATCH 22/22] Cleaner and tested azure plugin --- milabench/cli/cloud.py | 23 +- milabench/cli/covalent/__main__.py | 204 ------------------ milabench/common.py | 2 +- milabench/{cli => scripts}/badges/__main__.py | 0 .../{cli => scripts}/badges/requirements.txt | 0 milabench/scripts/covalent/__main__.py | 103 +++++++++ .../covalent/requirements.txt | 0 milabench/{cli => scripts}/utils.py | 0 8 files changed, 118 insertions(+), 214 deletions(-) delete mode 100644 milabench/cli/covalent/__main__.py rename milabench/{cli => scripts}/badges/__main__.py (100%) rename milabench/{cli => scripts}/badges/requirements.txt (100%) create mode 100644 milabench/scripts/covalent/__main__.py rename milabench/{cli => scripts}/covalent/requirements.txt (100%) rename milabench/{cli => scripts}/utils.py (100%) diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py index 8d95a47d1..16dd46d89 100644 --- a/milabench/cli/cloud.py +++ b/milabench/cli/cloud.py @@ -1,8 +1,8 @@ from copy import deepcopy import os -import socket import subprocess import sys +import warnings from coleo import Option, tooled from omegaconf import OmegaConf @@ -22,7 +22,7 @@ def _flatten_cli_args(**kwargs): return sum( ( - (f"--{str(k).replace('_', '-')}", *([str(v)] if str(v) else [])) + (f"--{str(k).replace('_', '-')}", *([str(v)] if v is not None else [])) for k, v in kwargs.items() ), () ) @@ -39,7 +39,7 @@ def manage_cloud(pack, run_on, action="setup"): "hostname":(lambda v: ("ip",v)), "username":(lambda v: ("user",v)), "ssh_key_file":(lambda v: ("key",v)), - "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])), + # "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])), } plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on]) run_on, *profile = run_on.split("__") @@ -58,8 +58,9 @@ def manage_cloud(pack, run_on, action="setup"): plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix) plan_params["state_id"] = plan_params.get("state_id", default_state_id) plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1) + plan_params["keep_alive"] = None - import milabench.cli.covalent as cv + import milabench.scripts.covalent as cv subprocess.run( [ @@ -106,12 +107,16 @@ def manage_cloud(pack, run_on, action="setup"): continue try: k, v = line_str.split("::>") - k, v = key_map[k](v) - if k == "ip" and n[k] != "1.1.1.1": - i, n = next(nodes) - n[k] = v except ValueError: - pass + continue + try: + k, v = key_map[k](v) + except KeyError: + warnings.warn(f"Ignoring invalid key received: {k}:{v}") + continue + if k == "ip" and n[k] != "1.1.1.1": + i, n = next(nodes) + n[k] = v _, stderr = p.communicate() stderr = stderr.decode("utf-8").strip() diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py deleted file mode 100644 index eb602ee27..000000000 --- a/milabench/cli/covalent/__main__.py +++ /dev/null @@ -1,204 +0,0 @@ -import argparse -import asyncio -import os -import pathlib -import subprocess -import sys -import tempfile - - -def serve(*argv): - return subprocess.run([ - "covalent", - *argv - ]).returncode - - -def _get_executor_kwargs(args): - return { - **{k:v for k,v in vars(args).items() if k not in ("setup", "teardown")}, - **{"action":k for k,v in vars(args).items() if k in ("setup", "teardown") and v}, - } - - -def executor(executor_cls, args, *argv): - import covalent as ct - - executor:ct.executor.BaseExecutor = executor_cls( - **_get_executor_kwargs(args), - ) - - def _popen(cmd, *args, _env=None, **kwargs): - _env = _env if _env is not None else {} - - for envvar in _env.keys(): - envvar_val = _env[envvar] - - if not envvar_val: - continue - - envvar_val = pathlib.Path(envvar_val).expanduser() - if str(envvar_val) != _env[envvar]: - _env[envvar] = str(envvar_val) - - if "MILABENCH_CONFIG_CONTENT" in _env: - _config_dir = pathlib.Path(_env["MILABENCH_CONFIG"]).parent - with tempfile.NamedTemporaryFile("wt", dir=str(_config_dir), suffix=".yaml", delete=False) as _f: - _f.write(_env["MILABENCH_CONFIG_CONTENT"]) - _env["MILABENCH_CONFIG"] = _f.name - - try: - cmd = (str(pathlib.Path(cmd[0]).expanduser()), *cmd[1:]) - except IndexError: - pass - - cwd = kwargs.pop("cwd", None) - if cwd is not None: - cwd = str(pathlib.Path(cwd).expanduser()) - kwargs["cwd"] = cwd - - _env = {**os.environ.copy(), **kwargs.pop("env", {}), **_env} - - kwargs = { - **kwargs, - "env": _env, - "stdout": subprocess.PIPE, - "stderr": subprocess.PIPE, - } - p = subprocess.Popen(cmd, *args, **kwargs) - - stdout_chunks = [] - while True: - line = p.stdout.readline() - if not line: - break - line_str = line.decode("utf-8").strip() - stdout_chunks.append(line_str) - print(line_str) - - _, stderr = p.communicate() - stderr = stderr.decode("utf-8").strip() - stdout = os.linesep.join(stdout_chunks) - - if p.returncode != 0: - raise subprocess.CalledProcessError( - p.returncode, - (cmd, args, kwargs), - stdout, - stderr - ) - return p.returncode, stdout, stderr - - @ct.lattice - def lattice(argv=(), deps_bash = None): - return ct.electron( - _popen, - executor=executor, - deps_bash=deps_bash, - )( - argv, - ) - - return_code = 0 - try: - dispatch_id = None - result = None - deps_bash = None - - if not argv and args.setup: - deps_bash = ct.DepsBash([]) - # Make sure pip is installed - argv = ["python3", "-m", "pip", "freeze"] - - if argv: - dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash) - result = ct.get_result(dispatch_id=dispatch_id, wait=True) - return_code, _, _ = result.result if result.result is not None else (1, "", "") - - if return_code == 0 and args.setup: - _executor:ct.executor.BaseExecutor = executor_cls( - **{ - **_get_executor_kwargs(args), - **{"action": "teardown"}, - } - ) - asyncio.run(_executor.setup({})) - - assert _executor.hostnames - for hostname in _executor.hostnames: - print(f"hostname::>{hostname}") - print(f"username::>{_executor.username}") - print(f"ssh_key_file::>{_executor.ssh_key_file}") - finally: - result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None - results_dir = result.results_dir if result else "" - if args.teardown: - try: - _executor:ct.executor.BaseExecutor = executor_cls( - **{ - **_get_executor_kwargs(args), - **{"action": "teardown"}, - } - ) - asyncio.run(_executor.setup({})) - asyncio.run( - _executor.teardown( - {"dispatch_id": dispatch_id, "node_id": 0, "results_dir": results_dir} - ) - ) - except FileNotFoundError: - pass - - return return_code - - -def main(argv=None): - if argv is None: - argv = sys.argv[1:] - - try: - import covalent as ct - except (KeyError, ImportError): - from ..utils import run_in_module_venv - check_if_module = "import covalent" - return run_in_module_venv(__file__, check_if_module, argv) - - parser = argparse.ArgumentParser() - subparsers = parser.add_subparsers() - subparser = subparsers.add_parser("serve") - subparser.add_argument(f"argv", nargs=argparse.REMAINDER) - for p in ("azure","ec2"): - try: - config = ct.get_config(f"executors.{p}") - except KeyError: - continue - subparser = subparsers.add_parser(p) - subparser.add_argument(f"--setup", action="store_true") - subparser.add_argument(f"--teardown", action="store_true") - for param, default in config.items(): - if param == "action": - continue - subparser.add_argument(f"--{param.replace('_', '-')}", default=default) - - try: - cv_argv, argv = argv[:argv.index("--")], argv[argv.index("--")+1:] - except ValueError: - cv_argv, argv = argv, [] - - args = parser.parse_args(cv_argv) - - if cv_argv[0] == "serve": - assert not argv - return serve(*args.argv) - elif cv_argv[0] == "azure": - executor_cls = ct.executor.AzureExecutor - elif cv_argv[0] == "ec2": - executor_cls = ct.executor.EC2Executor - else: - raise - - return executor(executor_cls, args, *argv) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/milabench/common.py b/milabench/common.py index 1f22463b6..ad56ac82b 100644 --- a/milabench/common.py +++ b/milabench/common.py @@ -328,7 +328,7 @@ def _push_reports(reports_repo, runs): "partial": "yellow", "failure": "red", } - import milabench.cli.badges as badges + import milabench.scripts.badges as badges _repo = git.repo.base.Repo(ROOT_FOLDER) try: diff --git a/milabench/cli/badges/__main__.py b/milabench/scripts/badges/__main__.py similarity index 100% rename from milabench/cli/badges/__main__.py rename to milabench/scripts/badges/__main__.py diff --git a/milabench/cli/badges/requirements.txt b/milabench/scripts/badges/requirements.txt similarity index 100% rename from milabench/cli/badges/requirements.txt rename to milabench/scripts/badges/requirements.txt diff --git a/milabench/scripts/covalent/__main__.py b/milabench/scripts/covalent/__main__.py new file mode 100644 index 000000000..3cd61e007 --- /dev/null +++ b/milabench/scripts/covalent/__main__.py @@ -0,0 +1,103 @@ +import argparse +import subprocess +import sys + + +def serve(*argv): + return subprocess.run([ + "covalent", + *argv + ]).returncode + + +def _get_executor_kwargs(args): + return { + **{k:v for k,v in vars(args).items() if k not in ("setup", "teardown")}, + } + + +def executor(executor_cls, args): + import covalent as ct + + return_code = 0 + try: + executor:ct.executor.BaseExecutor = executor_cls( + **_get_executor_kwargs(args), + ) + + if args.setup: + dispatch_id = ct.dispatch( + ct.lattice(executor.get_connection_attributes), disable_run=False + )() + + result = ct.get_result(dispatch_id=dispatch_id, wait=True).result + + assert result and result[0] + + all_connection_attributes, _ = result + for hostname, connection_attributes in all_connection_attributes.items(): + print(f"hostname::>{hostname}") + for attribute,value in connection_attributes.items(): + if attribute == "hostname": + continue + print(f"{attribute}::>{value}") + finally: + if args.teardown: + executor.stop_cloud_instance({}) + + return return_code + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + try: + import covalent as ct + except (KeyError, ImportError): + from ..utils import run_in_module_venv + check_if_module = "import covalent" + return run_in_module_venv(__file__, check_if_module, argv) + + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers() + subparser = subparsers.add_parser("serve") + subparser.add_argument(f"argv", nargs=argparse.REMAINDER) + for p in ("azure","ec2"): + try: + config = ct.get_config(f"executors.{p}") + except KeyError: + continue + subparser = subparsers.add_parser(p) + subparser.add_argument(f"--setup", action="store_true") + subparser.add_argument(f"--teardown", action="store_true") + for param, default in config.items(): + add_argument_kwargs = {} + if isinstance(default, bool): + add_argument_kwargs["action"] = "store_false" if default else "store_true" + else: + add_argument_kwargs["default"] = default + subparser.add_argument(f"--{param.replace('_', '-')}", **add_argument_kwargs) + + try: + cv_argv, argv = argv[:argv.index("--")], argv[argv.index("--")+1:] + except ValueError: + cv_argv, argv = argv, [] + + args = parser.parse_args(cv_argv) + + if cv_argv[0] == "serve": + assert not argv + return serve(*args.argv) + elif cv_argv[0] == "azure": + executor_cls = ct.executor.AzureExecutor + elif cv_argv[0] == "ec2": + executor_cls = ct.executor.EC2Executor + else: + raise + + return executor(executor_cls, args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/milabench/cli/covalent/requirements.txt b/milabench/scripts/covalent/requirements.txt similarity index 100% rename from milabench/cli/covalent/requirements.txt rename to milabench/scripts/covalent/requirements.txt diff --git a/milabench/cli/utils.py b/milabench/scripts/utils.py similarity index 100% rename from milabench/cli/utils.py rename to milabench/scripts/utils.py