From 53a99356ca664765ea836a0c7ec756a7ca8017e8 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Tue, 27 Feb 2024 09:57:14 -0500
Subject: [PATCH 01/22] Fix pip-tools

https://github.com/jazzband/pip-tools/pull/1906
---
 poetry.lock    | 61 ++++++++++++++++++++++++++++----------------------
 pyproject.toml |  2 +-
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 942bff025..d03ffe232 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "alabaster"
@@ -172,25 +172,26 @@ wcwidth = ">=0.1.4"
 
 [[package]]
 name = "build"
-version = "0.10.0"
+version = "1.1.1"
 description = "A simple, correct Python build frontend"
 optional = false
 python-versions = ">= 3.7"
 files = [
-    {file = "build-0.10.0-py3-none-any.whl", hash = "sha256:af266720050a66c893a6096a2f410989eeac74ff9a68ba194b3f6473e8e26171"},
-    {file = "build-0.10.0.tar.gz", hash = "sha256:d5b71264afdb5951d6704482aac78de887c80691c52b88a9ad195983ca2c9269"},
+    {file = "build-1.1.1-py3-none-any.whl", hash = "sha256:8ed0851ee76e6e38adce47e4bee3b51c771d86c64cf578d0c2245567ee200e73"},
+    {file = "build-1.1.1.tar.gz", hash = "sha256:8eea65bb45b1aac2e734ba2cc8dad3a6d97d97901a395bd0ed3e7b46953d2a31"},
 ]
 
 [package.dependencies]
 colorama = {version = "*", markers = "os_name == \"nt\""}
+importlib-metadata = {version = ">=4.6", markers = "python_full_version < \"3.10.2\""}
 packaging = ">=19.0"
 pyproject_hooks = "*"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
 
 [package.extras]
-docs = ["furo (>=2021.08.31)", "sphinx (>=4.0,<5.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)"]
-test = ["filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "toml (>=0.10.0)", "wheel (>=0.36.0)"]
-typing = ["importlib-metadata (>=5.1)", "mypy (==0.991)", "tomli", "typing-extensions (>=3.7.4.3)"]
+docs = ["furo (>=2023.08.17)", "sphinx (>=7.0,<8.0)", "sphinx-argparse-cli (>=1.5)", "sphinx-autodoc-typehints (>=1.10)", "sphinx-issues (>=3.0.0)"]
+test = ["filelock (>=3)", "pytest (>=6.2.4)", "pytest-cov (>=2.12)", "pytest-mock (>=2)", "pytest-rerunfailures (>=9.1)", "pytest-xdist (>=1.34)", "setuptools (>=42.0.0)", "setuptools (>=56.0.0)", "setuptools (>=56.0.0)", "setuptools (>=67.8.0)", "wheel (>=0.36.0)"]
+typing = ["importlib-metadata (>=5.1)", "mypy (>=1.5.0,<1.6.0)", "tomli", "typing-extensions (>=3.7.4.3)"]
 virtualenv = ["virtualenv (>=20.0.35)"]
 
 [[package]]
@@ -1082,25 +1083,27 @@ files = [
 
 [[package]]
 name = "pip-tools"
-version = "6.13.0"
+version = "7.4.1"
 description = "pip-tools keeps your pinned dependencies fresh."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "pip-tools-6.13.0.tar.gz", hash = "sha256:61d46bd2eb8016ed4a924e196e6e5b0a268cd3babd79e593048720db23522bb1"},
-    {file = "pip_tools-6.13.0-py3-none-any.whl", hash = "sha256:50943f151d87e752abddec8158622c34ad7f292e193836e90e30d87da60b19d9"},
+    {file = "pip-tools-7.4.1.tar.gz", hash = "sha256:864826f5073864450e24dbeeb85ce3920cdfb09848a3d69ebf537b521f14bcc9"},
+    {file = "pip_tools-7.4.1-py3-none-any.whl", hash = "sha256:4c690e5fbae2f21e87843e89c26191f0d9454f362d8acdbd695716493ec8b3a9"},
 ]
 
 [package.dependencies]
-build = "*"
+build = ">=1.0.0"
 click = ">=8"
 pip = ">=22.2"
+pyproject_hooks = "*"
 setuptools = "*"
+tomli = {version = "*", markers = "python_version < \"3.11\""}
 wheel = "*"
 
 [package.extras]
-coverage = ["pytest-cov"]
-testing = ["flit-core (>=2,<4)", "poetry-core (>=1.0.0)", "pytest (>=7.2.0)", "pytest-rerunfailures", "pytest-xdist"]
+coverage = ["covdefaults", "pytest-cov"]
+testing = ["flit_core (>=2,<4)", "poetry_core (>=1.0.0)", "pytest (>=7.2.0)", "pytest-rerunfailures", "pytest-xdist", "tomli-w"]
 
 [[package]]
 name = "platformdirs"
@@ -2034,22 +2037,26 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.3)", "coverage-enable-subprocess
 
 [[package]]
 name = "voir"
-version = "0.2.10"
+version = "0.2.12"
 description = "Instrument, extend and visualize your programs"
 optional = false
-python-versions = ">=3.7,<4.0"
-files = [
-    {file = "voir-0.2.10-py3-none-any.whl", hash = "sha256:70266f9cab487bb3b6f7aea90d724055f6a88824db37c326473f72cf40e93343"},
-    {file = "voir-0.2.10.tar.gz", hash = "sha256:352425923d7186941036f7c9909c8bf3ad13d10b633abd6dd3697609b3b6673b"},
-]
+python-versions = "^3.7"
+files = []
+develop = false
 
 [package.dependencies]
-giving = ">=0.4.2,<0.5.0"
-omegaconf = ">=2.3.0,<3.0.0"
-ovld = ">=0.3.2,<0.4.0"
-ptera = ">=1.4.1,<2.0.0"
-pynvml = ">=11.5.0,<12.0.0"
-rich = ">=13.3.2,<14.0.0"
+giving = "^0.4.2"
+omegaconf = "^2.3.0"
+ovld = "^0.3.2"
+ptera = "^1.4.1"
+pynvml = "^11.5.0"
+rich = "^13.3.2"
+
+[package.source]
+type = "git"
+url = "https://github.com/breuleux/voir"
+reference = "master"
+resolved_reference = "01caa92f5bc49c696dea3090eaee7c8f97e85f4f"
 
 [[package]]
 name = "wcwidth"
@@ -2094,4 +2101,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<4.0"
-content-hash = "0407b1f9e231b83ca25d848e4c21033a7016d5825c31a86ce075479b4b419fa8"
+content-hash = "dab3de344c4805f5071dec0f4b2b866f6b9d6bd2f16b1ac8d9b2df20b1184494"
diff --git a/pyproject.toml b/pyproject.toml
index 773986140..956c802b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ pandas = "^1.4.2"
 numpy = ">=1.23.0"
 pynvml = "^11.4.1"
 tqdm = "^4.64.1"
-pip-tools = "^6.12.3"
+pip-tools = "^7.0.0"
 rich = "^13.3.2"
 omegaconf = "^2.3.0"
 sqlalchemy = "^2.0.15"

From c3b9dbf1a011256cae78a9ecd10cfaa65bfdd39d Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Tue, 27 Feb 2024 09:57:30 -0500
Subject: [PATCH 02/22] Add covalent cloud aws ec2 infra and report --push

covalent is not compatible with milabench as it requires sqlalchemy<2.0.0
---
 .github/workflows/cloud-ci.yml            | 109 ++++++++++
 benchmarks/_template/requirements.cpu.txt |  46 +++++
 config/examples/ec2-system.yaml           |  19 ++
 config/test.yaml                          |  24 +++
 milabench/__init__.py                     |   5 +
 milabench/cli/__init__.py                 |   5 +
 milabench/cli/badges/__main__.py          |  45 +++++
 milabench/cli/badges/requirements.txt     |   1 +
 milabench/cli/cloud.py                    | 153 ++++++++++++++
 milabench/cli/covalent/__main__.py        | 233 ++++++++++++++++++++++
 milabench/cli/covalent/requirements.txt   |   2 +
 milabench/cli/report.py                   |  43 +++-
 milabench/commands/__init__.py            |  28 ++-
 milabench/common.py                       | 168 +++++++++++++++-
 milabench/config.py                       |  14 ++
 milabench/multi.py                        |  25 +++
 milabench/remote.py                       | 118 ++++++++++-
 17 files changed, 1011 insertions(+), 27 deletions(-)
 create mode 100644 .github/workflows/cloud-ci.yml
 create mode 100644 benchmarks/_template/requirements.cpu.txt
 create mode 100644 config/examples/ec2-system.yaml
 create mode 100644 config/test.yaml
 create mode 100644 milabench/cli/badges/__main__.py
 create mode 100644 milabench/cli/badges/requirements.txt
 create mode 100644 milabench/cli/cloud.py
 create mode 100644 milabench/cli/covalent/__main__.py
 create mode 100644 milabench/cli/covalent/requirements.txt

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
new file mode 100644
index 000000000..ab1211a9f
--- /dev/null
+++ b/.github/workflows/cloud-ci.yml
@@ -0,0 +1,109 @@
+name: tests
+
+on:
+  # Runs for pull requests
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  tests:
+    strategy:
+      fail-fast: true
+      matrix:
+        include:
+          - arch: cuda
+            exclude: "no-cuda"
+            run_on: ec2
+          # - arch: rocm
+          #   exclude : "no-rocm"
+
+    runs-on: ubuntu-latest
+    environment: test-cloud-ci
+
+    # Cancel previous jobs if a new version was pushed
+    concurrency:
+      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.run_on }}"
+      cancel-in-progress: true
+
+    defaults:
+      run:
+        shell: bash -el {0}
+
+    env:
+      MILABENCH_CONFIG: "config/test.yaml"
+      MILABENCH_SYSTEM: "config/examples/${{ matrix.run_on }}-system.yaml"
+      MILABENCH_BASE: "output"
+      MILABENCH_ARGS: ""
+      MILABENCH_GPU_ARCH: "${{ matrix.arch }}"
+      MILABENCH_DASH: "no"
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          token: ${{ secrets.REPORTS_PAT }}
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.9
+
+      - name: dependencies
+        run: |
+          python -m pip install -U pip
+          python -m pip install -U poetry
+          poetry lock --no-update
+          poetry install
+
+      - name: setup cloud credentials
+        run: |
+          mkdir -p ~/.aws
+          mkdir -p ~/.ssh/covalent
+          echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
+          echo "[default]" >~/.aws/credentials
+          echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials
+          echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials
+          chmod -R a-rwx,u+rwX ~/.aws ~/.ssh
+
+      - name: setup cloud
+        run: |
+          _system=$(
+          poetry run milabench cloud \
+            --setup \
+            --run-on ${{ matrix.run_on }}
+          )
+          { read _hash ; }< <(
+          echo -n "$_system" | while read l
+          do
+              if [[ "$l" == "# hash::>"* ]]
+              then
+                  echo -n "${l#*::>}"
+              fi
+          done
+          echo
+          )
+          if [[ -z "${_hash}" ]]
+          then
+            >&2 echo "Failed to fetch system config hash"
+            exit 1
+          fi
+          echo -n "$_system" >$MILABENCH_SYSTEM.$_hash
+          echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$_hash" >>$GITHUB_ENV
+
+      - name: install benchmarks
+        run: |
+          poetry run milabench install
+
+      - name: prepare benchmarks
+        run: |
+          poetry run milabench prepare
+
+      - name: run benchmarks
+        run: |
+          poetry run milabench run
+
+      - name: Summary
+        run: |
+          git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)"
+          git config --global user.email "github-ci@example.com"
+          git config --global user.name "GitHub CI"
+          poetry run milabench report --push
diff --git a/benchmarks/_template/requirements.cpu.txt b/benchmarks/_template/requirements.cpu.txt
new file mode 100644
index 000000000..e0058b822
--- /dev/null
+++ b/benchmarks/_template/requirements.cpu.txt
@@ -0,0 +1,46 @@
+#
+# This file is autogenerated by pip-compile with Python 3.10
+# by the following command:
+#
+#    pip-compile --output-file=benchmarks/_template/requirements.cpu.txt benchmarks/_template/requirements.in
+#
+antlr4-python3-runtime==4.9.3
+    # via omegaconf
+asttokens==2.4.1
+    # via giving
+codefind==0.1.3
+    # via ptera
+executing==1.2.0
+    # via varname
+giving==0.4.2
+    # via
+    #   ptera
+    #   voir
+markdown-it-py==3.0.0
+    # via rich
+mdurl==0.1.2
+    # via markdown-it-py
+omegaconf==2.3.0
+    # via voir
+ovld==0.3.2
+    # via voir
+ptera==1.4.1
+    # via voir
+pygments==2.17.2
+    # via rich
+pynvml==11.5.0
+    # via voir
+pyyaml==6.0.1
+    # via omegaconf
+reactivex==4.0.4
+    # via giving
+rich==13.7.0
+    # via voir
+six==1.16.0
+    # via asttokens
+typing-extensions==4.10.0
+    # via reactivex
+varname==0.10.0
+    # via giving
+voir==0.2.12
+    # via -r benchmarks/_template/requirements.in
diff --git a/config/examples/ec2-system.yaml b/config/examples/ec2-system.yaml
new file mode 100644
index 000000000..dab1a7a4e
--- /dev/null
+++ b/config/examples/ec2-system.yaml
@@ -0,0 +1,19 @@
+system:
+  # Nodes list
+  nodes:
+      # Alias used to reference the node
+    - name: manager
+      # Use 1.1.1.1 as an ip placeholder
+      ip: 1.1.1.1
+      # Use this node as the master node or not
+      main: true
+      # User to use in remote milabench operations
+      user: user
+
+  # Cloud instances profiles
+  cloud_profiles:
+    ec2:
+      username: ubuntu
+      instance_type: t2.micro
+      volume_size: 8
+      region: us-east-2
diff --git a/config/test.yaml b/config/test.yaml
new file mode 100644
index 000000000..060949e40
--- /dev/null
+++ b/config/test.yaml
@@ -0,0 +1,24 @@
+_defaults:
+  max_duration: 600
+  voir:
+    options:
+      stop: 60
+      interval: "1s"
+
+test:
+  inherits: _defaults
+  group: test_remote
+  install_group: test_remote
+  definition: ../benchmarks/_template
+  plan:
+    method: njobs
+    n: 1
+
+testing:
+  inherits: _defaults
+  definition: ../benchmarks/_template
+  group: test_remote_2
+  install_group: test_remote_2
+  plan:
+    method: njobs
+    n: 1
diff --git a/milabench/__init__.py b/milabench/__init__.py
index e69de29bb..ac33e6bb3 100644
--- a/milabench/__init__.py
+++ b/milabench/__init__.py
@@ -0,0 +1,5 @@
+import pathlib
+
+ROOT_FOLDER = pathlib.Path(__file__).resolve().parent.parent
+CONFIG_FOLDER = ROOT_FOLDER / "config"
+BENCHMARK_FOLDER = ROOT_FOLDER / "benchmarks"
diff --git a/milabench/cli/__init__.py b/milabench/cli/__init__.py
index f0eea8d1e..401179944 100644
--- a/milabench/cli/__init__.py
+++ b/milabench/cli/__init__.py
@@ -3,6 +3,7 @@
 
 from coleo import run_cli
 
+from .cloud import cli_cloud
 from .compare import cli_compare
 from .dev import cli_dev
 from .install import cli_install
@@ -37,6 +38,10 @@ def pin():
         """Pin the benchmarks' dependencies."""
         cli_pin()
 
+    def cloud():
+        """Setup cloud instances."""
+        cli_cloud()
+
     def dev():
         """Create a shell in a benchmark's environment for development."""
         cli_dev()
diff --git a/milabench/cli/badges/__main__.py b/milabench/cli/badges/__main__.py
new file mode 100644
index 000000000..027a59a4b
--- /dev/null
+++ b/milabench/cli/badges/__main__.py
@@ -0,0 +1,45 @@
+import pathlib
+import subprocess
+import sys
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv[1:]
+
+    try:
+        import pybadges as _
+    except ImportError:
+        module = pathlib.Path(__file__).resolve().parent
+        cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
+        python3 = str(cache_dir / "bin/python3")
+        check_module = "import pybadges"
+        try:
+            subprocess.run([python3, "-c", check_module], check=True)
+        except (FileNotFoundError, subprocess.CalledProcessError):
+            cache_dir.mkdir(parents=True, exist_ok=True)
+            subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True)
+            subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True)
+            subprocess.run([
+                python3,
+                "-m",
+                "pip",
+                "install",
+                "-r",
+                str(module / "requirements.txt")
+            ], check=True)
+            subprocess.run([python3, "-c", check_module], check=True)
+        return subprocess.call(
+            [python3, __file__, *argv],
+        )
+    
+    return subprocess.run([
+        sys.executable,
+        "-m",
+        "pybadges",
+        *argv
+    ], check=True).returncode
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/milabench/cli/badges/requirements.txt b/milabench/cli/badges/requirements.txt
new file mode 100644
index 000000000..26620981a
--- /dev/null
+++ b/milabench/cli/badges/requirements.txt
@@ -0,0 +1 @@
+pybadges
\ No newline at end of file
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
new file mode 100644
index 000000000..c0f9c9bcb
--- /dev/null
+++ b/milabench/cli/cloud.py
@@ -0,0 +1,153 @@
+from copy import deepcopy
+import os
+import subprocess
+import sys
+
+from coleo import Option, tooled
+import yaml
+
+# import milabench as mb
+from ..common import get_multipack
+
+
+_SETUP = "setup"
+_TEARDOWN = "teardown"
+_LIST = "list"
+_ACTIONS = (_SETUP, _TEARDOWN, _LIST)
+
+
+def manage_cloud(pack, packs, run_on, action="setup"):
+    """Run a cloud `action` for each placeholder node of the system config.
+
+    Nodes whose ip is the "1.1.1.1" placeholder are handled by running the
+    `milabench.cli.covalent` module as a subprocess. Every `key::>value` line
+    it prints on stdout is translated through `key_map` and stored on the
+    current node. Returns `pack.config["system"]`, which is updated in place.
+    Raises subprocess.CalledProcessError if a subprocess exits non-zero.
+    """
+    assert run_on in pack.config["system"]["cloud_profiles"]
+
+    key_map = {
+        "hostname":(lambda v: ("ip",v)),
+        "username":(lambda v: ("user",v)),
+        "ssh_key_file":(lambda v: ("key",v)),
+        "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
+    }
+    plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
+
+    nodes = iter(enumerate(pack.config["system"]["nodes"]))
+
+    state_prefix = []
+    for p in packs.values():
+        state_prefix.append(p.config["name"])
+        state_prefix.append(p.config["install_variant"])
+
+    while True:
+        try:
+            i, n = next(nodes)
+            if n["ip"] != "1.1.1.1":
+                continue
+        except StopIteration:
+            break
+
+        plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), *state_prefix])
+        plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"]
+
+        import milabench.cli.covalent as cv
+
+        # Start the covalent server, redirecting its stdout to our stderr.
+        subprocess.run(
+            [sys.executable, "-m", cv.__name__, "serve", "start"],
+            stdout=sys.stderr,
+            check=True,
+        )
+
+        cmd = [
+            sys.executable,
+            "-m", cv.__name__,
+            run_on,
+            f"--{action}",
+            *[
+                f"--{k.replace('_', '-')}={v}"
+                for k, v in plan_params.items()
+            ],
+        ]
+        p = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+        stdout_chunks = []
+        while True:
+            line = p.stdout.readline()
+            if not line:
+                break
+            line_str = line.decode("utf-8").strip()
+            stdout_chunks.append(line_str)
+            print(line_str, file=sys.stderr)
+
+            if not line_str:
+                continue
+            try:
+                k, v = line_str.split("::>")
+                k, v = key_map[k](v)
+            except (ValueError, KeyError):  # not a `key::>value` line we map
+                continue
+            if k == "ip" and n[k] != "1.1.1.1":
+                i, n = next(nodes)
+            n[k] = v
+
+        _, stderr = p.communicate()
+        stderr = stderr.decode("utf-8").strip()
+        print(stderr, file=sys.stderr)
+
+        if p.returncode != 0:
+            stdout = os.linesep.join(stdout_chunks)
+            raise subprocess.CalledProcessError(p.returncode, cmd, stdout, stderr)
+
+    return pack.config["system"]
+
+
+@tooled
+def _setup():
+    """Setup a cloud infrastructure"""
+
+    # Setup cloud on target infra
+    run_on: Option & str
+
+    mp = get_multipack()
+    setup_pack = mp.setup_pack()
+    system_config = manage_cloud(setup_pack, mp.packs, run_on, action=_SETUP)
+
+    print(f"# hash::>{setup_pack.config['hash']}")
+    print(yaml.dump({"system": system_config}))
+
+
+@tooled
+def _teardown():
+    """Teardown a cloud infrastructure"""
+
+    # Setup cloud on target infra
+    run_on: Option & str
+
+    mp = get_multipack()
+    setup_pack = mp.setup_pack()
+    manage_cloud(setup_pack, mp.packs, run_on, action=_TEARDOWN)
+
+
+@tooled
+def cli_cloud():
+    """Manage cloud instances."""
+
+    # Setup a cloud infrastructure
+    setup: Option & bool = False
+    # Teardown a cloud infrastructure
+    teardown: Option & bool = False
+
+    assert any((setup, teardown)) and not all((setup, teardown))
+
+    if setup:
+        _setup()
+    elif teardown:
+        _teardown()
diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py
new file mode 100644
index 000000000..cf5ff8537
--- /dev/null
+++ b/milabench/cli/covalent/__main__.py
@@ -0,0 +1,233 @@
+import argparse
+import asyncio
+import os
+import pathlib
+import subprocess
+import sys
+import tempfile
+
+
+def serve(*argv):
+    return subprocess.run([
+        str(pathlib.Path(sys.executable).with_name("covalent")),
+        *argv
+    ]).returncode
+
+
+def _get_executor_kwargs(args):
+    return {
+        **{k:v for k,v in vars(args).items() if k not in ("setup", "teardown")},
+        **{"action":k for k,v in vars(args).items() if k in ("setup", "teardown") and v},
+    }
+
+
+def executor(executor_cls, args, *argv):
+    import covalent as ct
+
+    executor:ct.executor.BaseExecutor = executor_cls(
+        **_get_executor_kwargs(args),
+    )
+
+    def _popen(cmd, *args, _env=None, **kwargs):
+        _env = _env if _env is not None else {}
+
+        for envvar in _env.keys():
+            envvar_val = _env[envvar]
+
+            if not envvar_val:
+                continue
+
+            envvar_val = pathlib.Path(envvar_val).expanduser()
+            if str(envvar_val) != _env[envvar]:
+                _env[envvar] = str(envvar_val)
+
+        if "MILABENCH_CONFIG_CONTENT" in _env:
+            _config_dir = pathlib.Path(_env["MILABENCH_CONFIG"]).parent
+            with tempfile.NamedTemporaryFile("wt", dir=str(_config_dir), suffix=".yaml", delete=False) as _f:
+                _f.write(_env["MILABENCH_CONFIG_CONTENT"])
+                _env["MILABENCH_CONFIG"] = _f.name
+
+        try:
+            cmd = (str(pathlib.Path(cmd[0]).expanduser()), *cmd[1:])
+        except IndexError:
+            pass
+
+        cwd = kwargs.pop("cwd", None)
+        if cwd is not None:
+            cwd = str(pathlib.Path(cwd).expanduser())
+            kwargs["cwd"] = cwd
+
+        _env = {**os.environ.copy(), **kwargs.pop("env", {}), **_env}
+
+        kwargs = {
+            **kwargs,
+            "env": _env,
+            "stdout": subprocess.PIPE,
+            "stderr": subprocess.PIPE,
+        }
+        p = subprocess.Popen(cmd, *args, **kwargs)
+
+        stdout_chunks = []
+        while True:
+            line = p.stdout.readline()
+            if not line:
+                break
+            line_str = line.decode("utf-8").strip()
+            stdout_chunks.append(line_str)
+            print(line_str)
+
+        _, stderr = p.communicate()
+        stderr = stderr.decode("utf-8").strip()
+        stdout = os.linesep.join(stdout_chunks)
+
+        if p.returncode != 0:
+            raise subprocess.CalledProcessError(
+                p.returncode,
+                (cmd, args, kwargs),
+                stdout,
+                stderr
+            )
+        return p.returncode, stdout, stderr
+
+    @ct.lattice
+    def lattice(argv=(), deps_bash = None):
+        return ct.electron(
+            _popen,
+            executor=executor,
+            deps_bash=deps_bash,
+        )(
+            argv,
+        )
+
+    return_code = 0
+    try:
+        dispatch_id = None
+        result = None
+        deps_bash = None
+
+        if not argv and args.setup:
+            conda_prefix = "eval \"$(conda shell.bash hook)\""
+            conda_activate = "conda activate milabench"
+            deps_bash = []
+            for _cmd in (
+                f"{conda_activate} || conda create -n milabench -y",
+                f"{conda_activate}"
+                f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
+                f" || >&2 echo First attempt to install python in milabench env failed",
+                f"{conda_activate}"
+                f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
+                f" || conda remove -n milabench --all -y",
+            ):
+                deps_bash.append(f"{conda_prefix} && ({_cmd})")
+            deps_bash = ct.DepsBash(deps_bash)
+            argv = ["conda", "env", "list"]
+
+        if argv:
+            dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash)
+            result = ct.get_result(dispatch_id=dispatch_id, wait=True)
+            return_code, stdout, _ = result.result if result.result is not None else (1, "", "")
+
+        if return_code == 0 and args.setup:
+            assert any([l for l in stdout.split("\n") if l.startswith("milabench ")])
+            _executor:ct.executor.BaseExecutor = executor_cls(
+                **{
+                    **_get_executor_kwargs(args),
+                    **{"action": "teardown"},
+                }
+            )
+            asyncio.run(_executor.setup({}))
+
+            assert _executor.hostname
+            print(f"hostname::>{_executor.hostname}")
+            print(f"username::>{_executor.username}")
+            print(f"ssh_key_file::>{_executor.ssh_key_file}")
+            print(f"env::>~/.condaenvrc")
+    finally:
+        result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
+        results_dir = result.results_dir if result else ""
+        if args.teardown:
+            try:
+                _executor:ct.executor.BaseExecutor = executor_cls(
+                    **{
+                        **_get_executor_kwargs(args),
+                        **{"action": "teardown"},
+                    }
+                )
+                asyncio.run(_executor.setup({}))
+                asyncio.run(
+                    _executor.teardown(
+                        {"dispatch_id": dispatch_id, "node_id": 0, "results_dir": results_dir}
+                    )
+                )
+            except FileNotFoundError:
+                pass
+
+    return return_code
+
+
+def main(argv=None):
+    """Run `serve <covalent args>` or `ec2 [options] [-- command]`.
+
+    If covalent or its ec2 executor config is missing, re-run this file with
+    a virtualenv under /tmp/milabench (created on demand). Returns an exit code.
+    """
+    if argv is None:
+        argv = sys.argv[1:]
+
+    try:
+        import covalent as ct
+        ct.get_config("executors.ec2")
+    except (KeyError, ImportError):
+        module = pathlib.Path(__file__).resolve().parent
+        cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
+        python3 = str(cache_dir / "bin/python3")
+        check_module = "import covalent ; from covalent.executor import EC2Executor"
+        try:
+            subprocess.run([python3, "-c", check_module], check=True)
+        except (FileNotFoundError, subprocess.CalledProcessError):
+            cache_dir.mkdir(parents=True, exist_ok=True)
+            subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True)
+            subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True)
+            subprocess.run(
+                [python3, "-m", "pip", "install", "-r", str(module / "requirements.txt")],
+                check=True,
+            )
+            subprocess.run([python3, "-c", check_module], check=True)
+        return subprocess.call([python3, __file__, *argv])
+
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers()
+    subparser = subparsers.add_parser("serve")
+    subparser.add_argument("argv", nargs=argparse.REMAINDER)
+    for p in ("ec2",):
+        try:
+            config = ct.get_config(f"executors.{p}")
+        except KeyError:
+            continue
+        subparser = subparsers.add_parser(p)
+        subparser.add_argument("--setup", action="store_true")
+        subparser.add_argument("--teardown", action="store_true")
+        for param, default in config.items():
+            if param == "action":
+                continue
+            subparser.add_argument(f"--{param.replace('_', '-')}", default=default)
+
+    # Arguments after a literal "--" are forwarded to `executor` separately.
+    try:
+        cv_argv, argv = argv[:argv.index("--")], argv[argv.index("--")+1:]
+    except ValueError:
+        cv_argv, argv = argv, []
+
+    args = parser.parse_args(cv_argv)
+
+    command = cv_argv[0] if cv_argv else None
+    if command == "serve":
+        assert not argv
+        return serve(*args.argv)
+    if command == "ec2":
+        return executor(ct.executor.EC2Executor, args, *argv)
+    parser.error("expected one of the `serve` or `ec2` commands")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt
new file mode 100644
index 000000000..f810e6eaf
--- /dev/null
+++ b/milabench/cli/covalent/requirements.txt
@@ -0,0 +1,2 @@
+covalent
+covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench
\ No newline at end of file
diff --git a/milabench/cli/report.py b/milabench/cli/report.py
index cbad44223..b14b49528 100644
--- a/milabench/cli/report.py
+++ b/milabench/cli/report.py
@@ -1,10 +1,12 @@
+import glob
 import os
 import sys
 from dataclasses import dataclass, field
 
 from coleo import Option, config as configuration, tooled
 
-from ..common import Option, _error_report, _get_multipack, _read_reports
+from ..common import Option, _error_report, _get_multipack, _push_reports, _read_reports
+from ..fs import XPath
 from ..report import make_report
 from ..summary import make_summary
 
@@ -12,12 +14,13 @@
 # fmt: off
 @dataclass
 class Arguments:
-    runs:        list = field(default_factory=list)
+    runs        : list = field(default_factory=list)
     config      : str = os.environ.get("MILABENCH_CONFIG", None)
     compare     : str = None
     compare_gpus: bool = False
     html        : str = None
     price       : int = None
+    push        : bool = False
 # fmt: on
 
 
@@ -42,7 +45,10 @@ def arguments():
     # Price per unit
     price: Option & int = None
 
-    return Arguments(runs, config, compare, compare_gpus, html, price)
+    # Push reports to repo
+    push: Option & bool = False
+
+    return Arguments(runs, config, compare, compare_gpus, html, price, push)
 
 
 @tooled
@@ -68,11 +74,6 @@ def cli_report(args=None):
     # ------
     # 1 errors, details in HTML report.
 
-    reports = None
-    if args.runs:
-        reports = _read_reports(*args.runs)
-        summary = make_summary(reports.values())
-
     if args.config:
         from milabench.common import arguments as multipack_args
 
@@ -81,6 +82,25 @@ def cli_report(args=None):
 
         args.config = _get_multipack(margs, return_config=True)
 
+    assert args.config or not args.push, "--push requires a config"
+
+    if not args.runs and args.config:
+        # Default to the packs' run folders, skipping `install.*`/`prepare.*`.
+        run_dirs = {XPath(pack_config["dirs"]["runs"]) for pack_config in args.config.values()}
+        skipped = ("install.", "prepare.")
+        args.runs = sorted(
+            {
+                _r for _rd in run_dirs for _r in glob.glob(str(_rd / "*.*.*/"))
+                if not XPath(_r).name.startswith(skipped)
+            },
+            key=lambda _p: XPath(_p).name.split(".")[-2:]
+        )
+
+    reports = None
+    if args.runs:
+        reports = _read_reports(*args.runs)
+        summary = make_summary(reports.values())
+
     make_report(
         summary,
         compare=args.compare,
@@ -93,3 +113,10 @@ def cli_report(args=None):
         errdata=reports and _error_report(reports),
         stream=sys.stdout,
     )
+
+    if reports and args.push:  # `reports` stays None when there are no runs
+        reports_repo = next(
+            XPath(pack_config["dirs"]["base"]) / "reports"
+            for pack_config in args.config.values()
+        )
+        _push_reports(reports_repo, args.runs, summary)
diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index 30ce3ffa8..00284208d 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -399,6 +399,11 @@ def is_local(self):
             == localnode["hostname"]  # The hostname is the local node
         )
 
+    def _load_env(self, node):
+        if node.get("env", None):
+            return node["env"]
+        return []
+
     def _argv(self, **kwargs) -> List:
         # No-op when executing on a local node
         if self.is_local():
@@ -410,13 +415,14 @@ def _argv(self, **kwargs) -> List:
         host = f"{user}@{self.host}" if user else self.host
 
         argv = super()._argv(**kwargs)
-        argv.extend(["-oPasswordAuthentication=no"])
-        argv.extend(["-p", str(self.port)])
-
         if key:
-            argv.append(f"-i{key}")
+            # scp apparently needs `-i` to be first
+            argv.insert(1, f"-i{key}")
+        argv.append(f"-p{self.port}")
         argv.append(host)
 
+        argv.extend(self._load_env(node))
+
         return argv
 
 
@@ -427,21 +433,27 @@ def __init__(
         self,
         pack: pack.BasePackage,
         host: str,
-        directory: str,
+        src: str,
         *scp_argv,
+        dest: str = None,
         user: str = None,
         key: str = None,
         **kwargs,
     ) -> None:
         super().__init__(pack, host, "-r", *scp_argv, user=user, key=key, **kwargs)
-        self.dir = directory
+        self.src = src
+        self.dest = dest if dest is not None else self.src
+
+    def _load_env(self, node):
+        del node
+        return []
 
     def _argv(self, **kwargs) -> List:
         argv = super()._argv(**kwargs)
 
         host = argv.pop()
-        argv.append(self.dir)
-        argv.append(f"{host}:{self.dir}")
+        argv.append(self.src)
+        argv.append(f"{host}:{self.dest}")
 
         return argv
 
diff --git a/milabench/common.py b/milabench/common.py
index 35f9cf125..01a8976a0 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -1,16 +1,21 @@
+from copy import deepcopy
 import io
 import json
 import os
 import re
 import runpy
+import subprocess
 import sys
 import traceback
 from dataclasses import dataclass, field
 from datetime import datetime
 
 from coleo import Option, default, tooled
+import git
 from omegaconf import OmegaConf
 from voir.instruments.gpu import deduce_backend, select_backend
+import yaml
+from milabench import ROOT_FOLDER
 
 from milabench.alt_async import proceed
 from milabench.utils import available_layers, blabla, multilogger
@@ -194,6 +199,13 @@ def _get_multipack(
     if args.config is None:
         sys.exit("Error: CONFIG argument not provided and no $MILABENCH_CONFIG")
 
+    if args.system is None:
+        args.system = os.environ.get("MILABENCH_SYSTEM", None)
+
+    if args.system is None:
+        if XPath(f"{args.config}.system").exists():
+            args.system = f"{args.config}.system"
+
     if args.select:
         args.select = set(args.select.split(","))
 
@@ -255,7 +267,7 @@ def is_selected(defn):
         return selected_config
     else:
         return MultiPackage(
-            {name: get_pack(defn) for name, defn in selected_config.items()}
+            {name: get_pack(deepcopy(defn)) for name, defn in selected_config.items()}
         )
 
 
@@ -296,6 +308,160 @@ def _read_reports(*runs):
     return all_data
 
 
+def _find_metas(reports):
+    local_meta = next(iter(e for _r in reports for e in _r if e["event"] == "meta"), None)
+    if local_meta:
+        local_meta = local_meta["data"]
+    remote_metas = []
+    for _r in reports:
+        meta_lines = []
+        for event in _r:
+            _, event_type, line = None, "", []
+
+            try:
+                _, event_type, *line = event["data"].split(" ")
+            except (AttributeError, ValueError):
+                pass
+
+            if event_type[:1] + event_type[-1:] != "[]":
+                event_type = None
+                line = event["data"]
+            else:
+                line = " ".join(line)
+
+            if event_type == "[meta]":
+                meta_lines.append(line)
+            elif event_type is None and meta_lines:
+                meta_lines.append(line)
+            elif meta_lines:
+                remote_metas.append(yaml.safe_load("".join(meta_lines)))
+                meta_lines = []
+
+    return local_meta, remote_metas
+
+
+def _filter_reports(*reports):
+    all_reports = []
+
+    for report in reports:
+        config = next(iter(e for e in report if e["event"] == "config"), None)
+        if config is None:
+            continue
+
+        if config["data"]["name"] != "remote":
+            all_reports.append(report)
+
+    return all_reports
+
+
+def _push_reports(reports_repo, runs, packs:dict=None):
+    _SVG_COLORS = {
+        "pass": "blue",
+        "partial": "yellow",
+        "failure": "red",
+    }
+    import milabench.cli.badges as badges
+
+    _repo = git.repo.base.Repo(ROOT_FOLDER)
+    try:
+        reports_repo = git.repo.base.Repo(str(reports_repo))
+    except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError):
+        repo_url = next(iter(_r.url for _r in _repo.remotes if _r.name == "origin"), None)
+        reports_repo = git.repo.base.Repo.clone_from(repo_url, str(reports_repo), branch="reports")
+
+    reports_url = ([
+        _r.url for _r in _repo.remotes if "mila-iqia" in _r.url
+    ] or [
+        _r.url for _r in _repo.remotes if _r.name == "origin"
+    ])[0]
+    reports_url = XPath("github.com".join(reports_url.split("github.com")[1:])[1:])
+    reports_url = XPath("https://github.com") / f"{reports_url.with_suffix('')}/tree/{reports_repo.active_branch.name}"
+
+    device_reports = {}
+    for run in runs:
+        reports = list(_read_reports(run).values())
+        reports = _filter_reports(*reports)
+
+        if not reports:
+            continue
+
+        meta = [e["data"] for _r in reports for e in _r if e["event"] == "meta"]
+
+        for _meta in meta:
+            for gpu in _meta["accelerators"]["gpus"].values():
+                device = gpu["product"].replace(" ", "_")
+                break
+        else:
+            for _meta in meta:
+                device = _meta["cpu"]["brand"].replace(" ", "_")
+                break
+
+        tag = ([
+            t.name
+            for t in _repo.tags
+            if meta[0]["milabench"]["tag"].startswith(t.name)
+        ] or [meta[0]["milabench"]["tag"]])[0]
+        reports_dir = XPath(reports_repo.working_tree_dir) / tag
+
+        run = XPath(run)
+        try:
+            run.copy(reports_dir / device / run.name)
+        except FileExistsError:
+            pass
+
+        device_reports.setdefault((device, tag), set())
+        device_reports[(device, tag)].update(
+            (reports_dir / device).glob("*/")
+        )
+
+    for (device, tag), reports in device_reports.items():
+        reports_dir = XPath(reports_repo.working_tree_dir) / tag
+        reports = _read_reports(*reports)
+        reports = _filter_reports(*reports.values())
+        summary = make_summary(reports)
+
+        successes = [s["successes"] for s in summary.values()]
+        failures = [s["failures"] for s in summary.values()]
+
+        if sum(successes) == 0:
+            text = "failure"
+        elif any(failures):
+            text = "partial"
+        else:
+            text = "pass"
+
+        result = subprocess.run(
+            [
+                sys.executable,
+                "-m", badges.__name__,
+                "--left-text", device,
+                "--right-text", text,
+                "--right-color", _SVG_COLORS[text],
+                "--whole-link", str(reports_url / tag / device)
+            ],
+            capture_output=True
+        )
+        if result.returncode == 0:
+            (reports_dir / device / "badge.svg").write_text(result.stdout.decode("utf8"))
+
+        with open(str(reports_dir / device / "README.md"), "wt") as _f:
+            _f.write("```\n")
+            make_report(summary, stream=_f)
+            _f.write("```\n")
+
+        for cmd, _kwargs in (
+            (["git", "pull"], {"check": True}),
+            (["git", "add", tag], {"check": True}),
+            (["git", "commit", "-m", tag], {"check": False}),
+            (["git", "push"], {"check": True})
+        ):
+            subprocess.run(
+                cmd,
+                cwd=reports_repo.working_tree_dir,
+                **_kwargs
+            )
+
+
 def _error_report(reports):
     out = {}
     for r, data in reports.items():
diff --git a/milabench/config.py b/milabench/config.py
index bfee806e7..e276cb17c 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -1,6 +1,8 @@
 import contextvars
+import hashlib
 import os
 import socket
+from copy import deepcopy
 
 import psutil
 import yaml
@@ -57,6 +59,16 @@ def resolve_inheritance(bench_config, all_configs):
     return bench_config
 
 
+def compute_config_hash(config):
+    config = deepcopy(config)
+    for entry in config:
+        config[entry]["dirs"] = {}
+        config[entry]["config_base"] = ""
+        config[entry]["config_file"] = ""
+        config[entry]["run_name"] = ""
+    return hashlib.md5(str(config).encode("utf8")).hexdigest()
+
+
 def finalize_config(name, bench_config):
     bench_config["name"] = name
     if "definition" in bench_config:
@@ -76,6 +88,8 @@ def build_config(*config_files):
     for layer in _config_layers(config_files):
         all_configs = merge(all_configs, layer)
 
+    all_configs["*"]["hash"] = compute_config_hash(all_configs)
+
     for name, bench_config in all_configs.items():
         all_configs[name] = resolve_inheritance(bench_config, all_configs)
 
diff --git a/milabench/multi.py b/milabench/multi.py
index 9946a3642..4a6cbd58a 100644
--- a/milabench/multi.py
+++ b/milabench/multi.py
@@ -13,6 +13,7 @@
     is_main_local,
     is_multinode,
     is_remote,
+    milabench_remote_config,
     milabench_remote_install,
     milabench_remote_prepare,
     milabench_remote_run,
@@ -84,7 +85,10 @@ def setup_pack(self) -> Package:
                 "dirs": pack.config["dirs"],
                 "config_base": pack.config["config_base"],
                 "config_file": pack.config["config_file"],
+                "plan": pack.config["plan"],
                 "system": pack.config["system"],
+                "hash": pack.config["hash"],
+                "install_variant": pack.config["install_variant"],
             }
         )
 
@@ -121,6 +125,13 @@ async def do_install(self):
         remote_task = None
 
         if is_remote(setup):
+            await asyncio.wait(
+                [
+                    asyncio.create_task(t.execute())
+                    for t in milabench_remote_config(setup, self.packs)
+                ]
+            )
+
             # We are outside system, setup the main node first
             remote_plan = milabench_remote_install(setup, setup_for="main")
             remote_task = asyncio.create_task(remote_plan.execute())
@@ -142,6 +153,13 @@ async def do_prepare(self):
         remote_task = None
 
         if is_remote(setup):
+            await asyncio.wait(
+                [
+                    asyncio.create_task(t.execute())
+                    for t in milabench_remote_config(setup, self.packs)
+                ]
+            )
+
             remote_plan = milabench_remote_prepare(setup, run_for="main")
             remote_task = asyncio.create_task(remote_plan.execute())
             await asyncio.wait([remote_task])
@@ -158,6 +176,13 @@ async def do_run(self, repeat=1):
         setup = self.setup_pack()
 
         if is_remote(setup):
+            await asyncio.wait(
+                [
+                    asyncio.create_task(t.execute())
+                    for t in milabench_remote_config(setup, self.packs)
+                ]
+            )
+
             # if we are not on the main node right now
             # ssh to the main node and launch milabench
             remote_plan = milabench_remote_run(setup)
diff --git a/milabench/remote.py b/milabench/remote.py
index bf5963183..b1759f2fa 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -1,16 +1,22 @@
 import os
 import sys
 
+import yaml
+
+from milabench.fs import XPath
+
+from . import ROOT_FOLDER
 from .commands import (
     CmdCommand,
     Command,
     ListCommand,
+    SCPCommand,
     SequenceCommand,
     SSHCommand,
     VoidCommand,
 )
 
-INSTALL_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+INSTALL_FOLDER = str(ROOT_FOLDER)
 
 
 def scp(node, folder, dest=None) -> list:
@@ -30,21 +36,41 @@ def scp(node, folder, dest=None) -> list:
     ]
 
 
-def rsync(node, folder, dest=None) -> list:
+def rsync(node, src=None, remote_src=None, dest=None) -> list:
     """Copy a folder from local node to remote node"""
     host = node["ip"]
     user = node["user"]
+    key = node.get("key", None)
+    key = f"-i{key}" if key else ""
+
+    if isinstance(src, str):
+        src = [src]
+
+    assert not src or not remote_src
+    assert src or remote_src
 
     if dest is None:
-        dest = os.path.abspath(os.path.join(folder, ".."))
+        _ = remote_src if remote_src else src[0]
+        dest = os.path.abspath(os.path.join(_, ".."))
+
+    if remote_src:
+        remote_src = [f"{user}@{host}:{remote_src}"]
+        src = []
+    else:
+        dest = f"{user}@{host}:{dest}"
+        remote_src = []
 
     return [
         "rsync",
+        "--force",
         "-av",
         "-e",
-        "ssh -oCheckHostIP=no -oStrictHostKeyChecking=no",
-        folder,
-        f"{user}@{host}:{dest}",
+        f"ssh {key} -oCheckHostIP=no -oStrictHostKeyChecking=no",
+        "--include=*/.git/*",
+        *[f"--exclude=*/{_dir}/*"
+          for _dir in (".*", "venv", "env", "tmp")],
+        *src, *remote_src,
+        dest,
     ]
 
 
@@ -84,9 +110,9 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     """
 
     nodes = pack.config["system"]["nodes"]
-    copy = []
     node_packs = []
 
+    copy = []
     for node in nodes:
         node_pack = None
 
@@ -107,6 +133,30 @@ def milabench_remote_setup_plan(pack, setup_for="worker") -> SequenceCommand:
     )
 
 
+def milabench_remote_fetch_reports_plan(pack, run_for="main") -> SequenceCommand:
+    """Copy milabench reports from remote
+
+    Notes
+    -----
+    Assume that the filesystem of remote node mirror local system.
+    """
+
+    nodes = pack.config["system"]["nodes"]
+    runs = pack.config["dirs"]["runs"]
+
+    copy = []
+    for node in nodes:
+        node_pack = None
+
+        if should_run_for(node, run_for):
+            node_pack = worker_pack(pack, node)
+            copy.append(CmdCommand(node_pack, *rsync(node, remote_src=str(runs))))
+
+    return SequenceCommand(
+        ListCommand(*copy),
+    )
+
+
 def worker_pack(pack, worker):
     if is_remote(pack):
         return pack.copy({})
@@ -131,7 +181,13 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand:
 
             cmds.append(
                 SSHCommand(
-                    CmdCommand(worker_pack(pack, worker), "milabench", *command),
+                    CmdCommand(
+                        worker_pack(pack, worker),
+                        "cd", f"{INSTALL_FOLDER}", "&&",
+                        f"MILABENCH_CONFIG={pack.config['config_file']}",
+                        f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}",
+                        "milabench", *command
+                    ),
                     host=host,
                     user=user,
                     key=key,
@@ -175,6 +231,45 @@ def _sanity(pack, setup_for):
         assert is_remote(pack), "Only a remote node can setup the main node"
 
 
+def milabench_remote_config(pack, packs):
+    config = {}
+    config_hash = pack.config["hash"]
+    config_file = XPath(pack.config["config_file"])
+    config_file = config_file.with_name(f"{config_file.name}.{config_hash}")
+    pack.config["config_file"] = str(config_file)
+    for p in packs.values():
+        config[p.config["name"]] = p.config
+        p.config["config_file"] = str(config_file)
+    config_file.write_text(yaml.dump(config))
+
+    for n in pack.config["system"]["nodes"]:
+        _cmds = [
+            SSHCommand(
+                CmdCommand(
+                    pack,
+                    "(", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")",
+                    "||", "(", "sudo", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"],
+                               "&&", "sudo", "chmod", "-R", "a+rwX", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")",
+                ),
+                n["ip"],
+            ),
+            SSHCommand(
+                CmdCommand(
+                    pack,
+                    "mkdir", "-p", str(config_file.parent),
+                ),
+                n["ip"],
+            ),
+            SCPCommand(
+                pack,
+                n["ip"],
+                str(config_file),
+            ),
+        ]
+
+        yield SequenceCommand(*_cmds)
+
+
 def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand:
     """Copy milabench code, install milabench, execute milabench install"""
     _sanity(pack, setup_for)
@@ -183,9 +278,9 @@ def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand:
         return VoidCommand(pack)
 
     argv = sys.argv[2:]
-
     return SequenceCommand(
         milabench_remote_setup_plan(pack, setup_for),
+        milabench_remote_command(pack, "pin", *argv, run_for=setup_for),
         milabench_remote_command(pack, "install", *argv, run_for=setup_for),
     )
 
@@ -210,4 +305,7 @@ def milabench_remote_run(pack) -> Command:
         return VoidCommand(pack)
 
     argv = sys.argv[2:]
-    return milabench_remote_command(pack, "run", *argv)
+    return SequenceCommand(
+        milabench_remote_command(pack, "run", *argv, run_for="main"),
+        milabench_remote_fetch_reports_plan(pack, run_for="main"),
+    )

From aaa92e4c17369aa9b750df07eaa39ac4ffafd4f1 Mon Sep 17 00:00:00 2001
From: satyaog <satyaog@gmail.com>
Date: Tue, 19 Mar 2024 09:31:39 -0400
Subject: [PATCH 03/22] Update .github/workflows/cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index ab1211a9f..a215cb0a5 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -7,7 +7,7 @@ on:
       - master
 
 jobs:
-  tests:
+  could-tests:
     strategy:
       fail-fast: true
       matrix:

From 44ebcea64b0783bd9c501715d1fb0617e71f8d4a Mon Sep 17 00:00:00 2001
From: satyaog <satyaog@gmail.com>
Date: Tue, 19 Mar 2024 11:39:43 -0400
Subject: [PATCH 04/22] Apply suggestions from code review

---
 .github/workflows/cloud-ci.yml          |  2 +-
 milabench/cli/badges/requirements.txt   |  2 +-
 milabench/cli/covalent/requirements.txt |  2 +-
 milabench/common.py                     | 32 -------------------------
 4 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index a215cb0a5..0dcad45dc 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -19,7 +19,7 @@ jobs:
           #   exclude : "no-rocm"
 
     runs-on: ubuntu-latest
-    environment: test-cloud-ci
+    environment: cloud-ci
 
     # Cancel previous jobs if a new version was pushed
     concurrency:
diff --git a/milabench/cli/badges/requirements.txt b/milabench/cli/badges/requirements.txt
index 26620981a..2c1953bd5 100644
--- a/milabench/cli/badges/requirements.txt
+++ b/milabench/cli/badges/requirements.txt
@@ -1 +1 @@
-pybadges
\ No newline at end of file
+pybadges
diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt
index f810e6eaf..158fa227c 100644
--- a/milabench/cli/covalent/requirements.txt
+++ b/milabench/cli/covalent/requirements.txt
@@ -1,2 +1,2 @@
 covalent
-covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench
\ No newline at end of file
+covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench
diff --git a/milabench/common.py b/milabench/common.py
index 01a8976a0..92bde8c4e 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -308,38 +308,6 @@ def _read_reports(*runs):
     return all_data
 
 
-def _find_metas(reports):
-    local_meta = next(iter(e for _r in reports for e in _r if e["event"] == "meta"), None)
-    if local_meta:
-        local_meta = local_meta["data"]
-    remote_metas = []
-    for _r in reports:
-        meta_lines = []
-        for event in _r:
-            _, event_type, line = None, "", []
-
-            try:
-                _, event_type, *line = event["data"].split(" ")
-            except (AttributeError, ValueError):
-                pass
-
-            if event_type[:1] + event_type[-1:] != "[]":
-                event_type = None
-                line = event["data"]
-            else:
-                line = " ".join(line)
-
-            if event_type == "[meta]":
-                meta_lines.append(line)
-            elif event_type is None and meta_lines:
-                meta_lines.append(line)
-            elif meta_lines:
-                remote_metas.append(yaml.safe_load("".join(meta_lines)))
-                meta_lines = []
-
-    return local_meta, remote_metas
-
-
 def _filter_reports(*reports):
     all_reports = []
 

From a296410b0f9724ecd8ce65c9085577c24eb4e7d1 Mon Sep 17 00:00:00 2001
From: satyaog <satyaog@gmail.com>
Date: Tue, 19 Mar 2024 14:39:23 -0400
Subject: [PATCH 05/22] Update .github/workflows/cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 0dcad45dc..e27b9c14c 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -7,7 +7,7 @@ on:
       - master
 
 jobs:
-  could-tests:
+  cloud-tests:
     strategy:
       fail-fast: true
       matrix:

From 7996e5646b2ae137f1d0d829c36f1ba678525530 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Fri, 22 Mar 2024 02:32:07 -0400
Subject: [PATCH 06/22] Add azure covalent cloud infra

---
 .github/workflows/cloud-ci.yml                | 32 +++++++++++++++++--
 .../{ec2-system.yaml => cloud-system.yaml}    |  4 +++
 milabench/cli/cloud.py                        | 13 +++++---
 milabench/cli/covalent/__main__.py            | 31 ++++++++++++++----
 milabench/cli/covalent/requirements.txt       |  1 +
 milabench/config.py                           |  2 +-
 milabench/remote.py                           |  2 +-
 7 files changed, 69 insertions(+), 16 deletions(-)
 rename config/examples/{ec2-system.yaml => cloud-system.yaml} (83%)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index e27b9c14c..b73029626 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -6,6 +6,9 @@ on:
     branches:
       - master
 
+permissions:
+  id-token: write
+
 jobs:
   cloud-tests:
     strategy:
@@ -14,7 +17,7 @@ jobs:
         include:
           - arch: cuda
             exclude: "no-cuda"
-            run_on: ec2
+            run_on: azure
           # - arch: rocm
           #   exclude : "no-rocm"
 
@@ -32,11 +35,14 @@ jobs:
 
     env:
       MILABENCH_CONFIG: "config/test.yaml"
-      MILABENCH_SYSTEM: "config/examples/${{ matrix.run_on }}-system.yaml"
+      MILABENCH_SYSTEM: "config/examples/cloud-system.yaml"
       MILABENCH_BASE: "output"
       MILABENCH_ARGS: ""
       MILABENCH_GPU_ARCH: "${{ matrix.arch }}"
       MILABENCH_DASH: "no"
+      ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
+      ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
+      AZURE_CORE_OUTPUT: none
 
     steps:
       - uses: actions/checkout@v3
@@ -47,6 +53,17 @@ jobs:
         with:
           python-version: 3.9
 
+      - name: Azure login
+        uses: azure/login@v2
+        with:
+          creds: |
+            {
+              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
+              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
+              "tenantId": "${{ secrets.ARM_TENANT_ID }}",
+              "clientId": "${{ secrets.ARM_CLIENT_ID }}"
+            }
+
       - name: dependencies
         run: |
           python -m pip install -U pip
@@ -107,3 +124,14 @@ jobs:
           git config --global user.email "github-ci@example.com"
           git config --global user.name "GitHub CI"
           poetry run milabench report --push
+
+      - name: teardown cloud
+        if: always()
+        run: |
+          if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
+          then
+            export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*}
+          fi
+          poetry run milabench cloud \
+            --teardown \
+            --run-on ${{ matrix.run_on }}
diff --git a/config/examples/ec2-system.yaml b/config/examples/cloud-system.yaml
similarity index 83%
rename from config/examples/ec2-system.yaml
rename to config/examples/cloud-system.yaml
index dab1a7a4e..5cf618b53 100644
--- a/config/examples/ec2-system.yaml
+++ b/config/examples/cloud-system.yaml
@@ -12,6 +12,10 @@ system:
 
   # Cloud instances profiles
   cloud_profiles:
+    azure:
+      username: ubuntu
+      size: Standard_B2ats_v2
+      location: eastus2
     ec2:
       username: ubuntu
       instance_type: t2.micro
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index c0f9c9bcb..310a4506b 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -6,7 +6,6 @@
 from coleo import Option, tooled
 import yaml
 
-# import milabench as mb
 from ..common import get_multipack
 
 
@@ -62,10 +61,14 @@ def manage_cloud(pack, packs, run_on, action="setup"):
             "-m", cv.__name__,
             run_on,
             f"--{action}",
-            *[
-                f"--{k.replace('_', '-')}={v}"
-                for k, v in plan_params.items()
-            ],
+            *list(
+                sum(
+                    (
+                        (f"--{k.replace('_', '-')}", v)
+                        for k, v in plan_params.items()
+                    ), ()
+                )
+            )
         ]
         p = subprocess.Popen(
             cmd,
diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py
index cf5ff8537..1c837a83c 100644
--- a/milabench/cli/covalent/__main__.py
+++ b/milabench/cli/covalent/__main__.py
@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import json
 import os
 import pathlib
 import subprocess
@@ -7,9 +8,21 @@
 import tempfile
 
 
+def _load_venv(venv:pathlib.Path) -> dict:
+    activate = venv / "bin/activate"
+    if not activate.exists():
+        raise FileNotFoundError(str(activate))
+    env = subprocess.run(
+        f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'",
+        shell=True,
+        capture_output=True
+    ).stdout
+    return json.loads(env)
+
+
 def serve(*argv):
     return subprocess.run([
-        str(pathlib.Path(sys.executable).with_name("covalent")),
+        "covalent",
         *argv
     ]).returncode
 
@@ -141,7 +154,7 @@ def lattice(argv=(), deps_bash = None):
             print(f"hostname::>{_executor.hostname}")
             print(f"username::>{_executor.username}")
             print(f"ssh_key_file::>{_executor.ssh_key_file}")
-            print(f"env::>~/.condaenvrc")
+            print(f"env::>{_executor.env}")
     finally:
         result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
         results_dir = result.results_dir if result else ""
@@ -171,12 +184,11 @@ def main(argv=None):
 
     try:
         import covalent as ct
-        ct.get_config(f"executors.ec2")
     except (KeyError, ImportError):
         module = pathlib.Path(__file__).resolve().parent
         cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
         python3 = str(cache_dir / "bin/python3")
-        check_module = "import covalent ; from covalent.executor import EC2Executor"
+        check_module = "import covalent"
         try:
             subprocess.run([python3, "-c", check_module], check=True)
         except (FileNotFoundError, subprocess.CalledProcessError):
@@ -190,17 +202,18 @@ def main(argv=None):
                 "install",
                 "-r",
                 str(module / "requirements.txt")
-            ], check=True)
+            ], stdout=sys.stderr, check=True)
             subprocess.run([python3, "-c", check_module], check=True)
         return subprocess.call(
             [python3, __file__, *argv],
+            env=_load_venv(cache_dir)
         )
 
     parser = argparse.ArgumentParser()
     subparsers = parser.add_subparsers()
     subparser = subparsers.add_parser("serve")
     subparser.add_argument(f"argv", nargs=argparse.REMAINDER)
-    for p in ("ec2",):
+    for p in ("azure","ec2"):
         try:
             config = ct.get_config(f"executors.{p}")
         except KeyError:
@@ -223,11 +236,15 @@ def main(argv=None):
     if cv_argv[0] == "serve":
         assert not argv
         return serve(*args.argv)
+    elif cv_argv[0] == "azure":
+        executor_cls = ct.executor.AzureExecutor
     elif cv_argv[0] == "ec2":
-        return executor(ct.executor.EC2Executor, args, *argv)
+        executor_cls = ct.executor.EC2Executor
     else:
         raise
 
+    return executor(executor_cls, args, *argv)
+
 
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt
index 158fa227c..c988e26a3 100644
--- a/milabench/cli/covalent/requirements.txt
+++ b/milabench/cli/covalent/requirements.txt
@@ -1,2 +1,3 @@
 covalent
 covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench
+covalent-azure-plugin @ git+https://github.com/satyaog/covalent-azure-plugin.git@feature/milabench
\ No newline at end of file
diff --git a/milabench/config.py b/milabench/config.py
index e276cb17c..694a9e60f 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -128,7 +128,7 @@ def get_remote_ip():
 def _resolve_ip(ip):
     # Resolve the IP
     try:
-        hostname, aliaslist, ipaddrlist = socket.gethostbyaddr(ip)
+        hostname, aliaslist, ipaddrlist = socket.gethostbyname_ex(ip)
         lazy_raise = None
     except socket.gaierror as err:
         # Get Addr Info (GAI) Error
diff --git a/milabench/remote.py b/milabench/remote.py
index b1759f2fa..e8aaa8312 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -249,7 +249,7 @@ def milabench_remote_config(pack, packs):
                     pack,
                     "(", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")",
                     "||", "(", "sudo", "mkdir", "-p", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"],
-                               "&&", "sudo", "chmod", "-R", "a+rwX", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")",
+                               "&&", "sudo", "chown", "-R", "$USER:$USER", str(ROOT_FOLDER.parent), pack.config["dirs"]["base"], ")",
                 ),
                 n["ip"],
             ),

From 6dbbd05260a3b902aea6a0a37b421198ed567f53 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Tue, 2 Apr 2024 02:10:36 -0400
Subject: [PATCH 07/22] Fix reports

---
 milabench/cli/badges/__main__.py | 28 +++-----------------
 milabench/cli/report.py          |  2 +-
 milabench/cli/utils.py           | 44 ++++++++++++++++++++++++++++++++
 milabench/common.py              |  9 +++----
 4 files changed, 53 insertions(+), 30 deletions(-)
 create mode 100644 milabench/cli/utils.py

diff --git a/milabench/cli/badges/__main__.py b/milabench/cli/badges/__main__.py
index 027a59a4b..e0a7bdc81 100644
--- a/milabench/cli/badges/__main__.py
+++ b/milabench/cli/badges/__main__.py
@@ -1,4 +1,3 @@
-import pathlib
 import subprocess
 import sys
 
@@ -10,29 +9,10 @@ def main(argv=None):
     try:
         import pybadges as _
     except ImportError:
-        module = pathlib.Path(__file__).resolve().parent
-        cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
-        python3 = str(cache_dir / "bin/python3")
-        check_module = "import pybadges"
-        try:
-            subprocess.run([python3, "-c", check_module], check=True)
-        except (FileNotFoundError, subprocess.CalledProcessError):
-            cache_dir.mkdir(parents=True, exist_ok=True)
-            subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True)
-            subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True)
-            subprocess.run([
-                python3,
-                "-m",
-                "pip",
-                "install",
-                "-r",
-                str(module / "requirements.txt")
-            ], check=True)
-            subprocess.run([python3, "-c", check_module], check=True)
-        return subprocess.call(
-            [python3, __file__, *argv],
-        )
-    
+        from ..utils import run_in_module_venv
+        check_if_module = "import pybadges"
+        return run_in_module_venv(__file__, check_if_module, argv)
+
     return subprocess.run([
         sys.executable,
         "-m",
diff --git a/milabench/cli/report.py b/milabench/cli/report.py
index b14b49528..a65d9f31b 100644
--- a/milabench/cli/report.py
+++ b/milabench/cli/report.py
@@ -119,4 +119,4 @@ def cli_report(args=None):
             XPath(pack_config["dirs"]["base"]) / "reports"
             for pack_config in args.config.values()
         ))
-        _push_reports(reports_repo, args.runs, summary)
+        _push_reports(reports_repo, args.runs)
diff --git a/milabench/cli/utils.py b/milabench/cli/utils.py
new file mode 100644
index 000000000..5aec72d06
--- /dev/null
+++ b/milabench/cli/utils.py
@@ -0,0 +1,44 @@
+import json
+import pathlib
+import subprocess
+import sys
+
+
+def get_venv(venv:pathlib.Path) -> dict:
+    activate = venv / "bin/activate"
+    if not activate.exists():
+        raise FileNotFoundError(str(activate))
+    env = subprocess.run(
+        f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'",
+        shell=True,
+        capture_output=True
+    ).stdout
+    return json.loads(env)
+
+
+def run_in_module_venv(module_main:str, check_if_module:str, argv:list=None):
+    module = pathlib.Path(module_main).resolve().parent
+    cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
+    python3 = str(cache_dir / "bin/python3")
+    try:
+        subprocess.run([python3, "-c", check_if_module], check=True,
+                       stdout=sys.stderr)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)],
+                       check=True, stdout=sys.stderr)
+        subprocess.run([python3, "-m", "pip", "install", "-U", "pip"],
+                       check=True, stdout=sys.stderr)
+        subprocess.run([
+            python3,
+            "-m",
+            "pip",
+            "install",
+            "-r",
+            str(module / "requirements.txt")
+        ], stdout=sys.stderr, check=True)
+        subprocess.run([python3, "-c", check_if_module], check=True, stdout=sys.stderr)
+    return subprocess.call(
+        [python3, module_main, *argv],
+        env=get_venv(cache_dir)
+    )
\ No newline at end of file
diff --git a/milabench/common.py b/milabench/common.py
index 92bde8c4e..0fc540dde 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -322,7 +322,7 @@ def _filter_reports(*reports):
     return all_reports
 
 
-def _push_reports(reports_repo, runs, packs:dict=None):
+def _push_reports(reports_repo, runs):
     _SVG_COLORS = {
         "pass": "blue",
         "partial": "yellow",
@@ -355,10 +355,9 @@ def _push_reports(reports_repo, runs, packs:dict=None):
 
         meta = [e["data"] for _r in reports for e in _r if e["event"] == "meta"]
 
-        for _meta in meta:
-            for gpu in _meta["accelerators"]["gpus"].values():
-                device = gpu["product"].replace(" ", "_")
-                break
+        for gpu in (_ for _meta in meta for _ in _meta["accelerators"]["gpus"].values()):
+            device = gpu["product"].replace(" ", "_")
+            break
         else:
             for _meta in meta:
                 device = _meta["cpu"]["brand"].replace(" ", "_")

From a18a03461587a9d36b25806a98ca2dfcf3d40700 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Mon, 1 Apr 2024 15:14:58 -0400
Subject: [PATCH 08/22] Fix cloud-ci with gpu arch

---
 .github/workflows/cloud-ci.yml     | 18 +++++----
 config/cloud-system.yaml           | 18 +++++++++
 config/examples/cloud-system.yaml  |  7 ++++
 config/{ => examples}/test.yaml    |  4 +-
 milabench/cli/cloud.py             | 53 +++++++++++++++++----------
 milabench/cli/covalent/__main__.py | 59 +++---------------------------
 milabench/config.py                |  8 +++-
 milabench/remote.py                | 29 ++-------------
 8 files changed, 87 insertions(+), 109 deletions(-)
 create mode 100644 config/cloud-system.yaml
 rename config/{ => examples}/test.yaml (79%)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index b73029626..71e471252 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -17,7 +17,7 @@ jobs:
         include:
           - arch: cuda
             exclude: "no-cuda"
-            run_on: azure
+            run_on: azure__a100
           # - arch: rocm
           #   exclude : "no-rocm"
 
@@ -34,8 +34,8 @@ jobs:
         shell: bash -el {0}
 
     env:
-      MILABENCH_CONFIG: "config/test.yaml"
-      MILABENCH_SYSTEM: "config/examples/cloud-system.yaml"
+      MILABENCH_CONFIG: "config/standard.yaml"
+      MILABENCH_SYSTEM: "config/cloud-system.yaml"
       MILABENCH_BASE: "output"
       MILABENCH_ARGS: ""
       MILABENCH_GPU_ARCH: "${{ matrix.arch }}"
@@ -53,15 +53,18 @@ jobs:
         with:
           python-version: 3.9
 
+      # Follow
+      # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
+      # to generate a clientId as well as a clientSecret
       - name: Azure login
         uses: azure/login@v2
         with:
           creds: |
             {
+              "clientId": "${{ secrets.ARM_CLIENT_ID }}",
               "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
               "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
-              "tenantId": "${{ secrets.ARM_TENANT_ID }}",
-              "clientId": "${{ secrets.ARM_CLIENT_ID }}"
+              "tenantId": "${{ secrets.ARM_TENANT_ID }}"
             }
 
       - name: dependencies
@@ -108,7 +111,7 @@ jobs:
 
       - name: install benchmarks
         run: |
-          poetry run milabench install
+          poetry run milabench install --variant ${{ matrix.arch }}
 
       - name: prepare benchmarks
         run: |
@@ -134,4 +137,5 @@ jobs:
           fi
           poetry run milabench cloud \
             --teardown \
-            --run-on ${{ matrix.run_on }}
+            --run-on ${{ matrix.run_on }} \
+            --all
diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml
new file mode 100644
index 000000000..d1889c724
--- /dev/null
+++ b/config/cloud-system.yaml
@@ -0,0 +1,18 @@
+system:
+  # Nodes list
+  nodes:
+      # Alias used to reference the node
+    - name: manager
+      # Use 1.1.1.1 as an ip placeholder
+      ip: 1.1.1.1
+      # Use this node as the master node or not
+      main: true
+      # User to use in remote milabench operations
+      user: user
+
+  # Cloud instances profiles
+  cloud_profiles:
+    azure__a100:
+      username: ubuntu
+      size: Standard_NC24ads_A100_v4
+      location: eastus2
diff --git a/config/examples/cloud-system.yaml b/config/examples/cloud-system.yaml
index 5cf618b53..b3d1f70aa 100644
--- a/config/examples/cloud-system.yaml
+++ b/config/examples/cloud-system.yaml
@@ -12,11 +12,18 @@ system:
 
   # Cloud instances profiles
   cloud_profiles:
+    # The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME}
     azure:
+      # covalent-azure-plugin args
+      username: ubuntu
+      size: Standard_B1s
+      location: eastus2
+    azure__free:
       username: ubuntu
       size: Standard_B2ats_v2
       location: eastus2
     ec2:
+      # covalent-ec2-plugin args
       username: ubuntu
       instance_type: t2.micro
       volume_size: 8
diff --git a/config/test.yaml b/config/examples/test.yaml
similarity index 79%
rename from config/test.yaml
rename to config/examples/test.yaml
index 060949e40..6e155a0bf 100644
--- a/config/test.yaml
+++ b/config/examples/test.yaml
@@ -9,14 +9,14 @@ test:
   inherits: _defaults
   group: test_remote
   install_group: test_remote
-  definition: ../benchmarks/_template
+  definition: ../../benchmarks/_template
   plan:
     method: njobs
     n: 1
 
 testing:
   inherits: _defaults
-  definition: ../benchmarks/_template
+  definition: ../../benchmarks/_template
   group: test_remote_2
   install_group: test_remote_2
   plan:
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index 310a4506b..c21fe16d9 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -4,6 +4,7 @@
 import sys
 
 from coleo import Option, tooled
+from omegaconf import OmegaConf
 import yaml
 
 from ..common import get_multipack
@@ -15,7 +16,16 @@
 _ACTIONS = (_SETUP, _TEARDOWN, _LIST)
 
 
-def manage_cloud(pack, packs, run_on, action="setup"):
+def _flatten_cli_args(**kwargs):
+    return sum(
+        (
+            (f"--{k.replace('_', '-')}", *([v] if v else []))
+            for k, v in kwargs.items()
+        ), ()
+    )
+
+
+def manage_cloud(pack, run_on, action="setup"):
     assert run_on in pack.config["system"]["cloud_profiles"]
 
     key_map = {
@@ -28,11 +38,6 @@ def manage_cloud(pack, packs, run_on, action="setup"):
 
     nodes = iter(enumerate(pack.config["system"]["nodes"]))
 
-    state_prefix = []
-    for p in packs.values():
-        state_prefix.append(p.config["name"])
-        state_prefix.append(p.config["install_variant"])
-
     while True:
         try:
             i, n = next(nodes)
@@ -41,8 +46,10 @@ def manage_cloud(pack, packs, run_on, action="setup"):
         except StopIteration:
             break
 
-        plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), *state_prefix])
+        plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), run_on])
         plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"]
+        if i > 0:
+            plan_params["reuse_resource_group"] = None
 
         import milabench.cli.covalent as cv
 
@@ -59,16 +66,9 @@ def manage_cloud(pack, packs, run_on, action="setup"):
         cmd = [
             sys.executable,
             "-m", cv.__name__,
-            run_on,
+            run_on.split("__")[0],
             f"--{action}",
-            *list(
-                sum(
-                    (
-                        (f"--{k.replace('_', '-')}", v)
-                        for k, v in plan_params.items()
-                    ), ()
-                )
-            )
+            *_flatten_cli_args(**plan_params)
         ]
         p = subprocess.Popen(
             cmd,
@@ -121,7 +121,8 @@ def _setup():
 
     mp = get_multipack()
     setup_pack = mp.setup_pack()
-    system_config = manage_cloud(setup_pack, mp.packs, run_on, action=_SETUP)
+    system_config = manage_cloud(setup_pack, run_on, action=_SETUP)
+    del system_config["arch"]
 
     print(f"# hash::>{setup_pack.config['hash']}")
     print(yaml.dump({"system": system_config}))
@@ -131,12 +132,24 @@ def _setup():
 def _teardown():
     """Teardown a cloud infrastructure"""
 
-    # Setup cloud on target infra
+    # Teardown cloud instance on target infra
     run_on: Option & str
 
-    mp = get_multipack()
+    # Teardown all cloud instances
+    all: Option & bool = False
+
+    overrides = {}
+    if all:
+        overrides = {
+            "*": OmegaConf.to_object(OmegaConf.from_dotlist([
+                f"system.cloud_profiles.{run_on}.state_prefix='*'",
+                f"system.cloud_profiles.{run_on}.state_id='*'",
+            ]))
+        }
+
+    mp = get_multipack(overrides=overrides)
     setup_pack = mp.setup_pack()
-    manage_cloud(setup_pack, mp.packs, run_on, action=_TEARDOWN)
+    manage_cloud(setup_pack, run_on, action=_TEARDOWN)
 
 
 @tooled
diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py
index 1c837a83c..9537cc2dd 100644
--- a/milabench/cli/covalent/__main__.py
+++ b/milabench/cli/covalent/__main__.py
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import json
 import os
 import pathlib
 import subprocess
@@ -8,18 +7,6 @@
 import tempfile
 
 
-def _load_venv(venv:pathlib.Path) -> dict:
-    activate = venv / "bin/activate"
-    if not activate.exists():
-        raise FileNotFoundError(str(activate))
-    env = subprocess.run(
-        f". '{activate}' && python3 -c 'import os ; import json ; print(json.dumps(dict(os.environ)))'",
-        shell=True,
-        capture_output=True
-    ).stdout
-    return json.loads(env)
-
-
 def serve(*argv):
     return subprocess.run([
         "covalent",
@@ -119,21 +106,9 @@ def lattice(argv=(), deps_bash = None):
         deps_bash = None
 
         if not argv and args.setup:
-            conda_prefix = "eval \"$(conda shell.bash hook)\""
-            conda_activate = "conda activate milabench"
-            deps_bash = []
-            for _cmd in (
-                f"{conda_activate} || conda create -n milabench -y",
-                f"{conda_activate}"
-                f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
-                f" || >&2 echo First attempt to install python in milabench env failed",
-                f"{conda_activate}"
-                f" && conda install python={sys.version_info.major}.{sys.version_info.minor} virtualenv pip -y"
-                f" || conda remove -n milabench --all -y",
-            ):
-                deps_bash.append(f"{conda_prefix} && ({_cmd})")
-            deps_bash = ct.DepsBash(deps_bash)
-            argv = ["conda", "env", "list"]
+            deps_bash = ct.DepsBash([])
+            # Make sure pip is installed
+            argv = ["python3", "-m", "pip", "freeze"]
 
         if argv:
             dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash)
@@ -141,7 +116,6 @@ def lattice(argv=(), deps_bash = None):
             return_code, stdout, _ = result.result if result.result is not None else (1, "", "")
 
         if return_code == 0 and args.setup:
-            assert any([l for l in stdout.split("\n") if l.startswith("milabench ")])
             _executor:ct.executor.BaseExecutor = executor_cls(
                 **{
                     **_get_executor_kwargs(args),
@@ -154,7 +128,6 @@ def lattice(argv=(), deps_bash = None):
             print(f"hostname::>{_executor.hostname}")
             print(f"username::>{_executor.username}")
             print(f"ssh_key_file::>{_executor.ssh_key_file}")
-            print(f"env::>{_executor.env}")
     finally:
         result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
         results_dir = result.results_dir if result else ""
@@ -185,29 +158,9 @@ def main(argv=None):
     try:
         import covalent as ct
     except (KeyError, ImportError):
-        module = pathlib.Path(__file__).resolve().parent
-        cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
-        python3 = str(cache_dir / "bin/python3")
-        check_module = "import covalent"
-        try:
-            subprocess.run([python3, "-c", check_module], check=True)
-        except (FileNotFoundError, subprocess.CalledProcessError):
-            cache_dir.mkdir(parents=True, exist_ok=True)
-            subprocess.run([sys.executable, "-m", "virtualenv", str(cache_dir)], check=True)
-            subprocess.run([python3, "-m", "pip", "install", "-U", "pip"], check=True)
-            subprocess.run([
-                python3,
-                "-m",
-                "pip",
-                "install",
-                "-r",
-                str(module / "requirements.txt")
-            ], stdout=sys.stderr, check=True)
-            subprocess.run([python3, "-c", check_module], check=True)
-        return subprocess.call(
-            [python3, __file__, *argv],
-            env=_load_venv(cache_dir)
-        )
+        from ..utils import run_in_module_venv
+        check_if_module = "import covalent"
+        return run_in_module_venv(__file__, check_if_module, argv)
 
     parser = argparse.ArgumentParser()
     subparsers = parser.add_subparsers()
diff --git a/milabench/config.py b/milabench/config.py
index 694a9e60f..f50d735cb 100644
--- a/milabench/config.py
+++ b/milabench/config.py
@@ -173,6 +173,10 @@ def resolve_addresses(nodes):
         is_local = (
             ("127.0.0.1" in ipaddrlist)
             or (hostname in ("localhost", socket.gethostname()))
+            # Tmp workaround until networking on azure allows to associate the
+            # local hostname (`hostname.split(".")[0]`) with the public fqdn
+            # (hostname.split(".")[0].*.cloudapp.azure.com)
+            or (hostname.split(".")[0] == socket.gethostname())
             or len(ip_list.intersection(ipaddrlist)) > 0
         )
         node["local"] = is_local
@@ -227,9 +231,9 @@ def build_system_config(config_file, defaults=None, gpu=True):
             config = yaml.safe_load(cf)
 
     if defaults:
-        config = merge(defaults, config)
+        config["system"] = merge(defaults["system"], config["system"])
 
-    system = config.get("system", {})
+    system = config["system"]
 
     # capacity is only required if batch resizer is enabled
     if (gpu or is_autoscale_enabled()) and not "gpu" not in system:
diff --git a/milabench/remote.py b/milabench/remote.py
index e8aaa8312..bbf1b4f0f 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
 import os
 import sys
 
@@ -78,7 +79,7 @@ def pip_install_milabench(pack, node, folder) -> SSHCommand:
     host = node["ip"]
     user = node["user"]
 
-    cmd = ["pip", "install", "-e", folder]
+    cmd = ["python3", "-m", "pip", "install", "-e", folder]
     plan = CmdCommand(pack, *cmd)
     return SSHCommand(plan, host=host, user=user)
 
@@ -184,8 +185,9 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand:
                     CmdCommand(
                         worker_pack(pack, worker),
                         "cd", f"{INSTALL_FOLDER}", "&&",
-                        f"MILABENCH_CONFIG={pack.config['config_file']}",
                         f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}",
+                        f"MILABENCH_CONFIG={os.environ.get('MILABENCH_CONFIG', '')}",
+                        f"MILABENCH_SYSTEM={os.environ.get('MILABENCH_SYSTEM', '')}",
                         "milabench", *command
                     ),
                     host=host,
@@ -232,16 +234,6 @@ def _sanity(pack, setup_for):
 
 
 def milabench_remote_config(pack, packs):
-    config = {}
-    config_hash = pack.config["hash"]
-    config_file = XPath(pack.config["config_file"])
-    config_file = config_file.with_name(f"{config_file.name}.{config_hash}")
-    pack.config["config_file"] = str(config_file)
-    for p in packs.values():
-        config[p.config["name"]] = p.config
-        p.config["config_file"] = str(config_file)
-    config_file.write_text(yaml.dump(config))
-
     for n in pack.config["system"]["nodes"]:
         _cmds = [
             SSHCommand(
@@ -253,18 +245,6 @@ def milabench_remote_config(pack, packs):
                 ),
                 n["ip"],
             ),
-            SSHCommand(
-                CmdCommand(
-                    pack,
-                    "mkdir", "-p", str(config_file.parent),
-                ),
-                n["ip"],
-            ),
-            SCPCommand(
-                pack,
-                n["ip"],
-                str(config_file),
-            ),
         ]
 
         yield SequenceCommand(*_cmds)
@@ -280,7 +260,6 @@ def milabench_remote_install(pack, setup_for="worker") -> SequenceCommand:
     argv = sys.argv[2:]
     return SequenceCommand(
         milabench_remote_setup_plan(pack, setup_for),
-        milabench_remote_command(pack, "pin", *argv, run_for=setup_for),
         milabench_remote_command(pack, "install", *argv, run_for=setup_for),
     )
 

From 5267334ba5381623645d12db048983d45ecf0e6f Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Wed, 3 Apr 2024 00:29:32 -0400
Subject: [PATCH 09/22] Add multi-node on cloud

---
 config/examples/cloud-multinodes-system.yaml | 36 ++++++++++++++++++++
 milabench/cli/cloud.py                       | 22 +++++-------
 milabench/cli/covalent/__main__.py           |  9 ++---
 3 files changed, 50 insertions(+), 17 deletions(-)
 create mode 100644 config/examples/cloud-multinodes-system.yaml

diff --git a/config/examples/cloud-multinodes-system.yaml b/config/examples/cloud-multinodes-system.yaml
new file mode 100644
index 000000000..a5b45c606
--- /dev/null
+++ b/config/examples/cloud-multinodes-system.yaml
@@ -0,0 +1,36 @@
+system:
+  # Nodes list
+  nodes:
+      # Alias used to reference the node
+    - name: manager
+      # Use 1.1.1.1 as an ip placeholder
+      ip: 1.1.1.1
+      # Use this node as the master node or not
+      main: true
+      # User to use in remote milabench operations
+      user: user
+
+    - name: node1
+      ip: 1.1.1.1
+      main: false
+      user: username
+
+  # Cloud instances profiles
+  cloud_profiles:
+    # The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME}
+    azure:
+      # covalent-azure-plugin args
+      username: ubuntu
+      size: Standard_B1s
+      location: eastus2
+    azure__free:
+      username: ubuntu
+      size: Standard_B2ats_v2
+      location: eastus2
+    ec2:
+      # covalent-ec2-plugin args
+      username: ubuntu
+      instance_type: t2.micro
+      volume_size: 8
+      region: us-east-2
+      state_id: 71669879043a3864225aabb94f91a2d4
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index c21fe16d9..d93e4fec4 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -19,7 +19,7 @@
 def _flatten_cli_args(**kwargs):
     return sum(
         (
-            (f"--{k.replace('_', '-')}", *([v] if v else []))
+            (f"--{str(k).replace('_', '-')}", *([str(v)] if str(v) else []))
             for k, v in kwargs.items()
         ), ()
     )
@@ -35,21 +35,17 @@ def manage_cloud(pack, run_on, action="setup"):
         "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
     }
     plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
+    run_on, *profile = run_on.split("__")
+    profile = profile[0] if profile else ""
 
     nodes = iter(enumerate(pack.config["system"]["nodes"]))
+    for i, n in nodes:
+        if n["ip"] != "1.1.1.1":
+            continue
 
-    while True:
-        try:
-            i, n = next(nodes)
-            if n["ip"] != "1.1.1.1":
-                continue
-        except StopIteration:
-            break
-
-        plan_params["state_prefix"] = plan_params.get("state_prefix", None) or "-".join([str(i), run_on])
+        plan_params["state_prefix"] = plan_params.get("state_prefix", None) or profile or run_on
         plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"]
-        if i > 0:
-            plan_params["reuse_resource_group"] = None
+        plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
 
         import milabench.cli.covalent as cv
 
@@ -66,7 +62,7 @@ def manage_cloud(pack, run_on, action="setup"):
         cmd = [
             sys.executable,
             "-m", cv.__name__,
-            run_on.split("__")[0],
+            run_on,
             f"--{action}",
             *_flatten_cli_args(**plan_params)
         ]
diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py
index 9537cc2dd..f4f2ca47d 100644
--- a/milabench/cli/covalent/__main__.py
+++ b/milabench/cli/covalent/__main__.py
@@ -124,10 +124,11 @@ def lattice(argv=(), deps_bash = None):
             )
             asyncio.run(_executor.setup({}))
 
-            assert _executor.hostname
-            print(f"hostname::>{_executor.hostname}")
-            print(f"username::>{_executor.username}")
-            print(f"ssh_key_file::>{_executor.ssh_key_file}")
+            assert _executor.hostnames
+            for hostname in _executor.hostnames:
+                print(f"hostname::>{hostname}")
+                print(f"username::>{_executor.username}")
+                print(f"ssh_key_file::>{_executor.ssh_key_file}")
     finally:
         result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
         results_dir = result.results_dir if result else ""

From 15b2d9c204800c72c7afd363150df818177f5e43 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Tue, 9 Apr 2024 13:57:57 -0400
Subject: [PATCH 10/22] Fix cloud data dir

* VM on the cloud might not have enough space on all partitions. Add a workaround which should cover most cases
* Use branch and commit name to versionize reports directories
* Fix parsing error when temperature is not available in nvidia-smi outputs
* export MILABENCH_* env vars to remote
---
 config/cloud-multinodes-system.yaml          | 31 ++++++++
 config/cloud-system.yaml                     |  8 ++
 config/examples/cloud-multinodes-system.yaml |  3 +-
 docs/usage.rst                               | 81 ++++++++++++++++++++
 milabench/cli/cloud.py                       | 21 ++++-
 milabench/cli/covalent/__main__.py           |  2 +-
 milabench/cli/covalent/requirements.txt      |  2 +-
 milabench/common.py                          | 28 +++----
 milabench/log.py                             | 13 +++-
 milabench/remote.py                          | 12 ++-
 10 files changed, 178 insertions(+), 23 deletions(-)
 create mode 100644 config/cloud-multinodes-system.yaml

diff --git a/config/cloud-multinodes-system.yaml b/config/cloud-multinodes-system.yaml
new file mode 100644
index 000000000..e5dc14f2b
--- /dev/null
+++ b/config/cloud-multinodes-system.yaml
@@ -0,0 +1,31 @@
+system:
+  # Nodes list
+  nodes:
+      # Alias used to reference the node
+    - name: manager
+      # Use 1.1.1.1 as an ip placeholder
+      ip: 1.1.1.1
+      # Use this node as the master node or not
+      main: true
+      # User to use in remote milabench operations
+      user: user
+
+    - name: node1
+      ip: 1.1.1.1
+      main: false
+      user: username
+
+  # Cloud instances profiles
+  cloud_profiles:
+    azure__a100:
+      username: ubuntu
+      size: Standard_NC24ads_A100_v4
+      location: eastus2
+    azure__a100_x2:
+      username: ubuntu
+      size: Standard_NC48ads_A100_v4
+      location: eastus2
+    azure__a10_x2:
+      username: ubuntu
+      size: Standard_NV72ads_A10_v5
+      location: eastus2
diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml
index d1889c724..2d1a049ad 100644
--- a/config/cloud-system.yaml
+++ b/config/cloud-system.yaml
@@ -16,3 +16,11 @@ system:
       username: ubuntu
       size: Standard_NC24ads_A100_v4
       location: eastus2
+    azure__a100_x2:
+      username: ubuntu
+      size: Standard_NC48ads_A100_v4
+      location: eastus2
+    azure__a10_x2:
+      username: ubuntu
+      size: Standard_NV72ads_A10_v5
+      location: eastus2
diff --git a/config/examples/cloud-multinodes-system.yaml b/config/examples/cloud-multinodes-system.yaml
index a5b45c606..5066af5eb 100644
--- a/config/examples/cloud-multinodes-system.yaml
+++ b/config/examples/cloud-multinodes-system.yaml
@@ -17,7 +17,8 @@ system:
 
   # Cloud instances profiles
   cloud_profiles:
-    # The cloud platform to use in the form of {PLATFORM}__{PROFILE_NAME}
+    # The cloud platform to use in the form of {PLATFORM} or
+    # {PLATFORM}__{PROFILE_NAME}
     azure:
       # covalent-azure-plugin args
       username: ubuntu
diff --git a/docs/usage.rst b/docs/usage.rst
index ecea88b75..26c308513 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -69,3 +69,84 @@ The following command will print out a report of the tests that ran, the metrics
     milabench report --runs $MILABENCH_BASE/runs/some_specific_run --html report.html
 
 The report will also print out a score based on a weighting of the metrics, as defined in the file ``$MILABENCH_CONFIG`` points to.
+
+
+Use milabench on the cloud
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+Setup Terraform and a free Azure account
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. | Install azure cli (it does not need to be in the same environment as
+     milabench)
+   | ``pip install azure-cli``
+
+2. Setup a free account on
+   `azure.microsoft.com <https://azure.microsoft.com/en-us/free/>`_
+
+3. Follow instructions in the
+   `azurerm documentation <https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret#creating-a-service-principal-using-the-azure-cli>`_
+   to generate an ``ARM_CLIENT_ID`` as well as an ``ARM_CLIENT_SECRET``. If you
+   don't have the permissions to create / assign a role to a service principal,
+   you can ignore the ``az ad sp create-for-rbac`` command to work directly with
+   your ``ARM_TENANT_ID`` and ``ARM_SUBSCRIPTION_ID``
+
+4. `Install Terraform <https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli>`_
+
+5. Configure the ``azurerm`` Terraform provider by
+   `exporting the environment variables <https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret#configuring-the-service-principal-in-terraform>`_
+
+
+Create a cloud system configuration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
+supported cloud profiles.
+
+.. note::
+
+  Nodes that should be created on the cloud should have the ``1.1.1.1`` ip
+  address placeholder. Other ip addresses will be used as-is and no cloud
+  instance will be created for that node
+
+.. note::
+
+  A cloud profile entry needs to start with a covalent plugin (e.g. ``azure``). To
+  define multiple profiles on the same cloud platform, use the form
+  ``{PLATFORM}__{PROFILE_NAME}`` (e.g. ``azure__profile``). All cloud profile
+  attributes will be passed as-is as arguments to the target covalent plugin
+
+.. code-block:: yaml
+
+  system:
+    nodes:
+      - name: manager
+        # Use 1.1.1.1 as an ip placeholder
+        ip: 1.1.1.1
+        main: true
+        user: <username>
+      - name: node1
+        ip: 1.1.1.1
+        main: false
+        user: <username>
+  
+    # Cloud instances profiles
+    cloud_profiles:
+      # The cloud platform to use in the form of {PLATFORM} or
+      # {PLATFORM}__{PROFILE_NAME}
+      azure__free:
+        # covalent-azure-plugin args
+        username: ubuntu
+        size: Standard_B2ats_v2
+        location: eastus2
+
+
+Run milabench on the cloud
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. | Initialize the cloud instances
+   | ``milabench cloud --system {{SYSTEM_CONFIG.YAML}} --setup --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
+
+2. | Prepare, install and run milabench
+   | ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}``
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index d93e4fec4..150e37749 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -7,8 +7,9 @@
 from omegaconf import OmegaConf
 import yaml
 
-from ..common import get_multipack
+from milabench.fs import XPath
 
+from ..common import get_multipack
 
 _SETUP = "setup"
 _TEARDOWN = "teardown"
@@ -25,8 +26,12 @@ def _flatten_cli_args(**kwargs):
     )
 
 
+def _or_sudo(cmd:str):
+    return f"( {cmd} || sudo {cmd} )"
+
+
 def manage_cloud(pack, run_on, action="setup"):
-    assert run_on in pack.config["system"]["cloud_profiles"]
+    assert run_on in pack.config["system"]["cloud_profiles"], f"{run_on} cloud profile not found in {list(pack.config['system']['cloud_profiles'].keys())}"
 
     key_map = {
         "hostname":(lambda v: ("ip",v)),
@@ -38,6 +43,9 @@ def manage_cloud(pack, run_on, action="setup"):
     run_on, *profile = run_on.split("__")
     profile = profile[0] if profile else ""
 
+    remote_base = XPath("/data") / pack.dirs.base.name
+    local_base = pack.dirs.base.absolute().parent
+
     nodes = iter(enumerate(pack.config["system"]["nodes"]))
     for i, n in nodes:
         if n["ip"] != "1.1.1.1":
@@ -66,6 +74,15 @@ def manage_cloud(pack, run_on, action="setup"):
             f"--{action}",
             *_flatten_cli_args(**plan_params)
         ]
+        if action == _SETUP:
+            cmd += [
+                "--",
+                "bash", "-c",
+                _or_sudo(f"mkdir -p '{local_base.parent}'") +
+                " && " + _or_sudo(f"chmod a+rwX '{local_base.parent}'") +
+                f" && mkdir -p '{remote_base}'"
+                f" && ln -sfT '{remote_base}' '{local_base}'"
+            ]
         p = subprocess.Popen(
             cmd,
             stdout=subprocess.PIPE,
diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py
index f4f2ca47d..eb602ee27 100644
--- a/milabench/cli/covalent/__main__.py
+++ b/milabench/cli/covalent/__main__.py
@@ -113,7 +113,7 @@ def lattice(argv=(), deps_bash = None):
         if argv:
             dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash)
             result = ct.get_result(dispatch_id=dispatch_id, wait=True)
-            return_code, stdout, _ = result.result if result.result is not None else (1, "", "")
+            return_code, _, _ = result.result if result.result is not None else (1, "", "")
 
         if return_code == 0 and args.setup:
             _executor:ct.executor.BaseExecutor = executor_cls(
diff --git a/milabench/cli/covalent/requirements.txt b/milabench/cli/covalent/requirements.txt
index c988e26a3..b70efc793 100644
--- a/milabench/cli/covalent/requirements.txt
+++ b/milabench/cli/covalent/requirements.txt
@@ -1,3 +1,3 @@
-covalent
+covalent==0.232
 covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench
 covalent-azure-plugin @ git+https://github.com/satyaog/covalent-azure-plugin.git@feature/milabench
\ No newline at end of file
diff --git a/milabench/common.py b/milabench/common.py
index 0fc540dde..ff0388df2 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -363,12 +363,8 @@ def _push_reports(reports_repo, runs):
                 device = _meta["cpu"]["brand"].replace(" ", "_")
                 break
 
-        tag = ([
-            t.name
-            for t in _repo.tags
-            if meta[0]["milabench"]["tag"].startswith(t.name)
-        ] or [meta[0]["milabench"]["tag"]])[0]
-        reports_dir = XPath(reports_repo.working_tree_dir) / tag
+        build = "-".join([_repo.active_branch.name.replace(os.path.sep, "_"), next(_repo.iter_commits()).hexsha])
+        reports_dir = XPath(reports_repo.working_tree_dir) / build
 
         run = XPath(run)
         try:
@@ -376,13 +372,16 @@ def _push_reports(reports_repo, runs):
         except FileExistsError:
             pass
 
-        device_reports.setdefault((device, tag), set())
-        device_reports[(device, tag)].update(
+        for _f in (reports_dir / device / run.name).glob("*.stderr"):
+            _f.unlink()
+
+        device_reports.setdefault((device, build), set())
+        device_reports[(device, build)].update(
             (reports_dir / device).glob("*/")
         )
 
-    for (device, tag), reports in device_reports.items():
-        reports_dir = XPath(reports_repo.working_tree_dir) / tag
+    for (device, build), reports in device_reports.items():
+        reports_dir = XPath(reports_repo.working_tree_dir) / build
         reports = _read_reports(*reports)
         reports = _filter_reports(*reports.values())
         summary = make_summary(reports)
@@ -404,9 +403,10 @@ def _push_reports(reports_repo, runs):
                 "--left-text", device,
                 "--right-text", text,
                 "--right-color", _SVG_COLORS[text],
-                "--whole-link", str(reports_url / tag / device)
+                "--whole-link", str(reports_url / build / device)
             ],
-            capture_output=True
+            capture_output=True,
+            check=True
         )
         if result.returncode == 0:
             (reports_dir / device / "badge.svg").write_text(result.stdout.decode("utf8"))
@@ -418,8 +418,8 @@ def _push_reports(reports_repo, runs):
 
         for cmd, _kwargs in (
             (["git", "pull"], {"check": True}),
-            (["git", "add", tag], {"check": True}),
-            (["git", "commit", "-m", tag], {"check": False}),
+            (["git", "add", build], {"check": True}),
+            (["git", "commit", "-m", build], {"check": False}),
             (["git", "push"], {"check": True})
         ):
             subprocess.run(
diff --git a/milabench/log.py b/milabench/log.py
index a6f7388a9..3724b34c3 100644
--- a/milabench/log.py
+++ b/milabench/log.py
@@ -333,6 +333,16 @@ def on_end(self, entry, data, row):
         self.refresh()
 
 
+_NO_DEFAULT_FLAG=("__NO_DEFAULT__",)
+def _parse_int(value, default=_NO_DEFAULT_FLAG):
+    try:
+        return int(value)
+    except TypeError:
+        if default is not _NO_DEFAULT_FLAG:
+            return default
+        raise
+
+
 class LongDashFormatter(DashFormatter):
     def make_table(self):
         table = Table.grid(padding=(0, 3, 0, 0))
@@ -375,7 +385,8 @@ def on_data(self, entry, data, row):
             for gpuid, data in gpudata.items():
                 load = int(data.get("load", 0) * 100)
                 currm, totalm = data.get("memory", [0, 0])
-                temp = int(data.get("temperature", 0))
+                # "temperature" is sometimes reported as None for some GPUs? A10?
+                temp = _parse_int(data.get("temperature", 0), 0)
                 row[f"gpu:{gpuid}"] = (
                     f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
                 )
diff --git a/milabench/remote.py b/milabench/remote.py
index bbf1b4f0f..b657f98c5 100644
--- a/milabench/remote.py
+++ b/milabench/remote.py
@@ -20,6 +20,14 @@
 INSTALL_FOLDER = str(ROOT_FOLDER)
 
 
+def milabench_env() -> list:
+    return [
+        f"{envvar}={os.environ[envvar]}"
+        for envvar in os.environ
+        if envvar.split("_")[0] == "MILABENCH" and os.environ[envvar]
+    ]
+
+
 def scp(node, folder, dest=None) -> list:
     """Copy a folder from local node to remote node"""
     host = node["ip"]
@@ -185,9 +193,7 @@ def milabench_remote_command(pack, *command, run_for="worker") -> ListCommand:
                     CmdCommand(
                         worker_pack(pack, worker),
                         "cd", f"{INSTALL_FOLDER}", "&&",
-                        f"MILABENCH_BASE={os.environ.get('MILABENCH_BASE', '')}",
-                        f"MILABENCH_CONFIG={os.environ.get('MILABENCH_CONFIG', '')}",
-                        f"MILABENCH_SYSTEM={os.environ.get('MILABENCH_SYSTEM', '')}",
+                        *milabench_env(),
                         "milabench", *command
                     ),
                     host=host,

From 4ccea235126018e349572a27e5b086e7a48c23bc Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Fri, 12 Apr 2024 14:42:50 -0400
Subject: [PATCH 11/22] Add docs

---
 docs/dev-usage.rst | 13 +++++++++++++
 docs/usage.rst     |  7 +++++++
 2 files changed, 20 insertions(+)

diff --git a/docs/dev-usage.rst b/docs/dev-usage.rst
index 42a9871e2..58d66fb0c 100644
--- a/docs/dev-usage.rst
+++ b/docs/dev-usage.rst
@@ -97,3 +97,16 @@ milabench compare
 ~~~~~~~~~~~~~~~~~
 
 TODO.
+
+Using milabench on the cloud
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Milabench uses `Terraform <https://developer.hashicorp.com/terraform>`_ through
+`Covalent <https://docs.covalent.xyz/>`_. To add support for a new cloud
+platform you will need to develop a new Covalent plugin with its Terraform
+config. An example is the
+`covalent-azure-plugin <https://github.com/satyaog/covalent-azure-plugin/tree/feature/milabench>`_.
+The interesting parts would be:
+
+* `Terraform provider's related plugin arguments <https://github.com/satyaog/covalent-azure-plugin/blob/feature/milabench/covalent_azure_plugin/azure.py>`_
+* `Terraform provider's configuration <https://github.com/satyaog/covalent-azure-plugin/blob/feature/milabench/covalent_azure_plugin/infra/main.tf>`_
diff --git a/docs/usage.rst b/docs/usage.rst
index 26c308513..76aed5934 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -150,3 +150,10 @@ Run milabench on the cloud
 
 2. | Prepare, install and run milabench
    | ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}``
+
+3. | Destroy the cloud instances
+   | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
+   | or
+   | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
+   | to destroy not just a single cloud instance but all instances on a
+     specified platform that were created from the current local machine

From d7f9366282db77fb17881b3d947c6efba97d8871 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 11:05:15 -0400
Subject: [PATCH 12/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 71e471252..9762ff3ff 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -6,6 +6,7 @@ on:
     branches:
       - master
 
+# Trigger CI
 permissions:
   id-token: write
 

From d132c48b060da7f2ed868476606753e67f7f8528 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 11:07:06 -0400
Subject: [PATCH 13/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 9762ff3ff..051c1d4e9 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -48,7 +48,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
         with:
-          token: ${{ secrets.REPORTS_PAT }}
+          token: ${{ github.token }}
 
       - uses: actions/setup-python@v2
         with:

From 131a98cbcfed1f6ec5235975bf0d00dd6080daf9 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 11:32:36 -0400
Subject: [PATCH 14/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 051c1d4e9..473bab0a2 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -6,7 +6,6 @@ on:
     branches:
       - master
 
-# Trigger CI
 permissions:
   id-token: write
 

From e8e2a6de36d25b30c14f5a3491aa665a49147d91 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 11:36:40 -0400
Subject: [PATCH 15/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 473bab0a2..89c03df67 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -6,6 +6,7 @@ on:
     branches:
       - master
 
+# 
 permissions:
   id-token: write
 

From a72f9b568b39214138c52ddd89d5ffed76e2e634 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 11:40:44 -0400
Subject: [PATCH 16/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 89c03df67..5c745c057 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -57,17 +57,22 @@ jobs:
       # Follow
       # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
       # to generate a clientId as well as a clientSecret
-      - name: Azure login
-        uses: azure/login@v2
-        with:
-          creds: |
-            {
-              "clientId": "${{ secrets.ARM_CLIENT_ID }}",
-              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
-              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
-              "tenantId": "${{ secrets.ARM_TENANT_ID }}"
-            }
-
+      # - name: Azure login
+      #   uses: azure/login@v2
+      #   with:
+      #     creds: |
+      #       {
+      #         "clientId": "${{ secrets.ARM_CLIENT_ID }}",
+      #         "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
+      #         "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
+      #         "tenantId": "${{ secrets.ARM_TENANT_ID }}"
+      #       }
+
+      - name: Azure Login
+        run: |
+          pip install azure-cli
+          az login --service-principal -u "${{ secrets.ARM_CLIENT_ID }}" -p "${{ secrets.ARM_CLIENT_SECRET }}" --tenant "${{ secrets.ARM_TENANT_ID }}"
+          
       - name: dependencies
         run: |
           python -m pip install -U pip

From 78c54582b844ca62fa63719ba68bc5df7f1bd1f4 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 11:57:24 -0400
Subject: [PATCH 17/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 5c745c057..e0db94033 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -6,7 +6,6 @@ on:
     branches:
       - master
 
-# 
 permissions:
   id-token: write
 

From 3b3a5acb5ea30f88f97d5c97f3b24ab8c49f2906 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 12:03:34 -0400
Subject: [PATCH 18/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index e0db94033..473bab0a2 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -56,22 +56,17 @@ jobs:
       # Follow
       # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
       # to generate a clientId as well as a clientSecret
-      # - name: Azure login
-      #   uses: azure/login@v2
-      #   with:
-      #     creds: |
-      #       {
-      #         "clientId": "${{ secrets.ARM_CLIENT_ID }}",
-      #         "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
-      #         "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
-      #         "tenantId": "${{ secrets.ARM_TENANT_ID }}"
-      #       }
-
-      - name: Azure Login
-        run: |
-          pip install azure-cli
-          az login --service-principal -u "${{ secrets.ARM_CLIENT_ID }}" -p "${{ secrets.ARM_CLIENT_SECRET }}" --tenant "${{ secrets.ARM_TENANT_ID }}"
-          
+      - name: Azure login
+        uses: azure/login@v2
+        with:
+          creds: |
+            {
+              "clientId": "${{ secrets.ARM_CLIENT_ID }}",
+              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
+              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
+              "tenantId": "${{ secrets.ARM_TENANT_ID }}"
+            }
+
       - name: dependencies
         run: |
           python -m pip install -U pip

From db65d0562ae4b61617dbdd3e67db83225e3743a4 Mon Sep 17 00:00:00 2001
From: Setepenre <pierre.delaunay.tr@gmail.com>
Date: Mon, 15 Apr 2024 12:48:40 -0400
Subject: [PATCH 19/22] Update cloud-ci.yml

---
 .github/workflows/cloud-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 473bab0a2..7587c59ab 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -9,6 +9,7 @@ on:
 permissions:
   id-token: write
 
+# 
 jobs:
   cloud-tests:
     strategy:

From bc483750383b4632598074535b6915e6d58c1f4a Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Mon, 15 Apr 2024 15:54:46 -0400
Subject: [PATCH 20/22] Fix cloud instance name conflict

The conflict would prevent the CI or multiple contributors from running tests with the same config
---
 docs/usage.rst         | 4 ++++
 milabench/cli/cloud.py | 8 ++++++--
 milabench/common.py    | 6 +++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/docs/usage.rst b/docs/usage.rst
index 76aed5934..b2a25d85d 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -140,6 +140,10 @@ supported cloud profiles.
         username: ubuntu
         size: Standard_B2ats_v2
         location: eastus2
+        # state_prefix and state_id can be set to force a specific cloud
+        # instance id
+        # state_prefix: cloud-ci
+        # state_id: 849897_bivunaku
 
 
 Run milabench on the cloud
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index 150e37749..8d95a47d1 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -1,5 +1,6 @@
 from copy import deepcopy
 import os
+import socket
 import subprocess
 import sys
 
@@ -8,6 +9,7 @@
 import yaml
 
 from milabench.fs import XPath
+from milabench.utils import blabla
 
 from ..common import get_multipack
 
@@ -42,6 +44,8 @@ def manage_cloud(pack, run_on, action="setup"):
     plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
     run_on, *profile = run_on.split("__")
     profile = profile[0] if profile else ""
+    default_state_prefix = profile or run_on
+    default_state_id = "_".join((pack.config["hash"][:6], blabla()))
 
     remote_base = XPath("/data") / pack.dirs.base.name
     local_base = pack.dirs.base.absolute().parent
@@ -51,8 +55,8 @@ def manage_cloud(pack, run_on, action="setup"):
         if n["ip"] != "1.1.1.1":
             continue
 
-        plan_params["state_prefix"] = plan_params.get("state_prefix", None) or profile or run_on
-        plan_params["state_id"] = plan_params.get("state_id", None) or pack.config["hash"]
+        plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
+        plan_params["state_id"] = plan_params.get("state_id", default_state_id)
         plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
 
         import milabench.cli.covalent as cv
diff --git a/milabench/common.py b/milabench/common.py
index ff0388df2..1f22463b6 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -338,9 +338,9 @@ def _push_reports(reports_repo, runs):
         reports_repo = git.repo.base.Repo.clone_from(repo_url, str(reports_repo), branch="reports")
 
     reports_url = ([
-        _r.url for _r in _repo.remotes if "mila-iqia" in _r.url
+        url for _r in _repo.remotes for url in _r.urls if "mila-iqia" in url
     ] or [
-        _r.url for _r in _repo.remotes if _r.name == "origin"
+        url for _r in _repo.remotes for url in _r.urls if _r.name == "origin"
     ])[0]
     reports_url = XPath("github.com".join(reports_url.split("github.com")[1:])[1:])
     reports_url = XPath("https://github.com") / f"{reports_url.with_suffix('')}/tree/{reports_repo.active_branch.name}"
@@ -363,7 +363,7 @@ def _push_reports(reports_repo, runs):
                 device = _meta["cpu"]["brand"].replace(" ", "_")
                 break
 
-        build = "-".join([_repo.active_branch.name.replace(os.path.sep, "_"), next(_repo.iter_commits()).hexsha])
+        build = meta[0]["milabench"]["tag"]
         reports_dir = XPath(reports_repo.working_tree_dir) / build
 
         run = XPath(run)

From 86598c3ae286ddada95c4ab57361a3ccb64d3972 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Tue, 16 Apr 2024 15:06:50 -0400
Subject: [PATCH 21/22] Fix github push in CI

---
 .github/workflows/cloud-ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/cloud-ci.yml b/.github/workflows/cloud-ci.yml
index 7587c59ab..c19954c71 100644
--- a/.github/workflows/cloud-ci.yml
+++ b/.github/workflows/cloud-ci.yml
@@ -9,7 +9,6 @@ on:
 permissions:
   id-token: write
 
-# 
 jobs:
   cloud-tests:
     strategy:
@@ -124,7 +123,7 @@ jobs:
 
       - name: Summary
         run: |
-          git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)"
+          # git remote set-url origin "https://${{ vars.REPORTS_USERNAME }}:${{ secrets.REPORTS_PAT }}@$(git remote get-url origin | cut -d'/' -f3-)"
           git config --global user.email "github-ci@example.com"
           git config --global user.name "GitHub CI"
           poetry run milabench report --push

From 5b5700ca90ed3926d9cd4db1c897f54b356082eb Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Wed, 22 May 2024 14:13:20 -0400
Subject: [PATCH 22/22] Cleaner and tested azure plugin

---
 milabench/cli/cloud.py                        |  23 +-
 milabench/cli/covalent/__main__.py            | 204 ------------------
 milabench/common.py                           |   2 +-
 milabench/{cli => scripts}/badges/__main__.py |   0
 .../{cli => scripts}/badges/requirements.txt  |   0
 milabench/scripts/covalent/__main__.py        | 103 +++++++++
 .../covalent/requirements.txt                 |   0
 milabench/{cli => scripts}/utils.py           |   0
 8 files changed, 118 insertions(+), 214 deletions(-)
 delete mode 100644 milabench/cli/covalent/__main__.py
 rename milabench/{cli => scripts}/badges/__main__.py (100%)
 rename milabench/{cli => scripts}/badges/requirements.txt (100%)
 create mode 100644 milabench/scripts/covalent/__main__.py
 rename milabench/{cli => scripts}/covalent/requirements.txt (100%)
 rename milabench/{cli => scripts}/utils.py (100%)

diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index 8d95a47d1..16dd46d89 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -1,8 +1,8 @@
 from copy import deepcopy
 import os
-import socket
 import subprocess
 import sys
+import warnings
 
 from coleo import Option, tooled
 from omegaconf import OmegaConf
@@ -22,7 +22,7 @@
 def _flatten_cli_args(**kwargs):
     return sum(
         (
-            (f"--{str(k).replace('_', '-')}", *([str(v)] if str(v) else []))
+            (f"--{str(k).replace('_', '-')}", *([str(v)] if v is not None else []))
             for k, v in kwargs.items()
         ), ()
     )
@@ -39,7 +39,7 @@ def manage_cloud(pack, run_on, action="setup"):
         "hostname":(lambda v: ("ip",v)),
         "username":(lambda v: ("user",v)),
         "ssh_key_file":(lambda v: ("key",v)),
-        "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
+        # "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
     }
     plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
     run_on, *profile = run_on.split("__")
@@ -58,8 +58,9 @@ def manage_cloud(pack, run_on, action="setup"):
         plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
         plan_params["state_id"] = plan_params.get("state_id", default_state_id)
         plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
+        plan_params["keep_alive"] = None
 
-        import milabench.cli.covalent as cv
+        import milabench.scripts.covalent as cv
 
         subprocess.run(
             [
@@ -106,12 +107,16 @@ def manage_cloud(pack, run_on, action="setup"):
                 continue
             try:
                 k, v = line_str.split("::>")
-                k, v = key_map[k](v)
-                if k == "ip" and n[k] != "1.1.1.1":
-                    i, n = next(nodes)
-                n[k] = v
             except ValueError:
-                pass
+                continue
+            try:
+                k, v = key_map[k](v)
+            except KeyError:
+                warnings.warn(f"Ignoring invalid key received: {k}:{v}")
+                continue
+            if k == "ip" and n[k] != "1.1.1.1":
+                i, n = next(nodes)
+            n[k] = v
 
         _, stderr = p.communicate()
         stderr = stderr.decode("utf-8").strip()
diff --git a/milabench/cli/covalent/__main__.py b/milabench/cli/covalent/__main__.py
deleted file mode 100644
index eb602ee27..000000000
--- a/milabench/cli/covalent/__main__.py
+++ /dev/null
@@ -1,204 +0,0 @@
-import argparse
-import asyncio
-import os
-import pathlib
-import subprocess
-import sys
-import tempfile
-
-
-def serve(*argv):
-    return subprocess.run([
-        "covalent",
-        *argv
-    ]).returncode
-
-
-def _get_executor_kwargs(args):
-    return {
-        **{k:v for k,v in vars(args).items() if k not in ("setup", "teardown")},
-        **{"action":k for k,v in vars(args).items() if k in ("setup", "teardown") and v},
-    }
-
-
-def executor(executor_cls, args, *argv):
-    import covalent as ct
-
-    executor:ct.executor.BaseExecutor = executor_cls(
-        **_get_executor_kwargs(args),
-    )
-
-    def _popen(cmd, *args, _env=None, **kwargs):
-        _env = _env if _env is not None else {}
-
-        for envvar in _env.keys():
-            envvar_val = _env[envvar]
-
-            if not envvar_val:
-                continue
-
-            envvar_val = pathlib.Path(envvar_val).expanduser()
-            if str(envvar_val) != _env[envvar]:
-                _env[envvar] = str(envvar_val)
-
-        if "MILABENCH_CONFIG_CONTENT" in _env:
-            _config_dir = pathlib.Path(_env["MILABENCH_CONFIG"]).parent
-            with tempfile.NamedTemporaryFile("wt", dir=str(_config_dir), suffix=".yaml", delete=False) as _f:
-                _f.write(_env["MILABENCH_CONFIG_CONTENT"])
-                _env["MILABENCH_CONFIG"] = _f.name
-
-        try:
-            cmd = (str(pathlib.Path(cmd[0]).expanduser()), *cmd[1:])
-        except IndexError:
-            pass
-
-        cwd = kwargs.pop("cwd", None)
-        if cwd is not None:
-            cwd = str(pathlib.Path(cwd).expanduser())
-            kwargs["cwd"] = cwd
-
-        _env = {**os.environ.copy(), **kwargs.pop("env", {}), **_env}
-
-        kwargs = {
-            **kwargs,
-            "env": _env,
-            "stdout": subprocess.PIPE,
-            "stderr": subprocess.PIPE,
-        }
-        p = subprocess.Popen(cmd, *args, **kwargs)
-
-        stdout_chunks = []
-        while True:
-            line = p.stdout.readline()
-            if not line:
-                break
-            line_str = line.decode("utf-8").strip()
-            stdout_chunks.append(line_str)
-            print(line_str)
-
-        _, stderr = p.communicate()
-        stderr = stderr.decode("utf-8").strip()
-        stdout = os.linesep.join(stdout_chunks)
-
-        if p.returncode != 0:
-            raise subprocess.CalledProcessError(
-                p.returncode,
-                (cmd, args, kwargs),
-                stdout,
-                stderr
-            )
-        return p.returncode, stdout, stderr
-
-    @ct.lattice
-    def lattice(argv=(), deps_bash = None):
-        return ct.electron(
-            _popen,
-            executor=executor,
-            deps_bash=deps_bash,
-        )(
-            argv,
-        )
-
-    return_code = 0
-    try:
-        dispatch_id = None
-        result = None
-        deps_bash = None
-
-        if not argv and args.setup:
-            deps_bash = ct.DepsBash([])
-            # Make sure pip is installed
-            argv = ["python3", "-m", "pip", "freeze"]
-
-        if argv:
-            dispatch_id = ct.dispatch(lattice, disable_run=False)(argv, deps_bash=deps_bash)
-            result = ct.get_result(dispatch_id=dispatch_id, wait=True)
-            return_code, _, _ = result.result if result.result is not None else (1, "", "")
-
-        if return_code == 0 and args.setup:
-            _executor:ct.executor.BaseExecutor = executor_cls(
-                **{
-                    **_get_executor_kwargs(args),
-                    **{"action": "teardown"},
-                }
-            )
-            asyncio.run(_executor.setup({}))
-
-            assert _executor.hostnames
-            for hostname in _executor.hostnames:
-                print(f"hostname::>{hostname}")
-                print(f"username::>{_executor.username}")
-                print(f"ssh_key_file::>{_executor.ssh_key_file}")
-    finally:
-        result = ct.get_result(dispatch_id=dispatch_id, wait=False) if dispatch_id else None
-        results_dir = result.results_dir if result else ""
-        if args.teardown:
-            try:
-                _executor:ct.executor.BaseExecutor = executor_cls(
-                    **{
-                        **_get_executor_kwargs(args),
-                        **{"action": "teardown"},
-                    }
-                )
-                asyncio.run(_executor.setup({}))
-                asyncio.run(
-                    _executor.teardown(
-                        {"dispatch_id": dispatch_id, "node_id": 0, "results_dir": results_dir}
-                    )
-                )
-            except FileNotFoundError:
-                pass
-
-    return return_code
-
-
-def main(argv=None):
-    if argv is None:
-        argv = sys.argv[1:]
-
-    try:
-        import covalent as ct
-    except (KeyError, ImportError):
-        from ..utils import run_in_module_venv
-        check_if_module = "import covalent"
-        return run_in_module_venv(__file__, check_if_module, argv)
-
-    parser = argparse.ArgumentParser()
-    subparsers = parser.add_subparsers()
-    subparser = subparsers.add_parser("serve")
-    subparser.add_argument(f"argv", nargs=argparse.REMAINDER)
-    for p in ("azure","ec2"):
-        try:
-            config = ct.get_config(f"executors.{p}")
-        except KeyError:
-            continue
-        subparser = subparsers.add_parser(p)
-        subparser.add_argument(f"--setup", action="store_true")
-        subparser.add_argument(f"--teardown", action="store_true")
-        for param, default in config.items():
-            if param == "action":
-                continue
-            subparser.add_argument(f"--{param.replace('_', '-')}", default=default)
-
-    try:
-        cv_argv, argv = argv[:argv.index("--")], argv[argv.index("--")+1:]
-    except ValueError:
-        cv_argv, argv = argv, []
-
-    args = parser.parse_args(cv_argv)
-
-    if cv_argv[0] == "serve":
-        assert not argv
-        return serve(*args.argv)
-    elif cv_argv[0] == "azure":
-        executor_cls = ct.executor.AzureExecutor
-    elif cv_argv[0] == "ec2":
-        executor_cls = ct.executor.EC2Executor
-    else:
-        raise
-
-    return executor(executor_cls, args, *argv)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/milabench/common.py b/milabench/common.py
index 1f22463b6..ad56ac82b 100644
--- a/milabench/common.py
+++ b/milabench/common.py
@@ -328,7 +328,7 @@ def _push_reports(reports_repo, runs):
         "partial": "yellow",
         "failure": "red",
     }
-    import milabench.cli.badges as badges
+    import milabench.scripts.badges as badges
 
     _repo = git.repo.base.Repo(ROOT_FOLDER)
     try:
diff --git a/milabench/cli/badges/__main__.py b/milabench/scripts/badges/__main__.py
similarity index 100%
rename from milabench/cli/badges/__main__.py
rename to milabench/scripts/badges/__main__.py
diff --git a/milabench/cli/badges/requirements.txt b/milabench/scripts/badges/requirements.txt
similarity index 100%
rename from milabench/cli/badges/requirements.txt
rename to milabench/scripts/badges/requirements.txt
diff --git a/milabench/scripts/covalent/__main__.py b/milabench/scripts/covalent/__main__.py
new file mode 100644
index 000000000..3cd61e007
--- /dev/null
+++ b/milabench/scripts/covalent/__main__.py
@@ -0,0 +1,103 @@
+import argparse
+import subprocess
+import sys
+
+
+def serve(*argv):
+    """Run the ``covalent`` command line with *argv* and return its exit code."""
+    command = ["covalent", *argv]
+    completed = subprocess.run(command)
+    return completed.returncode
+
+
+def _get_executor_kwargs(args):
+    """Return the parsed options as a dict, without the setup/teardown flags."""
+    actions = ("setup", "teardown")
+    return {key: value for key, value in vars(args).items() if key not in actions}
+
+
+def executor(executor_cls, args):
+    """Set up and/or tear down a cloud instance through a covalent executor.
+
+    With ``args.setup`` the connection attributes of each host are printed as
+    ``key::>value`` lines; with ``args.teardown`` the instance is stopped
+    afterwards, even if setup failed. Returns 0 when no error is raised.
+    """
+    import covalent as ct
+
+    # Build the executor before entering ``try`` so that a constructor failure
+    # is not masked by an UnboundLocalError raised from the ``finally`` clause.
+    _executor: ct.executor.BaseExecutor = executor_cls(**_get_executor_kwargs(args))
+    try:
+        if args.setup:
+            dispatch_id = ct.dispatch(
+                ct.lattice(_executor.get_connection_attributes), disable_run=False
+            )()
+            result = ct.get_result(dispatch_id=dispatch_id, wait=True).result
+            assert result and result[0]
+            all_connection_attributes, _ = result
+            for hostname, connection_attributes in all_connection_attributes.items():
+                print(f"hostname::>{hostname}")
+                for attribute, value in connection_attributes.items():
+                    if attribute != "hostname":
+                        print(f"{attribute}::>{value}")
+    finally:
+        if args.teardown:
+            _executor.stop_cloud_instance({})
+
+    return 0
+
+
+def main(argv=None):
+    """Parse *argv* (default ``sys.argv[1:]``) and run ``serve``, ``azure`` or ``ec2``.
+    Only the arguments before a literal ``--`` are parsed; an unknown or missing
+    command is reported as an argparse usage error. Returns the command's code.
+    """
+    if argv is None:
+        argv = sys.argv[1:]
+
+    try:
+        import covalent as ct
+    except (KeyError, ImportError):
+        from ..utils import run_in_module_venv
+        return run_in_module_venv(__file__, "import covalent", argv)
+
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers()
+    subparser = subparsers.add_parser("serve")
+    subparser.add_argument("argv", nargs=argparse.REMAINDER)
+    for p in ("azure", "ec2"):
+        try:
+            config = ct.get_config(f"executors.{p}")
+        except KeyError:
+            continue  # no config entry for this executor: skip its sub-command
+        subparser = subparsers.add_parser(p)
+        subparser.add_argument("--setup", action="store_true")
+        subparser.add_argument("--teardown", action="store_true")
+        for param, default in config.items():
+            # Boolean settings become flags that flip the configured default.
+            if isinstance(default, bool):
+                kwargs = {"action": "store_false" if default else "store_true"}
+            else:
+                kwargs = {"default": default}
+            subparser.add_argument(f"--{param.replace('_', '-')}", **kwargs)
+
+    split = argv.index("--") if "--" in argv else len(argv)
+    cv_argv, argv = argv[:split], argv[split + 1:]
+
+    args = parser.parse_args(cv_argv)
+    command = cv_argv[0] if cv_argv else None
+
+    if command == "serve":
+        assert not argv
+        return serve(*args.argv)
+    if command == "azure":
+        return executor(ct.executor.AzureExecutor, args)
+    if command == "ec2":
+        return executor(ct.executor.EC2Executor, args)
+    # A bare ``raise`` has no active exception here; report a usage error instead.
+    parser.error(f"missing or unsupported command: {command!r}")
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/milabench/cli/covalent/requirements.txt b/milabench/scripts/covalent/requirements.txt
similarity index 100%
rename from milabench/cli/covalent/requirements.txt
rename to milabench/scripts/covalent/requirements.txt
diff --git a/milabench/cli/utils.py b/milabench/scripts/utils.py
similarity index 100%
rename from milabench/cli/utils.py
rename to milabench/scripts/utils.py