Skip to content

Satyaog/feature/covalent #66

Satyaog/feature/covalent

Satyaog/feature/covalent #66

Workflow file for this run

name: cloud-tests
on:
# Runs for pull requests
pull_request:
branches:
- master
permissions:
id-token: write
contents: write
jobs:
cloud-tests:
strategy:
fail-fast: true
max-parallel: 1
matrix:
system: [1x_gpu, 2x_gpu]
include:
- arch: cuda
exclude: "no-cuda"
# - arch: rocm
# exclude : "no-rocm"
runs-on: ubuntu-latest
environment: cloud-ci
# Cancel previous jobs if a new version was pushed
concurrency:
group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
cancel-in-progress: true
defaults:
run:
shell: bash -el {0}
env:
MILABENCH_CONFIG: "config/standard.yaml"
MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
MILABENCH_BASE: "output"
MILABENCH_ARGS: ""
MILABENCH_DASH: "no"
ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
AZURE_CORE_OUTPUT: none
_MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
_MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"
steps:
- uses: actions/checkout@v3
with:
token: ${{ github.token }}
- uses: actions/setup-python@v2
with:
python-version: 3.9
# Follow
# https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
# to generate a clientId as well as a clientSecret
- name: Azure login
uses: azure/login@v2
with:
creds: |
{
"clientId": "${{ secrets.ARM_CLIENT_ID }}",
"clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
"subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
"tenantId": "${{ secrets.ARM_TENANT_ID }}"
}
- name: dependencies
run: |
python -m pip install -U pip
python -m pip install -U poetry
poetry lock --no-update
poetry install
- name: setup cloud credentials
run: |
mkdir -p ~/.aws
mkdir -p ~/.ssh/covalent
echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
echo "[default]" >~/.aws/credentials
echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials
echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials
chmod -R a-rwx,u+rwX ~/.aws ~/.ssh
- name: start covalent server
run: |
poetry run -- python3 -m milabench.scripts.covalent serve start --develop
- name: setup cloud
run: |
case "${{ matrix.system }}" in
"1x_gpu")
export MILABENCH_SYSTEM="config/cloud-system.yaml"
export RUN_ON="azure__a100"
export SELECT=
export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
"2x_gpu")
export MILABENCH_SYSTEM="config/cloud-system.yaml"
export RUN_ON="azure__a100_x2"
export SELECT="--select $_MULTI_GPUS"
export EXCLUDES=
;;
"2x_node")
export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
export RUN_ON="azure__a100"
export SELECT="--select $_MULTI_NODES"
export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
;;
*)
exit 1
;;
esac
poetry run milabench cloud \
--setup \
--run-on $RUN_ON \
--system "$MILABENCH_SYSTEM" >$MILABENCH_SYSTEM.$RUN_ON
echo "RUN_ON=$RUN_ON" >>$GITHUB_ENV
echo "SELECT=$SELECT" >>$GITHUB_ENV
echo "EXCLUDES=$EXCLUDES" >>$GITHUB_ENV
echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>$GITHUB_ENV
- name: install benchmarks
run: |
poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES
- name: prepare benchmarks
run: |
poetry run milabench prepare $SELECT $EXCLUDES
- name: run benchmarks
run: |
poetry run milabench run --no-report $SELECT $EXCLUDES
- name: Summary
run: |
git config credential.${{ github.server_url }}.username ${{ github.actor }}
git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f'
git config --global user.email "[email protected]"
git config --global user.name "GitHub CI"
poetry run milabench report --push
env:
GITHUB_TOKEN: ${{ github.token }}
- name: DEBUG state file
if: always()
run: |
cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate
- name: teardown cloud
if: always()
run: |
if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
then
export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*}
fi
poetry run milabench cloud \
--teardown \
--run-on $RUN_ON \
--all
- name: DEBUG logs
if: always()
run: |
cat ~/.cache/covalent/covalent_ui.log