# Satyaog/feature/covalent #71
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: cloud-tests

on:
  # Runs for pull requests
  pull_request:
    branches:
      - master

# Token permissions granted to every job in this workflow.
# NOTE(review): `contents: write` is presumably required by the Summary
# step, which runs `milabench report --push` with the workflow token;
# confirm whether `id-token: write` is still needed.
permissions:
  id-token: write
  contents: write
jobs:
  # Provisions cloud machines, installs/prepares/runs the selected
  # benchmarks through milabench, pushes the report, then tears the
  # cloud resources down again.
  cloud-tests:
    strategy:
      fail-fast: true
      max-parallel: 1
      matrix:
        # The "setup cloud" step also handles 1x_gpu and 2x_node.
        system: [2x_gpu]
        include:
          - arch: cuda
            exclude: "no-cuda"
          # - arch: rocm
          #   exclude: "no-rocm"

    runs-on: ubuntu-latest
    environment: cloud-ci

    # Cancel previous jobs if a new version was pushed
    concurrency:
      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
      cancel-in-progress: true

    defaults:
      run:
        shell: bash -el {0}

    env:
      MILABENCH_CONFIG: "config/standard.yaml"
      # Default only: the "setup cloud" step replaces this value (through
      # GITHUB_ENV) with the system file it generates.
      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
      MILABENCH_BASE: "output"
      MILABENCH_ARGS: ""
      MILABENCH_DASH: "no"
      ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
      ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
      AZURE_CORE_OUTPUT: "none"
      # Benchmark name lists used by "setup cloud" to build the
      # --select / --exclude arguments.
      _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
      _MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"

    steps:
      - uses: actions/checkout@v3
        with:
          token: ${{ github.token }}

      - uses: actions/setup-python@v2
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: "3.9"

      # Follow
      # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
      # to generate a clientId as well as a clientSecret
      - name: Azure login
        uses: azure/login@v2
        with:
          creds: |
            {
              "clientId": "${{ secrets.ARM_CLIENT_ID }}",
              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
              "tenantId": "${{ secrets.ARM_TENANT_ID }}"
            }

      - name: dependencies
        run: |
          python -m pip install -U pip
          python -m pip install -U poetry
          poetry lock --no-update
          poetry install

      - name: setup cloud credentials
        # Secrets are handed to the script through environment variables
        # instead of being expanded into the script text, so quotes, `$` or
        # backticks inside a secret are never interpreted by the shell.
        env:
          KEYPAIR_PEM: ${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}
          AWS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          # Create the credential files private from the start.
          umask 077
          mkdir -p ~/.aws
          mkdir -p ~/.ssh/covalent
          printf '%s\n' "$KEYPAIR_PEM" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
          echo "[default]" >~/.aws/credentials
          printf 'aws_access_key_id=%s\n' "$AWS_KEY_ID" >>~/.aws/credentials
          printf 'aws_secret_access_key=%s\n' "$AWS_SECRET_KEY" >>~/.aws/credentials
          chmod -R a-rwx,u+rwX ~/.aws ~/.ssh

      - name: start covalent server
        run: |
          poetry run -- python3 -m milabench.scripts.covalent serve start --develop

      - name: setup cloud
        run: |
          # Pick the system file, the cloud target and the benchmark
          # selection that match the matrix entry.
          case "${{ matrix.system }}" in
            "1x_gpu")
              export MILABENCH_SYSTEM="config/cloud-system.yaml"
              export RUN_ON="azure__a100"
              export SELECT=
              export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
              ;;
            "2x_gpu")
              export MILABENCH_SYSTEM="config/cloud-system.yaml"
              export RUN_ON="azure__a100_x2"
              export SELECT="--select $_MULTI_GPUS"
              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
              ;;
            "2x_node")
              export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
              export RUN_ON="azure__a100"
              export SELECT="--select $_MULTI_NODES"
              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
              ;;
            *)
              # Unknown matrix entry: fail before anything is provisioned.
              exit 1
              ;;
          esac
          # Publish the choices to later steps BEFORE provisioning: if the
          # setup command below fails part-way, the always() teardown step
          # still receives RUN_ON and the matching system file name.
          {
            echo "RUN_ON=$RUN_ON"
            echo "SELECT=$SELECT"
            echo "EXCLUDES=$EXCLUDES"
            echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON"
          } >>"$GITHUB_ENV"
          # The command's stdout becomes the system file used by the
          # following steps (<base system file>.<RUN_ON>).
          poetry run milabench cloud \
            --setup \
            --run-on "$RUN_ON" \
            --system "$MILABENCH_SYSTEM" >"$MILABENCH_SYSTEM.$RUN_ON"

      - name: install benchmarks
        run: |
          poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES

      - name: prepare benchmarks
        run: |
          poetry run milabench prepare $SELECT $EXCLUDES

      - name: run benchmarks
        run: |
          poetry run milabench run $SELECT $EXCLUDES

      - name: Summary
        run: |
          # Let git authenticate with the workflow token for the push below.
          git config credential.${{ github.server_url }}.username ${{ github.actor }}
          git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f'
          # NOTE(review): this e-mail value looks like an obfuscation
          # placeholder rather than a real address; confirm the intended one.
          git config --global user.email "[email protected]"
          git config --global user.name "GitHub CI"
          poetry run milabench report --push
        env:
          GITHUB_TOKEN: ${{ github.token }}

      # NOTE(review): Terraform state files can contain sensitive values;
      # confirm that printing them to the job log is acceptable.
      - name: DEBUG state file
        if: always()
        run: |
          cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate

      - name: teardown cloud
        if: always()
        run: |
          # RUN_ON is recorded right before provisioning starts; when it is
          # missing, the cloud setup command never ran.
          if [[ -z "${RUN_ON:-}" ]]
          then
            echo "RUN_ON is not set: cloud setup never started, nothing to tear down"
            exit 0
          fi
          # Drop the ".<RUN_ON>" suffix added by "setup cloud" to get back to
          # the base system file, when that file exists.
          if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
          then
            export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*}
          fi
          poetry run milabench cloud \
            --teardown \
            --run-on "$RUN_ON" \
            --all

      - name: DEBUG logs
        if: always()
        run: |
          cat ~/.cache/covalent/covalent_ui.log