-
Notifications
You must be signed in to change notification settings - Fork 26
172 lines (149 loc) · 5.65 KB
/
cloud-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Cloud CI workflow.
#
# For every pull request targeting `master`, this workflow provisions cloud
# machines through `milabench cloud --setup`, installs/prepares/runs the
# selected benchmarks, pushes a report, and finally tears the cloud
# resources down (teardown and debug steps run even when earlier steps fail).
name: cloud-tests

on:
  # Runs for pull requests
  pull_request:
    branches:
      - master

permissions:
  id-token: write
  # Write access is requested for the "Summary" step, which runs
  # `milabench report --push` with the workflow token.
  contents: write

jobs:
  cloud-tests:
    strategy:
      fail-fast: true
      max-parallel: 1
      matrix:
        # Must be one of the values handled by the `case` statement in the
        # "setup cloud" step: 1x_gpu, 2x_gpu, 2x_node.
        system: ["2x_gpu"]
        include:
          - arch: cuda
            exclude: "no-cuda"
          # - arch: rocm
          #   exclude: "no-rocm"

    runs-on: ubuntu-latest
    environment: cloud-ci

    # Cancel previous jobs if a new version was pushed
    concurrency:
      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
      cancel-in-progress: true

    defaults:
      run:
        shell: bash -el {0}

    env:
      MILABENCH_CONFIG: "config/standard.yaml"
      # Default system file; the "setup cloud" step overrides it through
      # $GITHUB_ENV once the target system is known.
      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
      MILABENCH_BASE: "output"
      MILABENCH_ARGS: ""
      MILABENCH_DASH: "no"
      ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
      ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
      AZURE_CORE_OUTPUT: "none"
      # Comma-separated benchmark names used to build --select / --exclude
      # arguments in the "setup cloud" step.
      _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,resnet152-ddp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus,llm-full-mp-gpus"
      _MULTI_NODES: "diffusion-nodes,dinov2-giant-nodes,llm-lora-ddp-nodes,llm-full-mp-nodes"

    steps:
      - uses: actions/checkout@v4
        with:
          token: ${{ github.token }}

      - uses: actions/setup-python@v5
        with:
          # Quoted so the version is read as a string, not a float.
          python-version: "3.9"

      # Follow
      # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
      # to generate a clientId as well as a clientSecret
      - name: Azure login
        uses: azure/login@v2
        with:
          creds: |
            {
              "clientId": "${{ secrets.ARM_CLIENT_ID }}",
              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
              "tenantId": "${{ secrets.ARM_TENANT_ID }}"
            }

      - name: dependencies
        run: |
          python -m pip install -U pip
          python -m pip install -U poetry
          poetry lock --no-update
          poetry install

      # Writes the SSH key pair and AWS credentials files under $HOME.
      # Secrets are handed to the script through `env` rather than being
      # expanded into the script text, so quotes, `$` or backticks inside a
      # secret cannot change what the shell executes.
      - name: setup cloud credentials
        env:
          COVALENT_EC2_EXECUTOR_KEYPAIR: ${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          # Create everything owner-only from the start.
          umask 077
          mkdir -p ~/.aws
          mkdir -p ~/.ssh/covalent
          printf '%s\n' "$COVALENT_EC2_EXECUTOR_KEYPAIR" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
          {
            echo "[default]"
            echo "aws_access_key_id=$AWS_ACCESS_KEY_ID"
            echo "aws_secret_access_key=$AWS_SECRET_ACCESS_KEY"
          } >~/.aws/credentials
          chmod -R a-rwx,u+rwX ~/.aws ~/.ssh

      - name: start covalent server
        run: |
          poetry run -- python3 -m milabench.scripts.covalent serve start --develop

      # Picks the system file, cloud target and benchmark selection for the
      # current matrix entry, then runs `milabench cloud --setup` and stores
      # its stdout as "<system file>.<RUN_ON>", which later steps use as
      # MILABENCH_SYSTEM.
      - name: setup cloud
        env:
          MATRIX_SYSTEM: "${{ matrix.system }}"
        run: |
          case "$MATRIX_SYSTEM" in
            "1x_gpu")
              export MILABENCH_SYSTEM="config/cloud-system.yaml"
              export RUN_ON="azure__a100"
              export SELECT=
              export EXCLUDES="--exclude $_MULTI_GPUS,$_MULTI_NODES,llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
              ;;
            "2x_gpu")
              export MILABENCH_SYSTEM="config/cloud-system.yaml"
              export RUN_ON="azure__a100_x2"
              export SELECT="--select $_MULTI_GPUS"
              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
              ;;
            "2x_node")
              export MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
              export RUN_ON="azure__a100"
              export SELECT="--select $_MULTI_NODES"
              export EXCLUDES="--exclude llm-full-mp-gpus,llm-full-mp-nodes,llm-lora-ddp-gpus,llm-lora-ddp-nodes,llm-lora-mp-gpus,llm-lora-single"
              ;;
            *)
              echo "Unsupported matrix.system: '$MATRIX_SYSTEM'" >&2
              exit 1
              ;;
          esac

          generated_system="$MILABENCH_SYSTEM.$RUN_ON"

          # Record the selection BEFORE provisioning so that it is written
          # to $GITHUB_ENV even if `milabench cloud --setup` fails part-way;
          # the `if: always()` teardown step needs RUN_ON to clean up.
          {
            echo "RUN_ON=$RUN_ON"
            echo "SELECT=$SELECT"
            echo "EXCLUDES=$EXCLUDES"
            echo "MILABENCH_SYSTEM=$generated_system"
          } >>"$GITHUB_ENV"

          poetry run milabench cloud \
            --setup \
            --run-on "$RUN_ON" \
            --system "$MILABENCH_SYSTEM" >"$generated_system"

      # NOTE: $SELECT and $EXCLUDES are intentionally left unquoted in the
      # three steps below: each holds either nothing or "<flag> <list>" and
      # relies on shell word splitting to become separate arguments.
      - name: install benchmarks
        run: |
          poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDES

      - name: prepare benchmarks
        run: |
          poetry run milabench prepare $SELECT $EXCLUDES

      - name: run benchmarks
        run: |
          poetry run milabench run $SELECT $EXCLUDES

      # Configures git to authenticate with the workflow token, then pushes
      # the report. GITHUB_SERVER_URL and GITHUB_ACTOR are the runner-provided
      # equivalents of github.server_url and github.actor.
      - name: Summary
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          git config "credential.$GITHUB_SERVER_URL.username" "$GITHUB_ACTOR"
          git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f'
          # NOTE(review): this address looks like a redacted placeholder;
          # confirm the intended committer e-mail.
          git config --global user.email "[email protected]"
          git config --global user.name "GitHub CI"
          poetry run milabench report --push

      # NOTE(review): this prints Terraform state to the job log. State files
      # can contain sensitive values that are not registered as GitHub
      # secrets (and therefore not masked) -- confirm this is acceptable
      # before keeping the step enabled.
      - name: DEBUG state file
        if: always()
        run: |
          cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate

      # Always attempt to release the cloud resources, even if a previous
      # step failed.
      - name: teardown cloud
        if: always()
        run: |
          if [[ -z "${RUN_ON:-}" ]]
          then
            echo "RUN_ON is not set: the 'setup cloud' step did not select a target, cannot tear down." >&2
            exit 1
          fi
          # MILABENCH_SYSTEM may be "<system file>.<RUN_ON>" (set by the
          # "setup cloud" step); fall back to the file without that suffix
          # when it exists.
          if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
          then
            export MILABENCH_SYSTEM="${MILABENCH_SYSTEM%.*}"
          fi
          poetry run milabench cloud \
            --teardown \
            --run-on "$RUN_ON" \
            --all

      - name: DEBUG logs
        if: always()
        run: |
          cat ~/.cache/covalent/covalent_ui.log