-
Notifications
You must be signed in to change notification settings - Fork 48
121 lines (110 loc) · 3.94 KB
/
_test_te.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Reusable workflow (invoked via `workflow_call`) that runs the
# TransformerEngine JAX encoder example tests and publishes a sitrep/badge
# summary of the results.
name: '~test TransformerEngine'

on:
  workflow_call:
    inputs:
      # Container image the tests run in.
      TE_IMAGE:
        type: string
        description: 'JAX+TE+PAXML image'
        # Optional: a required `workflow_call` input must always be supplied
        # by the caller, which would make the default below unreachable.
        required: false
        default: 'ghcr.io/nvidia/upstream-pax:latest'
      # Prefix shared by the per-GPU-count test artifacts and the sitrep/badge
      # artifact names.
      ARTIFACT_PREFIX:
        type: string
        description: 'Name of the artifact zip file'
        required: false
        default: 'te'
jobs:
  # Runs the TransformerEngine JAX encoder example tests through the reusable
  # `_test_slurm_pyxis.yaml` workflow, once for each GPU count in the matrix.
  te-multi-gpu:
    uses: ./.github/workflows/_test_slurm_pyxis.yaml
    strategy:
      matrix:
        N_GPU: [2, 4, 8]
      # Let the remaining GPU configurations finish even if one of them fails.
      fail-fast: false
    secrets:
      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
      SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
    with:
      # One artifact per matrix entry, e.g. `<prefix>-2GPU`.
      NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
      OUTPUT_BASEDIR: /nfs/cluster
      OUTPUT_MOUNTPOINT: /output
      NODES: 1
      GPUS_PER_NODE: ${{ matrix.N_GPU }}
      NTASKS: 1
      NTASKS_PER_NODE: 1
      TIME_LIMIT: '00:10:00'
      EXTRA_EXPORTS: 'VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
      IMAGE: ${{ inputs.TE_IMAGE }}
      # Commands run before the test script: show the visible GPUs and install
      # the test dependencies.
      SRUN_PREAMBLE: |
        nvidia-smi
        pip install \
          pytest \
          pytest-reportlog \
          cuda-python \
          -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
      # The pytest report is written under /output (the OUTPUT_MOUNTPOINT
      # above); the sitrep job below looks for it by file name.
      SRUN_SCRIPT: |
        set -ex
        cd ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder
        pytest --report-log=/output/pytest-report.jsonl \
          test_single_gpu_encoder.py \
          test_multigpu_encoder.py \
          test_model_parallel_encoder.py

  # Aggregates the pytest reports of all matrix entries into `sitrep.json` and
  # a badge JSON file, then uploads both as a single artifact.
  sitrep:
    needs: te-multi-gpu
    # Run whether the tests succeeded or failed, but not when the run was
    # cancelled.
    if: success() || failure()
    runs-on: ubuntu-latest
    env:
      ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
      BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: |
            ${{ inputs.ARTIFACT_PREFIX }}-*
          # NOTE(review): merging extracts all matching artifacts into one
          # directory; confirm the per-GPU reports do not share the same
          # relative path, otherwise they would overwrite each other.
          merge-multiple: true

      - name: Generate sitrep
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh

          # Print the number of tests that finished with outcome $1, summed
          # over every downloaded pytest report.
          #
          # pytest-reportlog emits one "TestReport" per test phase
          # (setup/call/teardown), so only the "call" phase is counted to make
          # each test contribute once. "CollectReport" entries describe the
          # collection of modules/classes, not test results, and are ignored.
          #
          # `find -exec cat {} +` yields empty input when no report exists
          # (instead of `cat` falling back to stdin) and is safe for paths
          # containing whitespace.
          count_outcome() {
            find . -name pytest-report.jsonl -exec cat {} + \
              | jq -r --arg outcome "$1" 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == $outcome) | .outcome' \
              | wc -l
          }

          badge_label='TE Multi GPU tests'
          passed_tests=$(count_outcome passed)
          failed_tests=$(count_outcome failed)
          total_tests=$((failed_tests + passed_tests))

          if [[ ${total_tests} == 0 ]]; then
            # No test results at all: the run broke before any test executed.
            badge_message='error'
            badge_color=red
            summary='TE multi GPU tests did not complete due to errors.'
          else
            badge_message="${passed_tests}/${total_tests} passed"
            if [[ ${failed_tests} == 0 ]]; then
              badge_color=brightgreen
            else
              badge_color=yellow
            fi
            summary="TE multi GPU tests : $badge_message"
          fi

          # The variable names below are passed to to_json by name, so they
          # must stay unchanged.
          run_id=${{ github.run_id }} \
          to_json \
            run_id \
            summary \
            total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          # Read the badge file name from the job-level environment rather
          # than template-expanding it into the script: it derives from a
          # caller-supplied input.
          schemaVersion=1 \
          label="${badge_label}" \
          message="${badge_message}" \
          color="${badge_color}" \
          to_json schemaVersion label message color \
          > "${BADGE_FILENAME_FULL}"

      - name: Upload sitrep and badge as artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME_FULL }}
          path: |
            sitrep.json
            ${{ env.BADGE_FILENAME_FULL }}