# .github/workflows/_test_te.yaml

name: "~test TransformerEngine"

# Reusable workflow: the only trigger is `workflow_call`, so it runs when
# another workflow invokes it with the inputs declared below.
on:
  workflow_call:
    inputs:
      # Container image handed to the test job as its IMAGE input.
      # NOTE(review): `required: true` is combined with a `default`; confirm
      # whether callers are really expected to always pass this input.
      TE_IMAGE:
        description: "JAX+TE+PAXML image"
        type: string
        required: true
        default: "ghcr.io/nvidia/upstream-pax:latest"
      # Prefix shared by the per-GPU-count job names and by the sitrep/badge
      # artifact names generated in this workflow.
      ARTIFACT_PREFIX:
        description: "Name of the artifact zip file"
        type: string
        required: false
        default: "te"

jobs:
  # Runs the TransformerEngine JAX encoder example tests once per GPU count in
  # the matrix by delegating to the reusable Slurm/pyxis workflow.
  te-multi-gpu:
    strategy:
      # Let the other GPU configurations finish even when one of them fails.
      fail-fast: false
      matrix:
        N_GPU:
          - 2
          - 4
          - 8
    uses: ./.github/workflows/_test_slurm_pyxis.yaml
    secrets:
      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
      SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
    with:
      # One distinct name per matrix leg, e.g. `<prefix>-2GPU`.
      NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
      IMAGE: ${{ inputs.TE_IMAGE }}
      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
      OUTPUT_BASEDIR: /nfs/cluster
      # The test script below writes its pytest report log under this path.
      OUTPUT_MOUNTPOINT: /output
      # Single node, single task; only the GPU count varies across the matrix.
      NODES: 1
      GPUS_PER_NODE: ${{ matrix.N_GPU }}
      NTASKS: 1
      NTASKS_PER_NODE: 1
      TIME_LIMIT: "00:10:00"
      EXTRA_EXPORTS: "VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"
      # Shows the visible GPUs, then installs pytest, the report-log plugin,
      # cuda-python and the encoder example's own requirements.
      SRUN_PREAMBLE: |
        nvidia-smi
        pip install \
          pytest \
          pytest-reportlog \
          cuda-python \
          -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
      # Runs the three encoder test modules and records the results as a JSON
      # lines report under /output.
      SRUN_SCRIPT: |
        set -ex
        cd ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder
        pytest --report-log=/output/pytest-report.jsonl \
          test_single_gpu_encoder.py \
          test_multigpu_encoder.py \
          test_model_parallel_encoder.py

  # Gathers the pytest report logs from the artifacts matching
  # `<ARTIFACT_PREFIX>-*`, condenses them into `sitrep.json` plus a badge JSON
  # file (schemaVersion/label/message/color), and uploads both as one artifact.
  sitrep:
    needs: te-multi-gpu
    # Run whether the test matrix succeeded or failed, so that failures still
    # produce a sitrep and a badge.
    if: success() || failure()
    runs-on: ubuntu-latest
    env:
      ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
      BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: |
            ${{ inputs.ARTIFACT_PREFIX }}-*
          # Extract each matched artifact into its own directory. Merging them
          # into one directory would let identically named files (presumably
          # one pytest-report.jsonl per GPU configuration - confirm against
          # _test_slurm_pyxis.yaml) overwrite one another. The next step finds
          # the reports recursively, so the extra directory level is harmless.
          merge-multiple: false

      - name: Generate sitrep
        shell: bash -x -e {0}
        run: |
          # bring in utility functions (presumably defines `to_json`, used below)
          source .github/workflows/scripts/to_json.sh

          # Concatenate every report log found under the workspace. Using
          # `find -exec` stays well-defined when no report exists (a bare
          # `cat` without arguments would read stdin instead) and when a path
          # contains whitespace.
          all_reports=$(mktemp)
          find . -name pytest-report.jsonl -exec cat {} + > "${all_reports}"

          badge_label='TE Multi GPU tests'
          # pytest-reportlog records test outcomes as `TestReport` entries,
          # one per phase (setup/call/teardown); `CollectReport` entries only
          # describe collection and never carry a test failure.
          # A test counts as passed when its `call` phase passed, and as
          # failed when its `setup` or `call` phase failed. At most one of
          # those applies per test run, so nothing is counted twice.
          passed_tests=$(jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' "${all_reports}" | wc -l)
          failed_tests=$(jq -r 'select(."$report_type" == "TestReport" and (.when == "setup" or .when == "call") and .outcome == "failed") | .outcome' "${all_reports}" | wc -l)
          total_tests=$((failed_tests + passed_tests))

          if [[ ${total_tests} == 0 ]]; then
            # No test result at all: the jobs did not get as far as running tests.
            badge_message='error'
            badge_color=red
            summary='TE multi GPU tests did not complete due to errors.'
          else
            badge_message="${passed_tests}/${total_tests} passed"
            if [[ ${failed_tests} == 0 ]]; then
              badge_color=brightgreen
            else
              badge_color=yellow
            fi
            summary="TE multi GPU tests : $badge_message"
          fi

          # Serialize the listed shell variables into the sitrep file.
          run_id=${{ github.run_id }} \
          to_json \
            run_id \
            summary \
            total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          # Badge description file; the name comes from the job-level env and
          # is quoted so an unusual artifact prefix cannot split the redirect.
          schemaVersion=1 \
          label="${badge_label}" \
          message="${badge_message}" \
          color="${badge_color}" \
          to_json schemaVersion label message color \
          > "${BADGE_FILENAME_FULL}"

      # Despite the step name, only the generated sitrep and badge files are
      # uploaded here.
      - name: Upload training logs as artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME_FULL }}
          path: |
            sitrep.json
            ${{ env.BADGE_FILENAME_FULL }}