Skip to content

[fix] multi-node backward slowdown #8674

[fix] multi-node backward slowdown

[fix] multi-node backward slowdown #8674

Workflow file for this run

name: Build on PR
on:
pull_request:
types: [synchronize, opened, reopened, ready_for_review, closed]
branches:
- "main"
- "develop"
- "feature/**"
paths:
- ".github/workflows/build_on_pr.yml" # run command & env variables change
- "colossalai/**" # source code change
- "!colossalai/**.md" # ignore doc change
- "op_builder/**" # cuda extension change
- "!op_builder/**.md" # ignore doc change
- "requirements/**" # requirements change
- "tests/**" # test change
- "!tests/**.md" # ignore doc change
- "pytest.ini" # test config change
- "setup.py" # install command change
create:
delete:
jobs:
detect:
name: Detect file change
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'ready_for_review') &&
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
outputs:
changedExtenisonFiles: ${{ steps.find-extension-change.outputs.all_changed_files }}
anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
cancel-in-progress: true
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Locate base commit
id: locate-base-sha
run: |
curBranch=$(git rev-parse --abbrev-ref HEAD)
commonCommit=$(git merge-base origin/main $curBranch)
echo $commonCommit
echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
- name: Find the changed extension-related files
id: find-extension-change
uses: tj-actions/changed-files@v35
with:
base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
files: |
op_builder/**
colossalai/kernel/**
setup.py
- name: Find the changed library-related files
id: find-lib-change
uses: tj-actions/changed-files@v35
with:
base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
files: |
**/*.py
**/*.h
**/*.cpp
**/*.cu
**/*.txt
- name: List changed files
run: |
for file in ${{ steps.find-extension-change.outputs.all_changed_files }}; do
echo "$file was changed"
done
for file in ${{ steps.find-lib-change.outputs.all_changed_files }}; do
echo "$file was changed"
done
build:
name: Build and Test Colossal-AI
needs: detect
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
timeout-minutes: 90
defaults:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test
cancel-in-progress: true
steps:
- name: Checkout TensorNVMe
uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Restore TensorNVMe Cache
run: |
if [ -d /github/home/tensornvme_cache ] && [ ! -z "$(ls -A /github/home/tensornvme_cache/)" ]; then
cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
fi
- name: Install TensorNVMe
run: |
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
DISABLE_URING=1 pip install -v .
- name: Store TensorNVMe Cache
run: |
cd TensorNVMe
cp -p -r ./build /github/home/tensornvme_cache/
cp -p -r ./cmake-build /github/home/tensornvme_cache/
- name: Checkout Colossal-AI
uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Restore Colossal-AI Cache
if: needs.detect.outputs.anyExtensionFileChanged != 'true'
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
fi
- name: Install Colossal-AI
run: |
BUILD_EXT=1 pip install -v -e .
pip install --no-cache-dir -r requirements/requirements-test.txt
- name: Store Colossal-AI Cache
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- name: Execute Unit Testing
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
-m "not largedist" \
--durations=0 \
--ignore tests/test_analyzer \
--ignore tests/test_auto_parallel \
--ignore tests/test_fx \
--ignore tests/test_autochunk \
--ignore tests/test_gptq \
--ignore tests/test_infer_ops \
--ignore tests/test_legacy \
--ignore tests/test_smoothquant \
tests/
env:
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
- name: Collate artifact
env:
PR_NUMBER: ${{ github.event.number }}
changedLibraryFiles: ${{ needs.detect.outputs.changedLibraryFiles }}
anyLibraryFileChanged: ${{ needs.detect.outputs.anyLibraryFileChanged }}
changedExtenisonFiles: ${{ needs.detect.outputs.changedExtenisonFiles }}
run: |
mkdir report
echo $PR_NUMBER > ./report/pr_number
# generate coverage.xml if any
if [ "$anyLibraryFileChanged" == "true" ] && [ -e .coverage ]; then
allFiles=""
for file in $changedLibraryFiles; do
if [ "$allFiles" == "" ]; then
allFiles=$file
else
allFiles=$allFiles,$file
fi
done
coverage report --data-file .coverage --include $allFiles > ./coverage.txt
covPercentage=$(tail -n 1 coverage.txt | grep -o '[1-9]*%$')
covNum=${covPercentage::-1}
mv coverage.txt ./report
echo $covNum > ./report/cov_number
else
echo "No coverage report is generated"
fi
- name: Upload test coverage artifact
uses: actions/upload-artifact@v3
with:
name: report
path: report/