deepspeedai/DeepSpeed
27 workflows · maturity 50% · 5 patterns · GitHub ↗
Practices
✓ Matrix · ✓ Permissions · ○ Security scan · ○ AI review · ○ Cache · ✓ Concurrency · ○ Reusable workflows
Detected patterns
Security dimensions
Workflows (27)
amd-mi200 perms .github/workflows/amd-mi200.yml
View raw YAML
# DeepSpeed unit tests on a self-hosted AMD MI200 runner (ROCm 6.0 wheels).
# Runs nightly, on manual dispatch, and for pull requests that touch this
# workflow file or anything under requirements/. A failed nightly run opens
# (or updates) a GitHub issue.
name: amd-mi200

on:
  workflow_dispatch:
  pull_request:
    paths:
      - '.github/workflows/amd-mi200.yml'
      - 'requirements/**'
  schedule:
    - cron: "0 0 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  # Needed by the final step, which files an issue when the nightly run fails.
  issues: write

jobs:
  amd-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, amd, mi200]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 1cc453d33
          git rev-parse --short HEAD
          pip install .

      - name: Install (ROCm) apex
        run: |
          git clone https://github.com/ROCmSoftwarePlatform/apex.git
          # Enter the clone: the version check and the install below must act on
          # apex, not on the DeepSpeed checkout in the workspace root.
          cd apex
          CURRENT_VER=$(git rev-parse HEAD)
          # The marker file does not exist before the first install; treat that
          # as "nothing installed" instead of aborting the step.
          INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version 2>/dev/null || true)
          # Rebuild apex only when the cloned commit differs from the cached build.
          if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
            pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings="--global-option=--cpp_ext" --config-settings="--global-option=--cuda_ext" --target=/blob/amd-apex/ --upgrade .
            git rev-parse HEAD > /blob/amd-apex/.venv_installed_version
          fi
          # Expose the out-of-tree apex install to all later steps.
          echo PYTHONPATH=$PYTHONPATH:/blob/amd-apex/ >> $GITHUB_ENV

      # Runs a set of commands using the runners shell
      - name: Install deepspeed
        run: |
          pip install .[dev,1bit,autotuning]
          #python -c "from deepspeed.env_report import cli_main; cli_main()"
          ds_report

      - name: Python environment
        run: |
          pip list

      # Runs a set of commands using the runners shell
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          pytest $PYTEST_OPTS -n 4 --verbose unit/
          pytest $PYTEST_OPTS -m 'sequential' unit/

      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
aws-accelerate .github/workflows/aws-accelerate.yml
View raw YAML
################################################################################
# DeepSpeed CI - AWS L40S GPU Tests (HuggingFace Accelerate Integration)
#
# AWS self-hosted counterpart of modal-accelerate.yml: installs DeepSpeed next
# to a fresh clone of HuggingFace Accelerate and runs Accelerate's DeepSpeed
# test directory inside a CUDA 12.6 container.
#
# NOTE(review): this header used to state "4x NVIDIA L40S GPUs on g6e.12xlarge
# instances", but the job requests the `l40s-1gpu` runner label - confirm which
# one is intended.
################################################################################
name: aws-accelerate

on:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Cheap gate job: decides whether the GPU job below needs to run at all.
  check-paths:
    name: Check Paths
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.filter.outputs.run_tests }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          # paths-filter treats the patterns of a filter as alternatives by
          # default, so the bare '**' entry matched every changed file and the
          # '!...' exclusions never took effect. With 'every', a file has to
          # satisfy all patterns, i.e. it must lie outside the excluded trees.
          predicate-quantifier: 'every'
          filters: |
            run_tests:
              - '**'
              - '!docs/**'
              - '!blogs/**'
              - '!deepspeed/inference/v2/**'
              - '!tests/unit/inference/v2/**'

  accelerate-tests:
    name: Accelerate Integration Tests
    needs: check-paths
    if: needs.check-paths.outputs.should_run == 'true'
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-1gpu, aws]
    timeout-minutes: 60
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      options: --gpus all --shm-size "32G"
    env:
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"
    steps:
      # The CUDA base image ships without git or Python, so install them
      # before the checkout (which needs git and git-lfs).
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install datasets

      # Diagnostic output only; nothing here changes the environment.
      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"

      - name: Install DeepSpeed
        run: |
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .
          ds_report
          # Debug: Check captured torch_info values
          python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"

      - name: Clone and install Accelerate
        run: |
          git clone https://github.com/huggingface/accelerate
          pip install "./accelerate[testing]"

      - name: Run Accelerate DeepSpeed tests
        run: |
          pytest --verbose ./accelerate/tests/deepspeed
aws-torch-latest .github/workflows/aws-torch-latest.yml
View raw YAML
################################################################################
# DeepSpeed CI - AWS L40S GPU Tests (PyTorch Latest)
#
# AWS self-hosted counterpart of modal-torch-latest.yml: builds DeepSpeed
# against PyTorch 2.7.1 (cu126) in a CUDA 12.6 container and runs the unit
# tests under tests/unit/v1/.
# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances (runner label l40s-4gpu).
################################################################################
name: aws-torch-latest

on:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Cheap gate job: decides whether the GPU job below needs to run at all.
  check-paths:
    name: Check Paths
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.filter.outputs.run_tests }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          # paths-filter treats the patterns of a filter as alternatives by
          # default, so the bare '**' entry matched every changed file and the
          # '!...' exclusions never took effect. With 'every', a file has to
          # satisfy all patterns, i.e. it must lie outside the excluded trees.
          predicate-quantifier: 'every'
          filters: |
            run_tests:
              - '**'
              - '!docs/**'
              - '!blogs/**'
              - '!deepspeed/inference/v2/**'
              - '!tests/unit/inference/v2/**'

  unit-tests:
    name: Unit Tests (V1)
    needs: check-paths
    if: needs.check-paths.outputs.should_run == 'true'
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
    timeout-minutes: 60
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      options: --gpus all --shm-size "32G"
    env:
      # Forwarded to pytest below via --torch_ver / --cuda_ver.
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"
    steps:
      # The CUDA base image ships without git or Python, so install them
      # before the checkout (which needs git and git-lfs).
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install -r requirements/requirements-deepcompile.txt

      # Diagnostic output only; nothing here changes the environment.
      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"

      - name: Install DeepSpeed
        run: |
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .
          ds_report
          # Debug: Check captured torch_info values
          python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"

      - name: Run unit tests
        run: |
          pytest -n 4 --forked --verbose tests/unit/v1/ --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
aws-torch-latest-full .github/workflows/aws-torch-latest-full.yml
View raw YAML
################################################################################
# DeepSpeed CI - full unit-test run on AWS L40S runners (PyTorch Latest)
#
# Executes the DeepSpeed unit tests under tests/unit on an AWS self-hosted
# runner labelled l40s-4gpu (described upstream as 4x NVIDIA L40S on
# g6e.12xlarge), in two passes:
#   1. a parallel pass driven by pytest-xdist (-n 8)
#   2. a pass restricted to tests carrying the 'sequential' marker
#
# Scheduled runs are skipped when the current commit already has a successful
# run of this workflow; manual dispatches always run.
################################################################################
name: aws-torch-latest-full

on:
  schedule:
    - cron: "0 8 * * *"  # Daily at 08:00 UTC (midnight PST)
  workflow_dispatch:

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

jobs:
  check-changes:
    name: Check for new commits
    runs-on: ubuntu-latest
    # Evaluated for scheduled runs only; a manual dispatch skips this job
    if: ${{ github.event_name == 'schedule' }}
    outputs:
      has_changes: "${{ steps.check.outputs.has_changes }}"
    steps:
      # Compares the head SHA of the newest successful run of this workflow
      # with the SHA of the current run and publishes has_changes=true/false.
      - id: check
        name: Check for commits since last successful run
        env:
          GH_TOKEN: "${{ github.token }}"
        run: |
          default_branch="${{ github.event.repository.default_branch }}"
          # Get the HEAD SHA of the last successful run of this workflow
          last_sha=$(gh api \
            "repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
            --jq '.workflow_runs[0].head_sha // empty')
          current_sha="${{ github.sha }}"
          if [ -z "$last_sha" ]; then
            echo "No previous successful run found — running tests"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          elif [ "$last_sha" = "$current_sha" ]; then
            echo "No new commits since last successful run ($last_sha) — skipping"
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "New commits detected: $last_sha -> $current_sha — running tests"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          fi

  unit-tests:
    name: Unit Tests (Full)
    needs:
      - check-changes
    # always() keeps this job eligible even though check-changes is skipped on
    # manual dispatch; the rest admits dispatches and schedules with new commits
    if: |
      always() &&
      (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
    runs-on:
      - self-hosted
      - gpu-ci
      - gpu-l40s
      - l40s-4gpu
      - aws
    timeout-minutes: 180
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      # /mnt/aio is bind-mounted for the async I/O tests (O_DIRECT requires a
      # native filesystem, not overlayfs)
      options: '--gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio'
    env:
      TORCH_VER: '2.7'
      CUDA_VER: '12.6'
      CUTLASS_PATH: /opt/cutlass
      # reuse_dist_env is turned off to prevent pool worker cleanup hangs in
      # full test runs
      DS_DISABLE_REUSE_DIST_ENV: '1'
    steps:
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev pdsh python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      # Shallow clone of the v3.5.1 tag into the directory CUTLASS_PATH names.
      - name: Install CUTLASS
        run: |
          git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
          echo "CUTLASS installed at /opt/cutlass"
          ls -la /opt/cutlass/include/ | head -10

      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

      # transformers is installed from source at a pinned commit.
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git checkout 981c276
          pip install .

      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install -r requirements/requirements-deepcompile.txt
          pip install pytest-timeout pytest-instafail

      # Diagnostic output only; nothing here changes the environment.
      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"
          echo ""
          echo "=== CUTLASS ==="
          echo "CUTLASS_PATH: $CUTLASS_PATH"
          ls -la $CUTLASS_PATH/include/ | head -5

      - name: Install DeepSpeed
        run: |
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .[dev,1bit,autotuning,deepcompile]
          ds_report

      - name: Python environment
        run: |
          pip list

      # Pass 1: whole unit/ tree across 8 xdist workers, scratch space on /mnt/aio.
      - name: Unit tests (parallel)
        run: |
          export TORCH_CUDA_ARCH_LIST="8.9"
          cd tests
          # Skip tests requiring unavailable hardware or known issues:
          # - nvme checkpointing: no nvme device
          # - GDS tests: no GPUDirect Storage support
          # - launcher user_args: pdsh requires SSH server
          # - zenflow: Stage 3 tests have pre-existing bugs + CUDA/fork issues
          rm -rf /mnt/aio/pytest
          pytest --instafail --timeout 600 --forked -n 8 --basetemp=/mnt/aio/pytest unit/ \
            --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
            --ignore=unit/ops/aio/test_gds.py \
            --ignore=unit/launcher/test_user_args.py \
            --ignore=unit/runtime/zenflow \
            --ignore=unit/ops/adam/test_zf_torch_adam.py \
            --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}

      # Pass 2: only tests marked 'sequential'; this pass additionally ignores
      # the DS4Sci EvoformerAttention test file.
      - name: Unit tests (sequential)
        run: |
          export TORCH_CUDA_ARCH_LIST="8.9"
          cd tests
          rm -rf /mnt/aio/pytest
          pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
            --ignore=unit/runtime/zero/test_nvme_checkpointing.py \
            --ignore=unit/ops/aio/test_gds.py \
            --ignore=unit/launcher/test_user_args.py \
            --ignore=unit/runtime/zenflow \
            --ignore=unit/ops/adam/test_zf_torch_adam.py \
            --ignore=unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py \
            --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
cpu-torch-latest .github/workflows/cpu-torch-latest.yml
View raw YAML
# CPU-only DeepSpeed unit tests against the PyTorch 2.7.1 CPU wheels on a
# GitHub-hosted Ubuntu 24.04 runner. Triggered manually, nightly, from the
# merge queue, and by pull requests that change anything outside the ignored
# documentation and inference-v2 trees.
name: cpu-torch-latest

on:
  workflow_dispatch:
  pull_request:
    paths-ignore:
      - "docs/**"
      - "blogs/**"
      - "deepspeed/inference/v2/**"
      - "tests/unit/inference/v2/**"
  merge_group:
    branches:
      - master
  schedule:
    - cron: '0 0 * * *'

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - uses: ./.github/workflows/setup-venv
        id: setup-venv

      - name: Install system packages
        run: |
          sudo apt-get install -y numactl pdsh

      - name: Install pytorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cpu
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installed from the current upstream HEAD; the commented checkout is the
      # escape hatch for pinning a known-good commit.
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 981c276
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      # Two passes over unit/: 4 xdist workers first, then the tests marked
      # 'sequential'. Both use a throw-away HuggingFace cache in /tmp.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.7.1+cpu"
          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.7.1+cpu"
formatting .github/workflows/formatting.yml
View raw YAML
# Runs the repository's pre-commit hooks over every file on a GitHub-hosted
# Ubuntu 22.04 runner. Triggered manually, nightly, from the merge queue, and
# by pull requests against any branch.
name: Formatting

on:
  workflow_dispatch:
  pull_request:
    branches: "**"
  merge_group:
    branches:
      - master
  schedule:
    - cron: '0 0 * * *'

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

jobs:
  # Formatting checks only, on a CPU-only machine. The job id is `unit-tests`
  # even though no unit tests run here.
  unit-tests:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4

      # Log which interpreter the runner provides.
      - name: environment
        run: |
          which python
          python --version

      # Only the two formatting tools are installed, at the versions listed in
      # requirements-dev.txt.
      - name: Install dependencies
        run: |
          # Previously we would do pip install .[dev] but this is causing out of
          # space errors start with torch 2.1.0 release
          grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install

      - name: Formatting checks
        run: |
          pip show pre-commit clang-format
          pre-commit run --all-files
hpu-gaudi2 perms .github/workflows/hpu-gaudi2.yml
View raw YAML
# Selected DeepSpeed unit tests on a self-hosted Intel Gaudi2 runner, inside
# the Habana PyTorch 2.6.0 container. Runs nightly, on manual dispatch, and for
# pull requests touching the HPU accelerator/op-builder code or the core
# runtime files listed below.
name: hpu-gaudi2

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'
  pull_request:
    paths:
      - '.github/workflows/hpu-gaudi2.yml'
      - 'accelerator/hpu_accelerator.py'
      - 'op_builder/hpu/**'
      - 'deepspeed/runtime/engine.py'
      - 'deepspeed/runtime/bf16_optimizer.py'
      - 'deepspeed/runtime/zero/stage_1_and_2.py'
      - 'deepspeed/runtime/zero/stage3.py'
      - 'deepspeed/runtime/zero/partition_parameters.py'
      - 'deepspeed/runtime/zero/partitioned_param_coordinator.py'
      - 'deepspeed/runtime/zero/parameter_offload.py'
      - 'deepspeed/runtime/pipe/engine.py'
      - 'deepspeed/runtime/utils.py'
      - 'deepspeed/inference/engine.py'
      - 'deepspeed/module_inject/auto_tp.py'
      - 'deepspeed/module_inject/replace_module.py'
      - 'deepspeed/module_inject/load_checkpoint.py'
      - 'deepspeed/module_inject/inject.py'
      - 'deepspeed/ops/transformer/**'
      - 'deepspeed/ops/adam/**'

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    # Runner selection: self-hosted Intel Gaudi2 machine
    runs-on:
      - self-hosted
      - intel
      - gaudi2
    container:
      image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      ports: [80]
      options: '--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice'
    env:
      PT_HPU_LAZY_MODE: '0'
      TORCHINDUCTOR_COMPILE_THREADS: '1'
      # One pytest `-k` term per line; the "Unit tests" step joins the lines
      # with " or " to build the final selection expression.
      # NOTE(review): the test_zero.py entry names
      # TestZero3ParamPartitioningLargeParam on both sides of its `or` -
      # confirm whether a different class was meant for one of them.
      TEST_LIST: |
        test_accelerator.py
        test_autotuning.py
        test_compression.py
        test_dist.py
        test_elastic.py
        test_ds_arguments.py
        test_run.py
        test_multinode_runner.py
        test_moe_tp.py
        test_monitor.py
        (test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed))
        (test_latest_checkpoint.py and test_missing_latest)
        test_reshape_checkpoint.py
        test_shared_weights.py
        test_sparse.py
        test_tag_validation.py
        test_pipe_module.py
        (test_flops_profiler.py and test_flops_profiler_in_inference)
        test_get_optim_files.py
        test_groups.py
        test_partition_balanced.py
        (test_adamw.py and TestAdamConfigs)
        test_coalesced_collectives.py
        test_activation_checkpointing_non_reentrant.py
        test_activation_checkpointing.py
        test_data.py
        (test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig))
        test_ds_config_model.py
        test_mup_optimizers.py
        (test_pld.py and test_pld_schedule)
        test_runtime_utils.py
        test_pipe_schedule.py
        test_topology.py
        (test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler))
        test_csr.py
        (test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer))
        (test_bf16.py and TestZeroDtypeCocktail)
        test_partition.py
        test_ignore_unused_parameters.py
        test_zero_config.py
        test_zero_context_ancestry.py
        (test_zero_context.py and not TestSerialContext)
        test_zero_dynamic_class.py
        test_zero_nesting_init.py
        test_zeropp.py
        (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam))
        (test_linear.py and (TestLoRALinear or TestBasicLinear))
        (test_ctx.py and TestEngine)

    # Ordered list of tasks executed by this job
    steps:
      # Fetch the repository into $GITHUB_WORKSPACE
      - uses: actions/checkout@v4

      # Diagnostic output: libc version, visible HPU devices, bundled torch.
      - name: Check container state
        run: |
          ldd --version
          hl-smi -L
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 981c276
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      # Collapses TEST_LIST into a single "a or b or ..." expression and hands
      # it to pytest -k.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"
hpu-gaudi2-nightly perms .github/workflows/hpu-gaudi2-nightly.yml
View raw YAML
# Nightly DeepSpeed unit-test selection for a self-hosted Intel Gaudi2 runner,
# inside the Habana PyTorch 2.6.0 container. Also runs on manual dispatch and
# for pull requests that modify this workflow file.
name: hpu-gaudi2-nightly

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'
  pull_request:
    paths:
      - '.github/workflows/hpu-gaudi2-nightly.yml'

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    # Runner selection: self-hosted Intel Gaudi2 machine
    runs-on:
      - self-hosted
      - intel
      - gaudi2
    container:
      image: vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      ports: [80]
      options: '--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice'
    env:
      PT_HPU_LAZY_MODE: '0'
      TORCHINDUCTOR_COMPILE_THREADS: '1'
      # One pytest `-k` term per line; the "Unit tests" step joins the lines
      # with " or " to build the final selection expression.
      TEST_LIST: |
        test_adamw.py
        test_bf16.py
        test_ds_config_dict.py
        test_dynamic_loss_scale.py
        test_latest_checkpoint.py
        test_moe_checkpoint.py
        test_multi_output_model.py
        test_other_optimizer.py
        test_pipe.py
        test_pipeline.py
        test_universal_checkpoint.py
        test_zero_context_return.py
        test_zero_leaf_module.py
        test_zero_offloadpp.py
        test_zero_tiled.py
        test_autotp_training.py
        test_ulysses.py

    # Ordered list of tasks executed by this job
    steps:
      # Fetch the repository into $GITHUB_WORKSPACE
      - uses: actions/checkout@v4

      # Diagnostic output: libc version, visible HPU devices, bundled torch.
      - name: Check container state
        run: |
          ldd --version
          hl-smi -L
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installed from whatever upstream HEAD is at run time (no pin).
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      # Collapses TEST_LIST into a single "a or b or ..." expression and hands
      # it to pytest -k.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"
modal-accelerate .github/workflows/modal-accelerate.yml
View raw YAML
name: modal-accelerate

# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see ci/accelerate.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#
# NOTE(review): with `pull_request_target`, actions/checkout without an explicit `ref` is expected
# to check out the base branch rather than the PR head - confirm the tests exercise the intended code.
#

on:
  workflow_dispatch:

  push:
    branches:
      - master

  # you have to switch to `pull_request` if you need to change the CI job's python script,
  # otherwise GH will use a master version of the CI files, ignoring the modifications in the PR -
  # the other way is to use modal cli to test this job from one's host - it'd require setting up
  # modal secrets
  # pull_request:
  pull_request_target:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    types: [review_requested, ready_for_review, synchronize]
    branches:
      - master

concurrency:
  # For pull_request_target events github.ref is the PR's *base* branch, so a
  # group keyed on it alone is shared by every PR and by pushes to master, and
  # cancel-in-progress lets unrelated runs cancel each other. Key on the PR
  # number when the event has one; other events keep using github.ref.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Decides whether the changed files warrant running the modal job at all.
  collect-tests:
    name: Collect tests to run
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      deepspeed: ${{ steps.filter.outputs.deepspeed }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Filter changed files
        uses: dorny/paths-filter@v2
        id: filter
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          filters: |
            deepspeed:
              - 'deepspeed/**'
              - '.github/workflows/modal*.yml'
              - 'ci/**'
              - 'tests/unit/**'
              - 'csrc/**'

  deploy:
    name: DeepSpeedAI CI
    runs-on: ubuntu-latest
    needs: collect-tests
    env:
      # these are created at https://modal.com/settings/deepspeedai/tokens
      # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      # this one comes from https://huggingface.co/settings/profile of the bot user
      # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    # Skipped (and therefore passing as a Required check) when no relevant files changed.
    if: needs.collect-tests.outputs.deepspeed == 'true'
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies

      - name: Install build dependencies
        run: |
          pip install uv # much faster than pip
          uv pip install --system modal

      - name: Run tests
        run: |
          modal run -m ci.accelerate
modal-torch-latest .github/workflows/modal-torch-latest.yml
View raw YAML
name: modal-torch-latest

# This CI is running on modal.com's GPUs.
#
# It's set up here on github actions and then the cloned repo is sent to modal and everything
# happens on their hw - see ci/torch_latest.py for where the actual vm is loaded, updated and the tests are
# run.
#
# Both files are annotated to what's important and how one might change or update things if needed.
#
# Note that since this is a Required job we can't use `on.push.path` file filter - we are using
# collect-tests job to do the filtering for us so that the job can be skipped and satisfy the
# Required status for PRs to pass.
#
# NOTE(review): with `pull_request_target`, actions/checkout without an explicit `ref` is expected
# to check out the base branch rather than the PR head - confirm the tests exercise the intended code.
#

on:
  workflow_dispatch:

  push:
    branches:
      - master

  pull_request_target:
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
    types: [review_requested, ready_for_review, synchronize]
    branches:
      - master

concurrency:
  # For pull_request_target events github.ref is the PR's *base* branch, so a
  # group keyed on it alone is shared by every PR and by pushes to master, and
  # cancel-in-progress lets unrelated runs cancel each other. Key on the PR
  # number when the event has one; other events keep using github.ref.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Decides whether the changed files warrant running the modal job at all.
  collect-tests:
    name: Collect tests to run
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      deepspeed: ${{ steps.filter.outputs.deepspeed }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Filter changed files
        uses: dorny/paths-filter@v2
        id: filter
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          filters: |
            deepspeed:
              - 'deepspeed/**'
              - '.github/workflows/modal*.yml'
              - 'ci/**'
              - 'tests/unit/**'
              - 'csrc/**'

  deploy:
    name: DeepSpeedAI CI
    runs-on: ubuntu-latest
    needs: collect-tests
    env:
      # these are created at https://modal.com/settings/deepspeedai/tokens
      # they are then added to the repo's secrets at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      # this one comes from https://huggingface.co/settings/profile of the bot user
      # and it too is then updated at https://github.com/deepspeedai/deepspeed/settings/secrets/actions
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    # Skipped (and therefore passing as a Required check) when no relevant files changed.
    if: needs.collect-tests.outputs.deepspeed == 'true'
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies

      - name: Install build dependencies
        run: |
          pip install uv # much faster than pip
          uv pip install --system modal

      - name: Run tests
        run: |
          modal run -m ci.torch_latest
no-torch perms .github/workflows/no-torch.yml
View raw YAML
# Verifies that a DeepSpeed source distribution can be built in an environment
# where torch has been uninstalled. Runs nightly, on manual dispatch, and for
# pull requests touching accelerator/, op_builder/ or this workflow file.
# A failed nightly run opens (or updates) a GitHub issue.
name: no-torch

on:
  workflow_dispatch:
  pull_request:
    paths:
      - "accelerator/**"
      - ".github/workflows/no-torch.yml"
      - "op_builder/**"
  schedule:
    - cron: '0 0 * * *'

concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

permissions:
  contents: read
  # Required by the issue-filing step at the end of the job.
  issues: write

jobs:
  unit-tests:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - uses: ./.github/workflows/setup-venv
        id: setup-venv

      # Remove torch, then install only the packaging tools the build needs.
      - name: Python environment
        run: |
          pip uninstall torch --yes
          pip install setuptools
          pip install build
          pip list

      - name: Build deepspeed
        run: |
          DS_BUILD_STRING=" " python -m build --sdist

      # Only scheduled runs file an issue; PR and manual failures do not.
      - name: Open GitHub issue if nightly CI fails
        if: failure() && (github.event_name == 'schedule')
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
nv-a6000 .github/workflows/nv-a6000.yml
View raw YAML
# Inference-v2 unit tests plus the DeepSpeed-MII test suite on a self-hosted
# NVIDIA A6000 runner, inside the NGC PyTorch 25.01 container. Runs for pull
# requests touching the CUDA accelerator or inference-v2 code, and on manual
# dispatch (which may select the DeepSpeed-MII branch to test against).
name: nv-a6000

on:
  pull_request:
    paths:
      - 'accelerator/cuda_accelerator.py'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
      - '.github/workflows/nv-a6000.yml'
  workflow_dispatch:
    inputs:
      mii_branch:
        description: 'DeepSpeed-MII Branch'
        required: false
        default: 'main'
        type: string

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:25.01-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v4

      # Diagnostic output only.
      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if you need to use an older transformers version temporarily in case of breakage
          # git checkout 981c276
          git rev-parse --short HEAD
          python -m pip install .

      - name: Install deepspeed
        run: |
          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
          python -m pip install .[dev,1bit,autotuning,inf]
          ds_report

      - name: Python environment
        run: |
          python -m pip list

      # Two marker-selected passes over unit/: inference_v2, then inference_v2_ops.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"

      - name: MII unit tests
        env:
          # The dispatch input is handed over through the environment rather
          # than expanded into the script text, so its value is only ever
          # treated as data by the shell. Empty for pull_request events.
          MII_BRANCH: ${{ github.event.inputs.mii_branch }}
        run: |
          BRANCH="main"
          if [[ -n "$MII_BRANCH" ]]; then
              BRANCH="$MII_BRANCH"
          fi
          echo "Cloning DeepSpeed-MII branch: $BRANCH"
          git clone -b "$BRANCH" --depth=1 https://github.com/deepspeedai/DeepSpeed-MII.git
          cd DeepSpeed-MII
          pip install .[dev]
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF ./
nv-accelerate-v100 .github/workflows/nv-accelerate-v100.yml
View raw YAML
# HuggingFace Accelerate's DeepSpeed tests on a self-hosted NVIDIA V100 runner
# (cu124 wheels).
name: nv-accelerate-v100

# A workflow must declare at least one trigger under `on`; none was present.
# Only the manual trigger is declared so that nothing starts running
# automatically.
# NOTE(review): add pull_request / schedule triggers here if automatic runs on
# the V100 runners are wanted.
on:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu124, v100]

    steps:
      - uses: actions/checkout@v4

      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      # Clones Accelerate at its current HEAD, installs its test extras, and
      # runs its tests/deepspeed directory.
      - name: HF Accelerate tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          git clone https://github.com/huggingface/accelerate
          cd accelerate
          git rev-parse --short HEAD
          # temp workaround until this is resolved https://github.com/huggingface/accelerate/issues/3676
          pip install datasets==3.6.0
          # installing dependencies
          pip install .[testing]
          # force protobuf version due to issues
          pip install "protobuf<4.21.0"
          pip list
          pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed
nv-ds-chat perms .github/workflows/nv-ds-chat.yml
View raw YAML
# Runs the DeepSpeed-Chat unit tests from DeepSpeedExamples against this checkout.
name: nv-ds-chat

on:
  workflow_dispatch:
    inputs:
      dse_branch:
        description: 'DeepSpeedExamples Branch'
        required: false
        default: 'master'
        type: string
  pull_request:
    paths:
      - ".github/workflows/nv-ds-chat.yml"
      - "deepspeed/runtime/zero/stage_1_and_2.py"
      - "deepspeed/runtime/zero/stage3.py"
      - "deepspeed/runtime/hybrid_engine.py"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# `issues: write` is needed by the issue-creation step at the end of the job.
permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu124, v100]
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          pip install .[dev]
          pip install transformers==4.48.3
          ds_report

      - name: Install deepspeed-chat
        # The dispatch input is passed through an environment variable rather
        # than being template-expanded into the script, so its contents are
        # treated as data by the shell and cannot inject commands.
        env:
          DSE_BRANCH: ${{ github.event.inputs.dse_branch }}
        run: |
          BRANCH="master"
          # The input is empty on pull_request events; keep the default then.
          if [[ -n "$DSE_BRANCH" ]]; then
            BRANCH="$DSE_BRANCH"
          fi
          echo "DeepSpeedExamples Branch: $BRANCH"
          git clone -b "$BRANCH" https://github.com/deepspeedai/DeepSpeedExamples.git
          cd DeepSpeedExamples/applications/DeepSpeed-Chat
          pip install -r requirements.txt
          pip install -e .

      - name: Python environment
        run: |
          pip list

      - name: DS-Chat unit tests
        run: |
          cd DeepSpeedExamples/applications/DeepSpeed-Chat
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          unset NCCL_DEBUG
          cd tests
          pytest $PYTEST_OPTS ./

      # NOTE(review): this step only runs for `schedule` events, but no
      # `schedule` trigger is declared under `on:` above, so as written it can
      # never fire. Confirm whether a cron trigger was intended.
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
nv-flash-attn .github/workflows/nv-flash-attn.yml
View raw YAML
# Runs the Ulysses sequence-parallelism tests with FlashAttention installed.
name: nv-flash-attn

on:
  workflow_dispatch:
  pull_request:
    paths:
      - 'deepspeed/sequence/**'
      - 'tests/unit/sequence_parallelism/**'
      - '.github/workflows/nv-flash-attn.yml'
  schedule:
    - cron: "0 0 * * *"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Declared explicitly, matching the other workflows that open an issue on
# scheduled failures: the final step calls create-an-issue with GITHUB_TOKEN,
# which needs `issues: write` regardless of the repository's default token
# permissions.
permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:24.12-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"
    steps:
      - uses: actions/checkout@v4

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          python -m pip install .[dev]
          ds_report

      # install transformers after deepspeed so that the right version of transformers is installed
      - name: Install transformers
        run: |
          python -m pip install transformers==4.50.0

      - name: Install FlashAttention
        run: |
          python -m pip install flash-attn

      - name: Python environment
        run: |
          python -m pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"

      # Only scheduled runs open (or update) a CI-failure issue.
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
nv-inference .github/workflows/nv-inference.yml
View raw YAML
# Runs the (non-v2) inference unit tests on a V100 runner.
name: nv-inference

on:
  workflow_dispatch:
  pull_request:
    paths:
      - '.github/workflows/nv-inference.yml'
      - 'requirements/**'
      - 'deepspeed/__init__.py'
      - 'deepspeed/inference/**'
      - '!deepspeed/inference/v2/**' # exclude v2 dir
      - 'tests/unit/inference/**'
      - '!tests/unit/inference/v2/**' # exclude v2 tests dir
  merge_group:
    branches:
      - master
  schedule:
    - cron: "0 0 * * *"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on:
      - self-hosted
      - nvidia
      - cu124
      - v100
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      # NOTE(review): torch 2.1.2 / torchvision 0.16.2 are pinned against the
      # cu124 wheel index; confirm these versions are actually published there.
      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installs transformers from the default-branch HEAD (the pin is commented out).
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          #git checkout f370bebdc
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          DS_ACCELERATOR=cpu pip install .[dev,1bit,autotuning,inf]
          #pip install .[dev,1bit,autotuning,inf,triton]
          ds_report

      - name: Python environment
        run: pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.4"
          pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.4"
          pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.4"
          # run ds_report again to check updated op list
          ds_report
nv-mii .github/workflows/nv-mii.yml
View raw YAML
# Runs the DeepSpeed-MII legacy test suite against this checkout.
# NOTE(review): no `on:` trigger block is visible in this listing; GitHub
# Actions requires one, and the `mii_branch` dispatch input read below would
# be declared there. Confirm the triggers against the original file.
name: nv-mii

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu124, v100]
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          pip install .[dev]
          ds_report

      # install transformers after deepspeed so that the right version of transformers is installed
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          git checkout v4.42.4
          git rev-parse --short HEAD
          pip install .

      - name: Python environment
        run: |
          pip list

      - name: MII unit tests
        # The dispatch input is passed through an environment variable rather
        # than being template-expanded into the script, so its contents are
        # treated as data by the shell and cannot inject commands.
        env:
          MII_BRANCH: ${{ github.event.inputs.mii_branch }}
        run: |
          BRANCH="main"
          # Keep the default branch when no input was supplied.
          if [[ -n "$MII_BRANCH" ]]; then
            BRANCH="$MII_BRANCH"
          fi
          echo "Cloning DeepSpeed-MII branch: $BRANCH"
          git clone -b "$BRANCH" --depth=1 https://github.com/deepspeedai/DeepSpeed-MII.git
          cd DeepSpeed-MII
          pip install .[dev]
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests/legacy
          pytest $PYTEST_OPTS --forked -m "deepspeed" ./
nv-nightly perms .github/workflows/nv-nightly.yml
View raw YAML
# Runs the unit tests marked `nightly` on a V100 runner.
# NOTE(review): no `on:` trigger block is visible in this listing; GitHub
# Actions requires one, so confirm the triggers against the original file.
name: nv-nightly

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# `issues: write` is used by the issue-creation step at the end of the job.
permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on:
      - self-hosted
      - nvidia
      - cu124
      - v100
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          git checkout v4.42.4
          git rev-parse --short HEAD
          pip install .

      - name: Install datasets
        run: pip install datasets

      - name: Install deepspeed
        run: |
          pip install .[dev,1bit,autotuning,inf]
          ds_report

      - name: Python environment
        run: pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.6" --cuda_ver="12.4"

      # Only scheduled runs open (or update) a CI-failure issue.
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
nv-pre-compile-ops .github/workflows/nv-pre-compile-ops.yml
View raw YAML
# Pre-compiles the DeepSpeed ops inside a CUDA devel container on a hosted runner.
name: nv-pre-compile-ops

on:
  workflow_dispatch:
  pull_request:
    branches: '**'
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
  merge_group:
    branches:
      - master
  schedule:
    - cron: "0 0 * * *"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: ubuntu-24.04
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
    steps:
      # Runs before checkout: git is installed here, along with Python and
      # build tools, and `python` is symlinked to python3.
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git python3 python3-pip libaio-dev ninja-build
          ln -sf /usr/bin/python3 /usr/bin/python

      - uses: actions/checkout@v4

      - name: Install PyTorch
        run: pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu126

      - name: environment
        run: |
          which python
          python --version
          python -c "import torch; print('torch:', torch.__version__, torch)"
          #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Builds ops ahead of time (DS_BUILD_OPS=1) for the listed CUDA
      # architectures, with several op families explicitly switched off.
      - name: Compile DeepSpeed Ops
        run: |
          DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .

      - name: DS Report
        run: DS_ACCELERATOR=cuda ds_report
nv-sd perms .github/workflows/nv-sd.yml
View raw YAML
# Runs the Stable Diffusion inference tests in an NVIDIA PyTorch container.
name: nv-sd

on:
  workflow_dispatch:
  pull_request:
    paths:
      - "deepspeed/ops/transformer/inference/diffusers_**"
      - "tests/unit/inference/test_stable_diffusion.py"
      - "deepspeed/model_implementations/diffusers/unet.py"
      - "deepspeed/model_implementations/diffusers/vae.py"
      - "deepspeed/module_inject/containers/vae.py"
      - "deepspeed/module_inject/containers/unet.py"
      - ".github/workflows/nv-sd.yml"
      - "requirements/requirements-sd.txt"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# `issues: write` is used by the issue-creation step at the end of the job.
permissions:
  contents: read
  issues: write

jobs:
  sd-tests:
    runs-on:
      - self-hosted
      - nvidia
      - a6000
    container:
      image: nvcr.io/nvidia/pytorch:24.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"
    steps:
      - uses: actions/checkout@v4

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installs transformers from the default-branch HEAD (no pin).
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .

      - name: Install deepspeed
        run: |
          pip install image-similarity-measures
          python -m pip install opencv-python==4.6.* --force-reinstall
          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
          python -m pip install .[dev,1bit,autotuning,sd]
          ds_report

      - name: Python environment
        run: python -m pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.3" --cuda_ver="12"

      # NOTE(review): this step only runs for `schedule` events, but no
      # `schedule` trigger is declared under `on:` above, so as written it can
      # never fire. Confirm whether a cron trigger was intended.
      - name: Open GitHub issue if weekly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
nv-torch-latest-v100 .github/workflows/nv-torch-latest-v100.yml
View raw YAML
# Runs the full unit-test suite against the latest released torch on a V100 runner.
# NOTE(review): no `on:` trigger block is visible in this listing; GitHub
# Actions requires one, so confirm the triggers against the original file.
name: nv-torch-latest-v100

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on:
      - self-hosted
      - nvidia
      - cu124
      - v100
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          git checkout 981c276
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,1bit,autotuning,deepcompile]
          pip install pytest-timeout pytest-instafail
          ds_report

      - name: Python environment
        run: pip list

      # Two passes: a parallel run (-n 8) that stops at the first failure (-x),
      # then the tests marked `sequential`.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          pytest -x $PYTEST_OPTS --instafail --timeout 600 --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4"
          pytest $PYTEST_OPTS --instafail --timeout 600 --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4"
nv-torch-nightly-v100 perms .github/workflows/nv-torch-nightly-v100.yml
View raw YAML
# Runs the unit-test suite against the torch nightly build on a V100 runner.
# NOTE(review): no `on:` trigger block is visible in this listing; GitHub
# Actions requires one, so confirm the triggers against the original file.
name: nv-torch-nightly-v100

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# `issues: write` is used by the issue-creation step at the end of the job.
permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on:
      - self-hosted
      - nvidia
      - cu124
      - v100
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      # `--pre` with the nightly index pulls pre-release torch builds.
      - name: Install pytorch
        run: |
          pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installs transformers from the default-branch HEAD (the pin is commented out).
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          # git checkout 981c276
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: pip list

      # Two passes: a parallel run (-n 8), then the tests marked `sequential`.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          pytest $PYTEST_OPTS --forked -n 8 unit/
          pytest $PYTEST_OPTS --forked -m 'sequential' unit/

      # Only scheduled runs open (or update) a CI-failure issue.
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
nv-transformers-v100 .github/workflows/nv-transformers-v100.yml
View raw YAML
# Runs the Hugging Face transformers DeepSpeed integration tests on a V100 runner.
# NOTE(review): no `on:` trigger block is visible in this listing; GitHub
# Actions requires one, so confirm the triggers against the original file.
name: nv-transformers-v100

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on:
      - self-hosted
      - nvidia
      - cu124
      - v100
    steps:
      - uses: actions/checkout@v4

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      # NOTE(review): `torch==2.0.1+cu124` is pinned against the cu124 wheel
      # index; confirm that this exact build is actually published there.
      - name: Install pytorch
        run: |
          # use the same pytorch version as transformers CI
          pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu124 --index-url https://download.pytorch.org/whl/cu124
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          # if needed switch to the last known good SHA until transformers@master is fixed
          git checkout e7e9261a2
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: pip list

      # Reuses the `transformers` clone created by the earlier install step.
      - name: HF transformers tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd transformers
          pip install .[testing]
          # find reqs used in ds integration tests
          find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
          # force protobuf version due to issues
          pip install "protobuf<4.21.0"
          pip list
          WANDB_DISABLED=true RUN_SLOW=1 pytest $PYTEST_OPTS tests/deepspeed
python matrix .github/workflows/python.yml
View raw YAML
# Installs DeepSpeed with CPU-only torch across several Python versions and
# runs ds_report as a smoke check.
name: python

on:
  workflow_dispatch:
  pull_request:
    branches: '**'
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
  merge_group:
    branches:
      - master
  schedule:
    - cron: "0 0 * * *"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    strategy:
      matrix:
        # Quoted so that versions such as 3.10 stay strings.
        pyVersion:
          - "3.10"
          - "3.11"
          - "3.12"
      # Let every Python version finish even if one of them fails.
      fail-fast: false
    runs-on: ubuntu-24.04
    container:
      image: python:${{ matrix.pyVersion }}-slim
    steps:
      - uses: actions/checkout@v4

      - name: Install build dependencies
        run: apt-get update && apt-get install -y build-essential ninja-build

      - name: environment
        run: |
          which python
          python --version

      - name: Install PyTorch (CPU)
        run: pip install torch --index-url https://download.pytorch.org/whl/cpu

      - name: Install deepspeed
        run: pip install .

      - name: DS Report
        run: ds_report
release .github/workflows/release.yml
View raw YAML
# Builds the sdist for a tagged release, publishes it to PyPI, then opens a PR
# that bumps version.txt.
name: Build and publish DeepSpeed release

on:
  push:
    tags:
      - 'v*.*.*'

jobs:
  deploy:
    runs-on: ubuntu-24.04
    environment: release-env
    steps:
      # Checks out `master`, not the pushed tag.
      - uses: actions/checkout@v4
        with:
          ref: "master"

      # Repository-local composite action.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      # Strips the leading `refs/<kind>/v` from the ref and exports the rest
      # as RELEASE_VERSION for the following steps.
      - name: Get release version from tag
        run: |
          echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> "$GITHUB_ENV"

      # RELEASE_VERSION comes from the tag name, so the scripts below read it
      # as a quoted shell variable instead of template-expanding
      # `${{ env.RELEASE_VERSION }}` into the script text; the value is then
      # always passed as a single argument and never parsed as shell code.
      - name: Check release version
        run: |
          pip install packaging
          python release/check_release_version.py --release_version "$RELEASE_VERSION"

      - name: Build DeepSpeed
        run: |
          pip install setuptools
          pip install build
          DS_BUILD_STRING=" " python -m build --sdist

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          repository-url: https://upload.pypi.org/legacy/

      - name: Bump version
        run: |
          python release/bump_patch_version.py --current_version "$RELEASE_VERSION"

      # Opens the follow-up PR containing only version.txt, authenticated with
      # the GH_PAT secret.
      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GH_PAT }}
          add-paths: |
            version.txt
          body: |
            **Auto-generated PR to update version.txt after a DeepSpeed release**
            Released version - ${{ env.RELEASE_VERSION }}
            Author - @${{ github.actor }}
          branch: AutoPR/${{ env.RELEASE_VERSION }}
          assignees: ${{ github.actor }}
          title: "Update version.txt after ${{ env.RELEASE_VERSION }} release"
          author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com>
xpu-compile perms .github/workflows/xpu-compile.yml
View raw YAML
# Runs the torch.compile tests on an Intel XPU runner and writes the
# `dynamo_output` lines from each run to the job summary.
name: xpu-compile

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - ".github/workflows/xpu-compile.yml"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  compile-tests:
    runs-on:
      - self-hosted
      - intel
      - xpu
    container:
      image: intel/oneapi-basekit:2025.0.2-0-devel-ubuntu22.04
      ports:
        - 80
      options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
    steps:
      - uses: actions/checkout@v4

      - name: Install prerequisite
        run: |
          apt-get update
          apt-get install clinfo libaio-dev python3-pip -y
          pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
          pip install py-cpuinfo numpy
          pip install .[dev,autotuning]

      - name: Check container state
        run: |
          ldd --version
          ds_report
          python3 -c "import torch; print('torch:', torch.__version__, torch)"
          python3 -c "import torch; print('XPU available:', torch.xpu.is_available())"
          python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
          pip list

      # Runs test_compile.py once per ZeRO config (stage 3, then stage 2) and
      # appends the filtered log lines to $GITHUB_STEP_SUMMARY.
      - name: Compile Status
        shell: bash
        run: |
          echo "# torch.compile graph breaks" >> $GITHUB_STEP_SUMMARY
          export FI_HMEM=system
          ulimit -n 1048575
          cd tests/torch_compile
          export ZE_AFFINITY_MASK=0,1
          echo "## ZeRO stage 3" >> $GITHUB_STEP_SUMMARY
          deepspeed test_compile.py --deepspeed_config ds_config_z3.json 2>&1 | tee log_z3.txt
          # for each line start with 'dynamo_output', extract the second field and following fields and append to GITHUB_STEP_SUMMARY using awk
          cat log_z3.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY
          echo "## ZeRO stage 2" >> $GITHUB_STEP_SUMMARY
          deepspeed test_compile.py --deepspeed_config ds_config_z2.json 2>&1 | tee log_z2.txt
          cat log_z2.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY
xpu-max1100 perms .github/workflows/xpu-max1100.yml
View raw YAML
name: xpu-max1100
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- ".github/workflows/xpu-max1100.yml"
- "accelerator/xpu_accelerator.py"
- "accelerator/abstract_accelerator.py"
- "accelerator/cpu_accelerator.py"
- "accelerator/real_accelerator.py"
- "csrc/xpu/**"
- "deepspeed/runtime/engine.py"
- "deepspeed/runtime/bf16_optimizer.py"
- "deepspeed/runtime/zero/stage_1_and_2.py"
- "deepspeed/runtime/zero/stage3.py"
- "deepspeed/runtime/zero/partition_parameters.py"
- "deepspeed/runtime/zero/partitioned_param_coordinator.py"
- "deepspeed/runtime/zero/parameter_offload.py"
- "deepspeed/runtime/pipe/engine.py"
- "deepspeed/runtime/utils.py"
- "op_builder/xpu/**"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
issues: write
jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
image: intel/oneapi-basekit:2025.0.2-0-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
steps:
- uses: actions/checkout@v4
- name: Install prerequisite
shell: bash
run: |
apt-get update
apt-get install -y python3.11 python3.11-dev python3-pip clinfo libaio-dev
pip install --upgrade pip
pip install py-cpuinfo
pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
pip install .[dev,autotuning]
- name: Check container state
shell: bash
run: |
ldd --version
ds_report
python3 -c "import torch; print('torch:', torch.__version__, torch)"
python3 -c "import torch; print('XPU available:', torch.xpu.is_available())"
python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list
- name: Unit tests
shell: bash
run: |
cd tests/unit
export FI_PROVIDER="tcp"
export I_MPI_SHM=off
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose model_parallelism/*
pytest --verbose monitor/*
pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
pytest --verbose runtime/zero/test_zero_tiled.py
pytest --verbose runtime/zero/test_zeropp.py
pytest --verbose runtime/test_autocast.py
pytest --verbose runtime/test_data.py
pytest --verbose runtime/zero/test_zero_dynamic_class.py