NVIDIA/Megatron-LM

31 workflows · maturity 67% · 10 patterns · GitHub ↗

Security 8.06/100

Practices

✓ Matrix · ✓ Permissions · ○ Security scan · ✓ AI review · ○ Cache · ✓ Concurrency · ✓ Reusable workflows

Detected patterns

ai-code-review chaos-engineering ecosystem-ci flaky-test-retry hardware-matrix least-privilege-permissions multi-channel-release multi-stage-release per-sample-ci reusable-workflows

Security dimensions

permissions

8.1

security scan

supply chain

secret handling

harden runner

Workflows (31)

_build_test_publish_wheel matrix .github/workflows/_build_test_publish_wheel.yml

Triggers

workflow_call

Runs on

${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}, ubuntu-latest

Jobs

build-and-test-wheels, publish-wheels

Matrix

include, include.IMAGE, include.PACKAGE, include.PLATFORM→ amd64, arm64, megatron-core, megatron-fsdp, quay.io/pypa/manylinux_2_28_aarch64, quay.io/pypa/manylinux_2_28_x86_64

Commands

set -x if [ "$PACKAGE" = "megatron-core" ]; then ROOTDIR="megatron/core" BUILD_DIR="." elif [ "$PACKAGE" = "megatron-fsdp" ]; then ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp" BUILD_DIR="megatron/core/distributed/fsdp/src" else echo Unknown package: $PACKAGE exit 1 fi if [ "$PUBLISH_DRYRUN" = "true" ]; then PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" $ROOTDIR/package_info.py) sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py fi pushd $BUILD_DIR rm LICENSE || true docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\ for python_version in cp310 cp311 cp312 cp313; do \ /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools<80.0.0,>=77.0.0" build; \ done && \ for python_version in cp310 cp311 cp312 cp313; do \ /opt/python/${python_version}-${python_version}/bin/python -m build; \ done \ ' PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl") if [ -n "$PLATFORM_WHEELS" ]; then echo "Found platform wheels to repair: $PLATFORM_WHEELS" docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE auditwheel repair $PLATFORM_WHEELS docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE rm -rf dist/*.whl docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE cp -a wheelhouse/* dist/ fi popd pushd $ROOTDIR EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)") popd echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}" if [ "$PACKAGE" = "megatron-fsdp" ]; then mkdir -p dist/ cp -a megatron/core/distributed/fsdp/src/dist/* dist/ fi ls -al dist/
ls -al dist/ if [ "$PACKAGE" = "megatron-core" ]; then ROOTPATH="megatron.core" WHEEL_PREFIX="megatron_core" elif [ "$PACKAGE" = "megatron-fsdp" ]; then ROOTPATH="megatron_fsdp" WHEEL_PREFIX="megatron_fsdp" else echo Unknown package: $PACKAGE exit 1 fi if [ "$PACKAGE" = "megatron-core" ]; then if [[ "$PLATFORM" == "arm64" ]]; then for file in dist/$WHEEL_PREFIX*cp310*aarch64.whl; do pip install --no-cache-dir "$file" done else for file in dist/$WHEEL_PREFIX*cp310*x86_64.whl; do pip install --no-cache-dir "$file" done fi else pip install --no-cache-dir dist/$WHEEL_PREFIX*.whl fi sudo rm -rf megatron/ RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)") test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"
# Delete sdist for arm64 since we already upload it with amd64. if [ "$PLATFORM" == "arm64" ]; then rm dist/*.tar.gz fi ls -al dist/ pip install twine twine upload \ --verbose \ -r $TWINE_REPOSITORY \ -u $TWINE_USERNAME \ -p $TWINE_PASSWORD \ dist/*

View raw YAML

# Reusable workflow: builds the wheels, smoke-tests them and optionally
# publishes them with twine.
on:
  workflow_call:
    inputs:
      # Git ref that is checked out before building.
      ref:
        required: false
        description: Ref (SHA or branch) to release
        type: string
        default: ${{ github.sha }}
      # When true, the build step appends a random `.dev<N>` suffix to
      # PRE_RELEASE and the uploaded artifact is labelled `dry-run`.
      dry-run:
        required: false
        description: Upload to PyPI Test instance
        type: boolean
        default: true
      # When true, the `publish-wheels` job is skipped entirely.
      no-publish:
        required: false
        description: Do not publish the wheel
        type: boolean
        default: true
    secrets:
      # Exposed to twine as TWINE_PASSWORD in the publish step.
      TWINE_PASSWORD:
        required: true

jobs:
  # Builds one wheel set per (package, platform) pair inside the matching
  # manylinux image, then installs and version-checks the result.
  build-and-test-wheels:
    # amd64 entries run on the x86 runner, everything else on the ARM runner.
    runs-on: ${{ matrix.PLATFORM == 'amd64' && 'ubuntu-22.04' || 'ubuntu-22.04-arm' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - PACKAGE: megatron-core
            PLATFORM: arm64
            IMAGE: quay.io/pypa/manylinux_2_28_aarch64
          - PACKAGE: megatron-core
            PLATFORM: amd64
            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
          - PACKAGE: megatron-fsdp
            PLATFORM: amd64
            IMAGE: quay.io/pypa/manylinux_2_28_x86_64
    # Matrix values and the dry-run flag are handed to the scripts as
    # environment variables.
    env:
      PUBLISH_DRYRUN: ${{ inputs.dry-run }}
      PACKAGE: ${{ matrix.PACKAGE }}
      PLATFORM: ${{ matrix.PLATFORM }}
      IMAGE: ${{ matrix.IMAGE }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          ref: ${{ inputs.ref }}

      # Builds sdist + wheels for CPython 3.10-3.13 inside the manylinux image,
      # repairs platform wheels with auditwheel, and exports the version from
      # package_info.py as the `expected-release-number` step output.
      - name: Build wheel
        id: build-wheel
        run: |
          set -x

          if [ "$PACKAGE" = "megatron-core" ]; then
            ROOTDIR="megatron/core"
            BUILD_DIR="."
          elif [ "$PACKAGE" = "megatron-fsdp" ]; then
            ROOTDIR="megatron/core/distributed/fsdp/src/megatron_fsdp"
            BUILD_DIR="megatron/core/distributed/fsdp/src"
          else
            echo Unknown package: $PACKAGE
            exit 1
          fi

          if [ "$PUBLISH_DRYRUN" = "true" ]; then
            # Accept both quote styles: the release workflow's version bump
            # rewrites the line as PRE_RELEASE = "...", which a pattern that
            # only matches single quotes would silently read as empty.
            PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = [\"']\([^\"']*\)[\"'].*/\1/p" $ROOTDIR/package_info.py)
            sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py
          fi

          pushd $BUILD_DIR
            rm LICENSE || true
            docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\
              for python_version in cp310 cp311 cp312 cp313; do \
                /opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools<80.0.0,>=77.0.0" build; \
              done && \
              for python_version in cp310 cp311 cp312 cp313; do \
                /opt/python/${python_version}-${python_version}/bin/python -m build; \
              done \
            '

            # Only platform-specific wheels need auditwheel; pure-python
            # (*-none-any) wheels are left untouched.
            PLATFORM_WHEELS=$(find dist -name "*.whl" -not -name "*-none-any.whl")
            if [ -n "$PLATFORM_WHEELS" ]; then
                echo "Found platform wheels to repair: $PLATFORM_WHEELS"
                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE auditwheel repair $PLATFORM_WHEELS
                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE rm -rf dist/*.whl
                docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE cp -a wheelhouse/* dist/
            fi
          popd

          pushd $ROOTDIR
            EXPECTED_RELEASE_NUMBER=$(python -c "import package_info; print(package_info.__version__)")
          popd

          echo "expected-release-number=$EXPECTED_RELEASE_NUMBER" | tee -a "${GITHUB_OUTPUT}"

          # megatron-fsdp builds in a sub-directory; copy its artifacts to the
          # top-level dist/ that the following steps read.
          if [ "$PACKAGE" = "megatron-fsdp" ]; then
            mkdir -p dist/
            cp -a megatron/core/distributed/fsdp/src/dist/* dist/
          fi

          ls -al dist/

      # Installs the freshly built wheel and checks that the importable
      # package reports the version recorded by the build step.
      - name: Test wheels
        run: |
          ls -al dist/

          case "$PACKAGE" in
            megatron-core)
              ROOTPATH="megatron.core"
              WHEEL_PREFIX="megatron_core"
              ;;
            megatron-fsdp)
              ROOTPATH="megatron_fsdp"
              WHEEL_PREFIX="megatron_fsdp"
              ;;
            *)
              echo Unknown package: $PACKAGE
              exit 1
              ;;
          esac

          if [ "$PACKAGE" != "megatron-core" ]; then
            pip install --no-cache-dir dist/$WHEEL_PREFIX*.whl
          elif [[ "$PLATFORM" == "arm64" ]]; then
            for file in dist/$WHEEL_PREFIX*cp310*aarch64.whl; do
              pip install --no-cache-dir "$file"
            done
          else
            for file in dist/$WHEEL_PREFIX*cp310*x86_64.whl; do
              pip install --no-cache-dir "$file"
            done
          fi

          # Drop the source tree so the import below resolves to the
          # installed wheel.
          sudo rm -rf megatron/

          RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)")
          test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"

      # Stores dist/ as an artifact named after package, platform and run mode.
      - name: Upload wheels
        uses: actions/upload-artifact@v6
        with:
          path: dist/
          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}

  # Uploads the artifacts produced by build-and-test-wheels; skipped when the
  # caller sets `no-publish`.
  publish-wheels:
    if: ${{ inputs.no-publish == false }}
    needs:
      - build-and-test-wheels
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - PACKAGE: megatron-core
            PLATFORM: arm64
          - PACKAGE: megatron-core
            PLATFORM: amd64
          - PACKAGE: megatron-fsdp
            PLATFORM: amd64
    env:
      PACKAGE: ${{ matrix.PACKAGE }}
    steps:
      # The artifact name must match the one used by the upload step of the
      # build job.
      - name: Download wheels
        uses: actions/download-artifact@v7
        with:
          name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
          merge-multiple: true
          path: dist/

      # Uploads everything in dist/ with twine. The target index is `pypi` for
      # runs on main or on branches starting with `r`, otherwise `testpypi`.
      - name: Publish wheels
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
          TWINE_REPOSITORY: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'pypi' || 'testpypi' }}
          PLATFORM: ${{ matrix.PLATFORM }}
        run: |
          # Delete sdist for arm64 since we already upload it with amd64.
          # -f: a missing sdist must not abort the upload.
          if [ "$PLATFORM" == "arm64" ]; then
            rm -f dist/*.tar.gz
          fi

          ls -al dist/
          pip install twine

          # twine picks up TWINE_USERNAME and TWINE_PASSWORD from the
          # environment, so the token never appears on the command line.
          twine upload \
            --verbose \
            --non-interactive \
            -r "$TWINE_REPOSITORY" \
            dist/*

_release_library perms .github/workflows/_release_library.yml

Triggers

workflow_call

Runs on

ubuntu-latest, ubuntu-latest, ubuntu-latest

Jobs

build-test-publish-wheels-dry-run, bump-next-version, build-test-publish-wheels, create-gh-release, publish-docs, notify

Actions

mikepenz/release-changelog-builder-action

Commands

set +u cd ${{ github.run_id }} PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py" MAJOR=$(cat $PACKAGE_INFO_FILE | awk '/^MAJOR = /' | awk -F"= " '{print $2}') MINOR=$(cat $PACKAGE_INFO_FILE | awk '/^MINOR = /' | awk -F"= " '{print $2}') PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}') PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'") echo "release-version=$MAJOR.$MINOR.$PATCH$PRERELEASE" | tee -a "$GITHUB_OUTPUT" if [[ "$PRERELEASE" != "" ]]; then if [[ "$PRERELEASE" == *rc* ]]; then NEXT_PATCH=$PATCH NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1)) elif [[ "$PRERELEASE" == *a* ]]; then NEXT_PATCH=$PATCH NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1)) else echo "Unknown pre-release: $PRERELEASE" exit 1 fi else NEXT_PATCH=$((${PATCH} + 1)) NEXT_PRERELEASE=$PRERELEASE fi sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"
set +u cd ${{ github.run_id }} PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py" MAJOR=$(cat $PACKAGE_INFO_FILE | awk '/^MAJOR = /' | awk -F"= " '{print $2}') MINOR=$(cat $PACKAGE_INFO_FILE | awk '/^MINOR = /' | awk -F"= " '{print $2}') PATCH=$(cat $PACKAGE_INFO_FILE | awk '/^PATCH = /' | awk -F"= " '{print $2}') PRERELEASE=$(cat $PACKAGE_INFO_FILE | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'") if [[ "$PRERELEASE" != "" ]]; then if [[ "$PRERELEASE" == *rc* ]]; then NEXT_PATCH=$PATCH NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1)) elif [[ "$PRERELEASE" == *a* ]]; then NEXT_PATCH=$PATCH NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1)) else echo "Unknown pre-release: $PRERELEASE" exit 1 fi else NEXT_PATCH=$((${PATCH} + 1)) NEXT_PRERELEASE=$PRERELEASE fi sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"
cd ${{ github.run_id }} TMP_BRANCH="deploy-release/$(uuidgen)" git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" git checkout -b "$TMP_BRANCH" git add -A . git commit -m "beep boop 🤖: Bumping versions" || echo "No changes to commit" git push -u origin "$TMP_BRANCH" echo "TMP_BRANCH=$TMP_BRANCH" | tee -a $GITHUB_ENV # Create PR to collect app based status checks that run on PRs only # (like DCO check) PR_URL=$(gh pr create \ --base ${{ inputs.version-bump-branch }} \ --head $TMP_BRANCH \ --title "beep boop 🤖: Bumping versions" \ --body "This is an automated PR to bump versions.") # Extract PR number from URL PR_NUMBER=$(echo $PR_URL | grep -o '[0-9]*$')
cd ${{ github.run_id }} git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" CMD=$(echo -E 'git push origin ${{ inputs.version-bump-branch }}') if [[ "$IS_DRY_RUN" == "true" ]]; then echo "dry-run enabled, would have run: $CMD" else # Here we account for potential race conditions from multiple concurrent releases. # Those can be legit (operating on different packages within the monorepo, for example) # but the pushes would be still rejected purely because of git's inability to # push non-fast-forward updates to the branch. In this case we would need to let # a retry. git fetch origin ${{ inputs.version-bump-branch }} git checkout ${{ inputs.version-bump-branch }} git merge ${{ env.TMP_BRANCH }} for attempt in {1..3}; do if eval "$CMD"; then echo "Git push succeeded on attempt $attempt" break else echo "Git push failed on attempt $attempt" if [[ $attempt -lt 3 ]]; then sleep $((RANDOM % 3 + 1)) # We refetch, reset and re-merge. Note resetting because the local # branch is "contaminated" with previous merge attempt. git fetch origin ${{ inputs.version-bump-branch }} git reset --hard origin/${{ inputs.version-bump-branch }} git merge ${{ env.TMP_BRANCH }} else echo "Git push failed after 3 attempts" exit 1 fi fi done fi
cd ${{ github.run_id }} git push -d origin ${{ env.TMP_BRANCH }}
cd ${{ github.run_id }} # If gh-release-from-tag is provided, use it if [[ -n "${{ inputs.gh-release-from-tag }}" ]]; then FROM_TAG="${{ inputs.gh-release-from-tag }}" echo "Using provided fromTag: $FROM_TAG" else # Get the most recent tag FROM_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "") if [[ -z "$FROM_TAG" ]]; then echo "No previous tags found, leaving fromTag empty" else echo "Auto-detected most recent tag: $FROM_TAG" fi fi echo "from-tag=$FROM_TAG" >> $GITHUB_OUTPUT
cd ${{ github.run_id }} IS_RELEASE_CANDIDATE=$([[ "$VERSION" == *rc* ]] && echo "true" || echo "false") IS_ALPHA=$([[ "$VERSION" == *a* ]] && echo "true" || echo "false") IS_PRERELEASE=$([[ "$IS_RELEASE_CANDIDATE" == "true" || "$IS_ALPHA" == "true" ]] && echo "true" || echo "false") NAME="NVIDIA $PROJECT_NAME ${VERSION}" # Use built changelog if available, otherwise fall back to CHANGELOG.md if [[ -n "$BUILT_CHANGELOG" ]]; then CHANGELOG="$BUILT_CHANGELOG" elif [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then DATE=$(date +"%Y-%m-%d") CHANGELOG="Prerelease: $NAME ($DATE)" else CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md) CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//') fi echo "is-release-candidate=$IS_RELEASE_CANDIDATE" | tee -a "$GITHUB_OUTPUT" PAYLOAD=$(jq -nc \ --arg TAG_NAME "${TAG_PREFIX}v${VERSION}" \ --arg CI_COMMIT_BRANCH "$SHA" \ --arg NAME "$NAME" \ --arg BODY "$CHANGELOG" \ --argjson PRERELEASE "$IS_PRERELEASE" \ '{ "tag_name": $TAG_NAME, "target_commitish": $CI_COMMIT_BRANCH, "name": $NAME, "body": $BODY, "draft": false, "prerelease": $PRERELEASE, "generate_release_notes": false }' ) echo -E "$PAYLOAD" > payload.txt CMD=$(echo -E 'curl -L \ -X POST \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer '"$GH_TOKEN"'" \ -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/'"$REPOSITORY"'/releases \ -d @payload.txt ') if [[ "$IS_DRY_RUN" == "true" ]]; then echo -E "$CMD" else eval "$CMD" fi

View raw YAML

# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Release

# Every `run` step uses bash with command tracing (-x), exit on error (-e),
# errors on unset variables (-u) and pipeline failure propagation (pipefail).
defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

# Reusable release workflow: version bump, wheel build/publish, GitHub release
# and docs publication.
on:
  workflow_call:
    inputs:
      release-ref:
        required: true
        description: Ref (SHA or branch) to release
        type: string
      dry-run:
        type: boolean
        required: true
        description: Do not publish a wheel and GitHub release.
      version-bump-branch:
        type: string
        required: true
        description: Branch to target for version bump
      create-gh-release:
        required: false
        description: Create a GitHub release
        type: boolean
        default: true
      gh-release-use-changelog-builder:
        required: false
        description: Use release-changelog-builder-action to dynamically build changelog
        type: boolean
        default: true
      gh-release-changelog-config:
        required: false
        description: Path to changelog builder configuration file
        type: string
        default: ".github/workflows/config/changelog-config.json"
      gh-release-from-tag:
        required: false
        description: Starting tag for changelog builder (leave empty for auto-detect)
        type: string
        default: ""
      # Consumed by the "Build Changelog" step as the action's `mode` input;
      # without this declaration the expression always evaluates to an empty
      # string. "PR" is the action's own default, so existing callers are
      # unaffected.
      gh-release-changelog-mode:
        required: false
        description: Mode for the changelog builder (PR, COMMIT or HYBRID)
        type: string
        default: "PR"
      publish-docs:
        required: false
        description: Publish documentation to S3 after release
        type: boolean
        default: true
    secrets:
      TWINE_PASSWORD:
        required: true
      SLACK_WEBHOOK:
        required: true
      PAT:
        required: true
      AWS_ASSUME_ROLE_ARN:
        required: true
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true
      AKAMAI_HOST:
        required: true
      AKAMAI_CLIENT_TOKEN:
        required: true
      AKAMAI_CLIENT_SECRET:
        required: true
      AKAMAI_ACCESS_TOKEN:
        required: true
      S3_BUCKET_NAME:
        required: true

# Scopes granted to the automatically provided GITHUB_TOKEN.
# NOTE(review): the git pushes and the release API call visible in this file
# authenticate with secrets.PAT; confirm which steps rely on these scopes.
permissions:
  contents: write # Write (not just read) access to repository contents
  pull-requests: write # To create PR(s)

jobs:
  # Rehearsal build: wheels are built and tested with the dry-run version
  # suffix and nothing is published.
  build-test-publish-wheels-dry-run:
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    secrets:
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
    with:
      ref: ${{ inputs.release-ref }}
      dry-run: true
      no-publish: true

  # Records the version being released, bumps both package_info.py files to
  # the next version and lands that bump on the version-bump branch.
  bump-next-version:
    needs: build-test-publish-wheels-dry-run
    runs-on: ubuntu-latest
    if: ${{ (success() || !failure()) && !cancelled() }}
    env:
      IS_DRY_RUN: ${{ inputs.dry-run }}
    outputs:
      release-version: ${{ steps.bump-version-mcore.outputs.release-version }}
    steps:
      # Full history and tags, cloned into a directory named after the run id.
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          ref: ${{ inputs.release-ref }}
          path: ${{ github.run_id }}
          token: ${{ secrets.PAT }}
          fetch-depth: 0
          fetch-tags: true
      # Emits the current megatron.core version as `release-version`, then
      # rewrites PATCH / PRE_RELEASE in package_info.py to the next version.
      - name: Bump version MCore
        id: bump-version-mcore
        env:
          SRC_DIR: ""
          PYPROJECT_NAME: "megatron.core"
        run: |
          set +u
          cd ${{ github.run_id }}

          PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"

          # Prints the right-hand side of the `<NAME> = <value>` line.
          read_field() {
            cat $PACKAGE_INFO_FILE | awk "/^$1 = /" | awk -F"= " '{print $2}'
          }

          MAJOR=$(read_field MAJOR)
          MINOR=$(read_field MINOR)
          PATCH=$(read_field PATCH)
          PRERELEASE=$(read_field PRE_RELEASE | tr -d '"' | tr -d "'")

          echo "release-version=$MAJOR.$MINOR.$PATCH$PRERELEASE" | tee -a "$GITHUB_OUTPUT"

          # Final releases bump PATCH; rc/alpha releases keep PATCH and bump
          # the pre-release counter.
          if [[ "$PRERELEASE" == "" ]]; then
            NEXT_PATCH=$((${PATCH} + 1))
            NEXT_PRERELEASE=$PRERELEASE
          elif [[ "$PRERELEASE" == *rc* ]]; then
            NEXT_PATCH=$PATCH
            NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
          elif [[ "$PRERELEASE" == *a* ]]; then
            NEXT_PATCH=$PATCH
            NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1))
          else
            echo "Unknown pre-release: $PRERELEASE"
            exit 1
          fi

          sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE
          sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE

          echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"

      # Same bump as the MCore step, applied to the megatron_fsdp
      # package_info.py; only the next `version` is emitted.
      - name: Bump version MFSDP
        id: bump-version-mfsdp
        env:
          SRC_DIR: "megatron/core/distributed/fsdp/src/"
          PYPROJECT_NAME: "megatron_fsdp"
        run: |
          set +u

          cd ${{ github.run_id }}

          PACKAGE_INFO_FILE="$SRC_DIR${PYPROJECT_NAME//.//}/package_info.py"

          MAJOR=$(awk -F"= " '/^MAJOR = /{print $2}' $PACKAGE_INFO_FILE)
          MINOR=$(awk -F"= " '/^MINOR = /{print $2}' $PACKAGE_INFO_FILE)
          PATCH=$(awk -F"= " '/^PATCH = /{print $2}' $PACKAGE_INFO_FILE)
          PRERELEASE=$(awk -F"= " '/^PRE_RELEASE = /{print $2}' $PACKAGE_INFO_FILE | tr -d '"' | tr -d "'")

          # Final releases bump PATCH; rc/alpha releases keep PATCH and bump
          # the pre-release counter.
          case "$PRERELEASE" in
            "")
              NEXT_PATCH=$((${PATCH} + 1))
              NEXT_PRERELEASE=$PRERELEASE
              ;;
            *rc*)
              NEXT_PATCH=$PATCH
              NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
              ;;
            *a*)
              NEXT_PATCH=$PATCH
              NEXT_PRERELEASE=a$((${PRERELEASE#a} + 1))
              ;;
            *)
              echo "Unknown pre-release: $PRERELEASE"
              exit 1
              ;;
          esac

          sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" $PACKAGE_INFO_FILE
          sed -i "/^PRE_RELEASE/c\PRE_RELEASE = \"$NEXT_PRERELEASE\"" $PACKAGE_INFO_FILE

          echo "version=$MAJOR.$MINOR.$NEXT_PATCH$NEXT_PRERELEASE" | tee -a "$GITHUB_OUTPUT"

      # Commits the version bump on a uniquely named temporary branch, pushes
      # it and opens a PR against the version-bump branch. The branch name is
      # exported as TMP_BRANCH for the following steps.
      - name: Create and push deployment branch
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          # Passed through the environment instead of being interpolated into
          # the script, so the value is never parsed as shell code.
          VERSION_BUMP_BRANCH: ${{ inputs.version-bump-branch }}
        run: |
          cd ${{ github.run_id }}

          TMP_BRANCH="deploy-release/$(uuidgen)"
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git checkout -b "$TMP_BRANCH"
          git add -A .
          git commit -m "beep boop 🤖: Bumping versions" || echo "No changes to commit"
          git push -u origin "$TMP_BRANCH"
          echo "TMP_BRANCH=$TMP_BRANCH" | tee -a "$GITHUB_ENV"

          # Create PR to collect app based status checks that run on PRs only
          # (like DCO check)
          gh pr create \
            --base "$VERSION_BUMP_BRANCH" \
            --head "$TMP_BRANCH" \
            --title "beep boop 🤖: Bumping versions" \
            --body "This is an automated PR to bump versions."

      # Polls commit statuses and check runs on the head of TMP_BRANCH (up to
      # 30 attempts, 10 s apart) and fails the step unless all of them pass.
      - name: Wait for status checks on tmp branch
        uses: actions/github-script@v8
        id: wait-status
        with:
          github-token: ${{ secrets.PAT }}
          script: |
            const branch = process.env.TMP_BRANCH;
            const owner = context.repo.owner;
            const repo = context.repo.repo;

            // Get latest commit SHA of branch
            const { data: refData } = await github.rest.git.getRef({
              owner,
              repo,
              ref: `heads/${branch}`,  // note: no 'refs/' prefix here
            });

            const sha = refData.object.sha;

            console.log(`Polling status for commit SHA: ${sha}`);

            const maxAttempts = 30;
            const pollIntervalMs = 10000;
            // A check run only counts as passed when it has finished with one
            // of these conclusions; `completed` alone also covers failures.
            const passingConclusions = ['success', 'neutral', 'skipped'];
            const delay = ms => new Promise(res => setTimeout(res, ms));

            let checksPassed = false;
            let attempt = 0;

            while (!checksPassed && attempt < maxAttempts) {
              attempt++;

              // Use commit SHA instead of branch ref
              const { data: status } = await github.rest.repos.getCombinedStatusForRef({
                owner,
                repo,
                ref: sha,
              });

              const { data: checks } = await github.rest.checks.listForRef({
                owner,
                repo,
                ref: sha,
              });

              const allStatuses = status.statuses;
              const allChecks = checks.check_runs;

              if (allStatuses.length === 0 && allChecks.length === 0) {
                console.log(`Attempt ${attempt}: No checks or statuses yet. Waiting...`);
                await delay(pollIntervalMs);
                continue;
              }

              const statusesOk = allStatuses.every(s => s.state === 'success');
              const checksOk = allChecks.every(
                c => c.status === 'completed' && passingConclusions.includes(c.conclusion)
              );

              if (statusesOk && checksOk) {
                console.log('✅ All checks passed.');
                checksPassed = true;
                break
              }

              // Name what is still pending or failing to ease debugging.
              const notPassing = [
                ...allStatuses.filter(s => s.state !== 'success').map(s => `${s.context}=${s.state}`),
                ...allChecks
                  .filter(c => c.status !== 'completed' || !passingConclusions.includes(c.conclusion))
                  .map(c => `${c.name}=${c.conclusion || c.status}`),
              ];
              console.log(`Attempt ${attempt}: Checks not complete yet (${notPassing.join(', ')}). Waiting...`);
              await delay(pollIntervalMs);
            }

            if (!checksPassed) {
              core.setFailed('❌ Status checks did not pass in time');
            }

      # Merges TMP_BRANCH into the version-bump branch and pushes the result.
      # In dry-run mode the push command is only printed. Otherwise the push is
      # attempted up to three times; after a failed attempt the local branch is
      # reset to the freshly fetched remote state and TMP_BRANCH is merged
      # again before retrying.
      - name: Merge into ${{ inputs.version-bump-branch }}
        run: |
          cd ${{ github.run_id }}

          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          CMD=$(echo -E 'git push origin ${{ inputs.version-bump-branch }}')

          if [[ "$IS_DRY_RUN" == "true" ]]; then
            echo "dry-run enabled, would have run: $CMD"
          else
            # Here we account for potential race conditions from multiple concurrent releases.
            # Those can be legit (operating on different packages within the monorepo, for example)
            # but the pushes would be still rejected purely because of git's inability to
            # push non-fast-forward updates to the branch. In this case we would need to let
            # a retry.
            git fetch origin ${{ inputs.version-bump-branch }}
            git checkout ${{ inputs.version-bump-branch }}
            git merge ${{ env.TMP_BRANCH }}

            for attempt in {1..3}; do
              if eval "$CMD"; then
                echo "Git push succeeded on attempt $attempt"
                break
              else
                echo "Git push failed on attempt $attempt"
                if [[ $attempt -lt 3 ]]; then
                  sleep $((RANDOM % 3 + 1))
                  # We refetch, reset and re-merge. Note resetting because the local
                  # branch is "contaminated" with previous merge attempt.
                  git fetch origin ${{ inputs.version-bump-branch }}
                  git reset --hard origin/${{ inputs.version-bump-branch }}
                  git merge ${{ env.TMP_BRANCH }}
                else
                  echo "Git push failed after 3 attempts"
                  exit 1
                fi
              fi
            done
          fi

      # Cleanup that runs even when earlier steps failed. If the job stopped
      # before the temporary branch was created there is nothing to delete,
      # and the checkout directory may not exist either.
      - name: Delete ${{ env.TMP_BRANCH }} branch
        if: always()
        run: |
          if [[ -z "${TMP_BRANCH:-}" ]]; then
            echo "No temporary branch was created; nothing to delete"
            exit 0
          fi

          cd ${{ github.run_id }}
          git push -d origin "$TMP_BRANCH"

  # Real build of the release wheels. Publishing is suppressed when the caller
  # requested a dry run, in line with the `dry-run` input's contract
  # ("Do not publish a wheel and GitHub release."). `dry-run: false` is kept
  # so the wheels carry the real version without a `.dev` suffix.
  build-test-publish-wheels:
    needs: [bump-next-version]
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    with:
      dry-run: false
      ref: ${{ inputs.release-ref }}
      no-publish: ${{ inputs.dry-run }}
    secrets:
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}

  # Creates the GitHub release for the version recorded by bump-next-version
  # (tag `core_v<version>`).
  create-gh-release:
    needs: [build-test-publish-wheels, bump-next-version]
    runs-on: ubuntu-latest
    if: |
      (
        success() || !failure()
      )
      && inputs.create-gh-release == true
      && !cancelled()
    outputs:
      is-release-candidate: ${{ steps.version-number.outputs.is-release-candidate }}
    env:
      REPOSITORY: ${{ github.repository }}
      PROJECT_NAME: Megatron Core
      VERSION: ${{ needs.bump-next-version.outputs.release-version }}
      TAG_PREFIX: core_
    steps:
      # Full history and tags are needed: the "Determine fromTag" step runs
      # `git describe --tags`, which finds nothing in the default shallow,
      # tag-less clone.
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          path: ${{ github.run_id }}
          ref: ${{ inputs.release-ref }}
          token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
          fetch-depth: 0
          fetch-tags: true

      # Chooses the changelog start tag: the caller-provided one if set,
      # otherwise whatever `git describe` reports (possibly empty).
      - name: Determine fromTag for changelog
        id: determine-from-tag
        if: inputs.gh-release-use-changelog-builder == true
        run: |
          cd ${{ github.run_id }}

          # Start from the explicit input; fall back to auto-detection.
          FROM_TAG="${{ inputs.gh-release-from-tag }}"

          if [[ -n "$FROM_TAG" ]]; then
            echo "Using provided fromTag: $FROM_TAG"
          else
            # Most recent tag, or empty when there is none.
            FROM_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "")
            if [[ -n "$FROM_TAG" ]]; then
              echo "Auto-detected most recent tag: $FROM_TAG"
            else
              echo "No previous tags found, leaving fromTag empty"
            fi
          fi

          echo "from-tag=$FROM_TAG" >> $GITHUB_OUTPUT

      # Generates the release notes between fromTag and the release ref; errors
      # in the action do not fail the job (failOnError is "false").
      - name: Build Changelog
        id: build-changelog
        if: inputs.gh-release-use-changelog-builder == true
        uses: mikepenz/release-changelog-builder-action@v6.1.0
        with:
          owner: ${{ github.repository_owner }}
          repo: ${{ github.event.repository.name }}
          configuration: ${{ github.run_id }}/${{ inputs.gh-release-changelog-config }}
          mode: ${{ inputs.gh-release-changelog-mode }}
          fromTag: ${{ steps.determine-from-tag.outputs.from-tag }}
          toTag: ${{ inputs.release-ref }}
          ignorePreReleases: 'false'
          failOnError: 'false'
        env:
          GITHUB_TOKEN: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}

      # Assembles the release payload with jq and POSTs it to the GitHub
      # releases API with curl. In dry-run mode the curl command is printed
      # instead of executed. The release body is the built changelog when one
      # is available; otherwise a one-line note for release candidates, or the
      # matching "## <name>" section of CHANGELOG.md.
      # Emits the `is-release-candidate` step output.
      - name: Create release
        id: version-number
        env:
          # Used as `target_commitish` of the release.
          SHA: ${{ inputs.release-ref }}
          # Bearer token for the API call.
          GH_TOKEN: ${{ secrets.PAT }}
          IS_DRY_RUN: ${{ inputs.dry-run }}
          # Output of the "Build Changelog" step (empty when it was skipped).
          BUILT_CHANGELOG: ${{ steps.build-changelog.outputs.changelog }}
        run: |
          cd ${{ github.run_id }}

          IS_RELEASE_CANDIDATE=$([[ "$VERSION" == *rc* ]] && echo "true" || echo "false")
          IS_ALPHA=$([[ "$VERSION" == *a* ]] && echo "true" || echo "false")
          IS_PRERELEASE=$([[ "$IS_RELEASE_CANDIDATE" == "true" || "$IS_ALPHA" == "true" ]] && echo "true" || echo "false")
          NAME="NVIDIA $PROJECT_NAME ${VERSION}"

          # Use built changelog if available, otherwise fall back to CHANGELOG.md
          if [[ -n "$BUILT_CHANGELOG" ]]; then
            CHANGELOG="$BUILT_CHANGELOG"
          elif [[ "$IS_RELEASE_CANDIDATE" == "true" ]]; then
            DATE=$(date +"%Y-%m-%d")
            CHANGELOG="Prerelease: $NAME ($DATE)"
          else
            CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
            CHANGELOG=$(echo "$CHANGELOG" | sed '/./,$!d' | sed ':a;N;$!ba;s/\n$//')
          fi

          echo "is-release-candidate=$IS_RELEASE_CANDIDATE" | tee -a "$GITHUB_OUTPUT"

          PAYLOAD=$(jq -nc \
                      --arg TAG_NAME "${TAG_PREFIX}v${VERSION}" \
                      --arg CI_COMMIT_BRANCH "$SHA" \
                      --arg NAME "$NAME" \
                      --arg BODY "$CHANGELOG" \
                      --argjson PRERELEASE "$IS_PRERELEASE" \
                      '{
                        "tag_name": $TAG_NAME,
                        "target_commitish": $CI_COMMIT_BRANCH,
                        "name": $NAME,
                        "body": $BODY,
                        "draft": false,
                        "prerelease": $PRERELEASE,
                        "generate_release_notes": false
                      }'
                  )
          echo -E "$PAYLOAD" > payload.txt

          CMD=$(echo -E 'curl -L \
            -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer '"$GH_TOKEN"'" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            https://api.github.com/repos/'"$REPOSITORY"'/releases \
            -d @payload.txt
          ')

          if [[ "$IS_DRY_RUN" == "true" ]]; then
            echo -E "$CMD"
          else
            eval "$CMD"
          fi

  # Publishes the documentation for the released version by calling the
  # repository's reusable release-docs workflow.
  publish-docs:
    needs: [bump-next-version, create-gh-release]
    # Runs only when docs publishing was requested and no needed job failed
    # and the run was not cancelled.
    if: ${{ !failure() && !cancelled() && inputs.publish-docs == true }}
    uses: ./.github/workflows/release-docs.yml
    secrets: inherit
    with:
      build-docs-ref: ${{ inputs.release-ref }}
      docs-version-override: ${{ needs.bump-next-version.outputs.release-version }}
      publish-as-latest: true
      dry-run: ${{ inputs.dry-run }}

  # Announces the release on Slack once the wheel, GitHub-release and
  # version-bump jobs are done. No `if:` is set, so the default job condition
  # applies.
  notify:
    runs-on: ubuntu-latest
    needs: [build-test-publish-wheels, create-gh-release, bump-next-version]
    steps:
      # Fetch the shared CI templates (pinned tag) to get the Slack action.
      - name: Checkout
        uses: actions/checkout@v6
        with:
          path: send-slack-alert
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.17.0

      # The message is prefixed with a dry-run notice, and the PyPI link points
      # at test.pypi.org, when inputs.dry-run is true.
      - name: Send Slack alert
        uses: ./send-slack-alert/.github/actions/send-slack-alert
        with:
          webhook: ${{ secrets.SLACK_WEBHOOK }}
          message: ${{ env.MESSAGE }}
        env:
          MESSAGE: |
            ${{ inputs.dry-run == true && 'This is a dry-run, nothing actually happened: ' || '' }}We have released `${{ needs.bump-next-version.outputs.release-version }}` of `NVIDIA Megatron Core` 🚀✨🎉

            • <https://github.com/${{ github.repository }}/releases/tag/core_v${{ needs.bump-next-version.outputs.release-version }}|GitHub release>
            • <https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/megatron-core/${{ needs.bump-next-version.outputs.release-version }}/|PyPi release>

_update_dependencies .github/workflows/_update_dependencies.yml

Triggers

workflow_call

Runs on

ubuntu-latest, linux-amd64-cpu16, ubuntu-latest

Jobs

pre-flight, update-lockfile, create-pr

Actions

peter-evans/create-pull-request

Commands

echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT"
mkdir -p assets/
NGC_VERSION=$(cat docker/.ngc_version.dev) echo "NGC_VERSION=${NGC_VERSION}" | tee -a "$GITHUB_OUTPUT"
docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="${{ steps.ngc-version.outputs.NGC_VERSION }}" --target=main -t megatron-core .
if ! git ls-remote --exit-code origin $SOURCE_BRANCH; then git checkout -b $SOURCE_BRANCH $TARGET_BRANCH git push origin $SOURCE_BRANCH fi
docker run \ --rm \ -v $(pwd):/workspace \ -w /workspace \ -e GH_TOKEN=${{ secrets.PAT }} \ megatron-core \ bash -c 'uv lock --upgrade'
if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then git fetch origin ${{ env.SOURCE_BRANCH }} git rebase -S origin/${{ env.SOURCE_BRANCH }} fi
PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}" if [ -z "$PR_NUMBER" ]; then echo "No PR was created, skipping comment" exit 0 fi SHA="${{ steps.create-pull-request.outputs.pull-request-head-sha }}" gh pr comment "$PR_NUMBER" --body "/ok to test $SHA"

View raw YAML

# Reusable workflow (workflow_call only): regenerates `uv.lock` for a target
# branch inside the CI dev container, opens a bump PR, waits for the required
# checks and then squash-merges the result into the target branch.
name: ~Update dependencies template
on:
  workflow_call:
    inputs:
      # Branch that receives the refreshed lock file; also part of the bump
      # branch name and the PR title.
      target-branch:
        required: true
        type: string
        description: "The target branch to bump"
    secrets:
      # Token used for checkout/push, PR creation, gh CLI calls and inside the
      # lock-file upgrade container.
      PAT:
        required: true
      # NOTE(review): SSH_KEY and SSH_PWD are required from callers, but no job
      # in this workflow references them. Possibly intended for commit signing
      # (the create-pr job runs `git rebase -S`) - confirm or drop.
      SSH_KEY:
        required: true
      SSH_PWD:
        required: true

jobs:
  # Computes today's date once and derives the bump-branch name from it, so
  # that all later jobs agree on both values.
  pre-flight:
    runs-on: ubuntu-latest
    steps:
      # Emits `date=YYYY-MM-DD` as a step output (and echoes it to the log).
      - id: ref
        name: Get date
        run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT"
    outputs:
      date: ${{ steps.ref.outputs.date }}
      bump-branch: bump-ci-container-${{ steps.ref.outputs.date }}-${{ inputs.target-branch }}

  # Builds the CI dev container from the target branch, makes sure the bump
  # branch exists, regenerates uv.lock inside the container and uploads the
  # result as an artifact for the create-pr job.
  update-lockfile:
    runs-on: linux-amd64-cpu16
    needs: [pre-flight]
    env:
      SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
      TARGET_BRANCH: ${{ inputs.target-branch }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v6
        with:
          ref: ${{ env.TARGET_BRANCH }}

      - name: Mock test data
        run: mkdir -p assets/

      # Expose the base image recorded in the repository as a step output.
      - name: Fetch NGC Version
        id: ngc-version
        run: |
          NGC_VERSION=$(cat docker/.ngc_version.dev)
          echo "NGC_VERSION=${NGC_VERSION}" | tee -a "$GITHUB_OUTPUT"

      - name: Build container
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="${{ steps.ngc-version.outputs.NGC_VERSION }}" --target=main -t megatron-core .

      # Create the bump branch from the target branch on first use; later runs
      # on the same day reuse the existing remote branch.
      - name: Create bump branch if not exists
        run: |
          if ! git ls-remote --exit-code origin "$SOURCE_BRANCH"; then
            git checkout -b "$SOURCE_BRANCH" "$TARGET_BRANCH"
            git push origin "$SOURCE_BRANCH"
          fi

      - name: Checkout repo
        uses: actions/checkout@v6
        with:
          ref: ${{ env.SOURCE_BRANCH }}

      - name: Upgrade lock file
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          # `-e GH_TOKEN` without a value forwards the variable from this
          # step's environment, so the secret never becomes part of the
          # generated script or the docker command line.
          docker run \
            --rm \
            -v "$(pwd)":/workspace \
            -w /workspace \
            -e GH_TOKEN \
            megatron-core \
            bash -c 'uv lock --upgrade'

      - name: Upload lock file
        uses: actions/upload-artifact@v6
        with:
          name: lock-file-${{ env.SOURCE_BRANCH }}
          path: uv.lock

  # Opens (or updates) the bump PR with the regenerated uv.lock, triggers CI on
  # it, waits for the branch-protection checks of the target branch and finally
  # squash-merges the bump branch into the target branch.
  create-pr:
    needs: [update-lockfile, pre-flight]
    runs-on: ubuntu-latest
    env:
      SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
      TARGET_BRANCH: ${{ inputs.target-branch }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.PAT }}
          ref: ${{ env.TARGET_BRANCH }}

      # Only rebases when the bump branch already exists on the remote.
      - name: Rebase against ${{ env.SOURCE_BRANCH }}
        run: |
          if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then
            git fetch origin ${{ env.SOURCE_BRANCH }}
            git rebase -S origin/${{ env.SOURCE_BRANCH }}
          fi

      # Drops the uv.lock produced by the update-lockfile job into the workspace.
      - name: Download lock file
        uses: actions/download-artifact@v7
        with:
          name: lock-file-${{ env.SOURCE_BRANCH }}

      - name: Create Bump PR
        uses: peter-evans/create-pull-request@v8
        id: create-pull-request
        env:
          title: "chore(beep boop 🤖): Bump `uv.lock` (${{ inputs.target-branch}}) (${{ needs.pre-flight.outputs.date }})"
        with:
          branch: ${{ env.SOURCE_BRANCH }}
          base: ${{ env.TARGET_BRANCH }}
          title: ${{ env.title }}
          token: ${{ secrets.PAT }}
          body: |
            🚀 PR to bump `uv.lock` in `${{ inputs.target-branch }}`.  

            📝 Please remember the following to-do's before merge: 
            - [ ] Verify the presubmit CI  

            🙏 Please merge this PR only if the CI workflow completed successfully.
          commit-message: ${{ env.title }}
          signoff: true
          committer: "github-actions[bot] <github-actions[bot]@users.noreply.github.com>"

      # Comments "/ok to test <head sha>" on the PR; does nothing when the
      # previous step did not report a PR number.
      - name: Post /ok to test comment
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
          if [ -z "$PR_NUMBER" ]; then
            echo "No PR was created, skipping comment"
            exit 0
          fi
          SHA="${{ steps.create-pull-request.outputs.pull-request-head-sha }}"
          gh pr comment "$PR_NUMBER" --body "/ok to test $SHA"

      - name: Wait for CI checks
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
          if [ -z "$PR_NUMBER" ]; then
            echo "No PR was created, skipping wait"
            exit 0
          fi

          # Fetch required status checks from branch protection rules
          # (tries `.checks[].context` first, then `.contexts[]`; an API error
          # yields an empty list, which skips the wait below).
          REQUIRED_CHECKS=$(gh api \
            "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
            --jq '.checks[].context' 2>/dev/null \
            || gh api \
            "repos/${{ github.repository }}/branches/${{ env.TARGET_BRANCH }}/protection/required_status_checks" \
            --jq '.contexts[]' 2>/dev/null \
            || true)

          if [ -z "$REQUIRED_CHECKS" ]; then
            echo "No branch protection rules found for ${{ env.TARGET_BRANCH }}, skipping wait"
            exit 0
          fi

          echo "Required checks from branch protection:"
          echo "$REQUIRED_CHECKS"

          echo "Waiting for required checks to complete on PR #$PR_NUMBER..."
          # Poll every 30s with no upper bound on attempts. INITIALIZED flips to
          # true the first time a required check is seen pending or missing;
          # non-passing states observed before that are not counted as failures
          # and only keep the loop waiting.
          i=0
          INITIALIZED=false
          while true; do
            i=$((i + 1))
            CHECKS_JSON=$(gh pr checks "$PR_NUMBER" --json name,state 2>/dev/null || echo "[]")
            ALL_DONE=true
            FAILED_CHECKS=""
            while IFS= read -r check; do
              CHECK_STATE=$(echo "$CHECKS_JSON" | jq -r --arg name "$check" '.[] | select(.name == $name) | .state // ""' | tr '[:upper:]' '[:lower:]')
              case "$CHECK_STATE" in
                *success*|*pass*|*skip*|*neutral*) ;;
                *pending*|*queued*|*progress*|*waiting*|*request*|"")
                  ALL_DONE=false
                  INITIALIZED=true
                  break
                  ;;
                *)
                  if [ "$INITIALIZED" = "true" ]; then
                    FAILED_CHECKS="${FAILED_CHECKS}  - ${check} (${CHECK_STATE})"$'\n'
                  else
                    ALL_DONE=false
                  fi
                  ;;
              esac
            done <<< "$REQUIRED_CHECKS"
            if [ "$ALL_DONE" = "true" ]; then
              if [ -n "$FAILED_CHECKS" ]; then
                echo "Required check(s) did not pass:"
                echo "$FAILED_CHECKS"
                exit 1
              fi
              echo "All required checks passed!"
              break
            fi
            echo "Checks not yet complete (attempt $i), retrying in 30s..."
            sleep 30
          done

      # Squash-merges the bump branch into the target branch locally, pushes
      # the result and deletes the bump branch.
      - name: Merge PR
        env:
          title: "chore(beep boop 🤖): Bump `uv.lock` (${{ env.TARGET_BRANCH}}) (${{ needs.pre-flight.outputs.date }})"
        run: |
          PR_NUMBER="${{ steps.create-pull-request.outputs.pull-request-number }}"
          if [ -z "$PR_NUMBER" ]; then
            echo "No PR was created, skipping merge"
            exit 0
          fi
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Branch names are read from the job-level environment instead of
          # being pasted into the script text.
          git fetch origin "$SOURCE_BRANCH"
          git fetch origin "$TARGET_BRANCH"
          git checkout "$TARGET_BRANCH"
          git merge --squash "origin/$SOURCE_BRANCH"
          # The title contains backticks. Expanding it with an Actions
          # expression inside double quotes made bash run `uv.lock` as a
          # command substitution and drop it from the message; reading the
          # environment variable keeps the text literal.
          git commit -m "$title"
          git pull --rebase origin "$TARGET_BRANCH"
          git push origin "$TARGET_BRANCH"
          git push origin --delete "$SOURCE_BRANCH"

auto-assign-milestone perms .github/workflows/auto-assign-milestone.yml

Triggers

push

Runs on

ubuntu-latest

Jobs

assign-milestone

Actions

nv-gha-runners/get-pr-info

Commands

MILESTONE=$(gh pr view ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \ --repo ${{ github.repository }} \ --json milestone \ --jq '.milestone.title') if [ "$MILESTONE" = "null" ] || [ -z "$MILESTONE" ]; then echo "has_milestone=false" >> $GITHUB_OUTPUT else echo "has_milestone=true" >> $GITHUB_OUTPUT echo "PR already has milestone: $MILESTONE" fi
# Get the most recent open milestone (sorted by due date, then by creation date) MILESTONE_NUMBER=$(gh api \ "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \ --jq '.[0].number') MILESTONE_TITLE=$(gh api \ "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \ --jq '.[0].title') if [ -z "$MILESTONE_NUMBER" ] || [ "$MILESTONE_NUMBER" = "null" ]; then echo "No open milestones found" echo "milestone_found=false" >> $GITHUB_OUTPUT else echo "milestone_found=true" >> $GITHUB_OUTPUT echo "milestone_number=$MILESTONE_NUMBER" >> $GITHUB_OUTPUT echo "milestone_title=$MILESTONE_TITLE" >> $GITHUB_OUTPUT echo "Found milestone: $MILESTONE_TITLE (number: $MILESTONE_NUMBER)" fi
gh pr edit ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} \ --repo ${{ github.repository }} \ --milestone "${{ steps.get_milestone.outputs.milestone_title }}" echo "✅ Assigned milestone '${{ steps.get_milestone.outputs.milestone_title }}' to PR #${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}"

View raw YAML

# Assigns an open milestone to a pull request that has none. Runs on pushes to
# `pull-request/<number>` branches.
name: Auto-assign Milestone to PR

on:
  push:
    branches:
      - "pull-request/[0-9]+"

# Scopes of the default workflow token. NOTE(review): every gh call in this
# workflow authenticates with secrets.PAT instead, so these scopes may be wider
# than needed - confirm.
permissions:
  contents: read
  pull-requests: write
  issues: write

jobs:
  # Looks up the PR behind the pushed `pull-request/<number>` branch and, when
  # it has no milestone, assigns the first open milestone returned when sorting
  # by due date (descending).
  assign-milestone:
    runs-on: ubuntu-latest
    if: github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main

      - name: Check if PR has milestone
        id: check_milestone
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          # Passed through the environment so the value is never spliced into
          # the script text.
          PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        run: |
          MILESTONE=$(gh pr view "$PR_NUMBER" \
            --repo ${{ github.repository }} \
            --json milestone \
            --jq '.milestone.title')

          if [ "$MILESTONE" = "null" ] || [ -z "$MILESTONE" ]; then
            echo "has_milestone=false" >> "$GITHUB_OUTPUT"
          else
            echo "has_milestone=true" >> "$GITHUB_OUTPUT"
            echo "PR already has milestone: $MILESTONE"
          fi

      - name: Get most recent open milestone
        if: steps.check_milestone.outputs.has_milestone == 'false'
        id: get_milestone
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          # Fetch the first open milestone (sorted by due date, descending) in
          # a single request so that number and title always belong to the same
          # milestone. `// empty` yields no output when there is none.
          MILESTONE_JSON=$(gh api \
            "repos/${{ github.repository }}/milestones?state=open&sort=due_on&direction=desc" \
            --jq '.[0] // empty')

          if [ -z "$MILESTONE_JSON" ]; then
            echo "No open milestones found"
            echo "milestone_found=false" >> "$GITHUB_OUTPUT"
          else
            MILESTONE_NUMBER=$(jq -r '.number' <<< "$MILESTONE_JSON")
            MILESTONE_TITLE=$(jq -r '.title' <<< "$MILESTONE_JSON")
            echo "milestone_found=true" >> "$GITHUB_OUTPUT"
            echo "milestone_number=$MILESTONE_NUMBER" >> "$GITHUB_OUTPUT"
            echo "milestone_title=$MILESTONE_TITLE" >> "$GITHUB_OUTPUT"
            echo "Found milestone: $MILESTONE_TITLE (number: $MILESTONE_NUMBER)"
          fi

      - name: Assign milestone to PR
        if: steps.check_milestone.outputs.has_milestone == 'false' && steps.get_milestone.outputs.milestone_found == 'true'
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          PR_NUMBER: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          # A title containing quotes, `$` or backticks would otherwise be
          # interpreted by the shell when expanded into the script.
          MILESTONE_TITLE: ${{ steps.get_milestone.outputs.milestone_title }}
        run: |
          gh pr edit "$PR_NUMBER" \
            --repo ${{ github.repository }} \
            --milestone "$MILESTONE_TITLE"

          echo "✅ Assigned milestone '$MILESTONE_TITLE' to PR #$PR_NUMBER"

auto-reminder-bot .github/workflows/auto-reminder-bot.yml

Triggers

workflow_dispatch, schedule

Runs on

ubuntu-latest

Jobs

run-script

Commands

pip install --no-cache-dir PyGithub slack-sdk
export SLACK_TOKEN=${{ secrets.SLACK_BOT_TOKEN }} export SLACK_WEBHOOK_URL=${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }} export GH_TOKEN=${{ secrets.PAT }} python tests/test_utils/python_scripts/auto_reminder_github.py

View raw YAML

# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

# Runs the review-reminder script once a day (12:00 UTC cron) or on demand.
name: Auto Reminder Bot

on:
  workflow_dispatch:
  schedule:
    - cron: "0 12 * * *"

jobs:
  run-script:
    name: Run Auto Reminder Bot
    runs-on: ubuntu-latest
    # Never run in forks.
    if: github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Check out repository code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          pip install --no-cache-dir PyGithub slack-sdk

      - name: Run Auto Reminder Bot
        # Secrets are handed to the script through the step environment rather
        # than being expanded into unquoted `export` lines, where characters
        # such as `&`, `;` or spaces in a secret would be interpreted by the
        # shell.
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_REVIEW_REMINDER_CHANNEL_WEBHOOK }}
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          python tests/test_utils/python_scripts/auto_reminder_github.py

auto-swap-labels perms .github/workflows/auto-swap-labels.yml

Triggers

pull_request_target, workflow_run

Runs on

ubuntu-latest

Jobs

check-approval

Commands

if [ "${{ github.event_name }}" = "workflow_run" ]; then if [ "${{ steps.get-pr.outcome }}" != "success" ]; then echo "No approval artifact found — review was not an approval. Skipping." exit 0 fi echo "number=$(cat pr-number/number)" >> $GITHUB_OUTPUT else echo "number=${{ github.event.pull_request.number }}" >> $GITHUB_OUTPUT fi
pip install --no-cache-dir PyGithub slack-sdk
export GH_TOKEN=${{ secrets.PAT }} export PR_NUMBER=${{ steps.pr.outputs.number }} python tests/test_utils/python_scripts/swap_pr_labels.py

View raw YAML

# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

# Swaps PR labels via tests/test_utils/python_scripts/swap_pr_labels.py.
# Two entry points:
#   - pull_request_target: a PR against main becomes ready for review or
#     receives new commits;
#   - workflow_run: the "Review Trigger" workflow completed (its `pr-number`
#     artifact carries the PR number).
name: Auto Swap Labels
on:
  pull_request_target:
    types: [ready_for_review, synchronize]
    branches:
      - main
  workflow_run:
    workflows: ["Review Trigger"]
    types: [completed]

# Scopes of the default workflow token; `actions: read` is what the artifact
# download from the triggering run uses (it passes github.token).
permissions:
  pull-requests: write
  contents: read
  actions: read

jobs:
  # Resolves the PR number for the triggering event and runs the label-swap
  # script for it. All later steps are skipped when no PR number is resolved.
  check-approval:
    runs-on: ubuntu-latest
    if: >-
      github.repository == 'NVIDIA/Megatron-LM' && (
        (github.event_name == 'pull_request_target' &&
         github.event.pull_request.base.ref == 'main' &&
         !github.event.pull_request.draft) ||
        (github.event_name == 'workflow_run' &&
         github.event.workflow_run.conclusion == 'success')
      )

    steps:
      # A missing artifact is expected (the review was not an approval), hence
      # continue-on-error.
      - name: Get PR number from workflow_run
        id: get-pr
        if: github.event_name == 'workflow_run'
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          name: pr-number
          path: pr-number
          github-token: ${{ github.token }}
          run-id: ${{ github.event.workflow_run.id }}

      - name: Set PR number
        id: pr
        run: |
          if [ "${{ github.event_name }}" = "workflow_run" ]; then
            if [ "${{ steps.get-pr.outcome }}" != "success" ]; then
              echo "No approval artifact found — review was not an approval. Skipping."
              exit 0
            fi
            PR_NUMBER=$(tr -d '[:space:]' < pr-number/number)
          else
            PR_NUMBER="${{ github.event.pull_request.number }}"
          fi

          # The artifact is produced by another workflow run; accept nothing
          # but digits so its content can neither inject extra step outputs
          # nor reach later steps as shell syntax.
          if ! [[ "$PR_NUMBER" =~ ^[0-9]+$ ]]; then
            echo "::error::PR number is not a positive integer, refusing to continue."
            exit 1
          fi
          echo "number=$PR_NUMBER" >> "$GITHUB_OUTPUT"

      - name: Check out repository code
        if: steps.pr.outputs.number
        uses: actions/checkout@v4

      - name: Set up Python
        if: steps.pr.outputs.number
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Install dependencies
        if: steps.pr.outputs.number
        run: |
          pip install --no-cache-dir PyGithub slack-sdk

      - name: Run Auto Swap Labels
        if: steps.pr.outputs.number
        # Token and PR number reach the script through the step environment
        # instead of being expanded into the script text.
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          PR_NUMBER: ${{ steps.pr.outputs.number }}
        run: |
          python tests/test_utils/python_scripts/swap_pr_labels.py

auto-update-copy-pr-bot .github/workflows/auto-update-copy-pr-bot.yml

Triggers

workflow_dispatch, schedule

Runs on

ubuntu-latest

Jobs

auto-update-copy-pr-bot

Commands

#!/bin/bash get_members() { local org=$1 team=$2 seen_file=$3 gh api "/orgs/$org/teams/$team/members" --paginate --jq '.[].login' >> "$seen_file" gh api "/orgs/$org/teams/$team/teams" --paginate --jq '.[].slug' | while read -r child; do get_members "$org" "$child" "$seen_file" done cat "$seen_file" } tmp=$(mktemp) echo "" > final.txt get_members "NVIDIA" "mcore-engineers" "$tmp" | sort -u >> final.txt && rm "$tmp" tmp=$(mktemp) get_members "NVIDIA" "mcore-reviewers" "$tmp" | sort -u >> final.txt && rm "$tmp" cat final.txt | jq -sR 'split("\n") | map(select(. != "")) | flatten | unique' export TRUSTEES=$(cat final.txt | jq -csR 'split("\n") | map(select(. != "")) | flatten | unique') yq '.trustees_override = env(TRUSTEES)' .github/copy-pr-bot.yaml | yq -o yaml > .github/copy-pr-bot.yaml.new mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml
git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git git config --global user.name "GitHub Actions" git config --global user.email "github-actions[bot]@users.noreply.github.com" git add .github/copy-pr-bot.yaml if git diff --cached --exit-code --quiet; then echo "No changes to commit. Exiting gracefully." exit 0 fi git commit -m "Update copy-pr-bot.yaml [skip ci]" git push -u origin main

View raw YAML

# Rebuilds the `trustees_override` list in .github/copy-pr-bot.yaml from the
# members of the NVIDIA `mcore-engineers` and `mcore-reviewers` teams (child
# teams included) and pushes the change to main. Runs daily at 00:00 UTC or on
# demand.
name: Auto Update Copy PR Bot

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"

jobs:
  auto-update-copy-pr-bot:
    runs-on: ubuntu-latest
    # Never run in forks.
    if: github.repository == 'NVIDIA/Megatron-LM'
    steps:
      # Check out main with the PAT.
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.PAT }}
          ref: main

      - name: Fetch list of members in mcore-reviewers team
        shell: bash -euxo pipefail {0}
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          #!/bin/bash

          # get_members ORG TEAM SEEN_FILE
          # Appends the logins of TEAM's direct members to SEEN_FILE, recurses
          # into every child team, then prints the whole SEEN_FILE. Each
          # recursion level prints the file again, so the output contains
          # duplicates; callers de-duplicate with `sort -u`.
          get_members() {
              local org=$1 team=$2 seen_file=$3    

              gh api "/orgs/$org/teams/$team/members" --paginate --jq '.[].login' >> "$seen_file"
              
              gh api "/orgs/$org/teams/$team/teams" --paginate --jq '.[].slug' | while read -r child; do
                  get_members "$org" "$child" "$seen_file"
              done

              cat "$seen_file"
          }

          # Collect both teams into final.txt, one login per line.
          tmp=$(mktemp)
          echo "" > final.txt
          get_members "NVIDIA" "mcore-engineers" "$tmp" | sort -u >> final.txt && rm "$tmp"

          tmp=$(mktemp)
          get_members "NVIDIA" "mcore-reviewers" "$tmp" | sort -u >> final.txt && rm "$tmp"

          # Log the resulting list (pretty-printed) for debugging.
          cat final.txt | jq -sR 'split("\n") | map(select(. != "")) | flatten | unique'

          # Same list as compact JSON, written into the bot configuration via a
          # temporary file that then replaces the original.
          export TRUSTEES=$(cat final.txt | jq -csR 'split("\n") | map(select(. != "")) | flatten | unique')
          yq '.trustees_override = env(TRUSTEES)' .github/copy-pr-bot.yaml | yq -o yaml > .github/copy-pr-bot.yaml.new

          mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml

      - name: Commit changes
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          # Push with the PAT; exit cleanly when the configuration is unchanged.
          git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git
          git config --global user.name "GitHub Actions"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/copy-pr-bot.yaml
          if git diff --cached --exit-code --quiet; then
            echo "No changes to commit. Exiting gracefully."
            exit 0
          fi
          git commit -m "Update copy-pr-bot.yaml [skip ci]"
          git push -u origin main

build-docs .github/workflows/build-docs.yml

Triggers

push

Runs on

ubuntu-latest

Jobs

pre-flight, build-docs, build-docs-summary

Commands

FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then echo "✅ All previous jobs completed successfully" exit 0 else echo "❌ Found $FAILED_JOBS failed job(s)" # Show which jobs failed gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' exit 1 fi

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds the documentation on pushes to main, `pull-request/<number>` and
# `deploy-release/*` branches and reports a single summary status.
name: Build docs

on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"

# One run per workflow and ref; a newer push cancels the run in progress.
# NOTE(review): this workflow only has a `push` trigger, so the
# `pull_request.number` and `label.name` lookups are presumably always empty
# and the fallbacks (`github.ref`, 'main') are what take effect - confirm.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Skipped for deployment workflows.
  build-docs:
    needs: [pre-flight]
    if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.80.2

  # Summary job. `always()` makes the parenthesised group true in every case,
  # so this runs whenever the workflow run was not cancelled.
  build-docs-summary:
    needs: [pre-flight, build-docs]
    if: |
      (
        needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
        run: |
          # Count the completed jobs of this run whose conclusion is not
          # "success". If the query itself fails, fall back to 0 explicitly
          # (the previous `|| echo 0` only printed a zero and relied on the
          # `:-0` default below).
          # NOTE(review): this job has no checkout step and sets neither
          # `--repo` nor GH_REPO; confirm that gh can resolve the repository
          # here, otherwise the query always fails and the fallback always
          # reports success.
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi

build-test-publish-wheel perms .github/workflows/build-test-publish-wheel.yml

Triggers

push, merge_group

Runs on

ubuntu-latest

Jobs

pre-flight, build-test-publish-wheels, build-test-publish-wheel-summary

Commands

FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels")))] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then echo "✅ All build-and-test-wheels jobs completed successfully" exit 0 else echo "❌ Found $FAILED_JOBS failed build-and-test-wheels job(s)" # Show which jobs failed gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels"))) | .name' exit 1 fi

View raw YAML

# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds and tests the wheels on pushes to main, `pull-request/<number>` and
# `deploy-release/*` branches and on merge-queue checks. Publishing is
# disabled here (`no-publish: true`).
name: Build, test, and publish a PyPi wheel (to testpypi).

on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

permissions:
  id-token: write
  contents: read

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'

  build-test-publish-wheels:
    needs: [pre-flight]
    uses: ./.github/workflows/_build_test_publish_wheel.yml
    with:
      no-publish: true
    secrets:
      # Production token on main and on branches whose name starts with "r",
      # test token everywhere else.
      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}

  # Summary job. `always()` makes the parenthesised group true in every case,
  # so this runs in the upstream repository whenever the run was not cancelled.
  build-test-publish-wheel-summary:
    needs: [pre-flight, build-test-publish-wheels]
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && github.repository == 'NVIDIA/Megatron-LM'
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          # Quoted: environment values are strings and the script compares
          # against the string "true".
          SKIPPING_IS_ALLOWED: "false"
        run: |
          # Count the completed build-and-test-wheels jobs of this run whose
          # conclusion is not "success". If the query itself fails, fall back
          # to 0 explicitly (the previous `|| echo 0` only printed a zero and
          # relied on the `:-0` default below).
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels")))] | length') || FAILED_JOBS=0

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All build-and-test-wheels jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed build-and-test-wheels job(s)"
              # Show which jobs failed
              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels"))) | .name'
              exit 1
          fi

cherry-pick-release-commit .github/workflows/cherry-pick-release-commit.yml

Triggers: push
Runs on: —
Jobs: cherry-pick

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Delegates to the shared cherry-pick workflow on every push to main.
# NOTE(review): the name speaks of a PR *to main*, while the trigger is a push
# to main and the pattern matches release branches - confirm the direction.
name: Create PR to main with cherry-pick from release

on:
  push:
    branches: [main]

jobs:
  cherry-pick:
    # Never run in forks.
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9
    with:
      # Handed to the shared workflow unchanged; its syntax is defined there.
      target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+'
    secrets:
      SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
      PAT: ${{ secrets.PAT }}

cicd-approve-test-queue matrix .github/workflows/cicd-approve-test-queue.yml

Triggers

schedule, workflow_dispatch

Runs on

ubuntu-latest, ubuntu-latest

Jobs

approve-queue, notify

Matrix

branch, contributor_type→ dev, external, internal, main, others

Commands

python -m pip install --upgrade pip pip install requests
gh release download v0.1.0 \ --repo NVIDIA-GitHub-Management/github-audits \ --pattern users_sso.json \ --output users_sso.json || echo '{}' > users_sso.json
import os import json import requests import re # GitHub API configuration GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] REPO = os.environ["GITHUB_REPOSITORY"] CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"] if CONTRIBUTOR_TYPE == "external": # Global limit across all branches — no division needed since we count globally. MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"]) else: MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2 API_BASE = f"https://api.github.com/repos/NVIDIA/Megatron-LM" # Load SSO users for internal/external classification with open(os.environ["SSO_USERS_FILE"]) as f: sso_users = json.load(f) # Headers for GitHub API headers = { "Authorization": f"token {GITHUB_TOKEN}", "Accept": "application/vnd.github.v3+json", "X-GitHub-Api-Version": "2022-11-28", } def make_request(endpoint, method="GET", data=None): """Make a request to the GitHub API with error handling.""" url = f"{API_BASE}/{endpoint}" try: if method == "GET": response = requests.get(url, headers=headers) else: response = requests.post(url, headers=headers, json=data) response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: print(f"Error making request to {endpoint}: {str(e)}") if hasattr(e.response, 'text'): print(f"Response: {e.response.text}") return None def is_internal_contributor(pr_info): """Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member).""" login = pr_info.get("user", {}).get("login", "") org_roles = sso_users.get(login, {}).get("org_roles", []) return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles) def get_pr_base_branch(workflow_run): """ Return the base branch of the PR associated with a workflow run, or None. Extracts PR number from head branch like 'pull-request/1913' and fetches PR info. Returns (base_branch, pr_info) tuple, or (None, None) if not a PR run. 
""" print(workflow_run.get("head_branch", "")) head_branch = workflow_run.get("head_branch", "") match = re.match(r"pull-request/(\d+)", head_branch) if not match: return None, None # Not a PR branch pattern pr_number = int(match.group(1)) # Fetch PR info from GitHub API pr_info = make_request(f"pulls/{pr_number}") if not pr_info: print(f"Failed to fetch PR #{pr_number}") return None, None base_branch = pr_info.get("base", {}).get("ref") return base_branch, pr_info def matches_contributor(workflow_run, contributor_type): """Return True if the workflow run matches the contributor type (ignores branch).""" _, pr_info = get_pr_base_branch(workflow_run) if pr_info is None: return False internal = is_internal_contributor(pr_info) return (contributor_type == "internal") == internal def matches_queue(workflow_run, target_branch, contributor_type): """ Return True if the workflow run belongs to this queue cell: matching target branch AND matching contributor type (internal/external). """ base_branch, pr_info = get_pr_base_branch(workflow_run) if base_branch is None: return False branch_match = ( (base_branch == target_branch) or (base_branch != "main" and base_branch != "dev" and target_branch == "others") ) if not branch_match: return False pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1) internal = is_internal_contributor(pr_info) contributor_match = (contributor_type == "internal") == internal if branch_match and contributor_match: print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})") return branch_match and contributor_match # Get current running and queued workflows print("Fetching workflow runs...") queued_workflow_runs = make_request("actions/runs?status=queued").get("workflow_runs", []) in_progress_workflow_runs = make_request("actions/runs?status=in_progress").get("workflow_runs", []) # For external contributors, enforce a single global concurrency limit across ALL 
branches. # For internal contributors, enforce per-branch limits as before. if CONTRIBUTOR_TYPE == "external": queued_workflow_runs = [run for run in queued_workflow_runs if run["name"] == "CICD Megatron-LM" and matches_contributor(run, CONTRIBUTOR_TYPE)] in_progress_workflow_runs = [run for run in in_progress_workflow_runs if run["name"] == "CICD Megatron-LM" and matches_contributor(run, CONTRIBUTOR_TYPE)] else: # Filter for workflows belonging to PRs targeting ${{ matrix.branch }} with matching contributor type queued_workflow_runs = [run for run in queued_workflow_runs if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)] in_progress_workflow_runs = [run for run in in_progress_workflow_runs if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)] # Count running and queued workflows queued_workflows = len(queued_workflow_runs) in_progress_workflows = len(in_progress_workflow_runs) total_workflows = queued_workflows + in_progress_workflows print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}") print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}") print(f"Total workflows: {total_workflows}") print(f"Max concurrency: {MAX_CONCURRENCY}") if total_workflows >= MAX_CONCURRENCY: print("Maximum concurrency reached, no new approvals will be made") exit(0) # Get waiting CI workflows for test environment print("Fetching deployments...") pending_workflows = make_request("actions/runs?status=waiting").get("workflow_runs", []) print("Pending workflows:", len(pending_workflows)) pending_workflows = [run for run in pending_workflows if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)] # Sort deployments by creation date (oldest first) print("Sorting workflows...") pending_workflows = sorted(pending_workflows, 
key=lambda x: x["created_at"]) # Process each deployment print(f"Processing {len(pending_workflows)} pending workflows...") for workflow in pending_workflows: if total_workflows >= MAX_CONCURRENCY: print("Maximum concurrency reached, stopping approvals") break workflow_id = workflow["id"] workflow_name = workflow["display_title"] print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") deployment_url = f"actions/runs/{workflow_id}/pending_deployments" deployment = make_request(deployment_url)[0] environment_id = deployment["environment"]["id"] # Approve the deployment status_data = { "environment_ids": [environment_id], "state": "approved", "comment": "Automatically approved by queue manager" } result = make_request(deployment_url, method="POST", data=status_data) if result: total_workflows += 1 else: print(f"Failed to approve deployment {deployment['id']}") exit(1)
curl -X POST \ -H 'Content-type: application/json' \ --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \ $SLACK_WEBHOOK

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Approve Test Queue

on:
  # Poll the queue every 5 minutes.
  schedule:
    - cron: '*/5 * * * *'
  # Manual runs from the Actions UI are also allowed.
  workflow_dispatch:

jobs:
  # Queue manager: one matrix cell per (target branch bucket, contributor type).
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    # Only the upstream repository runs the queue manager.
    if: github.repository == 'NVIDIA/Megatron-LM'
    strategy:
      # Every cell manages its own queue, so a failure in one cell must not
      # cancel the approvals still running in the others (the default
      # fail-fast behaviour would).
      fail-fast: false
      matrix:
        branch: [main, dev, others]
        contributor_type: [internal, external]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      # Fetches the login -> org role map used to tell internal from external
      # PR authors. The download is best effort: on failure an empty map is
      # written, which makes every author count as external, so surface that
      # in the run annotations instead of degrading silently.
      - name: Download SSO users list
        run: |
          gh release download v0.1.0 \
            --repo NVIDIA-GitHub-Management/github-audits \
            --pattern users_sso.json \
            --output users_sso.json || {
              echo "::warning::Could not download users_sso.json; every PR author will be classified as external"
              echo '{}' > users_sso.json
            }
        env:
          GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}

      # Counts the in-flight "CICD Megatron-LM" runs for this matrix cell and
      # approves waiting runs (oldest first) until the concurrency limit is hit.
      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          # Repository variables; both fall back to 1 when unset.
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
          MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 1 }}
          CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
          SSO_USERS_FILE: users_sso.json
          PYTHONUNBUFFERED: "1"
        shell: python
        run: |
          import os
          import json
          import requests
          import re

          # GitHub API configuration
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
          if CONTRIBUTOR_TYPE == "external":
              # Global limit across all branches — no division needed since we count globally.
              MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"])
          else:
              # Internal queues get half of the configured limit. Plain floor
              # division turns a limit of 1 (the default above) into 0, which
              # would block every internal approval forever, so any positive
              # limit is floored at 1. A limit of 0 or less still yields 0 and
              # pauses the queue.
              configured_limit = int(os.environ["MAX_CONCURRENCY"])
              MAX_CONCURRENCY = max(1, configured_limit // 2) if configured_limit > 0 else 0
          # Address the repository this workflow runs in rather than a hard-coded name.
          API_BASE = f"https://api.github.com/repos/{REPO}"

          # Load SSO users for internal/external classification
          with open(os.environ["SSO_USERS_FILE"]) as f:
              sso_users = json.load(f)

          # Headers sent with every GitHub API request
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }

          def make_request(endpoint, method="GET", data=None, timeout=30):
              """Call the GitHub REST API for this repository and return the decoded JSON.

              Args:
                  endpoint: Path appended to API_BASE, e.g. "pulls/123".
                  method: "GET" issues a GET request; any other value issues a POST.
                  data: JSON-serialisable body sent with POST requests.
                  timeout: Seconds to wait for the server before giving up.

              Returns:
                  The parsed JSON response, or None when the request fails
                  (connection error, timeout, or an HTTP error status).
              """
              url = f"{API_BASE}/{endpoint}"
              try:
                  # Without a timeout a stalled connection would block the job
                  # until the runner itself is killed.
                  if method == "GET":
                      response = requests.get(url, headers=headers, timeout=timeout)
                  else:
                      response = requests.post(url, headers=headers, json=data, timeout=timeout)
                  response.raise_for_status()
                  return response.json()
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  # Connection errors and timeouts carry no response object.
                  failed_response = getattr(e, "response", None)
                  if failed_response is not None:
                      print(f"Response: {failed_response.text}")
                  return None

          def is_internal_contributor(pr_info):
              """Tell whether the PR author holds an NVIDIA or NVIDIA-NeMo member role.

              The author's login is looked up in the loaded SSO user map; an
              unknown login or a missing role list counts as not internal.
              """
              author = pr_info.get("user", {}).get("login", "")
              author_roles = sso_users.get(author, {}).get("org_roles", [])
              for role in author_roles:
                  if role in ("NVIDIA:Member", "NVIDIA-NeMo:Member"):
                      return True
              return False

          def get_pr_base_branch(workflow_run):
              """
              Look up the pull request behind a workflow run.

              The PR number is taken from a head branch named like
              'pull-request/1913' and the PR is then fetched from the GitHub API.

              Returns:
                  (base_branch, pr_info) for a PR run, or (None, None) when the
                  run has no head branch, the head branch does not follow the
                  'pull-request/<number>' pattern, or the PR cannot be fetched.
              """
              # The API reports head_branch as null for some runs; dict.get only
              # falls back to its default when the key is absent, so normalise
              # None to "" before handing the value to re.match.
              head_branch = workflow_run.get("head_branch") or ""
              print(head_branch)
              match = re.match(r"pull-request/(\d+)", head_branch)
              if not match:
                  return None, None  # Not a PR branch pattern

              pr_number = int(match.group(1))

              # Fetch PR info from GitHub API
              pr_info = make_request(f"pulls/{pr_number}")
              if not pr_info:
                  print(f"Failed to fetch PR #{pr_number}")
                  return None, None

              base_branch = pr_info.get("base", {}).get("ref")
              return base_branch, pr_info

          def matches_contributor(workflow_run, contributor_type):
              """Check a run against the contributor type only; the target branch is ignored.

              Runs that cannot be tied to a pull request never match.
              """
              pr_info = get_pr_base_branch(workflow_run)[1]
              if pr_info is None:
                  return False
              wants_internal = contributor_type == "internal"
              return wants_internal == is_internal_contributor(pr_info)

          def matches_queue(workflow_run, target_branch, contributor_type):
              """
              Decide whether a workflow run belongs to one queue cell.

              A run belongs to the cell when its PR targets `target_branch` (the
              "others" cell takes every base branch except main and dev) and the
              PR author's internal/external status equals `contributor_type`.
              Runs without a resolvable PR never match.
              """
              base_branch, pr_info = get_pr_base_branch(workflow_run)
              if base_branch is None:
                  return False

              # Branch check: an exact hit, or the catch-all "others" bucket.
              if base_branch != target_branch:
                  if target_branch != "others" or base_branch in ("main", "dev"):
                      return False

              pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1)
              internal = is_internal_contributor(pr_info)
              if (contributor_type == "internal") != internal:
                  return False

              print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
              return True

          # Branch bucket handled by this matrix cell. GitHub substitutes the
          # expression before Python runs, so resolve it once and reuse the name.
          TARGET_BRANCH = "${{ matrix.branch }}"
          CI_WORKFLOW_NAME = "CICD Megatron-LM"

          def list_workflow_runs(status):
              """Return the repository's workflow runs that are in the given status.

              Exits with status 1 when the API call fails: a missing list cannot be
              told apart from an empty one, and treating it as empty would let the
              queue exceed its limit.
              """
              # per_page=100 is the API maximum; the default page of 30 runs
              # under-counts in-flight work on a busy repository.
              # NOTE(review): only the first page is read — add pagination if more
              # than 100 runs can share one status at the same time.
              payload = make_request(f"actions/runs?status={status}&per_page=100")
              if payload is None:
                  print(f"Failed to fetch {status} workflow runs")
                  raise SystemExit(1)
              return payload.get("workflow_runs", [])

          # Get current running and queued workflows
          print("Fetching workflow runs...")
          queued_workflow_runs = list_workflow_runs("queued")
          in_progress_workflow_runs = list_workflow_runs("in_progress")

          # For external contributors, enforce a single global concurrency limit across ALL branches.
          # For internal contributors, enforce per-branch limits as before.
          # NOTE(review): the three external cells (main/dev/others) each count
          # against the same global limit and run concurrently, so together they
          # can approve more than the limit in one scheduling round — confirm
          # whether that is acceptable.
          if CONTRIBUTOR_TYPE == "external":
              queued_workflow_runs = [run for run in queued_workflow_runs
                                      if run["name"] == CI_WORKFLOW_NAME and matches_contributor(run, CONTRIBUTOR_TYPE)]
              in_progress_workflow_runs = [run for run in in_progress_workflow_runs
                                          if run["name"] == CI_WORKFLOW_NAME and matches_contributor(run, CONTRIBUTOR_TYPE)]
          else:
              # Keep only runs whose PR targets this cell's branch with a matching contributor type
              queued_workflow_runs = [run for run in queued_workflow_runs
                                      if run["name"] == CI_WORKFLOW_NAME and matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)]
              in_progress_workflow_runs = [run for run in in_progress_workflow_runs
                                          if run["name"] == CI_WORKFLOW_NAME and matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)]

          # Count running and queued workflows
          queued_workflows = len(queued_workflow_runs)
          in_progress_workflows = len(in_progress_workflow_runs)

          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows (PRs targeting {TARGET_BRANCH}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
          print(f"Current running workflows (PRs targeting {TARGET_BRANCH}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")

          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              raise SystemExit(0)

          # Get waiting CI workflows for test environment
          print("Fetching deployments...")
          pending_workflows = list_workflow_runs("waiting")
          print("Pending workflows:", len(pending_workflows))
          pending_workflows = [run for run in pending_workflows
                              if run["name"] == CI_WORKFLOW_NAME and matches_queue(run, TARGET_BRANCH, CONTRIBUTOR_TYPE)]

          # Sort deployments by creation date (oldest first)
          print("Sorting workflows...")
          pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])

          # Process each deployment
          print(f"Processing {len(pending_workflows)} pending workflows...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break

              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")

              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              deployments = make_request(deployment_url)
              if deployments is None:
                  print(f"Failed to fetch pending deployments for run {workflow_id}")
                  raise SystemExit(1)
              if not deployments:
                  # Nothing is waiting on this run any more (for example it was
                  # approved or cancelled after the listing) — move on instead of
                  # aborting the whole queue on an IndexError.
                  print(f"Run {workflow_id} has no pending deployments, skipping")
                  continue
              # Only the first pending environment of the run is approved.
              environment_id = deployments[0]["environment"]["id"]

              # Approve the deployment
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager"
              }
              result = make_request(deployment_url, method="POST", data=status_data)

              if result:
                  total_workflows += 1
              else:
                  # Pending-deployment entries carry no top-level "id", so report
                  # the run and environment ids instead.
                  print(f"Failed to approve environment {environment_id} for run {workflow_id}")
                  raise SystemExit(1)
  # Posts a Slack message whenever any approve-queue cell failed.
  notify:
    if: failure()
    runs-on: ubuntu-latest
    needs: [approve-queue]
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
          # Slack mention syntax for a user group; quoted so the leading "<"
          # and the "!" are never read as YAML syntax.
          SLACK_WEBHOOK_ADMIN: '<!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>'
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          # --fail turns an HTTP error from Slack into a failed step instead of
          # a silent success; the URL is quoted so it is passed as one argument.
          curl --fail -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            "$SLACK_WEBHOOK"

cicd-main matrix perms .github/workflows/cicd-main.yml

Triggers

schedule, push, merge_group, workflow_dispatch

Runs on

ubuntu-latest, ubuntu-latest, ubuntu-latest, ubuntu-latest, ubuntu-latest, ubuntu-latest, ubuntu-latest, ${{ matrix.runner }}, ubuntu-latest, ${{ needs.is-not-external-contributor.outputs.selected_runner }}, ubuntu-latest, ${{ needs.is-not-external-contributor.outputs.selected_runner }}, ubuntu-latest, ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}, ubuntu-latest, ubuntu-latest, ubuntu-latest, ubuntu-latest, ${{ needs.is-not-external-contributor.outputs.selected_runner }}

Jobs

is-not-external-contributor, pre-flight, configure, linting, cicd-wait-in-queue, cicd-parse-downstream-testing, cicd-mbridge-testing, cicd-compute-build-matrix, cicd-container-build, cicd-parse-unit-tests, cicd-unit-tests-latest, cicd-parse-integration-tests-h100, cicd-integration-tests-latest-h100, cicd-parse-integration-tests-gb200, cicd-integration-tests-latest-gb200, Nemo_CICD_Test, Coverage_Fake, Coverage, merge-queue-notification, cleanup-taint-node

Matrix

flag, include→ ${{ fromJson(needs.cicd-parse-integration-tests-gb200.outputs.integration-tests-gb200) }}, ${{ fromJson(needs.cicd-parse-integration-tests-h100.outputs.integration-tests-h100) }}, ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}, unit-test

Actions

nv-gha-runners/get-pr-info, nv-gha-runners/get-pr-info, astral-sh/setup-uv, nv-gha-runners/get-pr-info, nv-gha-runners/get-pr-info, convictional/trigger-workflow-and-wait, nv-gha-runners/get-pr-info, docker/setup-buildx-action, docker/build-push-action, codecov/codecov-action

Commands

# Skip SSO check for scheduled jobs, main branch, or merge groups if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi # Use SSO membership check result IS_MEMBER="${{ steps.check-sso.outputs.is_member }}" # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo if [ "${{ env.DISABLE_EXTERNAL_CONTRIBUTOR }}" == "true" ] && [ "${{ steps.check-sso.outputs.is_member }}" != "true" ]; then PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} echo "Checking if $PR_AUTHOR is a repo collaborator..." API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ $API_URL) echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ $API_URL) echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." 
API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ $API_URL) if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then IS_MEMBER="true" else exit 1 fi fi # Use SSO membership check result if [ "$IS_MEMBER" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT else echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT fi
# Derives the CI configuration (scope, repetitions, container flavour, MBridge
# suite) from the PR labels, publishes it as step outputs, and writes a
# human-readable summary. Reads IS_CI_WORKLOAD and IS_MERGE_GROUP from the
# environment.
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
# Fetch all labels in a single API call; fall back to empty list if no PR
LABELS=$(gh pr view $PR_NUMBER --repo ${{ github.repository }} --json labels --jq '[.labels[].name]') || LABELS='[]'

HAS_RUN_TESTS=$(echo "$LABELS" | jq 'any(. == "Run tests")')
HAS_RUN_FUNCTIONAL=$(echo "$LABELS" | jq 'any(. == "Run functional tests")')
HAS_LTS=$(echo "$LABELS" | jq 'any(. == "container::lts")')
HAS_MBRIDGE=$(echo "$LABELS" | jq 'any(. == "Run MBridge tests")')

# Scheduled/CI workloads have no PR — treat as "Run functional tests"
[ "$IS_CI_WORKLOAD" == "true" ] && HAS_RUN_FUNCTIONAL=true

# First matching row wins: merge group, then "Run tests", then
# "Run functional tests", then the slim default.
if [ "$IS_MERGE_GROUP" == "true" ]; then
  SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=false
elif [ "$HAS_RUN_TESTS" == "true" ]; then
  SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=true
elif [ "$HAS_RUN_FUNCTIONAL" == "true" ]; then
  SCOPE=mr-github; N_REPEAT=5; LIGHTWEIGHT=false
else
  SCOPE=mr-github-slim; N_REPEAT=5; LIGHTWEIGHT=false
fi

# `||` cannot appear inside a single `[ ... ]`: the shell splits the line into
# two commands, the first of which fails for lack of a closing `]`, so the
# label was never honoured. Two separate tests joined by `||` express the
# intended "label present OR merge group" condition.
if [ "$HAS_MBRIDGE" == "true" ] || [ "$IS_MERGE_GROUP" == "true" ]; then
  MBRIDGE_SUITE="L1"
else
  MBRIDGE_SUITE="unit-only"
fi

DEV=true

echo "scope=$SCOPE" | tee -a $GITHUB_OUTPUT
echo "n_repeat=$N_REPEAT" | tee -a $GITHUB_OUTPUT
echo "lightweight=$LIGHTWEIGHT" | tee -a $GITHUB_OUTPUT
echo "lts=$HAS_LTS" | tee -a $GITHUB_OUTPUT
echo "mbridge_suite=$MBRIDGE_SUITE" | tee -a $GITHUB_OUTPUT
echo "dev=$DEV" | tee -a $GITHUB_OUTPUT

# Pre-compute active row markers for the decision tree
_MG=$( [ "$IS_MERGE_GROUP" == "true" ] && echo "**→**" || echo "" )
_RT=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" == "true" ] && echo "**→**" || echo "" )
_RF=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" != "true" ] && [ "$HAS_RUN_FUNCTIONAL" == "true" ] && echo "**→**" || echo "" )
_DF=$( [ "$SCOPE" == "mr-github-slim" ] && echo "**→**" || echo "" )
_LTS=$( [ "$HAS_LTS" == "true" ] && echo "**→**" || echo "" )
_DEV=$( [ "$HAS_LTS" != "true" ] && echo "**→**" || echo "" )

# The heredoc delimiter is unquoted on purpose: variables and command
# substitutions expand, while the escaped backticks stay literal Markdown.
cat <<SUMMARY >> $GITHUB_STEP_SUMMARY
Beep boop 🤖 I have consulted the labels and decided to run **$SCOPE** $( [ "$LIGHTWEIGHT" == "true" ] && echo "in lightweight mode " || echo "" )against the **$( [ "$HAS_LTS" == "true" ] && echo "lts" || echo "dev" )** container with **$N_REPEAT** repetition(s). You are welcome.

| Setting | Value |
|---|---|
| \`scope\` | \`$SCOPE\` |
| \`n_repeat\` | \`$N_REPEAT\` |
| \`lightweight\` | \`$LIGHTWEIGHT\` |
| \`lts\` | \`$HAS_LTS\` |
| \`dev\` | \`$DEV\` |
| \`mbridge_suite\` | \`$MBRIDGE_SUITE\` |

### Decision tree

**Test scope**

| | Trigger | \`scope\` | \`n_repeat\` | \`lightweight\` |
|---|---|---|---|---|
| $_MG | Merge group | \`mr-github\` | \`1\` | \`false\` |
| $_RT | Label: _Run tests_ | \`mr-github\` | \`1\` | \`true\` |
| $_RF | Label: _Run functional tests_ / CI workload | \`mr-github\` | \`5\` | \`false\` |
| $_DF | _(default)_ | \`mr-github-slim\` | \`5\` | \`false\` |

**Container image**

| | Trigger | \`image\` |
|---|---|---|
| $_LTS | Label: _container::lts_ | \`lts\` |
| $_DEV | _(default)_ | \`dev\` |

### Glossary

- **\`lightweight\`**: trains for 4 steps instead of 100 and skips comparison against golden values — faster feedback, no correctness guarantees
- **\`lts\`**: uses the Long Term Support container base image instead of the latest dev image
- **\`dev\`**: uses the latest development container base image (default)
SUMMARY
uv sync --locked --only-group linting
export PATH=".venv/bin:$PATH" export GITLAB_ENDPOINT=github.com export CI_PROJECT_NAMESPACE=NVIDIA export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" export CHECK_ONLY=true export SKIP_DOCS=false bash tools/autoformat.sh
echo "Running CI tests" echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"
bash .github/scripts/readme.sh
cd megatron-bridge git fetch origin main git checkout -b ${{ env.MBRIDGE_BRANCH_NAME }} origin/main git push origin ${{ env.MBRIDGE_BRANCH_NAME }} --force
if [[ "$IS_PR" == "true" ]]; then SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }} elif [[ "$IS_MERGE_GROUP" == "true" ]]; then SHA=${{ github.event.merge_group.head_sha }} else SHA=${GITHUB_SHA} fi echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Top-level settings of the Megatron-LM CI/CD workflow.
name: CICD Megatron-LM

# Triggers: daily cron at 00:00, pushes to `pull-request/<n>` and
# `deploy-release/*` branches, merge-queue check requests, and manual dispatch.
on:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types:
      - checks_requested
  workflow_dispatch:

# One concurrency group per workflow and ref; a newer run cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.head_ref || github.event.merge_group.head_ref || github.ref }}

# Default token permissions; individual jobs may declare their own.
permissions:
  contents: read
  id-token: write

# Container registries read by the jobs through the `env` context.
env:
  container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com
  container-registry-gb200: us-east4-docker.pkg.dev/nv-projdgxchipp-20260113193621/megatron-lm

jobs:
  # Classifies the user behind this run and selects the runner pools.
  #
  # Outputs:
  #   is_external_contributor - whether the event's pull_request user type is 'User'
  #   is_maintainer           - "true" for scheduled, main-branch and merge-group runs,
  #                             for NVIDIA SSO members and, when the repository variable
  #                             DISABLE_EXTERNAL_CONTRIBUTOR is "true", for collaborators
  #                             of this repo or members of the NVIDIA / NVIDIA-NeMo orgs;
  #                             "false" otherwise
  #   selected_runner         - H100 runner label derived from is_maintainer
  #   selected_runner_gb200   - GB200 runner label derived from is_maintainer
  #
  # The job fails (exit 1) when DISABLE_EXTERNAL_CONTRIBUTOR is "true" and the PR
  # author is neither an SSO member nor a collaborator/org member.
  is-not-external-contributor:
    runs-on: ubuntu-latest
    if: github.repository == 'NVIDIA/Megatron-LM'
    outputs:
      is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }}
      is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }}
      selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }}
      selected_runner_gb200: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-gcp-gpu-x4' || 'ubuntu-latest' }}
    permissions:
      issues: write
      pull-requests: write
    env:
      GITHUB_TOKEN: ${{ secrets.PAT }}
      REPO: ${{ github.repository }}
      DISABLE_EXTERNAL_CONTRIBUTOR: ${{ vars.DISABLE_EXTERNAL_CONTRIBUTOR }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          token: ${{ env.GITHUB_TOKEN }}

      # PR metadata is only looked up for pushes to `pull-request/<n>` branches.
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Check NVIDIA SSO membership
        id: check-sso
        uses: ./.github/actions/check-nvidia-sso-membership
        with:
          username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
          github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
          sso_users_filename: ${{ vars.SSO_USERS_FILENAME }}

      - name: Set maintainer status
        id: check-membership
        env:
          IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
          SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
          # Expression results are handed to the script as environment variables
          # instead of being pasted into the script text, so their content can
          # never be interpreted as shell syntax.
          SSO_IS_MEMBER: ${{ steps.check-sso.outputs.is_member }}
          PR_AUTHOR: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
        run: |
          # Skip SSO check for scheduled jobs, main branch, or merge groups
          if [ "$SCHEDULED_JOB" == "true" ] || [ "$IS_MAIN_BRANCH" == "true" ] || [ "$IS_MERGE_GROUP" == "true" ]; then
            echo "is_maintainer=true" | tee -a "$GITHUB_OUTPUT"
            exit 0
          fi

          # Print only the HTTP status code of an authenticated GitHub API GET on $1.
          # The membership endpoints used below answer 204 for "is a member".
          github_status() {
            curl -s -o /dev/null -w "%{http_code}" -L \
              -H "Accept: application/vnd.github+json" \
              -H "Authorization: Bearer $GITHUB_TOKEN" \
              -H "X-GitHub-Api-Version: 2022-11-28" \
              "$1"
          }

          # Start from the SSO membership check result
          IS_MEMBER="$SSO_IS_MEMBER"

          # If external contributor is disabled, check if user is a repo collaborator or an org collaborator to NVIDIA or NVIDIA-NeMo
          if [ "$DISABLE_EXTERNAL_CONTRIBUTOR" == "true" ] && [ "$SSO_IS_MEMBER" != "true" ]; then
            echo "Checking if $PR_AUTHOR is a repo collaborator..."
            REPO_MEMBERSHIP_RESPONSE=$(github_status "https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR")

            echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..."
            ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(github_status "https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR")

            echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..."
            ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(github_status "https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR")

            if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then
              IS_MEMBER="true"
            else
              # Say why the job is failing instead of exiting silently.
              echo "::error::'$PR_AUTHOR' is neither an SSO member nor a repo/org collaborator; external contributions are disabled."
              exit 1
            fi
          fi

          # Publish the final membership decision
          if [ "$IS_MEMBER" == "true" ]; then
            echo "is_maintainer=true" | tee -a "$GITHUB_OUTPUT"
          else
            echo "is_maintainer=false" | tee -a "$GITHUB_OUTPUT"
          fi

  # Calls the shared pre-flight reusable workflow. Other jobs in this file read
  # its outputs: is_ci_workload, is_merge_group, is_deployment_workflow,
  # docs_only and force_run_all.
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'
    needs:
      - is-not-external-contributor

  # Derives the test configuration for this run from the PR labels and the
  # trigger type, publishes it as job outputs and writes a summary table.
  #
  # Outputs (strings):
  #   scope         - `mr-github`, or `mr-github-slim` when nothing selects more
  #   n_repeat      - `1` for merge groups and the "Run tests" label, else `5`
  #   lightweight   - `true` only when the "Run tests" label decides the scope
  #   lts           - `true` when the PR carries the "container::lts" label
  #   mbridge_suite - `L1` for the "Run MBridge tests" label or merge groups,
  #                   otherwise `unit-only`
  #   dev           - always `true`
  configure:
    runs-on: ubuntu-latest
    needs: [pre-flight]
    if: github.repository == 'NVIDIA/Megatron-LM'
    outputs:
      scope:         ${{ steps.configure.outputs.scope }}
      n_repeat:      ${{ steps.configure.outputs.n_repeat }}
      lightweight:   ${{ steps.configure.outputs.lightweight }}
      lts:           ${{ steps.configure.outputs.lts }}
      mbridge_suite: ${{ steps.configure.outputs.mbridge_suite }}
      dev:           ${{ steps.configure.outputs.dev }}
    steps:
      # PR metadata is only looked up for pushes to `pull-request/<n>` branches.
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Configure
        id: configure
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ secrets.PAT }}
          IS_CI_WORKLOAD: ${{ needs.pre-flight.outputs.is_ci_workload }}
          IS_MERGE_GROUP: ${{ needs.pre-flight.outputs.is_merge_group }}
        run: |
          # Empty when the "Get PR info" step was skipped.
          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}

          # Fetch all labels in a single API call; fall back to empty list if no PR
          LABELS=$(gh pr view $PR_NUMBER --repo ${{ github.repository }} --json labels --jq '[.labels[].name]') || LABELS='[]'

          HAS_RUN_TESTS=$(echo "$LABELS"      | jq 'any(. == "Run tests")')
          HAS_RUN_FUNCTIONAL=$(echo "$LABELS" | jq 'any(. == "Run functional tests")')
          HAS_LTS=$(echo "$LABELS"            | jq 'any(. == "container::lts")')
          HAS_MBRIDGE=$(echo "$LABELS"        | jq 'any(. == "Run MBridge tests")')

          # Scheduled/CI workloads have no PR — treat as "Run functional tests"
          [ "$IS_CI_WORKLOAD" == "true" ] && HAS_RUN_FUNCTIONAL=true

          # First match wins: merge group > "Run tests" > "Run functional tests" > default
          if [ "$IS_MERGE_GROUP" == "true" ]; then
            SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=false
          elif [ "$HAS_RUN_TESTS" == "true" ]; then
            SCOPE=mr-github; N_REPEAT=1; LIGHTWEIGHT=true
          elif [ "$HAS_RUN_FUNCTIONAL" == "true" ]; then
            SCOPE=mr-github; N_REPEAT=5; LIGHTWEIGHT=false
          else
            SCOPE=mr-github-slim; N_REPEAT=5; LIGHTWEIGHT=false
          fi

          # Each comparison needs its own `[ ... ]`: a `||` inside one bracket pair
          # ends the `[` command before its closing `]`, which made the label test
          # always fail and left the "Run MBridge tests" label without effect.
          if [ "$HAS_MBRIDGE" == "true" ] || [ "$IS_MERGE_GROUP" == "true" ]; then
            MBRIDGE_SUITE="L1"
          else
            MBRIDGE_SUITE="unit-only"
          fi

          DEV=true

          echo "scope=$SCOPE"                 | tee -a $GITHUB_OUTPUT
          echo "n_repeat=$N_REPEAT"           | tee -a $GITHUB_OUTPUT
          echo "lightweight=$LIGHTWEIGHT"     | tee -a $GITHUB_OUTPUT
          echo "lts=$HAS_LTS"                 | tee -a $GITHUB_OUTPUT
          echo "mbridge_suite=$MBRIDGE_SUITE" | tee -a $GITHUB_OUTPUT
          echo "dev=$DEV"                     | tee -a $GITHUB_OUTPUT

          # Pre-compute active row markers for the decision tree
          _MG=$( [ "$IS_MERGE_GROUP" == "true" ]                                                                           && echo "**→**" || echo "" )
          _RT=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" == "true" ]                                        && echo "**→**" || echo "" )
          _RF=$( [ "$IS_MERGE_GROUP" != "true" ] && [ "$HAS_RUN_TESTS" != "true" ] && [ "$HAS_RUN_FUNCTIONAL" == "true" ]  && echo "**→**" || echo "" )
          _DF=$( [ "$SCOPE" == "mr-github-slim" ]                                                                          && echo "**→**" || echo "" )
          _LTS=$( [ "$HAS_LTS" == "true" ]                                                                                 && echo "**→**" || echo "" )
          _DEV=$( [ "$HAS_LTS" != "true" ]                                                                                 && echo "**→**" || echo "" )

          # Unquoted heredoc delimiter: variables and $( ... ) are expanded, and
          # the escaped backticks are written out as literal backticks.
          cat <<SUMMARY >> $GITHUB_STEP_SUMMARY
          Beep boop 🤖 I have consulted the labels and decided to run **$SCOPE** $( [ "$LIGHTWEIGHT" == "true" ] && echo "in lightweight mode " || echo "" )against the **$( [ "$HAS_LTS" == "true" ] && echo "lts" || echo "dev" )** container with **$N_REPEAT** repetition(s). You are welcome.

          | Setting | Value |
          |---|---|
          | \`scope\` | \`$SCOPE\` |
          | \`n_repeat\` | \`$N_REPEAT\` |
          | \`lightweight\` | \`$LIGHTWEIGHT\` |
          | \`lts\` | \`$HAS_LTS\` |
          | \`dev\` | \`$DEV\` |
          | \`mbridge_suite\` | \`$MBRIDGE_SUITE\` |

          ### Decision tree

          **Test scope**

          | | Trigger | \`scope\` | \`n_repeat\` | \`lightweight\` |
          |---|---|---|---|---|
          | $_MG | Merge group | \`mr-github\` | \`1\` | \`false\` |
          | $_RT | Label: _Run tests_ | \`mr-github\` | \`1\` | \`true\` |
          | $_RF | Label: _Run functional tests_ / CI workload | \`mr-github\` | \`5\` | \`false\` |
          | $_DF | _(default)_ | \`mr-github-slim\` | \`5\` | \`false\` |

          **Container image**

          | | Trigger | \`image\` |
          |---|---|---|
          | $_LTS | Label: _container::lts_ | \`lts\` |
          | $_DEV | _(default)_ | \`dev\` |

          ### Glossary
          - **\`lightweight\`**: trains for 4 steps instead of 100 and skips comparison against golden values — faster feedback, no correctness guarantees
          - **\`lts\`**: uses the Long Term Support container base image instead of the latest dev image
          - **\`dev\`**: uses the latest development container base image (default)
          SUMMARY

  # Lint gate. Runs only for non-deployment workflows, and then either for CI
  # workloads or for non-CI runs that are not docs-only. The formatter itself is
  # executed only for pushes to `pull-request/<n>` branches.
  linting:
    needs: [pre-flight]
    runs-on: ubuntu-latest
    if: |
      needs.pre-flight.outputs.is_deployment_workflow == 'false'
      && (
        needs.pre-flight.outputs.is_ci_workload == 'true'
        || (
          needs.pre-flight.outputs.is_ci_workload == 'false'
          && needs.pre-flight.outputs.docs_only == 'false'
        )
      )
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@v1
        with:
          version: 0.7.2

      - name: Install linting tools
        run: uv sync --locked --only-group linting

      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Run linting
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        env:
          # Fixed settings handed to tools/autoformat.sh through the environment.
          GITLAB_ENDPOINT: github.com
          CI_PROJECT_NAMESPACE: NVIDIA
          CHECK_ONLY: "true"
          SKIP_DOCS: "false"
        run: |
          # Put the uv-managed virtualenv first on PATH for the formatter run.
          export PATH=".venv/bin:$PATH"
          export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
          bash tools/autoformat.sh

  # Runs in the `test` environment after linting. Skipped when the run is a CI
  # workload, a deployment workflow, a merge group, or a docs-only change.
  cicd-wait-in-queue:
    needs: [pre-flight, linting]
    runs-on: ubuntu-latest
    environment: test
    if: |
      needs.pre-flight.outputs.is_ci_workload != 'true'
      && needs.pre-flight.outputs.is_deployment_workflow != 'true'
      && needs.pre-flight.outputs.is_merge_group != 'true'
      && needs.pre-flight.outputs.docs_only != 'true'
    steps:
      - name: Running CI tests
        run: |
          echo "Running CI tests"
          echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"

  # Re-exports the MBridge suite chosen by `configure` as `mbridge-test-suite`
  # and runs the repository's readme script.
  cicd-parse-downstream-testing:
    needs: [pre-flight, configure, cicd-wait-in-queue]
    runs-on: ubuntu-latest
    outputs:
      mbridge-test-suite: ${{ needs.configure.outputs.mbridge_suite }}
    # Proceeds when no upstream job was cancelled and either everything
    # succeeded or the run is a CI workload, force-run-all, or merge group.
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.configure.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: How-To
        run: |
          bash .github/scripts/readme.sh

  # Runs the Megatron-Bridge CI against this commit: pushes a temporary
  # `mcore-testing-<PR number or run id>` branch to NVIDIA-NeMo/Megatron-Bridge,
  # triggers `cicd-main.yml` on it and waits, then deletes the branch again.
  cicd-mbridge-testing:
    needs: [pre-flight, cicd-wait-in-queue, cicd-parse-downstream-testing]
    runs-on: ubuntu-latest
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-downstream-testing.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Checkout MBridge and create testing branch
        uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/Megatron-Bridge
          ref: main
          path: megatron-bridge
          token: ${{ secrets.PAT }}

      - name: Create testing branch
        env:
          MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
        run: |
          # Branch off the freshly fetched main and publish it (overwriting any leftover).
          cd megatron-bridge
          git fetch origin main
          git checkout -b "$MBRIDGE_BRANCH_NAME" origin/main
          git push origin "$MBRIDGE_BRANCH_NAME" --force

      - name: Get merge commit sha
        id: sha
        shell: bash -x -e -u -o pipefail {0}
        env:
          IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
        run: |
          # PR branch -> PR merge commit; merge group -> its head; otherwise the pushed commit.
          case "$IS_PR/$IS_MERGE_GROUP" in
            true/*)
              SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
              ;;
            */true)
              SHA=${{ github.event.merge_group.head_sha }}
              ;;
            *)
              SHA=${GITHUB_SHA}
              ;;
          esac
          echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"

      - name: Trigger MBridge tests
        uses: convictional/trigger-workflow-and-wait@v1.6.5
        env:
          MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
        with:
          owner: NVIDIA-NeMo
          repo: Megatron-Bridge
          workflow_file_name: cicd-main.yml
          ref: ${{ env.MBRIDGE_BRANCH_NAME }}
          github_token: ${{ secrets.PAT }}
          wait_interval: 60
          propagate_failure: true
          client_payload: |
            {
              "mcore_ref": "${{ steps.sha.outputs.main }}",
              "test_suite": "${{ needs.cicd-parse-downstream-testing.outputs.mbridge-test-suite }}",
              "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }

      # Cleanup runs regardless of the outcome of the earlier steps.
      - name: Delete testing branch
        if: always()
        env:
          MBRIDGE_BRANCH_NAME: mcore-testing-${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || github.run_id }}
        run: |
          cd megatron-bridge
          git push origin --delete "$MBRIDGE_BRANCH_NAME"

  # Produces the container-build matrix as compact JSON in the `matrix` output:
  # an AWS entry for every run, plus a GCP entry when the user is a maintainer.
  cicd-compute-build-matrix:
    needs: [is-not-external-contributor]
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.compute.outputs.matrix }}
    steps:
      - name: Compute build matrix
        id: compute
        env:
          IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }}
          SELECTED_RUNNER: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
          SELECTED_RUNNER_GB200: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
          REGISTRY_AWS: ${{ env.container-registry }}
          REGISTRY_GCP: ${{ env.container-registry-gb200 }}
        run: |
          # One jq program assembles the whole matrix; the GCP entry is appended
          # only when IS_MAINTAINER is exactly "true".
          MATRIX=$(jq -nc \
            --arg is_maintainer "$IS_MAINTAINER" \
            --arg aws_registry "$REGISTRY_AWS" --arg aws_runner "$SELECTED_RUNNER" \
            --arg gcp_registry "$REGISTRY_GCP" --arg gcp_runner "$SELECTED_RUNNER_GB200" \
            '{"include": (
                [{"cloud": "aws", "registry": $aws_registry, "runner": $aws_runner}]
                + (if $is_maintainer == "true"
                   then [{"cloud": "gcp", "registry": $gcp_registry, "runner": $gcp_runner}]
                   else []
                   end)
              )}')
          echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"

  # Builds and pushes the CI container image once per entry of the matrix
  # computed by `cicd-compute-build-matrix` (fields: cloud, registry, runner).
  # The image is tagged with the PR number (0 when there is no PR) and with
  # `github.sha`; the registry build cache is keyed by PR number.
  cicd-container-build:
    needs: [is-not-external-contributor, pre-flight, configure, cicd-wait-in-queue, cicd-compute-build-matrix]
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.cicd-compute-build-matrix.outputs.matrix) }}
    runs-on: ${{ matrix.runner }}
    if: |
      needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-compute-build-matrix.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
      )
      && !cancelled()
    steps:
      # PR metadata is only looked up for pushes to `pull-request/<n>` branches.
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/') && github.event_name == 'push'
        uses: nv-gha-runners/get-pr-info@main

      - name: Get merge commit sha
        shell: bash -x -e -u -o pipefail {0}
        id: sha
        env:
          IS_PR: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
        run: |
          # PR branch -> PR merge commit; merge group -> its head; otherwise the pushed commit.
          if [[ "$IS_PR" == "true" ]]; then
            SHA=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').merge_commit_sha }}
          elif [[ "$IS_MERGE_GROUP" == "true" ]]; then
            SHA=${{ github.event.merge_group.head_sha }}
          else
            SHA=${GITHUB_SHA}
          fi
          echo "main=${SHA}" | tee -a "$GITHUB_OUTPUT"

      - name: Checkout
        uses: actions/checkout@v6
        with:
          ref: ${{ steps.sha.outputs.main }}

      - name: Setup python
        uses: actions/setup-python@v6
        with:
          # Quoted so the version is always read as a string, never as a float.
          python-version: "3.12"

      # Installed once; the job previously repeated this step after the
      # test-data download, where it had nothing left to install.
      - name: Install GH CLI
        shell: bash -x -e -u -o pipefail {0}
        run: |
          apt-get update
          apt-get install -y gh

      - name: Download test data
        shell: bash
        run: |
          echo "::group::Download test data"
          pip install --no-cache-dir click requests
          python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
          echo "::endgroup::"

      # Emits one registry cache source per recently merged PR (up to 100) as the
      # multi-line `LAST_PRS` output, consumed by `cache-from` below.
      - name: Get last merged PR
        id: cache_from
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          LAST_PRS=$(gh api graphql -f query='
            query {
              repository(owner: "NVIDIA", name: "Megatron-LM") {
                pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                  nodes {
                    number
                  }
                }
              }
            }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
              echo "type=registry,ref=${{ matrix.registry }}/megatron-lm:$number-buildcache,mode=max"
            done)

          echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT
          echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
          echo "EOF" | tee -a $GITHUB_OUTPUT

      # Picks the base image version file according to the `lts` output of `configure`.
      - name: Parse baseimage
        shell: bash
        id: base-image
        env:
          HAS_LTS_LABEL: ${{ needs.configure.outputs.lts }}
        run: |
          if [ "$HAS_LTS_LABEL" == "true" ]; then
            NGC_VERSION=$(cat docker/.ngc_version.lts)
            echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
            echo "image_type=lts" | tee -a $GITHUB_OUTPUT
          else
            NGC_VERSION=$(cat docker/.ngc_version.dev)
            echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
            echo "image_type=dev" | tee -a $GITHUB_OUTPUT
          fi

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push
        uses: docker/build-push-action@v6
        with:
          file: ./docker/Dockerfile.ci.dev
          push: true
          context: .
          target: main
          build-args: |
            FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
            IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
          cache-from: |
            type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
            type=registry,ref=${{ matrix.registry }}/megatron-lm:main-buildcache,mode=max
            ${{ steps.cache_from.outputs.LAST_PRS }}
          cache-to: |
            type=registry,ref=${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
          no-cache: false
          tags: |
            ${{ matrix.registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
            ${{ matrix.registry }}/megatron-lm:${{ github.sha }}
          secrets: |
            GH_TOKEN=${{ secrets.PAT }}

  # Converts the H100 unit-test recipe into a compact JSON array of
  # `{"bucket": ...}` objects, exposed as the `unit-tests` output.
  cicd-parse-unit-tests:
    needs:
      - pre-flight
      - cicd-wait-in-queue
      - cicd-container-build
    runs-on: ubuntu-latest
    outputs:
      unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }}
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Parse unit tests
        id: parse-unit-tests
        run: |
          # yq extracts the buckets as JSON; jq compacts them onto a single line.
          yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' \
            < tests/test_utils/recipes/h100/unit-tests.yaml \
            | jq -c > unit-tests.json
          echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT

  # Runs one job per unit-test bucket from `cicd-parse-unit-tests`, each through
  # the repository's local action at `./.github/actions`, against the image
  # tagged with `github.sha`.
  cicd-unit-tests-latest:
    name: "${{ matrix.bucket }} - latest"
    needs:
      - is-not-external-contributor
      - pre-flight
      - cicd-wait-in-queue
      - cicd-container-build
      - cicd-parse-unit-tests
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
    if: |
      needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && needs.cicd-parse-unit-tests.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    env:
      PIP_DISABLE_PIP_VERSION_CHECK: 1
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_ROOT_USER_ACTION: ignore
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: main
        uses: ./.github/actions
        with:
          tag: latest
          is_unit_test: "true"
          test_case: ${{ matrix.bucket }}
          # Falls back to 30 when the matrix entry carries no `timeout` field.
          timeout: ${{ matrix.timeout || 30 }}
          container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
          PAT: ${{ secrets.PAT }}

  # Generates the H100 integration-test list with generate_jet_trigger_job.py and
  # exposes it as a compact JSON array of `{"model", "test_case"}` objects in
  # the `integration-tests-h100` output.
  cicd-parse-integration-tests-h100:
    needs:
      - pre-flight
      - configure
      - cicd-wait-in-queue
      - cicd-container-build
      - cicd-unit-tests-latest
    runs-on: ubuntu-latest
    outputs:
      integration-tests-h100: ${{ steps.main.outputs.integration-tests-h100 }}
    if: |
      needs.pre-flight.result != 'cancelled'
      && needs.configure.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Parse functional tests
        id: main
        env:
          SCOPE: ${{ needs.configure.outputs.scope }}
          LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }}
        run: |
          export PYTHONPATH="$(pwd)"

          # Scope always; lightweight flag only when `configure` asked for it.
          ARGS=(--scope $SCOPE)
          if [ "$LIGHTWEIGHT" == "true" ]; then
            ARGS+=(--enable-lightweight-mode)
          fi

          python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
            --n-repeat 5 \
            --time-limit 2700 \
            --test-cases all \
            --container-image mcore_ci_dev \
            --container-tag latest \
            --dependent-job functional:configure \
            --record-checkpoints false \
            --slurm-account gh \
            --no-enable-warmup \
            --environment dev \
            --platform dgx_h100 \
            --cluster ghci \
            ${ARGS[@]} \
            --output-path integration-tests-h100.yaml

          # Drop the pipeline-level keys, keep one {model, test_case} pair per job.
          yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' \
            < integration-tests-h100.yaml \
            | jq -c > integration-tests-h100.json

          echo "integration-tests-h100=$(cat integration-tests-h100.json)" | tee -a "$GITHUB_OUTPUT"

  # Runs one job per H100 integration test from
  # `cicd-parse-integration-tests-h100`, each through the repository's local
  # action at `./.github/actions`, with the settings chosen by `configure`.
  cicd-integration-tests-latest-h100:
    name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
    needs:
      - is-not-external-contributor
      - pre-flight
      - configure
      - cicd-wait-in-queue
      - cicd-parse-integration-tests-h100
      - cicd-unit-tests-latest
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.cicd-parse-integration-tests-h100.outputs.integration-tests-h100) }}
    if: |
      needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.configure.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-integration-tests-h100.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    env:
      PIP_DISABLE_PIP_VERSION_CHECK: 1
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_ROOT_USER_ACTION: ignore
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: main
        uses: ./.github/actions
        with:
          tag: latest
          is_unit_test: "false"
          model: ${{ matrix.model }}
          test_case: ${{ matrix.test_case }}
          # Falls back to 30 when the matrix entry carries no `timeout` field.
          timeout: ${{ matrix.timeout || 30 }}
          container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
          scope: ${{ needs.configure.outputs.scope }}
          n_repeat: ${{ needs.configure.outputs.n_repeat }}
          lightweight: ${{ needs.configure.outputs.lightweight }}
          PAT: ${{ secrets.PAT }}

  # Maintainer-only: generates the GB200 integration-test list with
  # generate_jet_trigger_job.py and exposes it as a compact JSON array of
  # `{"model", "test_case"}` objects in the `integration-tests-gb200` output.
  cicd-parse-integration-tests-gb200:
    needs:
      - is-not-external-contributor
      - pre-flight
      - configure
      - cicd-wait-in-queue
      - cicd-container-build
      - cicd-unit-tests-latest
    runs-on: ubuntu-latest
    outputs:
      integration-tests-gb200: ${{ steps.main.outputs.integration-tests-gb200 }}
    if: |
      needs.is-not-external-contributor.outputs.is_maintainer == 'true'
      && needs.pre-flight.result != 'cancelled'
      && needs.configure.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-container-build.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Parse functional tests
        id: main
        env:
          SCOPE: ${{ needs.configure.outputs.scope }}
          LIGHTWEIGHT: ${{ needs.configure.outputs.lightweight }}
        run: |
          export PYTHONPATH="$(pwd)"

          # Scope always; lightweight flag only when `configure` asked for it.
          ARGS=(--scope $SCOPE)
          if [ "$LIGHTWEIGHT" == "true" ]; then
            ARGS+=(--enable-lightweight-mode)
          fi

          python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
            --n-repeat 5 \
            --time-limit 2700 \
            --test-cases all \
            --container-image mcore_ci_dev \
            --container-tag latest \
            --dependent-job functional:configure \
            --record-checkpoints false \
            --slurm-account gh \
            --no-enable-warmup \
            --environment dev \
            --platform dgx_gb200 \
            --cluster dgxgb200_oci-hsg \
            ${ARGS[@]} \
            --output-path integration-tests-gb200.yaml

          # Drop the pipeline-level keys, keep one {model, test_case} pair per job.
          yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' \
            < integration-tests-gb200.yaml \
            | jq -c > integration-tests-gb200.json

          echo "integration-tests-gb200=$(cat integration-tests-gb200.json)" | tee -a "$GITHUB_OUTPUT"

  # Runs every GB200 integration test produced by
  # cicd-parse-integration-tests-gb200, one matrix entry per
  # {model, test_case} pair, through the repository's local composite action.
  cicd-integration-tests-latest-gb200:
    name: '${{ matrix.model }}/${{ matrix.test_case }} - latest'
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner_gb200 }}
    timeout-minutes: 60
    needs:
      - is-not-external-contributor
      - pre-flight
      - configure
      - cicd-wait-in-queue
      - cicd-parse-integration-tests-gb200
      - cicd-unit-tests-latest
    # Maintainer-only. Runs when no dependency was cancelled and either all of
    # them succeeded or a pre-flight override flag is set.
    if: |
      needs.is-not-external-contributor.outputs.is_maintainer == 'true'
      && needs.is-not-external-contributor.result != 'cancelled'
      && needs.pre-flight.result != 'cancelled'
      && needs.configure.result != 'cancelled'
      && needs.cicd-wait-in-queue.result != 'cancelled'
      && needs.cicd-parse-integration-tests-gb200.result != 'cancelled'
      && needs.cicd-unit-tests-latest.result != 'cancelled'
      && (
        success()
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.force_run_all == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
      )
      && !cancelled()
    strategy:
      # One failing test case must not cancel the remaining matrix entries.
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.cicd-parse-integration-tests-gb200.outputs.integration-tests-gb200) }}
    env:
      PIP_DISABLE_PIP_VERSION_CHECK: 1
      PIP_NO_PYTHON_VERSION_WARNING: 1
      PIP_ROOT_USER_ACTION: ignore
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: main
        uses: ./.github/actions
        with:
          # What to run.
          model: ${{ matrix.model }}
          test_case: ${{ matrix.test_case }}
          is_unit_test: 'false'
          platform: dgx_gb200
          # Where to run it: the GB200 image tagged with this commit.
          container-image: ${{ env.container-registry-gb200 }}/megatron-lm:${{ github.sha }}
          tag: latest
          # How to run it; timeout falls back to 30 when the matrix entry
          # does not define one.
          timeout: ${{ matrix.timeout || 30 }}
          scope: ${{ needs.configure.outputs.scope }}
          n_repeat: ${{ needs.configure.outputs.n_repeat }}
          lightweight: ${{ needs.configure.outputs.lightweight }}
          PAT: ${{ secrets.PAT }}

  # Aggregating gate job: inspects the results of the unit and integration
  # test jobs (plus a scan of every job in the run) and fails if any of them
  # did not finish as expected. Docs-only and deployment runs pass immediately.
  Nemo_CICD_Test:
    needs:
      - pre-flight
      - is-not-external-contributor
      - cicd-unit-tests-latest
      - cicd-integration-tests-latest-h100
      - cicd-integration-tests-latest-gb200
    # The trailing `always()` makes the parenthesised group true in every case;
    # it is what lets this gate run even after upstream failures.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || needs.pre-flight.outputs.is_ci_workload == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    # Least privilege: the job only checks out the repository and reads the
    # job list of the current run, so read access is sufficient.
    permissions:
      actions: read
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          DOCS_ONLY: ${{ needs.pre-flight.outputs.docs_only }}
          IS_DEPLOYMENT: ${{ needs.pre-flight.outputs.is_deployment_workflow }}
          IS_MAINTAINER: ${{ needs.is-not-external-contributor.outputs.is_maintainer }}
          UNIT_RESULT: ${{ needs.cicd-unit-tests-latest.result }}
          H100_RESULT: ${{ needs.cicd-integration-tests-latest-h100.result }}
          GB200_RESULT: ${{ needs.cicd-integration-tests-latest-gb200.result }}
        run: |
          # Docs-only and deployment workflows intentionally skip all tests
          if [ "$DOCS_ONLY" == "true" ] || [ "$IS_DEPLOYMENT" == "true" ]; then
            echo "✅ Docs-only or deployment workflow — test checks skipped"
            exit 0
          fi

          FAILED=false

          # Unit tests must always succeed (never skipped or cancelled)
          if [ "$UNIT_RESULT" != "success" ]; then
            echo "❌ cicd-unit-tests-latest: $UNIT_RESULT"
            FAILED=true
          fi

          # H100 integration tests must always succeed
          if [ "$H100_RESULT" != "success" ]; then
            echo "❌ cicd-integration-tests-latest-h100: $H100_RESULT"
            FAILED=true
          fi

          # GB200 integration tests may be skipped only for non-maintainer PRs
          # (no GB200 runners available); maintainer runs must always succeed
          if [ "$GB200_RESULT" == "skipped" ] && [ "$IS_MAINTAINER" == "true" ]; then
            echo "❌ cicd-integration-tests-latest-gb200: skipped unexpectedly for a maintainer run"
            FAILED=true
          elif [ "$GB200_RESULT" != "success" ] && [ "$GB200_RESULT" != "skipped" ]; then
            echo "❌ cicd-integration-tests-latest-gb200: $GB200_RESULT"
            FAILED=true
          fi

          # Broad scan: catch any individual job failures or cancellations
          # (e.g. a single matrix instance cancelled mid-run).
          # The scan stays best-effort: if the API query itself fails we fall
          # back to 0, but surface a warning so the gap is visible in the log.
          BAD_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '
            [.jobs[] | select(
              .status == "completed"
              and (.conclusion == "failure" or .conclusion == "cancelled")
              and .name != "merge-queue-notification"
              and .name != "cicd-mbridge-testing"
            )] | length
          ') || {
            echo "::warning::Could not query the jobs of run $GITHUB_RUN_ID; broad job scan skipped"
            BAD_JOBS=0
          }

          if [ "${BAD_JOBS:-0}" -gt 0 ]; then
            echo "❌ Found ${BAD_JOBS} failed or cancelled job(s):"
            gh run view "$GITHUB_RUN_ID" --json jobs --jq '
              .jobs[] | select(
                .status == "completed"
                and (.conclusion == "failure" or .conclusion == "cancelled")
                and .name != "merge-queue-notification"
                and .name != "cicd-mbridge-testing"
              ) | .name + " → " + .conclusion
            '
            FAILED=true
          fi

          if [ "$FAILED" != "true" ]; then
            echo "✅ All previous jobs completed successfully"
          else
            exit 1
          fi

  # Posts a synthetic successful `codecov/patch` commit status for runs that
  # are docs-only, deployment workflows, or merge-group validations, provided
  # the run is not a CI workload.
  Coverage_Fake:
    runs-on: ubuntu-latest
    needs: [Nemo_CICD_Test, pre-flight]
    # `github.event` is the whole event payload object and never equals a
    # string; the event type lives in `github.event_name`.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || github.event_name == 'merge_group'
      )
      && needs.pre-flight.outputs.is_ci_workload == 'false'
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Generate fake coverage report
        uses: actions/github-script@v8
        with:
          github-token: ${{ secrets.PAT }}
          script: |
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
              repo: context.repo.repo,
              sha: context.sha,
              state: 'success',
              description: 'No code changes - coverage check skipped',
              context: 'codecov/patch'
            });

  # Combines the per-job coverage artifacts of this run, uploads the result to
  # Codecov, and stores the aggregated `.coverage` file as an artifact.
  Coverage:
    runs-on: ubuntu-latest
    # `pre-flight` must be a direct dependency: the `needs` context only
    # exposes outputs of jobs listed here, and the condition below reads
    # `needs.pre-flight.outputs.is_ci_workload`.
    needs: [Nemo_CICD_Test, pre-flight]
    if: |
      (
        (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
        || success()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    strategy:
      matrix:
        flag: [unit-test]
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Download coverage reports of current branch
        uses: actions/download-artifact@v7
        with:
          pattern: coverage-${{ matrix.flag }}-*

      - name: List coverage files
        # The name tests are grouped so that `-type f` applies to both
        # patterns; without the parentheses it only constrained `*.xml`.
        run: find . -type f \( -name "*.xml" -o -name "*.lcov" \)

      - name: Get total coverage of current branch
        shell: bash -x -e -u -o pipefail {0}
        if: always()
        run: |
          pip install coverage

          ls -al .
          ls -al coverage-*/
          # Merge every downloaded data file into one `.coverage` in the
          # working directory; --keep leaves the inputs in place until the
          # explicit cleanup below.
          coverage combine --keep $(ls coverage-*/.coverage)
          coverage report -i
          rm -rf coverage-*
          ls -al

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          verbose: true
          flags: ${{ matrix.flag }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
          name: coverage-${{ matrix.flag }}-aggregated
          path: |
            .coverage
          # `.coverage` is a dotfile, so hidden files must be included.
          include-hidden-files: true

  # On merge-group events, comments on the originating PR with a link to this
  # workflow run so the author can follow the merge-queue validation.
  merge-queue-notification:
    runs-on: ubuntu-latest
    if: github.event_name == 'merge_group'
    permissions:
      pull-requests: write
    steps:
      - name: Extract PR number from merge group
        id: get-pr-number
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the ref is always treated as data.
          HEAD_REF: ${{ github.event.merge_group.head_ref }}
        run: |
          # Extract PR number from merge group head_ref (format: refs/heads/gh-readonly-queue/main/pr-<number>-<sha>)
          PR_NUMBER=$(echo "$HEAD_REF" | sed -n 's/.*\/pr-\([0-9]*\)-.*/\1/p')
          echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"

      - name: Comment on PR with action run URL
        uses: actions/github-script@v8
        env:
          PR_NUMBER: ${{ steps.get-pr-number.outputs.pr_number }}
        with:
          github-token: ${{ secrets.PAT }}
          script: |
            // Read the number from the environment; an empty or malformed
            // value becomes a clear failure instead of a JS syntax error.
            const prNumber = Number.parseInt(process.env.PR_NUMBER, 10);
            if (!Number.isInteger(prNumber)) {
              core.setFailed(`Could not extract a PR number from the merge group head ref (got "${process.env.PR_NUMBER}")`);
              return;
            }
            const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              body: `🔄 Merge queue validation started!\n\nYou can track the progress here: ${runUrl}`
            });

  # Runs `taint-node.sh` on the selected runner after the test and coverage
  # jobs have finished, but only when that runner label contains 'ephemeral'
  # and the run is not a deployment workflow.
  cleanup-taint-node:
    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
    needs:
      # `pre-flight` is listed so its outputs are visible in the `needs`
      # context used by the condition below.
      - pre-flight
      - is-not-external-contributor
      - cicd-container-build
      - cicd-unit-tests-latest
      - cicd-integration-tests-latest-h100
      - cicd-integration-tests-latest-gb200
      - Coverage
      - Coverage_Fake
    # `!x == 'true'` parses as `(!x) == 'true'`, a boolean compared with a
    # string, which is never true; `!=` expresses the intended negation.
    if: |
      always()
      && !cancelled()
      && contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
      && needs.pre-flight.outputs.is_deployment_workflow != 'true'
    steps:
      - name: Taint node for cleanup
        shell: bash
        run: taint-node.sh

claude-complexity-label AI .github/workflows/claude-complexity-label.yml

Triggers: pull_request_target
Runs on: ubuntu-latest
Jobs: label-complexity
Actions: anthropics/claude-code-action

View raw YAML

# Labels a pull request with exactly one complexity label once it is marked
# ready for review. The classification is delegated to the Claude code action,
# which is restricted to `gh pr diff`, `gh pr edit` and `gh pr view`.
name: Claude Complexity Label

on:
  pull_request_target:
    types:
      - ready_for_review

jobs:
  label-complexity:
    name: Label PR Complexity
    runs-on: ubuntu-latest
    env:
      # Read by the `gh` commands that the prompt instructs the agent to run.
      REPO: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.pull_request.number }}
      GH_TOKEN: ${{ secrets.PAT }}
    permissions:
      id-token: write
      contents: read
      issues: write
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          # Full history rather than a shallow clone.
          fetch-depth: 0

      - name: Run Claude Complexity Analysis
        uses: anthropics/claude-code-action@v1
        with:
          github_token: ${{ secrets.PAT }}
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          # Tool allow-list: only the three gh subcommands the prompt needs.
          claude_args: |
            --allowedTools "Bash(gh pr diff:*),Bash(gh pr edit:*),Bash(gh pr view:*)"
          prompt: |
            REPO: ${{ env.REPO }}
            PR NUMBER: ${{ env.PR_NUMBER }}

            You are a PR complexity analyzer. Your job is to analyze the diff of this PR and apply exactly one complexity label.

            STEPS:
            1. Get the PR diff by running: gh pr diff $PR_NUMBER --repo $REPO
            2. Analyze every changed line (added or removed) in the diff and classify each as one of:
               - "docs-only": changes to docstrings, comments (lines starting with # or //), documentation files (.md, .rst, .txt), or similar non-functional text
               - "test": changes in test files (files with "test" in the name/path, or inside a tests/ directory)
               - "real code": all other changes (functional source code)
            3. Compute "real code line changes" using this formula:
               real_code_line_changes = (number of real code lines changed) + (number of test lines changed / 10)
               Count both added and removed lines. Do not count unchanged context lines. Do not count comments or docstrings.
            4. Remove any previously applied complexity or docs-only labels:
               gh pr edit $PR_NUMBER --repo $REPO --remove-label "complexity: low,complexity: medium,complexity: high,docs-only"
            5. Apply exactly ONE label using the gh CLI:
               - If there are ZERO real code lines and ZERO test lines (only docs-only changes), apply label "docs-only":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "docs-only"
               - If real_code_line_changes < 100, apply label "complexity: low":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: low"
               - If real_code_line_changes >= 100 and < 500, apply label "complexity: medium":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: medium"
               - If real_code_line_changes >= 500, apply label "complexity: high":
                 gh pr edit $PR_NUMBER --repo $REPO --add-label "complexity: high"

            Do NOT post any comments on the PR. Only apply the label.

claude-copy-to-main AI .github/workflows/claude-copy-to-main.yml

Triggers

issue_comment

Runs on

ubuntu-latest

Jobs

copy-to-main

Actions

anthropics/claude-code-action

Commands

PERMISSION=$(gh api repos/$REPO/collaborators/$COMMENTER/permission --jq .permission) if [[ "$PERMISSION" != "admin" && "$PERMISSION" != "write" ]]; then gh pr comment $PR_NUMBER --repo $REPO --body "❌ You do not have write access to use \`/claude copy\`." exit 1 fi
PR_JSON=$(gh pr view $PR_NUMBER --repo $REPO --json baseRefName,mergedAt) PR_BASE=$(echo "$PR_JSON" | jq -r .baseRefName) PR_MERGED=$(echo "$PR_JSON" | jq -r .mergedAt) if [ "$PR_BASE" = "main" ]; then gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR already targets \`main\`. \`/claude copy\` only works on PRs targeting non-main branches." exit 1 fi if [ "$PR_MERGED" = "null" ] || [ -z "$PR_MERGED" ]; then gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR has not been merged yet. \`/claude copy\` only works on merged PRs." exit 1 fi
git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER-head

View raw YAML

# Comment-triggered workflow: when a PR comment contains `/claude copy`, the
# commenter's write access and the PR's state (merged, base branch other than
# main) are verified, and the Claude code action is then asked to re-apply the
# PR's changes on a new branch from `main` and open a PR for it.
name: Claude Copy PR to Main

on:
  issue_comment:
    types:
      - created

jobs:
  copy-to-main:
    name: Copy PR to Main
    runs-on: ubuntu-latest
    # Only comments on pull requests (not plain issues) that contain the
    # trigger phrase.
    if: |
      github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '/claude copy')
    env:
      REPO: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.issue.number }}
      GH_TOKEN: ${{ secrets.PAT }}
    permissions:
      id-token: write
      contents: write
      issues: write
      pull-requests: write
    steps:
      # Gate 1: the commenter must have admin or write permission.
      - name: Check commenter has write access
        env:
          COMMENTER: ${{ github.event.comment.user.login }}
        run: |
          access_level=$(gh api repos/$REPO/collaborators/$COMMENTER/permission --jq .permission)
          case "$access_level" in
            admin | write)
              ;;
            *)
              gh pr comment $PR_NUMBER --repo $REPO --body "❌ You do not have write access to use \`/claude copy\`."
              exit 1
              ;;
          esac

      # Gate 2: the PR must not target main and must already be merged.
      - name: Check PR is merged and targets non-main
        run: |
          pr_info=$(gh pr view $PR_NUMBER --repo $REPO --json baseRefName,mergedAt)
          base_branch=$(jq -r .baseRefName <<<"$pr_info")
          merged_at=$(jq -r .mergedAt <<<"$pr_info")

          if [[ "$base_branch" == "main" ]]; then
            gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR already targets \`main\`. \`/claude copy\` only works on PRs targeting non-main branches."
            exit 1
          fi

          if [[ "$merged_at" == "null" || -z "$merged_at" ]]; then
            gh pr comment $PR_NUMBER --repo $REPO --body "❌ This PR has not been merged yet. \`/claude copy\` only works on merged PRs."
            exit 1
          fi

      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          token: ${{ secrets.PAT }}
          fetch-depth: 0

      # Makes the PR's head commits available locally as pr-<number>-head,
      # the branch name the prompt below refers to.
      - name: Fetch PR head ref from fork
        run: |
          git fetch origin pull/$PR_NUMBER/head:pr-$PR_NUMBER-head

      - name: Run Claude Copy to Main
        uses: anthropics/claude-code-action@v1
        with:
          github_token: ${{ secrets.PAT }}
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          trigger_phrase: "/claude copy"
          claude_args: |
            --allowedTools "Bash(git:*),Bash(gh:*),Read,Edit"
            --model "claude-opus-4-6"
          prompt: |
            REPO: ${{ env.REPO }}
            PR NUMBER: ${{ env.PR_NUMBER }}

            You are a PR copy assistant. Your job is to apply the final changes from a merged PR onto a new branch based on `main` and create a new PR targeting `main`.

            The PR's commits originated from a fork and have been fetched locally as the branch: pr-${PR_NUMBER}-head

            STEPS:
            1. Get the PR details (title, body, and base branch):
               gh pr view $PR_NUMBER --repo $REPO --json title,body,baseRefName

            2. Configure git for committing (use the svcnvidia-nemo-ci service account since secrets.PAT belongs to it):
               git config user.name "svcnvidia-nemo-ci"
               git config user.email "svcnvidia-nemo-ci@nvidia.com"

            3. Create a new branch from `main`:
               git checkout main
               git pull origin main
               git checkout -b copy-pr-${PR_NUMBER}-to-main

            4. Generate a patch of the PR's final changes and apply it:
               MERGE_BASE=$(git merge-base origin/<baseRefName> pr-${PR_NUMBER}-head)
               git diff $MERGE_BASE pr-${PR_NUMBER}-head | git apply --3way
               (Replace <baseRefName> with the actual base branch name from step 1.)

               If the apply fails due to merge conflicts:
               a. Identify conflicted files: git diff --name-only --diff-filter=U
               b. For each conflicted file, read its contents to see the conflict markers
               c. Resolve the conflicts by favoring the `main` branch side when there is a genuine
                  conflict between the two sides. The goal is to bring the PR's changes into main
                  without overriding what is already on main.
               d. Stage the resolved files: git add <file>

            5. Commit the changes:
               git add -A
               git commit -m "Copy PR #${PR_NUMBER} to main"

            6. Push the new branch:
               git push origin copy-pr-${PR_NUMBER}-to-main

            7. Create a new PR targeting `main`:
               gh pr create --repo $REPO \
                 --base main \
                 --head copy-pr-${PR_NUMBER}-to-main \
                 --title "[Copy to main] <original PR title>" \
                 --body "🤖 **This PR was auto-generated by Claude** via the \`/claude copy\` command.\n\nCherry-picked from #${PR_NUMBER}.\n\n---\n\n<original PR body>"

            8. Comment on the original PR with a link to the newly created PR.

            IMPORTANT:
            - When resolving merge conflicts, favor `main` over the non-main branch. Do not override changes already on main.
            - Do NOT force push.

claude_review AI .github/workflows/claude_review.yml

Triggers

issue_comment

Runs on

ubuntu-latest

Jobs

review-on-comment

Actions

anthropics/claude-code-action

Commands

echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT

View raw YAML

# Comment-triggered workflow: a PR comment containing `/claude review` checks
# out the PR's current head commit and runs the Claude code action with a
# light-review prompt.
name: Claude Code Review

on:
  issue_comment:
    types:
      - created

jobs:
  review-on-comment:
    name: Claude Review (comment trigger)
    runs-on: ubuntu-latest
    # Only comments on pull requests (not plain issues) that contain the
    # trigger phrase.
    if: |
      github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '/claude review')
    env:
      REPO: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.issue.number }}
      GH_TOKEN: ${{ github.token }}
    permissions:
      id-token: write
      contents: read
      issues: write
      pull-requests: write
    steps:
      # issue_comment events do not carry the PR head SHA, so it is looked up
      # with the gh CLI and exposed as the `sha` step output.
      - name: Get PR head commit
        id: get-pr-head-commit
        run: |
          echo "sha=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid -q .headRefOid)" | tee -a $GITHUB_OUTPUT

      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          ref: ${{ steps.get-pr-head-commit.outputs.sha }}
          fetch-depth: 1

      - name: Run Claude Code Review
        uses: anthropics/claude-code-action@v1
        with:
          anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
          trigger_phrase: "/claude review"
          show_full_output: true
          prompt: |
            REPO: ${{ env.REPO }}
            PR NUMBER: ${{ env.PR_NUMBER }}

            You are doing a light code review. Keep it concise and actionable.

            Focus ONLY on:
            - Critical bugs or logic errors
            - Typos in code, comments, or strings
            - Missing or insufficient test coverage for changed code
            - Outdated or inaccurate documentation affected by the changes

            Do NOT comment on:
            - Style preferences or formatting
            - Minor naming suggestions
            - Architectural opinions or refactoring ideas
            - Performance unless there is a clear, measurable issue

            Only use inline ```suggestion blocks for simple, self-contained line replacements (typos,
            renames, single-line fixes). For structural changes that add, remove, or reorganize blocks
            of code (e.g. adding a new function, inserting a YAML step, reordering logic), use a
            top-level PR comment with a code block showing the proposed change instead — inline
            suggestions cannot express insertions or multi-block restructuring and will break the code
            if applied.

            It's perfectly acceptable to not have anything to comment on.
            If you do not have anything to comment on, approve the PR with: gh pr review $PR_NUMBER --repo $REPO --approve --body "LGTM"
          # Tool allow-list (inline comments plus four gh pr subcommands) and
          # model selection for the agent.
          claude_args: |
            --allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*)"
            --model "claude-opus-4-6"

close-inactive-issue-pr .github/workflows/close-inactive-issue-pr.yml

Triggers: schedule
Runs on: —
Jobs: close-issues

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Scheduled wrapper around the shared FW-CI-templates workflow for closing
# inactive issues and pull requests.
name: Stale-Close-Inactive-Issues-PRs

on:
  schedule:
    # Minute 30 of hour 1, every day.
    - cron: '30 1 * * *'

jobs:
  close-issues:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_close_inactive_issue_pr.yml@v0.44.0
    # Skipped everywhere except the upstream repository (e.g. in forks).
    if: github.repository == 'NVIDIA/Megatron-LM'

community-bot .github/workflows/community-bot.yml

Triggers: issues, issue_comment
Runs on: —
Jobs: community-bot

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Forwards issue and issue-comment activity to the shared FW-CI-templates
# community-bot workflow.
name: Community Bot

on:
  issues:
    types:
      - opened
      - edited
      - reopened
      - closed
      - deleted
  issue_comment:
    types:
      - created
      - edited
      - deleted

jobs:
  community-bot:
    # Skipped everywhere except the upstream repository (e.g. in forks).
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10
    secrets:
      GH_TOKEN: ${{ secrets.PAT }}
    with:
      # Project id comes from the repository/organisation variable.
      community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }}

Triggers

push, merge_group

Runs on

ubuntu-latest

Jobs

pre-flight, copyright-check, copyright-check-summary

Commands

FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then echo "✅ All previous jobs completed successfully" exit 0 else echo "❌ Found $FAILED_JOBS failed job(s)" # Show which jobs failed gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' exit 1 fi

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the shared copyright-header check on PR mirror branches, deploy-release
# branches and merge groups, then summarises the outcome in a single job.
name: Copyright check

on:
  push:
    branches:
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

jobs:
  # Shared pre-flight workflow; its outputs (docs_only, is_merge_group,
  # is_deployment_workflow, is_ci_workload) drive the conditions below.
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
    if: github.repository == 'NVIDIA/Megatron-LM'

  # The actual check; skipped for docs-only changes, merge groups and
  # deployment workflows.
  copyright-check:
    needs: [pre-flight]
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.66.7

  # Summary gate: passes when every completed job of the run succeeded, or
  # when skipping is explicitly allowed for this kind of run.
  copyright-check-summary:
    needs: [pre-flight, copyright-check]
    # The trailing `always()` makes the parenthesised group true in every
    # case; it is what lets the summary run even when the check was skipped
    # or failed.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
        run: |
          # Count completed jobs whose conclusion is anything but "success".
          # If the query itself fails, fall back to 0 by assignment; the
          # previous `|| echo 0` only printed a stray "0" to the log.
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi

dependabot matrix perms .github/workflows/dependabot.yml

Triggers

schedule, workflow_dispatch

Runs on

ubuntu-latest, ubuntu-latest

Jobs

get-release-branch-names, bump-tags, notify

Matrix

include, include.target-branch→ ${{ needs.get-release-branch-names.outputs.mcore }}, main

Commands

latest_branch=$(git ls-remote --heads https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' | grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' | sort -V | tail -n1) echo "mcore_release_branch=$latest_branch" | tee -a $GITHUB_OUTPUT
curl -X POST \ -H 'Content-type: application/json' \ --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \ $SLACK_WEBHOOK

View raw YAML

# Weekly dependency bump: finds the newest `core_r*` release branch, runs the
# local dependency-update workflow against that branch and `main`, and posts
# a Slack message if anything fails.
name: Dependabot
on:
  schedule:
    # Minute 0 of hour 8 on day-of-week 1 (Monday).
    - cron: "0 8 * * 1"
  workflow_dispatch: # Allow manual triggering

permissions:
  id-token: write
  contents: write

jobs:
  # Resolves the highest-versioned core_r<major>.<minor>.<patch> branch and
  # exposes it as the `mcore` output.
  get-release-branch-names:
    runs-on: ubuntu-latest
    outputs:
      mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
    if: github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Get release branch names
        id: get-branch
        env:
          PAT: ${{ secrets.PAT }}
        run: |
          # List remote heads matching core_r*, keep the version-shaped names,
          # and pick the greatest one in version order.
          latest_branch=$(git ls-remote --heads "https://token:${PAT}@github.com/NVIDIA/Megatron-LM.git" 'refs/heads/core_r*' |
            grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' |
            sort -V |
            tail -n1)
          echo "mcore_release_branch=$latest_branch" | tee -a "$GITHUB_OUTPUT"

  # One run of the reusable update workflow per target branch; a failure on
  # one branch does not cancel the other.
  bump-tags:
    needs: [get-release-branch-names]
    if: github.repository == 'NVIDIA/Megatron-LM'
    strategy:
      fail-fast: false
      matrix:
        include:
          - target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
          - target-branch: main
    uses: ./.github/workflows/_update_dependencies.yml
    with:
      target-branch: ${{ matrix.target-branch }}
    secrets:
      PAT: ${{ secrets.PAT }}
      SSH_KEY: ${{ secrets.SSH_KEY }}
      SSH_PWD: ${{ secrets.SSH_PWD }}

  # Failure notification to the CI Slack channel, tagging the admin group.
  notify:
    if: failure() && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    needs: [bump-tags]
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_CI_CHANNEL_WEBHOOK }}
          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_TEAM_GROUP_ID }}>
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            "$SLACK_WEBHOOK"

force-draft-pr perms .github/workflows/force-draft-pr.yml

Triggers

pull_request_target

Runs on

ubuntu-latest

Jobs

force-draft

Commands

gh pr ready --undo ${{ github.event.pull_request.number }} --repo ${{ github.repository }}
gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --body \ "This PR has been automatically converted to **draft** because all PRs must start as drafts. When you are ready for review, click **Ready for Review** to begin the review process. This will: 1. Add the oncall reviewer (optional reviewer) 2. Add required review teams based on your changes See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details."

View raw YAML

# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

name: Force Draft PR

# Runs when a pull request against main is opened.
on:
  pull_request_target:
    branches:
      - main
    types:
      - opened

permissions:
  pull-requests: write

jobs:
  force-draft:
    # Only PRs that were opened as non-draft, and only in the upstream repository.
    if: ${{ !github.event.pull_request.draft && github.repository == 'NVIDIA/Megatron-LM' }}
    runs-on: ubuntu-latest
    steps:
      # The conversion authenticates with the PAT secret.
      - name: Convert PR to draft
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          gh pr ready --undo ${{ github.event.pull_request.number }} --repo ${{ github.repository }}

      # The explanatory comment is posted with the workflow's own token.
      - name: Add comment explaining draft policy
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          gh pr comment ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --body \
            "This PR has been automatically converted to **draft** because all PRs must start as drafts.

          When you are ready for review, click **Ready for Review** to begin the review process. This will:
          1. Add the oncall reviewer (optional reviewer)
          2. Add required review teams based on your changes

          See the [contribution guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/developer/submit.md) for more details."

install-test matrix .github/workflows/install-test.yml

Triggers

push, merge_group

Runs on

linux-amd64-cpu16, linux-amd64-cpu16, ubuntu-latest

Jobs

pre-flight, pip-test-pytorch, uv-test-pytorch, install-test-summary

Matrix

python-version→ 3.12

Commands

echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV" echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV" echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV" echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV" echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV" echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"
bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}
echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV" echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV" echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV" echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV" echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV" echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV" echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV" echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"
bash docker/common/install.sh --environment dev --base-image pytorch --use-uv
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then echo "✅ All previous jobs completed successfully" exit 0 else echo "❌ Found $FAILED_JOBS failed job(s)" # Show which jobs failed gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' exit 1 fi

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This workflow verifies that the basic install works across all supported platforms.
# For basic install, all imports need to either be successful or appropriately guarded.

name: Installation Test

# Triggered by merge-queue checks and by pushes to the long-lived branches,
# the pull-request/<number> mirror branches and deploy-release/* branches.
on:
  merge_group:
    types:
      - checks_requested
  push:
    branches:
      - dev
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"

jobs:
  # Shared pre-flight workflow; the other jobs in this file branch on its outputs.
  pre-flight:
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Installs megatron-core with pip inside the NGC PyTorch container, then runs
  # the check-imports action against megatron.core.
  pip-test-pytorch:
    needs: [pre-flight]
    # Skipped for docs-only changes, merge-group runs and deployment workflows.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set PATH
        run: |
          # Values appended to $GITHUB_ENV only become visible in LATER steps.
          # Define CUDA_HOME locally so the PATH line below does not depend on
          # the container image already exporting it.
          CUDA_HOME=/usr/local/cuda
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install megatron-core
        # Trace commands and fail on errors, unset variables and pipe failures.
        shell: bash -x -e -u -o pipefail {0}
        run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}

      # The check-imports action lives in the FW-CI-templates repository, so
      # that repository is checked out next to the project sources.
      - name: Checkout check-imports
        uses: actions/checkout@v6
        with:
          repository: NVIDIA-NeMo/FW-CI-templates
          ref: v0.63.2
          path: FW-CI-templates

      - name: Check imports for megatron-core
        uses: ./FW-CI-templates/.github/actions/check-imports
        with:
          package-name: megatron.core
          python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  # Installs the project with uv inside the NGC PyTorch container.
  # The import check is currently disabled (see the commented-out steps below).
  uv-test-pytorch:
    needs: [pre-flight]
    # Skipped for docs-only changes, merge-group runs and deployment workflows.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: linux-amd64-cpu16
    name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
    container:
      image: nvcr.io/nvidia/pytorch:25.05-py3
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set PATH
        run: |
          # Values appended to $GITHUB_ENV only become visible in LATER steps.
          # Define CUDA_HOME locally so the PATH line below does not depend on
          # the container image already exporting it.
          CUDA_HOME=/usr/local/cuda
          echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV"
          echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
          echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
          echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV"
          echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV"
          echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"

      - name: Install project
        shell: bash
        run: bash docker/common/install.sh --environment dev --base-image pytorch --use-uv

      # NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
      # - name: Checkout check-imports
      #   uses: actions/checkout@v6
      #   with:
      #     repository: NVIDIA-NeMo/FW-CI-templates
      #     ref: v0.63.2
      #     path: FW-CI-templates

      # - name: Check imports for megatron-core
      #   uses: ./FW-CI-templates/.github/actions/check-imports
      #   with:
      #     package-name: megatron.core
      #     python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python

  # Single gate job: fails when any completed job of this run did not succeed,
  # unless pre-flight reported that skipping is allowed.
  install-test-summary:
    needs: [pre-flight, pip-test-pytorch, uv-test-pytorch]
    runs-on: ubuntu-latest
    name: Install test summary
    # always() is or-ed in so the job is still evaluated when upstream jobs
    # failed or were skipped; a cancelled run is excluded by !cancelled().
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
      && github.repository == 'NVIDIA/Megatron-LM'
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Get workflow result
        id: result
        shell: bash -x -e -u -o pipefail {0}
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
        run: |
          # Count completed jobs whose conclusion is anything but "success".
          # The fallback lives INSIDE the substitution so that a failing gh call
          # really assigns 0 instead of just printing it.
          FAILED_JOBS=$(gh run view "$RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length' || echo 0)

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi

multi-approval-bot .github/workflows/multi-approval-bot.yml

Triggers

push, merge_group

Runs on

ubuntu-latest, ubuntu-latest

Jobs

pre-flight, codeowners-approval, multi-approval-bot-summary

Actions

nv-gha-runners/get-pr-info

Commands

FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then echo "✅ All previous jobs completed successfully" exit 0 else echo "❌ Found $FAILED_JOBS failed job(s)" # Show which jobs failed gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' exit 1 fi

View raw YAML

name: "Codeowners Approval Workflow"

# Triggered by merge-queue checks and by pushes to pull-request/<number> branches.
on:
  merge_group:
    types:
      - checks_requested
  push:
    branches:
      - "pull-request/[0-9]+"

jobs:
  # Shared pre-flight workflow; the other jobs in this file branch on its outputs.
  pre-flight:
    if: github.repository == 'NVIDIA/Megatron-LM'
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2

  # Runs the codeowner multi-approval action against the PR behind this push.
  codeowners-approval:
    runs-on: ubuntu-latest
    needs:
      - pre-flight
    # Skipped for docs-only changes, merge-group runs and deployment workflows.
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
    steps:
      # PR metadata is only looked up on pull-request/<number> branches.
      - id: get-pr-info
        name: Get PR info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main

      # The action is used from a local path, so its repository is checked out first.
      - name: Checkout action
        uses: actions/checkout@v6
        with:
          path: codeowner-multi-approval-action
          ref: v0.1
          repository: noamelf/codeowner-multi-approval-action

      - name: Check Codeowners Approval
        uses: ./codeowner-multi-approval-action
        with:
          github-token: ${{ secrets.PAT }}
          repo-name: ${{ github.repository }}
          # Falls back to an empty object when the lookup step did not run.
          pr-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}

  # Single gate job: fails when any completed job of this run did not succeed,
  # unless pre-flight reported that skipping is allowed.
  multi-approval-bot-summary:
    needs: [pre-flight, codeowners-approval]
    # always() is or-ed in so the job is still evaluated when upstream jobs
    # failed or were skipped; a cancelled run is excluded by !cancelled().
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && github.repository == 'NVIDIA/Megatron-LM'
      && !cancelled()
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Result
        env:
          GH_TOKEN: ${{ github.token }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
        run: |
          # Count completed jobs whose conclusion is anything but "success".
          # The fallback lives INSIDE the substitution so that a failing gh call
          # really assigns 0 instead of just printing it.
          FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length' || echo 0)

          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
              echo "✅ All previous jobs completed successfully"
              exit 0
          else
              echo "❌ Found $FAILED_JOBS failed job(s)"
              # Show which jobs failed
              gh run view "$GITHUB_RUN_ID" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
              exit 1
          fi

oncall-assign perms .github/workflows/oncall-assign.yml

Triggers

pull_request_target

Runs on

ubuntu-latest

Jobs

assign-reviewer

Commands

pip install requests slack-sdk
python .github/scripts/oncall_manager.py assign --pr ${{ github.event.pull_request.number }}

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Oncall Assign

# Runs when a pull request against main is marked ready for review.
on:
  pull_request_target:
    branches:
      - main
    types:
      - ready_for_review

permissions:
  contents: read
  pull-requests: write

jobs:
  assign-reviewer:
    if: ${{ !github.event.pull_request.draft }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install requests slack-sdk

      # Invokes the oncall manager script in "assign" mode for this PR.
      - name: Assign Reviewer
        run: |
          python .github/scripts/oncall_manager.py assign --pr ${{ github.event.pull_request.number }}
        env:
          GH_TOKEN: ${{ secrets.PAT }}

oncall-rotation perms .github/workflows/oncall-rotation.yml

Triggers

schedule, workflow_dispatch

Runs on

ubuntu-latest

Jobs

rotate-schedule

Commands

pip install --no-cache-dir "uv<0.9.29" uv venv .venv uv cache clean uv sync --no-cache uv run --with slack-sdk python .github/scripts/oncall_manager.py rotate
git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" git add .github/oncall_schedule.json git commit -m "chore: rotate oncall schedule" || echo "No changes to commit" git pull --rebase git push origin HEAD:main

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Oncall Rotation

on:
  # Manual runs are allowed in addition to the weekly schedule.
  workflow_dispatch:
  schedule:
    # Every Wednesday at 09:00 UTC
    - cron: "0 9 * * 3"

# Write access to repository contents for this workflow's token.
permissions:
  contents: write

jobs:
  # Runs the oncall manager in "rotate" mode and pushes the updated
  # .github/oncall_schedule.json to main.
  rotate-schedule:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # The last step pushes HEAD to main, so always start from main.
          # Without this, a manual dispatch from another branch would push
          # that branch's commits to main.
          ref: main
          token: ${{ secrets.PAT }}

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Rotate Schedule
        env:
          # Token to read org team members. Needs read:org scope.
          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
          # Slack token for updating the Slack usergroup
          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
        run: |
          pip install --no-cache-dir "uv<0.9.29"
          uv venv .venv
          uv cache clean
          uv sync --no-cache
          uv run --with slack-sdk python .github/scripts/oncall_manager.py rotate

      - name: Commit and Push changes
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/oncall_schedule.json
          # An unchanged schedule is not an error.
          git commit -m "chore: rotate oncall schedule" || echo "No changes to commit"
          # Replay the commit on top of anything that landed in the meantime.
          git pull --rebase
          git push origin HEAD:main

release perms .github/workflows/release.yaml

Triggers: workflow_dispatch
Runs on: —
Jobs: release

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "Release Megatron-Core"

# Manually dispatched; every knob of the release is an input.
on:
  workflow_dispatch:
    inputs:
      release-ref:
        type: string
        required: true
        description: Ref (SHA or branch name) to release
      dry-run:
        type: boolean
        required: true
        default: true
        description: Do not publish a wheel and GitHub release.
      create-gh-release:
        type: boolean
        required: true
        default: true
        description: Create a GitHub release
      generate-changelog:
        type: boolean
        required: false
        default: true
        description: Generate changelog
      publish-docs:
        type: boolean
        required: false
        default: true
        description: Publish docs
      version-bump-branch:
        type: string
        required: true
        description: Branch for version bump
      gh-release-from-tag:
        type: string
        required: false
        default: ""
        description: Tag of previous release for changelog builder

permissions:
  contents: write # Write access to repository contents
  pull-requests: write # To create PRs

jobs:
  # Delegates the whole release to the reusable _release_library workflow.
  release:
    uses: ./.github/workflows/_release_library.yml
    with:
      release-ref: ${{ inputs.release-ref || github.sha }}
      dry-run: ${{ inputs.dry-run || false }}
      version-bump-branch: ${{ inputs.version-bump-branch || github.ref_name }}
      # Pass the checkbox through unchanged. The previous
      # `inputs.create-gh-release || true` evaluated to true even when the
      # input was false, so the GitHub release could never be turned off.
      create-gh-release: ${{ inputs.create-gh-release }}
      gh-release-use-changelog-builder: ${{ inputs.generate-changelog }}
      publish-docs: ${{ inputs.publish-docs }}
      gh-release-from-tag: ${{ inputs.gh-release-from-tag }}
    secrets:
      # Production PyPI token and main Slack channel are selected only when the
      # workflow is dispatched from main or from a branch whose name starts with
      # "r"; every other ref gets the test token and the CI channel.
      # NOTE(review): confirm the "r" prefix matches this repository's
      # release-branch naming.
      TWINE_PASSWORD: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SVC_PYPI_TOKEN || secrets.SVC_PYPI_TEST_TOKEN }}
      SLACK_WEBHOOK: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && secrets.SLACK_MAIN_CHANNEL_WEBHOOK || secrets.SLACK_CI_CHANNEL_WEBHOOK }}
      PAT: ${{ secrets.PAT }}
      AWS_ASSUME_ROLE_ARN: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AKAMAI_HOST: ${{ secrets.AKAMAI_HOST }}
      AKAMAI_CLIENT_TOKEN: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
      AKAMAI_CLIENT_SECRET: ${{ secrets.AKAMAI_CLIENT_SECRET }}
      AKAMAI_ACCESS_TOKEN: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
      S3_BUCKET_NAME: ${{ secrets.S3_BUCKET_NAME }}

release-docs .github/workflows/release-docs.yml

Triggers: workflow_dispatch, workflow_call
Runs on: ubuntu-latest
Jobs: build-docs, publish-docs

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Release docs

# The same inputs are declared twice: once for manual dispatch and once for
# callers that use this file as a reusable workflow. Only the reusable form
# additionally accepts build-docs-ref.
on:
  workflow_dispatch:
    inputs:
      dry-run:
        type: boolean
        required: true
        default: true
        description: Whether to run the workflow in dry-run mode
      publish-as-latest:
        type: boolean
        required: false
        default: true
        description: Publish as Latest stable version.
      docs-version-override:
        type: string
        required: false
        default: ""
        description: Docs version if commit is not tagged
      update-version-picker:
        type: boolean
        required: false
        default: true
        description: Update version picker.
      notify-emails:
        type: string
        required: false
        description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
  workflow_call:
    inputs:
      dry-run:
        type: boolean
        required: true
        default: true
        description: Whether to run the workflow in dry-run mode
      publish-as-latest:
        type: boolean
        required: false
        default: true
        description: Publish as Latest stable version.
      docs-version-override:
        type: string
        required: false
        default: ""
        description: Docs version if commit is not tagged
      update-version-picker:
        type: boolean
        required: false
        default: true
        description: Update version picker.
      notify-emails:
        type: string
        required: false
        description: Email addresses to send the notification to. Format as "me@me.com,you@you.com".
      build-docs-ref:
        type: string
        required: false
        default: ${{ github.sha }}
        description: Reference to build the docs from

jobs:
  # Builds the documentation through the shared FW-CI-templates workflow.
  build-docs:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.67.0
    with:
      # workflow_dispatch declares no build-docs-ref input, so on manual runs
      # the value is empty; fall back to the triggering commit explicitly.
      ref: ${{ inputs.build-docs-ref || github.sha }}

  # Publishes the HTML produced by build-docs using the publish-docs action
  # from FW-CI-templates.
  publish-docs:
    needs:
      - build-docs
    runs-on: ubuntu-latest
    steps:
      # The action is referenced by local path, so its repository is checked out first.
      - uses: actions/checkout@v6
        with:
          path: FW-CI-templates
          ref: v0.74.0
          repository: NVIDIA-NeMo/FW-CI-templates

      # This workflow runs either on main, or on a version tag. Any other git ref will lead
      # to an error.
      # If it's on main, it will publish to the "latest" directory in Akamai.
      # If it's on a versioned tag, it will extract the version number from the tag (strip the
      # `v` prefix) and publish to the versioned directory in Akamai.
      - uses: ./FW-CI-templates/.github/actions/publish-docs
        with:
          # What to publish and how.
          dry-run: ${{ inputs.dry-run }}
          artifacts-name: docs-html
          artifacts-path: _build/html
          # Caller-supplied addresses are appended to the configured default list.
          emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
          overwrite-latest-on-tag: ${{ inputs.publish-as-latest }}
          docs-version-override: ${{ inputs.docs-version-override }}
          update-version-picker: ${{ inputs.update-version-picker }}
          run-on-version-tag-only: ${{ github.ref_name != 'main' }}
          request-name: megatron-core-publish-docs-${{ github.run_id }}
          # AWS settings and credentials.
          aws-region: ${{ vars.DOCS_AWS_REGION }}
          aws-role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Akamai credentials.
          akamai-host: ${{ secrets.AKAMAI_HOST }}
          akamai-client-token: ${{ secrets.AKAMAI_CLIENT_TOKEN }}
          akamai-client-secret: ${{ secrets.AKAMAI_CLIENT_SECRET }}
          akamai-access-token: ${{ secrets.AKAMAI_ACCESS_TOKEN }}
          # Upload destination.
          s3-target-root: ${{ secrets.S3_BUCKET_NAME }}
          s3-target-path: megatron-core/developer-guide

release-freeze .github/workflows/release-freeze.yml

Triggers: workflow_dispatch
Runs on: —
Jobs: code-freeze

View raw YAML

# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "Code freeze"

# Manually dispatched code freeze, delegated to the shared FW-CI-templates workflow.
on:
  workflow_dispatch:
    inputs:
      release-type:
        type: choice
        description: Type of release
        options:
          - major
          - minor
      freeze-commit:
        type: string
        description: Commit SHA to use for cut-off
        required: false
        default: main
      dry-run:
        type: boolean
        description: Dry-run of code-freeze
        required: false
        default: true
jobs:
  code-freeze:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_code_freeze.yml@v0.22.5
    with:
      # NOTE(review): these two values previously read Megatron-Bridge /
      # megatron.bridge, which looks like a copy-paste from another repository.
      # The other workflows here release and import-check megatron.core, so
      # they now name that package; confirm the display name the template expects.
      library-name: Megatron-Core
      python-package: megatron.core
      release-type: ${{ inputs.release-type }}
      freeze-commit: ${{ inputs.freeze-commit }}
      dry-run: ${{ inputs.dry-run }}
    secrets:
      SLACK_WEBHOOK: ${{ secrets.SLACK_MAIN_CHANNEL_WEBHOOK }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}

release-nightly-docs .github/workflows/release-nightly-docs.yml

Triggers: schedule
Runs on: —
Jobs: call-release-docs

View raw YAML

# Copyright (c) 2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Release Nightly Docs

on:
  schedule:
    # Every day at 10:00 UTC
    - cron: "0 10 * * *"

jobs:
  # Calls the release-docs workflow with a fixed "nightly" version; it is not
  # published as latest and the version picker is left untouched.
  call-release-docs:
    secrets: inherit
    uses: ./.github/workflows/release-docs.yml
    with:
      docs-version-override: "nightly"
      dry-run: false
      publish-as-latest: false
      update-version-picker: false

review-trigger .github/workflows/review-trigger.yml

Triggers

pull_request_review

Runs on

ubuntu-latest

Jobs

signal

Commands

mkdir -p pr echo "${{ github.event.pull_request.number }}" > pr/number

View raw YAML

# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Lightweight workflow that triggers on review approval. It exists because a run triggered by a review does not have access to the required secrets.
# No secrets needed — just signals auto-swap-labels.yml via workflow_run.

name: Review Trigger

on:
  pull_request_review:
    types:
      - submitted

jobs:
  signal:
    # Only approvals of PRs that target main in the upstream repository.
    if: |
      github.event.review.state == 'approved'
      && github.event.pull_request.base.ref == 'main'
      && github.repository == 'NVIDIA/Megatron-LM'
    runs-on: ubuntu-latest
    steps:
      # The PR number is written to a file so it can be handed over as an artifact.
      - name: Save PR number
        run: |
          mkdir -p pr
          echo "${{ github.event.pull_request.number }}" > pr/number
      - name: Upload PR number
        uses: actions/upload-artifact@v4
        with:
          path: pr/
          name: pr-number

sync-team-usergroups .github/workflows/sync-team-usergroups.yml

Triggers

workflow_dispatch, schedule

Runs on

ubuntu-latest

Jobs

sync-usergroups

Commands

pip install --no-cache-dir "uv<0.9.29" uv venv .venv uv cache clean uv sync --no-cache uv run --with slack-sdk python .github/scripts/sync_team_usergroups.py

View raw YAML

# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Sync GitHub Teams to Slack User Groups

on:
  workflow_dispatch:
  schedule:
    # Every day at 00:00 UTC
    - cron: "0 0 * * *"

jobs:
  # Runs the sync_team_usergroups.py script with GitHub and Slack tokens.
  sync-usergroups:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.10"

      - name: Sync Teams to User Groups
        env:
          # First non-empty token wins.
          GH_TOKEN: ${{ secrets.NVIDIA_MCORE_ONCALL_TOKEN || secrets.PAT || secrets.GITHUB_TOKEN }}
          SLACK_TOKEN: ${{ secrets.ONCALL_SLACK_TOKEN }}
        run: |
          pip install --no-cache-dir "uv<0.9.29"
          uv venv .venv
          uv cache clean
          uv sync --no-cache
          uv run --with slack-sdk python .github/scripts/sync_team_usergroups.py

trigger-mbridge-tests .github/workflows/trigger-mbridge-tests.yml

Triggers: workflow_dispatch
Runs on: ubuntu-latest
Jobs: trigger-mbridge-tests
Actions: convictional/trigger-workflow-and-wait

View raw YAML

# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

name: Trigger MBridge Tests

# Manually dispatched; both inputs are optional.
on:
  workflow_dispatch:
    inputs:
      mbridge_ref:
        type: string
        required: false
        default: "main"
        description: "MBridge branch/ref to trigger"
      test_suite:
        type: choice
        required: false
        default: "all"
        description: "Test suite to run"
        options:
          - "all"
          - "unit-only"
          - "functional-only"

jobs:
  trigger-mbridge-tests:
    runs-on: ubuntu-latest
    steps:
      # Dispatches cicd-main.yml in NVIDIA-NeMo/Megatron-Bridge and mirrors its
      # failure into this run (propagate_failure).
      - name: Trigger MBridge tests
        uses: convictional/trigger-workflow-and-wait@v1.6.5
        with:
          # Target workflow.
          owner: NVIDIA-NeMo
          repo: Megatron-Bridge
          workflow_file_name: cicd-main.yml
          ref: ${{ inputs.mbridge_ref }}
          github_token: ${{ secrets.PAT }}
          # Waiting behaviour.
          wait_interval: 60
          propagate_failure: true
          # Payload: the commit of this run, the chosen suite and a link back here.
          client_payload: |
            {
              "mcore_ref": "${{ github.sha }}",
              "test_suite": "${{ inputs.test_suite }}",
              "triggered_by": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }